/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.columnar.processors.feature;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.data.columnar.ColumnarFeature;
import org.tribuo.data.columnar.FeatureProcessor;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * A {@link FeatureProcessor} which collapses features sharing the same name into a single feature.
 * <p>
 * How the colliding values are combined is selected by the {@link UniqueType} supplied by the user.
 * <p>
 * Feature names are usually already unique because they are keyed by the field name, so this
 * processor is only needed when collisions are induced, e.g. via text fields or other mechanisms.
 */
public class UniqueProcessor implements FeatureProcessor {

    /**
     * The reduction applied to a group of features which share a name.
     */
    public enum UniqueType {
        /**
         * Keep the first feature in the list.
         */
        FIRST,
        /**
         * Keep the last feature in the list.
         */
        LAST,
        /**
         * Keep the feature with the maximum value.
         */
        MAX,
        /**
         * Keep the feature with the minimum value.
         */
        MIN,
        /**
         * Sum all the feature values. The field names are taken from the first element.
         */
        SUM
    }

    /**
     * Orders features by their value, using the ordering of {@link Double#compare(double, double)}.
     */
    private static final Comparator<ColumnarFeature> VALUE_ORDER =
            Comparator.comparingDouble(ColumnarFeature::getValue);

    @Config(mandatory=true,description="The operation to perform.")
    private UniqueType reductionType;

    /**
     * For OLCUT
     */
    private UniqueProcessor() {}

    /**
     * Creates a UniqueProcessor using the specified reduction operation.
     * @param reductionType The reduction operation to perform.
     */
    public UniqueProcessor(UniqueType reductionType) {
        this.reductionType = reductionType;
    }

    /**
     * Groups the features by name and reduces each group to a single feature.
     * <p>
     * An empty input list is returned as is. Otherwise a new list is returned with one feature
     * per distinct name, ordered by the first occurrence of each name in the input.
     * @param features The features to process.
     * @return The features with name collisions removed.
     */
    @Override
    public List<ColumnarFeature> process(List<ColumnarFeature> features) {
        if (features.isEmpty()) {
            return features;
        }

        // Insertion ordered so the output follows the order in which names first appear.
        Map<String,List<ColumnarFeature>> grouped = new LinkedHashMap<>();
        for (ColumnarFeature feature : features) {
            String name = feature.getName();
            List<ColumnarFeature> bucket = grouped.get(name);
            if (bucket == null) {
                bucket = new ArrayList<>();
                grouped.put(name, bucket);
            }
            bucket.add(feature);
        }

        // Reduce each bucket of colliding features down to a single feature.
        List<ColumnarFeature> output = new ArrayList<>();
        for (List<ColumnarFeature> bucket : grouped.values()) {
            output.add(uniqueList(reductionType, bucket));
        }
        return output;
    }

    /**
     * Reduces the list to a single feature using the supplied operation.
     * <p>
     * A single element list is returned unchanged without consulting the operation.
     * <p>
     * Throws {@link IllegalArgumentException} if the list is empty.
     * @param type The unique operation to perform.
     * @param list The list of features to process.
     * @return The unique feature.
     */
    private static ColumnarFeature uniqueList(UniqueType type, List<ColumnarFeature> list) {
        if (list.isEmpty()) {
            throw new IllegalArgumentException("List must contain at least one feature");
        }
        if (list.size() == 1) {
            return list.get(0);
        }
        switch (type) {
            case FIRST:
                return list.get(0);
            case LAST:
                return list.get(list.size()-1);
            case MAX:
                return selectBest(list, VALUE_ORDER);
            case MIN:
                return selectBest(list, VALUE_ORDER.reversed());
            case SUM:
                return sumFeatures(list);
            default:
                throw new IllegalStateException("Unknown enum type " + type);
        }
    }

    /**
     * Finds the greatest element of the list under the supplied ordering.
     * <p>
     * When several elements tie for greatest, the earliest one in the list is returned.
     * @param list A non-empty list of features.
     * @param ordering The ordering to maximise.
     * @return The greatest feature.
     */
    private static ColumnarFeature selectBest(List<ColumnarFeature> list, Comparator<ColumnarFeature> ordering) {
        ColumnarFeature best = list.get(0);
        for (int i = 1; i < list.size(); i++) {
            ColumnarFeature candidate = list.get(i);
            // Strict comparison so the earliest of any tied elements is kept.
            if (ordering.compare(candidate, best) > 0) {
                best = candidate;
            }
        }
        return best;
    }

    /**
     * Builds a new feature whose value is the sum of all the values in the list.
     * <p>
     * The field names and column entry are copied from the first element.
     * @param list A non-empty list of features.
     * @return A feature containing the summed value.
     */
    private static ColumnarFeature sumFeatures(List<ColumnarFeature> list) {
        // Accumulate in list order so the floating point result is reproducible.
        double total = 0.0;
        for (ColumnarFeature feature : list) {
            total += feature.getValue();
        }

        ColumnarFeature head = list.get(0);
        boolean isConjunction = head.getFieldName().equals(ColumnarFeature.CONJUNCTION);
        if (!isConjunction) {
            return new ColumnarFeature(head.getFieldName(),head.getColumnEntry(),total);
        }
        // Conjunction features are rebuilt from their two constituent field names.
        return new ColumnarFeature(head.getFirstFieldName(),head.getSecondFieldName(),head.getColumnEntry(),total);
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this,"FeatureProcessor");
    }
}