001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.columnar.processors.feature;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.data.columnar.ColumnarFeature;
023import org.tribuo.data.columnar.FeatureProcessor;
024
025import java.util.ArrayList;
026import java.util.Comparator;
027import java.util.LinkedHashMap;
028import java.util.List;
029import java.util.Map;
030
031/**
032 * Processes a feature list, aggregating all the feature values with the same name.
033 * <p>
034 * The aggregation is user controllable.
035 * <p>
036 * In most cases this will be unnecessary as the feature names will be unique as they are keyed by the field name,
037 * however it's possible to induce collisions via text fields or other mechanisms.
038 */
039public class UniqueProcessor implements FeatureProcessor {
040
041    /**
042     * The type of reduction operation to perform.
043     */
044    public enum UniqueType {
045        /**
046         * Select the first feature value in the list.
047         */
048        FIRST,
049        /**
050         * Select the last feature value in the list.
051         */
052        LAST,
053        /**
054         * Select the maximum feature value in the list.
055         */
056        MAX,
057        /**
058         * Select the minimum feature value in the list.
059         */
060        MIN,
061        /**
062         * Add together all the feature values. Uses the field names from the first element.
063         */
064        SUM;
065    }
066
067    @Config(mandatory=true,description="The operation to perform.")
068    private UniqueType reductionType;
069
070    /**
071     * For OLCUT
072     */
073    private UniqueProcessor() {}
074
075    /**
076     * Creates a UniqueProcessor using the specified reduction operation.
077     * @param reductionType The reduction operation to perform.
078     */
079    public UniqueProcessor(UniqueType reductionType) {
080        this.reductionType = reductionType;
081    }
082
083    @Override
084    public List<ColumnarFeature> process(List<ColumnarFeature> features) {
085        if (features.isEmpty()) {
086            return features;
087        }
088        Map<String,List<ColumnarFeature>> map = new LinkedHashMap<>();
089        for (ColumnarFeature f : features) {
090            map.computeIfAbsent(f.getName(), (s) -> new ArrayList<>()).add(f);
091        }
092
093        // Unique the features
094        List<ColumnarFeature> returnVal = new ArrayList<>();
095        for (Map.Entry<String,List<ColumnarFeature>> e : map.entrySet()) {
096            returnVal.add(uniqueList(reductionType, e.getValue()));
097        }
098        return returnVal;
099    }
100
101    /**
102     * Processes the list returning the unique feature.
103     * <p>
104     * Throws {@link IllegalArgumentException} if the list is empty.
105     * @param type The unique operation to perform.
106     * @param list The list of features to process.
107     * @return The unique feature.
108     */
109    private static ColumnarFeature uniqueList(UniqueType type, List<ColumnarFeature> list) {
110        if (list.isEmpty()) {
111            throw new IllegalArgumentException("List must contain at least one feature");
112        } else if (list.size() == 1) {
113            return list.get(0);
114        } else {
115            switch (type) {
116                case FIRST:
117                    return list.get(0);
118                case LAST:
119                    return list.get(list.size()-1);
120                case MAX:
121                    return list.stream().max(Comparator.comparingDouble(ColumnarFeature::getValue)).get();
122                case MIN:
123                    return list.stream().min(Comparator.comparingDouble(ColumnarFeature::getValue)).get();
124                case SUM:
125                    double value = 0.0;
126                    for (ColumnarFeature f : list) {
127                        value += f.getValue();
128                    }
129                    ColumnarFeature first = list.get(0);
130                    if (first.getFieldName().equals(ColumnarFeature.CONJUNCTION)) {
131                        return new ColumnarFeature(first.getFirstFieldName(),first.getSecondFieldName(),first.getColumnEntry(),value);
132                    } else {
133                        return new ColumnarFeature(first.getFieldName(),first.getColumnEntry(),value);
134                    }
135                default:
136                    throw new IllegalStateException("Unknown enum type " + type);
137            }
138        }
139    }
140
141    @Override
142    public ConfiguredObjectProvenance getProvenance() {
143        return new ConfiguredObjectProvenanceImpl(this,"FeatureProcessor");
144    }
145}