001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.classification.sequence.viterbi;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.Feature;
023import org.tribuo.classification.Label;
024
025import java.util.ArrayList;
026import java.util.Collections;
027import java.util.List;
028
029/**
030 * A label feature extractor that produces several kinds of label-based features.
031 * <p>
032 * The options are: the most recent output, the least recent output, recent bigrams, recent trigrams, recent 4-grams.
033 */
034public class DefaultFeatureExtractor implements LabelFeatureExtractor {
035
036    private static final long serialVersionUID = 1L;
037
038    /**
039     * indicates the position of the first (most recent) outcome to include. For example, the
040     * default value of 1 means that if the outcomes produced so far by the classifier were [A, B,
041     * C, D], then the first outcome to be used as a feature would be D since it is the most recent.
042     */
043    @Config(mandatory = true, description = "Position of the most recent outcome to include.")
044    private int mostRecentOutcome;
045
046    /**
047     * indicates the position of the last (least recent) outcome to include. For example, the
048     * default value of 3 means that if the outcomes produced so far by the classifier were [A, B,
049     * C, D], then the last outcome to be used as a feature would be B since and is considered the
050     * least recent.
051     */
052    @Config(mandatory = true, description = "Position of the least recent output to include.")
053    private int leastRecentOutcome;
054
055    /**
056     * when true indicates that bigrams of outcomes should be included as features
057     */
058    @Config(mandatory = true, description = "Use bigrams of the labels as features.")
059    private boolean useBigram;
060
061    /**
062     * indicates that trigrams of outcomes should be included as features
063     */
064    @Config(mandatory = true, description = "Use trigrams of the labels as features.")
065    private boolean useTrigram;
066
067    /**
068     * indicates that 4-grams of outcomes should be included as features
069     */
070    @Config(mandatory = true, description = "Use 4-grams of the labels as features.")
071    private boolean use4gram;
072
073    public DefaultFeatureExtractor() {
074        this(1, 3, true, true, false);
075    }
076
077    public DefaultFeatureExtractor(int mostRecentOutcome, int leastRecentOutcome, boolean useBigram, boolean useTrigram, boolean use4gram) {
078        this.mostRecentOutcome = mostRecentOutcome;
079        this.leastRecentOutcome = leastRecentOutcome;
080        this.useBigram = useBigram;
081        this.useTrigram = useTrigram;
082        this.use4gram = use4gram;
083    }
084
085    @Override
086    public String toString() {
087        return "DefaultFeatureExtractor(mostRecent=" + mostRecentOutcome + ",leastRecent=" + leastRecentOutcome + ",useBigram=" + useBigram + ",useTrigram=" + useTrigram + ",use4gram=" + use4gram + ")";
088    }
089
090    @Override
091    public List<Feature> extractFeatures(List<Label> previousOutcomes, double value) {
092        if (previousOutcomes == null || previousOutcomes.size() == 0) {
093            return Collections.emptyList();
094        }
095
096        List<Feature> features = new ArrayList<>();
097
098        for (int i = mostRecentOutcome; i <= leastRecentOutcome; i++) {
099            int index = previousOutcomes.size() - i;
100            if (index >= 0) {
101                Feature feature = new Feature("PreviousOutcome_L" + i + "_" + previousOutcomes.get(index).getLabel(), value);
102                features.add(feature);
103            }
104        }
105
106        if (useBigram && previousOutcomes.size() >= 2) {
107            int size = previousOutcomes.size();
108            String featureValue = previousOutcomes.get(size - 1).getLabel() + "_" + previousOutcomes.get(size - 2).getLabel();
109            Feature feature = new Feature("PreviousOutcomes_L1_2gram_L2R_" + featureValue, value);
110            features.add(feature);
111        }
112
113        if (useTrigram && previousOutcomes.size() >= 3) {
114            int size = previousOutcomes.size();
115            String featureValue = previousOutcomes.get(size - 1).getLabel() + "_" + previousOutcomes.get(size - 2).getLabel() + "_"
116                    + previousOutcomes.get(size - 3).getLabel();
117            Feature feature = new Feature("PreviousOutcomes_L1_3gram_L2R_" + featureValue, value);
118            features.add(feature);
119        }
120
121        if (use4gram && previousOutcomes.size() >= 4) {
122            int size = previousOutcomes.size();
123            String featureValue = previousOutcomes.get(size - 1).getLabel() + "_" + previousOutcomes.get(size - 2).getLabel() + "_"
124                    + previousOutcomes.get(size - 3).getLabel() + "_" + previousOutcomes.get(size - 4).getLabel();
125            Feature feature = new Feature("PreviousOutcomes_L1_4gram_L2R_" + featureValue, value);
126            features.add(feature);
127        }
128
129        return features;
130    }
131
132    @Override
133    public ConfiguredObjectProvenance getProvenance() {
134        return new ConfiguredObjectProvenanceImpl(this, "LabelFeatureExtractor");
135    }
136}