/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.classification.sequence.viterbi;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.Feature;
import org.tribuo.classification.Label;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A label feature extractor that produces several kinds of label-based features.
 * <p>
 * The options are: the most recent output, the least recent output, recent bigrams, recent trigrams, recent 4-grams.
 */
public class DefaultFeatureExtractor implements LabelFeatureExtractor {

    private static final long serialVersionUID = 1L;

    /**
     * Indicates the position of the first (most recent) outcome to include. For example, the
     * default value of 1 means that if the outcomes produced so far by the classifier were [A, B,
     * C, D], then the first outcome to be used as a feature would be D since it is the most recent.
     */
    @Config(mandatory = true, description = "Position of the most recent outcome to include.")
    private int mostRecentOutcome;

    /**
     * Indicates the position of the last (least recent) outcome to include. For example, the
     * default value of 3 means that if the outcomes produced so far by the classifier were [A, B,
     * C, D], then the last outcome to be used as a feature would be B since it is considered the
     * least recent.
     */
    @Config(mandatory = true, description = "Position of the least recent output to include.")
    private int leastRecentOutcome;

    /**
     * When true indicates that bigrams of outcomes should be included as features.
     */
    @Config(mandatory = true, description = "Use bigrams of the labels as features.")
    private boolean useBigram;

    /**
     * When true indicates that trigrams of outcomes should be included as features.
     */
    @Config(mandatory = true, description = "Use trigrams of the labels as features.")
    private boolean useTrigram;

    /**
     * When true indicates that 4-grams of outcomes should be included as features.
     */
    @Config(mandatory = true, description = "Use 4-grams of the labels as features.")
    private boolean use4gram;

    /**
     * Constructs a feature extractor with the defaults: outcomes at positions 1 through 3,
     * bigrams and trigrams enabled, 4-grams disabled.
     */
    public DefaultFeatureExtractor() {
        this(1, 3, true, true, false);
    }

    /**
     * Constructs a feature extractor with the supplied settings.
     *
     * @param mostRecentOutcome  Position of the most recent outcome to include (1 is the latest outcome).
     * @param leastRecentOutcome Position of the least recent outcome to include.
     * @param useBigram          Include a bigram of the two most recent outcomes.
     * @param useTrigram         Include a trigram of the three most recent outcomes.
     * @param use4gram           Include a 4-gram of the four most recent outcomes.
     */
    public DefaultFeatureExtractor(int mostRecentOutcome, int leastRecentOutcome, boolean useBigram, boolean useTrigram, boolean use4gram) {
        this.mostRecentOutcome = mostRecentOutcome;
        this.leastRecentOutcome = leastRecentOutcome;
        this.useBigram = useBigram;
        this.useTrigram = useTrigram;
        this.use4gram = use4gram;
    }

    @Override
    public String toString() {
        return "DefaultFeatureExtractor(mostRecent=" + mostRecentOutcome + ",leastRecent=" + leastRecentOutcome + ",useBigram=" + useBigram + ",useTrigram=" + useTrigram + ",use4gram=" + use4gram + ")";
    }

    /**
     * Builds features from the labels predicted so far.
     * <p>
     * One feature is emitted for each position between {@code mostRecentOutcome} and
     * {@code leastRecentOutcome} (inclusive) that exists in the supplied list, followed by the
     * bigram, trigram and 4-gram features when they are enabled and enough outcomes are available.
     * Positions that do not exist in the list (less than 1 or greater than the list size) are
     * skipped rather than causing an {@link IndexOutOfBoundsException}.
     *
     * @param previousOutcomes The labels produced so far, oldest first. May be null or empty.
     * @param value            The value assigned to every generated feature.
     * @return The generated features; an empty list if there are no previous outcomes.
     */
    @Override
    public List<Feature> extractFeatures(List<Label> previousOutcomes, double value) {
        if (previousOutcomes == null || previousOutcomes.isEmpty()) {
            return Collections.emptyList();
        }

        int size = previousOutcomes.size();
        List<Feature> features = new ArrayList<>();

        // Position i refers to element (size - i), so only positions in [1, size] are valid.
        // Clamping both ends keeps the index inside the list even for misconfigured bounds.
        int firstPosition = Math.max(mostRecentOutcome, 1);
        int lastPosition = Math.min(leastRecentOutcome, size);
        for (int i = firstPosition; i <= lastPosition; i++) {
            String label = previousOutcomes.get(size - i).getLabel();
            features.add(new Feature("PreviousOutcome_L" + i + "_" + label, value));
        }

        if (useBigram && size >= 2) {
            features.add(ngramFeature(previousOutcomes, 2, value));
        }

        if (useTrigram && size >= 3) {
            features.add(ngramFeature(previousOutcomes, 3, value));
        }

        if (use4gram && size >= 4) {
            features.add(ngramFeature(previousOutcomes, 4, value));
        }

        return features;
    }

    /**
     * Builds the n-gram feature over the {@code n} most recent outcomes, joining their labels
     * with underscores starting from the most recent one.
     *
     * @param previousOutcomes The labels produced so far; must contain at least {@code n} elements.
     * @param n                The n-gram length.
     * @param value            The feature value.
     * @return The n-gram feature.
     */
    private static Feature ngramFeature(List<Label> previousOutcomes, int n, double value) {
        int size = previousOutcomes.size();
        StringBuilder name = new StringBuilder("PreviousOutcomes_L1_").append(n).append("gram_L2R_");
        for (int offset = 1; offset <= n; offset++) {
            if (offset > 1) {
                name.append('_');
            }
            name.append(previousOutcomes.get(size - offset).getLabel());
        }
        return new Feature(name.toString(), value);
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "LabelFeatureExtractor");
    }
}