001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.sequence; 018 019import org.tribuo.Example; 020import org.tribuo.Feature; 021import org.tribuo.FeatureMap; 022import org.tribuo.Output; 023import org.tribuo.OutputFactory; 024import org.tribuo.hash.HashedFeatureMap; 025import org.tribuo.impl.ArrayExample; 026import org.tribuo.impl.BinaryFeaturesExample; 027import org.tribuo.util.Merger; 028 029import java.io.Serializable; 030import java.util.ArrayList; 031import java.util.Iterator; 032import java.util.List; 033import java.util.logging.Logger; 034 035/** 036 * A sequence of examples, used for sequence classification. 037 */ 038public class SequenceExample<T extends Output<T>> implements Iterable<Example<T>>, Serializable { 039 private static final long serialVersionUID = 1L; 040 041 private static final Logger logger = Logger.getLogger(SequenceExample.class.getName()); 042 043 public static final float DEFAULT_WEIGHT = 1.0f; 044 045 private final List<Example<T>> examples; 046 private float weight = 1.0f; 047 048 /** 049 * Creates an empty sequence example. 050 */ 051 public SequenceExample() { 052 this(new ArrayList<>()); 053 } 054 055 /** 056 * Creates a sequence example from the list of examples. 057 * <p> 058 * The examples are not copied by this method. 059 * @param examples The examples to incorporate. 060 */ 061 public SequenceExample(List<Example<T>> examples) { 062 this(examples,DEFAULT_WEIGHT); 063 } 064 065 /** 066 * Creates a sequence example from the list of examples, setting the weight. 067 * <p> 068 * The examples are encapsulated by this constructor, not copied. 069 * @param examples The examples to incorporate. 070 * @param weight The weight of this sequence. 071 */ 072 public SequenceExample(List<Example<T>> examples, float weight) { 073 this.examples = examples; 074 this.weight = weight; 075 } 076 077 /** 078 * Creates a sequence example from the supplied outputs and list of list of features. 079 * <p> 080 * The features are copied out by this constructor. The outputs and features lists 081 * must be of the same length. Sets the weight to {@link SequenceExample#DEFAULT_WEIGHT}. 082 * @param outputs The outputs for each sequence element. 083 * @param features The features for each sequence element. 084 */ 085 public SequenceExample(List<T> outputs, List<? extends List<? extends Feature>> features) { 086 this(outputs,features,DEFAULT_WEIGHT); 087 } 088 089 /** 090 * Creates a sequence example from the supplied weight, outputs and list of list of features. 091 * <p> 092 * The features are copied out by this constructor. The outputs and features lists 093 * must be of the same length. 094 * @param outputs The outputs for each sequence element. 095 * @param features The features for each sequence element. 096 * @param weight The weight for this sequence example. 097 */ 098 public SequenceExample(List<T> outputs, List<? extends List<? extends Feature>> features, float weight) { 099 this(outputs, features, weight, false); 100 } 101 102 public SequenceExample(List<T> outputs, List<? extends List<? extends Feature>> features, boolean attemptBinaryFeatures) { 103 this(outputs, features, DEFAULT_WEIGHT, attemptBinaryFeatures); 104 } 105 106 public SequenceExample(List<T> outputs, List<? extends List<? extends Feature>> features, float weight, boolean attemptBinaryFeatures) { 107 if (outputs.size() != features.size()) { 108 throw new IllegalArgumentException("outputs.size() = " + outputs.size() + ", features.size() = " + features.size()); 109 } 110 111 List<Example<T>> examples = new ArrayList<>(outputs.size()); 112 113 for (int i = 0; i < outputs.size(); i++) { 114 List<? extends Feature> list = features.get(i); 115 Example<T> example = null; 116 if(attemptBinaryFeatures){ 117 try { 118 example = new BinaryFeaturesExample<>(outputs.get(i), list); 119 } catch(IllegalArgumentException iae){ 120 logger.finer("attempted to create BinaryFeaturesExample but not all of the features were binary"); 121 example = new ArrayExample<>(outputs.get(i), list); 122 } 123 } else { 124 example = new ArrayExample<>(outputs.get(i), list); 125 } 126 examples.add(example); 127 } 128 129 this.examples = examples; 130 this.weight = weight; 131 } 132 133 /** 134 * Creates a deep copy of the supplied sequence example. 135 * @param other The sequence example to copy. 136 */ 137 public SequenceExample(SequenceExample<T> other) { 138 this.examples = new ArrayList<>(other.size()); 139 for(Example<T> example : other) { 140 examples.add(example.copy()); 141 } 142 this.weight = other.weight; 143 } 144 145 /** 146 * Return how many examples are in this sequence. 147 * @return The number of examples. 148 */ 149 public int size() { 150 return examples.size(); 151 } 152 153 /** 154 * Removes the features in the supplied list from each example contained in this sequence. 155 * @param features The features to remove. 156 */ 157 public void removeFeatures(List<Feature> features) { 158 for (Example<T> e : examples) { 159 e.removeFeatures(features); 160 } 161 } 162 163 /** 164 * Gets the example found at the specified index. 165 * @param i The index to lookup. 166 * @return The {@link Example} for index i. 167 */ 168 public Example<T> get(int i) { 169 return examples.get(i); 170 } 171 172 /** 173 * Checks that each {@link Example} in this sequence is valid. 174 * @return True if each {@link Example} is valid, false otherwise. 175 */ 176 public boolean validateExample() { 177 if (examples.isEmpty()) { 178 return false; 179 } else { 180 boolean valid = true; 181 for (Example<T> e : examples) { 182 valid &= e.validateExample(); 183 } 184 return valid; 185 } 186 } 187 188 /** 189 * Reduces the features in each example using the supplied {@link Merger}. 190 * @param merger The merger to use in the reduction. 191 */ 192 public void reduceByName(Merger merger) { 193 for (Example<T> e : examples) { 194 e.reduceByName(merger); 195 } 196 } 197 198 /** 199 * Sets the weight of this sequence. 200 * @param weight The new weight. 201 */ 202 public void setWeight(float weight) { 203 this.weight = weight; 204 } 205 206 /** 207 * Gets the weight of this sequence. 208 * @return The weight of this sequence. 209 */ 210 public float getWeight() { 211 return weight; 212 } 213 214 /** 215 * Adds an {@link Example} to this sequence. 216 * @param e The example to add. 217 */ 218 public void addExample(Example<T> e) { 219 examples.add(e); 220 } 221 222 /** 223 * Returns a deep copy of this SequenceExample. 224 * @return A deep copy. 225 */ 226 public SequenceExample<T> copy() { 227 return new SequenceExample<>(this); 228 } 229 230 @Override 231 public Iterator<Example<T>> iterator() { 232 return examples.iterator(); 233 } 234 235 /** 236 * Creates an iterator over every feature in this sequence. 237 * @return An iterator over features. 238 */ 239 public Iterator<Feature> featureIterator() { 240 return new FeatureIterator<>(iterator()); 241 } 242 243 /** 244 * Reassigns feature name Strings in each Example inside this SequenceExample to point to 245 * those in the {@link FeatureMap}. This significantly reduces memory allocation. It is called 246 * when a SequenceExample is added to a {@link MutableSequenceDataset}, and should not be 247 * called outside of that context as it may interact unexpectedly with 248 * {@link HashedFeatureMap}. 249 * @param featureMap The feature map containing canonical feature names. 250 */ 251 public void canonicalise(FeatureMap featureMap) { 252 for (Example<T> e : examples) { 253 e.canonicalize(featureMap); 254 } 255 } 256 257 /** 258 * Creates a SequenceExample using {@link OutputFactory#getUnknownOutput()} as the output for each 259 * sequence element. 260 * <p> 261 * Note: this method is used to create SequenceExamples at prediction time where there is no 262 * ground truth {@link Output}. 263 * @param features The features for each sequence element. 264 * @param outputFactory The output factory to use. 265 * @param <T> The type of the {@link Output}. 266 * @return A new SequenceExample. 267 */ 268 public static <T extends Output<T>> SequenceExample<T> createWithEmptyOutputs(List<? extends List<? extends Feature>> features, OutputFactory<T> outputFactory) { 269 ArrayList<Example<T>> examples = new ArrayList<>(features.size()); 270 271 for (List<? extends Feature> list : features) { 272 ArrayExample<T> example = new ArrayExample<>(outputFactory.getUnknownOutput()); 273 example.addAll(list); 274 examples.add(example); 275 } 276 277 return new SequenceExample<>(examples); 278 } 279 280 private static class FeatureIterator<T extends Output<T>> implements Iterator<Feature> { 281 private final Iterator<Example<T>> itr; 282 private Iterator<Feature> featureItr; 283 284 public FeatureIterator(Iterator<Example<T>> e) { 285 itr = e; 286 } 287 288 @Override 289 public boolean hasNext() { 290 if ((featureItr != null) && (featureItr.hasNext())) { 291 return true; 292 } else if (itr.hasNext()) { 293 while (itr.hasNext()) { 294 featureItr = itr.next().iterator(); 295 if (featureItr.hasNext()) { 296 return true; 297 } 298 } 299 return false; 300 } else { 301 return false; 302 } 303 } 304 305 @Override 306 public Feature next() { 307 if (featureItr != null) { 308 return featureItr.next(); 309 } else { 310 return null; 311 } 312 } 313 } 314} 315