001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo; 018 019import com.oracle.labs.mlrg.olcut.provenance.ListProvenance; 020import org.tribuo.hash.HashedFeatureMap; 021import org.tribuo.hash.Hasher; 022import org.tribuo.provenance.DataProvenance; 023import org.tribuo.provenance.DatasetProvenance; 024import org.tribuo.util.Merger; 025 026import java.io.Serializable; 027import java.util.ArrayList; 028import java.util.List; 029import java.util.Set; 030import java.util.logging.Logger; 031 032/** 033 * This is a {@link Dataset} which has an {@link ImmutableFeatureMap} to store the feature information. 034 * Whenever an example is added to this dataset it removes features that do not exist in the {@link FeatureMap}. 035 * The dataset is immutable after construction (unless the examples are modified). 036 * <p> 037 * This class is mostly for performance optimisations inside the framework, and should not 038 * generally be used by external code. 039 */ 040public class ImmutableDataset<T extends Output<T>> extends Dataset<T> implements Serializable { 041 private static final long serialVersionUID = 1L; 042 043 private static final Logger logger = Logger.getLogger(ImmutableDataset.class.getName()); 044 045 /** 046 * Output information, and id numbers for outputs found in this dataset. 047 */ 048 protected ImmutableOutputInfo<T> outputIDInfo; 049 050 /** 051 * A map from feature names to IDs for the features found in this dataset. 052 */ 053 protected ImmutableFeatureMap featureIDMap; 054 055 /** 056 * If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it. 057 */ 058 protected final boolean dropInvalidExamples; 059 060 private DatasetProvenance provenance = null; 061 062 /** 063 * If you call this it's your job to setup outputMap, featureIDMap and fill it with examples. 064 * <p> 065 * Note: Sets dropInvalidExamples to false. 066 * 067 * @param description A description of the input data (including preprocessing steps). 068 * @param outputFactory The factory for this output type. 069 */ 070 protected ImmutableDataset(DataProvenance description, OutputFactory<T> outputFactory) { 071 super(description,outputFactory); 072 dropInvalidExamples = false; 073 } 074 075 /** 076 * Creates a dataset from a data source. It copies the feature and output maps 077 * from the supplied model. 078 * @param dataSource The examples. 079 * @param model A model to extract feature and output maps from. 080 * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it. 081 */ 082 public ImmutableDataset(DataSource<T> dataSource, Model<T> model, boolean dropInvalidExamples) { 083 this(dataSource,dataSource.getProvenance(),dataSource.getOutputFactory(),model.getFeatureIDMap(),model.getOutputIDInfo(),dropInvalidExamples); 084 } 085 086 /** 087 * Creates a dataset from a data source. Creates immutable feature and output maps from the 088 * supplied ones. 089 * @param dataSource The examples. 090 * @param featureIDMap The feature map. 091 * @param outputIDInfo The output map. 092 * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it. 093 */ 094 public ImmutableDataset(DataSource<T> dataSource, FeatureMap featureIDMap, OutputInfo<T> outputIDInfo, boolean dropInvalidExamples) { 095 this(dataSource,dataSource.getProvenance(),dataSource.getOutputFactory(),featureIDMap,outputIDInfo, dropInvalidExamples); 096 } 097 098 /** 099 * Creates a dataset from a data source. Creates immutable feature and output maps from the 100 * supplied ones. 101 * @param dataSource The examples. 102 * @param description A description of the input data (including preprocessing steps). 103 * @param outputFactory The output factory. 104 * @param featureIDMap The feature id map, used to remove unknown features. 105 * @param outputIDInfo The output id map. 106 * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it. 107 */ 108 public ImmutableDataset(Iterable<Example<T>> dataSource, DataProvenance description, OutputFactory<T> outputFactory, FeatureMap featureIDMap, OutputInfo<T> outputIDInfo, boolean dropInvalidExamples) { 109 this(dataSource,description, outputFactory, new ImmutableFeatureMap(featureIDMap), outputIDInfo.generateImmutableOutputInfo(), dropInvalidExamples); 110 } 111 112 /** 113 * Creates a dataset from a data source. 114 * @param dataSource The examples. 115 * @param description A description of the input data (including preprocessing steps). 116 * @param outputFactory The factory for this output type. 117 * @param featureIDMap The feature id map, used to remove unknown features. 118 * @param outputIDInfo The output id map. 119 * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it. 120 */ 121 public ImmutableDataset(Iterable<Example<T>> dataSource, DataProvenance description, OutputFactory<T> outputFactory, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo, boolean dropInvalidExamples) { 122 super(description,outputFactory); 123 this.featureIDMap = featureIDMap; 124 this.outputIDInfo = outputIDInfo; 125 this.dropInvalidExamples = dropInvalidExamples; 126 127 for (Example<T> ex : dataSource) { 128 add(ex); 129 } 130 } 131 132 /** 133 * This is dangerous, and should not be used unless you've overridden everything in ImmutableDataset. 134 * <p> 135 * Note: Sets dropInvalidExamples to false. 136 * 137 * @param description A description of the data you're going to add to this dataset. 138 * @param outputFactory The factory for this output type. 139 * @param featureIDMap The feature id map, used to remove unknown features. 140 * @param outputIDInfo The output id map. 141 */ 142 protected ImmutableDataset(DataProvenance description, OutputFactory<T> outputFactory, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo) { 143 super(description,outputFactory); 144 this.featureIDMap = featureIDMap; 145 this.outputIDInfo = outputIDInfo; 146 this.dropInvalidExamples = false; 147 } 148 149 /** 150 * Adds an {@link Example} to the dataset, which will remove features with 151 * unknown names. 152 * @param ex An {@link Example} to add to the dataset. 153 */ 154 protected void add(Example<T> ex) { 155 if (!ex.validateExample()) { 156 if (dropInvalidExamples) { 157 logger.severe("Dropping invalid Example: " + ex.toString()); 158 return; 159 } else { 160 throw new IllegalArgumentException("Example had duplicate features, invalid features or no features."); 161 } 162 } 163 innerAdd(ex); 164 } 165 166 /** 167 * Adds an {@link Example} to the dataset. Use only 168 * when the example has already been validated. 169 * @param ex An {@link Example} to add to the dataset. 170 */ 171 private void unsafeAdd(Example<T> ex) { 172 data.add(ex); 173 } 174 175 /** 176 * Adds a {@link Example} to the dataset, which will insert feature ids, 177 * remove unknown features and sort the examples by the feature ids (merging duplicate ids). 178 * @param ex The example to add. 179 * @param merger The {@link Merger} to use. 180 */ 181 protected void add(Example<T> ex, Merger merger) { 182 ex.reduceByName(merger); 183 innerAdd(ex); 184 } 185 186 private void innerAdd(Example<T> ex) { 187 // 188 // Find and remove features that aren't in the feature domain of this dataset. 189 List<Feature> featuresToRemove = new ArrayList<>(); 190 for (Feature f : ex) { 191 VariableInfo info = featureIDMap.get(f.getName()); 192 if (info == null) { 193 featuresToRemove.add(f); 194 } 195 } 196 ex.removeFeatures(featuresToRemove); 197 // 198 // Handle case where Example is empty after removing out-of-domain features. 199 if (ex.size() == 0) { 200 if (dropInvalidExamples) { 201 logger.severe("Dropping invalid Example: " + ex.toString() + ", invalid features - " + featuresToRemove); 202 } else { 203 throw new IllegalArgumentException("This Dataset does not know any of the Features in this Example."); 204 } 205 } else { 206 ex.canonicalize(featureIDMap); 207 data.add(ex); 208 } 209 } 210 211 @Override 212 public Set<T> getOutputs() { 213 return outputIDInfo.getDomain(); 214 } 215 216 @Override 217 public ImmutableFeatureMap getFeatureIDMap() { 218 return featureIDMap; 219 } 220 221 @Override 222 public ImmutableFeatureMap getFeatureMap() { 223 return featureIDMap; 224 } 225 226 @Override 227 public ImmutableOutputInfo<T> getOutputIDInfo() { 228 return outputIDInfo; 229 } 230 231 @Override 232 public ImmutableOutputInfo<T> getOutputInfo() { 233 return outputIDInfo; 234 } 235 236 /** 237 * Returns true if this immutable dataset dropped any invalid examples on construction. 238 * @return True if it drops invalid examples. 239 */ 240 public boolean getDropInvalidExamples() { 241 return dropInvalidExamples; 242 } 243 244 @Override 245 public String toString() { 246 return String.format("ImmutableDataset(source=%s,dropInvalidExamples=%b)", sourceProvenance, dropInvalidExamples); 247 } 248 249 @Override 250 public synchronized DatasetProvenance getProvenance() { 251 if (provenance == null) { 252 provenance = cacheProvenance(); 253 } 254 return provenance; 255 } 256 257 /** 258 * Computes the DatasetProvenance. 259 * @return A new dataset provenance. 260 */ 261 private DatasetProvenance cacheProvenance() { 262 return new DatasetProvenance(sourceProvenance,new ListProvenance<>(),this); 263 } 264 265 /** 266 * Creates an immutable deep copy of the supplied dataset. 267 * @param dataset The dataset to copy. 268 * @param <T> The type of output. 269 * @return An immutable copy of the dataset. 270 */ 271 public static <T extends Output<T>> ImmutableDataset<T> copyDataset(Dataset<T> dataset) { 272 ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,dataset.getFeatureIDMap(),dataset.getOutputIDInfo()); 273 for (Example<T> e : dataset) { 274 copy.unsafeAdd(e.copy()); 275 } 276 return copy; 277 } 278 279 /** 280 * Creates an immutable deep copy of the supplied dataset, using a different feature and output map. 281 * @param dataset The dataset to copy. 282 * @param featureIDMap The new feature map to use. Removes features which are not found in this map. 283 * @param outputIDInfo The new output info to use. 284 * @param <T> The type of output. 285 * @return An immutable copy of the dataset. 286 */ 287 public static <T extends Output<T>> ImmutableDataset<T> copyDataset(Dataset<T> dataset, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo) { 288 ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,featureIDMap,outputIDInfo); 289 for (Example<T> e : dataset) { 290 copy.add(e.copy()); 291 } 292 return copy; 293 } 294 295 /** 296 * Creates an immutable deep copy of the supplied dataset. 297 * @param dataset The dataset to copy. 298 * @param featureIDMap The new feature map to use. Removes features which are not found in this map. 299 * @param outputIDInfo The new output info to use. 300 * @param merger The merge function to use to reduce features given new ids. 301 * @param <T> The type of output. 302 * @return An immutable copy of the dataset. 303 */ 304 public static <T extends Output<T>> ImmutableDataset<T> copyDataset(Dataset<T> dataset, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo, Merger merger) { 305 ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,featureIDMap,outputIDInfo); 306 for (Example<T> e : dataset) { 307 copy.add(e.copy(),merger); 308 } 309 return copy; 310 } 311 312 /** 313 * Creates an immutable shallow copy of the supplied dataset, using the hasher to generate a 314 * {@link HashedFeatureMap} which transparently maps from the feature name to the hashed variant. 315 * @param dataset The dataset to copy. 316 * @param hasher The hashing function to use. 317 * @param <T> The type of output. 318 * @return An immutable copy of the dataset. 319 */ 320 public static <T extends Output<T>> ImmutableDataset<T> hashFeatureMap(Dataset<T> dataset, Hasher hasher) { 321 ImmutableFeatureMap featureIDMap = HashedFeatureMap.generateHashedFeatureMap(dataset.getFeatureMap(),hasher); 322 ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,featureIDMap,dataset.getOutputIDInfo()); 323 for (Example<T> e : dataset) { 324 copy.unsafeAdd(e); 325 } 326 return copy; 327 } 328}