/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo;

import com.oracle.labs.mlrg.olcut.provenance.ListProvenance;
import com.oracle.labs.mlrg.olcut.provenance.ObjectProvenance;
import org.tribuo.provenance.DataProvenance;
import org.tribuo.provenance.DatasetProvenance;
import org.tribuo.transform.TransformerMap;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A MutableDataset is a {@link Dataset} with a {@link MutableFeatureMap} which grows over time.
 * Whenever an {@link Example} is added to the dataset it observes each feature and output
 * keeping appropriate statistics in the {@link FeatureMap} and {@link OutputInfo}.
 *
 * @param <T> The output type.
 */
public class MutableDataset<T extends Output<T>> extends Dataset<T> {
    private static final long serialVersionUID = 1L;

    /**
     * Information about the outputs in this dataset.
     */
    protected final MutableOutputInfo<T> outputMap;

    /**
     * A map from feature names to feature info objects.
     */
    protected final MutableFeatureMap featureMap;

    /**
     * The provenances of the transformations applied to this dataset.
     */
    protected final List<ObjectProvenance> transformProvenances = new ArrayList<>();

    /**
     * Denotes if this dataset contains implicit zeros or not.
     * <p>
     * Set to true by {@link #densify()}, and reset to false by
     * {@link #add(Example)} and {@link #clear()}.
     */
    protected boolean dense = false;

    /**
     * Creates an empty dataset.
     * @param sourceProvenance A description of the input data, including preprocessing steps.
     * @param outputFactory The output factory.
     */
    public MutableDataset(DataProvenance sourceProvenance, OutputFactory<T> outputFactory) {
        super(sourceProvenance, outputFactory);
        this.featureMap = new MutableFeatureMap();
        this.outputMap = outputFactory.generateInfo();
    }

    /**
     * Creates a dataset from a data source. This method will create the output
     * and feature maps that are needed for training and evaluating classifiers.
     * <p>
     * Each example is inserted via {@link #add(Example)}, so an invalid example
     * causes that method's {@link IllegalArgumentException} to propagate.
     * @param dataSource The examples.
     * @param provenance A description of the input data, including preprocessing steps.
     * @param outputFactory The output factory.
     */
    public MutableDataset(Iterable<Example<T>> dataSource, DataProvenance provenance, OutputFactory<T> outputFactory) {
        super(provenance, outputFactory);
        this.featureMap = new MutableFeatureMap();
        this.outputMap = outputFactory.generateInfo();
        // Both maps must be assigned before the first add call, as add writes to them.
        for (Example<T> ex : dataSource) {
            add(ex);
        }
    }

    /**
     * Creates a dataset from a data source. This method creates the output and feature maps
     * needed for training and evaluating classifiers.
     * <p>
     * The provenance and output factory are taken from the data source itself.
     * @param dataSource The examples.
     */
    public MutableDataset(DataSource<T> dataSource) {
        this(dataSource, dataSource.getProvenance(), dataSource.getOutputFactory());
    }

    /**
     * Adds an example to the dataset, which observes the output and each feature value.
     * <p>
     * It also canonicalises the reference to each feature's name (i.e., replacing the reference
     * to a feature's name with the canonical one stored in this Dataset's {@link VariableInfo}).
     * This greatly reduces the memory footprint.
     * <p>
     * Adding an example always marks the dataset as not dense.
     * @param ex The example to add.
     * @throws IllegalArgumentException If {@link Example#validateExample()} returns false for the example.
     */
    public void add(Example<T> ex) {
        // Validate before touching any state so a rejected example leaves the dataset unchanged.
        if (!ex.validateExample()) {
            throw new IllegalArgumentException("Example had duplicate features, invalid features or no features.");
        }
        outputMap.observe(ex.getOutput());
        data.add(ex);
        for (Feature f : ex) {
            featureMap.add(f.getName(), f.getValue());
        }
        // Canonicalise after the feature map has seen every feature in this example.
        ex.canonicalize(featureMap);
        dense = false;
    }

    /**
     * Adds all the Examples in the supplied collection to this dataset.
     * <p>
     * Examples are added one at a time via {@link #add(Example)}, in the collection's iteration order.
     * @param collection The collection of Examples.
     */
    public void addAll(Collection<? extends Example<T>> collection) {
        for (Example<T> e : collection) {
            add(e);
        }
    }

    /**
     * Sets the weights in each example according to their output.
     * <p>
     * Examples whose output has no entry in the map (or maps to null) are given weight 1.0.
     * @param weights A map of {@link Output}s to float weights.
     */
    public void setWeights(Map<T,Float> weights) {
        for (Example<T> e : this) {
            Float weight = weights.get(e.getOutput());
            // Explicit null check rather than getOrDefault, so a key mapped to null also falls back to 1.0.
            if (weight != null) {
                e.setWeight(weight);
            } else {
                e.setWeight(1.0f);
            }
        }
    }

    /**
     * Gets the set of possible outputs in this dataset.
     * <p>
     * In the case of regression returns a Set containing dimension names.
     * @return The set of possible outputs.
     */
    @Override
    public Set<T> getOutputs() {
        return outputMap.getDomain();
    }

    /**
     * Builds a new {@link ImmutableFeatureMap} from the current contents of the feature map.
     * @return A freshly constructed immutable feature map.
     */
    @Override
    public ImmutableFeatureMap getFeatureIDMap() {
        return new ImmutableFeatureMap(featureMap);
    }

    /**
     * Returns the feature map held by this dataset (the same instance, not a copy).
     * @return The mutable feature map.
     */
    @Override
    public MutableFeatureMap getFeatureMap() {
        return featureMap;
    }

    /**
     * Generates an immutable output info from the current output statistics.
     * @return The immutable output info.
     */
    @Override
    public ImmutableOutputInfo<T> getOutputIDInfo() {
        return outputMap.generateImmutableOutputInfo();
    }

    /**
     * Returns the output info held by this dataset (the same instance, not a copy).
     * @return The output info.
     */
    @Override
    public OutputInfo<T> getOutputInfo() {
        return outputMap;
    }

    /**
     * Describes the dataset by its source provenance, dense flag, and, if any
     * transformations have been applied, their provenances.
     * @return A String description of this dataset.
     */
    @Override
    public String toString() {
        if (transformProvenances.isEmpty()) {
            return "MutableDataset(source=" + sourceProvenance + ",isDense=" + dense + ")";
        } else {
            return "MutableDataset(source=" + sourceProvenance + ",isDense=" + dense + ",transforms=" + transformProvenances + ")";
        }
    }

    /**
     * Is the dataset dense (i.e., do all features in the domain have a value in each example).
     * @return True if the dataset is dense.
     */
    public boolean isDense() {
        return dense;
    }

    /**
     * Applies all the transformations from the {@link TransformerMap} to this dataset.
     * <p>
     * The feature map is cleared and rebuilt from the transformed examples, and the
     * transformer map's provenance is appended to the transform provenances.
     * The output info is not modified.
     * @param transformerMap The transformations to apply.
     */
    public void transform(TransformerMap transformerMap) {
        // Feature statistics describe the pre-transform values, so rebuild them from scratch.
        featureMap.clear();
        for (Example<T> example : data) {
            example.transform(transformerMap);
            for (Feature f : example) {
                featureMap.add(f.getName(), f.getValue());
            }
        }
        transformProvenances.add(transformerMap.getProvenance());
    }

    /**
     * Iterates through the examples, converting implicit zeros into explicit zeros.
     * <p>
     * Each example is densified against the sorted list of feature names currently
     * in the feature map, after which the dataset is marked as dense.
     */
    public void densify() {
        List<String> featureNames = new ArrayList<>(featureMap.keySet());
        Collections.sort(featureNames);
        for (Example<T> example : data) {
            example.densify(featureNames);
        }
        dense = true;
    }

    /**
     * Clears all the examples out of this dataset, and flushes the FeatureMap, OutputInfo, and transform provenances.
     * <p>
     * Also resets the dense flag, so a cleared dataset reports the same state as a newly constructed empty one.
     */
    public void clear() {
        outputMap.clear();
        featureMap.clear();
        data.clear();
        transformProvenances.clear();
        // A previous densify() must not leave an empty dataset flagged as dense.
        dense = false;
    }

    /**
     * Builds the provenance of this dataset from the source provenance and the
     * provenances of the transformations applied so far.
     * @return The dataset provenance.
     */
    @Override
    public DatasetProvenance getProvenance() {
        return new DatasetProvenance(sourceProvenance, new ListProvenance<>(transformProvenances), this);
    }

    /**
     * Creates a deep copy of the supplied {@link Dataset} which is mutable.
     * <p>
     * Copies the individual examples using their copy method.
     * <p>
     * The copy uses the other dataset's provenance as its source provenance, and
     * because the examples are inserted via {@link #add(Example)} the copy is
     * never marked as dense.
     * @param other The dataset to copy.
     * @param <T> The output type.
     * @return A mutable deep copy of the dataset.
     */
    public static <T extends Output<T>> MutableDataset<T> createDeepCopy(Dataset<T> other) {
        MutableDataset<T> copy = new MutableDataset<>(other.getProvenance(), other.outputFactory);

        for (Example<T> e : other) {
            copy.add(e.copy());
        }

        return copy;
    }
}