001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo;
018
019import com.oracle.labs.mlrg.olcut.provenance.ListProvenance;
020import org.tribuo.hash.HashedFeatureMap;
021import org.tribuo.hash.Hasher;
022import org.tribuo.provenance.DataProvenance;
023import org.tribuo.provenance.DatasetProvenance;
024import org.tribuo.util.Merger;
025
026import java.io.Serializable;
027import java.util.ArrayList;
028import java.util.List;
029import java.util.Set;
030import java.util.logging.Logger;
031
032/**
033 * This is a {@link Dataset} which has an {@link ImmutableFeatureMap} to store the feature information.
034 * Whenever an example is added to this dataset it removes features that do not exist in the {@link FeatureMap}.
035 * The dataset is immutable after construction (unless the examples are modified).
036 * <p>
037 * This class is mostly for performance optimisations inside the framework, and should not
038 * generally be used by external code.
039 */
040public class ImmutableDataset<T extends Output<T>> extends Dataset<T> implements Serializable {
041    private static final long serialVersionUID = 1L;
042
043    private static final Logger logger = Logger.getLogger(ImmutableDataset.class.getName());
044    
045    /**
046     * Output information, and id numbers for outputs found in this dataset.
047     */
048    protected ImmutableOutputInfo<T> outputIDInfo;
049
050    /**
051     * A map from feature names to IDs for the features found in this dataset.
052     */
053    protected ImmutableFeatureMap featureIDMap;
054
055    /**
056     * If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it.
057     */
058    protected final boolean dropInvalidExamples;
059
060    private DatasetProvenance provenance = null;
061
062    /**
063     * If you call this it's your job to setup outputMap, featureIDMap and fill it with examples.
064     * <p>
065     * Note: Sets dropInvalidExamples to false.
066     *
067     * @param description A description of the input data (including preprocessing steps).
068     * @param outputFactory The factory for this output type.
069     */
070    protected ImmutableDataset(DataProvenance description, OutputFactory<T> outputFactory) {
071        super(description,outputFactory);
072        dropInvalidExamples = false;
073    }
074
075    /**
076     * Creates a dataset from a data source. It copies the feature and output maps
077     * from the supplied model.
078     * @param dataSource The examples.
079     * @param model A model to extract feature and output maps from.
080     * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it.
081     */
082    public ImmutableDataset(DataSource<T> dataSource, Model<T> model, boolean dropInvalidExamples) {
083        this(dataSource,dataSource.getProvenance(),dataSource.getOutputFactory(),model.getFeatureIDMap(),model.getOutputIDInfo(),dropInvalidExamples);
084    }
085
086    /**
087     * Creates a dataset from a data source. Creates immutable feature and output maps from the
088     * supplied ones.
089     * @param dataSource The examples.
090     * @param featureIDMap The feature map.
091     * @param outputIDInfo The output map.
092     * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it.
093     */
094    public ImmutableDataset(DataSource<T> dataSource, FeatureMap featureIDMap, OutputInfo<T> outputIDInfo, boolean dropInvalidExamples) {
095        this(dataSource,dataSource.getProvenance(),dataSource.getOutputFactory(),featureIDMap,outputIDInfo, dropInvalidExamples);
096    }
097
098    /**
099     * Creates a dataset from a data source. Creates immutable feature and output maps from the
100     * supplied ones.
101     * @param dataSource The examples.
102     * @param description A description of the input data (including preprocessing steps).
103     * @param outputFactory The output factory.
104     * @param featureIDMap The feature id map, used to remove unknown features.
105     * @param outputIDInfo The output id map.
106     * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it.
107     */
108    public ImmutableDataset(Iterable<Example<T>> dataSource, DataProvenance description, OutputFactory<T> outputFactory, FeatureMap featureIDMap, OutputInfo<T> outputIDInfo, boolean dropInvalidExamples) {
109        this(dataSource,description, outputFactory, new ImmutableFeatureMap(featureIDMap), outputIDInfo.generateImmutableOutputInfo(), dropInvalidExamples);
110    }
111
112    /**
113     * Creates a dataset from a data source.
114     * @param dataSource The examples.
115     * @param description A description of the input data (including preprocessing steps).
116     * @param outputFactory The factory for this output type.
117     * @param featureIDMap The feature id map, used to remove unknown features.
118     * @param outputIDInfo The output id map.
119     * @param dropInvalidExamples If true, instead of throwing an exception when an invalid {@link Example} is encountered, this Dataset will log a warning and drop it.
120     */
121    public ImmutableDataset(Iterable<Example<T>> dataSource, DataProvenance description, OutputFactory<T> outputFactory, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo, boolean dropInvalidExamples) {
122        super(description,outputFactory);
123        this.featureIDMap = featureIDMap;
124        this.outputIDInfo = outputIDInfo;
125        this.dropInvalidExamples = dropInvalidExamples;
126
127        for (Example<T> ex : dataSource) {
128            add(ex);
129        }
130    }
131
132    /**
133     * This is dangerous, and should not be used unless you've overridden everything in ImmutableDataset.
134     * <p>
135     * Note: Sets dropInvalidExamples to false.
136     *
137     * @param description A description of the data you're going to add to this dataset.
138     * @param outputFactory The factory for this output type.
139     * @param featureIDMap The feature id map, used to remove unknown features.
140     * @param outputIDInfo The output id map.
141     */
142    protected ImmutableDataset(DataProvenance description, OutputFactory<T> outputFactory, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo) {
143        super(description,outputFactory);
144        this.featureIDMap = featureIDMap;
145        this.outputIDInfo = outputIDInfo;
146        this.dropInvalidExamples = false;
147    }
148
149    /**
150     * Adds an {@link Example} to the dataset, which will remove features with
151     * unknown names.
152     * @param ex An {@link Example} to add to the dataset.
153     */
154    protected void add(Example<T> ex) {
155        if (!ex.validateExample()) {
156            if (dropInvalidExamples) {
157                logger.severe("Dropping invalid Example: " + ex.toString());
158                return;
159            } else {
160                throw new IllegalArgumentException("Example had duplicate features, invalid features or no features.");
161            }
162        }
163        innerAdd(ex);
164    }
165
166    /**
167     * Adds an {@link Example} to the dataset. Use only
168     * when the example has already been validated.
169     * @param ex An {@link Example} to add to the dataset.
170     */
171    private void unsafeAdd(Example<T> ex) {
172        data.add(ex);
173    }
174
175    /**
176     * Adds a {@link Example} to the dataset, which will insert feature ids,
177     * remove unknown features and sort the examples by the feature ids (merging duplicate ids).
178     * @param ex The example to add.
179     * @param merger The {@link Merger} to use.
180     */
181    protected void add(Example<T> ex, Merger merger) {
182        ex.reduceByName(merger);
183        innerAdd(ex);
184    }
185
186    private void innerAdd(Example<T> ex) {
187        //
188        // Find and remove features that aren't in the feature domain of this dataset.
189        List<Feature> featuresToRemove = new ArrayList<>();
190        for (Feature f : ex) {
191            VariableInfo info = featureIDMap.get(f.getName());
192            if (info == null) {
193                featuresToRemove.add(f);
194            }
195        }
196        ex.removeFeatures(featuresToRemove);
197        //
198        // Handle case where Example is empty after removing out-of-domain features.
199        if (ex.size() == 0) {
200            if (dropInvalidExamples) {
201                logger.severe("Dropping invalid Example: " + ex.toString() + ", invalid features - " + featuresToRemove);
202            } else {
203                throw new IllegalArgumentException("This Dataset does not know any of the Features in this Example.");
204            }
205        } else {
206            ex.canonicalize(featureIDMap);
207            data.add(ex);
208        }
209    }
210
211    @Override
212    public Set<T> getOutputs() {
213        return outputIDInfo.getDomain();
214    }
215
216    @Override
217    public ImmutableFeatureMap getFeatureIDMap() {
218        return featureIDMap;
219    }
220
221    @Override
222    public ImmutableFeatureMap getFeatureMap() {
223        return featureIDMap;
224    }
225
226    @Override
227    public ImmutableOutputInfo<T> getOutputIDInfo() {
228        return outputIDInfo;
229    }
230
231    @Override
232    public ImmutableOutputInfo<T> getOutputInfo() {
233        return outputIDInfo;
234    }
235
236    /**
237     * Returns true if this immutable dataset dropped any invalid examples on construction.
238     * @return True if it drops invalid examples.
239     */
240    public boolean getDropInvalidExamples() {
241        return dropInvalidExamples;
242    }
243
244    @Override
245    public String toString() {
246        return String.format("ImmutableDataset(source=%s,dropInvalidExamples=%b)", sourceProvenance, dropInvalidExamples);
247    }
248
249    @Override
250    public synchronized DatasetProvenance getProvenance() {
251        if (provenance == null) {
252            provenance = cacheProvenance();
253        }
254        return provenance;
255    }
256
257    /**
258     * Computes the DatasetProvenance.
259     * @return A new dataset provenance.
260     */
261    private DatasetProvenance cacheProvenance() {
262        return new DatasetProvenance(sourceProvenance,new ListProvenance<>(),this);
263    }
264
265    /**
266     * Creates an immutable deep copy of the supplied dataset.
267     * @param dataset The dataset to copy.
268     * @param <T> The type of output.
269     * @return An immutable copy of the dataset.
270     */
271    public static <T extends Output<T>> ImmutableDataset<T> copyDataset(Dataset<T> dataset) {
272        ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,dataset.getFeatureIDMap(),dataset.getOutputIDInfo());
273        for (Example<T> e : dataset) {
274            copy.unsafeAdd(e.copy());
275        }
276        return copy;
277    }
278
279    /**
280     * Creates an immutable deep copy of the supplied dataset, using a different feature and output map.
281     * @param dataset The dataset to copy.
282     * @param featureIDMap The new feature map to use. Removes features which are not found in this map.
283     * @param outputIDInfo The new output info to use.
284     * @param <T> The type of output.
285     * @return An immutable copy of the dataset.
286     */
287    public static <T extends Output<T>> ImmutableDataset<T> copyDataset(Dataset<T> dataset, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo) {
288        ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,featureIDMap,outputIDInfo);
289        for (Example<T> e : dataset) {
290            copy.add(e.copy());
291        }
292        return copy;
293    }
294
295    /**
296     * Creates an immutable deep copy of the supplied dataset.
297     * @param dataset The dataset to copy.
298     * @param featureIDMap The new feature map to use. Removes features which are not found in this map.
299     * @param outputIDInfo The new output info to use.
300     * @param merger The merge function to use to reduce features given new ids.
301     * @param <T> The type of output.
302     * @return An immutable copy of the dataset.
303     */
304    public static <T extends Output<T>> ImmutableDataset<T> copyDataset(Dataset<T> dataset, ImmutableFeatureMap featureIDMap, ImmutableOutputInfo<T> outputIDInfo, Merger merger) {
305        ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,featureIDMap,outputIDInfo);
306        for (Example<T> e : dataset) {
307            copy.add(e.copy(),merger);
308        }
309        return copy;
310    }
311
312    /**
313     * Creates an immutable shallow copy of the supplied dataset, using the hasher to generate a
314     * {@link HashedFeatureMap} which transparently maps from the feature name to the hashed variant.
315     * @param dataset The dataset to copy.
316     * @param hasher The hashing function to use.
317     * @param <T> The type of output.
318     * @return An immutable copy of the dataset.
319     */
320    public static <T extends Output<T>> ImmutableDataset<T> hashFeatureMap(Dataset<T> dataset, Hasher hasher) {
321        ImmutableFeatureMap featureIDMap = HashedFeatureMap.generateHashedFeatureMap(dataset.getFeatureMap(),hasher);
322        ImmutableDataset<T> copy = new ImmutableDataset<>(dataset.getProvenance(),dataset.outputFactory,featureIDMap,dataset.getOutputIDInfo());
323        for (Example<T> e : dataset) {
324            copy.unsafeAdd(e);
325        }
326        return copy;
327    }
328}