001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo;
018
019import com.oracle.labs.mlrg.olcut.provenance.ListProvenance;
020import com.oracle.labs.mlrg.olcut.provenance.ObjectProvenance;
021import org.tribuo.provenance.DataProvenance;
022import org.tribuo.provenance.DatasetProvenance;
023import org.tribuo.transform.TransformerMap;
024
025import java.util.ArrayList;
026import java.util.Collection;
027import java.util.Collections;
028import java.util.List;
029import java.util.Map;
030import java.util.Set;
031
032/**
033 * A MutableDataset is a {@link Dataset} with a {@link MutableFeatureMap} which grows over time.
034 * Whenever an {@link Example} is added to the dataset it observes each feature and output
035 * keeping appropriate statistics in the {@link FeatureMap} and {@link OutputInfo}.
036 */
037public class MutableDataset<T extends Output<T>> extends Dataset<T> {
038    private static final long serialVersionUID = 1L;
039
040    /**
041     * Information about the outputs in this dataset.
042     */
043    protected final MutableOutputInfo<T> outputMap;
044
045    /**
046     * A map from feature names to feature info objects.
047     */
048    protected final MutableFeatureMap featureMap;
049
050    /**
051     * The provenances of the transformations applied to this dataset.
052     */
053    protected final List<ObjectProvenance> transformProvenances = new ArrayList<>();
054
055    /**
056     * Denotes if this dataset contains implicit zeros or not.
057     */
058    protected boolean dense = false;
059
060    /**
061     * Creates an empty dataset.
062     * @param sourceProvenance A description of the input data, including preprocessing steps.
063     * @param outputFactory The output factory.
064     */
065    public MutableDataset(DataProvenance sourceProvenance, OutputFactory<T> outputFactory) {
066        super(sourceProvenance,outputFactory);
067        this.featureMap = new MutableFeatureMap();
068        this.outputMap = outputFactory.generateInfo();
069    }
070
071    /**
072     * Creates a dataset from a data source. This method will create the output
073     * and feature maps that are needed for training and evaluating classifiers.
074     * @param dataSource The examples.
075     * @param provenance A description of the input data, including preprocessing steps.
076     * @param outputFactory The output factory.
077     */
078    public MutableDataset(Iterable<Example<T>> dataSource, DataProvenance provenance, OutputFactory<T> outputFactory) {
079        super(provenance,outputFactory);
080        this.featureMap = new MutableFeatureMap();
081        this.outputMap = outputFactory.generateInfo();
082        for (Example<T> ex : dataSource) {
083            add(ex);
084        }
085    }
086
087    /**
088     * Creates a dataset from a data source. This method creates the output and feature maps
089     * needed for training and evaluating classifiers.
090     * @param dataSource The examples.
091     */
092    public MutableDataset(DataSource<T> dataSource) {
093        this(dataSource,dataSource.getProvenance(),dataSource.getOutputFactory());
094    }
095
096    /**
097     * Adds an example to the dataset, which observes the output and each feature value.
098     * <p>
099     * It also canonicalises the reference to each feature's name (i.e., replacing the reference
100     * to a feature's name with the canonical one stored in this Dataset's {@link VariableInfo}).
101     * This greatly reduces the memory footprint.
102     * @param ex The example to add.
103     */
104    public void add(Example<T> ex) {
105        if (!ex.validateExample()) {
106            throw new IllegalArgumentException("Example had duplicate features, invalid features or no features.");
107        }
108        outputMap.observe(ex.getOutput());
109        data.add(ex);
110        for (Feature f : ex) {
111            featureMap.add(f.getName(),f.getValue());
112        }
113        ex.canonicalize(featureMap);
114        dense = false;
115    }
116
117    /**
118     * Adds all the Examples in the supplied collection to this dataset.
119     * @param collection The collection of Examples.
120     */
121    public void addAll(Collection<? extends Example<T>> collection) {
122        for (Example<T> e : collection) {
123            add(e);
124        }
125    }
126
127    /**
128     * Sets the weights in each example according to their output.
129     * @param weights A map of {@link Output}s to float weights.
130     */
131    public void setWeights(Map<T,Float> weights) {
132        for (Example<T> e : this) {
133            Float weight = weights.get(e.getOutput());
134            if (weight != null) {
135                e.setWeight(weight);
136            } else {
137                e.setWeight(1.0f);
138            }
139        }
140    }
141
142    /**
143     * Gets the set of possible outputs in this dataset.
144     * <p>
145     * In the case of regression returns a Set containing dimension names.
146     * @return The set of possible outputs.
147     */
148    @Override
149    public Set<T> getOutputs() {
150        return outputMap.getDomain();
151    }
152
153    @Override
154    public ImmutableFeatureMap getFeatureIDMap() {
155        return new ImmutableFeatureMap(featureMap);
156    }
157
158    @Override
159    public MutableFeatureMap getFeatureMap() {
160        return featureMap;
161    }
162
163    @Override
164    public ImmutableOutputInfo<T> getOutputIDInfo() {
165        return outputMap.generateImmutableOutputInfo();
166    }
167
168    @Override
169    public OutputInfo<T> getOutputInfo() {
170        return outputMap;
171    }
172
173    @Override
174    public String toString(){
175        if (transformProvenances.isEmpty()) {
176            return "MutableDataset(source=" + sourceProvenance + ",isDense="+dense+")";
177        } else {
178            return "MutableDataset(source=" + sourceProvenance + ",isDense="+dense+",transforms="+transformProvenances.toString()+")";
179        }
180    }
181
182    /**
183     * Is the dataset dense (i.e., do all features in the domain have a value in each example).
184     * @return True if the dataset is dense.
185     */
186    public boolean isDense() {
187        return dense;
188    }
189
190    /**
191     * Applies all the transformations from the {@link TransformerMap} to this dataset.
192     * @param transformerMap The transformations to apply.
193     */
194    public void transform(TransformerMap transformerMap) {
195        featureMap.clear();
196        for (Example<T> example : data) {
197            example.transform(transformerMap);
198            for (Feature f : example) {
199                featureMap.add(f.getName(),f.getValue());
200            }
201        }
202        transformProvenances.add(transformerMap.getProvenance());
203    }
204
205    /**
206     * Iterates through the examples, converting implicit zeros into explicit zeros.
207     */
208    public void densify() {
209        ArrayList<String> featureNames = new ArrayList<>(featureMap.keySet());
210        Collections.sort(featureNames);
211        for (Example<T> example : data) {
212            example.densify(featureNames);
213        }
214        dense = true;
215    }
216
217    /**
218     * Clears all the examples out of this dataset, and flushes the FeatureMap, OutputInfo, and transform provenances.
219     */
220    public void clear() {
221        outputMap.clear();
222        featureMap.clear();
223        data.clear();
224        transformProvenances.clear();
225    }
226
227    @Override
228    public DatasetProvenance getProvenance() {
229        return new DatasetProvenance(sourceProvenance, new ListProvenance<>(transformProvenances), this);
230    }
231
232    /**
233     * Creates a deep copy of the supplied {@link Dataset} which is mutable.
234     * <p>
235     * Copies the individual examples using their copy method.
236     * @param other The dataset to copy.
237     * @param <T> The output type.
238     * @return A mutable deep copy of the dataset.
239     */
240    public static <T extends Output<T>> MutableDataset<T> createDeepCopy(Dataset<T> other) {
241        MutableDataset<T> copy = new MutableDataset<>(other.getProvenance(),other.outputFactory);
242
243        for (Example<T> e : other) {
244            copy.add(e.copy());
245        }
246
247        return copy;
248    }
249}