001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.sequence;
018
019import com.oracle.labs.mlrg.olcut.provenance.ListProvenance;
020import org.tribuo.Example;
021import org.tribuo.Feature;
022import org.tribuo.ImmutableFeatureMap;
023import org.tribuo.ImmutableOutputInfo;
024import org.tribuo.MutableFeatureMap;
025import org.tribuo.MutableOutputInfo;
026import org.tribuo.Output;
027import org.tribuo.OutputFactory;
028import org.tribuo.OutputInfo;
029import org.tribuo.provenance.DataProvenance;
030import org.tribuo.provenance.DatasetProvenance;
031
032import java.util.Collection;
033import java.util.Set;
034
035/**
036 * A MutableSequenceDataset is a {@link SequenceDataset} with a {@link MutableFeatureMap} which grows over time.
037 * Whenever an {@link SequenceExample} is added to the dataset.
038 */
039public class MutableSequenceDataset<T extends Output<T>> extends SequenceDataset<T> {
040    private static final long serialVersionUID = 1L;
041
042    /**
043     * A map from labels to IDs for the labels found in this dataset.
044     */
045    protected final MutableOutputInfo<T> outputInfo;
046
047    /**
048     * A map from feature names to IDs for the features found in this dataset.
049     */
050    protected final MutableFeatureMap featureMap;
051
052    /**
053     * Creates an empty sequence dataset.
054     * @param sourceProvenance A description of the input data, including preprocessing steps.
055     * @param outputFactory The output factory.
056     */
057    public MutableSequenceDataset(DataProvenance sourceProvenance, OutputFactory<T> outputFactory) {
058        super(sourceProvenance, outputFactory);
059        this.featureMap = new MutableFeatureMap();
060        this.outputInfo = outputFactory.generateInfo();
061    }
062
063    /**
064     * Creates a dataset from a data source. This method will create the output
065     * and feature ID maps that are needed for training and evaluating classifiers.
066     * @param dataSource The input data.
067     * @param sourceProvenance A description of the data, including preprocessing steps.
068     * @param outputFactory The output factory.
069     */
070    public MutableSequenceDataset(Iterable<SequenceExample<T>> dataSource, DataProvenance sourceProvenance, OutputFactory<T> outputFactory) {
071        super(sourceProvenance, outputFactory);
072        this.featureMap = new MutableFeatureMap();
073        this.outputInfo = outputFactory.generateInfo();
074        for (SequenceExample<T> ex : dataSource) {
075            add(ex);
076        }
077    }
078
079    public MutableSequenceDataset(SequenceDataSource<T> dataSource) {
080        this(dataSource,dataSource.getProvenance(),dataSource.getOutputFactory());
081    }
082
083    //special purpose constructor created for ViterbiTrainer
084    public MutableSequenceDataset(ImmutableSequenceDataset<T> dataset) {
085        super(dataset.getProvenance(),dataset.getOutputFactory());
086
087        this.featureMap = new MutableFeatureMap();
088        this.outputInfo = dataset.getOutputInfo().generateMutableOutputInfo();
089        for (SequenceExample<T> ex : dataset) {
090            add(new SequenceExample<>(ex));
091        }
092    }
093
094    /**
095     * Adds a {@link SequenceExample} to this dataset.
096     * <p>
097     * It also canonicalises the reference to each feature's name (i.e., replacing the reference
098     * to a feature's name with the canonical one stored in this Dataset's {@link org.tribuo.VariableInfo}).
099     * This greatly reduces the memory footprint.
100     * @param ex The example to add.
101     */
102    public void add(SequenceExample<T> ex) {
103        if (!ex.validateExample()) {
104            throw new IllegalArgumentException("SequenceExample had duplicate features, no features or no Examples.");
105        }
106        data.add(ex);
107        for (Example<T> e : ex) {
108            outputInfo.observe(e.getOutput());
109            for (Feature f : e) {
110                featureMap.add(f.getName(),f.getValue());
111            }
112        }
113        ex.canonicalise(featureMap);
114    }
115
116    /**
117     * Adds all the SequenceExamples in the supplied collection to this dataset.
118     * @param collection The collection of SequenceExamples.
119     */
120    public void addAll(Collection<SequenceExample<T>> collection) {
121        for (SequenceExample<T> e : collection) {
122            add(e);
123        }
124    }
125
126    @Override
127    public Set<T> getOutputs() {
128        return outputInfo.getDomain();
129    }
130
131    @Override
132    public ImmutableFeatureMap getFeatureIDMap() {
133        return new ImmutableFeatureMap(featureMap);
134    }
135
136    @Override
137    public MutableFeatureMap getFeatureMap() {
138        return featureMap;
139    }
140
141    @Override
142    public ImmutableOutputInfo<T> getOutputIDInfo() {
143        return outputInfo.generateImmutableOutputInfo();
144    }
145
146    @Override
147    public OutputInfo<T> getOutputInfo() {
148        return outputInfo;
149    }
150
151    @Override
152    public String toString(){
153        return "MutableSequenceDataset(source="+ sourceProvenance.toString()+")";
154    }
155
156    @Override
157    public DatasetProvenance getProvenance() {
158        return new DatasetProvenance(sourceProvenance, new ListProvenance<>(), this);
159    }
160}