/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.sequence;

import com.oracle.labs.mlrg.olcut.provenance.ListProvenance;
import org.tribuo.Example;
import org.tribuo.Feature;
import org.tribuo.ImmutableFeatureMap;
import org.tribuo.ImmutableOutputInfo;
import org.tribuo.MutableFeatureMap;
import org.tribuo.MutableOutputInfo;
import org.tribuo.Output;
import org.tribuo.OutputFactory;
import org.tribuo.OutputInfo;
import org.tribuo.provenance.DataProvenance;
import org.tribuo.provenance.DatasetProvenance;

import java.util.Collection;
import java.util.Set;

/**
 * A MutableSequenceDataset is a {@link SequenceDataset} with a {@link MutableFeatureMap} which grows over time.
 * <p>
 * Whenever a {@link SequenceExample} is added to the dataset, the outputs of its examples are
 * observed by the {@link MutableOutputInfo} and its features are recorded in the feature map.
 * @param <T> The type of the outputs in this dataset.
 */
public class MutableSequenceDataset<T extends Output<T>> extends SequenceDataset<T> {
    private static final long serialVersionUID = 1L;

    /**
     * A map from labels to IDs for the labels found in this dataset.
     */
    protected final MutableOutputInfo<T> outputInfo;

    /**
     * A map from feature names to IDs for the features found in this dataset.
     */
    protected final MutableFeatureMap featureMap;

    /**
     * Creates an empty sequence dataset.
     * @param sourceProvenance A description of the input data, including preprocessing steps.
     * @param outputFactory The output factory.
     */
    public MutableSequenceDataset(DataProvenance sourceProvenance, OutputFactory<T> outputFactory) {
        super(sourceProvenance, outputFactory);
        this.featureMap = new MutableFeatureMap();
        this.outputInfo = outputFactory.generateInfo();
    }

    /**
     * Creates a dataset from a data source. This method will create the output
     * and feature ID maps that are needed for training and evaluating classifiers.
     * <p>
     * Every element of {@code dataSource} is passed to {@link #add(SequenceExample)}.
     * @param dataSource The input data.
     * @param sourceProvenance A description of the data, including preprocessing steps.
     * @param outputFactory The output factory.
     * @throws IllegalArgumentException If any sequence example fails validation in {@link #add(SequenceExample)}.
     */
    public MutableSequenceDataset(Iterable<SequenceExample<T>> dataSource, DataProvenance sourceProvenance, OutputFactory<T> outputFactory) {
        super(sourceProvenance, outputFactory);
        this.featureMap = new MutableFeatureMap();
        this.outputInfo = outputFactory.generateInfo();
        for (SequenceExample<T> ex : dataSource) {
            add(ex);
        }
    }

    /**
     * Creates a dataset from a {@link SequenceDataSource}, using the provenance and
     * output factory supplied by the data source itself.
     * @param dataSource The input data.
     * @throws IllegalArgumentException If any sequence example fails validation in {@link #add(SequenceExample)}.
     */
    public MutableSequenceDataset(SequenceDataSource<T> dataSource) {
        this(dataSource, dataSource.getProvenance(), dataSource.getOutputFactory());
    }

    /**
     * Creates a mutable dataset from an {@link ImmutableSequenceDataset}.
     * <p>
     * The feature map starts empty and is rebuilt as the examples are added. The output
     * info is generated from the supplied dataset's output info. Each sequence example is
     * wrapped in a new {@link SequenceExample} constructed from the original before it is added.
     * @param dataset The dataset to take the provenance, output factory, output info and examples from.
     * @throws IllegalArgumentException If any sequence example fails validation in {@link #add(SequenceExample)}.
     */
    //special purpose constructor created for ViterbiTrainer
    public MutableSequenceDataset(ImmutableSequenceDataset<T> dataset) {
        super(dataset.getProvenance(), dataset.getOutputFactory());

        this.featureMap = new MutableFeatureMap();
        // NOTE(review): the output info is seeded from the source dataset and add() then
        // observes every output again; confirm the resulting counts are what is intended.
        this.outputInfo = dataset.getOutputInfo().generateMutableOutputInfo();
        for (SequenceExample<T> ex : dataset) {
            add(new SequenceExample<>(ex));
        }
    }

    /**
     * Adds a {@link SequenceExample} to this dataset.
     * <p>
     * It also canonicalises the reference to each feature's name (i.e., replacing the reference
     * to a feature's name with the canonical one stored in this Dataset's {@link org.tribuo.VariableInfo}).
     * This greatly reduces the memory footprint.
     * @param ex The example to add.
     * @throws IllegalArgumentException If {@code ex.validateExample()} returns false.
     */
    public void add(SequenceExample<T> ex) {
        if (!ex.validateExample()) {
            throw new IllegalArgumentException("SequenceExample had duplicate features, no features or no Examples.");
        }
        data.add(ex);
        for (Example<T> e : ex) {
            outputInfo.observe(e.getOutput());
            for (Feature f : e) {
                featureMap.add(f.getName(), f.getValue());
            }
        }
        // Canonicalise only after every feature of the sequence has been recorded in the map.
        ex.canonicalise(featureMap);
    }

    /**
     * Adds all the SequenceExamples in the supplied collection to this dataset.
     * <p>
     * Each element is passed to {@link #add(SequenceExample)} in iteration order.
     * @param collection The collection of SequenceExamples.
     * @throws IllegalArgumentException If any sequence example fails validation in {@link #add(SequenceExample)}.
     */
    public void addAll(Collection<SequenceExample<T>> collection) {
        for (SequenceExample<T> e : collection) {
            add(e);
        }
    }

    /**
     * Returns the domain of this dataset's output info.
     * @return The set of outputs.
     */
    @Override
    public Set<T> getOutputs() {
        return outputInfo.getDomain();
    }

    /**
     * Builds a new {@link ImmutableFeatureMap} from the current feature map on each call.
     * @return An immutable feature map.
     */
    @Override
    public ImmutableFeatureMap getFeatureIDMap() {
        return new ImmutableFeatureMap(featureMap);
    }

    /**
     * Returns the feature map held by this dataset (the same instance, not a copy).
     * @return The mutable feature map.
     */
    @Override
    public MutableFeatureMap getFeatureMap() {
        return featureMap;
    }

    /**
     * Generates an {@link ImmutableOutputInfo} from this dataset's output info.
     * @return The immutable output info.
     */
    @Override
    public ImmutableOutputInfo<T> getOutputIDInfo() {
        return outputInfo.generateImmutableOutputInfo();
    }

    /**
     * Returns the output info held by this dataset (the same instance, not a copy).
     * @return The output info.
     */
    @Override
    public OutputInfo<T> getOutputInfo() {
        return outputInfo;
    }

    @Override
    public String toString(){
        return "MutableSequenceDataset(source="+ sourceProvenance.toString()+")";
    }

    /**
     * Builds a new {@link DatasetProvenance} from the source provenance, a newly
     * constructed {@link ListProvenance} and this dataset.
     * @return The dataset provenance.
     */
    @Override
    public DatasetProvenance getProvenance() {
        return new DatasetProvenance(sourceProvenance, new ListProvenance<>(), this);
    }
}