001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import org.tribuo.ConfigurableDataSource;
021import org.tribuo.Example;
022import org.tribuo.Output;
023import org.tribuo.OutputFactory;
024
025import java.io.File;
026import java.nio.file.Path;
027import java.util.ArrayList;
028import java.util.Arrays;
029import java.util.Iterator;
030import java.util.List;
031
032/**
033 * A base class for textual data sets. We assume that all textual data is 
034 * written and read using UTF-8.
035 */
036public abstract class TextDataSource<T extends Output<T>> implements ConfigurableDataSource<T> {
037    
038    /**
039     * Document preprocessors that should be run on the documents that make up
040     * this data set.
041     */
042    @Config(description="The document preprocessors to run on each document in the data source.")
043    protected List<DocumentPreprocessor> preprocessors = new ArrayList<>();
044    
045    /**
046     * The path that data was read from.
047     */
048    @Config(mandatory=true,description="The path to read the data from.")
049    protected Path path;
050
051    /**
052     * The factory that converts a String into an {@link Output}.
053     */
054    @Config(mandatory=true,description="The factory that converts a String into an Output instance.")
055    protected OutputFactory<T> outputFactory;
056
057    /**
058     * The extractor that we'll use to turn text into examples.
059     */
060    @Config(mandatory=true,description="The feature extractor that generates Features from text.")
061    protected TextFeatureExtractor<T> extractor;
062
063    /**
064     * The actual data read out of the text file.
065     */
066    protected final List<Example<T>> data = new ArrayList<>();
067
068    /**
069     * for olcut
070     */
071    protected TextDataSource() {}
072
073    /**
074     * Creates a text data set by reading it from a path.
075     * @param path the path to read data from
076     * @param outputFactory the output factory used to generate the outputs.
077     * @param extractor The feature extractor to run on the text.
078     * @param preprocessors processors that will be run on the data before it
079     * is added as examples.
080     */
081    public TextDataSource(Path path, OutputFactory<T> outputFactory, TextFeatureExtractor<T> extractor, DocumentPreprocessor... preprocessors) {
082        this.path = path;
083        this.outputFactory = outputFactory;
084        this.extractor = extractor;
085        this.preprocessors.addAll(Arrays.asList(preprocessors));
086    }
087    
088    public TextDataSource(File file, OutputFactory<T> outputFactory, TextFeatureExtractor<T> extractor, DocumentPreprocessor... preprocessors) {
089        this(file.toPath(), outputFactory, extractor, preprocessors);
090    }
091    
092    @Override
093    public Iterator<Example<T>> iterator() {
094        if (!data.isEmpty()) {
095            return data.iterator();
096        } else {
097            throw new IllegalStateException("read was not called in " + this.getClass().getName());
098        }
099    }
100
101    @Override
102    public String toString() {
103        StringBuilder buffer = new StringBuilder();
104
105        buffer.append(this.getClass().getSimpleName());
106        buffer.append("(path=");
107        buffer.append(path.toString());
108        buffer.append(",extractor=");
109        buffer.append(extractor.toString());
110        buffer.append(",preprocessors=");
111        buffer.append(preprocessors.toString());
112        buffer.append(")");
113
114        return buffer.toString();
115    }
116
117    /**
118     * A method that can be overridden to do different things to each document
119     * that we've read. By default iterates the preprocessors and applies them to the document.
120     *
121     * @param doc The document to handle
122     * @return a (possibly modified) version of the document.
123     */
124    protected String handleDoc(String doc) {
125        String newDoc = doc;
126        for (DocumentPreprocessor p : preprocessors) {
127            newDoc = p.processDoc(newDoc);
128        }
129        return newDoc;
130    }
131
132    /**
133     * Reads the data from the Path.
134     * @throws java.io.IOException if there is any error reading the data.
135     */
136    protected abstract void read() throws java.io.IOException;
137
138    /**
139     * Returns the output factory used to convert the text input into an {@link Output}.
140     *
141     * @return The output factory.
142     */
143    @Override
144    public OutputFactory<T> getOutputFactory() {
145        return outputFactory;
146    }
147
148}