001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.data.text; 018 019import com.oracle.labs.mlrg.olcut.config.Config; 020import org.tribuo.ConfigurableDataSource; 021import org.tribuo.Example; 022import org.tribuo.Output; 023import org.tribuo.OutputFactory; 024 025import java.io.File; 026import java.nio.file.Path; 027import java.util.ArrayList; 028import java.util.Arrays; 029import java.util.Iterator; 030import java.util.List; 031 032/** 033 * A base class for textual data sets. We assume that all textual data is 034 * written and read using UTF-8. 035 */ 036public abstract class TextDataSource<T extends Output<T>> implements ConfigurableDataSource<T> { 037 038 /** 039 * Document preprocessors that should be run on the documents that make up 040 * this data set. 041 */ 042 @Config(description="The document preprocessors to run on each document in the data source.") 043 protected List<DocumentPreprocessor> preprocessors = new ArrayList<>(); 044 045 /** 046 * The path that data was read from. 047 */ 048 @Config(mandatory=true,description="The path to read the data from.") 049 protected Path path; 050 051 /** 052 * The factory that converts a String into an {@link Output}. 053 */ 054 @Config(mandatory=true,description="The factory that converts a String into an Output instance.") 055 protected OutputFactory<T> outputFactory; 056 057 /** 058 * The extractor that we'll use to turn text into examples. 059 */ 060 @Config(mandatory=true,description="The feature extractor that generates Features from text.") 061 protected TextFeatureExtractor<T> extractor; 062 063 /** 064 * The actual data read out of the text file. 065 */ 066 protected final List<Example<T>> data = new ArrayList<>(); 067 068 /** 069 * for olcut 070 */ 071 protected TextDataSource() {} 072 073 /** 074 * Creates a text data set by reading it from a path. 075 * @param path the path to read data from 076 * @param outputFactory the output factory used to generate the outputs. 077 * @param extractor The feature extractor to run on the text. 078 * @param preprocessors processors that will be run on the data before it 079 * is added as examples. 080 */ 081 public TextDataSource(Path path, OutputFactory<T> outputFactory, TextFeatureExtractor<T> extractor, DocumentPreprocessor... preprocessors) { 082 this.path = path; 083 this.outputFactory = outputFactory; 084 this.extractor = extractor; 085 this.preprocessors.addAll(Arrays.asList(preprocessors)); 086 } 087 088 public TextDataSource(File file, OutputFactory<T> outputFactory, TextFeatureExtractor<T> extractor, DocumentPreprocessor... preprocessors) { 089 this(file.toPath(), outputFactory, extractor, preprocessors); 090 } 091 092 @Override 093 public Iterator<Example<T>> iterator() { 094 if (!data.isEmpty()) { 095 return data.iterator(); 096 } else { 097 throw new IllegalStateException("read was not called in " + this.getClass().getName()); 098 } 099 } 100 101 @Override 102 public String toString() { 103 StringBuilder buffer = new StringBuilder(); 104 105 buffer.append(this.getClass().getSimpleName()); 106 buffer.append("(path="); 107 buffer.append(path.toString()); 108 buffer.append(",extractor="); 109 buffer.append(extractor.toString()); 110 buffer.append(",preprocessors="); 111 buffer.append(preprocessors.toString()); 112 buffer.append(")"); 113 114 return buffer.toString(); 115 } 116 117 /** 118 * A method that can be overridden to do different things to each document 119 * that we've read. By default iterates the preprocessors and applies them to the document. 120 * 121 * @param doc The document to handle 122 * @return a (possibly modified) version of the document. 123 */ 124 protected String handleDoc(String doc) { 125 String newDoc = doc; 126 for (DocumentPreprocessor p : preprocessors) { 127 newDoc = p.processDoc(newDoc); 128 } 129 return newDoc; 130 } 131 132 /** 133 * Reads the data from the Path. 134 * @throws java.io.IOException if there is any error reading the data. 135 */ 136 protected abstract void read() throws java.io.IOException; 137 138 /** 139 * Returns the output factory used to convert the text input into an {@link Output}. 140 * 141 * @return The output factory. 142 */ 143 @Override 144 public OutputFactory<T> getOutputFactory() { 145 return outputFactory; 146 } 147 148}