/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.text.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.Feature;
import org.tribuo.data.text.FeatureAggregator;
import org.tribuo.data.text.FeatureTransformer;
import org.tribuo.data.text.TextPipeline;
import org.tribuo.data.text.TextProcessingException;
import org.tribuo.data.text.TextProcessor;
import org.tribuo.util.tokens.Tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * A pipeline for generating ngram features.
037 */ 038public class TokenPipeline implements TextPipeline { 039 040 private static final Logger logger = Logger.getLogger(TokenPipeline.class.getName()); 041 042 private List<TextProcessor> processors = new ArrayList<>(); 043 private List<FeatureTransformer> transformers = new ArrayList<>(); 044 private FeatureAggregator aggregator; 045 046 @Config(mandatory = true,description="Use term counting, otherwise emit binary features.") 047 private boolean termCounting; 048 049 @Config(description="Dimension to map the hash into.") 050 private int hashDim = -1; 051 052 @Config(mandatory = true,description="Tokenizer to use.") 053 private Tokenizer tokenizer; 054 055 @Config(description="n in the n-gram to emit.") 056 private int ngram = 2; 057 058 /** 059 * Creates a new token pipeline. 060 * 061 * @param tokenizer The tokenizer to use to split up the text into words (i.e., 062 * features.) 063 * @param ngram The maximum size of ngram features to add to the features 064 * generated by the pipeline. A value of {@code n} means that ngram features 065 * of size 1-n will be generated. A good standard value to use is 2, which means 066 * that unigram and bigram features will be generated. You will very likely see 067 * diminishing returns for larger values of {@code n} but there will be times 068 * when they will be necessary. 069 * @param termCounting If {@code true}, multiple occurrences of terms 070 * in the document will be counted and the count will be used as the value 071 * of the features that are produced. 072 */ 073 public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting) { 074 this(tokenizer, ngram, termCounting, -1); 075 } 076 077 /** 078 * Creates a new token pipeline. 079 * 080 * @param tokenizer The tokenizer to use to split up the text into words 081 * (i.e., features.) 082 * @param ngram The maximum size of ngram features to add to the features 083 * generated by the pipeline. 
A value of {@code n} means that ngram 084 * features of size 1-n will be generated. A good standard value to use is 085 * 2, which means that unigram and bigram features will be generated. You 086 * will very likely see diminishing returns for larger values of 087 * {@code n} but there will be times when they will be necessary. 088 * @param termCounting If {@code true}, multiple occurrences of terms 089 * in the document will be counted and the count will be used as the value 090 * of the features that are produced. 091 * @param dimension The maximum dimension for the feature space. If this value 092 * is greater than 0, then at most {@code dimension} features will be 093 * through the use of a hashing function that will collapse the feature 094 * space. 095 */ 096 public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension) { 097 this.tokenizer = tokenizer; 098 this.ngram = ngram; 099 this.hashDim = dimension; 100 this.termCounting = termCounting; 101 postConfig(); 102 } 103 104 /** 105 * For olcut. 106 */ 107 private TokenPipeline() {} 108 109 /** 110 * Used by the OLCUT configuration system, and should not be called by external code. 
111 */ 112 @Override 113 public void postConfig() { 114 for (int i = 1; i <= ngram; ++i) { 115 processors.add(new NgramProcessor(tokenizer,i,1)); 116 } 117 if (hashDim > 0) { 118 transformers.add(new FeatureHasher(hashDim)); 119 } 120 if (termCounting) { 121 aggregator = new SumAggregator(); 122 } else { 123 aggregator = new UniqueAggregator(1); 124 } 125 } 126 127 @Override 128 public String toString() { 129 if (transformers.size() > 0) { 130 return ngram + "gramPipeline({1.."+ngram+"}-grams,hashing)"; 131 } else { 132 return ngram + "gramPipeline({1.."+ngram+"}-grams)"; 133 } 134 } 135 136 @Override 137 public List<Feature> process(String tag, String data) { 138 List<Feature> features = new ArrayList<>(); 139 140 for (TextProcessor p : processors) { 141 try { 142 features.addAll(p.process(tag,data)); 143 } catch (TextProcessingException e) { 144 logger.log(Level.INFO, String.format("TextProcessingException thrown by processor %s with text %s",p,data), e); 145 } 146 } 147 148 for (FeatureTransformer transformer: transformers) { 149 features = transformer.map(tag,features); 150 } 151 152 return aggregator.aggregate(features); 153 } 154 155 @Override 156 public ConfiguredObjectProvenance getProvenance() { 157 return new ConfiguredObjectProvenanceImpl(this,"TextPipeline"); 158 } 159 160}