001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.Feature;
023import org.tribuo.data.text.FeatureAggregator;
024import org.tribuo.data.text.FeatureTransformer;
025import org.tribuo.data.text.TextPipeline;
026import org.tribuo.data.text.TextProcessingException;
027import org.tribuo.data.text.TextProcessor;
028import org.tribuo.util.tokens.Tokenizer;
029
030import java.util.ArrayList;
031import java.util.List;
032import java.util.logging.Level;
033import java.util.logging.Logger;
034
035/**
036 * A pipeline for generating ngram features.
037 */
038public class TokenPipeline implements TextPipeline {
039
040    private static final Logger logger = Logger.getLogger(TokenPipeline.class.getName());
041
042    private List<TextProcessor> processors = new ArrayList<>();
043    private List<FeatureTransformer> transformers = new ArrayList<>();
044    private FeatureAggregator aggregator;
045
046    @Config(mandatory = true,description="Use term counting, otherwise emit binary features.")
047    private boolean termCounting;
048
049    @Config(description="Dimension to map the hash into.")
050    private int hashDim = -1;
051
052    @Config(mandatory = true,description="Tokenizer to use.")
053    private Tokenizer tokenizer;
054
055    @Config(description="n in the n-gram to emit.")
056    private int ngram = 2;
057
058    /**
059     * Creates a new token pipeline.
060     * 
061     * @param tokenizer The tokenizer to use to split up the text into words (i.e., 
062     * features.)
063     * @param ngram The maximum size of ngram features to add to the features
064     * generated by the pipeline. A value of {@code n} means that ngram features
065     * of size 1-n will be generated. A good standard value to use is 2, which means
066     * that unigram and bigram features will be generated. You will very likely see
067     * diminishing returns for larger values of {@code n} but there will be times
068     * when they will be necessary.
069     * @param termCounting If {@code true}, multiple occurrences of terms
070     * in the document will be counted and the count will be used as the value
071     * of the features that are produced.
072     */
073    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting) {
074        this(tokenizer, ngram, termCounting, -1);
075    }
076    
077    /**
078     * Creates a new token pipeline.
079     *
080     * @param tokenizer The tokenizer to use to split up the text into words
081     * (i.e., features.)
082     * @param ngram The maximum size of ngram features to add to the features
083     * generated by the pipeline. A value of {@code n} means that ngram
084     * features of size 1-n will be generated. A good standard value to use is
085     * 2, which means that unigram and bigram features will be generated. You
086     * will very likely see diminishing returns for larger values of
087     * {@code n} but there will be times when they will be necessary.
088     * @param termCounting If {@code true}, multiple occurrences of terms
089     * in the document will be counted and the count will be used as the value
090     * of the features that are produced.
091     * @param dimension The maximum dimension for the feature space. If this value 
092     * is greater than 0, then at most {@code dimension} features will be
093     * through the use of a hashing function that will collapse the feature 
094     * space.
095     */
096    public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension) {
097        this.tokenizer = tokenizer;
098        this.ngram = ngram;
099        this.hashDim = dimension;
100        this.termCounting = termCounting;
101        postConfig();
102    }
103
104    /**
105     * For olcut.
106     */
107    private TokenPipeline() {}
108
109    /**
110     * Used by the OLCUT configuration system, and should not be called by external code.
111     */
112    @Override
113    public void postConfig() {
114        for (int i = 1; i <= ngram; ++i) {
115            processors.add(new NgramProcessor(tokenizer,i,1));
116        }
117        if (hashDim > 0) {
118            transformers.add(new FeatureHasher(hashDim));
119        }
120        if (termCounting) {
121            aggregator = new SumAggregator();
122        } else {
123            aggregator = new UniqueAggregator(1);
124        }
125    }
126
127    @Override
128    public String toString() {
129        if (transformers.size() > 0) {
130            return ngram + "gramPipeline({1.."+ngram+"}-grams,hashing)";
131        } else {
132            return ngram + "gramPipeline({1.."+ngram+"}-grams)";
133        }
134    }
135
136    @Override
137    public List<Feature> process(String tag, String data) {
138        List<Feature> features = new ArrayList<>();
139
140        for (TextProcessor p : processors) {
141            try {
142                features.addAll(p.process(tag,data));
143            } catch (TextProcessingException e) {
144                logger.log(Level.INFO, String.format("TextProcessingException thrown by processor %s with text %s",p,data), e);
145            }
146        }
147
148        for (FeatureTransformer transformer: transformers) {
149            features = transformer.map(tag,features);
150        }
151
152        return aggregator.aggregate(features);
153    }
154
155    @Override
156    public ConfiguredObjectProvenance getProvenance() {
157        return new ConfiguredObjectProvenanceImpl(this,"TextPipeline");
158    }
159
160}