001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.Feature;
023import org.tribuo.data.text.FeatureAggregator;
024import org.tribuo.data.text.TextPipeline;
025import org.tribuo.data.text.TextProcessingException;
026import org.tribuo.data.text.TextProcessor;
027import org.tribuo.util.tokens.Tokenizer;
028
029import java.util.ArrayList;
030import java.util.List;
031import java.util.logging.Level;
032import java.util.logging.Logger;
033
034/**
035 * An example implementation of {@link TextPipeline}. Generates unique ngrams.
036 */
037public class BasicPipeline implements TextPipeline {
038
039    private static final Logger logger = Logger.getLogger(BasicPipeline.class.getName());
040
041    private List<TextProcessor> processors = new ArrayList<>();
042    private FeatureAggregator aggregator = new UniqueAggregator();
043
044    @Config(mandatory = true,description="Tokenizer to use.")
045    private Tokenizer tokenizer;
046
047    @Config(description="n in the n-gram to emit.")
048    private int ngram = 2;
049
050    public BasicPipeline(Tokenizer tokenizer, int ngram) {
051        this.tokenizer = tokenizer;
052        this.ngram = ngram;
053        postConfig();
054    }
055
056    /**
057     * For olcut
058     */
059    private BasicPipeline() {}
060
061    /**
062     * Used by the OLCUT configuration system, and should not be called by external code.
063     */
064    @Override
065    public void postConfig() {
066        for (int i = 1; i <= ngram; ++i) {
067            processors.add(new NgramProcessor(tokenizer,i,1.0));
068        }
069    }
070
071    @Override
072    public String toString() {
073        return ngram + "gramPipeline({1.."+ngram+"}-grams)";
074    }
075
076    @Override
077    public List<Feature> process(String tag, String data) {
078        List<Feature> features = new ArrayList<>();
079
080        for (TextProcessor p : processors) {
081            try {
082                features.addAll(p.process(tag,data));
083            } catch (TextProcessingException e) {
084                logger.log(Level.INFO, String.format("TextProcessingException thrown by processor %s with text %s",p,data), e);
085            }
086        }
087        //logger.log(Level.INFO,features.toString());
088
089        return aggregator.aggregate(features);
090    }
091
092    @Override
093    public ConfiguredObjectProvenance getProvenance() {
094        return new ConfiguredObjectProvenanceImpl(this,"TextPipeline");
095    }
096
097}