/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.text.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.Feature;
import org.tribuo.data.text.FeatureAggregator;
import org.tribuo.data.text.TextPipeline;
import org.tribuo.data.text.TextProcessingException;
import org.tribuo.data.text.TextProcessor;
import org.tribuo.util.tokens.Tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * An example implementation of {@link TextPipeline}. Generates unique ngrams.
 * <p>
 * The pipeline holds one {@link NgramProcessor} for every n-gram size from 1 up to
 * and including {@code ngram}. The features produced by all processors are collected
 * and then passed through a {@link UniqueAggregator}.
 */
public class BasicPipeline implements TextPipeline {

    private static final Logger logger = Logger.getLogger(BasicPipeline.class.getName());

    // Not OLCUT-configured: populated by postConfig() from the configured fields below.
    private final List<TextProcessor> processors = new ArrayList<>();
    private final FeatureAggregator aggregator = new UniqueAggregator();

    @Config(mandatory = true,description="Tokenizer to use.")
    private Tokenizer tokenizer;

    @Config(description="n in the n-gram to emit.")
    private int ngram = 2;

    /**
     * Constructs a pipeline which emits n-gram features for every size from 1 to {@code ngram}.
     *
     * @param tokenizer The tokenizer handed to each {@link NgramProcessor}.
     * @param ngram The largest n-gram size to emit, must be at least 1.
     * @throws IllegalArgumentException if {@code ngram} is less than 1.
     */
    public BasicPipeline(Tokenizer tokenizer, int ngram) {
        this.tokenizer = tokenizer;
        this.ngram = ngram;
        postConfig();
    }

    /**
     * For olcut
     */
    private BasicPipeline() {}

    /**
     * Used by the OLCUT configuration system, and should not be called by external code.
     * <p>
     * Validates {@code ngram} and (re)builds the processor list, one
     * {@link NgramProcessor} per n-gram size in {@code 1..ngram}.
     *
     * @throws IllegalArgumentException if {@code ngram} is less than 1.
     */
    @Override
    public void postConfig() {
        // A non-positive ngram would otherwise yield a pipeline with no processors
        // which silently emits no features.
        if (ngram < 1) {
            throw new IllegalArgumentException("ngram must be positive, found " + ngram);
        }
        // Rebuild from scratch so a repeated call does not register duplicate processors.
        processors.clear();
        for (int i = 1; i <= ngram; ++i) {
            // NOTE(review): 1.0 is presumably the value assigned to each emitted n-gram
            // feature - confirm against NgramProcessor.
            processors.add(new NgramProcessor(tokenizer,i,1.0));
        }
    }

    /**
     * Returns a short description of the pipeline which includes the configured n-gram range.
     *
     * @return A string of the form {@code <ngram>gramPipeline({1..<ngram>}-grams)}.
     */
    @Override
    public String toString() {
        return ngram + "gramPipeline({1.."+ngram+"}-grams)";
    }

    /**
     * Runs every processor over the supplied text and aggregates the resulting features.
     * <p>
     * If a processor throws {@link TextProcessingException} the exception is logged and
     * that processor contributes no features; the remaining processors still run.
     *
     * @param tag The tag passed through to each processor.
     * @param data The text passed through to each processor.
     * @return The aggregated features from all processors that completed successfully.
     */
    @Override
    public List<Feature> process(String tag, String data) {
        List<Feature> features = new ArrayList<>();

        for (TextProcessor p : processors) {
            try {
                features.addAll(p.process(tag,data));
            } catch (TextProcessingException e) {
                // Deliberately best-effort: a failing processor must not abort the whole pipeline.
                logger.log(Level.INFO, String.format("TextProcessingException thrown by processor %s with text %s",p,data), e);
            }
        }

        return aggregator.aggregate(features);
    }

    /**
     * Builds the provenance for this pipeline from its configured fields.
     *
     * @return A {@link ConfiguredObjectProvenanceImpl} for this object with the host type "TextPipeline".
     */
    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this,"TextPipeline");
    }

}