/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.text.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.Feature;
import org.tribuo.data.text.TextProcessingException;
import org.tribuo.data.text.TextProcessor;
import org.tribuo.util.tokens.Tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

/**
 * A text processor that will generate token ngrams of a particular size.
 * <p>
 * The text is split with the supplied {@link Tokenizer} and one {@link Feature}
 * is emitted for every window of {@code n} consecutive tokens. Feature names
 * have the form {@code [tag-]n-N=tok1/tok2/.../tokn} and every feature carries
 * the same configured value.
 * <p>
 * Each thread that calls {@code process} works on its own clone of the
 * tokenizer, obtained lazily through a {@link ThreadLocal}.
 */
public class NgramProcessor implements TextProcessor {

    private static final Logger logger = Logger.getLogger(NgramProcessor.class.getName());

    @Config(description="n in the n-gram to emit.")
    private int n = 2;

    @Config(description="Value to emit for each n-gram.")
    private double value = 1.0;

    @Config(mandatory = true,description="Tokenizer to use.")
    private Tokenizer tokenizer;

    private ThreadLocal<Tokenizer> tokenizerThreadLocal;

    /**
     * Creates a processor that will generate token ngrams of size {@code n}.
     *
     * @param tokenizer The tokenizer to use to process text.
     * @param n the size of the ngram to generate
     * @param value the value we will put in the new features.
     * @throws IllegalArgumentException if {@code n} is less than 1.
     */
    public NgramProcessor(Tokenizer tokenizer, int n, double value) {
        checkNgramSize(n);
        this.n = n;
        this.value = value;
        this.tokenizer = tokenizer;
        this.tokenizerThreadLocal = newTokenizerThreadLocal();
    }

    /**
     * For olcut.
     */
    private NgramProcessor() {}

    /**
     * Used by the OLCUT configuration system, and should not be called by external code.
     * <p>
     * Applies the same check on {@code n} as the public constructor, then builds the
     * per-thread tokenizer holder from the configured tokenizer.
     *
     * @throws IllegalArgumentException if the configured {@code n} is less than 1.
     */
    @Override
    public void postConfig() {
        // Without this check a configured n < 1 would produce malformed feature names
        // instead of failing fast like the constructor does.
        checkNgramSize(n);
        this.tokenizerThreadLocal = newTokenizerThreadLocal();
    }

    /**
     * Extracts the ngram features from the supplied text, naming them with the
     * prefix {@code n-N=}.
     *
     * @param text The text to tokenize.
     * @return One feature per ngram; empty if the text has fewer than {@code n} tokens.
     * @throws TextProcessingException declared by the {@link TextProcessor} contract.
     */
    @Override
    public List<Feature> process(String text) throws TextProcessingException {
        return innerProcess(n + "-N=", text);
    }

    /**
     * Extracts the ngram features from the supplied text, naming them with the
     * prefix {@code tag-n-N=}. A null or empty tag behaves like {@link #process(String)}.
     *
     * @param tag  The tag to prepend to every feature name, may be null or empty.
     * @param text The text to tokenize.
     * @return One feature per ngram; empty if the text has fewer than {@code n} tokens.
     * @throws TextProcessingException declared by the {@link TextProcessor} contract.
     */
    @Override
    public List<Feature> process(String tag, String text) throws TextProcessingException {
        if (tag == null || tag.isEmpty()) {
            return process(text);
        }
        return innerProcess(tag + "-" + n + "-N=", text);
    }

    /**
     * Tokenizes the text with this thread's tokenizer and emits one feature per
     * window of {@code n} consecutive tokens.
     * <p>
     * No filtering is applied to the feature name: its first character always comes
     * from the caller-controlled prefix, so it must not decide whether ngrams are emitted.
     *
     * @param prefix The string placed in front of every ngram in the feature name.
     * @param text   The text to tokenize.
     * @return The ngram features in token order, or an empty list if there are
     *         fewer than {@code n} tokens.
     */
    private List<Feature> innerProcess(String prefix, String text) {
        List<Feature> features = new ArrayList<>();

        List<String> words = tokenizerThreadLocal.get().split(text);

        // Slide a window of n tokens across the text; the loop body never runs
        // when there are fewer than n tokens.
        for (int start = 0, end = n; end <= words.size(); start++, end++) {
            String name = prefix + String.join("/", words.subList(start, end));
            features.add(new Feature(name, value));
        }
        return features;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this,"TextProcessor");
    }

    /**
     * Validates the ngram size.
     *
     * @param size The requested ngram size.
     * @throws IllegalArgumentException if {@code size} is less than 1.
     */
    private static void checkNgramSize(int size) {
        if (size < 1) {
            throw new IllegalArgumentException("n = " + size + ", must be a positive integer.");
        }
    }

    /**
     * Builds the holder that lazily clones the configured tokenizer once per thread.
     *
     * @return A thread local whose initial value is a clone of {@code tokenizer}; the
     *         initializer throws {@link IllegalArgumentException} if the tokenizer
     *         cannot be cloned.
     */
    private ThreadLocal<Tokenizer> newTokenizerThreadLocal() {
        return ThreadLocal.withInitial(() -> {
            try {
                return tokenizer.clone();
            } catch (CloneNotSupportedException e) {
                throw new IllegalArgumentException("Tokenizer not cloneable", e);
            }
        });
    }

}