Source code

001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.Feature;
023import org.tribuo.data.text.TextProcessingException;
024import org.tribuo.data.text.TextProcessor;
025import org.tribuo.util.tokens.Tokenizer;
026
027import java.util.ArrayList;
028import java.util.List;
029import java.util.logging.Logger;
030
031/**
032 * A text processor that will generate token ngrams of a particular size.
033 */
034public class NgramProcessor implements TextProcessor {
035
036    private static final Logger logger = Logger.getLogger(NgramProcessor.class.getName());
037
038    @Config(description="n in the n-gram to emit.")
039    private int n = 2;
040
041    @Config(description="Value to emit for each n-gram.")
042    private double value = 1.0;
043
044    @Config(mandatory = true,description="Tokenizer to use.")
045    private Tokenizer tokenizer;
046
047    private ThreadLocal<Tokenizer> tokenizerThreadLocal;
048    
049    /**
050     * Creates a processor that will generate token ngrams of size {@code n}.
051     * 
052     * @param tokenizer The tokenizer to use to process text.
053     * @param n the size of the ngram to generate
054     * @param value the value we will put in the new features.
055     */
056    public NgramProcessor(Tokenizer tokenizer, int n, double value) {
057        if (n < 1) {
058            throw new IllegalArgumentException("n = " + n + ", must be a positive integer.");
059        }
060        this.n = n;
061        this.value = value;
062        this.tokenizer = tokenizer;
063        this.tokenizerThreadLocal = ThreadLocal.withInitial(() -> {try { return this.tokenizer.clone(); } catch (CloneNotSupportedException e) { throw new IllegalArgumentException("Tokenizer not cloneable",e); }});
064    }
065
066    /**
067     * For olcut.
068     */
069    private NgramProcessor() {}
070
071    /**
072     * Used by the OLCUT configuration system, and should not be called by external code.
073     */
074    @Override
075    public void postConfig() {
076        this.tokenizerThreadLocal = ThreadLocal.withInitial(() -> {try { return tokenizer.clone(); } catch (CloneNotSupportedException e) { throw new IllegalArgumentException("Tokenizer not cloneable",e); }});
077    }
078
079    @Override
080    public List<Feature> process(String text) throws TextProcessingException {
081        return innerProcess(n+"-N=",text);
082    }
083
084    @Override
085    public List<Feature> process(String tag, String text) throws TextProcessingException {
086        if (tag == null || tag.isEmpty()) {
087            return innerProcess(n+"-N=",text);
088        } else {
089            return innerProcess(tag + "-" + n + "-N=", text);
090        }
091    }
092
093    private List<Feature> innerProcess(String tag, String text) {
094        List<Feature> ret = new ArrayList<>();
095
096        List<String> words = tokenizerThreadLocal.get().split(text);
097
098        if (words.size() < n) {
099            return ret;
100        }
101
102        StringBuilder ngram = new StringBuilder();
103        for (int start = 0, end = n; end <= words.size(); start++, end++) {
104            ngram.delete(0,ngram.length());
105            ngram.append(tag);
106            for (int i = start; i < end; ++i) {
107                ngram.append(words.get(i));
108                ngram.append('/');
109            }
110            ngram.deleteCharAt(ngram.length()-1);
111            if (ngram.length() > 0 && Character.isLetterOrDigit(ngram.charAt(0))) {
112                String ngramString = ngram.toString();
113                ret.add(new Feature(ngramString, value));
114            }
115        }
116        return ret;
117    }
118
119    @Override
120    public ConfiguredObjectProvenance getProvenance() {
121        return new ConfiguredObjectProvenanceImpl(this,"TextProcessor");
122    }
123
124}