001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.util.tokens.Token;
023import org.tribuo.util.tokens.Tokenizer;
024
025import java.util.regex.Matcher;
026import java.util.regex.Pattern;
027
028/**
029 * This implementation of {@link Tokenizer} is instantiated with a regular
030 * expression pattern which determines how to split a string into tokens. That
031 * is, the pattern defines the "splits", not the tokens. For example, to
032 * tokenize on white space provide the pattern "\s+".
033 *
034 * @author Philip Ogren
035 */
036public class SplitPatternTokenizer implements Tokenizer {
037
038    /**
039     * The default split pattern, which is [\.,]?\s+.
040     */
041    public static final String SIMPLE_DEFAULT_PATTERN = "[\\.,]?\\s+";
042
043    @Config(description="The regex to split with.")
044    private String splitPatternRegex = SIMPLE_DEFAULT_PATTERN;
045
046    private Pattern splitPattern;
047
048    private CharSequence cs;
049
050    private int start;
051
052    private int end;
053
054    private Matcher matcher;
055
056    private int prevMatchEnd;
057
058    private boolean done;
059
060    private boolean ready;
061
062    /**
063     * Initializes a case insensitive tokenizer with the pattern [\.,]?\s+
064     */
065    public SplitPatternTokenizer() {
066        postConfig();
067    }
068
069    /**
070     * Constructs a splitting tokenizer using the supplied regex.
071     * @param splitPatternRegex The regex to use.
072     */
073    public SplitPatternTokenizer(String splitPatternRegex) {
074        this.splitPatternRegex = splitPatternRegex;
075        postConfig();
076    }
077
078    /**
079     * Used by the OLCUT configuration system, and should not be called by external code.
080     */
081    @Override
082    public void postConfig() {
083        splitPattern = Pattern.compile(splitPatternRegex);
084        ready = false;
085        cs = null;
086    }
087
088    @Override
089    public ConfiguredObjectProvenance getProvenance() {
090        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
091    }
092
093    /**
094     * Gets the String form of the regex in use.
095     * @return The regex.
096     */
097    public String getSplitPatternRegex() {
098        return splitPatternRegex;
099    }
100
101    @Override
102    public void reset(CharSequence cs) {
103        this.cs = cs;
104        matcher = splitPattern.matcher(cs);
105        start = -1;
106        end = -1;
107        prevMatchEnd = 0;
108        done = false;
109        ready = false;
110    }
111
112    @Override
113    public boolean advance() {
114        if (cs == null) {
115            throw new IllegalStateException("SplitPatternTokenizer has not been reset.");
116        }
117        //
118        // We've gotten everything.
119        if (done) {
120            return false;
121        }
122        if (matcher.find()) {
123            //
124            // We might get a match at the start of the string, so reset and 
125            // call advance to see if we can find a later match.
126            if (matcher.start() == 0) {
127                prevMatchEnd = matcher.end();
128                return advance();
129            }
130            //
131            // A regular match, so the actual text runs from the end of the 
132            // previous match to the start of this one.
133            start = prevMatchEnd;
134            end = matcher.start();
135            prevMatchEnd = matcher.end();
136            ready = true;
137        } else {
138            //
139            // Handle the end of the string, keeping in mind that the last match
140            // might have included the end of the string already.
141            start = prevMatchEnd;
142            end = cs.length();
143            done = true;
144            ready = start < end;
145        }
146        return ready;
147    }
148
149    @Override
150    public String getText() {
151        if (ready) {
152            return cs.subSequence(start, end).toString();
153        } else {
154            throw new IllegalStateException("SplitPatternTokenizer is not ready.");
155        }
156    }
157
158    @Override
159    public int getStart() {
160        if (ready) {
161            return start;
162        } else {
163            throw new IllegalStateException("SplitPatternTokenizer is not ready.");
164        }
165    }
166
167    @Override
168    public int getEnd() {
169        if (ready) {
170            return end;
171        } else {
172            throw new IllegalStateException("SplitPatternTokenizer is not ready.");
173        }
174    }
175
176    @Override
177    public Token.TokenType getType() {
178        if (ready) {
179            return Token.TokenType.WORD;
180        } else {
181            throw new IllegalStateException("SplitPatternTokenizer is not ready.");
182        }
183    }
184
185    @Override
186    public SplitPatternTokenizer clone() {
187        try {
188            SplitPatternTokenizer copy = (SplitPatternTokenizer) super.clone();
189            copy.postConfig(); //ready is set in postConfig.
190            return copy;
191        } catch (CloneNotSupportedException e) {
192            throw new AssertionError("SplitPatternTokenizer is Cloneable, but the clone call failed.");
193        }
194    }
195}