Source code

001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
020import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
021import org.tribuo.util.tokens.Token;
022import org.tribuo.util.tokens.Tokenizer;
023
024/**
025 * This tokenizer is loosely based on the notion of word shape which is a common
026 * feature used in NLP. The idea here is that continuous runs of letters in the
027 * same character class will be grouped together. White space characters are
028 * used as delimiters. The character classes are: uppercase letters, lowercase
029 * letters, digits, and everything else goes into its own character class. So,
030 * for example, "1234abcd" would be split into "1234" and "abcd". And "!@#$"
031 * would result in four tokens. Please see unit tests.
032 * <p>
033 * Strings are split according to whitespace and contiguous runs of characters
034 * in the same character classes. Except for one exception - if uppercase
035 * letters are immediately followed by lowercase letters, then we keep them
036 * together. This has the effect of recognizing camel case and splits
037 * "CamelCase" into "Camel" and "Case". It also splits "ABCdef AAbb" into
038 * "ABCdef" and "AAbb".
039 */
040public class ShapeTokenizer implements Tokenizer {
041
042    private String cs;
043
044    private int pos;
045
046    private String token;
047
048    private StringBuilder tb = new StringBuilder();
049
050    private int start;
051
052    private int end;
053
054    private char currClass;
055
056    private int prevClass;
057
058    private boolean ready;
059
060    public ShapeTokenizer() { }
061
062    @Override
063    public ConfiguredObjectProvenance getProvenance() {
064        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
065    }
066
067    @Override
068    public void reset(CharSequence cs) {
069        this.cs = cs.toString();
070        pos = 0;
071        start = -1;
072        end = -1;
073        prevClass = -1;
074        token = null;
075        ready = false;
076    }
077
078    private char getClass(int cp) {
079        if (Character.isUpperCase(cp)) {
080            return 'A';
081        } else if (Character.isLowerCase(cp)) {
082            return 'a';
083        } else if (Character.isDigit(cp)) {
084            return '1';
085        } else if (Character.isWhitespace(cp)) {
086            return ' ';
087        } else {
088            return (char) cp;
089        }
090    }
091
092    @Override
093    public boolean advance() {
094        if (cs == null) {
095            throw new IllegalStateException("ShapeTokenizer has not been reset.");
096        }
097        tb.delete(0, tb.length());
098        start = pos;
099        while (pos < cs.length()) {
100            int cp = cs.codePointAt(pos);
101            int lcp = Character.charCount(cp);
102
103            currClass = getClass(cp);
104
105            //
106            // Skip spaces at the start of the token.
107            if (tb.length() == 0 && currClass == ' ') {
108                pos += lcp;
109                start = pos;
110                prevClass = currClass;
111                continue;
112            }
113
114            //
115            // When do we want to end the current token? When we cross a boundary
116            // between token classes when we're not at the start of the string,
117            // except when that boundary is between 
118            // upper and lower case characters.
119            if (currClass != prevClass && prevClass != -1) {
120                if (!(prevClass == 'A' && currClass == 'a')) {
121                    if (tb.length() > 0) {
122                        token = tb.toString();
123                        prevClass = currClass;
124                        //
125                        // Note that we're not increasing pos here: we want
126                        // to work on this current character the next time that
127                        // we get called!
128                        ready = true;
129                        return true;
130                    }
131                }
132            }
133
134            //
135            // We didn't end the token, so collect the current character,
136            // unless it's a space!
137            if (currClass != ' ') {
138                tb.appendCodePoint(cp);
139            }
140            prevClass = currClass;
141            pos += lcp;
142            end = pos;
143        }
144
145        if (tb.length() > 0) {
146            token = tb.toString();
147            ready = true;
148            return true;
149        }
150
151        return false;
152    }
153
154    @Override
155    public String getText() {
156        if (ready) {
157            return token;
158        } else {
159            throw new IllegalStateException("ShapeTokenizer is not ready.");
160        }
161    }
162
163    @Override
164    public int getStart() {
165        if (ready) {
166            return start;
167        } else {
168            throw new IllegalStateException("ShapeTokenizer is not ready.");
169        }
170    }
171
172    @Override
173    public int getEnd() {
174        if (ready) {
175            return end;
176        } else {
177            throw new IllegalStateException("ShapeTokenizer is not ready.");
178        }
179    }
180
181    @Override
182    public Token.TokenType getType() {
183        if (ready) {
184            return Token.TokenType.WORD;
185        } else {
186            throw new IllegalStateException("ShapeTokenizer is not ready.");
187        }
188    }
189
190    @Override
191    public ShapeTokenizer clone() {
192        try {
193            ShapeTokenizer copy = (ShapeTokenizer) super.clone();
194            copy.tb = new StringBuilder();
195            copy.ready = false;
196            copy.cs = null;
197            return copy;
198        } catch (CloneNotSupportedException e) {
199            throw new AssertionError("ShapeTokenizer is Cloneable, but clone call failed");
200        }
201    }
202
203}