001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.util.tokens.Token.TokenType;
023import org.tribuo.util.tokens.Tokenizer;
024
025import java.util.Arrays;
026
027/**
028 * This implementation of {@link Tokenizer} is instantiated with an array of
029 * characters that are considered split characters. That is, the split
030 * characters define where to split the input text. It's a very simplistic
031 * tokenizer that has one simple exceptional case that it handles: how to deal
032 * with split characters that appear in between digits (e.g., 3/5 and 3.1415).
033 * It's not really very general purpose, but may suffice for some use cases.
034 * <p>
035 * In addition to the split characters specified it also splits on anything
036 * that is considered whitespace by {@link Character#isWhitespace(char)}.
037 * @author Philip Ogren
038 */
039public class SplitCharactersTokenizer implements Tokenizer {
040
041    public static final char[] DEFAULT_SPLIT_CHARACTERS = new char[]{'*', '(', ')', '&', '[', ']', '{', '}', '`',
042            '\'', '|', ';', ':', '\\', '!', '-', '?'};
043    public static final char[] DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS = new char[]{'.', ',', '/',};
044
045    @Config(description="The characters to split on.")
046    private char[] splitCharacters = DEFAULT_SPLIT_CHARACTERS;
047
048    @Config(description="The characters to split on unless we're in a number.")
049    private char[] splitXDigitsCharacters = DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS;
050
051    private CharSequence cs;
052
053    private int start;
054
055    private int end;
056
057    private int p;
058
059    private StringBuilder token = new StringBuilder();
060
061    private boolean ready;
062
063    public SplitCharactersTokenizer() {
064    }
065
066    /**
067     * @param splitCharacters        characters to be replaced with a space in the
068     *                               input text (e.g., "abc|def" becomes "abc def")
069     * @param splitXDigitsCharacters characters to be replaced with a space in
070     *                               the input text except in the circumstance where the character immediately
071     *                               adjacent to the left and right are digits (e.g., "abc.def" becomes "abc
072     *                               def" but "3.1415" remains "3.1415").
073     */
074    public SplitCharactersTokenizer(char[] splitCharacters, char[] splitXDigitsCharacters) {
075        this.splitCharacters = splitCharacters;
076        this.splitXDigitsCharacters = splitXDigitsCharacters;
077    }
078
079    /**
080     * Creates a tokenizer that splits on whitespace.
081     * @return A whitespace tokenizer.
082     */
083    public static SplitCharactersTokenizer createWhitespaceTokenizer() {
084        return new SplitCharactersTokenizer(new char[0], new char[0]);
085    }
086
087    @Override
088    public ConfiguredObjectProvenance getProvenance() {
089        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
090    }
091
092    @Override
093    public void reset(CharSequence cs) {
094        this.cs = cs;
095        start = -1;
096        end = -1;
097        p = 0;
098        token.delete(0, token.length());
099        ready = false;
100    }
101
102    @Override
103    public boolean advance() {
104        if (cs == null) {
105            throw new IllegalStateException("SplitCharactersTokenizer has not been reset.");
106        }
107        if (p >= cs.length()) {
108            return false;
109        }
110        token.delete(0, token.length());
111        while (p < cs.length()) {
112            char c = cs.charAt(p);
113            //
114            // First, let's figure out if this is a character that we
115            // want to keep in a token. We want to keep a character if it's
116            // not one of our split characters or if it's one of the "keep in
117            // digits" characters and it's surrounded by digits.
118            boolean keepCharacter = !(isSplitCharacter(c) || (isSplitXDigitCharacter(c) && (p == 0
119                    || p == cs.length() - 1
120                    || !Character.isDigit(cs.charAt(p - 1))
121                    || !Character.isDigit(cs.charAt(p + 1)))));
122
123            p++;
124            //
125            // If we want to keep it, then go ahead and do that and remember
126            // where the end of the token is.
127            if (keepCharacter) {
128                //
129                // If this is the first character that we're keeping, remember
130                // where the token started.
131                if (token.length() == 0) {
132                    start = p - 1;
133                }
134                token.append(c);
135                end = p;
136            }
137
138            //
139            // OK, if we didnt want to keep this character, and we've already
140            // collected some stuff, then we've got a token to send, so let's
141            // break out of the loop. This should allow us to skip runs of
142            // breaking characters.
143            if (!keepCharacter && token.length() > 0) {
144                break;
145            }
146        }
147
148        //
149        // We advanced if we have some stuff collected.
150        if (token.length() > 0) {
151            ready = true;
152            return true;
153        } else {
154            return false;
155        }
156    }
157
158    @Override
159    public String getText() {
160        if (ready) {
161            return token.toString();
162        } else {
163            throw new IllegalStateException("SplitCharactersTokenizer is not ready.");
164        }
165    }
166
167    @Override
168    public int getStart() {
169        if (ready) {
170            return start;
171        } else {
172            throw new IllegalStateException("SplitCharactersTokenizer is not ready.");
173        }
174    }
175
176    @Override
177    public int getEnd() {
178        if (ready) {
179            return end;
180        } else {
181            throw new IllegalStateException("SplitCharactersTokenizer is not ready.");
182        }
183    }
184
185    @Override
186    public TokenType getType() {
187        if (ready) {
188            return TokenType.WORD;
189        } else {
190            throw new IllegalStateException("SplitCharactersTokenizer is not ready.");
191        }
192    }
193
194    @Override
195    public SplitCharactersTokenizer clone() {
196        try {
197            SplitCharactersTokenizer copy = (SplitCharactersTokenizer) super.clone();
198            copy.token = new StringBuilder();
199            copy.splitCharacters = splitCharacters == null ? null : Arrays.copyOf(splitCharacters, splitCharacters.length);
200            copy.splitXDigitsCharacters = splitXDigitsCharacters == null ? null : Arrays.copyOf(splitXDigitsCharacters, splitXDigitsCharacters.length);
201            copy.ready = false;
202            copy.cs = null;
203            return copy;
204        } catch (CloneNotSupportedException e) {
205            throw new AssertionError("SplitCharactersTokenizer is Cloneable, but clone call failed");
206        }
207    }
208
209    /**
210     * Is this character a split character for this tokenizer instance.
211     * @param c The character to check.
212     * @return True if it's a split character.
213     */
214    public boolean isSplitCharacter(char c) {
215        return isCharacter(c, splitCharacters) || Character.isWhitespace(c);
216    }
217
218    /**
219     * Is this character a split character except inside a digit for this tokenizer instance.
220     * @param c The character to check.
221     * @return True if it's a split character.
222     */
223    public boolean isSplitXDigitCharacter(char c) {
224        return isCharacter(c, splitXDigitsCharacters);
225    }
226
227    private boolean isCharacter(char c, char[] chars) {
228        if (chars == null) {
229            return false;
230        }
231        for (char ch : chars) {
232            if (ch == c) {
233                return true;
234            }
235        }
236        return false;
237    }
238
239    /**
240     * Returns a copy of the split characters.
241     * @return A copy of the split characters.
242     */
243    public char[] getSplitCharacters() {
244        return Arrays.copyOf(splitCharacters,splitCharacters.length);
245    }
246
247    /**
248     * Returns a copy of the split characters except inside digits.
249     * @return A copy of the split characters.
250     */
251    public char[] getSplitXDigitsCharacters() {
252        return Arrays.copyOf(splitXDigitsCharacters,splitXDigitsCharacters.length);
253    }
254
255}