Source code

001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens;
018
019import com.oracle.labs.mlrg.olcut.config.Configurable;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.Provenancable;
022
023import java.util.ArrayList;
024import java.util.Collections;
025import java.util.List;
026import java.util.function.Supplier;
027
028/**
029 * An interface for things that tokenize text: breaking it into words according
030 * to some set of rules.
031 * <p>
032 * Note that tokenizers are not guaranteed to be thread safe! Using the same
033 * tokenizer from multiple threads may result in strange behavior.
034 * <p>
035 * Tokenizers which are not ready throw {@link IllegalStateException}
036 * when {@link Tokenizer#advance} or any get method is called.
037 * <p>
038 * Most Tokenizers are Cloneable, and implement the Cloneable interface.
039 */
040public interface Tokenizer extends Configurable, Cloneable, Provenancable<ConfiguredObjectProvenance> {
041
042    static Supplier<Tokenizer> createSupplier(Tokenizer tokenizer) {
043        Supplier<Tokenizer> supplier = () -> {
044            try {
045                return tokenizer.clone();
046            } catch (CloneNotSupportedException e) {
047                throw new RuntimeException(e);
048            }
049        };
050        return supplier;
051    }
052
053    static ThreadLocal<Tokenizer> createThreadLocal(Tokenizer tokenizer) {
054        return ThreadLocal.withInitial(createSupplier(tokenizer));
055    }
056
057    /**
058     * Resets the tokenizer so that it operates on a new sequence of characters.
059     *
060     * @param cs a character sequence to tokenize
061     */
062    public void reset(CharSequence cs);
063
064    /**
065     * Advances the tokenizer to the next token.
066     *
067     * @return {@code true} if there is such a token, {@code false}
068     * otherwise.
069     */
070    public boolean advance();
071
072    /**
073     * Gets the text of the current token, as a string
074     *
075     * @return the text of the current token
076     */
077    public String getText();
078
079    /**
080     * Gets the starting character offset of the current token in the character
081     * sequence
082     *
083     * @return the starting character offset of the token
084     */
085    public int getStart();
086
087    /**
088     * Gets the ending offset (exclusive) of the current token in the character
089     * sequence
090     *
091     * @return the exclusive ending character offset for the current token.
092     */
093    public int getEnd();
094
095    /**
096     * Gets the type of the current token.
097     *
098     * @return the type of the current token.
099     */
100    public Token.TokenType getType();
101
102    /**
103     * Clones a tokenizer with it's configuration. Cloned tokenizers are
104     * not processing the same text as the original tokenizer and need to be reset
105     * with a fresh CharSequence.
106     *
107     * @return A tokenizer with the same configuration, but independent state.
108     * @throws CloneNotSupportedException if the tokenizer isn't cloneable.
109     */
110    public Tokenizer clone() throws CloneNotSupportedException;
111
112    /**
113     * Generates a Token object from the current state of the tokenizer.
114     * @return The token object from the current state.
115     */
116    default public Token getToken() {
117        return new Token(getText(), getStart(), getEnd(), getType());
118    }
119
120    /**
121     * Uses this tokenizer to tokenize a string and return the list of tokens
122     * that were generated. Many applications will simply want to take a
123     * character sequence and get a list of tokens, so this will do that for
124     * them.
125     *
126     * <p>
127     * Here is the contract of the tokenize function:
128     * <ul>
129     * <li>all returned tokens correspond to substrings of the input text</li>
130     * <li>the tokens do not overlap</li>
131     * <li>the tokens are returned in the order that they appear in the text
132     * </li>
133     * <li>the value of Token.text should be the same as calling
134     * text.substring(token.start, token.end)
135     * </ul>
136     *
137     * @param cs a sequence of characters to tokenize
138     * @return the tokens discovered in the character sequence, in order
139     * (true?).
140     */
141    default List<Token> tokenize(CharSequence cs) {
142        if (cs == null || cs.length() == 0) {
143            return Collections.emptyList();
144        }
145        List<Token> tokens = new ArrayList<>();
146        reset(cs);
147        while (advance()) {
148            tokens.add(getToken());
149        }
150        return tokens;
151    }
152
153    /**
154     * Uses this tokenizer to split a string into it's component substrings.
155     * Many applications will simply want the component strings making up a
156     * larger character sequence.
157     *
158     * @param cs the character sequence to tokenize
159     * @return a list of strings making up the character sequence.
160     */
161    default List<String> split(CharSequence cs) {
162        if (cs == null || cs.length() == 0) {
163            return Collections.emptyList();
164        }
165        List<String> tokens = new ArrayList<>();
166        reset(cs);
167        while (advance()) {
168            tokens.add(getText());
169        }
170        return tokens;
171    }
172}