/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens;

import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.Provenancable;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.function.Supplier;

/**
 * An interface for things that tokenize text: breaking it into words according
 * to some set of rules.
 * <p>
 * Note that tokenizers are not guaranteed to be thread safe! Using the same
 * tokenizer from multiple threads may result in strange behavior.
 * <p>
 * Tokenizers which are not ready throw {@link IllegalStateException}
 * when {@link Tokenizer#advance} or any get method is called.
 * <p>
 * Most Tokenizers are Cloneable, and implement the Cloneable interface.
 */
public interface Tokenizer extends Configurable, Cloneable, Provenancable<ConfiguredObjectProvenance> {

    /**
     * Creates a supplier which returns a fresh clone of the given tokenizer
     * each time it is invoked.
     *
     * @param tokenizer the tokenizer to clone on every call to {@link Supplier#get()}
     * @return a supplier of clones of {@code tokenizer}
     * @throws NullPointerException if {@code tokenizer} is null
     */
    static Supplier<Tokenizer> createSupplier(Tokenizer tokenizer) {
        // Fail here rather than later inside the lambda, where the failure
        // would surface far from the call that passed the bad argument.
        Objects.requireNonNull(tokenizer, "tokenizer");
        return () -> {
            try {
                return tokenizer.clone();
            } catch (CloneNotSupportedException e) {
                // Suppliers cannot throw checked exceptions, so wrap it and keep the cause.
                throw new RuntimeException(
                        "Tokenizer " + tokenizer.getClass().getName() + " does not support cloning", e);
            }
        };
    }

    /**
     * Creates a {@link ThreadLocal} whose per-thread initial value is a clone
     * of the given tokenizer, produced by {@link #createSupplier(Tokenizer)}.
     *
     * @param tokenizer the tokenizer to clone for each thread
     * @return a thread local holding one clone of {@code tokenizer} per thread
     * @throws NullPointerException if {@code tokenizer} is null
     */
    static ThreadLocal<Tokenizer> createThreadLocal(Tokenizer tokenizer) {
        return ThreadLocal.withInitial(createSupplier(tokenizer));
    }

    /**
     * Resets the tokenizer so that it operates on a new sequence of characters.
     *
     * @param cs a character sequence to tokenize
     */
    void reset(CharSequence cs);

    /**
     * Advances the tokenizer to the next token.
     *
     * @return {@code true} if there is such a token, {@code false}
     * otherwise.
     */
    boolean advance();

    /**
     * Gets the text of the current token, as a string.
     *
     * @return the text of the current token
     */
    String getText();

    /**
     * Gets the starting character offset of the current token in the character
     * sequence.
     *
     * @return the starting character offset of the token
     */
    int getStart();

    /**
     * Gets the ending offset (exclusive) of the current token in the character
     * sequence.
     *
     * @return the exclusive ending character offset for the current token.
     */
    int getEnd();

    /**
     * Gets the type of the current token.
     *
     * @return the type of the current token.
     */
    Token.TokenType getType();

    /**
     * Clones a tokenizer with its configuration. Cloned tokenizers are
     * not processing the same text as the original tokenizer and need to be reset
     * with a fresh CharSequence.
     *
     * @return A tokenizer with the same configuration, but independent state.
     * @throws CloneNotSupportedException if the tokenizer isn't cloneable.
     */
    Tokenizer clone() throws CloneNotSupportedException;

    /**
     * Generates a Token object from the current state of the tokenizer.
     *
     * @return The token object from the current state.
     */
    default Token getToken() {
        return new Token(getText(), getStart(), getEnd(), getType());
    }

    /**
     * Uses this tokenizer to tokenize a string and return the list of tokens
     * that were generated. Many applications will simply want to take a
     * character sequence and get a list of tokens, so this will do that for
     * them.
     *
     * <p>
     * Here is the contract of the tokenize function:
     * <ul>
     * <li>all returned tokens correspond to substrings of the input text</li>
     * <li>the tokens do not overlap</li>
     * <li>the tokens are returned in the order that they appear in the text</li>
     * <li>the value of Token.text should be the same as calling
     * text.substring(token.start, token.end)</li>
     * </ul>
     * <p>
     * A null or empty input yields an empty list without resetting the
     * tokenizer. The returned list is always a new, modifiable list.
     *
     * @param cs a sequence of characters to tokenize
     * @return the tokens discovered in the character sequence, in order.
     */
    default List<Token> tokenize(CharSequence cs) {
        List<Token> tokens = new ArrayList<>();
        if (cs == null || cs.length() == 0) {
            // Nothing to tokenize; leave the tokenizer's current state untouched.
            return tokens;
        }
        reset(cs);
        while (advance()) {
            tokens.add(getToken());
        }
        return tokens;
    }

    /**
     * Uses this tokenizer to split a string into its component substrings.
     * Many applications will simply want the component strings making up a
     * larger character sequence.
     * <p>
     * A null or empty input yields an empty list without resetting the
     * tokenizer. The returned list is always a new, modifiable list.
     *
     * @param cs the character sequence to tokenize
     * @return a list of strings making up the character sequence.
     */
    default List<String> split(CharSequence cs) {
        List<String> tokens = new ArrayList<>();
        if (cs == null || cs.length() == 0) {
            // Nothing to split; leave the tokenizer's current state untouched.
            return tokens;
        }
        reset(cs);
        while (advance()) {
            tokens.add(getText());
        }
        return tokens;
    }
}