001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import com.oracle.labs.mlrg.olcut.config.Config; 020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl; 022import org.tribuo.util.tokens.Token.TokenType; 023import org.tribuo.util.tokens.Tokenizer; 024 025import java.util.Arrays; 026 027/** 028 * This implementation of {@link Tokenizer} is instantiated with an array of 029 * characters that are considered split characters. That is, the split 030 * characters define where to split the input text. It's a very simplistic 031 * tokenizer that has one simple exceptional case that it handles: how to deal 032 * with split characters that appear in between digits (e.g., 3/5 and 3.1415). 033 * It's not really very general purpose, but may suffice for some use cases. 034 * <p> 035 * In addition to the split characters specified it also splits on anything 036 * that is considered whitespace by {@link Character#isWhitespace(char)}. 037 * @author Philip Ogren 038 */ 039public class SplitCharactersTokenizer implements Tokenizer { 040 041 public static final char[] DEFAULT_SPLIT_CHARACTERS = new char[]{'*', '(', ')', '&', '[', ']', '{', '}', '`', 042 '\'', '|', ';', ':', '\\', '!', '-', '?'}; 043 public static final char[] DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS = new char[]{'.', ',', '/',}; 044 045 @Config(description="The characters to split on.") 046 private char[] splitCharacters = DEFAULT_SPLIT_CHARACTERS; 047 048 @Config(description="The characters to split on unless we're in a number.") 049 private char[] splitXDigitsCharacters = DEFAULT_SPLIT_EXCEPTING_IN_DIGITS_CHARACTERS; 050 051 private CharSequence cs; 052 053 private int start; 054 055 private int end; 056 057 private int p; 058 059 private StringBuilder token = new StringBuilder(); 060 061 private boolean ready; 062 063 public SplitCharactersTokenizer() { 064 } 065 066 /** 067 * @param splitCharacters characters to be replaced with a space in the 068 * input text (e.g., "abc|def" becomes "abc def") 069 * @param splitXDigitsCharacters characters to be replaced with a space in 070 * the input text except in the circumstance where the character immediately 071 * adjacent to the left and right are digits (e.g., "abc.def" becomes "abc 072 * def" but "3.1415" remains "3.1415"). 073 */ 074 public SplitCharactersTokenizer(char[] splitCharacters, char[] splitXDigitsCharacters) { 075 this.splitCharacters = splitCharacters; 076 this.splitXDigitsCharacters = splitXDigitsCharacters; 077 } 078 079 /** 080 * Creates a tokenizer that splits on whitespace. 081 * @return A whitespace tokenizer. 082 */ 083 public static SplitCharactersTokenizer createWhitespaceTokenizer() { 084 return new SplitCharactersTokenizer(new char[0], new char[0]); 085 } 086 087 @Override 088 public ConfiguredObjectProvenance getProvenance() { 089 return new ConfiguredObjectProvenanceImpl(this, "Tokenizer"); 090 } 091 092 @Override 093 public void reset(CharSequence cs) { 094 this.cs = cs; 095 start = -1; 096 end = -1; 097 p = 0; 098 token.delete(0, token.length()); 099 ready = false; 100 } 101 102 @Override 103 public boolean advance() { 104 if (cs == null) { 105 throw new IllegalStateException("SplitCharactersTokenizer has not been reset."); 106 } 107 if (p >= cs.length()) { 108 return false; 109 } 110 token.delete(0, token.length()); 111 while (p < cs.length()) { 112 char c = cs.charAt(p); 113 // 114 // First, let's figure out if this is a character that we 115 // want to keep in a token. We want to keep a character if it's 116 // not one of our split characters or if it's one of the "keep in 117 // digits" characters and it's surrounded by digits. 118 boolean keepCharacter = !(isSplitCharacter(c) || (isSplitXDigitCharacter(c) && (p == 0 119 || p == cs.length() - 1 120 || !Character.isDigit(cs.charAt(p - 1)) 121 || !Character.isDigit(cs.charAt(p + 1))))); 122 123 p++; 124 // 125 // If we want to keep it, then go ahead and do that and remember 126 // where the end of the token is. 127 if (keepCharacter) { 128 // 129 // If this is the first character that we're keeping, remember 130 // where the token started. 131 if (token.length() == 0) { 132 start = p - 1; 133 } 134 token.append(c); 135 end = p; 136 } 137 138 // 139 // OK, if we didnt want to keep this character, and we've already 140 // collected some stuff, then we've got a token to send, so let's 141 // break out of the loop. This should allow us to skip runs of 142 // breaking characters. 143 if (!keepCharacter && token.length() > 0) { 144 break; 145 } 146 } 147 148 // 149 // We advanced if we have some stuff collected. 150 if (token.length() > 0) { 151 ready = true; 152 return true; 153 } else { 154 return false; 155 } 156 } 157 158 @Override 159 public String getText() { 160 if (ready) { 161 return token.toString(); 162 } else { 163 throw new IllegalStateException("SplitCharactersTokenizer is not ready."); 164 } 165 } 166 167 @Override 168 public int getStart() { 169 if (ready) { 170 return start; 171 } else { 172 throw new IllegalStateException("SplitCharactersTokenizer is not ready."); 173 } 174 } 175 176 @Override 177 public int getEnd() { 178 if (ready) { 179 return end; 180 } else { 181 throw new IllegalStateException("SplitCharactersTokenizer is not ready."); 182 } 183 } 184 185 @Override 186 public TokenType getType() { 187 if (ready) { 188 return TokenType.WORD; 189 } else { 190 throw new IllegalStateException("SplitCharactersTokenizer is not ready."); 191 } 192 } 193 194 @Override 195 public SplitCharactersTokenizer clone() { 196 try { 197 SplitCharactersTokenizer copy = (SplitCharactersTokenizer) super.clone(); 198 copy.token = new StringBuilder(); 199 copy.splitCharacters = splitCharacters == null ? null : Arrays.copyOf(splitCharacters, splitCharacters.length); 200 copy.splitXDigitsCharacters = splitXDigitsCharacters == null ? null : Arrays.copyOf(splitXDigitsCharacters, splitXDigitsCharacters.length); 201 copy.ready = false; 202 copy.cs = null; 203 return copy; 204 } catch (CloneNotSupportedException e) { 205 throw new AssertionError("SplitCharactersTokenizer is Cloneable, but clone call failed"); 206 } 207 } 208 209 /** 210 * Is this character a split character for this tokenizer instance. 211 * @param c The character to check. 212 * @return True if it's a split character. 213 */ 214 public boolean isSplitCharacter(char c) { 215 return isCharacter(c, splitCharacters) || Character.isWhitespace(c); 216 } 217 218 /** 219 * Is this character a split character except inside a digit for this tokenizer instance. 220 * @param c The character to check. 221 * @return True if it's a split character. 222 */ 223 public boolean isSplitXDigitCharacter(char c) { 224 return isCharacter(c, splitXDigitsCharacters); 225 } 226 227 private boolean isCharacter(char c, char[] chars) { 228 if (chars == null) { 229 return false; 230 } 231 for (char ch : chars) { 232 if (ch == c) { 233 return true; 234 } 235 } 236 return false; 237 } 238 239 /** 240 * Returns a copy of the split characters. 241 * @return A copy of the split characters. 242 */ 243 public char[] getSplitCharacters() { 244 return Arrays.copyOf(splitCharacters,splitCharacters.length); 245 } 246 247 /** 248 * Returns a copy of the split characters except inside digits. 249 * @return A copy of the split characters. 250 */ 251 public char[] getSplitXDigitsCharacters() { 252 return Arrays.copyOf(splitXDigitsCharacters,splitXDigitsCharacters.length); 253 } 254 255}