001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 020import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl; 021import org.tribuo.util.tokens.Token; 022import org.tribuo.util.tokens.Tokenizer; 023 024/** 025 * This tokenizer is loosely based on the notion of word shape which is a common 026 * feature used in NLP. The idea here is that continuous runs of letters in the 027 * same character class will be grouped together. White space characters are 028 * used as delimiters. The character classes are: uppercase letters, lowercase 029 * letters, digits, and everything else goes into its own character class. So, 030 * for example, "1234abcd" would be split into "1234" and "abcd". And "!@#$" 031 * would result in four tokens. Please see unit tests. 032 * <p> 033 * Strings are split according to whitespace and contiguous runs of characters 034 * in the same character classes. Except for one exception - if uppercase 035 * letters are immediately followed by lowercase letters, then we keep them 036 * together. This has the effect of recognizing camel case and splits 037 * "CamelCase" into "Camel" and "Case". It also splits "ABCdef AAbb" into 038 * "ABCdef" and "AAbb". 039 */ 040public class ShapeTokenizer implements Tokenizer { 041 042 private String cs; 043 044 private int pos; 045 046 private String token; 047 048 private StringBuilder tb = new StringBuilder(); 049 050 private int start; 051 052 private int end; 053 054 private char currClass; 055 056 private int prevClass; 057 058 private boolean ready; 059 060 public ShapeTokenizer() { } 061 062 @Override 063 public ConfiguredObjectProvenance getProvenance() { 064 return new ConfiguredObjectProvenanceImpl(this, "Tokenizer"); 065 } 066 067 @Override 068 public void reset(CharSequence cs) { 069 this.cs = cs.toString(); 070 pos = 0; 071 start = -1; 072 end = -1; 073 prevClass = -1; 074 token = null; 075 ready = false; 076 } 077 078 private char getClass(int cp) { 079 if (Character.isUpperCase(cp)) { 080 return 'A'; 081 } else if (Character.isLowerCase(cp)) { 082 return 'a'; 083 } else if (Character.isDigit(cp)) { 084 return '1'; 085 } else if (Character.isWhitespace(cp)) { 086 return ' '; 087 } else { 088 return (char) cp; 089 } 090 } 091 092 @Override 093 public boolean advance() { 094 if (cs == null) { 095 throw new IllegalStateException("ShapeTokenizer has not been reset."); 096 } 097 tb.delete(0, tb.length()); 098 start = pos; 099 while (pos < cs.length()) { 100 int cp = cs.codePointAt(pos); 101 int lcp = Character.charCount(cp); 102 103 currClass = getClass(cp); 104 105 // 106 // Skip spaces at the start of the token. 107 if (tb.length() == 0 && currClass == ' ') { 108 pos += lcp; 109 start = pos; 110 prevClass = currClass; 111 continue; 112 } 113 114 // 115 // When do we want to end the current token? When we cross a boundary 116 // between token classes when we're not at the start of the string, 117 // except when that boundary is between 118 // upper and lower case characters. 119 if (currClass != prevClass && prevClass != -1) { 120 if (!(prevClass == 'A' && currClass == 'a')) { 121 if (tb.length() > 0) { 122 token = tb.toString(); 123 prevClass = currClass; 124 // 125 // Note that we're not increasing pos here: we want 126 // to work on this current character the next time that 127 // we get called! 128 ready = true; 129 return true; 130 } 131 } 132 } 133 134 // 135 // We didn't end the token, so collect the current character, 136 // unless it's a space! 137 if (currClass != ' ') { 138 tb.appendCodePoint(cp); 139 } 140 prevClass = currClass; 141 pos += lcp; 142 end = pos; 143 } 144 145 if (tb.length() > 0) { 146 token = tb.toString(); 147 ready = true; 148 return true; 149 } 150 151 return false; 152 } 153 154 @Override 155 public String getText() { 156 if (ready) { 157 return token; 158 } else { 159 throw new IllegalStateException("ShapeTokenizer is not ready."); 160 } 161 } 162 163 @Override 164 public int getStart() { 165 if (ready) { 166 return start; 167 } else { 168 throw new IllegalStateException("ShapeTokenizer is not ready."); 169 } 170 } 171 172 @Override 173 public int getEnd() { 174 if (ready) { 175 return end; 176 } else { 177 throw new IllegalStateException("ShapeTokenizer is not ready."); 178 } 179 } 180 181 @Override 182 public Token.TokenType getType() { 183 if (ready) { 184 return Token.TokenType.WORD; 185 } else { 186 throw new IllegalStateException("ShapeTokenizer is not ready."); 187 } 188 } 189 190 @Override 191 public ShapeTokenizer clone() { 192 try { 193 ShapeTokenizer copy = (ShapeTokenizer) super.clone(); 194 copy.tb = new StringBuilder(); 195 copy.ready = false; 196 copy.cs = null; 197 return copy; 198 } catch (CloneNotSupportedException e) { 199 throw new AssertionError("ShapeTokenizer is Cloneable, but clone call failed"); 200 } 201 } 202 203}