/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens.universal;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

/**
 * This class was originally written for the purpose of document indexing in an
 * information retrieval context (principally used in Sun Labs' Minion search
 * engine). It was refactored here to implement the {@link Tokenizer} interface,
 * taking care that the 'ngram' tokens have correct character offsets. This is
 * typically not required in the document indexing context, but is essential in
 * other kinds of text processing / NLP tasks.
 * <p>
 * This tokenizer has some specific behavior in how it handles "ngram"
 * characters - i.e., those characters for which {@link #isNgram(char)} returns
 * true (CJK characters and others). For these characters, it generates tokens
 * corresponding to character bigrams in addition to tokens corresponding to
 * character unigrams. Most of the other tokenizers generate tokens with
 * non-overlapping spans, but here the character bigram tokens overlap with the
 * character unigram tokens.
 * <p>
 * This tokenizer uses bigram tokenization whenever it encounters 'ngram'
 * characters in the CJK range (among others; see {@link #isNgram(char)}). It
 * otherwise tokenizes using punctuation and whitespace separators to separate
 * words. Within runs of 'ngram' characters the tokenizer will generate tokens
 * corresponding to two adjacent characters in addition to tokens corresponding
 * to each character. The tokens corresponding to character bigrams may overlap
 * with the previous and next token. An end-of-line between two 'ngram'
 * characters is ignored (i.e., a character bigram token will be created).
 * <p>
 * For example, a sequence of three Chinese characters, 非常感, would tokenize as
 * three WORD type tokens: 非, 常, and 感, and two NGRAM type tokens: 非常 and 常感.
 * These tokens have character offsets that correspond to the character offsets
 * into the text:
 * <ul>
 * <li>非[0,1]</li>
 * <li>非常[0,2]</li>
 * <li>常[1,2]</li>
 * <li>常感[1,3]</li>
 * <li>感[2,3]</li>
 * </ul>
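 * <p>
 * A minimal usage sketch of the streaming API declared by {@link Tokenizer}
 * (the tokens printed are exactly the ones listed above):
 * <pre>{@code
 * UniversalTokenizer tokenizer = new UniversalTokenizer();
 * tokenizer.reset("非常感");
 * while (tokenizer.advance()) {
 *     // Prints "非 [0,1] WORD", "非常 [0,2] NGRAM", "常 [1,2] WORD", ...
 *     System.out.println(tokenizer.getText() + " [" + tokenizer.getStart()
 *             + "," + tokenizer.getEnd() + "] " + tokenizer.getType());
 * }
 * }</pre>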
 */
public class UniversalTokenizer implements Tokenizer {

    /**
     * The length of the longest token that we will generate.
     */
    protected int maxTokenLength = 256;
    private boolean eofReached = false;

    /**
     * The character position in the character sequence that we're tokenizing.
     */
    private int pos;

    /**
     * The starting offset of the current buffer in the token stream.
     */
    private int start;

    /**
     * If <code>true</code> then unigrams will be generated for each n-gram
     * character.
     */
    private boolean generateUnigrams = true;

    /**
     * If <code>true</code> then character bigrams will be generated for each n-gram
     * character as defined by {@link #isNgram(char)}.
     */
    private boolean generateNgrams = true;
    /**
     * The state of the tokenizer, determined by the previous history.
     */
    private State state;
    /**
     * The character sequence that we're currently processing.
     */
    private CharSequence cs;
    /**
     * The token that we're building.
     */
    private char[] buffer;
    /**
     * A string representation of the current token.
     */
    private String currToken;
    /**
     * The current type of the token.
     */
    private Token.TokenType currType;
    /**
     * The current word position of the token.
     */
    private int currPos;
    /**
     * The starting offset of the current token.
     */
    private int startOffset;
    /**
     * The ending offset of the current token.
     */
    private int endOffset;
    /**
     * The length of the current token we're building.
     */
    private int tokenLength;
    /**
     * Whether this is the first token.
     */
    private boolean firstToken;
    /**
     * Is the tokenizer ready?
     */
    private boolean ready;
    @Config
    private boolean sendPunct = false;
    /**
     * A queue of tokens that have been generated and need to be returned.
     */
    private Queue<Range> queuedTokens;
    /**
     * A pool of {@link Range} objects available for reuse.
     */
    private Queue<Range> pool;
    /**
     * The current character that we're processing.
     */
    private char c;

    /**
     * Constructs a universal tokenizer.
     *
     * @param sendPunct if sendPunct is true, then the tokenizer will generate punctuation tokens.
     */
    public UniversalTokenizer(boolean sendPunct) {
        super();
        this.sendPunct = sendPunct;
        this.buffer = new char[maxTokenLength];
        this.tokenLength = 0;
        this.state = State.SKIPPING;
        this.queuedTokens = new LinkedList<>();
        this.pool = new LinkedList<>();
    }

    /**
     * Constructs a universal tokenizer which doesn't send punctuation.
     */
    public UniversalTokenizer() {
        this(false);
    }

    /**
     * A quick check for whether a character should be kept in a word or should
     * be removed from the word if it occurs at one of the ends. An
     * approximation of {@link Character#isLetterOrDigit(char)} which is faster
     * and, for tokenization purposes, more correct, since it doesn't count
     * smart quotes as letters.
     *
     * @param c The character to check.
     * @return True if the input is a letter or digit.
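     * <p>
     * For instance (these values follow directly from the checks below):
     * <pre>{@code
     * UniversalTokenizer.isLetterOrDigit('a'); // true - ASCII letter
     * UniversalTokenizer.isLetterOrDigit('7'); // true - ASCII digit
     * UniversalTokenizer.isLetterOrDigit('Ó'); // false - char 211, treated as a smart quote,
     *                                          // though Character.isLetterOrDigit returns true
     * }</pre>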
     */
    public static boolean isLetterOrDigit(char c) {
        if ((c <= 122 && c >= 97)      // most frequent: lowercase a...z
                || (c <= 90 && c >= 65)    // frequent: uppercase A...Z
                || (c <= 57 && c >= 48)) { // frequent: numbers 0...9
            return true;
        } else if ((c <= 96)                 // includes whitespace
                || (c == 210 || c == 211)    // (smart quotes)
                || (c >= 123 && c <= 127)) { // {|}~DEL
            return false;
        } else if (c >= 3021 && c <= 3029) { // Hangzhou-style numerals
            return true;
        } else {
            return Character.isLetterOrDigit(c);
        }
    }

    /**
     * A quick check for whether a character is a digit.
     *
     * @param c The character to check
     * @return True if the input is a digit.
     */
    public static boolean isDigit(char c) {
        if (c <= 57 && c >= 48) { // most frequent: ASCII numbers 0...9
            return true;
        } else if (c <= 255) {
            return false;
        } else {
            return Character.isDigit(c);
        }
    }

    /**
     * A quick check for whether a character is whitespace.
     *
     * @param c The character to check
     * @return True if the input is a whitespace character.
     */
    public static boolean isWhitespace(char c) {
        // Test for whitespace.
        if ((c == 32)                  // Space
                || (c <= 13 && c >= 9) // Tab, Linefeed, Vertical Tab, Form Feed, Carriage Return
                || (c <= 4 && c >= 1)) { // SOH, STX, ETX, EOT
            return true;
        } else if (c <= 255) {
            return false;
        } else {
            return Character.isWhitespace(c);
        }
    }

    /**
     * A quick check for a character in a language that may not separate words
     * with whitespace (includes Arabic, CJK, and Thai). Uses Unicode Standard
     * Version 2.0.
     *
     * @param c The character to check
     * @return True if the input character is in a region which is not whitespace separated.
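     * <p>
     * For instance (these values follow directly from the range checks below):
     * <pre>{@code
     * UniversalTokenizer.isNgram('非'); // true - CJK ideograph
     * UniversalTokenizer.isNgram('ا');  // true - Arabic letter
     * UniversalTokenizer.isNgram('ア'); // false - Hiragana and Katakana are excluded
     * UniversalTokenizer.isNgram('a');  // false - ASCII letter
     * }</pre>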
     */
    public static boolean isNgram(char c) {
        // Test for characters that may not separate words with white
        // space and therefore require bigram treatment.
        // Uses Unicode Standard Version 2.0.
        if (c > '\u3002' && c <= '\uD7FF') { // (CJK Characters)
            return (c < '\u3040' || c > '\u30FF'); // - Hiragana and Katakana
        } else if ((c >= '\u0600' && c <= '\u06FF') || // (Arabic)
                (c >= '\uF900' && c <= '\uFAFF') || // (CJK Compatibility Ideographs)
                (c >= '\u1100' && c <= '\u11FF') || // (Hangul Jamo)
                (c >= '\uFB50' && c <= '\uFE2F') || // (Arabic Presentation Forms-A)
                (c >= '\uFE30' && c <= '\uFE4F') || // (CJK Compatibility Forms)
                (c >= '\uFE70' && c <= '\uFEFF') || // (Arabic Presentation Forms-B)
                (c >= '\uFF60' && c <= '\uFFDF') || // (CJK Half Width Forms)
                (c >= '\u0E00' && c <= '\u0E7F') || // (Thai)
                (c >= '\u0E80' && c <= '\u0EFF') || // (Lao)
                (c >= '\u0F00' && c <= '\u0FBF') || // (Tibetan)
                (c >= '\u0B80' && c <= '\u0BFF') || // (Tamil)
                (c >= '\u0C00' && c <= '\u0C7F') || // (Telugu)
                (c >= '\u0C80' && c <= '\u0CFF') || // (Kannada)
                (c >= '\u0D00' && c <= '\u0D7F') || // (Malayalam)
                (c >= '\u10A0' && c <= '\u10FF')) { // (Georgian)
            return true;
        } else {
            return false;
        }
    }

    /**
     * Returns true if unigram tokens are generated for each 'ngram' character.
     *
     * @return True if unigram generation is enabled.
     */
    public boolean isGenerateUnigrams() {
        return generateUnigrams;
    }

    /**
     * Sets whether unigram tokens are generated for each 'ngram' character.
     *
     * @param generateUnigrams Should unigrams be generated.
     */
    public void setGenerateUnigrams(boolean generateUnigrams) {
        this.generateUnigrams = generateUnigrams;
    }

    /**
     * Returns true if character bigram tokens are generated for runs of 'ngram' characters.
     *
     * @return True if bigram generation is enabled.
     */
    public boolean isGenerateNgrams() {
        return generateNgrams;
    }

    /**
     * Sets whether character bigram tokens are generated for runs of 'ngram' characters.
     *
     * @param generateNgrams Should bigrams be generated.
     */
    public void setGenerateNgrams(boolean generateNgrams) {
        this.generateNgrams = generateNgrams;
    }

    /**
     * Returns the maximum token length this tokenizer will generate.
     *
     * @return The maximum token length.
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Sets the maximum token length this tokenizer will generate.
     *
     * @param maxTokenLength The maximum token length.
     */
    public void setMaxTokenLength(int maxTokenLength) {
        this.maxTokenLength = maxTokenLength;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    @Override
    public final boolean advance() {
        if (cs == null) {
            throw new IllegalStateException("UniversalTokenizer has not been reset.");
        }

        // Do we have tokens queued up to go?
        if (!queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }

        // If we've already read the data, then we're done.
        if (eofReached) {
            return false;
        }

        // Read characters until we have one or more tokens to send.
        while (pos < cs.length()) {
            c = cs.charAt(pos);
            handleChar();
            pos++;
            if (!queuedTokens.isEmpty()) {
                handleQueued();
                return true;
            }
        }

        eofReached = true;
        makeTokens();
        if (!queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }
        return false;
    }

    private void handleQueued() {
        ready = true;
        Range range = queuedTokens.poll();
        currToken = new String(range.buff, 0, range.len);
        startOffset = range.start;
        endOffset = range.end;
        if (firstToken && range.incr == 0) {
            range.incr = 1;
            firstToken = false;
        }
        currType = range.type;
        currPos = range.incr;
        pool.offer(range);
    }

    /**
     * Handle a character to add to the token buffer.
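     * <p>
     * As an illustrative trace of the state machine below (under the default
     * settings), tokenizing "ab 非常" proceeds as:
     * <pre>
     * 'a'  : SKIPPING to COLLECTING, buffer = "a"
     * 'b'  : COLLECTING, buffer = "ab"
     * ' '  : emit WORD token "ab", then SKIPPING
     * '非' : SKIPPING to NGRAM, buffer = "非"
     * '常' : NGRAM, buffer = "非常"
     * end  : emit tokens 非, 非常, 常
     * </pre>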
     */
    protected void handleChar() {
        // ASCII letters.
        if ((c >= 97 && c <= 122) || (c >= 65 && c <= 90)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        // ASCII space. We need to treat other whitespace differently, depending
        // on whether we're ngram tokenizing.
        if (c == 32) {
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting or n-gram to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if (isNgram(c)) {
            // CJK characters (Chinese, Japanese, Korean) and others
            // to be tokenized with bigram tokens.
            // (This test is placed here so these languages tokenize
            // more efficiently, and it doesn't cost much for non-CJK
            // languages.)
            switch (state) {
                case SKIPPING:
                    state = State.NGRAM;
                    break;
                case COLLECTING:
                    makeTokens();
                    state = State.NGRAM;
                    break;
                case NGRAM:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        if (c == 0 || (state == State.NGRAM && (c >= 10 && c <= 13))) {
            // While processing ngram character regions, the control characters
            // Linefeed, Vertical Tab, Form Feed, and Carriage Return
            // don't do anything, so just return.
            return;
        }

        if (isWhitespace(c)) {
            // The rest of the whitespace characters cause a break:
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if ((c >= 48 && c <= 57) || (c > 255 && Character.isDigit(c))) {
            // The digits.
            switch (state) {
                case SKIPPING:
                    state = State.COLLECTING;
                    break;
                case NGRAM:
                    makeTokens();
                    state = State.COLLECTING;
                    break;
                case COLLECTING:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        // Any other letter or digit.
        if (isLetterOrDigit(c)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        // Anything other than the above cases causes a break.
        if (state != State.SKIPPING) {
            makeTokens();
        }
        sendPunct();
        state = State.SKIPPING;
    }

    private void sendPunct() {
        if (sendPunct && !isWhitespace(c)) {
            Range r = getRange();
            r.punct(c, pos);
            queuedTokens.add(r);
        }
    }

    /**
     * Add a character to the buffer that we're building for a token.
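     * <p>
     * When the buffer reaches {@link #maxTokenLength} the collected characters
     * are emitted as a token immediately. As a sketch of this length cap:
     * <pre>{@code
     * UniversalTokenizer tokenizer = new UniversalTokenizer();
     * StringBuilder sb = new StringBuilder();
     * for (int i = 0; i < 300; i++) {
     *     sb.append('a');
     * }
     * tokenizer.reset(sb);
     * // The first advance() yields a 256 character token,
     * // the second advance() yields the remaining 44 characters.
     * }</pre>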
     */
    protected void addChar() {
        // First see if the token buffer needs to be expanded.
        // Note: tokenLength points to the next unused slot in the buffer.
        if (buffer.length <= tokenLength) {
            buffer = Arrays.copyOf(buffer, tokenLength + 32);
        }

        if (tokenLength == 0) {
            start = pos;
        }
        buffer[tokenLength++] = c;

        if (tokenLength >= maxTokenLength) {
            makeTokens();
        }
    }

    @Override
    public int getStart() {
        if (ready) {
            return startOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public int getEnd() {
        if (ready) {
            return endOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public String getText() {
        if (ready) {
            return currToken;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public Token.TokenType getType() {
        if (ready) {
            return currType;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    /**
     * Returns the current word position of the token.
     *
     * @return The current word position.
     */
    public int getPos() {
        return currPos;
    }

    @Override
    public Tokenizer clone() {
        try {
            UniversalTokenizer copy = (UniversalTokenizer) super.clone();

            copy.buffer = new char[maxTokenLength];
            copy.tokenLength = 0;
            copy.state = State.SKIPPING;
            copy.pool = new LinkedList<>();
            copy.queuedTokens = new LinkedList<>();
            copy.currToken = null;
            copy.ready = false;
            copy.cs = null;

            return copy;
        } catch (CloneNotSupportedException e) {
            throw new AssertionError("UniversalTokenizer is Cloneable, but clone call failed");
        }
    }

    /**
     * Resets the state of the tokenizer to a clean slate.
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        pos = 0;
        tokenLength = 0;
        start = -1;
        state = State.SKIPPING;
        eofReached = false;
        firstToken = true;
        c = 0;
        startOffset = -1;
        endOffset = -1;
        currToken = null;
        ready = false;
    }

    private Range getRange() {
        if (pool.isEmpty()) {
            return new Range();
        }
        return pool.remove();
    }

    /**
     * Make one or more tokens from our current collected characters.
     */
    protected void makeTokens() {
        // Don't generate empty tokens.
        if (tokenLength <= 0) {
            return;
        }

        if (state == State.NGRAM) {
            // If we only have one character, then just generate a single
            // token and be done.
            if (tokenLength == 1) {
                Range range = getRange();
                range.set(buffer[0], start);
                queuedTokens.add(range);
                tokenLength = 0;
                return;
            }

            for (int i = 0; i < tokenLength; i++) {
                if (generateUnigrams) {
                    // Generate a unigram for this character.
                    Range range = getRange();
                    range.set(buffer[i], start + i);
                    queuedTokens.add(range);
                }
                if (generateNgrams && i < tokenLength - 1) {
                    // Generate a bigram for this character and the next one.
                    Range range = getRange();
                    range.set(buffer[i], buffer[i + 1], start + i);
                    queuedTokens.add(range);
                }
            }
        } else {
            // Generate one token from the buffer.
            Range range = getRange();
            range.set(buffer, tokenLength, start);
            queuedTokens.add(range);
        }
        tokenLength = 0;
    }

    private enum State {
        SKIPPING,
        COLLECTING,
        NGRAM,
    }

}