/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens.universal;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

/**
 * This class was originally written for the purpose of document indexing in an
 * information retrieval context (principally used in Sun Labs' Minion search
 * engine). It was refactored here to implement the Tokenizer interface, taking
 * care that the 'ngram' tokens have correct character offsets. This is
 * typically not required in the document indexing context, but is essential
 * in other kinds of text processing / NLP tasks.
 * <p>
 * This tokenizer has some specific behavior in how it handles "ngram"
 * characters - i.e., those characters for which {@link #isNgram(char)} returns
 * true (CJK characters and others). For these characters, it will generate
 * tokens corresponding to character bigrams in addition to tokens corresponding
 * to character unigrams. Most other tokenizers generate tokens with
 * non-overlapping spans, but here the character bigram tokens will overlap
 * with the character unigram tokens.
 * <p>
 * This tokenizer uses bigram tokenization whenever it encounters 'ngram'
 * characters in the CJK range (among others; see {@link #isNgram(char)}). It
 * otherwise tokenizes using punctuation and whitespace separators to separate
 * words. Within runs of 'ngram' characters the tokenizer will generate tokens
 * corresponding to two adjacent characters in addition to tokens corresponding
 * to each character. The tokens corresponding to character bigrams may overlap
 * with the previous and next token. An end-of-line between two 'ngram'
 * characters is ignored (i.e., a character bigram token will still be created).
 * <p>
 * For example, a sequence of three Chinese characters, 非常感, will tokenize as
 * three WORD type tokens: 非, 常, and 感, and two NGRAM type tokens: 非常 and 常感.
 * These tokens have character offsets that correspond to offsets into the
 * original text, listed here:
 * <ul>
 * <li>非[0,1]</li>
 * <li>非常[0,2]</li>
 * <li>常[1,2]</li>
 * <li>常感[1,3]</li>
 * <li>感[2,3]</li>
 * </ul>
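 * <p>
 * A minimal usage sketch of the streaming API defined by this class; the
 * tokens above are printed in the order they are queued internally:
 * <pre>{@code
 * UniversalTokenizer tokenizer = new UniversalTokenizer();
 * tokenizer.reset("非常感");
 * while (tokenizer.advance()) {
 *     System.out.println(tokenizer.getText() + "[" + tokenizer.getStart()
 *             + "," + tokenizer.getEnd() + "]");
 * }
 * }</pre>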
 */
public class UniversalTokenizer implements Tokenizer {

    /**
     * The length of the longest token that we will generate.
     */
    protected int maxTokenLength = 256;
    private boolean eofReached = false;

    /**
     * The character position in the character sequence that we're tokenizing.
     */
    private int pos;

    /**
     * The starting offset of the current buffer in the token stream.
     */
    private int start;

    /**
     * If <code>true</code> then unigrams will be generated for each n-gram
     * character.
     */
    private boolean generateUnigrams = true;

    /**
     * If <code>true</code> then character bigrams will be generated for each n-gram
     * character as defined by {@link #isNgram(char)}.
     */
    private boolean generateNgrams = true;
    /**
     * The state of the tokenizer determined by previous history.
     */
    private State state;
    /**
     * The character sequence that we're currently processing.
     */
    private CharSequence cs;
    /**
     * The buffer for the token that we're building.
     */
    private char[] buffer;
    /**
     * A string representation of the current token.
     */
    private String currToken;
    /**
     * The current type of the token.
     */
    private Token.TokenType currType;
    /**
     * The current word position of the token.
     */
    private int currPos;
    /**
     * The starting offset of the current token.
     */
    private int startOffset;
    /**
     * The ending offset of the current token.
     */
    private int endOffset;
    /**
     * The length of the current token we're building.
     */
    private int tokenLength;
    /**
     * Whether this is the first token.
     */
    private boolean firstToken;
    /**
     * Is the tokenizer ready?
     */
    private boolean ready;
    @Config
    private boolean sendPunct = false;
    /**
     * A queue of tokens that were generated and need to be returned.
     */
    private Queue<Range> queuedTokens;
    /**
     * A pool of Range objects that can be reused.
     */
    private Queue<Range> pool;
    /**
     * The current character that we're processing.
     */
    private char c;

    /**
     * Constructs a universal tokenizer.
     *
     * @param sendPunct if sendPunct is true, then the tokenizer will generate punctuation tokens.
     */
    public UniversalTokenizer(boolean sendPunct) {
        super();
        this.sendPunct = sendPunct;
        this.buffer = new char[maxTokenLength];
        this.tokenLength = 0;
        this.state = State.SKIPPING;
        this.queuedTokens = new LinkedList<>();
        this.pool = new LinkedList<>();
    }

    /**
     * Constructs a universal tokenizer which doesn't send punctuation.
     */
    public UniversalTokenizer() {
        this(false);
    }

    /**
     * A quick check for whether a character should be kept in a word or should
     * be removed from the word if it occurs at one of the ends. An
     * approximation of {@link Character#isLetterOrDigit(char)}, but faster and
     * more correct for this purpose, since it doesn't count smart quotes as
     * letters.
     *
     * @param c The character to check.
     * @return True if the input is a letter or digit.
     */
    public static boolean isLetterOrDigit(char c) {
        if ((c <= 122 && c >= 97)
                || // most frequent: lowercase a...z
                (c <= 90 && c >= 65)
                || // frequent: uppercase A...Z
                (c <= 57 && c >= 48) // frequent: numbers 0...9
        ) {
            return true;
        } else if ((c <= 96)
                || // includes whitespace
                (c == 210 || c == 211)
                || // (smart quotes)
                (c >= 123 && c <= 127) // {|}~DEL
        ) {
            return false;
        } else if (c >= 3021 && c <= 3029) { // Hangzhou-style numerals
            return true;
        } else {
            return Character.isLetterOrDigit(c);
        }
    }

    /**
     * A quick check for whether a character is a digit.
     *
     * @param c The character to check
     * @return True if the input is a digit.
     */
    public static boolean isDigit(char c) {
        if (c <= 57 && c >= 48) { // most frequent: ASCII numbers 0...9
            return true;
        } else if (c <= 255) {
            return false;
        } else {
            return Character.isDigit(c);
        }
    }

    /**
     * A quick check for whether a character is whitespace.
     *
     * @param c The character to check
     * @return True if the input is a whitespace character.
     */
    public static boolean isWhitespace(char c) {
        // Test for whitespace.
        if ((c == 32)
                || // space
                (c <= 13 && c >= 9)
                || // tab, linefeed, vertical tab, formfeed, carriage return
                (c <= 4 && c >= 1) // SOH, STX, ETX, EOT
        ) {
            return true;
        } else if (c <= 255) {
            return false;
        } else {
            return Character.isWhitespace(c);
        }
    }

    /**
     * A quick check for a character in a language that may not separate words
     * with whitespace (includes Arabic, CJK, and Thai). Uses Unicode Standard
     * Version 2.0.
     *
     * @param c The character to check
     * @return True if the input character is in a region which is not whitespace separated.
     */
    public static boolean isNgram(char c) {
        // Test for characters that may not separate words with white
        // space and therefore require bigram treatment.
        // Uses Unicode Standard Version 2.0.
        if (c > '\u3002' && c <= '\uD7FF') {            // (CJK Characters)
            return (c < '\u3040' || c > '\u30FF');      // - except Hiragana and Katakana
        } else if ((c >= '\u0600' && c <= '\u06FF') ||  // (Arabic)
                (c >= '\uF900' && c <= '\uFAFF') || // (CJK Compatibility Ideographs)
                (c >= '\u1100' && c <= '\u11FF') || // (Hangul Jamo)
                (c >= '\uFB50' && c <= '\uFE2F') || // (Arabic Presentation Forms-A)
                (c >= '\uFE30' && c <= '\uFE4F') || // (CJK Compatibility Forms)
                (c >= '\uFE70' && c <= '\uFEFF') || // (Arabic Presentation Forms-B)
                (c >= '\uFF60' && c <= '\uFFDF') || // (CJK Half Width Forms)
                (c >= '\u0E00' && c <= '\u0E7F') || // (Thai)
                (c >= '\u0E80' && c <= '\u0EFF') || // (Lao)
                (c >= '\u0F00' && c <= '\u0FBF') || // (Tibetan)
                (c >= '\u0B80' && c <= '\u0BFF') || // (Tamil)
                (c >= '\u0C00' && c <= '\u0C7F') || // (Telugu)
                (c >= '\u0C80' && c <= '\u0CFF') || // (Kannada)
                (c >= '\u0D00' && c <= '\u0D7F') || // (Malayalam)
                (c >= '\u10A0' && c <= '\u10FF')) { // (Georgian)
            return true;
        } else {
            return false;
        }
    }

    /**
     * Does this tokenizer generate unigram tokens for ngram characters?
     * @return True if unigram tokens are generated.
     */
    public boolean isGenerateUnigrams() {
        return generateUnigrams;
    }

    /**
     * Sets whether unigram tokens are generated for ngram characters.
     * @param generateUnigrams If true, generate unigram tokens.
     */
    public void setGenerateUnigrams(boolean generateUnigrams) {
        this.generateUnigrams = generateUnigrams;
    }

    /**
     * Does this tokenizer generate character bigram tokens for ngram characters?
     * @return True if bigram tokens are generated.
     */
    public boolean isGenerateNgrams() {
        return generateNgrams;
    }

    /**
     * Sets whether character bigram tokens are generated for ngram characters.
     * @param generateNgrams If true, generate bigram tokens.
     */
    public void setGenerateNgrams(boolean generateNgrams) {
        this.generateNgrams = generateNgrams;
    }

    /**
     * Gets the maximum token length this tokenizer will generate.
     * @return The maximum token length.
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Sets the maximum token length this tokenizer will generate. Longer
     * runs of characters are split into tokens of at most this length.
     * @param maxTokenLength The maximum token length.
     */
    public void setMaxTokenLength(int maxTokenLength) {
        this.maxTokenLength = maxTokenLength;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    @Override
    public final boolean advance() {
        if (cs == null) {
            throw new IllegalStateException("UniversalTokenizer has not been reset.");
        }
        //
        // Do we have tokens queued up to go?
        if (!queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }

        //
        // If we've already read the data, then we're done.
        if (eofReached) {
            return false;
        }

        //
        // Read characters until we have one or more tokens to send.
        while (pos < cs.length()) {
            c = cs.charAt(pos);
            handleChar();
            pos++;
            if (!queuedTokens.isEmpty()) {
                handleQueued();
                return true;
            }
        }

        eofReached = true;
        makeTokens();
        if (!queuedTokens.isEmpty()) {
            handleQueued();
            return true;
        }
        return false;
    }

    private void handleQueued() {
        ready = true;
        Range range = queuedTokens.poll();
        currToken = new String(range.buff, 0, range.len);
        startOffset = range.start;
        endOffset = range.end;
        if (firstToken && range.incr == 0) {
            range.incr = 1;
            firstToken = false;
        }
        currType = range.type;
        currPos = range.incr;
        pool.offer(range);
    }

    /**
     * Handle a character to add to the token buffer.
     */
    protected void handleChar() {
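        //
        // The tokenizer moves between three states: SKIPPING (between tokens),
        // COLLECTING (inside a word), and NGRAM (inside a run of 'ngram'
        // characters). Leaving COLLECTING or NGRAM emits the buffered
        // token(s) via makeTokens(); each branch below handles one class of
        // character.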

        //
        // ASCII letters.
        if ((c >= 97 && c <= 122) || (c >= 65 && c <= 90)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        //
        // ASCII space. We need to treat other whitespace differently, depending
        // on whether we're ngram tokenizing.
        if (c == 32) {
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting or ngram to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if (isNgram(c)) {
            // Characters (e.g., Chinese, Japanese, Korean) which are
            // tokenized with bigram tokens.
            // (This test is placed here so these languages tokenize
            // more efficiently, and it doesn't cost much for the non-CJK
            // languages.)
            switch (state) {
                case SKIPPING:
                    state = State.NGRAM;
                    break;
                case COLLECTING:
                    makeTokens();
                    state = State.NGRAM;
                    break;
                case NGRAM:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        if (c == 0 || (state == State.NGRAM && (c >= 10 && c <= 13))) {
            // Ignore NUL, and ignore line breaks (linefeed, vertical tab,
            // formfeed, carriage return) inside a run of ngram characters,
            // so that a character bigram is still formed across the break.
            return;
        }

        if (isWhitespace(c)) {
            // The remaining whitespace characters cause a break:
            switch (state) {
                case COLLECTING:
                case NGRAM:
                    // The transition from collecting to whitespace
                    // causes us to emit tokens.
                    makeTokens();
                    break;
                case SKIPPING:
                    break;
                default:
                    break;
            }
            sendPunct();
            state = State.SKIPPING;
            return;
        }

        if ((c >= 48 && c <= 57) || (c > 255 && Character.isDigit(c))) {
            //
            // The digits.
            switch (state) {
                case SKIPPING:
                    state = State.COLLECTING;
                    break;
                case NGRAM:
                    makeTokens();
                    state = State.COLLECTING;
                    break;
                case COLLECTING:
                    break;
                default:
                    break;
            }
            addChar();
            return;
        }

        //
        // Any other letter or digit.
        if (isLetterOrDigit(c)) {
            if (state == State.NGRAM) {
                makeTokens();
            }
            addChar();
            state = State.COLLECTING;
            return;
        }

        //
        // Anything other than the above cases is a break character.
        if (state != State.SKIPPING) {
            makeTokens();
        }
        sendPunct();
        state = State.SKIPPING;
    }

    private void sendPunct() {
        if (sendPunct && !isWhitespace(c)) {
            Range r = getRange();
            r.punct(c, pos);
            queuedTokens.add(r);
        }
    }

    /**
     * Add a character to the buffer that we're building for a token.
     */
    protected void addChar() {
        //
        // First see if the token buffer needs to be expanded.
        // Note: tokenLength points to the next unused slot in buffer.
        if (buffer.length <= tokenLength) {
            buffer = Arrays.copyOf(buffer, tokenLength + 32);
        }

        if (tokenLength == 0) {
            start = pos;
        }
        buffer[tokenLength++] = c;

        if (tokenLength >= maxTokenLength) {
            makeTokens();
        }
    }

    @Override
    public int getStart() {
        if (ready) {
            return startOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public int getEnd() {
        if (ready) {
            return endOffset;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public String getText() {
        if (ready) {
            return currToken;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    @Override
    public Token.TokenType getType() {
        if (ready) {
            return currType;
        } else {
            throw new IllegalStateException("UniversalTokenizer is not ready.");
        }
    }

    /**
     * Gets the current word position of the token.
     * @return The current word position.
     */
    public int getPos() {
        return currPos;
    }

    @Override
    public Tokenizer clone() {
        try {
            UniversalTokenizer copy = (UniversalTokenizer) super.clone();

            copy.buffer = new char[maxTokenLength];
            copy.tokenLength = 0;
            copy.state = State.SKIPPING;
            copy.pool = new LinkedList<>();
            copy.queuedTokens = new LinkedList<>();
            copy.currToken = null;
            copy.ready = false;
            copy.cs = null;

            return copy;
        } catch (CloneNotSupportedException e) {
            throw new AssertionError("UniversalTokenizer is Cloneable, but clone call failed");
        }
    }

    /**
     * Resets the tokenizer to a clean state, ready to process the supplied
     * character sequence.
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        pos = 0;
        tokenLength = 0;
        start = -1;
        state = State.SKIPPING;
        eofReached = false;
        firstToken = true;
        c = 0;
        startOffset = -1;
        endOffset = -1;
        currToken = null;
        ready = false;
    }

    private Range getRange() {
        if (pool.isEmpty()) {
            return new Range();
        }
        return pool.remove();
    }

    /**
     * Make one or more tokens from our current collected characters.
     */
    protected void makeTokens() {
        //
        // Don't generate empty tokens.
        if (tokenLength <= 0) {
            return;
        }

        if (state == State.NGRAM) {
            // If we only have one character, then just generate a single
            // token and be done.
            if (tokenLength == 1) {
                Range range = getRange();
                range.set(buffer[0], start);
                queuedTokens.add(range);
                tokenLength = 0;
                return;
            }

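            // For a run like 非常感 (buffer = {非, 常, 感}), this loop queues
            // the unigrams 非, 常, 感 and the bigrams 非常, 常感, with offsets
            // computed from the start of the run.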
            for (int i = 0; i < tokenLength; i++) {
                if (generateUnigrams) {
                    // Generate a unigram for this character.
                    Range range = getRange();
                    range.set(buffer[i], start + i);
                    queuedTokens.add(range);
                }
                if (generateNgrams && i < tokenLength - 1) {
                    // Generate a bigram for this character and the next.
                    Range range = getRange();
                    range.set(buffer[i], buffer[i + 1], start + i);
                    queuedTokens.add(range);
                }
            }
        } else {
            // Generate one token from the buffer.
            Range range = getRange();
            range.set(buffer, tokenLength, start);
            queuedTokens.add(range);
        }
        tokenLength = 0;
    }

    /**
     * The states of the tokenizer's character-handling state machine.
     */
    private enum State {
        /** Skipping whitespace or other separator characters. */
        SKIPPING,
        /** Collecting the characters of a word token. */
        COLLECTING,
        /** Collecting a run of ngram characters (see {@link #isNgram(char)}). */
        NGRAM,
    }

}