001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens;
018
/**
 * One token produced by splitting up a String.
 * <p>
 * Every field is {@code final}, so a token cannot be modified after construction.
 */
public class Token {

    /**
     * The token text.
     */
    public final String text;

    /**
     * The character offset at which the token starts.
     */
    public final int start;

    /**
     * The character offset at which the token ends.
     */
    public final int end;

    /**
     * The kind of token this is.
     */
    public final TokenType type;

    /**
     * Builds a token with an explicit type.
     * @param text  expected to match the substring of the originally tokenized
     *              text delimited by the character offsets start and end
     * @param start the offset at which the token starts
     * @param end   the offset at which the token ends; {@link #length()} is
     *              computed as {@code end - start}, which is the character
     *              count only when this offset is exclusive
     * @param type  the kind of token
     */
    public Token(String text, int start, int end, TokenType type) {
        this.text = text;
        this.start = start;
        this.end = end;
        this.type = type;
    }

    /**
     * Builds a token whose type is {@link TokenType#WORD}.
     * @param text  expected to match the substring of the originally tokenized
     *              text delimited by the character offsets start and end
     * @param start the offset at which the token starts
     * @param end   the offset at which the token ends; {@link #length()} is
     *              computed as {@code end - start}, which is the character
     *              count only when this offset is exclusive
     */
    public Token(String text, int start, int end) {
        this(text, start, end, TokenType.WORD);
    }

    /**
     * The span covered by this token, measured in characters.
     * @return The end offset minus the start offset.
     */
    public int length() {
        return end - start;
    }

    /**
     * Renders the token as its text followed by the type and both offsets,
     * in the form {@code text[type=TYPE,start,end]}.
     * @return The String form of this token.
     */
    @Override
    public String toString() {
        StringBuilder rendered = new StringBuilder();
        rendered.append(text);
        rendered.append("[type=").append(type);
        rendered.append(',').append(start);
        rendered.append(',').append(end);
        rendered.append(']');
        return rendered.toString();
    }

    /**
     * The kinds of token a tokenizer may produce. Which kinds are emitted
     * depends on the application. For example, when a document is processed
     * for highlighting during querying, whitespace and punctuation have to be
     * passed through so that the document looks as it did in its original
     * form. Most tokenizer applications only emit word tokens.
     */
    public enum TokenType {
        /**
         * A word token; the type used by the three-argument {@link Token} constructor.
         */
        WORD,
        /**
         * An n-gram token.
         */
        NGRAM,
        /**
         * A punctuation token.
         */
        PUNCTUATION,
        /**
         * A whitespace token.
         */
        WHITESPACE
    }

}