001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens; 018 019/** 020 * A single token extracted from a String. 021 * <p> 022 * Tokens are immutable. 023 */ 024public class Token { 025 026 public final String text; 027 public final int start; 028 public final int end; 029 public final TokenType type; 030 031 /** 032 * Constructs a token. 033 * @param text should be equivalent to the substring of the original 034 * tokenized text for the given character offsets start and end 035 * @param start the starting offset of the token 036 * @param end the ending offset of the token (exclusive or inclusive?) 037 */ 038 public Token(String text, int start, int end) { 039 this(text, start, end, TokenType.WORD); 040 } 041 042 /** 043 * Constructs a token. 044 * @param text should be equivalent to the substring of the original 045 * tokenized text for the given character offsets start and end 046 * @param start the starting offset of the token 047 * @param end the ending offset of the token (exclusive or inclusive?) 048 * @param type the type of the token 049 */ 050 public Token(String text, int start, int end, TokenType type) { 051 this.text = text; 052 this.start = start; 053 this.end = end; 054 this.type = type; 055 } 056 057 /** 058 * The number of characters in this token. 059 * @return The number of characters. 060 */ 061 public int length() { 062 return this.end - this.start; 063 } 064 065 @Override 066 public String toString() { 067 return this.text + "[type=" + this.type + "," + this.start + "," + this.end + "]"; 068 } 069 070 /** 071 * Tokenizers may product multiple kinds of tokens, depending on the 072 * application to which they're being put. For example, when processing a 073 * document for highlighting during querying, we need to send through 074 * whitespace and punctuation so that the document looks as it did in it's 075 * original form. For most tokenizer applications, they will only send word 076 * tokens. 077 */ 078 public enum TokenType { 079 WORD, 080 NGRAM, 081 PUNCTUATION, 082 WHITESPACE 083 } 084 085}