001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.util.tokens.impl; 018 019import com.oracle.labs.mlrg.olcut.config.Config; 020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl; 022import org.tribuo.util.tokens.Token; 023import org.tribuo.util.tokens.Tokenizer; 024 025import java.text.BreakIterator; 026import java.util.Locale; 027 028/** 029 * A tokenizer wrapping a {@link BreakIterator} instance. 030 */ 031public class BreakIteratorTokenizer implements Tokenizer { 032 033 @Config(mandatory = true) 034 private String localeStr; 035 036 private Locale locale; 037 038 private BreakIterator breakIterator; 039 040 private CharSequence cs; 041 042 private int start; 043 044 private int startOffset; 045 private int endOffset; 046 047 private String token; 048 049 private boolean ready; 050 051 /** 052 * Default constructor for configuration system. 053 */ 054 @SuppressWarnings("unused") 055 private BreakIteratorTokenizer() { 056 } 057 058 public BreakIteratorTokenizer(Locale locale) { 059 this.locale = locale; 060 this.localeStr = locale.toLanguageTag(); 061 breakIterator = BreakIterator.getWordInstance(locale); 062 ready = false; 063 cs = null; 064 } 065 066 /** 067 * Used by the OLCUT configuration system, and should not be called by external code. 068 */ 069 @Override 070 public void postConfig() { 071 locale = Locale.forLanguageTag(localeStr); 072 breakIterator = BreakIterator.getWordInstance(locale); 073 ready = false; 074 cs = null; 075 } 076 077 /** 078 * Returns the locale string this tokenizer uses. 079 * @return The locale string. 080 */ 081 public String getLanguageTag() { 082 return localeStr; 083 } 084 085 @Override 086 public ConfiguredObjectProvenance getProvenance() { 087 return new ConfiguredObjectProvenanceImpl(this, "Tokenizer"); 088 } 089 090 @Override 091 public void reset(CharSequence cs) { 092 this.cs = cs; 093 breakIterator.setText(cs.toString()); 094 start = breakIterator.first(); 095 startOffset = -1; 096 endOffset = -1; 097 token = null; 098 ready = false; 099 } 100 101 @Override 102 public boolean advance() { 103 if (cs == null) { 104 throw new IllegalStateException("BreakIteratorTokenizer has not been reset."); 105 } 106 int end = breakIterator.next(); 107 while (end != BreakIterator.DONE) { 108 token = cs.subSequence(start, end).toString(); 109 startOffset = start; 110 endOffset = end; 111 start = end; 112 if (!token.trim().isEmpty()) { 113 ready = true; 114 return true; 115 } else { 116 end = breakIterator.next(); 117 } 118 } 119 120 return false; 121 } 122 123 @Override 124 public String getText() { 125 if (ready) { 126 return token; 127 } else { 128 throw new IllegalStateException("BreakIteratorTokenizer is not ready."); 129 } 130 } 131 132 @Override 133 public int getStart() { 134 if (ready) { 135 return startOffset; 136 } else { 137 throw new IllegalStateException("BreakIteratorTokenizer is not ready."); 138 } 139 } 140 141 @Override 142 public int getEnd() { 143 if (ready) { 144 return endOffset; 145 } else { 146 throw new IllegalStateException("BreakIteratorTokenizer is not ready."); 147 } 148 } 149 150 @Override 151 public Token.TokenType getType() { 152 if (ready) { 153 return Token.TokenType.WORD; 154 } else { 155 throw new IllegalStateException("BreakIteratorTokenizer is not ready."); 156 } 157 } 158 159 @Override 160 public BreakIteratorTokenizer clone() { 161 try { 162 BreakIteratorTokenizer copy = (BreakIteratorTokenizer) super.clone(); 163 copy.postConfig(); 164 return copy; 165 } catch (CloneNotSupportedException e) { 166 throw new AssertionError("BreakIteratorTokenizer is Cloneable, but clone call failed"); 167 } 168 } 169} 170