/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.util.tokens.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This implementation of {@link Tokenizer} is instantiated with a regular
 * expression pattern which determines how to split a string into tokens. That
 * is, the pattern defines the "splits", not the tokens. For example, to
 * tokenize on white space provide the pattern "\s+".
 *
 * @author Philip Ogren
 */
public class SplitPatternTokenizer implements Tokenizer {

    /**
     * The default split pattern, which is [\.,]?\s+.
     */
    public static final String SIMPLE_DEFAULT_PATTERN = "[\\.,]?\\s+";

    @Config(description="The regex to split with.")
    private String splitPatternRegex = SIMPLE_DEFAULT_PATTERN;

    // Compiled form of splitPatternRegex, (re)built in postConfig().
    private Pattern splitPattern;

    // The character sequence currently being tokenized; null until reset(CharSequence) is called.
    private CharSequence cs;

    // Bounds of the current token within cs (start inclusive, end exclusive).
    private int start;

    private int end;

    // Matcher over cs which locates the splits.
    private Matcher matcher;

    // End offset of the most recent split match, i.e. where the next token begins.
    private int prevMatchEnd;

    // Set once the tail of the input has been consumed.
    private boolean done;

    // True when start/end describe a token that may be read through the getters.
    private boolean ready;

    /**
     * Initializes a tokenizer with the default split pattern [\.,]?\s+
     * (see {@link #SIMPLE_DEFAULT_PATTERN}). The pattern is compiled without
     * any flags.
     */
    public SplitPatternTokenizer() {
        postConfig();
    }

    /**
     * Constructs a splitting tokenizer using the supplied regex.
     * @param splitPatternRegex The regex to use.
     */
    public SplitPatternTokenizer(String splitPatternRegex) {
        this.splitPatternRegex = splitPatternRegex;
        postConfig();
    }

    /**
     * Used by the OLCUT configuration system, and should not be called by external code.
     */
    @Override
    public void postConfig() {
        splitPattern = Pattern.compile(splitPatternRegex);
        ready = false;
        cs = null;
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
    }

    /**
     * Gets the String form of the regex in use.
     * @return The regex.
     */
    public String getSplitPatternRegex() {
        return splitPatternRegex;
    }

    /**
     * Points this tokenizer at a new character sequence and clears all
     * iteration state. No token is available until {@link #advance()} has
     * been called and returned true.
     * @param cs The character sequence to tokenize.
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        matcher = splitPattern.matcher(cs);
        start = -1;
        end = -1;
        prevMatchEnd = 0;
        done = false;
        ready = false;
    }

    /**
     * Moves to the next token, which is the text between the end of the
     * previous split match and the start of the next one (or the end of
     * the input).
     * @return True if a token is available through the getters, false otherwise.
     * @throws IllegalStateException If {@link #reset(CharSequence)} has not been called.
     */
    @Override
    public boolean advance() {
        if (cs == null) {
            throw new IllegalStateException("SplitPatternTokenizer has not been reset.");
        }
        //
        // We've gotten everything. Note that ready is deliberately left
        // untouched here, so it keeps whatever value the final step set.
        if (done) {
            return false;
        }
        if (matcher.find()) {
            //
            // We might get a match at the start of the string, so skip past it
            // and call advance to see if we can find a later match.
            if (matcher.start() == 0) {
                prevMatchEnd = matcher.end();
                return advance();
            }
            //
            // A regular match, so the actual text runs from the end of the
            // previous match to the start of this one.
            start = prevMatchEnd;
            end = matcher.start();
            prevMatchEnd = matcher.end();
            ready = true;
        } else {
            //
            // Handle the end of the string, keeping in mind that the last match
            // might have included the end of the string already.
            start = prevMatchEnd;
            end = cs.length();
            done = true;
            ready = start < end;
        }
        return ready;
    }

    /**
     * Gets the text of the current token.
     * @return The current token's text.
     * @throws IllegalStateException If no token is ready.
     */
    @Override
    public String getText() {
        checkReady();
        return cs.subSequence(start, end).toString();
    }

    /**
     * Gets the start offset (inclusive) of the current token in the input.
     * @return The start offset.
     * @throws IllegalStateException If no token is ready.
     */
    @Override
    public int getStart() {
        checkReady();
        return start;
    }

    /**
     * Gets the end offset (exclusive) of the current token in the input.
     * @return The end offset.
     * @throws IllegalStateException If no token is ready.
     */
    @Override
    public int getEnd() {
        checkReady();
        return end;
    }

    /**
     * Gets the type of the current token, which is always {@link Token.TokenType#WORD}.
     * @return The token type.
     * @throws IllegalStateException If no token is ready.
     */
    @Override
    public Token.TokenType getType() {
        checkReady();
        return Token.TokenType.WORD;
    }

    /**
     * Creates a copy of this tokenizer which uses the same regex. The copy is
     * re-initialized through {@link #postConfig()}, so it must be reset before use.
     * @return A copy of this tokenizer.
     */
    @Override
    public SplitPatternTokenizer clone() {
        try {
            SplitPatternTokenizer copy = (SplitPatternTokenizer) super.clone();
            copy.postConfig(); //ready is set in postConfig.
            return copy;
        } catch (CloneNotSupportedException e) {
            // Keep the original exception as the cause so the failure can be diagnosed.
            throw new AssertionError("SplitPatternTokenizer is Cloneable, but the clone call failed.", e);
        }
    }

    /**
     * Guards the token getters: throws unless a token is currently available.
     */
    private void checkReady() {
        if (!ready) {
            throw new IllegalStateException("SplitPatternTokenizer is not ready.");
        }
    }
}