001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.util.tokens.Token;
023import org.tribuo.util.tokens.Tokenizer;
024
025import java.text.BreakIterator;
026import java.util.Locale;
027
028/**
029 * A tokenizer wrapping a {@link BreakIterator} instance.
030 */
031public class BreakIteratorTokenizer implements Tokenizer {
032
033    @Config(mandatory = true)
034    private String localeStr;
035
036    private Locale locale;
037
038    private BreakIterator breakIterator;
039
040    private CharSequence cs;
041
042    private int start;
043
044    private int startOffset;
045    private int endOffset;
046
047    private String token;
048
049    private boolean ready;
050
051    /**
052     * Default constructor for configuration system.
053     */
054    @SuppressWarnings("unused")
055    private BreakIteratorTokenizer() {
056    }
057
058    public BreakIteratorTokenizer(Locale locale) {
059        this.locale = locale;
060        this.localeStr = locale.toLanguageTag();
061        breakIterator = BreakIterator.getWordInstance(locale);
062        ready = false;
063        cs = null;
064    }
065
066    /**
067     * Used by the OLCUT configuration system, and should not be called by external code.
068     */
069    @Override
070    public void postConfig() {
071        locale = Locale.forLanguageTag(localeStr);
072        breakIterator = BreakIterator.getWordInstance(locale);
073        ready = false;
074        cs = null;
075    }
076
077    /**
078     * Returns the locale string this tokenizer uses.
079     * @return The locale string.
080     */
081    public String getLanguageTag() {
082        return localeStr;
083    }
084
085    @Override
086    public ConfiguredObjectProvenance getProvenance() {
087        return new ConfiguredObjectProvenanceImpl(this, "Tokenizer");
088    }
089
090    @Override
091    public void reset(CharSequence cs) {
092        this.cs = cs;
093        breakIterator.setText(cs.toString());
094        start = breakIterator.first();
095        startOffset = -1;
096        endOffset = -1;
097        token = null;
098        ready = false;
099    }
100
101    @Override
102    public boolean advance() {
103        if (cs == null) {
104            throw new IllegalStateException("BreakIteratorTokenizer has not been reset.");
105        }
106        int end = breakIterator.next();
107        while (end != BreakIterator.DONE) {
108            token = cs.subSequence(start, end).toString();
109            startOffset = start;
110            endOffset = end;
111            start = end;
112            if (!token.trim().isEmpty()) {
113                ready = true;
114                return true;
115            } else {
116                end = breakIterator.next();
117            }
118        }
119
120        return false;
121    }
122
123    @Override
124    public String getText() {
125        if (ready) {
126            return token;
127        } else {
128            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
129        }
130    }
131
132    @Override
133    public int getStart() {
134        if (ready) {
135            return startOffset;
136        } else {
137            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
138        }
139    }
140
141    @Override
142    public int getEnd() {
143        if (ready) {
144            return endOffset;
145        } else {
146            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
147        }
148    }
149
150    @Override
151    public Token.TokenType getType() {
152        if (ready) {
153            return Token.TokenType.WORD;
154        } else {
155            throw new IllegalStateException("BreakIteratorTokenizer is not ready.");
156        }
157    }
158
159    @Override
160    public BreakIteratorTokenizer clone() {
161        try {
162            BreakIteratorTokenizer copy = (BreakIteratorTokenizer) super.clone();
163            copy.postConfig();
164            return copy;
165        } catch (CloneNotSupportedException e) {
166            throw new AssertionError("BreakIteratorTokenizer is Cloneable, but clone call failed");
167        }
168    }
169}
170