001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.util.tokens.options;
018
019import com.oracle.labs.mlrg.olcut.config.Option;
020import org.tribuo.util.tokens.Tokenizer;
021import org.tribuo.util.tokens.impl.NonTokenizer;
022import org.tribuo.util.tokens.impl.ShapeTokenizer;
023import org.tribuo.util.tokens.universal.UniversalTokenizer;
024
025import java.util.logging.Logger;
026
027/**
028 * CLI Options for all the tokenizers in the core package.
029 */
030public class CoreTokenizerOptions implements TokenizerOptions {
031
032    private static final Logger logger = Logger.getLogger(CoreTokenizerOptions.class.getName());
033    public BreakIteratorTokenizerOptions breakIteratorOptions;
034    public SplitCharactersTokenizerOptions splitCharactersTokenizerOptions;
035    public SplitPatternTokenizerOptions splitPatternTokenizerOptions;
036    @Option(longName = "core-tokenizer-type", usage = "Type of tokenizer")
037    public CoreTokenizerType coreTokenizerType = CoreTokenizerType.SPLIT_CHARACTERS;
038
039    @Override
040    public Tokenizer getTokenizer() {
041        Tokenizer tokenizer;
042        logger.info("Using " + coreTokenizerType);
043        switch (coreTokenizerType) {
044            case BREAK_ITERATOR:
045                tokenizer = breakIteratorOptions.getTokenizer();
046                break;
047            case SPLIT_CHARACTERS:
048                tokenizer = splitCharactersTokenizerOptions.getTokenizer();
049                break;
050            case NON:
051                tokenizer = new NonTokenizer();
052                break;
053            case SHAPE:
054                tokenizer = new ShapeTokenizer();
055                break;
056            case SPLIT_PATTERN:
057                tokenizer = splitPatternTokenizerOptions.getTokenizer();
058                break;
059            case UNIVERSAL:
060                tokenizer = new UniversalTokenizer();
061                break;
062            default:
063                throw new IllegalArgumentException("Unknown tokenizer " + coreTokenizerType);
064        }
065        return tokenizer;
066    }
067
068    public enum CoreTokenizerType {
069        BREAK_ITERATOR, SPLIT_CHARACTERS, NON, SHAPE, SPLIT_PATTERN, UNIVERSAL
070    }
071
072}