001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text;
018
019import com.oracle.labs.mlrg.olcut.config.Configurable;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.Provenancable;
022import org.tribuo.Feature;
023
024import java.util.List;
025
026/**
027 * A TextProcessor takes some text and optionally a feature tag and generates a list of {@link Feature}s from that text.
028 */
029public interface TextProcessor extends Configurable, Provenancable<ConfiguredObjectProvenance> {
030
031    /**
032     * Extracts features from the supplied text.
033     * @param text The text to extract.
034     * @return The extracted features.
035     * @throws TextProcessingException If an error occurred during extraction (usually from tokenization).
036     */
037    public List<Feature> process(String text) throws TextProcessingException;
038
039    /**
040     * Extracts features from the supplied text.
041     * @param tag The feature name tag.
042     * @param text The text to extract.
043     * @return The extracted features.
044     * @throws TextProcessingException If an error occurred during extraction (usually from tokenization).
045     */
046    public List<Feature> process(String tag, String text) throws TextProcessingException;
047
048}