001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.data.text; 018 019import com.oracle.labs.mlrg.olcut.config.Configurable; 020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance; 021import com.oracle.labs.mlrg.olcut.provenance.Provenancable; 022import org.tribuo.Feature; 023 024import java.util.List; 025 026/** 027 * A TextProcessor takes some text and optionally a feature tag and generates a list of {@link Feature}s from that text. 028 */ 029public interface TextProcessor extends Configurable, Provenancable<ConfiguredObjectProvenance> { 030 031 /** 032 * Extracts features from the supplied text. 033 * @param text The text to extract. 034 * @return The extracted features. 035 * @throws TextProcessingException If an error occurred during extraction (usually from tokenization). 036 */ 037 public List<Feature> process(String text) throws TextProcessingException; 038 039 /** 040 * Extracts features from the supplied text. 041 * @param tag The feature name tag. 042 * @param text The text to extract. 043 * @return The extracted features. 044 * @throws TextProcessingException If an error occurred during extraction (usually from tokenization). 045 */ 046 public List<Feature> process(String tag, String text) throws TextProcessingException; 047 048}