001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.columnar;
018
019import com.oracle.labs.mlrg.olcut.config.Configurable;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.Provenancable;
022
023import java.util.List;
024
025/**
026 * An interface for things that process the columns in a data set.
027 */
028public interface FieldProcessor extends Configurable, Provenancable<ConfiguredObjectProvenance> {
029
030    /**
031     * The namespacing separator.
032     */
033    public static final String NAMESPACE = "#";
034
035    /**
036     * The types of generated features.
037     */
038    public enum GeneratedFeatureType {
039        /**
040         * Categoricals binarised into separate features.
041         */
042        BINARISED_CATEGORICAL,
043        /**
044         * Categorical features with the values converted into doubles.
045         */
046        CATEGORICAL,
047        /**
048         * Real valued features.
049         */
050        REAL,
051        /**
052         * Text features.
053         */
054        TEXT
055    }
056
057    /**
058     * Gets the field name this FieldProcessor uses.
059     * @return The field name.
060     */
061    public String getFieldName();
062
063    /**
064     * Processes the field value and generates a (possibly empty) list of {@link ColumnarFeature}s.
065     * @param value The field value to process.
066     * @return A list of {@link ColumnarFeature}s.
067     */
068    public List<ColumnarFeature> process(String value);
069
070    /**
071     * Returns the feature type this FieldProcessor generates.
072     * @return The feature type.
073     */
074    public GeneratedFeatureType getFeatureType();
075
076    /**
077     * Binarised categoricals can be namespaced, where the field name is appended with "#&lt;non-negative-int&gt;" to denote the
078     * namespace. This allows one FieldProcessor to emit multiple binarised categoricals from the same field value,
079     * provided each emitted feature is in a different namespace. Without this guarantee it's impossible to
080     * recover the original categorical distribution before binarisation.
081     *
082     * If there is only a single namespace, it is omitted from the feature name.
083     * @return The number of namespaces.
084     */
085    default public int getNumNamespaces() {
086        return 1;
087    }
088
089    /**
090     * Returns a copy of this FieldProcessor bound to the supplied newFieldName.
091     * @param newFieldName The new field name for the copy.
092     * @return A copy of this FieldProcessor.
093     */
094    public FieldProcessor copy(String newFieldName);
095}