/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.columnar;

import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.Provenancable;

import java.util.List;

/**
 * An interface for things that process the columns in a data set.
 * <p>
 * Each implementation is bound to a single field name, converts the raw
 * {@link String} value of that field into zero or more {@link ColumnarFeature}s,
 * and reports which {@link GeneratedFeatureType} those features have.
 */
public interface FieldProcessor extends Configurable, Provenancable<ConfiguredObjectProvenance> {

    /**
     * The namespacing separator.
     */
    String NAMESPACE = "#";

    /**
     * The types of generated features.
     */
    enum GeneratedFeatureType {
        /**
         * Categoricals binarised into separate features.
         */
        BINARISED_CATEGORICAL,
        /**
         * Categorical features with the values converted into doubles.
         */
        CATEGORICAL,
        /**
         * Real valued features.
         */
        REAL,
        /**
         * Text features.
         */
        TEXT
    }

    /**
     * Gets the field name this FieldProcessor uses.
     * @return The field name.
     */
    String getFieldName();

    /**
     * Processes the field value and generates a (possibly empty) list of {@link ColumnarFeature}s.
     * @param value The field value to process.
     * @return A list of {@link ColumnarFeature}s.
     */
    List<ColumnarFeature> process(String value);

    /**
     * Returns the feature type this FieldProcessor generates.
     * @return The feature type.
     */
    GeneratedFeatureType getFeatureType();

    /**
     * Binarised categoricals can be namespaced, where the field name is appended with
     * {@code "#<non-negative-int>"} to denote the namespace. This allows one FieldProcessor
     * to emit multiple binarised categoricals from the same field value, provided each
     * emitted feature is in a different namespace. Without this guarantee it's impossible to
     * recover the original categorical distribution before binarisation.
     * <p>
     * If there is only a single namespace, it is omitted from the feature name.
     * <p>
     * The default implementation returns 1.
     * @return The number of namespaces.
     */
    default int getNumNamespaces() {
        return 1;
    }

    /**
     * Returns a copy of this FieldProcessor bound to the supplied newFieldName.
     * @param newFieldName The new field name for the copy.
     * @return A copy of this FieldProcessor.
     */
    FieldProcessor copy(String newFieldName);
}