/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.text.impl;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.Feature;
import org.tribuo.data.text.FeatureTransformer;
import org.tribuo.util.MurmurHash3;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;

/**
 * Hashes the feature names to reduce the dimensionality.
 * <p>
 * Every input feature is replaced by a feature named {@code tag + "-hash=" + code},
 * where {@code code} is a MurmurHash3 hash of the original feature name reduced
 * modulo the configured dimension. The value of the new feature is {@code 1} or
 * {@code -1}, selected by a second, differently seeded hash of the same name; the
 * value of the input feature is not used.
 */
public class FeatureHasher implements FeatureTransformer {

    private static final Logger logger = Logger.getLogger(FeatureHasher.class.getName());

    /** Seed for the hash that selects the output bucket. */
    private static final int BUCKET_HASH_SEED = 38495;

    /** Seed for the hash that selects the sign of the output value. */
    private static final int SIGN_HASH_SEED = 77777;

    // Not final: the field carries a @Config annotation and the no-arg constructor below leaves it unset.
    @Config(mandatory = true,description="Dimension to map the hash into.")
    private int dimension;

    /**
     * Constructs a feature hasher which maps feature names into {@code dimension} buckets.
     *
     * @param dimension The number of hash buckets, must be at least 1.
     * @throws IllegalArgumentException If {@code dimension} is less than 1. A zero dimension
     *         would otherwise fail later with an {@link ArithmeticException} inside
     *         {@link #map(String, List)}.
     */
    public FeatureHasher(int dimension) {
        if (dimension < 1) {
            throw new IllegalArgumentException("dimension must be positive, found " + dimension);
        }
        this.dimension = dimension;
    }

    /**
     * For olcut.
     */
    private FeatureHasher() {}

    /**
     * Maps each supplied feature onto a hashed feature.
     * <p>
     * The returned list has one entry per input feature, in the same order. Features
     * that land in the same bucket are not merged by this method.
     *
     * @param tag The prefix used when building the hashed feature names.
     * @param features The features to hash.
     * @return A new list of features named {@code tag + "-hash=" + code} with value 1 or -1.
     */
    @Override
    public List<Feature> map(String tag, List<Feature> features) {
        List<Feature> hashedFeatures = new ArrayList<>(features.size());

        for (Feature feature : features) {
            String name = feature.getName();
            int length = name.length();

            int bucketHash = MurmurHash3.murmurhash3_x86_32(name, 0, length, BUCKET_HASH_SEED);
            // The sign is taken from an independently seeded hash rather than
            // from the low bit of the bucket hash.
            int signBit = MurmurHash3.murmurhash3_x86_32(name, 0, length, SIGN_HASH_SEED) & 1;

            // The unsigned shift clears the sign bit, so the remainder below is never negative.
            int code = (bucketHash >>> 1) % dimension;
            int value = signBit == 1 ? 1 : -1;

            hashedFeatures.add(new Feature(tag + "-hash=" + code, value));
        }

        return hashedFeatures;
    }

    /**
     * Builds the provenance for this transformer from this instance and the
     * type string {@code "FeatureTransformer"}.
     *
     * @return The provenance of this object.
     */
    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this,"FeatureTransformer");
    }
}