001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
022import org.tribuo.Feature;
023import org.tribuo.data.text.FeatureTransformer;
024import org.tribuo.util.MurmurHash3;
025
026import java.util.ArrayList;
027import java.util.List;
028import java.util.logging.Logger;
029
030/**
031 * Hashes the feature names to reduce the dimensionality.
032 */
033public class FeatureHasher implements FeatureTransformer {
034
035    private static final Logger logger = Logger.getLogger(FeatureHasher.class.getName());
036
037    @Config(mandatory = true,description="Dimension to map the hash into.")
038    private int dimension;
039    
040    public FeatureHasher(int dimension) {
041        this.dimension = dimension;
042    }
043
044    /**
045     * For olcut.
046     */
047    private FeatureHasher() {}
048    
049    @Override
050    public List<Feature> map(String tag, List<Feature> features) {
051
052        List<Feature> hashedFeatures = new ArrayList<>();
053        
054        for (Feature feature : features) {
055            int hash = MurmurHash3.murmurhash3_x86_32(feature.getName(), 0, feature.getName().length(), 38495);
056            //int bit = hash & 1;
057            int bit = MurmurHash3.murmurhash3_x86_32(feature.getName(), 0, feature.getName().length(), 77777) & 1; 
058            hash = hash >>> 1;
059            int code = hash % dimension;
060                        
061            int change = bit == 1 ? 1 : -1;
062
063            Feature newFeature = new Feature(tag + "-hash="+code,change);
064            hashedFeatures.add(newFeature);
065        }
066
067        return hashedFeatures;
068        
069    }
070
071    @Override
072    public ConfiguredObjectProvenance getProvenance() {
073        return new ConfiguredObjectProvenanceImpl(this,"FeatureTransformer");
074    }
075}