001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.impl;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collection;
022import java.util.Collections;
023import java.util.HashMap;
024import java.util.HashSet;
025import java.util.Iterator;
026import java.util.List;
027import java.util.Map;
028import java.util.Objects;
029import java.util.PriorityQueue;
030import java.util.Set;
031import java.util.logging.Logger;
032import java.util.stream.Collectors;
033
034import org.tribuo.Example;
035import org.tribuo.Feature;
036import org.tribuo.FeatureMap;
037import org.tribuo.Output;
038import org.tribuo.VariableInfo;
039import org.tribuo.transform.TransformerMap;
040import org.tribuo.util.Merger;
041
042import com.oracle.labs.mlrg.olcut.util.SortUtil;
043
044/**
045 * An {@link Example} backed by a single array of feature names. This
046 * implementation is modeled after {@link ArrayExample} but does not store
047 * feature values because it assumes only binary features - i.e., features with a
048 * feature value of 1.0. The following methods throw an
049 * {@link UnsupportedOperationException}:
050 * <ul>
051 * <li>{@link #densify(List)}</li>
052 * <li>{@link #densify(FeatureMap)}</li>
053 * <li>{@link #set(Feature)}</li>
054 * <li>{@link #transform(TransformerMap)}</li>
055 * </ul>
056 *
057 * @param <T> The output type.
058 */
059public final class BinaryFeaturesExample<T extends Output<T>> extends Example<T> {
060    private static final long serialVersionUID = 1L;
061
062    private static final Logger logger = Logger.getLogger(BinaryFeaturesExample.class.getName());
063
064    public static final int DEFAULT_SIZE = 10;
065
066    protected String[] featureNames;
067
068    protected int size = 0;
069
070    /**
071     * Constructs an example from an output and a weight, with an initial size for
072     * the feature arrays.
073     * 
074     * @param output      The output.
075     * @param weight      The weight.
076     * @param initialSize The initial size of the feature arrays.
077     */
078    public BinaryFeaturesExample(T output, float weight, int initialSize) {
079        super(output,weight);
080        featureNames = new String[initialSize];
081    }
082
083    /**
084     * Constructs an example from an output, a weight and the metadata.
085     * 
086     * @param output   The output.
087     * @param weight   The weight.
088     * @param metadata The metadata.
089     */
090    public BinaryFeaturesExample(T output, float weight, Map<String,Object> metadata) {
091        super(output,weight,metadata);
092        featureNames = new String[DEFAULT_SIZE];
093    }
094
095    /**
096     * Constructs an example from an output and a weight.
097     * 
098     * @param output The output.
099     * @param weight The example weight.
100     */
101    public BinaryFeaturesExample(T output, float weight) {
102        super(output,weight);
103        featureNames = new String[DEFAULT_SIZE];
104    }
105
106    /**
107     * Constructs an example from an output and the metadata.
108     * 
109     * @param output   The output.
110     * @param metadata The metadata.
111     */
112    public BinaryFeaturesExample(T output, Map<String,Object> metadata) {
113        super(output,metadata);
114        featureNames = new String[DEFAULT_SIZE];
115    }
116
117    /**
118     * Constructs an example from an output.
119     * 
120     * @param output The output.
121     */
122    public BinaryFeaturesExample(T output) {
123        super(output);
124        featureNames = new String[DEFAULT_SIZE];
125    }
126
127    /**
128     * Constructs an example from an output and an array of names. This is currently
129     * the most efficient constructor.
130     * 
131     * @param output The output.
132     * @param names  The feature names.
133     */
134    public BinaryFeaturesExample(T output, String[] names) {
135        super(output);
136        size = names.length;
137        featureNames = Arrays.copyOf(names,names.length);
138        sort();
139    }
140
141    /**
142     * Constructs an example from an output and a list of features. This constructor
143     * will throw an {@link IllegalArgumentException} if any of the features have a
144     * feature value that does not equal 1.0.
145     * 
146     * @param output   The output (e.g., label) of the example
147     * @param features The list of features.
148     */
149    public BinaryFeaturesExample(T output, List<? extends Feature> features) {
150        super(output);
151        size = features.size();
152        featureNames = new String[size];
153
154        int i = 0;
155        for (Feature f : features) {
156            checkIsBinary(f);
157            featureNames[i] = f.getName();
158            i++;
159        }
160
161        sort();
162    }
163
164    /**
165     * Copy constructor. This constructor will throw an
166     * {@link IllegalArgumentException} if any of the features have a feature value
167     * that does not equal 1.0.
168     * 
169     * @param other The example to copy.
170     */
171    public BinaryFeaturesExample(Example<T> other) {
172        super(other);
173        featureNames = new String[other.size()];
174        for (Feature f : other) {
175            checkIsBinary(f);
176            featureNames[size] = f.getName();
177            size++;
178        }
179    }
180
181    /**
182     * Clones an example's features, but uses the supplied output and weight.
183     * @param output The output to use.
184     * @param other The features to use.
185     * @param weight The weight to use.
186     * @param <U> The output type of the other example.
187     */
188    public <U extends Output<U>> BinaryFeaturesExample(T output, Example<U> other, float weight) {
189        super(output,weight);
190        featureNames = new String[other.size()];
191        for (Feature f : other) {
192            checkIsBinary(f);
193            featureNames[size] = f.getName();
194            size++;
195        }
196    }
197
198    /**
199     * Adds a single feature with a value of 1.
200     * @param name The name of the feature.
201     */
202    public void add(String name) {
203        if (size >= featureNames.length) {
204            growArray();
205        }
206        //
207        // TODO: find the right insertion position, System.arraycopy
208        // everything up one and then write the new value.
209        featureNames[size] = name;
210        size++;
211        sort();
212    }
213
214    public static boolean isBinary(Feature feature) {
215        return feature.getValue() == 1.0;
216    }
217
218    public static void checkIsBinary(Feature feature) {
219        if(!isBinary(feature)) {
220            throw new IllegalArgumentException("non-binary features are not allowed in BinaryFeaturesExample: value="+feature.getValue());
221        }
222    }
223
224    /**
225     * Adds a feature to this example. This method will throw an {@link IllegalArgumentException} if
226     * any of the features have a feature value that does not equal 1.0.
227     * 
228     * @param feature The feature to add to this example.
229     */
230    @Override
231    public void add(Feature feature) {
232        checkIsBinary(feature);
233        add(feature.getName());
234    }
235
236    /**
237     * Adds a collection of features to this example. This method will throw an
238     * {@link IllegalArgumentException} if any of the features have a feature value
239     * that does not equal 1.0.
240     * 
241     * @param features The features to add to this example.
242     */
243    @Override
244    public void addAll(Collection<? extends Feature> features) {
245        if (size + features.size() >= featureNames.length) {
246            growArray(size+features.size());
247        }
248        for (Feature f : features) {
249            checkIsBinary(f);
250            featureNames[size] = f.getName();
251            size++;
252        }
253        sort();
254    }
255
256    /**
257     * Grows the backing arrays storing the names.
258     * @param minCapacity The new minimum capacity required.
259     */
260    protected void growArray(int minCapacity) {
261        int newCapacity = newCapacity(minCapacity);
262        featureNames = Arrays.copyOf(featureNames,newCapacity);
263    }
264
265    /**
266     * Grows the backing arrays by size+1.
267     */
268    protected void growArray() {
269        growArray(size+1);
270    }
271
272    /**
273     * Returns a capacity at least as large as the given minimum capacity.
274     * Returns the current capacity increased by 50% if that suffices.
275     * Will not return a capacity greater than MAX_ARRAY_SIZE unless
276     * the given minimum capacity is greater than MAX_ARRAY_SIZE.
277     *
278     * @param minCapacity the desired minimum capacity
279     * @throws OutOfMemoryError if minCapacity is less than zero
280     * @return The new capacity.
281     */
282    protected int newCapacity(int minCapacity) {
283        // overflow-conscious code
284        int oldCapacity = featureNames.length;
285        int newCapacity = oldCapacity + (oldCapacity >> 1);
286        if (newCapacity - minCapacity <= 0) {
287            if (minCapacity < 0) {
288                // overflow
289                throw new OutOfMemoryError();
290            }
291            return minCapacity;
292        }
293        return newCapacity;
294    }
295
296    /**
297     * Sorts the feature list to maintain the lexicographic order invariant.
298     */
299    @Override
300    protected void sort() {
301        int[] sortedIndices = SortUtil.argsort(featureNames,0,size,true);
302
303        String[] newNames = Arrays.copyOf(featureNames,size);
304        for (int i = 0; i < sortedIndices.length; i++) {
305            featureNames[i] = newNames[sortedIndices[i]];
306        }
307    }
308
309    @Override
310    public int size() {
311        return size;
312    }
313
314    @Override
315    public void removeFeatures(List<Feature> featureList) {
316        Map<String,List<Integer>> map = new HashMap<>();
317        for (int i = 0; i < featureNames.length; i++) {
318            List<Integer> list = map.computeIfAbsent(featureNames[i],(k) -> new ArrayList<>());
319            list.add(i);
320        }
321
322        PriorityQueue<Integer> removeQueue = new PriorityQueue<>();
323        for (Feature f : featureList) {
324            List<Integer> i = map.get(f.getName());
325            if (i != null) {
326                // If we've found this feature remove it from the map to prevent double counting
327                map.remove(f.getName());
328                removeQueue.addAll(i);
329            }
330        }
331
332        String[] newNames = new String[size-removeQueue.size()];
333
334        int source = 0;
335        int dest = 0;
336        while (!removeQueue.isEmpty()) {
337            int curRemoveIdx = removeQueue.poll();
338            while (source < curRemoveIdx) {
339                newNames[dest] = featureNames[source];
340                source++;
341                dest++;
342            }
343            source++;
344        }
345        while (source < size) {
346            newNames[dest] = featureNames[source];
347            source++;
348            dest++;
349        }
350        featureNames = newNames;
351        size = featureNames.length;
352    }
353
354    @Override
355    public boolean validateExample() {
356        if (size == 0) {
357            return false;
358        } else {
359            Set<String> names = new HashSet<>(Arrays.asList(featureNames).subList(0, size));
360            return names.size() == size;
361        }
362    }
363
364    @Override
365    public BinaryFeaturesExample<T> copy() {
366        return new BinaryFeaturesExample<>(this);
367    }
368
369    @Override
370    public Feature lookup(String i) {
371        int index = Arrays.binarySearch(featureNames,0,size,i);
372        if (index < 0) {
373            return null;
374        } else {
375            return new Feature(featureNames[index], 1.0);
376        }
377    }
378
379    @Override
380    public void set(Feature feature) {
381        throw new UnsupportedOperationException();
382    }
383
384    @Override
385    public void transform(TransformerMap transformerMap) {
386        throw new UnsupportedOperationException();
387    }
388
389    /**
390     * @param merger this parameter is ignored because we do not store feature values in this class.  So, the 'merging' can be thought of as a logical AND.
391     */
392    @Override
393    public void reduceByName(Merger merger) {
394        if (size > 0) {
395            List<String> newNames = Arrays.stream(featureNames, 0, size).distinct().collect(Collectors.toList());
396            Collections.sort(newNames);
397            featureNames = newNames.toArray(new String[newNames.size()]);
398            size = featureNames.length;
399        } else {
400            logger.finer("Reducing an example with no features.");
401        }
402    }
403
404    @Override
405    public void densify(List<String> featureList) {
406        throw new UnsupportedOperationException();
407    }
408
409    @Override
410    protected void densify(FeatureMap fMap) {
411        throw new UnsupportedOperationException();
412    }
413
414    @Override
415    public Iterator<Feature> iterator() {
416        return new BinaryFeaturesExampleIterator();
417    }
418
419    @Override
420    public String toString() {
421        StringBuilder builder = new StringBuilder();
422
423        builder.append("ArrayExample(numFeatures=");
424        builder.append(size);
425        builder.append(",output=");
426        builder.append(output);
427        builder.append(",weight=");
428        builder.append(weight);
429        if (metadata != null) {
430            builder.append(",metadata=");
431            builder.append(metadata.toString());
432        }
433        builder.append(",features=[");
434        for(int i = 0; i < size; i++) {
435            builder.append('(').append(featureNames[i]).append(", ").append(1.0).append(')');
436            if(i != 0) {
437                builder.append(", ");
438            }
439        }
440        //builder.append(features.toString());
441        builder.append("])");
442
443        return builder.toString();
444    }
445
446    @Override
447    public boolean equals(Object o) {
448        if (this == o) return true;
449        if (!(o instanceof BinaryFeaturesExample)) return false;
450        BinaryFeaturesExample<?> that = (BinaryFeaturesExample<?>) o;
451        if (Objects.equals(metadata,that.metadata) && output.getClass().equals(that.output.getClass())) {
452            @SuppressWarnings("unchecked") //guarded by a getClass.
453            boolean outputTest = output.fullEquals((T)that.output);
454            if(outputTest && size == that.size) {
455                //we do not use Arrays.equals here because these are "backing arrays" which could be different sizes 
456                for(int i=0; i<size; i++) {
457                    if(!this.featureNames[i].equals(that.featureNames[i])) return false;
458                }
459                return true;
460            }
461            return false;
462        } else {
463            return false;
464        }
465    }
466
467    @Override
468    public int hashCode() {
469        int result = Objects.hash(size);
470        result = 31 * result + output.hashCode();
471        //we don't use Arrays.hashCode here because featureNames 
472        //is a backing array and its length could be arbitrarily diverging 
473        //from the member size.  
474        for(int i=0; i<size; i++) {
475            result = 31 * result + featureNames[i].hashCode();
476        }
477        return result;
478    }
479
480    class BinaryFeaturesExampleIterator implements Iterator<Feature> {
481        int pos = 0;
482
483        @Override
484        public boolean hasNext() {
485            return pos < size;
486        }
487
488        @Override
489        public Feature next() {
490            Feature f = new Feature(featureNames[pos], 1.0);
491            pos++;
492            return f;
493        }
494    }
495
496    @Override
497    public void canonicalize(FeatureMap featureMap) {
498        for(int i=0; i< featureNames.length; i++) {
499            VariableInfo vi = featureMap.get(featureNames[i]);
500            if(vi != null) {
501                featureNames[i] = vi.getName();
502            }
503        }
504    }
505}