001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.impl;
018
019import com.oracle.labs.mlrg.olcut.util.SortUtil;
020import org.tribuo.Example;
021import org.tribuo.Feature;
022import org.tribuo.FeatureMap;
023import org.tribuo.Output;
024import org.tribuo.VariableInfo;
025import org.tribuo.transform.Transformer;
026import org.tribuo.transform.TransformerMap;
027import org.tribuo.util.Merger;
028
029import java.util.ArrayList;
030import java.util.Arrays;
031import java.util.Collection;
032import java.util.HashMap;
033import java.util.HashSet;
034import java.util.Iterator;
035import java.util.List;
036import java.util.Map;
037import java.util.Objects;
038import java.util.PriorityQueue;
039import java.util.Set;
040import java.util.logging.Logger;
041
042/**
043 * An {@link Example} backed by two arrays, one of String and one of double.
044 */
045public class ArrayExample<T extends Output<T>> extends Example<T> {
046    private static final long serialVersionUID = 1L;
047
048    private static final Logger logger = Logger.getLogger(ArrayExample.class.getName());
049
050    public static final int DEFAULT_SIZE = 10;
051
052    protected String[] featureNames;
053
054    protected double[] featureValues;
055
056    protected int size = 0;
057
058    /**
059     * Constructs an example from an output and a weight, with an initial
060     * size for the feature arrays.
061     * @param output The output.
062     * @param weight The weight.
063     * @param initialSize The initial size of the feature arrays.
064     */
065    public ArrayExample(T output, float weight, int initialSize) {
066        super(output,weight);
067        featureNames = new String[initialSize];
068        featureValues = new double[initialSize];
069    }
070
071    /**
072     * Constructs an example from an output, a weight and the metadata.
073     * @param output The output.
074     * @param weight The weight.
075     * @param metadata The metadata.
076     */
077    public ArrayExample(T output, float weight, Map<String,Object> metadata) {
078        super(output,weight,metadata);
079        featureNames = new String[DEFAULT_SIZE];
080        featureValues = new double[DEFAULT_SIZE];
081    }
082
083    /**
084     * Constructs an example from an output and a weight.
085     * @param output The output.
086     * @param weight The example weight.
087     */
088    public ArrayExample(T output, float weight) {
089        super(output,weight);
090        featureNames = new String[DEFAULT_SIZE];
091        featureValues = new double[DEFAULT_SIZE];
092    }
093
094    /**
095     * Constructs an example from an output and the metadata.
096     * @param output The output.
097     * @param metadata The metadata.
098     */
099    public ArrayExample(T output, Map<String,Object> metadata) {
100        super(output,metadata);
101        featureNames = new String[DEFAULT_SIZE];
102        featureValues = new double[DEFAULT_SIZE];
103    }
104
105    /**
106     * Constructs an example from an output.
107     * @param output The output.
108     */
109    public ArrayExample(T output) {
110        super(output);
111        featureNames = new String[DEFAULT_SIZE];
112        featureValues = new double[DEFAULT_SIZE];
113    }
114
115    /**
116     * Constructs an example from an output, an array of names and an array of values.
117     * This is currently the most efficient constructor.
118     * @param output The output.
119     * @param names The feature names.
120     * @param values The feature values.
121     */
122    public ArrayExample(T output, String[] names, double[] values) {
123        super(output);
124        if (names.length != values.length) {
125            throw new IllegalArgumentException("names.length != values.length, names = " + names.length + ", values = " + values.length);
126        }
127
128        size = names.length;
129        featureNames = Arrays.copyOf(names,names.length);
130        featureValues = Arrays.copyOf(values,values.length);
131
132        sort();
133    }
134
135    /**
136     * Constructs an example from an output and a list of features.
137     * @param output The output.
138     * @param features The list of features.
139     */
140    public ArrayExample(T output, List<? extends Feature> features) {
141        super(output);
142        size = features.size();
143        featureNames = new String[size];
144        featureValues = new double[size];
145
146        int i = 0;
147        for (Feature f : features) {
148            featureNames[i] = f.getName();
149            featureValues[i] = f.getValue();
150            i++;
151        }
152
153        sort();
154    }
155
156    /**
157     * Copy constructor.
158     * @param other The example to copy.
159     */
160    public ArrayExample(Example<T> other) {
161        super(other);
162        featureNames = new String[other.size()];
163        featureValues = new double[other.size()];
164        for (Feature f : other) {
165            featureNames[size] = f.getName();
166            featureValues[size] = f.getValue();
167            size++;
168        }
169    }
170
171    /**
172     * Clones an example's features, but uses the supplied output and weight.
173     * @param output The output to use.
174     * @param other The features to use.
175     * @param weight The weight to use.
176     * @param <U> The output type of the other example.
177     */
178    public <U extends Output<U>> ArrayExample(T output, Example<U> other, float weight) {
179        super(output,weight);
180        featureNames = new String[other.size()];
181        featureValues = new double[other.size()];
182        for (Feature f : other) {
183            featureNames[size] = f.getName();
184            featureValues[size] = f.getValue();
185            size++;
186        }
187    }
188
189    /**
190     * Adds a single feature.
191     * @param name The name of the feature.
192     * @param value The value of the feature.
193     */
194    public void add(String name, double value) {
195        if (size >= featureNames.length) {
196            growArray();
197        }
198        //
199        // TODO: find the right insertion position, System.arraycopy
200        // everything up one and then write the new value.
201        featureNames[size] = name;
202        featureValues[size] = value;
203        size++;
204        sort();
205    }
206
207    @Override
208    public void add(Feature feature) {
209        add(feature.getName(),feature.getValue());
210    }
211
212    @Override
213    public void addAll(Collection<? extends Feature> features) {
214        if (size + features.size() >= featureNames.length) {
215            growArray(size+features.size());
216        }
217        for (Feature f : features) {
218            featureNames[size] = f.getName();
219            featureValues[size] = f.getValue();
220            size++;
221        }
222        sort();
223    }
224
225    /**
226     * Grows the backing arrays storing the names and values.
227     * @param minCapacity The new minimum capacity required.
228     */
229    protected void growArray(int minCapacity) {
230        int newCapacity = newCapacity(minCapacity);
231        featureNames = Arrays.copyOf(featureNames,newCapacity);
232        featureValues = Arrays.copyOf(featureValues,newCapacity);
233    }
234
235    /**
236     * Grows the backing arrays by size+1.
237     */
238    protected void growArray() {
239        growArray(size+1);
240    }
241
242    /**
243     * Returns a capacity at least as large as the given minimum capacity.
244     * Returns the current capacity increased by 50% if that suffices.
245     * Will not return a capacity greater than MAX_ARRAY_SIZE unless
246     * the given minimum capacity is greater than MAX_ARRAY_SIZE.
247     *
248     * @param minCapacity the desired minimum capacity
249     * @throws OutOfMemoryError if minCapacity is less than zero
250     * @return The new capacity.
251     */
252    protected int newCapacity(int minCapacity) {
253        // overflow-conscious code
254        int oldCapacity = featureNames.length;
255        int newCapacity = oldCapacity + (oldCapacity >> 1);
256        if (newCapacity - minCapacity <= 0) {
257            if (minCapacity < 0) {
258                // overflow
259                throw new OutOfMemoryError();
260            }
261            return minCapacity;
262        }
263        return newCapacity;
264    }
265
266    /**
267     * Sorts the feature list to maintain the lexicographic order invariant.
268     */
269    @Override
270    protected void sort() {
271        int[] sortedIndices = SortUtil.argsort(featureNames,0,size,true);
272
273        String[] newNames = Arrays.copyOf(featureNames,size);
274        double[] newValues = Arrays.copyOf(featureValues,size);
275        for (int i = 0; i < sortedIndices.length; i++) {
276            featureNames[i] = newNames[sortedIndices[i]];
277            featureValues[i] = newValues[sortedIndices[i]];
278        }
279    }
280
281    /**
282     * Returns a copy of the feature values array at the specific size.
283     * @param newSize The new size.
284     * @return A copy of the feature values.
285     */
286    public double[] copyValues(int newSize) {
287        return Arrays.copyOf(featureValues,newSize);
288    }
289
290    @Override
291    public int size() {
292        return size;
293    }
294
295    @Override
296    public void removeFeatures(List<Feature> featureList) {
297        Map<String,List<Integer>> map = new HashMap<>();
298        for (int i = 0; i < featureNames.length; i++) {
299            List<Integer> list = map.computeIfAbsent(featureNames[i],(k) -> new ArrayList<>());
300            list.add(i);
301        }
302
303        PriorityQueue<Integer> removeQueue = new PriorityQueue<>();
304        for (Feature f : featureList) {
305            List<Integer> i = map.get(f.getName());
306            if (i != null) {
307                // If we've found this feature remove it from the map to prevent double counting
308                map.remove(f.getName());
309                removeQueue.addAll(i);
310            }
311        }
312
313        String[] newNames = new String[size-removeQueue.size()];
314        double[] newValues = new double[size-removeQueue.size()];
315
316        int source = 0;
317        int dest = 0;
318        while (!removeQueue.isEmpty()) {
319            int curRemoveIdx = removeQueue.poll();
320            while (source < curRemoveIdx) {
321                newNames[dest] = featureNames[source];
322                newValues[dest] = featureValues[source];
323                source++;
324                dest++;
325            }
326            source++;
327        }
328        while (source < size) {
329            newNames[dest] = featureNames[source];
330            newValues[dest] = featureValues[source];
331            source++;
332            dest++;
333        }
334        featureNames = newNames;
335        featureValues = newValues;
336        size = featureNames.length;
337    }
338
339    @Override
340    public void reduceByName(Merger merger) {
341        if (size > 0) {
342            int[] sortedIndices = SortUtil.argsort(featureNames,0,size,true);
343            String[] newNames = new String[featureNames.length];
344            double[] newValues = new double[featureNames.length];
345            for (int i = 0; i < sortedIndices.length; i++) {
346                newNames[i] = featureNames[sortedIndices[i]];
347                newValues[i] = featureValues[sortedIndices[i]];
348            }
349            featureNames[0] = newNames[0];
350            featureValues[0] = newValues[0];
351            int dest = 0;
352            for (int i = 1; i < size; i++) {
353                while ((i < size) && newNames[i].equals(featureNames[dest])) {
354                    featureValues[dest] = merger.merge(featureValues[dest], newValues[i]);
355                    i++;
356                }
357                if (i < size) {
358                    dest++;
359                    featureNames[dest] = newNames[i];
360                    featureValues[dest] = newValues[i];
361                }
362            }
363            size = dest + 1;
364        } else {
365            logger.finer("Reducing an example with no features.");
366        }
367    }
368
369    @Override
370    public boolean validateExample() {
371        if (size == 0) {
372            return false;
373        } else {
374            for (int i = 0; i < size; i++) {
375                if (Double.isNaN(featureValues[i])) {
376                    return false;
377                }
378            }
379            Set<String> names = new HashSet<>(Arrays.asList(featureNames).subList(0, size));
380            return names.size() == size;
381        }
382    }
383
384    @Override
385    public ArrayExample<T> copy() {
386        return new ArrayExample<>(this);
387    }
388
389    @Override
390    public Feature lookup(String i) {
391        int index = Arrays.binarySearch(featureNames,0,size,i);
392        if (index < 0) {
393            return null;
394        } else {
395            return new Feature(featureNames[index],featureValues[index]);
396        }
397    }
398
399    @Override
400    public void set(Feature feature) {
401        int index = Arrays.binarySearch(featureNames,0,size,feature.getName());
402        if (index < 0) {
403            throw new IllegalArgumentException("Feature " + feature + " not found in example.");
404        } else {
405            featureValues[index] = feature.getValue();
406        }
407    }
408
409    @Override
410    public void transform(TransformerMap transformerMap) {
411        for (Map.Entry<String,List<Transformer>> e : transformerMap.entrySet()) {
412            int index = Arrays.binarySearch(featureNames,0,size,e.getKey());
413            if (index >= 0) {
414                double value = featureValues[index];
415                for (Transformer t : e.getValue()) {
416                    value = t.transform(value);
417                }
418                featureValues[index] = value;
419            }
420        }
421    }
422
423    @Override
424    public void densify(List<String> featureList) {
425        // Ensure we have enough space.
426        if (featureList.size() > featureNames.length) {
427            growArray(featureList.size());
428        }
429        int insertedCount = 0;
430        int curPos = 0;
431        for (String curName : featureList) {
432            // If we've reached the end of our old feature set, just insert.
433            if (curPos == size) {
434                featureNames[size + insertedCount] = curName;
435                insertedCount++;
436            } else {
437                // Check to see if our insertion candidate is the same as the current feature name.
438                int comparison = curName.compareTo(featureNames[curPos]);
439                if (comparison < 0) {
440                    // If it's earlier, insert it.
441                    featureNames[size + insertedCount] = curName;
442                    insertedCount++;
443                } else if (comparison == 0) {
444                    // Otherwise just bump our pointer, we've already got this feature.
445                    curPos++;
446                }
447            }
448        }
449        // Bump the size up by the number of inserted features.
450        size += insertedCount;
451        // Sort the features
452        sort();
453    }
454
455    @Override
456    public Iterator<Feature> iterator() {
457        return new ArrayExampleIterator();
458    }
459
460    @Override
461    public String toString() {
462        StringBuilder builder = new StringBuilder();
463
464        builder.append("ArrayExample(numFeatures=");
465        builder.append(size);
466        builder.append(",output=");
467        builder.append(output);
468        builder.append(",weight=");
469        builder.append(weight);
470        if (metadata != null) {
471            builder.append(",metadata=");
472            builder.append(metadata.toString());
473        }
474        builder.append(",features=[");
475        for(int i = 0; i < size; i++) {
476            builder.append('(').append(featureNames[i]).append(", ").append(featureValues[i]).append(')');
477            if(i != 0) {
478                builder.append(", ");
479            }
480        }
481        //builder.append(features.toString());
482        builder.append("])");
483
484        return builder.toString();
485    }
486
487    @Override
488    public boolean equals(Object o) {
489        if (this == o) return true;
490        if (!(o instanceof ArrayExample)) return false;
491        ArrayExample<?> that = (ArrayExample<?>) o;
492        if (Objects.equals(metadata,that.metadata) && output.getClass().equals(that.output.getClass())) {
493            @SuppressWarnings("unchecked") //guarded by a getClass.
494            boolean outputTest = output.fullEquals((T)that.output);
495            if(outputTest && size == that.size) {
496                //we do not use Arrays.equals here because these are "backing arrays" which could be different sizes 
497                for(int i=0; i<size; i++) {
498                    if(!this.featureNames[i].equals(that.featureNames[i])) return false;
499                    if(this.featureValues[i] != that.featureValues[i]) return false;
500                }
501                return true;
502            }
503            return false;
504        } else {
505            return false;
506        }
507    }
508
509    @Override
510    public int hashCode() {
511        int result = Objects.hash(size);
512        result = 31 * result + output.hashCode();
513        //we don't use Arrays.hashCode here because featureNames and featureValues
514        //are backing arrays and the length of each could be arbitrarily diverging 
515        //from the member size.  
516        for(int i=0; i<size; i++) {
517            result = 31 * result + featureNames[i].hashCode();
518            result = 31 * result + Double.hashCode(featureValues[i]);
519        }
520        return result;
521    }
522
523    class ArrayExampleIterator implements Iterator<Feature> {
524        int pos = 0;
525
526        @Override
527        public boolean hasNext() {
528            return pos < size;
529        }
530
531        @Override
532        public Feature next() {
533            Feature f = new Feature(featureNames[pos],featureValues[pos]);
534            pos++;
535            return f;
536        }
537    }
538
539    @Override
540    public void canonicalize(FeatureMap featureMap) {
541        for(int i=0; i< featureNames.length; i++) {
542            VariableInfo vi = featureMap.get(featureNames[i]);
543            if(vi != null) {
544                featureNames[i] = vi.getName();
545            }
546        }
547    }
548}