001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.regression;
018
019import com.oracle.labs.mlrg.olcut.util.Pair;
020import com.oracle.labs.mlrg.olcut.util.SortUtil;
021import org.tribuo.Output;
022import org.tribuo.OutputInfo;
023import org.tribuo.util.Util;
024
025import java.util.Arrays;
026import java.util.Collections;
027import java.util.HashSet;
028import java.util.Iterator;
029import java.util.List;
030import java.util.Optional;
031import java.util.Set;
032
033/**
034 * An {@link Output} for n-dimensional real valued regression.
035 * <p>
036 * In addition to the regressed values, it may optionally contain
037 * variances. Otherwise the variances are set to {@link Double#NaN}.
038 * </p>
039 * <p>
040 * Within a {@link org.tribuo.DataSource} or {@link org.tribuo.Dataset}
041 * each Regressor must contain the same set of named dimensions. The dimensions stored in a
042 * Regressor are sorted by the natural ordering of their names (i.e., using the String comparator).
043 * This allows the use of direct indexing into the elements.
044 * </p>
045 * <p>
046 * Note {@link Regressor#fullEquals} compares the dimensions, the regressed values and the
047 * variances. However unlike {@link Double#equals}, if the two variances being compared are
048 * set to the sentinel value of {@link Double#NaN}, then they are considered equal.
049 * </p>
050 */
051public class Regressor implements Output<Regressor>, Iterable<Regressor.DimensionTuple> {
052    private static final long serialVersionUID = 1L;
053    public static final double TOLERANCE = 1e-12;
054
055    public static final String DEFAULT_NAME = "DIM";
056
057    private final String[] names;
058
059    private final double[] values;
060
061    private final double[] variances;
062
063    private boolean hashCache = false;
064
065    private int hashCode;
066
067    /**
068     * Constructs a regressor from the supplied named values. Throws {@link IllegalArgumentException}
069     * if the arrays are not all the same size.
070     * @param names The names of the dimensions.
071     * @param values The values of the dimensions.
072     * @param variances The variances of the specified values.
073     */
074    public Regressor(String[] names, double[] values, double[] variances) {
075        if ((names.length != values.length) || (names.length != variances.length)) {
076            throw new IllegalArgumentException("Arrays must be the same length, names.length="+names.length+", values.length="+values.length+",variances.length="+variances.length);
077        }
078        int[] indices = SortUtil.argsort(names,true);
079        this.names = new String[names.length];
080        this.values = new double[values.length];
081        this.variances = new double[variances.length];
082        for (int i = 0; i < indices.length; i++) {
083            this.names[i] = names[indices[i]];
084            this.values[i] = values[indices[i]];
085            this.variances[i] = variances[indices[i]];
086        }
087        Set<String> nameSet = new HashSet<>(Arrays.asList(this.names));
088        if (nameSet.size() != this.names.length) {
089            throw new IllegalArgumentException("Names must all be unique, found " + (this.names.length - nameSet.size()) + " duplicates");
090        }
091    }
092
093    /**
094     * Constructs a regressor from the supplied named values. Uses {@link Double#NaN} as
095     * the variances.
096     * @param names The names of the dimensions.
097     * @param values The values of the dimensions.
098     */
099    public Regressor(String[] names, double[] values) {
100        this(names, values, Util.generateUniformVector(values.length,Double.NaN));
101    }
102
103    /**
104     * Constructs a regressor from the supplied dimension tuples.
105     * @param dimensions The named values to use.
106     */
107    public Regressor(DimensionTuple[] dimensions) {
108        int[] indices = SortUtil.argsort(extractNames(dimensions),true);
109        this.names = new String[dimensions.length];
110        this.values = new double[names.length];
111        this.variances = new double[names.length];
112        for (int i = 0; i < dimensions.length; i++) {
113            DimensionTuple cur = dimensions[indices[i]];
114            names[i] = cur.getName();
115            values[i] = cur.getValue();
116            variances[i] = cur.getVariance();
117        }
118        Set<String> nameSet = new HashSet<>(Arrays.asList(this.names));
119        if (nameSet.size() != this.names.length) {
120            throw new IllegalArgumentException("Names must be unique, found " + (this.names.length - nameSet.size()) + " duplicates");
121        }
122    }
123
124    /**
125     * Constructs a regressor containing a single dimension, using
126     * {@link Double#NaN} as the variance.
127     * @param name The name of the dimension.
128     * @param value The value of the dimension.
129     */
130    public Regressor(String name, double value) {
131        this(name,value,Double.NaN);
132    }
133
134    /**
135     * Constructs a regressor containing a single dimension.
136     * @param name The name of the dimension.
137     * @param value The value of the dimension.
138     * @param variance The variance of this value.
139     */
140    public Regressor(String name, double value, double variance) {
141        this.names = new String[]{name};
142        this.values = new double[]{value};
143        this.variances = new double[]{variance};
144    }
145
146    /**
147     * Returns the number of dimensions in this regressor.
148     * @return The number of dimensions.
149     */
150    public int size() {
151        return names.length;
152    }
153
154    /**
155     * The names of the dimensions. Always sorted by their natural ordering.
156     * @return The names of the dimensions.
157     */
158    public String[] getNames() {
159        return names;
160    }
161
162    /**
163     * Returns the regression values.
164     * @return The regression values.
165     */
166    public double[] getValues() {
167        return values;
168    }
169
170    /**
171     * The variances of the regressed values, if known.
172     *
173     * Returns Double.NaN otherwise.
174     * @return The variance of the regressed values.
175     */
176    public double[] getVariances() {
177        return variances;
178    }
179
180    @Override
181    public String toString() {
182        StringBuilder builder = new StringBuilder();
183
184        for (int i = 0; i < names.length; i++) {
185            builder.append('(');
186            if (Double.isNaN(variances[i])) {
187                builder.append(names[i]);
188                builder.append(',');
189                builder.append(values[i]);
190            } else {
191                builder.append(names[i]);
192                builder.append(',');
193                builder.append(values[i]);
194                builder.append(",var=");
195                builder.append(variances[i]);
196            }
197            builder.append("),");
198        }
199
200        builder.deleteCharAt(builder.length()-1);
201
202        return builder.toString();
203    }
204
205    /**
206     *  Returns a dimension tuple for the requested dimension, or optional empty if
207     *  it's not valid.
208     * @param name The dimension name.
209     * @return A tuple representing that dimension.
210     */
211    public Optional<DimensionTuple> getDimension(String name) {
212        int i = 0;
213        while (i < names.length) {
214            if (names[i].equals(name)) {
215                return Optional.of(new DimensionTuple(name, values[i], variances[i]));
216            }
217            i++;
218        }
219        return Optional.empty();
220    }
221
222    @Override
223    public Iterator<DimensionTuple> iterator() {
224        return new RegressorIterator();
225    }
226
227    @Override
228    public Regressor copy() {
229        return new Regressor(names,Arrays.copyOf(values,values.length),Arrays.copyOf(variances,variances.length));
230    }
231
232    @Override
233    public String getSerializableForm(boolean includeConfidence) {
234        StringBuilder builder = new StringBuilder();
235        for (int i = 0; i < names.length; i++) {
236            builder.append(names[i]);
237            builder.append('=');
238            builder.append(values[i]);
239            if (includeConfidence && !Double.isNaN(variances[i])) {
240                builder.append('\u00B1');
241                builder.append(variances[i]);
242            }
243            builder.append(',');
244        }
245        builder.deleteCharAt(builder.length()-1);
246        return builder.toString();
247    }
248
249    @Override
250    public boolean fullEquals(Regressor other) {
251        if (!Arrays.equals(names,other.names)) {
252            return false;
253        } else {
254            for (int i = 0; i < values.length; i++) {
255                if (Math.abs(values[i] - other.values[i]) > TOLERANCE) {
256                    return false;
257                } else {
258                    double ourVar = variances[i];
259                    double otherVar = other.variances[i];
260                    if ((Math.abs(ourVar-otherVar) > TOLERANCE) || (Double.isNaN(ourVar) ^ Double.isNaN(otherVar))) {
261                        return false;
262                    }
263                }
264            }
265            return true;
266        }
267    }
268
269    /**
270     * Regressors are equal if they have the same number of dimensions and equal dimension names.
271     *
272     * @param o An object.
273     * @return True if Object is a Regressor with the same dimension names, false otherwise.
274     */
275    @Override
276    public boolean equals(Object o) {
277        if (this == o) {
278            return true;
279        } else if (o instanceof Regressor) {
280            return Arrays.deepEquals(names,((Regressor)o).names);
281        } else {
282            return false;
283        }
284    }
285
286    /**
287     * Regressor's hashcode is based on the hash of the dimension names.
288     * <p>
289     * It's cached on first access.
290     * @return A hashcode.
291     */
292    @Override
293    public synchronized int hashCode() {
294        if (!hashCache) {
295            hashCode = 11;
296            for (int i = 0; i < names.length; i++) {
297                hashCode ^= names[i].hashCode();
298            }
299            hashCache = true;
300        }
301        return hashCode;
302    }
303
304    /**
305     * Returns a comma separated list of the dimension names.
306     * @return The dimension names comma separated.
307     */
308    public String getDimensionNamesString() {
309        return getDimensionNamesString(',');
310    }
311
312    /**
313     * Returns a delimiter separated list of the dimension names.
314     * @param separator The separator to use.
315     * @return The dimension names.
316     */
317    public String getDimensionNamesString(char separator) {
318        return String.join(""+separator,names);
319    }
320
321    /**
322     * Extracts a String array of each dimension name from an array of DimensionTuples.
323     * @param values The dimensions.
324     * @return The names of the dimensions.
325     */
326    private static String[] extractNames(DimensionTuple[] values) {
327        String[] extractedNames = new String[values.length];
328
329        for (int i = 0; i < values.length; i++) {
330            extractedNames[i] = values[i].getName();
331        }
332
333        return extractedNames;
334    }
335
336    /**
337     * Extracts the names from the supplied Regressor domain in their canonical order.
338     * @param info The OutputInfo to use.
339     * @return The dimension names from this domain.
340     */
341    public static String[] extractNames(OutputInfo<Regressor> info) {
342        String[] extractedNames = new String[info.size()];
343        int i = 0;
344        for (Regressor r : info.getDomain()) {
345            extractedNames[i] = r.getNames()[0];
346            i++;
347        }
348        Arrays.sort(extractedNames);
349        return extractedNames;
350    }
351
352    /**
353     * Parses a string of the form:
354     * <pre>
355     * dimension-name=output,...,dimension-name=output
356     * </pre>
357     * where output must be readable by {@link Double#parseDouble}.
358     * @param s The string form of a multiple regressor.
359     * @return A regressor parsed from the input string.
360     */
361    public static Regressor parseString(String s) {
362        return parseString(s,',');
363    }
364
365    /**
366     * Parses a string of the form:
367     * <pre>
368     * dimension-name=output&lt;splitChar&gt;...&lt;splitChar&gt;dimension-name=output
369     * </pre>
370     * where output must be readable by {@link Double#parseDouble}.
371     * @param s The string form of a regressor.
372     * @param splitChar The char to split on.
373     * @return A regressor parsed from the input string.
374     */
375    public static Regressor parseString(String s, char splitChar) {
376        if (splitChar == '=') {
377            throw new IllegalArgumentException("Can't split on an equals symbol");
378        }
379        String[] tokens = s.split(""+splitChar);
380
381        String[] names = new String[tokens.length];
382        double[] values = new double[tokens.length];
383
384        Set<String> nameSet = new HashSet<>();
385
386        for (int i = 0; i < tokens.length; i++) {
387            Pair<String,Double> element = parseElement(i,tokens[i]);
388            names[i] = element.getA();
389            values[i] = element.getB();
390            nameSet.add(element.getA());
391        }
392
393        if (nameSet.size() != tokens.length) {
394            throw new IllegalArgumentException("Duplicated dimension names");
395        }
396
397        return new Regressor(names,values);
398    }
399
400    /**
401     * Parses a string of the form:
402     * <pre>
403     * dimension-name=output-double
404     * </pre>
405     * where the output must be readable by {@link Double#parseDouble}.
406     * @param idx The index of this string in a list.
407     * @param s The string form of a single dimension from a regressor.
408     * @return A tuple representing the dimension name and the value.
409     */
410    public static Pair<String,Double> parseElement(int idx, String s) {
411        String[] split = s.split("=");
412        if (split.length == 2) {
413            return new Pair<>(split[0], Double.parseDouble(split[1]));
414        } else if (split.length == 1) {
415            //No dimension name found.
416            return new Pair<>(DEFAULT_NAME + "-" + idx, Double.parseDouble(split[0]));
417        } else {
418            throw new IllegalArgumentException("Failed to parse element " + s);
419        }
420    }
421
422    /**
423     * Creates a Regressor from a list of dimension tuples.
424     * @param dimensions The dimensions to use.
425     * @return A Regressor representing these dimensions.
426     */
427    public static Regressor createFromPairList(List<Pair<String,Double>> dimensions) {
428        int numDimensions = dimensions.size();
429        String[] names = new String[numDimensions];
430        double[] values = new double[numDimensions];
431        for (int i = 0; i < numDimensions; i++) {
432            Pair<String,Double> p = dimensions.get(i);
433            names[i] = p.getA();
434            values[i] = p.getB();
435        }
436        return new Regressor(names,values);
437    }
438
439    /**
440     * A {@link Regressor} which contains a single dimension, used internally
441     * when the model implementation doesn't natively support multi-dimensional
442     * regression outputs.
443     */
444    public final static class DimensionTuple extends Regressor {
445        private static final long serialVersionUID = 1L;
446
447        private final String name;
448        private final double value;
449        private final double variance;
450
451        /**
452         * Creates a dimension tuple from the supplied name, value and variance.
453         * @param name The dimension name.
454         * @param value The dimension value.
455         * @param variance The variance of the value, if known. Otherwise {@link Double#NaN}.
456         */
457        public DimensionTuple(String name, double value, double variance) {
458            super(name,value,variance);
459            this.name = name;
460            this.value = value;
461            this.variance = variance;
462        }
463
464        /**
465         * Creates a dimension tuple from the supplied name and value.
466         * Sets the variance to {@link Double#NaN}.
467         * @param name The dimension name.
468         * @param value The dimension value.
469         */
470        public DimensionTuple(String name, double value) {
471            this(name,value,Double.NaN);
472        }
473
474        @Override
475        public int size() {
476            return 1;
477        }
478
479        @Override
480        public String toString() {
481            if (Double.isNaN(variance)) {
482                return name+"="+value;
483            } else {
484                return name+"=("+value+",var="+variance+")";
485            }
486        }
487
488        @Override
489        public Optional<DimensionTuple> getDimension(String name) {
490            if (this.name.equals(name)) {
491                return Optional.of(this);
492            } else {
493                return Optional.empty();
494            }
495        }
496
497        @Override
498        public Iterator<DimensionTuple> iterator() {
499            return Collections.singletonList(this).iterator();
500        }
501
502        @Override
503        public DimensionTuple copy() {
504            return new DimensionTuple(name,value,variance);
505        }
506
507        /**
508         * Returns the name.
509         * @return The name.
510         */
511        public String getName() {
512            return name;
513        }
514
515        /**
516         * Returns the value.
517         * @return The value.
518         */
519        public double getValue() {
520            return value;
521        }
522
523        /**
524         * Returns the variance.
525         * @return The variance.
526         */
527        public double getVariance() {
528            return variance;
529        }
530
531        @Override
532        public String getSerializableForm(boolean includeConfidence) {
533            String tmp = name + "=" + value;
534            if (includeConfidence && !Double.isNaN(variance)) {
535                return tmp + "\u00B1" + variance;
536            } else {
537                return tmp;
538            }
539        }
540
541        @Override
542        public boolean fullEquals(Regressor other) {
543            if (!equals(other)) {
544                return false;
545            } else {
546                // Now check values for equality
547                // Must have only one value
548                double otherValue = other.values[0];
549                double otherVar = other.variances[0];
550                if ((Math.abs(value-otherValue) > TOLERANCE) || (Double.isNaN(value) ^ Double.isNaN(otherValue))) {
551                    return false;
552                }
553                return (!(Math.abs(variance - otherVar) > TOLERANCE)) && (Double.isNaN(variance) == Double.isNaN(otherVar));
554            }
555        }
556
557        @Override
558        public boolean equals(Object o) {
559            if (this == o) {
560                return true;
561            } else if (o instanceof DimensionTuple) {
562                return name.equals(((DimensionTuple)o).name);
563            } else if (o instanceof Regressor) {
564                Regressor other = (Regressor) o;
565                return other.size() == 1 && other.getNames()[0].equals(name);
566            } else {
567                return false;
568            }
569        }
570
571        /**
572         * All regressors have a hashcode based on only the dimension names.
573         * @return A hashcode.
574         */
575        @Override
576        public int hashCode() {
577            return 11 ^ name.hashCode();
578        }
579
580        @Override
581        public String getDimensionNamesString() {
582            return name;
583        }
584    }
585
586    private class RegressorIterator implements Iterator<DimensionTuple> {
587        private int i = 0;
588
589        @Override
590        public boolean hasNext() {
591            return i < names.length;
592        }
593
594        @Override
595        public DimensionTuple next() {
596            DimensionTuple r = new DimensionTuple(names[i],values[i],variances[i]);
597            i++;
598            return r;
599        }
600    }
601}