/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.regression;

import com.oracle.labs.mlrg.olcut.util.Pair;
import com.oracle.labs.mlrg.olcut.util.SortUtil;
import org.tribuo.Output;
import org.tribuo.OutputInfo;
import org.tribuo.util.Util;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * An {@link Output} for n-dimensional real valued regression.
 * <p>
 * In addition to the regressed values, it may optionally contain
 * variances. Otherwise the variances are set to {@link Double#NaN}.
 * </p>
 * <p>
 * Within a {@link org.tribuo.DataSource} or {@link org.tribuo.Dataset}
 * each Regressor must contain the same set of named dimensions. The dimensions stored in a
 * Regressor are sorted by the natural ordering of their names (i.e., using the String comparator).
 * This allows the use of direct indexing into the elements.
 * </p>
 * <p>
 * Note {@link Regressor#fullEquals} compares the dimensions, the regressed values and the
 * variances. However, unlike a primitive {@code ==} comparison, two quantities which are both
 * set to the sentinel value of {@link Double#NaN} are considered equal, while a NaN is never
 * equal to a non-NaN quantity.
 * </p>
 */
public class Regressor implements Output<Regressor>, Iterable<Regressor.DimensionTuple> {
    private static final long serialVersionUID = 1L;

    /**
     * The absolute tolerance used by {@link #fullEquals} when comparing values and variances.
     */
    public static final double TOLERANCE = 1e-12;

    /**
     * The prefix used by {@link #parseElement} to name a dimension which has no explicit name.
     */
    public static final String DEFAULT_NAME = "DIM";

    private final String[] names;

    private final double[] values;

    private final double[] variances;

    // Lazily computed hash code, guarded by the synchronized hashCode() method.
    private boolean hashCache = false;

    private int hashCode;

    /**
     * Constructs a regressor from the supplied named values. Throws {@link IllegalArgumentException}
     * if the arrays are not all the same size, or if the names are not unique.
     * <p>
     * The inputs are copied into fresh arrays, reordered so the names are sorted.
     * @param names The names of the dimensions.
     * @param values The values of the dimensions.
     * @param variances The variances of the specified values.
     */
    public Regressor(String[] names, double[] values, double[] variances) {
        if ((names.length != values.length) || (names.length != variances.length)) {
            throw new IllegalArgumentException("Arrays must be the same length, names.length="+names.length+", values.length="+values.length+",variances.length="+variances.length);
        }
        // Permutation which puts the names into their sorted order.
        int[] indices = SortUtil.argsort(names,true);
        this.names = new String[names.length];
        this.values = new double[values.length];
        this.variances = new double[variances.length];
        for (int i = 0; i < indices.length; i++) {
            this.names[i] = names[indices[i]];
            this.values[i] = values[indices[i]];
            this.variances[i] = variances[indices[i]];
        }
        Set<String> nameSet = new HashSet<>(Arrays.asList(this.names));
        if (nameSet.size() != this.names.length) {
            throw new IllegalArgumentException("Names must all be unique, found " + (this.names.length - nameSet.size()) + " duplicates");
        }
    }

    /**
     * Constructs a regressor from the supplied named values. Uses {@link Double#NaN} as
     * the variances.
     * @param names The names of the dimensions.
     * @param values The values of the dimensions.
     */
    public Regressor(String[] names, double[] values) {
        this(names, values, Util.generateUniformVector(values.length,Double.NaN));
    }

    /**
     * Constructs a regressor from the supplied dimension tuples.
     * Throws {@link IllegalArgumentException} if the dimension names are not unique.
     * @param dimensions The named values to use.
     */
    public Regressor(DimensionTuple[] dimensions) {
        // Permutation which puts the dimension names into their sorted order.
        int[] indices = SortUtil.argsort(extractNames(dimensions),true);
        this.names = new String[dimensions.length];
        this.values = new double[names.length];
        this.variances = new double[names.length];
        for (int i = 0; i < dimensions.length; i++) {
            DimensionTuple cur = dimensions[indices[i]];
            names[i] = cur.getName();
            values[i] = cur.getValue();
            variances[i] = cur.getVariance();
        }
        Set<String> nameSet = new HashSet<>(Arrays.asList(this.names));
        if (nameSet.size() != this.names.length) {
            throw new IllegalArgumentException("Names must be unique, found " + (this.names.length - nameSet.size()) + " duplicates");
        }
    }

    /**
     * Constructs a regressor containing a single dimension, using
     * {@link Double#NaN} as the variance.
     * @param name The name of the dimension.
     * @param value The value of the dimension.
     */
    public Regressor(String name, double value) {
        this(name,value,Double.NaN);
    }

    /**
     * Constructs a regressor containing a single dimension.
     * @param name The name of the dimension.
     * @param value The value of the dimension.
     * @param variance The variance of this value.
     */
    public Regressor(String name, double value, double variance) {
        this.names = new String[]{name};
        this.values = new double[]{value};
        this.variances = new double[]{variance};
    }

    /**
     * Returns the number of dimensions in this regressor.
     * @return The number of dimensions.
     */
    public int size() {
        return names.length;
    }

    /**
     * The names of the dimensions. Always sorted by their natural ordering.
     * <p>
     * The returned array is the internal array, not a copy.
     * @return The names of the dimensions.
     */
    public String[] getNames() {
        return names;
    }

    /**
     * Returns the regression values.
     * <p>
     * The returned array is the internal array, not a copy.
     * @return The regression values.
     */
    public double[] getValues() {
        return values;
    }

    /**
     * The variances of the regressed values, if known.
     * <p>
     * Each element is {@link Double#NaN} otherwise.
     * The returned array is the internal array, not a copy.
     * @return The variance of the regressed values.
     */
    public double[] getVariances() {
        return variances;
    }

    /**
     * Formats each dimension as {@code (name,value)} or {@code (name,value,var=variance)},
     * separated by commas. A regressor with no dimensions produces the empty string.
     * @return A human readable description of this regressor.
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder();

        for (int i = 0; i < names.length; i++) {
            builder.append('(');
            builder.append(names[i]);
            builder.append(',');
            builder.append(values[i]);
            if (!Double.isNaN(variances[i])) {
                builder.append(",var=");
                builder.append(variances[i]);
            }
            builder.append("),");
        }

        // Strip the trailing comma; the guard avoids an exception when there are no dimensions.
        if (builder.length() > 0) {
            builder.deleteCharAt(builder.length()-1);
        }

        return builder.toString();
    }

    /**
     * Returns a dimension tuple for the requested dimension, or optional empty if
     * it's not valid.
     * @param name The dimension name.
     * @return A tuple representing that dimension.
     */
    public Optional<DimensionTuple> getDimension(String name) {
        for (int i = 0; i < names.length; i++) {
            if (names[i].equals(name)) {
                return Optional.of(new DimensionTuple(name, values[i], variances[i]));
            }
        }
        return Optional.empty();
    }

    /**
     * Iterates over the dimensions in name order, creating a new {@link DimensionTuple} for each one.
     * @return An iterator over the dimensions.
     */
    @Override
    public Iterator<DimensionTuple> iterator() {
        return new RegressorIterator();
    }

    /**
     * Creates a new Regressor with copies of this regressor's values and variances.
     * @return A copy of this regressor.
     */
    @Override
    public Regressor copy() {
        return new Regressor(names,Arrays.copyOf(values,values.length),Arrays.copyOf(variances,variances.length));
    }

    /**
     * Formats each dimension as {@code name=value}, separated by commas. If
     * {@code includeConfidence} is true and the variance is not {@link Double#NaN}
     * then a plus-minus sign and the variance are appended to that dimension.
     * A regressor with no dimensions produces the empty string.
     * @param includeConfidence Include the variances in the output.
     * @return The serializable form of this regressor.
     */
    @Override
    public String getSerializableForm(boolean includeConfidence) {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < names.length; i++) {
            builder.append(names[i]);
            builder.append('=');
            builder.append(values[i]);
            if (includeConfidence && !Double.isNaN(variances[i])) {
                builder.append('\u00B1');
                builder.append(variances[i]);
            }
            builder.append(',');
        }
        // Strip the trailing comma; the guard avoids an exception when there are no dimensions.
        if (builder.length() > 0) {
            builder.deleteCharAt(builder.length()-1);
        }
        return builder.toString();
    }

    /**
     * Compares the dimension names, the values and the variances.
     * <p>
     * Values and variances are compared using an absolute tolerance of {@link #TOLERANCE}.
     * Two NaNs are considered equal, a NaN and a non-NaN are not.
     * @param other The regressor to compare against.
     * @return True if the names match and all values and variances are within tolerance.
     */
    @Override
    public boolean fullEquals(Regressor other) {
        if (!Arrays.equals(names,other.names)) {
            return false;
        }
        // Equal name arrays imply equal lengths, so the indexing below is in bounds.
        for (int i = 0; i < values.length; i++) {
            if (differs(values[i],other.values[i]) || differs(variances[i],other.variances[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests if two doubles differ by more than {@link #TOLERANCE}, or if exactly one of them is NaN.
     * <p>
     * The explicit NaN test is needed as any comparison involving NaN is false, so the
     * tolerance check alone would treat a NaN as equal to everything.
     * @param first The first number.
     * @param second The second number.
     * @return True if the numbers should be considered different.
     */
    private static boolean differs(double first, double second) {
        return (Math.abs(first - second) > TOLERANCE) || (Double.isNaN(first) ^ Double.isNaN(second));
    }

    /**
     * Regressors are equal if they have the same number of dimensions and equal dimension names.
     *
     * @param o An object.
     * @return True if Object is a Regressor with the same dimension names, false otherwise.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        } else if (o instanceof Regressor) {
            return Arrays.equals(names,((Regressor)o).names);
        } else {
            return false;
        }
    }

    /**
     * Regressor's hashcode is based on the hash of the dimension names.
     * <p>
     * It's cached on first access.
     * @return A hashcode.
     */
    @Override
    public synchronized int hashCode() {
        if (!hashCache) {
            hashCode = 11;
            for (int i = 0; i < names.length; i++) {
                hashCode ^= names[i].hashCode();
            }
            hashCache = true;
        }
        return hashCode;
    }

    /**
     * Returns a comma separated list of the dimension names.
     * @return The dimension names comma separated.
     */
    public String getDimensionNamesString() {
        return getDimensionNamesString(',');
    }

    /**
     * Returns a delimiter separated list of the dimension names.
     * @param separator The separator to use.
     * @return The dimension names.
     */
    public String getDimensionNamesString(char separator) {
        return String.join(""+separator,names);
    }

    /**
     * Extracts a String array of each dimension name from an array of DimensionTuples.
     * @param values The dimensions.
     * @return The names of the dimensions.
     */
    private static String[] extractNames(DimensionTuple[] values) {
        String[] extractedNames = new String[values.length];

        for (int i = 0; i < values.length; i++) {
            extractedNames[i] = values[i].getName();
        }

        return extractedNames;
    }

    /**
     * Extracts the names from the supplied Regressor domain in their canonical order.
     * <p>
     * Takes the first name of each element of the domain, then sorts the names.
     * @param info The OutputInfo to use.
     * @return The dimension names from this domain.
     */
    public static String[] extractNames(OutputInfo<Regressor> info) {
        String[] extractedNames = new String[info.size()];
        int i = 0;
        for (Regressor r : info.getDomain()) {
            extractedNames[i] = r.getNames()[0];
            i++;
        }
        Arrays.sort(extractedNames);
        return extractedNames;
    }

    /**
     * Parses a string of the form:
     * <pre>
     * dimension-name=output,...,dimension-name=output
     * </pre>
     * where output must be readable by {@link Double#parseDouble}.
     * @param s The string form of a multiple regressor.
     * @return A regressor parsed from the input string.
     */
    public static Regressor parseString(String s) {
        return parseString(s,',');
    }

    /**
     * Parses a string of the form:
     * <pre>
     * dimension-name=output&lt;splitChar&gt;...&lt;splitChar&gt;dimension-name=output
     * </pre>
     * where output must be readable by {@link Double#parseDouble}.
     * <p>
     * The split character is treated literally, so characters which are special in
     * regular expressions (e.g. '|' or '.') may be used.
     * Throws {@link IllegalArgumentException} if the split character is '=' or if
     * a dimension name is duplicated.
     * @param s The string form of a regressor.
     * @param splitChar The char to split on.
     * @return A regressor parsed from the input string.
     */
    public static Regressor parseString(String s, char splitChar) {
        if (splitChar == '=') {
            throw new IllegalArgumentException("Can't split on an equals symbol");
        }
        // String.split accepts a regex, so quote the character to stop it being interpreted as one.
        String[] tokens = s.split(Pattern.quote(String.valueOf(splitChar)));

        String[] names = new String[tokens.length];
        double[] values = new double[tokens.length];

        Set<String> nameSet = new HashSet<>();

        for (int i = 0; i < tokens.length; i++) {
            Pair<String,Double> element = parseElement(i,tokens[i]);
            names[i] = element.getA();
            values[i] = element.getB();
            nameSet.add(element.getA());
        }

        if (nameSet.size() != tokens.length) {
            throw new IllegalArgumentException("Duplicated dimension names");
        }

        return new Regressor(names,values);
    }

    /**
     * Parses a string of the form:
     * <pre>
     * dimension-name=output-double
     * </pre>
     * where the output must be readable by {@link Double#parseDouble}.
     * <p>
     * If there is no '=' then the whole string is parsed as the value and the
     * dimension is named {@link #DEFAULT_NAME}-idx.
     * Throws {@link IllegalArgumentException} if the string splits into more than two parts.
     * @param idx The index of this string in a list.
     * @param s The string form of a single dimension from a regressor.
     * @return A tuple representing the dimension name and the value.
     */
    public static Pair<String,Double> parseElement(int idx, String s) {
        String[] split = s.split("=");
        if (split.length == 2) {
            return new Pair<>(split[0], Double.parseDouble(split[1]));
        } else if (split.length == 1) {
            //No dimension name found.
            return new Pair<>(DEFAULT_NAME + "-" + idx, Double.parseDouble(split[0]));
        } else {
            throw new IllegalArgumentException("Failed to parse element " + s);
        }
    }

    /**
     * Creates a Regressor from a list of dimension tuples.
     * The variances are set to {@link Double#NaN}.
     * @param dimensions The dimensions to use.
     * @return A Regressor representing these dimensions.
     */
    public static Regressor createFromPairList(List<Pair<String,Double>> dimensions) {
        int numDimensions = dimensions.size();
        String[] names = new String[numDimensions];
        double[] values = new double[numDimensions];
        for (int i = 0; i < numDimensions; i++) {
            Pair<String,Double> p = dimensions.get(i);
            names[i] = p.getA();
            values[i] = p.getB();
        }
        return new Regressor(names,values);
    }

    /**
     * A {@link Regressor} which contains a single dimension, used internally
     * when the model implementation doesn't natively support multi-dimensional
     * regression outputs.
     */
    public final static class DimensionTuple extends Regressor {
        private static final long serialVersionUID = 1L;

        private final String name;
        private final double value;
        private final double variance;

        /**
         * Creates a dimension tuple from the supplied name, value and variance.
         * @param name The dimension name.
         * @param value The dimension value.
         * @param variance The variance of the value, if known. Otherwise {@link Double#NaN}.
         */
        public DimensionTuple(String name, double value, double variance) {
            super(name,value,variance);
            this.name = name;
            this.value = value;
            this.variance = variance;
        }

        /**
         * Creates a dimension tuple from the supplied name and value.
         * Sets the variance to {@link Double#NaN}.
         * @param name The dimension name.
         * @param value The dimension value.
         */
        public DimensionTuple(String name, double value) {
            this(name,value,Double.NaN);
        }

        /**
         * A dimension tuple always has exactly one dimension.
         * @return 1.
         */
        @Override
        public int size() {
            return 1;
        }

        /**
         * Formats this tuple as {@code name=value} or {@code name=(value,var=variance)}.
         * @return A human readable description of this dimension.
         */
        @Override
        public String toString() {
            if (Double.isNaN(variance)) {
                return name+"="+value;
            } else {
                return name+"=("+value+",var="+variance+")";
            }
        }

        /**
         * Returns this tuple if the name matches, otherwise returns optional empty.
         * @param name The dimension name.
         * @return This tuple, or empty.
         */
        @Override
        public Optional<DimensionTuple> getDimension(String name) {
            if (this.name.equals(name)) {
                return Optional.of(this);
            } else {
                return Optional.empty();
            }
        }

        /**
         * Returns an iterator which contains only this tuple.
         * @return A single element iterator.
         */
        @Override
        public Iterator<DimensionTuple> iterator() {
            return Collections.singletonList(this).iterator();
        }

        /**
         * Creates a new tuple with the same name, value and variance.
         * @return A copy of this tuple.
         */
        @Override
        public DimensionTuple copy() {
            return new DimensionTuple(name,value,variance);
        }

        /**
         * Returns the name.
         * @return The name.
         */
        public String getName() {
            return name;
        }

        /**
         * Returns the value.
         * @return The value.
         */
        public double getValue() {
            return value;
        }

        /**
         * Returns the variance.
         * @return The variance.
         */
        public double getVariance() {
            return variance;
        }

        /**
         * Formats this tuple as {@code name=value}. If {@code includeConfidence} is true and
         * the variance is not {@link Double#NaN} then a plus-minus sign and the variance are appended.
         * @param includeConfidence Include the variance in the output.
         * @return The serializable form of this tuple.
         */
        @Override
        public String getSerializableForm(boolean includeConfidence) {
            String tmp = name + "=" + value;
            if (includeConfidence && !Double.isNaN(variance)) {
                return tmp + "\u00B1" + variance;
            } else {
                return tmp;
            }
        }

        /**
         * Compares the name, value and variance against the other regressor, which
         * must contain a single dimension to be considered equal.
         * <p>
         * The value and variance are compared using an absolute tolerance of {@link #TOLERANCE}.
         * Two NaNs are considered equal, a NaN and a non-NaN are not.
         * @param other The regressor to compare against.
         * @return True if the name matches and the value and variance are within tolerance.
         */
        @Override
        public boolean fullEquals(Regressor other) {
            if (!equals(other)) {
                return false;
            }
            // equals guarantees other has exactly one dimension, so index 0 is valid.
            return !Regressor.differs(value,other.values[0]) && !Regressor.differs(variance,other.variances[0]);
        }

        /**
         * A tuple is equal to another tuple with the same name, or to a
         * single dimensional Regressor whose only dimension has the same name.
         * @param o An object.
         * @return True if the object is a single dimensional regressor with the same name.
         */
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            } else if (o instanceof DimensionTuple) {
                return name.equals(((DimensionTuple)o).name);
            } else if (o instanceof Regressor) {
                Regressor other = (Regressor) o;
                return other.size() == 1 && other.getNames()[0].equals(name);
            } else {
                return false;
            }
        }

        /**
         * All regressors have a hashcode based on only the dimension names.
         * @return A hashcode.
         */
        @Override
        public int hashCode() {
            return 11 ^ name.hashCode();
        }

        /**
         * Returns the name of this dimension.
         * @return The dimension name.
         */
        @Override
        public String getDimensionNamesString() {
            return name;
        }
    }

    /**
     * Iterates over the enclosing regressor's dimensions in index order,
     * creating a new {@link DimensionTuple} for each one.
     */
    private class RegressorIterator implements Iterator<DimensionTuple> {
        private int i = 0;

        @Override
        public boolean hasNext() {
            return i < names.length;
        }

        @Override
        public DimensionTuple next() {
            // The Iterator contract requires NoSuchElementException when exhausted.
            if (!hasNext()) {
                throw new NoSuchElementException("No more dimensions, size = " + names.length);
            }
            DimensionTuple r = new DimensionTuple(names[i],values[i],variances[i]);
            i++;
            return r;
        }
    }
}