001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.impl; 018 019import com.oracle.labs.mlrg.olcut.util.SortUtil; 020import org.tribuo.Example; 021import org.tribuo.Feature; 022import org.tribuo.FeatureMap; 023import org.tribuo.Output; 024import org.tribuo.VariableInfo; 025import org.tribuo.transform.Transformer; 026import org.tribuo.transform.TransformerMap; 027import org.tribuo.util.Merger; 028 029import java.util.ArrayList; 030import java.util.Arrays; 031import java.util.Collection; 032import java.util.HashMap; 033import java.util.HashSet; 034import java.util.Iterator; 035import java.util.List; 036import java.util.Map; 037import java.util.Objects; 038import java.util.PriorityQueue; 039import java.util.Set; 040import java.util.logging.Logger; 041 042/** 043 * An {@link Example} backed by two arrays, one of String and one of double. 044 */ 045public class ArrayExample<T extends Output<T>> extends Example<T> { 046 private static final long serialVersionUID = 1L; 047 048 private static final Logger logger = Logger.getLogger(ArrayExample.class.getName()); 049 050 public static final int DEFAULT_SIZE = 10; 051 052 protected String[] featureNames; 053 054 protected double[] featureValues; 055 056 protected int size = 0; 057 058 /** 059 * Constructs an example from an output and a weight, with an initial 060 * size for the feature arrays. 061 * @param output The output. 062 * @param weight The weight. 063 * @param initialSize The initial size of the feature arrays. 064 */ 065 public ArrayExample(T output, float weight, int initialSize) { 066 super(output,weight); 067 featureNames = new String[initialSize]; 068 featureValues = new double[initialSize]; 069 } 070 071 /** 072 * Constructs an example from an output, a weight and the metadata. 073 * @param output The output. 074 * @param weight The weight. 075 * @param metadata The metadata. 076 */ 077 public ArrayExample(T output, float weight, Map<String,Object> metadata) { 078 super(output,weight,metadata); 079 featureNames = new String[DEFAULT_SIZE]; 080 featureValues = new double[DEFAULT_SIZE]; 081 } 082 083 /** 084 * Constructs an example from an output and a weight. 085 * @param output The output. 086 * @param weight The example weight. 087 */ 088 public ArrayExample(T output, float weight) { 089 super(output,weight); 090 featureNames = new String[DEFAULT_SIZE]; 091 featureValues = new double[DEFAULT_SIZE]; 092 } 093 094 /** 095 * Constructs an example from an output and the metadata. 096 * @param output The output. 097 * @param metadata The metadata. 098 */ 099 public ArrayExample(T output, Map<String,Object> metadata) { 100 super(output,metadata); 101 featureNames = new String[DEFAULT_SIZE]; 102 featureValues = new double[DEFAULT_SIZE]; 103 } 104 105 /** 106 * Constructs an example from an output. 107 * @param output The output. 108 */ 109 public ArrayExample(T output) { 110 super(output); 111 featureNames = new String[DEFAULT_SIZE]; 112 featureValues = new double[DEFAULT_SIZE]; 113 } 114 115 /** 116 * Constructs an example from an output, an array of names and an array of values. 117 * This is currently the most efficient constructor. 118 * @param output The output. 119 * @param names The feature names. 120 * @param values The feature values. 121 */ 122 public ArrayExample(T output, String[] names, double[] values) { 123 super(output); 124 if (names.length != values.length) { 125 throw new IllegalArgumentException("names.length != values.length, names = " + names.length + ", values = " + values.length); 126 } 127 128 size = names.length; 129 featureNames = Arrays.copyOf(names,names.length); 130 featureValues = Arrays.copyOf(values,values.length); 131 132 sort(); 133 } 134 135 /** 136 * Constructs an example from an output and a list of features. 137 * @param output The output. 138 * @param features The list of features. 139 */ 140 public ArrayExample(T output, List<? extends Feature> features) { 141 super(output); 142 size = features.size(); 143 featureNames = new String[size]; 144 featureValues = new double[size]; 145 146 int i = 0; 147 for (Feature f : features) { 148 featureNames[i] = f.getName(); 149 featureValues[i] = f.getValue(); 150 i++; 151 } 152 153 sort(); 154 } 155 156 /** 157 * Copy constructor. 158 * @param other The example to copy. 159 */ 160 public ArrayExample(Example<T> other) { 161 super(other); 162 featureNames = new String[other.size()]; 163 featureValues = new double[other.size()]; 164 for (Feature f : other) { 165 featureNames[size] = f.getName(); 166 featureValues[size] = f.getValue(); 167 size++; 168 } 169 } 170 171 /** 172 * Clones an example's features, but uses the supplied output and weight. 173 * @param output The output to use. 174 * @param other The features to use. 175 * @param weight The weight to use. 176 * @param <U> The output type of the other example. 177 */ 178 public <U extends Output<U>> ArrayExample(T output, Example<U> other, float weight) { 179 super(output,weight); 180 featureNames = new String[other.size()]; 181 featureValues = new double[other.size()]; 182 for (Feature f : other) { 183 featureNames[size] = f.getName(); 184 featureValues[size] = f.getValue(); 185 size++; 186 } 187 } 188 189 /** 190 * Adds a single feature. 191 * @param name The name of the feature. 192 * @param value The value of the feature. 193 */ 194 public void add(String name, double value) { 195 if (size >= featureNames.length) { 196 growArray(); 197 } 198 // 199 // TODO: find the right insertion position, System.arraycopy 200 // everything up one and then write the new value. 201 featureNames[size] = name; 202 featureValues[size] = value; 203 size++; 204 sort(); 205 } 206 207 @Override 208 public void add(Feature feature) { 209 add(feature.getName(),feature.getValue()); 210 } 211 212 @Override 213 public void addAll(Collection<? extends Feature> features) { 214 if (size + features.size() >= featureNames.length) { 215 growArray(size+features.size()); 216 } 217 for (Feature f : features) { 218 featureNames[size] = f.getName(); 219 featureValues[size] = f.getValue(); 220 size++; 221 } 222 sort(); 223 } 224 225 /** 226 * Grows the backing arrays storing the names and values. 227 * @param minCapacity The new minimum capacity required. 228 */ 229 protected void growArray(int minCapacity) { 230 int newCapacity = newCapacity(minCapacity); 231 featureNames = Arrays.copyOf(featureNames,newCapacity); 232 featureValues = Arrays.copyOf(featureValues,newCapacity); 233 } 234 235 /** 236 * Grows the backing arrays by size+1. 237 */ 238 protected void growArray() { 239 growArray(size+1); 240 } 241 242 /** 243 * Returns a capacity at least as large as the given minimum capacity. 244 * Returns the current capacity increased by 50% if that suffices. 245 * Will not return a capacity greater than MAX_ARRAY_SIZE unless 246 * the given minimum capacity is greater than MAX_ARRAY_SIZE. 247 * 248 * @param minCapacity the desired minimum capacity 249 * @throws OutOfMemoryError if minCapacity is less than zero 250 * @return The new capacity. 251 */ 252 protected int newCapacity(int minCapacity) { 253 // overflow-conscious code 254 int oldCapacity = featureNames.length; 255 int newCapacity = oldCapacity + (oldCapacity >> 1); 256 if (newCapacity - minCapacity <= 0) { 257 if (minCapacity < 0) { 258 // overflow 259 throw new OutOfMemoryError(); 260 } 261 return minCapacity; 262 } 263 return newCapacity; 264 } 265 266 /** 267 * Sorts the feature list to maintain the lexicographic order invariant. 268 */ 269 @Override 270 protected void sort() { 271 int[] sortedIndices = SortUtil.argsort(featureNames,0,size,true); 272 273 String[] newNames = Arrays.copyOf(featureNames,size); 274 double[] newValues = Arrays.copyOf(featureValues,size); 275 for (int i = 0; i < sortedIndices.length; i++) { 276 featureNames[i] = newNames[sortedIndices[i]]; 277 featureValues[i] = newValues[sortedIndices[i]]; 278 } 279 } 280 281 /** 282 * Returns a copy of the feature values array at the specific size. 283 * @param newSize The new size. 284 * @return A copy of the feature values. 285 */ 286 public double[] copyValues(int newSize) { 287 return Arrays.copyOf(featureValues,newSize); 288 } 289 290 @Override 291 public int size() { 292 return size; 293 } 294 295 @Override 296 public void removeFeatures(List<Feature> featureList) { 297 Map<String,List<Integer>> map = new HashMap<>(); 298 for (int i = 0; i < featureNames.length; i++) { 299 List<Integer> list = map.computeIfAbsent(featureNames[i],(k) -> new ArrayList<>()); 300 list.add(i); 301 } 302 303 PriorityQueue<Integer> removeQueue = new PriorityQueue<>(); 304 for (Feature f : featureList) { 305 List<Integer> i = map.get(f.getName()); 306 if (i != null) { 307 // If we've found this feature remove it from the map to prevent double counting 308 map.remove(f.getName()); 309 removeQueue.addAll(i); 310 } 311 } 312 313 String[] newNames = new String[size-removeQueue.size()]; 314 double[] newValues = new double[size-removeQueue.size()]; 315 316 int source = 0; 317 int dest = 0; 318 while (!removeQueue.isEmpty()) { 319 int curRemoveIdx = removeQueue.poll(); 320 while (source < curRemoveIdx) { 321 newNames[dest] = featureNames[source]; 322 newValues[dest] = featureValues[source]; 323 source++; 324 dest++; 325 } 326 source++; 327 } 328 while (source < size) { 329 newNames[dest] = featureNames[source]; 330 newValues[dest] = featureValues[source]; 331 source++; 332 dest++; 333 } 334 featureNames = newNames; 335 featureValues = newValues; 336 size = featureNames.length; 337 } 338 339 @Override 340 public void reduceByName(Merger merger) { 341 if (size > 0) { 342 int[] sortedIndices = SortUtil.argsort(featureNames,0,size,true); 343 String[] newNames = new String[featureNames.length]; 344 double[] newValues = new double[featureNames.length]; 345 for (int i = 0; i < sortedIndices.length; i++) { 346 newNames[i] = featureNames[sortedIndices[i]]; 347 newValues[i] = featureValues[sortedIndices[i]]; 348 } 349 featureNames[0] = newNames[0]; 350 featureValues[0] = newValues[0]; 351 int dest = 0; 352 for (int i = 1; i < size; i++) { 353 while ((i < size) && newNames[i].equals(featureNames[dest])) { 354 featureValues[dest] = merger.merge(featureValues[dest], newValues[i]); 355 i++; 356 } 357 if (i < size) { 358 dest++; 359 featureNames[dest] = newNames[i]; 360 featureValues[dest] = newValues[i]; 361 } 362 } 363 size = dest + 1; 364 } else { 365 logger.finer("Reducing an example with no features."); 366 } 367 } 368 369 @Override 370 public boolean validateExample() { 371 if (size == 0) { 372 return false; 373 } else { 374 for (int i = 0; i < size; i++) { 375 if (Double.isNaN(featureValues[i])) { 376 return false; 377 } 378 } 379 Set<String> names = new HashSet<>(Arrays.asList(featureNames).subList(0, size)); 380 return names.size() == size; 381 } 382 } 383 384 @Override 385 public ArrayExample<T> copy() { 386 return new ArrayExample<>(this); 387 } 388 389 @Override 390 public Feature lookup(String i) { 391 int index = Arrays.binarySearch(featureNames,0,size,i); 392 if (index < 0) { 393 return null; 394 } else { 395 return new Feature(featureNames[index],featureValues[index]); 396 } 397 } 398 399 @Override 400 public void set(Feature feature) { 401 int index = Arrays.binarySearch(featureNames,0,size,feature.getName()); 402 if (index < 0) { 403 throw new IllegalArgumentException("Feature " + feature + " not found in example."); 404 } else { 405 featureValues[index] = feature.getValue(); 406 } 407 } 408 409 @Override 410 public void transform(TransformerMap transformerMap) { 411 for (Map.Entry<String,List<Transformer>> e : transformerMap.entrySet()) { 412 int index = Arrays.binarySearch(featureNames,0,size,e.getKey()); 413 if (index >= 0) { 414 double value = featureValues[index]; 415 for (Transformer t : e.getValue()) { 416 value = t.transform(value); 417 } 418 featureValues[index] = value; 419 } 420 } 421 } 422 423 @Override 424 public void densify(List<String> featureList) { 425 // Ensure we have enough space. 426 if (featureList.size() > featureNames.length) { 427 growArray(featureList.size()); 428 } 429 int insertedCount = 0; 430 int curPos = 0; 431 for (String curName : featureList) { 432 // If we've reached the end of our old feature set, just insert. 433 if (curPos == size) { 434 featureNames[size + insertedCount] = curName; 435 insertedCount++; 436 } else { 437 // Check to see if our insertion candidate is the same as the current feature name. 438 int comparison = curName.compareTo(featureNames[curPos]); 439 if (comparison < 0) { 440 // If it's earlier, insert it. 441 featureNames[size + insertedCount] = curName; 442 insertedCount++; 443 } else if (comparison == 0) { 444 // Otherwise just bump our pointer, we've already got this feature. 445 curPos++; 446 } 447 } 448 } 449 // Bump the size up by the number of inserted features. 450 size += insertedCount; 451 // Sort the features 452 sort(); 453 } 454 455 @Override 456 public Iterator<Feature> iterator() { 457 return new ArrayExampleIterator(); 458 } 459 460 @Override 461 public String toString() { 462 StringBuilder builder = new StringBuilder(); 463 464 builder.append("ArrayExample(numFeatures="); 465 builder.append(size); 466 builder.append(",output="); 467 builder.append(output); 468 builder.append(",weight="); 469 builder.append(weight); 470 if (metadata != null) { 471 builder.append(",metadata="); 472 builder.append(metadata.toString()); 473 } 474 builder.append(",features=["); 475 for(int i = 0; i < size; i++) { 476 builder.append('(').append(featureNames[i]).append(", ").append(featureValues[i]).append(')'); 477 if(i != 0) { 478 builder.append(", "); 479 } 480 } 481 //builder.append(features.toString()); 482 builder.append("])"); 483 484 return builder.toString(); 485 } 486 487 @Override 488 public boolean equals(Object o) { 489 if (this == o) return true; 490 if (!(o instanceof ArrayExample)) return false; 491 ArrayExample<?> that = (ArrayExample<?>) o; 492 if (Objects.equals(metadata,that.metadata) && output.getClass().equals(that.output.getClass())) { 493 @SuppressWarnings("unchecked") //guarded by a getClass. 494 boolean outputTest = output.fullEquals((T)that.output); 495 if(outputTest && size == that.size) { 496 //we do not use Arrays.equals here because these are "backing arrays" which could be different sizes 497 for(int i=0; i<size; i++) { 498 if(!this.featureNames[i].equals(that.featureNames[i])) return false; 499 if(this.featureValues[i] != that.featureValues[i]) return false; 500 } 501 return true; 502 } 503 return false; 504 } else { 505 return false; 506 } 507 } 508 509 @Override 510 public int hashCode() { 511 int result = Objects.hash(size); 512 result = 31 * result + output.hashCode(); 513 //we don't use Arrays.hashCode here because featureNames and featureValues 514 //are backing arrays and the length of each could be arbitrarily diverging 515 //from the member size. 516 for(int i=0; i<size; i++) { 517 result = 31 * result + featureNames[i].hashCode(); 518 result = 31 * result + Double.hashCode(featureValues[i]); 519 } 520 return result; 521 } 522 523 class ArrayExampleIterator implements Iterator<Feature> { 524 int pos = 0; 525 526 @Override 527 public boolean hasNext() { 528 return pos < size; 529 } 530 531 @Override 532 public Feature next() { 533 Feature f = new Feature(featureNames[pos],featureValues[pos]); 534 pos++; 535 return f; 536 } 537 } 538 539 @Override 540 public void canonicalize(FeatureMap featureMap) { 541 for(int i=0; i< featureNames.length; i++) { 542 VariableInfo vi = featureMap.get(featureNames[i]); 543 if(vi != null) { 544 featureNames[i] = vi.getName(); 545 } 546 } 547 } 548}