001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.impl; 018 019import java.util.ArrayList; 020import java.util.Arrays; 021import java.util.Collection; 022import java.util.Collections; 023import java.util.HashMap; 024import java.util.HashSet; 025import java.util.Iterator; 026import java.util.List; 027import java.util.Map; 028import java.util.Objects; 029import java.util.PriorityQueue; 030import java.util.Set; 031import java.util.logging.Logger; 032import java.util.stream.Collectors; 033 034import org.tribuo.Example; 035import org.tribuo.Feature; 036import org.tribuo.FeatureMap; 037import org.tribuo.Output; 038import org.tribuo.VariableInfo; 039import org.tribuo.transform.TransformerMap; 040import org.tribuo.util.Merger; 041 042import com.oracle.labs.mlrg.olcut.util.SortUtil; 043 044/** 045 * An {@link Example} backed by a single array of feature names. This 046 * implementation is modeled after {@link ArrayExample} but does not store 047 * feature values because it assumes only binary features - i.e., features with a 048 * feature value of 1.0. The following methods throw an 049 * {@link UnsupportedOperationException}: 050 * <ul> 051 * <li>{@link #densify(List)}</li> 052 * <li>{@link #densify(FeatureMap)}</li> 053 * <li>{@link #set(Feature)}</li> 054 * <li>{@link #transform(TransformerMap)}</li> 055 * </ul> 056 * 057 * @param <T> The output type. 058 */ 059public final class BinaryFeaturesExample<T extends Output<T>> extends Example<T> { 060 private static final long serialVersionUID = 1L; 061 062 private static final Logger logger = Logger.getLogger(BinaryFeaturesExample.class.getName()); 063 064 public static final int DEFAULT_SIZE = 10; 065 066 protected String[] featureNames; 067 068 protected int size = 0; 069 070 /** 071 * Constructs an example from an output and a weight, with an initial size for 072 * the feature arrays. 073 * 074 * @param output The output. 075 * @param weight The weight. 076 * @param initialSize The initial size of the feature arrays. 077 */ 078 public BinaryFeaturesExample(T output, float weight, int initialSize) { 079 super(output,weight); 080 featureNames = new String[initialSize]; 081 } 082 083 /** 084 * Constructs an example from an output, a weight and the metadata. 085 * 086 * @param output The output. 087 * @param weight The weight. 088 * @param metadata The metadata. 089 */ 090 public BinaryFeaturesExample(T output, float weight, Map<String,Object> metadata) { 091 super(output,weight,metadata); 092 featureNames = new String[DEFAULT_SIZE]; 093 } 094 095 /** 096 * Constructs an example from an output and a weight. 097 * 098 * @param output The output. 099 * @param weight The example weight. 100 */ 101 public BinaryFeaturesExample(T output, float weight) { 102 super(output,weight); 103 featureNames = new String[DEFAULT_SIZE]; 104 } 105 106 /** 107 * Constructs an example from an output and the metadata. 108 * 109 * @param output The output. 110 * @param metadata The metadata. 111 */ 112 public BinaryFeaturesExample(T output, Map<String,Object> metadata) { 113 super(output,metadata); 114 featureNames = new String[DEFAULT_SIZE]; 115 } 116 117 /** 118 * Constructs an example from an output. 119 * 120 * @param output The output. 121 */ 122 public BinaryFeaturesExample(T output) { 123 super(output); 124 featureNames = new String[DEFAULT_SIZE]; 125 } 126 127 /** 128 * Constructs an example from an output and an array of names. This is currently 129 * the most efficient constructor. 130 * 131 * @param output The output. 132 * @param names The feature names. 133 */ 134 public BinaryFeaturesExample(T output, String[] names) { 135 super(output); 136 size = names.length; 137 featureNames = Arrays.copyOf(names,names.length); 138 sort(); 139 } 140 141 /** 142 * Constructs an example from an output and a list of features. This constructor 143 * will throw an {@link IllegalArgumentException} if any of the features have a 144 * feature value that does not equal 1.0. 145 * 146 * @param output The output (e.g., label) of the example 147 * @param features The list of features. 148 */ 149 public BinaryFeaturesExample(T output, List<? extends Feature> features) { 150 super(output); 151 size = features.size(); 152 featureNames = new String[size]; 153 154 int i = 0; 155 for (Feature f : features) { 156 checkIsBinary(f); 157 featureNames[i] = f.getName(); 158 i++; 159 } 160 161 sort(); 162 } 163 164 /** 165 * Copy constructor. This constructor will throw an 166 * {@link IllegalArgumentException} if any of the features have a feature value 167 * that does not equal 1.0. 168 * 169 * @param other The example to copy. 170 */ 171 public BinaryFeaturesExample(Example<T> other) { 172 super(other); 173 featureNames = new String[other.size()]; 174 for (Feature f : other) { 175 checkIsBinary(f); 176 featureNames[size] = f.getName(); 177 size++; 178 } 179 } 180 181 /** 182 * Clones an example's features, but uses the supplied output and weight. 183 * @param output The output to use. 184 * @param other The features to use. 185 * @param weight The weight to use. 186 * @param <U> The output type of the other example. 187 */ 188 public <U extends Output<U>> BinaryFeaturesExample(T output, Example<U> other, float weight) { 189 super(output,weight); 190 featureNames = new String[other.size()]; 191 for (Feature f : other) { 192 checkIsBinary(f); 193 featureNames[size] = f.getName(); 194 size++; 195 } 196 } 197 198 /** 199 * Adds a single feature with a value of 1. 200 * @param name The name of the feature. 201 */ 202 public void add(String name) { 203 if (size >= featureNames.length) { 204 growArray(); 205 } 206 // 207 // TODO: find the right insertion position, System.arraycopy 208 // everything up one and then write the new value. 209 featureNames[size] = name; 210 size++; 211 sort(); 212 } 213 214 public static boolean isBinary(Feature feature) { 215 return feature.getValue() == 1.0; 216 } 217 218 public static void checkIsBinary(Feature feature) { 219 if(!isBinary(feature)) { 220 throw new IllegalArgumentException("non-binary features are not allowed in BinaryFeaturesExample: value="+feature.getValue()); 221 } 222 } 223 224 /** 225 * Adds a feature to this example. This method will throw an {@link IllegalArgumentException} if 226 * any of the features have a feature value that does not equal 1.0. 227 * 228 * @param feature The feature to add to this example. 229 */ 230 @Override 231 public void add(Feature feature) { 232 checkIsBinary(feature); 233 add(feature.getName()); 234 } 235 236 /** 237 * Adds a collection of features to this example. This method will throw an 238 * {@link IllegalArgumentException} if any of the features have a feature value 239 * that does not equal 1.0. 240 * 241 * @param features The features to add to this example. 242 */ 243 @Override 244 public void addAll(Collection<? extends Feature> features) { 245 if (size + features.size() >= featureNames.length) { 246 growArray(size+features.size()); 247 } 248 for (Feature f : features) { 249 checkIsBinary(f); 250 featureNames[size] = f.getName(); 251 size++; 252 } 253 sort(); 254 } 255 256 /** 257 * Grows the backing arrays storing the names. 258 * @param minCapacity The new minimum capacity required. 259 */ 260 protected void growArray(int minCapacity) { 261 int newCapacity = newCapacity(minCapacity); 262 featureNames = Arrays.copyOf(featureNames,newCapacity); 263 } 264 265 /** 266 * Grows the backing arrays by size+1. 267 */ 268 protected void growArray() { 269 growArray(size+1); 270 } 271 272 /** 273 * Returns a capacity at least as large as the given minimum capacity. 274 * Returns the current capacity increased by 50% if that suffices. 275 * Will not return a capacity greater than MAX_ARRAY_SIZE unless 276 * the given minimum capacity is greater than MAX_ARRAY_SIZE. 277 * 278 * @param minCapacity the desired minimum capacity 279 * @throws OutOfMemoryError if minCapacity is less than zero 280 * @return The new capacity. 281 */ 282 protected int newCapacity(int minCapacity) { 283 // overflow-conscious code 284 int oldCapacity = featureNames.length; 285 int newCapacity = oldCapacity + (oldCapacity >> 1); 286 if (newCapacity - minCapacity <= 0) { 287 if (minCapacity < 0) { 288 // overflow 289 throw new OutOfMemoryError(); 290 } 291 return minCapacity; 292 } 293 return newCapacity; 294 } 295 296 /** 297 * Sorts the feature list to maintain the lexicographic order invariant. 298 */ 299 @Override 300 protected void sort() { 301 int[] sortedIndices = SortUtil.argsort(featureNames,0,size,true); 302 303 String[] newNames = Arrays.copyOf(featureNames,size); 304 for (int i = 0; i < sortedIndices.length; i++) { 305 featureNames[i] = newNames[sortedIndices[i]]; 306 } 307 } 308 309 @Override 310 public int size() { 311 return size; 312 } 313 314 @Override 315 public void removeFeatures(List<Feature> featureList) { 316 Map<String,List<Integer>> map = new HashMap<>(); 317 for (int i = 0; i < featureNames.length; i++) { 318 List<Integer> list = map.computeIfAbsent(featureNames[i],(k) -> new ArrayList<>()); 319 list.add(i); 320 } 321 322 PriorityQueue<Integer> removeQueue = new PriorityQueue<>(); 323 for (Feature f : featureList) { 324 List<Integer> i = map.get(f.getName()); 325 if (i != null) { 326 // If we've found this feature remove it from the map to prevent double counting 327 map.remove(f.getName()); 328 removeQueue.addAll(i); 329 } 330 } 331 332 String[] newNames = new String[size-removeQueue.size()]; 333 334 int source = 0; 335 int dest = 0; 336 while (!removeQueue.isEmpty()) { 337 int curRemoveIdx = removeQueue.poll(); 338 while (source < curRemoveIdx) { 339 newNames[dest] = featureNames[source]; 340 source++; 341 dest++; 342 } 343 source++; 344 } 345 while (source < size) { 346 newNames[dest] = featureNames[source]; 347 source++; 348 dest++; 349 } 350 featureNames = newNames; 351 size = featureNames.length; 352 } 353 354 @Override 355 public boolean validateExample() { 356 if (size == 0) { 357 return false; 358 } else { 359 Set<String> names = new HashSet<>(Arrays.asList(featureNames).subList(0, size)); 360 return names.size() == size; 361 } 362 } 363 364 @Override 365 public BinaryFeaturesExample<T> copy() { 366 return new BinaryFeaturesExample<>(this); 367 } 368 369 @Override 370 public Feature lookup(String i) { 371 int index = Arrays.binarySearch(featureNames,0,size,i); 372 if (index < 0) { 373 return null; 374 } else { 375 return new Feature(featureNames[index], 1.0); 376 } 377 } 378 379 @Override 380 public void set(Feature feature) { 381 throw new UnsupportedOperationException(); 382 } 383 384 @Override 385 public void transform(TransformerMap transformerMap) { 386 throw new UnsupportedOperationException(); 387 } 388 389 /** 390 * @param merger this parameter is ignored because we do not store feature values in this class. So, the 'merging' can be thought of as a logical AND. 391 */ 392 @Override 393 public void reduceByName(Merger merger) { 394 if (size > 0) { 395 List<String> newNames = Arrays.stream(featureNames, 0, size).distinct().collect(Collectors.toList()); 396 Collections.sort(newNames); 397 featureNames = newNames.toArray(new String[newNames.size()]); 398 size = featureNames.length; 399 } else { 400 logger.finer("Reducing an example with no features."); 401 } 402 } 403 404 @Override 405 public void densify(List<String> featureList) { 406 throw new UnsupportedOperationException(); 407 } 408 409 @Override 410 protected void densify(FeatureMap fMap) { 411 throw new UnsupportedOperationException(); 412 } 413 414 @Override 415 public Iterator<Feature> iterator() { 416 return new BinaryFeaturesExampleIterator(); 417 } 418 419 @Override 420 public String toString() { 421 StringBuilder builder = new StringBuilder(); 422 423 builder.append("ArrayExample(numFeatures="); 424 builder.append(size); 425 builder.append(",output="); 426 builder.append(output); 427 builder.append(",weight="); 428 builder.append(weight); 429 if (metadata != null) { 430 builder.append(",metadata="); 431 builder.append(metadata.toString()); 432 } 433 builder.append(",features=["); 434 for(int i = 0; i < size; i++) { 435 builder.append('(').append(featureNames[i]).append(", ").append(1.0).append(')'); 436 if(i != 0) { 437 builder.append(", "); 438 } 439 } 440 //builder.append(features.toString()); 441 builder.append("])"); 442 443 return builder.toString(); 444 } 445 446 @Override 447 public boolean equals(Object o) { 448 if (this == o) return true; 449 if (!(o instanceof BinaryFeaturesExample)) return false; 450 BinaryFeaturesExample<?> that = (BinaryFeaturesExample<?>) o; 451 if (Objects.equals(metadata,that.metadata) && output.getClass().equals(that.output.getClass())) { 452 @SuppressWarnings("unchecked") //guarded by a getClass. 453 boolean outputTest = output.fullEquals((T)that.output); 454 if(outputTest && size == that.size) { 455 //we do not use Arrays.equals here because these are "backing arrays" which could be different sizes 456 for(int i=0; i<size; i++) { 457 if(!this.featureNames[i].equals(that.featureNames[i])) return false; 458 } 459 return true; 460 } 461 return false; 462 } else { 463 return false; 464 } 465 } 466 467 @Override 468 public int hashCode() { 469 int result = Objects.hash(size); 470 result = 31 * result + output.hashCode(); 471 //we don't use Arrays.hashCode here because featureNames 472 //is a backing array and its length could be arbitrarily diverging 473 //from the member size. 474 for(int i=0; i<size; i++) { 475 result = 31 * result + featureNames[i].hashCode(); 476 } 477 return result; 478 } 479 480 class BinaryFeaturesExampleIterator implements Iterator<Feature> { 481 int pos = 0; 482 483 @Override 484 public boolean hasNext() { 485 return pos < size; 486 } 487 488 @Override 489 public Feature next() { 490 Feature f = new Feature(featureNames[pos], 1.0); 491 pos++; 492 return f; 493 } 494 } 495 496 @Override 497 public void canonicalize(FeatureMap featureMap) { 498 for(int i=0; i< featureNames.length; i++) { 499 VariableInfo vi = featureMap.get(featureNames[i]); 500 if(vi != null) { 501 featureNames[i] = vi.getName(); 502 } 503 } 504 } 505}