/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.data.columnar;

import com.oracle.labs.mlrg.olcut.util.IOSpliterator;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Consumer;
import java.util.logging.Logger;

/**
 * An abstract class for iterators that read data in to a columnar format, usually from a file of some kind. Subclasses
 * handle how to format data from that file.
 * <p>
 * Rows are produced lazily: {@link #hasNext()} fetches at most one row from {@link #getRow()} and holds it in
 * {@code currentRow} until {@link #next()} hands it out.
 * <p>
 * Note: the {@code fields} field must be set in the constructor of implementing classes.
 */
public abstract class ColumnarIterator extends IOSpliterator<ColumnarIterator.Row> implements Iterator<ColumnarIterator.Row> {
    private static final Logger logger = Logger.getLogger(ColumnarIterator.class.getName());

    /**
     * The field names for this data source. Must be assigned by the subclass constructor and must be immutable.
     */
    protected List<String> fields;

    /**
     * The row fetched by {@link #hasNext()} but not yet returned by {@link #next()}; empty when nothing is buffered.
     */
    protected Optional<Row> currentRow = Optional.empty();

    /**
     * A representation of a row of untyped data from a columnar data source. In addition to its row data it stores a
     * canonical field list and its index (from 0) in the original data source. It should be immutable once constructed.
     * It is the responsibility of the implementor of ColumnarIterator to ensure that the passed field list is immutable.
     */
    public static class Row {

        private final long index;
        private final List<String> fields;
        private final Map<String, String> rowData;

        /**
         * Constructs a row.
         * <p>
         * The field list is stored as passed. The row data is wrapped in an unmodifiable view rather than copied,
         * so the caller must not mutate the supplied map afterwards.
         * @param index The index of this row in the original data source.
         * @param fields The field list, which must be immutable.
         * @param rowData The row data; must not be null.
         */
        public Row(long index, List<String> fields, Map<String, String> rowData) {
            this.index = index;
            this.fields = fields;
            this.rowData = Collections.unmodifiableMap(rowData);
        }

        /**
         * The field list supplied at construction.
         * @return The field names.
         */
        public List<String> getFields() {
            return fields;
        }

        /**
         * The index of this row in the original data source.
         * @return The row index.
         */
        public long getIndex() {
            return index;
        }

        /**
         * An unmodifiable view of the row data supplied at construction.
         * @return The row data.
         */
        public Map<String, String> getRowData() {
            return rowData;
        }

        @Override
        public String toString() {
            return "Row(index=" + index + ", fields=" + fields.toString() + ", rowData=" + rowData.toString() + ")";
        }
    }


    /**
     * Constructs a ColumnarIterator wrapped around a buffering spliterator.
     * <p>
     * Note when using this constructor you must set the {@code fields} field to
     * the appropriate value after you've called super(). It must be immutable.
     */
    protected ColumnarIterator() {
        super();
    }

    /**
     * Constructs a ColumnarIterator wrapped around a buffering spliterator.
     * <p>
     * Note when using this constructor you must set the {@code fields} field to
     * the appropriate value after you've called super(). It must be immutable.
     * @param characteristics The spliterator characteristics.
     * @param batchsize The buffer size.
     * @param estimatedSize The estimated size of this iterator.
     */
    protected ColumnarIterator(int characteristics, int batchsize, long estimatedSize) {
        super(characteristics, batchsize, estimatedSize);
    }

    /**
     * The immutable list of field names.
     * @return The field names.
     */
    public List<String> getFields() {
        return fields;
    }

    /**
     * Checks whether another row is available, fetching one from {@link #getRow()} if none is currently buffered.
     * Repeated calls without an intervening {@link #next()} do not fetch further rows.
     * @return True if a row is buffered or could be fetched.
     */
    @Override
    public boolean hasNext() {
        if (currentRow.isPresent()) {
            return true;
        } else {
            currentRow = getRow();
            return currentRow.isPresent();
        }
    }

    /**
     * Returns the buffered row (fetching it first if necessary) and clears the buffer.
     * @return The next row.
     * @throws NoSuchElementException If there are no more rows.
     */
    @Override
    public Row next() {
        if (hasNext()) {
            Row r = currentRow.get();
            currentRow = Optional.empty();
            return r;
        } else {
            throw new NoSuchElementException();
        }
    }

    /**
     * Passes the next row, if there is one, to the supplied action.
     * @param action The consumer of the row; must not be null.
     * @return True if a row was consumed, false if there were no more rows.
     * @throws NullPointerException If {@code action} is null.
     */
    @Override
    public boolean tryAdvance(Consumer<? super Row> action) {
        // Validate before touching the data source: otherwise next() would consume a row
        // before the null dereference, losing it, or a null would pass silently at end of data.
        Objects.requireNonNull(action, "action");
        if (hasNext()) {
            action.accept(next());
            return true;
        } else {
            return false;
        }
    }

    /**
     * Passes every remaining row, in order, to the supplied action.
     * @param action The consumer of the rows; must not be null.
     * @throws NullPointerException If {@code action} is null.
     */
    @Override
    public void forEachRemaining(Consumer<? super Row> action) {
        // Validate up front for the same reason as tryAdvance: no row is consumed on a null action.
        Objects.requireNonNull(action, "action");
        while (hasNext()) {
            action.accept(next());
        }
    }

    /**
     * Returns the next row of data based on internal state stored by the implementor, or {@link Optional#empty()}
     * if there is no more data.
     * @return The next row of data, or {@link Optional#empty()} when the source is exhausted.
     */
    protected abstract Optional<Row> getRow();
}