001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.columnar;
018
import com.oracle.labs.mlrg.olcut.util.IOSpliterator;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Consumer;
import java.util.logging.Logger;
029
030/**
031 * An abstract class for iterators that read data in to a columnar format, usually from a file of some kind. Subclasses
032 * handle how to format data from that file.
033 * <p>
034 * Note: the {@code fields} field must be set in the constructor of implementing classes.
035 */
036public abstract class ColumnarIterator extends IOSpliterator<ColumnarIterator.Row> implements Iterator<ColumnarIterator.Row> {
037    private static final Logger logger = Logger.getLogger(ColumnarIterator.class.getName());
038
039    protected List<String> fields;
040    protected Optional<Row> currentRow = Optional.empty();
041
042    /**
043     * A representation of a row of untyped data from a columnar data source. In addition to its row data it stores a
044     * canonical field list and its index (from 0) in the original data source. It should be immutable once constructed.
045     * It is the responsibility of the implementor of ColumnarIterator to ensure that the passed field list is immutable.
046     */
047    public static class Row {
048
049        private final long index;
050        private final List<String> fields;
051        private final Map<String, String> rowData;
052
053        public Row(long index, List<String> fields, Map<String, String> rowData) {
054            this.index = index;
055            this.fields = fields;
056            this.rowData = Collections.unmodifiableMap(rowData);
057        }
058
059        public List<String> getFields() {
060            return fields;
061        }
062
063        public long getIndex() {
064            return index;
065        }
066
067        public Map<String, String> getRowData() {
068            return rowData;
069        }
070
071        @Override
072        public String toString() {
073            return "Row(index=" + index + ", fields=" + fields.toString() + ", rowData=" + rowData.toString() + ")";
074        }
075    }
076
077
078    /**
079     * Constructs a ColumnarIterator wrapped around a buffering spliterator.
080     * <p>
081     * Note when using this constructor you must set the {@code fields} field to
082     * the appropriate value after you've called super(). It must be immutable.
083     */
084    protected ColumnarIterator() {
085        super();
086    }
087
088    /**
089     * Constructs a ColumnarIterator wrapped around a buffering spliterator.
090     * <p>
091     * Note when using this constructor you must set the {@code fields} field to
092     * the appropriate value after you've called super(). It must be immutable.
093     * @param characteristics The spliterator characteristics.
094     * @param batchsize The buffer size.
095     * @param estimatedSize The estimated size of this iterator.
096     */
097    protected ColumnarIterator(int characteristics, int batchsize, long estimatedSize) {
098        super(characteristics, batchsize, estimatedSize);
099    }
100
101    /**
102     * The immutable list of field names.
103     * @return The field names.
104     */
105    public List<String> getFields() {
106        return fields;
107    }
108
109    @Override
110    public boolean hasNext() {
111        if (currentRow.isPresent()) {
112            return true;
113        } else {
114            currentRow = getRow();
115            return currentRow.isPresent();
116        }
117    }
118
119    @Override
120    public Row next() {
121        if (hasNext()) {
122            Row r = currentRow.get();
123            currentRow = Optional.empty();
124            return r;
125        } else {
126            throw new NoSuchElementException();
127        }
128    }
129
130    @Override
131    public boolean tryAdvance(Consumer<? super Row> action) {
132        if (hasNext()) {
133            action.accept(next());
134            return true;
135        } else {
136            return false;
137        }
138    }
139
140    @Override
141    public void forEachRemaining(Consumer<? super Row> action) {
142        while (hasNext()) {
143            action.accept(next());
144        }
145    }
146
147    /**
148     * Returns the next row of data based on internal state stored by the implementor, or {@link Optional#empty()}
149     * if there is no more data.
150     * @return The next row of data or None.
151     */
152    protected abstract Optional<Row> getRow();
153}