001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.json;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.PrimitiveProvenance;
022import com.oracle.labs.mlrg.olcut.provenance.Provenance;
023import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil;
024import com.oracle.labs.mlrg.olcut.provenance.impl.SkeletalConfiguredObjectProvenance;
025import com.oracle.labs.mlrg.olcut.provenance.primitives.DateTimeProvenance;
026import com.oracle.labs.mlrg.olcut.provenance.primitives.HashProvenance;
027import com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance;
028import org.tribuo.DataSource;
029import org.tribuo.Example;
030import org.tribuo.Output;
031import org.tribuo.data.columnar.ColumnarDataSource;
032import org.tribuo.data.columnar.ColumnarIterator;
033import org.tribuo.data.columnar.FieldProcessor;
034import org.tribuo.data.columnar.RowProcessor;
035import org.tribuo.provenance.ConfiguredDataSourceProvenance;
036
037import java.io.IOException;
038import java.net.URI;
039import java.nio.file.Path;
040import java.nio.file.Paths;
041import java.time.Instant;
042import java.time.OffsetDateTime;
043import java.time.ZoneId;
044import java.util.HashMap;
045import java.util.Map;
046import java.util.Objects;
047import java.util.logging.Logger;
048
049/**
050 * A {@link DataSource} for loading data from a JSON text file
051 * and applying {@link FieldProcessor}s to it.
052 */
053public class JsonDataSource<T extends Output<T>> extends ColumnarDataSource<T> {
054    private static final Logger logger = Logger.getLogger(JsonFileIterator.class.getName());
055
056    private URI dataFile;
057
058    @Config(mandatory = true,description="Path to the json file.")
059    private Path dataPath;
060
061    private ConfiguredDataSourceProvenance provenance;
062
063    /**
064     * For OLCUT.
065     */
066    private JsonDataSource() {}
067
068    /**
069     * Creates a JsonDataSource using the specified RowProcessor to process the data.
070     *
071     * @param dataPath The Path to the data file.
072     * @param rowProcessor The row processor which converts a row into an {@link Example}.
073     * @param outputRequired Is the output required to exist in the data file.
074     */
075    public JsonDataSource(Path dataPath, RowProcessor<T> rowProcessor, boolean outputRequired) {
076        this(dataPath.toUri(),dataPath,rowProcessor,outputRequired);
077    }
078
079    /**
080     * Creates a JsonDataSource using the specified RowProcessor to process the data.
081     *
082     * @param dataFile A URI for the data file.
083     * @param rowProcessor The row processor which converts a row into an {@link Example}.
084     * @param outputRequired Is the output required to exist in the data file.
085     */
086    public JsonDataSource(URI dataFile, RowProcessor<T> rowProcessor, boolean outputRequired) {
087        this(dataFile,Paths.get(dataFile),rowProcessor,outputRequired);
088    }
089
090    /**
091     * Creates a JsonDataSource using the specified RowProcessor to process the data.
092     * @param dataFile A URI for the data file.
093     * @param rowProcessor The row processor which converts a row into an {@link Example}.
094     * @param outputRequired Is the output required to exist in the data file.
095     */
096    private JsonDataSource(URI dataFile, Path dataPath, RowProcessor<T> rowProcessor, boolean outputRequired) {
097        super(rowProcessor.getResponseProcessor().getOutputFactory(), rowProcessor, outputRequired);
098        this.dataPath = dataPath;
099        this.dataFile = dataFile;
100        this.provenance = new JsonDataSourceProvenance(this);
101    }
102
103    /**
104     * Used by the OLCUT configuration system, and should not be called by external code.
105     */
106    @Override
107    public void postConfig() {
108        this.dataFile = dataPath.toUri();
109        this.provenance = new JsonDataSourceProvenance(this);
110    }
111
112    @Override
113    public String toString() {
114        return "JsonDataSource(file=" + dataFile + ",rowProcessor="+rowProcessor.getDescription()+")";
115    }
116
117    @Override
118    public ColumnarIterator rowIterator() {
119        try {
120            return new JsonFileIterator(dataFile);
121        } catch (IOException e) {
122            throw new IllegalStateException("Failed to read data",e);
123        }
124    }
125
126    @Override
127    public ConfiguredDataSourceProvenance getProvenance() {
128        return provenance;
129    }
130
131    /**
132     * Provenance for {@link JsonDataSource}.
133     */
134    public static class JsonDataSourceProvenance extends SkeletalConfiguredObjectProvenance implements ConfiguredDataSourceProvenance {
135        private static final long serialVersionUID = 1L;
136
137        private final DateTimeProvenance fileModifiedTime;
138        private final DateTimeProvenance dataSourceCreationTime;
139        private final HashProvenance sha256Hash;
140
141        <T extends Output<T>> JsonDataSourceProvenance(JsonDataSource<T> host) {
142            super(host,"DataSource");
143            this.fileModifiedTime = new DateTimeProvenance(FILE_MODIFIED_TIME,OffsetDateTime.ofInstant(Instant.ofEpochMilli(host.dataPath.toFile().lastModified()), ZoneId.systemDefault()));
144            this.dataSourceCreationTime = new DateTimeProvenance(DATASOURCE_CREATION_TIME,OffsetDateTime.now());
145            this.sha256Hash = new HashProvenance(DEFAULT_HASH_TYPE,RESOURCE_HASH,ProvenanceUtil.hashResource(DEFAULT_HASH_TYPE,host.dataPath));
146        }
147
148        public JsonDataSourceProvenance(Map<String,Provenance> map) {
149            this(extractProvenanceInfo(map));
150        }
151
152        private JsonDataSourceProvenance(ExtractedInfo info) {
153            super(info);
154            this.fileModifiedTime = (DateTimeProvenance) info.instanceValues.get(FILE_MODIFIED_TIME);
155            this.dataSourceCreationTime = (DateTimeProvenance) info.instanceValues.get(DATASOURCE_CREATION_TIME);
156            this.sha256Hash = (HashProvenance) info.instanceValues.get(RESOURCE_HASH);
157        }
158
159        protected static ExtractedInfo extractProvenanceInfo(Map<String,Provenance> map) {
160            Map<String,Provenance> configuredParameters = new HashMap<>(map);
161            String className = ObjectProvenance.checkAndExtractProvenance(configuredParameters,CLASS_NAME, StringProvenance.class, JsonDataSourceProvenance.class.getSimpleName()).getValue();
162            String hostTypeStringName = ObjectProvenance.checkAndExtractProvenance(configuredParameters,HOST_SHORT_NAME, StringProvenance.class, JsonDataSourceProvenance.class.getSimpleName()).getValue();
163
164            Map<String,PrimitiveProvenance<?>> instanceParameters = new HashMap<>();
165            instanceParameters.put(FILE_MODIFIED_TIME,ObjectProvenance.checkAndExtractProvenance(configuredParameters,FILE_MODIFIED_TIME,DateTimeProvenance.class, JsonDataSourceProvenance.class.getSimpleName()));
166            instanceParameters.put(DATASOURCE_CREATION_TIME,ObjectProvenance.checkAndExtractProvenance(configuredParameters,DATASOURCE_CREATION_TIME,DateTimeProvenance.class, JsonDataSourceProvenance.class.getSimpleName()));
167            instanceParameters.put(RESOURCE_HASH,ObjectProvenance.checkAndExtractProvenance(configuredParameters,RESOURCE_HASH,HashProvenance.class, JsonDataSourceProvenance.class.getSimpleName()));
168
169            return new ExtractedInfo(className,hostTypeStringName,configuredParameters,instanceParameters);
170        }
171
172        @Override
173        public boolean equals(Object o) {
174            if (this == o) return true;
175            if (o == null || getClass() != o.getClass()) return false;
176            if (!super.equals(o)) return false;
177            JsonDataSourceProvenance pairs = (JsonDataSourceProvenance) o;
178            return fileModifiedTime.equals(pairs.fileModifiedTime) &&
179                    dataSourceCreationTime.equals(pairs.dataSourceCreationTime) &&
180                    sha256Hash.equals(pairs.sha256Hash);
181        }
182
183        @Override
184        public int hashCode() {
185            return Objects.hash(super.hashCode(), fileModifiedTime, dataSourceCreationTime, sha256Hash);
186        }
187
188        @Override
189        public Map<String, PrimitiveProvenance<?>> getInstanceValues() {
190            Map<String,PrimitiveProvenance<?>> map = super.getInstanceValues();
191
192            map.put(FILE_MODIFIED_TIME,fileModifiedTime);
193            map.put(DATASOURCE_CREATION_TIME,dataSourceCreationTime);
194            map.put(RESOURCE_HASH,sha256Hash);
195
196            return map;
197        }
198    }
199}