/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.json;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.PrimitiveProvenance;
import com.oracle.labs.mlrg.olcut.provenance.Provenance;
import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil;
import com.oracle.labs.mlrg.olcut.provenance.impl.SkeletalConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.primitives.DateTimeProvenance;
import com.oracle.labs.mlrg.olcut.provenance.primitives.HashProvenance;
import com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance;
import org.tribuo.DataSource;
import org.tribuo.Example;
import org.tribuo.Output;
import org.tribuo.data.columnar.ColumnarDataSource;
import org.tribuo.data.columnar.ColumnarIterator;
import org.tribuo.data.columnar.FieldProcessor;
import org.tribuo.data.columnar.RowProcessor;
import org.tribuo.provenance.ConfiguredDataSourceProvenance;

import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.Instant;
import java.time.OffsetDateTime;
import java.time.ZoneId;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.logging.Logger;

/**
 * A {@link DataSource} for loading data from a JSON text file
 * and applying {@link FieldProcessor}s to it.
 *
 * @param <T> The output type produced by this data source.
 */
public class JsonDataSource<T extends Output<T>> extends ColumnarDataSource<T> {
    // Registered under this class's own name (previously it borrowed JsonFileIterator's).
    private static final Logger logger = Logger.getLogger(JsonDataSource.class.getName());

    // URI form of the data file; this is what is handed to the row iterator.
    private URI dataFile;

    @Config(mandatory = true,description="Path to the json file.")
    private Path dataPath;

    // Built once, either by the constructors or by postConfig().
    private ConfiguredDataSourceProvenance provenance;

    /**
     * For OLCUT.
     */
    private JsonDataSource() {}

    /**
     * Creates a JsonDataSource using the specified RowProcessor to process the data.
     *
     * @param dataPath The Path to the data file.
     * @param rowProcessor The row processor which converts a row into an {@link Example}.
     * @param outputRequired Is the output required to exist in the data file.
     */
    public JsonDataSource(Path dataPath, RowProcessor<T> rowProcessor, boolean outputRequired) {
        this(dataPath.toUri(), dataPath, rowProcessor, outputRequired);
    }

    /**
     * Creates a JsonDataSource using the specified RowProcessor to process the data.
     * <p>
     * The URI is converted to a {@link Path} with {@link Paths#get(URI)}, so it must be
     * a URI that call accepts.
     *
     * @param dataFile A URI for the data file.
     * @param rowProcessor The row processor which converts a row into an {@link Example}.
     * @param outputRequired Is the output required to exist in the data file.
     */
    public JsonDataSource(URI dataFile, RowProcessor<T> rowProcessor, boolean outputRequired) {
        this(dataFile, Paths.get(dataFile), rowProcessor, outputRequired);
    }

    /**
     * Creates a JsonDataSource using the specified RowProcessor to process the data.
     * <p>
     * Both public constructors delegate here; the provenance is built last because
     * it reads {@code dataPath} from the partially constructed host.
     *
     * @param dataFile A URI for the data file.
     * @param dataPath The Path to the data file.
     * @param rowProcessor The row processor which converts a row into an {@link Example}.
     * @param outputRequired Is the output required to exist in the data file.
     */
    private JsonDataSource(URI dataFile, Path dataPath, RowProcessor<T> rowProcessor, boolean outputRequired) {
        super(rowProcessor.getResponseProcessor().getOutputFactory(), rowProcessor, outputRequired);
        this.dataFile = dataFile;
        this.dataPath = dataPath;
        this.provenance = new JsonDataSourceProvenance(this);
    }

    /**
     * Used by the OLCUT configuration system, and should not be called by external code.
     * <p>
     * Derives the URI from the configured {@code dataPath} and builds the provenance.
     */
    @Override
    public void postConfig() {
        this.dataFile = dataPath.toUri();
        this.provenance = new JsonDataSourceProvenance(this);
    }

    @Override
    public String toString() {
        return "JsonDataSource(file=" + dataFile + ",rowProcessor=" + rowProcessor.getDescription() + ")";
    }

    /**
     * Opens a fresh {@link JsonFileIterator} over the data file.
     *
     * @return An iterator over the rows of the data file.
     * @throws IllegalStateException If the iterator could not be created due to an {@link IOException}.
     */
    @Override
    public ColumnarIterator rowIterator() {
        try {
            return new JsonFileIterator(dataFile);
        } catch (IOException e) {
            throw new IllegalStateException("Failed to read data",e);
        }
    }

    @Override
    public ConfiguredDataSourceProvenance getProvenance() {
        return provenance;
    }

    /**
     * Provenance for {@link JsonDataSource}.
     */
    public static class JsonDataSourceProvenance extends SkeletalConfiguredObjectProvenance implements ConfiguredDataSourceProvenance {
        private static final long serialVersionUID = 1L;

        private final DateTimeProvenance fileModifiedTime;
        private final DateTimeProvenance dataSourceCreationTime;
        // Computed with DEFAULT_HASH_TYPE; the field name reflects the expected algorithm.
        private final HashProvenance sha256Hash;

        /**
         * Builds the provenance from a live data source, recording the data file's
         * last modified time, the current time, and a hash of the resource at the
         * host's data path.
         *
         * @param host The data source to describe.
         * @param <T> The output type of the host.
         */
        <T extends Output<T>> JsonDataSourceProvenance(JsonDataSource<T> host) {
            super(host,"DataSource");
            // File#lastModified returns 0L when the file is missing or unreadable,
            // which would be recorded here as the epoch.
            long modifiedMillis = host.dataPath.toFile().lastModified();
            OffsetDateTime modifiedTime = OffsetDateTime.ofInstant(Instant.ofEpochMilli(modifiedMillis), ZoneId.systemDefault());
            this.fileModifiedTime = new DateTimeProvenance(FILE_MODIFIED_TIME, modifiedTime);
            this.dataSourceCreationTime = new DateTimeProvenance(DATASOURCE_CREATION_TIME, OffsetDateTime.now());
            this.sha256Hash = new HashProvenance(DEFAULT_HASH_TYPE, RESOURCE_HASH, ProvenanceUtil.hashResource(DEFAULT_HASH_TYPE, host.dataPath));
        }

        /**
         * Rebuilds the provenance from a map of provenance entries.
         *
         * @param map The provenance entries, processed by {@link #extractProvenanceInfo(Map)}.
         */
        public JsonDataSourceProvenance(Map<String,Provenance> map) {
            this(extractProvenanceInfo(map));
        }

        /**
         * Rebuilds the provenance from already extracted information.
         *
         * @param info The extracted provenance information.
         */
        private JsonDataSourceProvenance(ExtractedInfo info) {
            super(info);
            Map<String,PrimitiveProvenance<?>> values = info.instanceValues;
            this.fileModifiedTime = (DateTimeProvenance) values.get(FILE_MODIFIED_TIME);
            this.dataSourceCreationTime = (DateTimeProvenance) values.get(DATASOURCE_CREATION_TIME);
            this.sha256Hash = (HashProvenance) values.get(RESOURCE_HASH);
        }

        /**
         * Separates a provenance map into the class name, the host short name, the
         * instance values and the configured parameters.
         * <p>
         * The supplied map is copied first, so the caller's map is never passed to
         * the extraction calls. The copy is handed on as the configured parameters
         * once the extraction calls have run.
         * NOTE(review): this presumes {@code checkAndExtractProvenance} removes the
         * entries it extracts from the copy - confirm against OLCUT.
         *
         * @param map The provenance entries.
         * @return The extracted information.
         */
        protected static ExtractedInfo extractProvenanceInfo(Map<String,Provenance> map) {
            final String provenanceName = JsonDataSourceProvenance.class.getSimpleName();
            Map<String,Provenance> configuredParameters = new HashMap<>(map);

            String className = ObjectProvenance.checkAndExtractProvenance(configuredParameters, CLASS_NAME, StringProvenance.class, provenanceName).getValue();
            String hostTypeStringName = ObjectProvenance.checkAndExtractProvenance(configuredParameters, HOST_SHORT_NAME, StringProvenance.class, provenanceName).getValue();

            Map<String,PrimitiveProvenance<?>> instanceParameters = new HashMap<>();
            instanceParameters.put(FILE_MODIFIED_TIME, ObjectProvenance.checkAndExtractProvenance(configuredParameters, FILE_MODIFIED_TIME, DateTimeProvenance.class, provenanceName));
            instanceParameters.put(DATASOURCE_CREATION_TIME, ObjectProvenance.checkAndExtractProvenance(configuredParameters, DATASOURCE_CREATION_TIME, DateTimeProvenance.class, provenanceName));
            instanceParameters.put(RESOURCE_HASH, ObjectProvenance.checkAndExtractProvenance(configuredParameters, RESOURCE_HASH, HashProvenance.class, provenanceName));

            return new ExtractedInfo(className, hostTypeStringName, configuredParameters, instanceParameters);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }
            if (!super.equals(o)) {
                return false;
            }
            JsonDataSourceProvenance other = (JsonDataSourceProvenance) o;
            return fileModifiedTime.equals(other.fileModifiedTime)
                    && dataSourceCreationTime.equals(other.dataSourceCreationTime)
                    && sha256Hash.equals(other.sha256Hash);
        }

        @Override
        public int hashCode() {
            return Objects.hash(super.hashCode(), fileModifiedTime, dataSourceCreationTime, sha256Hash);
        }

        /**
         * Returns the instance values of this provenance: whatever the superclass
         * reports plus the file modified time, the creation time and the resource hash.
         *
         * @return A fresh, mutable map of the instance values.
         */
        @Override
        public Map<String, PrimitiveProvenance<?>> getInstanceValues() {
            // Copy rather than mutate the superclass's map: nothing guarantees that
            // map is modifiable or that it is not shared between calls.
            Map<String,PrimitiveProvenance<?>> map = new HashMap<>(super.getInstanceValues());

            map.put(FILE_MODIFIED_TIME,fileModifiedTime);
            map.put(DATASOURCE_CREATION_TIME,dataSourceCreationTime);
            map.put(RESOURCE_HASH,sha256Hash);

            return map;
        }
    }
}