001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.data.text.impl; 018 019import com.oracle.labs.mlrg.olcut.config.Config; 020import com.oracle.labs.mlrg.olcut.provenance.ObjectProvenance; 021import com.oracle.labs.mlrg.olcut.provenance.PrimitiveProvenance; 022import com.oracle.labs.mlrg.olcut.provenance.Provenance; 023import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil; 024import com.oracle.labs.mlrg.olcut.provenance.impl.SkeletalConfiguredObjectProvenance; 025import com.oracle.labs.mlrg.olcut.provenance.primitives.DateTimeProvenance; 026import com.oracle.labs.mlrg.olcut.provenance.primitives.HashProvenance; 027import com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance; 028import org.tribuo.Example; 029import org.tribuo.Output; 030import org.tribuo.OutputFactory; 031import org.tribuo.data.text.TextFeatureExtractor; 032import org.tribuo.provenance.ConfiguredDataSourceProvenance; 033 034import java.nio.file.Paths; 035import java.time.OffsetDateTime; 036import java.util.HashMap; 037import java.util.List; 038import java.util.Map; 039import java.util.Objects; 040import java.util.Optional; 041import java.util.logging.Logger; 042 043/** 044 * A version of {@link SimpleTextDataSource} that accepts an {@link Iterable} of Strings. 045 * <p> 046 * Uses the parsing logic from {@link SimpleTextDataSource}. 047 */ 048public class SimpleStringDataSource<T extends Output<T>> extends SimpleTextDataSource<T> { 049 050 private static final Logger logger = Logger.getLogger(SimpleStringDataSource.class.getName()); 051 052 /** 053 * Used because OLCUT doesn't support generic Iterables. 054 */ 055 @Config(mandatory = true,description="The input data lines.") 056 protected List<String> rawLines; 057 058 /** 059 * For olcut. 060 */ 061 private SimpleStringDataSource() {} 062 063 public SimpleStringDataSource(List<String> rawLines, OutputFactory<T> outputFactory, TextFeatureExtractor<T> extractor) { 064 super(outputFactory, extractor); 065 this.rawLines = rawLines; 066 this.path = Paths.get(System.getProperty("user.dir")); 067 read(); 068 this.provenance = cacheProvenance(); 069 } 070 071 /** 072 * Used by the OLCUT configuration system, and should not be called by external code. 073 */ 074 @Override 075 public void postConfig() { 076 read(); 077 this.provenance = cacheProvenance(); 078 } 079 080 @Override 081 public String toString() { 082 StringBuilder buffer = new StringBuilder(); 083 buffer.append(this.getClass().getSimpleName()); 084 buffer.append("(extractor="); 085 buffer.append(extractor.toString()); 086 buffer.append(",preprocessors="); 087 buffer.append(preprocessors.toString()); 088 buffer.append(")"); 089 return buffer.toString(); 090 } 091 092 @Override 093 protected void read() { 094 int n = 0; 095 for (String line : rawLines) { 096 n++; 097 Optional<Example<T>> example = parseLine(line, n); 098 example.ifPresent(data::add); 099 } 100 } 101 102 @Override 103 protected ConfiguredDataSourceProvenance cacheProvenance() { 104 return new SimpleStringDataSourceProvenance(this); 105 } 106 107 /** 108 * Provenance for {@link SimpleStringDataSource}. 109 */ 110 public static class SimpleStringDataSourceProvenance extends SkeletalConfiguredObjectProvenance implements ConfiguredDataSourceProvenance { 111 private static final long serialVersionUID = 1L; 112 113 private final DateTimeProvenance dataSourceCreationTime; 114 private final HashProvenance sha256Hash; 115 116 <T extends Output<T>> SimpleStringDataSourceProvenance(SimpleStringDataSource<T> host) { 117 super(host,"DataSource"); 118 this.dataSourceCreationTime = new DateTimeProvenance(DATASOURCE_CREATION_TIME,OffsetDateTime.now()); 119 this.sha256Hash = new HashProvenance(DEFAULT_HASH_TYPE,RESOURCE_HASH, ProvenanceUtil.hashList(DEFAULT_HASH_TYPE,host.rawLines)); 120 } 121 122 public SimpleStringDataSourceProvenance(Map<String, Provenance> map) { 123 this(extractProvenanceInfo(map)); 124 } 125 126 private SimpleStringDataSourceProvenance(ExtractedInfo info) { 127 super(info); 128 this.dataSourceCreationTime = (DateTimeProvenance) info.instanceValues.get(DATASOURCE_CREATION_TIME); 129 this.sha256Hash = (HashProvenance) info.instanceValues.get(RESOURCE_HASH); 130 } 131 132 protected static ExtractedInfo extractProvenanceInfo(Map<String,Provenance> map) { 133 Map<String,Provenance> configuredParameters = new HashMap<>(map); 134 String className = ObjectProvenance.checkAndExtractProvenance(configuredParameters,CLASS_NAME, StringProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName()).getValue(); 135 String hostTypeStringName = ObjectProvenance.checkAndExtractProvenance(configuredParameters,HOST_SHORT_NAME, StringProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName()).getValue(); 136 137 Map<String, PrimitiveProvenance<?>> instanceParameters = new HashMap<>(); 138 instanceParameters.put(DATASOURCE_CREATION_TIME,ObjectProvenance.checkAndExtractProvenance(configuredParameters,DATASOURCE_CREATION_TIME,DateTimeProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName())); 139 instanceParameters.put(RESOURCE_HASH,ObjectProvenance.checkAndExtractProvenance(configuredParameters,RESOURCE_HASH,HashProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName())); 140 141 return new ExtractedInfo(className,hostTypeStringName,configuredParameters,instanceParameters); 142 } 143 144 @Override 145 public boolean equals(Object o) { 146 if (this == o) return true; 147 if (!(o instanceof SimpleStringDataSourceProvenance)) return false; 148 if (!super.equals(o)) return false; 149 SimpleStringDataSourceProvenance pairs = (SimpleStringDataSourceProvenance) o; 150 return dataSourceCreationTime.equals(pairs.dataSourceCreationTime) && 151 sha256Hash.equals(pairs.sha256Hash); 152 } 153 154 @Override 155 public int hashCode() { 156 return Objects.hash(super.hashCode(), dataSourceCreationTime, sha256Hash); 157 } 158 159 @Override 160 public Map<String, PrimitiveProvenance<?>> getInstanceValues() { 161 Map<String,PrimitiveProvenance<?>> map = new HashMap<>(); 162 163 map.put(DATASOURCE_CREATION_TIME,dataSourceCreationTime); 164 map.put(RESOURCE_HASH,sha256Hash); 165 166 return map; 167 } 168 } 169 170}