001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data.text.impl;
018
019import com.oracle.labs.mlrg.olcut.config.Config;
020import com.oracle.labs.mlrg.olcut.provenance.ObjectProvenance;
021import com.oracle.labs.mlrg.olcut.provenance.PrimitiveProvenance;
022import com.oracle.labs.mlrg.olcut.provenance.Provenance;
023import com.oracle.labs.mlrg.olcut.provenance.ProvenanceUtil;
024import com.oracle.labs.mlrg.olcut.provenance.impl.SkeletalConfiguredObjectProvenance;
025import com.oracle.labs.mlrg.olcut.provenance.primitives.DateTimeProvenance;
026import com.oracle.labs.mlrg.olcut.provenance.primitives.HashProvenance;
027import com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance;
028import org.tribuo.Example;
029import org.tribuo.Output;
030import org.tribuo.OutputFactory;
031import org.tribuo.data.text.TextFeatureExtractor;
032import org.tribuo.provenance.ConfiguredDataSourceProvenance;
033
034import java.nio.file.Paths;
035import java.time.OffsetDateTime;
036import java.util.HashMap;
037import java.util.List;
038import java.util.Map;
039import java.util.Objects;
040import java.util.Optional;
041import java.util.logging.Logger;
042
043/**
044 * A version of {@link SimpleTextDataSource} that accepts an {@link Iterable} of Strings.
045 * <p>
046 * Uses the parsing logic from {@link SimpleTextDataSource}.
047 */
048public class SimpleStringDataSource<T extends Output<T>> extends SimpleTextDataSource<T> {
049
050    private static final Logger logger = Logger.getLogger(SimpleStringDataSource.class.getName());
051
052    /**
053     * Used because OLCUT doesn't support generic Iterables.
054     */
055    @Config(mandatory = true,description="The input data lines.")
056    protected List<String> rawLines;
057
058    /**
059     * For olcut.
060     */
061    private SimpleStringDataSource() {}
062
063    public SimpleStringDataSource(List<String> rawLines, OutputFactory<T> outputFactory, TextFeatureExtractor<T> extractor) {
064        super(outputFactory, extractor);
065        this.rawLines = rawLines;
066        this.path = Paths.get(System.getProperty("user.dir"));
067        read();
068        this.provenance = cacheProvenance();
069    }
070
071    /**
072     * Used by the OLCUT configuration system, and should not be called by external code.
073     */
074    @Override
075    public void postConfig() {
076        read();
077        this.provenance = cacheProvenance();
078    }
079
080    @Override
081    public String toString() {
082        StringBuilder buffer = new StringBuilder();
083        buffer.append(this.getClass().getSimpleName());
084        buffer.append("(extractor=");
085        buffer.append(extractor.toString());
086        buffer.append(",preprocessors=");
087        buffer.append(preprocessors.toString());
088        buffer.append(")");
089        return buffer.toString();
090    }
091
092    @Override
093    protected void read() {
094        int n = 0;
095        for (String line : rawLines) {
096            n++;
097            Optional<Example<T>> example = parseLine(line, n);
098            example.ifPresent(data::add);
099        }
100    }
101
102    @Override
103    protected ConfiguredDataSourceProvenance cacheProvenance() {
104        return new SimpleStringDataSourceProvenance(this);
105    }
106
107    /**
108     * Provenance for {@link SimpleStringDataSource}.
109     */
110    public static class SimpleStringDataSourceProvenance extends SkeletalConfiguredObjectProvenance implements ConfiguredDataSourceProvenance {
111        private static final long serialVersionUID = 1L;
112
113        private final DateTimeProvenance dataSourceCreationTime;
114        private final HashProvenance sha256Hash;
115
116        <T extends Output<T>> SimpleStringDataSourceProvenance(SimpleStringDataSource<T> host) {
117            super(host,"DataSource");
118            this.dataSourceCreationTime = new DateTimeProvenance(DATASOURCE_CREATION_TIME,OffsetDateTime.now());
119            this.sha256Hash = new HashProvenance(DEFAULT_HASH_TYPE,RESOURCE_HASH, ProvenanceUtil.hashList(DEFAULT_HASH_TYPE,host.rawLines));
120        }
121
122        public SimpleStringDataSourceProvenance(Map<String, Provenance> map) {
123            this(extractProvenanceInfo(map));
124        }
125
126        private SimpleStringDataSourceProvenance(ExtractedInfo info) {
127            super(info);
128            this.dataSourceCreationTime = (DateTimeProvenance) info.instanceValues.get(DATASOURCE_CREATION_TIME);
129            this.sha256Hash = (HashProvenance) info.instanceValues.get(RESOURCE_HASH);
130        }
131
132        protected static ExtractedInfo extractProvenanceInfo(Map<String,Provenance> map) {
133            Map<String,Provenance> configuredParameters = new HashMap<>(map);
134            String className = ObjectProvenance.checkAndExtractProvenance(configuredParameters,CLASS_NAME, StringProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName()).getValue();
135            String hostTypeStringName = ObjectProvenance.checkAndExtractProvenance(configuredParameters,HOST_SHORT_NAME, StringProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName()).getValue();
136
137            Map<String, PrimitiveProvenance<?>> instanceParameters = new HashMap<>();
138            instanceParameters.put(DATASOURCE_CREATION_TIME,ObjectProvenance.checkAndExtractProvenance(configuredParameters,DATASOURCE_CREATION_TIME,DateTimeProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName()));
139            instanceParameters.put(RESOURCE_HASH,ObjectProvenance.checkAndExtractProvenance(configuredParameters,RESOURCE_HASH,HashProvenance.class, SimpleStringDataSourceProvenance.class.getSimpleName()));
140
141            return new ExtractedInfo(className,hostTypeStringName,configuredParameters,instanceParameters);
142        }
143
144        @Override
145        public boolean equals(Object o) {
146            if (this == o) return true;
147            if (!(o instanceof SimpleStringDataSourceProvenance)) return false;
148            if (!super.equals(o)) return false;
149            SimpleStringDataSourceProvenance pairs = (SimpleStringDataSourceProvenance) o;
150            return dataSourceCreationTime.equals(pairs.dataSourceCreationTime) &&
151                    sha256Hash.equals(pairs.sha256Hash);
152        }
153
154        @Override
155        public int hashCode() {
156            return Objects.hash(super.hashCode(), dataSourceCreationTime, sha256Hash);
157        }
158
159        @Override
160        public Map<String, PrimitiveProvenance<?>> getInstanceValues() {
161            Map<String,PrimitiveProvenance<?>> map = new HashMap<>();
162
163            map.put(DATASOURCE_CREATION_TIME,dataSourceCreationTime);
164            map.put(RESOURCE_HASH,sha256Hash);
165
166            return map;
167        }
168    }
169
170}