001/*
002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package org.tribuo.data;
018
019import com.oracle.labs.mlrg.olcut.config.ConfigurationManager;
020import com.oracle.labs.mlrg.olcut.config.Option;
021import com.oracle.labs.mlrg.olcut.config.Options;
022import com.oracle.labs.mlrg.olcut.config.UsageException;
023import com.oracle.labs.mlrg.olcut.util.IOUtil;
024import com.oracle.labs.mlrg.olcut.util.LabsLogFormatter;
025import org.tribuo.ConfigurableDataSource;
026import org.tribuo.MutableDataset;
027import org.tribuo.Output;
028
029import java.io.IOException;
030import java.io.ObjectOutputStream;
031import java.nio.file.Path;
032import java.util.logging.Level;
033import java.util.logging.Logger;
034
035/**
036 * Reads in a Datasource, processes all the data, and writes it out as a serialized dataset. This makes sharing
037 * data preprocessing between multiple runs easier.
038 */
039public final class PreprocessAndSerialize {
040    private static final Logger logger = Logger.getLogger(PreprocessAndSerialize.class.getName());
041
042    private PreprocessAndSerialize() {}
043
044    public static class PreprocessAndSerializeOptions implements Options {
045        @Option(charName='d', longName="dataSource", usage="Datasource to load from a config file")
046        public ConfigurableDataSource<? extends Output<?>> dataSource;
047        @Option(charName='o', longName="serialized-dataset", usage="path to serialize the dataset")
048        public Path output;
049    }
050
051    public static void main(String[] args) {
052
053        LabsLogFormatter.setAllLogFormatters();
054
055        PreprocessAndSerializeOptions opts = new PreprocessAndSerializeOptions();
056        ConfigurationManager cm;
057        try {
058            cm = new ConfigurationManager(args,opts);
059        } catch (UsageException e) {
060            logger.info(e.getUsage());
061            System.exit(1);
062        }
063
064        logger.info("Reading datasource into dataset");
065        MutableDataset<?> dataset = new MutableDataset<>(opts.dataSource);
066
067        logger.info("Finished reading dataset");
068
069        if(opts.output.endsWith("gz")) {
070            logger.info("Writing zipped dataset");
071        }
072        try(ObjectOutputStream os = IOUtil.getObjectOutputStream(opts.output.toString(), opts.output.endsWith("gz"))) {
073            os.writeObject(dataset);
074        } catch (IOException e) {
075            logger.log(Level.SEVERE,  "Error writing serialized dataset", e);
076            System.exit(1);
077        }
078    }
079}