001/* 002 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package org.tribuo.data; 018 019import com.oracle.labs.mlrg.olcut.config.ConfigurationManager; 020import com.oracle.labs.mlrg.olcut.config.Option; 021import com.oracle.labs.mlrg.olcut.config.Options; 022import com.oracle.labs.mlrg.olcut.config.UsageException; 023import com.oracle.labs.mlrg.olcut.util.IOUtil; 024import com.oracle.labs.mlrg.olcut.util.LabsLogFormatter; 025import org.tribuo.ConfigurableDataSource; 026import org.tribuo.MutableDataset; 027import org.tribuo.Output; 028 029import java.io.IOException; 030import java.io.ObjectOutputStream; 031import java.nio.file.Path; 032import java.util.logging.Level; 033import java.util.logging.Logger; 034 035/** 036 * Reads in a Datasource, processes all the data, and writes it out as a serialized dataset. This makes sharing 037 * data preprocessing between multiple runs easier. 038 */ 039public final class PreprocessAndSerialize { 040 private static final Logger logger = Logger.getLogger(PreprocessAndSerialize.class.getName()); 041 042 private PreprocessAndSerialize() {} 043 044 public static class PreprocessAndSerializeOptions implements Options { 045 @Option(charName='d', longName="dataSource", usage="Datasource to load from a config file") 046 public ConfigurableDataSource<? extends Output<?>> dataSource; 047 @Option(charName='o', longName="serialized-dataset", usage="path to serialize the dataset") 048 public Path output; 049 } 050 051 public static void main(String[] args) { 052 053 LabsLogFormatter.setAllLogFormatters(); 054 055 PreprocessAndSerializeOptions opts = new PreprocessAndSerializeOptions(); 056 ConfigurationManager cm; 057 try { 058 cm = new ConfigurationManager(args,opts); 059 } catch (UsageException e) { 060 logger.info(e.getUsage()); 061 System.exit(1); 062 } 063 064 logger.info("Reading datasource into dataset"); 065 MutableDataset<?> dataset = new MutableDataset<>(opts.dataSource); 066 067 logger.info("Finished reading dataset"); 068 069 if(opts.output.endsWith("gz")) { 070 logger.info("Writing zipped dataset"); 071 } 072 try(ObjectOutputStream os = IOUtil.getObjectOutputStream(opts.output.toString(), opts.output.endsWith("gz"))) { 073 os.writeObject(dataset); 074 } catch (IOException e) { 075 logger.log(Level.SEVERE, "Error writing serialized dataset", e); 076 System.exit(1); 077 } 078 } 079}