extern crate csv;
extern crate docopt;
extern crate env_logger;
#[macro_use]
extern crate error_chain;
extern crate humansize;
extern crate libc;
#[macro_use]
extern crate log;
extern crate rustc_serialize;
extern crate time;
use humansize::{FileSize, file_size_opts};
use std::env;
use std::fs;
use std::io;
use std::io::prelude::*;
use std::process;
use errors::*;
mod errors;
/// Command-line help text for the program.
///
/// `main` hands this string to `docopt::Docopt::new` to decode the process
/// arguments into `Args`, so it serves both as the user-facing usage message
/// and as the text docopt works from.
// NOTE(review): docopt presumably derives the accepted options from the
// layout of this text (section breaks, spacing between an option and its
// description) -- confirm the whitespace is intact before editing it.
const USAGE: &'static str = r#"
Usage: scrubcsv [options] [<input>]
scrubcsv --help
scrubcsv --version
Read a CSV file, normalize the "good" lines, and print them to standard
output. Discard any lines with the wrong number of columns.
Options:
--help Show this help message
--version Print the version of this program
-q, --quiet Do not print performance information
-d, --delimiter CHAR Character used to separate fields in a row
(must be a single ASCII byte) [default: ,]
Exit code:
0 on success
1 on error
2 if more than 10% of rows were bad
"#;
/// Command-line arguments, decoded by docopt from the `USAGE` text in `main`.
#[derive(Debug, RustcDecodable)]
struct Args {
    /// Path of the CSV file to read. When `None`, `run` reads from standard
    /// input instead.
    arg_input: Option<String>,
    /// Field delimiter for the input. `run` rejects any value that is not
    /// exactly one byte long.
    flag_delimiter: String,
    /// When set, `run` does not print the row/throughput summary to standard
    /// error.
    flag_quiet: bool,
    /// When set, `main` prints the program version and exits with status 0
    /// without processing any input.
    flag_version: bool,
}
/// Scrub a CSV stream.
///
/// Reads CSV data from the file named by `args.arg_input` (or from standard
/// input when no file was given), writes every row whose column count matches
/// that of the first row to standard output, and silently drops the rest.
/// Unless `args.flag_quiet` is set, a one-line summary (row counts, elapsed
/// time, throughput) is printed to standard error.
///
/// # Errors
///
/// Returns an error if:
///
/// * the delimiter is not exactly one byte long,
/// * the input cannot be opened, read or parsed,
/// * the output cannot be written or flushed,
/// * more than 10% of the rows were discarded
///   (`ErrorKind::TooManyBadRows`).
fn run(args: &Args) -> Result<()> {
    // The CSV reader is configured with a single byte as delimiter, so
    // reject anything else up front.
    let delimiter_bytes = args.flag_delimiter.as_bytes();
    if delimiter_bytes.len() != 1 {
        return Err("field delimiter must be exactly one byte".into());
    }
    let delimiter = delimiter_bytes[0];

    let start_time = time::precise_time_s();

    // Lock stdout once and buffer it for the whole run.
    let stdout = io::stdout();
    let output = io::BufWriter::new(stdout.lock());

    // Read from the named file if there is one, otherwise from stdin.
    let stdin = io::stdin();
    let unbuffered_input: Box<Read> = if let Some(ref path) = args.arg_input {
        Box::new(fs::File::open(path)?)
    } else {
        Box::new(stdin.lock())
    };
    let input = io::BufReader::new(unbuffered_input);

    // `flexible(true)` lets rows of any width through the parser so that we
    // can count and discard the odd ones ourselves instead of aborting.
    let mut rdr = csv::Reader::from_reader(input)
        .has_headers(false)
        .flexible(true)
        .delimiter(delimiter);
    let mut wtr = csv::Writer::from_writer(output);

    let mut rows: u64 = 0;
    let mut bad_rows: u64 = 0;
    // Column count of the first row; every later row must match it.
    let mut columns_expected = None;
    for record in rdr.byte_records() {
        let record = record?;
        let is_valid = if let Some(cols) = columns_expected {
            if record.len() != cols {
                bad_rows += 1;
                false
            } else {
                true
            }
        } else {
            // The first row defines the expected width and is always kept.
            columns_expected = Some(record.len());
            true
        };
        if is_valid {
            wtr.write(record.into_iter())?;
        }
        rows += 1;
    }

    // Flush explicitly: the flush performed when the writer is dropped
    // discards I/O errors, which would let truncated output (full disk,
    // closed pipe) pass as success.
    wtr.flush()?;

    if !args.flag_quiet {
        let elapsed = time::precise_time_s() - start_time;
        let bytes_per_second = (rdr.byte_offset() as f64 / elapsed) as i64;
        writeln!(io::stderr(),
                 "{} rows ({} bad) in {:.2} seconds, {}/sec",
                 rows,
                 bad_rows,
                 elapsed,
                 bytes_per_second.file_size(file_size_opts::BINARY)?)?;
    }

    // Fail when more than 10% of all rows had to be discarded.
    if bad_rows * 10 > rows {
        return Err(ErrorKind::TooManyBadRows(bad_rows, rows).into());
    }
    Ok(())
}
/// Program entry point.
///
/// Initializes logging, decodes the command line against `USAGE`, handles
/// `--version`, and otherwise delegates to `run`. If `run` fails, the error
/// and its chain of causes are printed on one line to standard error
/// (followed by a backtrace when the error asks for one) and the process
/// exits with the status code supplied by the error.
fn main() {
    env_logger::init().unwrap();

    // On a malformed command line docopt prints its own message and exits.
    let args: Args = docopt::Docopt::new(USAGE)
        .and_then(|parser| parser.argv(env::args()).decode())
        .unwrap_or_else(|failure| failure.exit());
    debug!("Arguments: {:#?}", args);

    if args.flag_version {
        println!("scrubcsv {}", env!("CARGO_PKG_VERSION"));
        process::exit(0);
    }

    // Nothing more to do on success; everything below is error reporting.
    let err = match run(&args) {
        Ok(()) => return,
        Err(err) => err,
    };

    let mut stderr = io::stderr();

    // Print "ERROR: outermost: cause: root cause" on a single line.
    write!(&mut stderr, "ERROR").unwrap();
    for cause in err.iter() {
        write!(&mut stderr, ": {}", cause).unwrap();
    }
    writeln!(&mut stderr, "").unwrap();

    // Only ask for the backtrace when the error wants it displayed.
    let backtrace = if err.should_show_backtrace() {
        err.backtrace()
    } else {
        None
    };
    if let Some(trace) = backtrace {
        writeln!(&mut stderr, "Backtrace:\n{:?}", trace).unwrap();
    }

    process::exit(err.to_exit_code());
}