mod benchmark;
#[cfg(feature = "http")]
mod http;
use benchmark::{find_annotations, run_benchmark};
use clap::Parser;
use csv_nose::{DatePreference, Quote, SampleSize, Sniffer};
use std::fmt::Write;
use std::path::PathBuf;
use std::process::ExitCode;
// Command-line arguments of the `csv-nose` binary, parsed with clap's derive API.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive macros turn
// `///` doc comments into `--help` text, so converting these to doc comments
// would change the program's user-visible output.
#[derive(Parser, Debug)]
#[command(name = "csv-nose")]
#[command(author, version, about, long_about = None)]
struct Args {
// Positional inputs. Each entry starting with `http://` or `https://` is
// treated as a URL, anything else as a local file path. In benchmark mode only
// the first entry is read, as the data directory. Required unless
// `--benchmark` is given.
#[arg(required_unless_present = "benchmark")]
files: Vec<String>,
// Run the benchmark instead of sniffing individual inputs.
#[arg(long)]
benchmark: bool,
// Explicit annotations file for benchmark mode. When absent, benchmark mode
// asks `find_annotations` to locate one for the data directory.
#[arg(long)]
annotations: Option<PathBuf>,
// Number of records to sample (default 100). Only consulted when neither
// `--all` nor `--sample-bytes` is given.
#[arg(short = 'n', long, default_value = "100")]
sample_records: usize,
// Number of bytes to sample. Takes precedence over `--sample-records` and is
// itself overridden by `--all`.
#[arg(short = 'b', long)]
sample_bytes: Option<usize>,
// Sample the whole input. Takes precedence over both other sampling options.
#[arg(short = 'a', long)]
all: bool,
// Delimiter to hand to the sniffer as an override. The sniffer receives it as
// a single byte.
#[arg(short = 'd', long)]
delimiter: Option<char>,
// Quote override. The word "none" (any letter case) selects `Quote::None`;
// otherwise the first character of the value is used as the quote character.
#[arg(short = 'q', long)]
quote: Option<String>,
// Selects `DatePreference::DmyFormat` on the sniffer.
#[arg(long)]
dmy: bool,
// Output format for sniff results (default "text").
#[arg(short = 'f', long, default_value = "text")]
format: OutputFormat,
// Adds per-field name/type details to the text and JSON outputs. The CSV
// output does not use it.
#[arg(short = 'v', long)]
verbose: bool,
// Print only the detected delimiter character for each input, bypassing
// `--format` entirely.
#[arg(long)]
delimiter_only: bool,
}
// Output format selected with `--format` / `-f`.
//
// NOTE: plain `//` comments are used on purpose; `clap::ValueEnum` picks up
// `///` doc comments on variants as help text for the possible values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
enum OutputFormat {
// Human-readable, multi-line report (handled by `print_text_output`).
Text,
// One JSON object per input (handled by `print_json_output`).
Json,
// One CSV row per input, preceded by a single header row (handled by
// `print_csv_output`).
Csv,
}
fn main() -> ExitCode {
let args = Args::parse();
if args.benchmark {
return run_benchmark_cli(&args);
}
let mut exit_code = ExitCode::SUCCESS;
for file in &args.files {
let result = if is_url(file) {
#[cfg(feature = "http")]
{
sniff_url(file, &args)
}
#[cfg(not(feature = "http"))]
{
Err("HTTP support not enabled. Rebuild with --features http".into())
}
} else {
sniff_file(&PathBuf::from(file), &args)
};
if let Err(e) = result {
eprintln!("Error processing {file}: {e}");
exit_code = ExitCode::FAILURE;
}
}
exit_code
}
/// Returns `true` when `path` begins with the literal prefix `http://` or
/// `https://`.
///
/// The comparison is case-sensitive and purely textual; nothing is parsed or
/// validated beyond the prefix.
fn is_url(path: &str) -> bool {
    ["http://", "https://"]
        .iter()
        .any(|&scheme| path.starts_with(scheme))
}
fn run_benchmark_cli(args: &Args) -> ExitCode {
if args.files.is_empty() {
eprintln!("Error: benchmark mode requires a directory path");
return ExitCode::FAILURE;
}
if is_url(&args.files[0]) {
eprintln!("Error: benchmark mode requires a local directory, not a URL");
return ExitCode::FAILURE;
}
let data_dir = PathBuf::from(&args.files[0]);
if !data_dir.is_dir() {
eprintln!("Error: {} is not a directory", data_dir.display());
return ExitCode::FAILURE;
}
let annotations_path = if let Some(ref path) = args.annotations {
path.clone()
} else if let Some(path) = find_annotations(&data_dir) {
path
} else {
eprintln!(
"Error: Could not find annotations file for {}",
data_dir.display()
);
eprintln!("Use --annotations to specify the path to the annotations file");
return ExitCode::FAILURE;
};
println!("Running benchmark on: {}", data_dir.display());
println!("Using annotations: {}", annotations_path.display());
println!();
match run_benchmark(&data_dir, &annotations_path) {
Ok(result) => {
result.print_details();
result.print_summary();
ExitCode::SUCCESS
}
Err(e) => {
eprintln!("Error running benchmark: {e}");
ExitCode::FAILURE
}
}
}
fn sniff_file(path: &PathBuf, args: &Args) -> Result<(), Box<dyn std::error::Error>> {
let mut sniffer = Sniffer::new();
if args.all {
sniffer.sample_size(SampleSize::All);
} else if let Some(bytes) = args.sample_bytes {
sniffer.sample_size(SampleSize::Bytes(bytes));
} else {
sniffer.sample_size(SampleSize::Records(args.sample_records));
}
if args.dmy {
sniffer.date_preference(DatePreference::DmyFormat);
}
if let Some(delim) = args.delimiter {
sniffer.delimiter(delim as u8);
}
if let Some(ref quote_str) = args.quote {
if quote_str.to_lowercase() == "none" {
sniffer.quote(Quote::None);
} else if let Some(c) = quote_str.chars().next() {
sniffer.quote(Quote::Some(c as u8));
}
}
let metadata = sniffer.sniff_path(path)?;
if args.delimiter_only {
println!("{}", metadata.dialect.delimiter as char);
return Ok(());
}
let display_path = path.display().to_string();
match args.format {
OutputFormat::Text => print_text_output(&display_path, &metadata, args.verbose),
OutputFormat::Json => print_json_output(&display_path, &metadata, args.verbose),
OutputFormat::Csv => print_csv_output(&display_path, &metadata),
}
Ok(())
}
/// Downloads `url`, sniffs the downloaded bytes and prints the result in the
/// requested format.
///
/// The byte limit passed to `http::fetch_url` is `None` for `--all`, the
/// `--sample-bytes` value when given, and otherwise `sample_records * 500`
/// (presumably a rough bytes-per-record estimate). Whatever `fetch_url`
/// returns is then sniffed in full. With `--delimiter-only` just the detected
/// delimiter character is printed and `--format` is ignored.
///
/// # Errors
///
/// Returns an error when the `--delimiter` or `--quote` character cannot be
/// represented as a single byte (checked before any download is attempted),
/// when the fetch fails, or when the sniffer fails on the data.
#[cfg(feature = "http")]
fn sniff_url(url: &str, args: &Args) -> Result<(), Box<dyn std::error::Error>> {
    // The sniffer takes single bytes. A plain `as u8` cast would silently
    // truncate characters above U+00FF (e.g. '€' -> 0xAC), so reject them
    // up front rather than after a pointless download.
    let delimiter = match args.delimiter {
        Some(c) if u32::from(c) > u32::from(u8::MAX) => {
            return Err(format!("delimiter {c:?} cannot be represented as a single byte").into());
        }
        Some(c) => Some(c as u8),
        None => None,
    };
    let quote = match args.quote.as_deref() {
        None => None,
        Some(s) if s.eq_ignore_ascii_case("none") => Some(Quote::None),
        // Only the first character is used; an empty value leaves the quote
        // setting untouched.
        Some(s) => match s.chars().next() {
            None => None,
            Some(c) if u32::from(c) > u32::from(u8::MAX) => {
                return Err(
                    format!("quote character {c:?} cannot be represented as a single byte").into(),
                );
            }
            Some(c) => Some(Quote::Some(c as u8)),
        },
    };

    let max_bytes = if args.all {
        None
    } else if let Some(bytes) = args.sample_bytes {
        Some(bytes)
    } else {
        // Saturate instead of overflowing on an absurdly large record count.
        Some(args.sample_records.saturating_mul(500))
    };
    let fetch_result = http::fetch_url(url, max_bytes)?;

    let mut sniffer = Sniffer::new();
    // Sampling was already decided by the byte limit above, so everything
    // that was fetched gets examined.
    sniffer.sample_size(SampleSize::All);
    if args.dmy {
        sniffer.date_preference(DatePreference::DmyFormat);
    }
    if let Some(byte) = delimiter {
        sniffer.delimiter(byte);
    }
    if let Some(quote) = quote {
        sniffer.quote(quote);
    }

    let metadata = sniffer.sniff_bytes(&fetch_result.data)?;
    if args.delimiter_only {
        println!("{}", metadata.dialect.delimiter as char);
        return Ok(());
    }

    match args.format {
        OutputFormat::Text => print_text_output(url, &metadata, args.verbose),
        OutputFormat::Json => print_json_output(url, &metadata, args.verbose),
        OutputFormat::Csv => print_csv_output(url, &metadata),
    }
    Ok(())
}
/// Prints a human-readable report for one sniffed input to stdout.
///
/// `path` is echoed as the report heading. With `verbose`, each field is
/// listed with its 1-based position, name and type. The report always ends
/// with an empty line.
fn print_text_output(path: &str, metadata: &csv_nose::Metadata, verbose: bool) {
    let dialect = &metadata.dialect;
    // The quote character is shown in its `Debug` form; the absence of one is
    // shown as the bare word `none`.
    let quote_text = match dialect.quote {
        Quote::Some(q) => format!("{:?}", q as char),
        Quote::None => String::from("none"),
    };

    println!("File: {path}");
    println!(" Delimiter: {:?}", dialect.delimiter as char);
    println!(" Quote: {}", quote_text);
    println!(" Has header: {}", dialect.header.has_header_row);
    println!(" Preamble rows: {}", dialect.header.num_preamble_rows);
    println!(" Flexible: {}", dialect.flexible);
    println!(" UTF-8: {}", dialect.is_utf8);
    println!(" Fields: {}", metadata.num_fields);
    println!(" Avg record length: {} bytes", metadata.avg_record_len);

    if verbose {
        println!(" Field details:");
        let named_types = metadata.fields.iter().zip(metadata.types.iter());
        for (position, (name, typ)) in (1usize..).zip(named_types) {
            println!(" {}: {} ({})", position, name, typ);
        }
    }

    println!();
}
/// Escapes `s` for embedding between the double quotes of a JSON string.
///
/// `"`, `\`, newline, carriage return and tab get their two-character
/// escapes. Any other character for which `char::is_control` is true is
/// written as a `\uXXXX` escape with four lowercase hex digits. Everything
/// else is copied unchanged.
fn escape_json(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let short_form = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        if let Some(sequence) = short_form {
            escaped.push_str(sequence);
        } else if ch.is_control() {
            // Writing into a `String` cannot fail, so the result is ignored.
            let _ = write!(escaped, "\\u{:04x}", u32::from(ch));
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
/// Returns `s` formatted as a single CSV field.
///
/// A value containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes with every embedded double quote
/// doubled. Any other value is returned as an unchanged copy.
fn escape_csv(s: &str) -> String {
    let needs_quoting = s.chars().any(|c| matches!(c, ',' | '"' | '\n' | '\r'));
    if !needs_quoting {
        return s.to_owned();
    }

    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');
    for c in s.chars() {
        if c == '"' {
            // An embedded quote is represented by two quotes.
            quoted.push('"');
        }
        quoted.push(c);
    }
    quoted.push('"');
    quoted
}
/// Prints one JSON object describing a sniffed input, followed by a newline.
///
/// The object has the keys `file`, `dialect` (`delimiter`, `quote`,
/// `has_header`, `preamble_rows`, `flexible`, `is_utf8`), `num_fields` and
/// `avg_record_len`. `quote` is `null` when the dialect has no quote
/// character. With `verbose`, a `fields` array of `{"name", "type"}` objects
/// is appended.
fn print_json_output(path: &str, metadata: &csv_nose::Metadata, verbose: bool) {
    // The delimiter and quote characters must be escaped like any other string
    // content: a tab delimiter or the common `"` quote would otherwise yield
    // invalid JSON (a raw control character, or `"quote":"""`).
    let delimiter_json = escape_json(&(metadata.dialect.delimiter as char).to_string());
    let quote_json = match metadata.dialect.quote {
        Quote::None => "null".to_string(),
        Quote::Some(q) => format!("\"{}\"", escape_json(&(q as char).to_string())),
    };

    // The object is left open here so the optional `fields` array can be
    // appended before the closing brace.
    print!(
        r#"{{"file":"{}","dialect":{{"delimiter":"{}","quote":{},"has_header":{},"preamble_rows":{},"flexible":{},"is_utf8":{}}},"num_fields":{},"avg_record_len":{}"#,
        escape_json(path),
        delimiter_json,
        quote_json,
        metadata.dialect.header.has_header_row,
        metadata.dialect.header.num_preamble_rows,
        metadata.dialect.flexible,
        metadata.dialect.is_utf8,
        metadata.num_fields,
        metadata.avg_record_len
    );

    if verbose {
        print!(r#","fields":["#);
        for (i, (name, typ)) in metadata
            .fields
            .iter()
            .zip(metadata.types.iter())
            .enumerate()
        {
            if i > 0 {
                print!(",");
            }
            print!(
                r#"{{"name":"{}","type":"{}"}}"#,
                escape_json(name),
                escape_json(&typ.to_string())
            );
        }
        print!("]");
    }

    println!("}}");
}
/// Prints one CSV row describing a sniffed input.
///
/// The header row is printed once per process, before the first data row.
/// Columns: `file`, `delimiter`, `quote`, `has_header`, `preamble_rows`,
/// `flexible`, `is_utf8`, `num_fields`, `avg_record_len`. A dialect without a
/// quote character is reported as `none`.
fn print_csv_output(path: &str, metadata: &csv_nose::Metadata) {
    use std::sync::atomic::{AtomicBool, Ordering};
    // Process-wide flag so the header is emitted only before the first row.
    // It guards no other data, so relaxed ordering is sufficient.
    static HEADER_PRINTED: AtomicBool = AtomicBool::new(false);

    // The delimiter and quote columns hold exactly the characters that break
    // CSV when written raw: a `,` delimiter would add a column and a `"` quote
    // would open an unterminated quoted field. Escape them like the path.
    let delimiter_field = escape_csv(&(metadata.dialect.delimiter as char).to_string());
    let quote_field = match metadata.dialect.quote {
        Quote::None => "none".to_string(),
        Quote::Some(q) => escape_csv(&(q as char).to_string()),
    };

    if !HEADER_PRINTED.swap(true, Ordering::Relaxed) {
        println!(
            "file,delimiter,quote,has_header,preamble_rows,flexible,is_utf8,num_fields,avg_record_len"
        );
    }

    println!(
        "{},{},{},{},{},{},{},{},{}",
        escape_csv(path),
        delimiter_field,
        quote_field,
        metadata.dialect.header.has_header_row,
        metadata.dialect.header.num_preamble_rows,
        metadata.dialect.flexible,
        metadata.dialect.is_utf8,
        metadata.num_fields,
        metadata.avg_record_len
    );
}