// qsv 0.87.0
//
// A high performance CSV data-wrangling toolkit.
// Crate-wide lint configuration. The `allow` list below only takes effect when the
// `cargo-clippy` feature cfg is active; the inline comments group the allowed lints
// by the reason they are silenced.
// NOTE(review): `let_underscore_drop` is listed without the `clippy::` prefix, unlike
// its neighbours - presumably intentional (a rustc-level lint); confirm.
#![cfg_attr(
    feature = "cargo-clippy",
    allow(
        clippy::cast_possible_truncation,
        clippy::cast_possible_wrap,
        clippy::cast_sign_loss,
        // things are often more readable this way
        clippy::cast_lossless,
        clippy::module_name_repetitions,
        clippy::type_complexity,
        clippy::zero_prefixed_literal,
        // correctly used
        clippy::derive_partial_eq_without_eq,
        clippy::enum_glob_use,
        let_underscore_drop,
        clippy::result_unit_err,
        // not practical
        clippy::similar_names,
        clippy::too_many_lines,
        clippy::struct_excessive_bools,
        // preference
        clippy::doc_markdown,
        clippy::unseparated_literal_suffix,
        clippy::unnecessary_wraps,
        // false positive
        clippy::needless_doctest_main,
        // noisy
        clippy::missing_errors_doc,
        clippy::must_use_candidate,
        clippy::use_self,
        clippy::cognitive_complexity,
        clippy::option_if_let_else,
    )
)]

extern crate crossbeam_channel as channel;
use std::{env, io, time::Instant};

use docopt::Docopt;
use serde::Deserialize;

use crate::clitypes::{CliError, CliResult, QsvExitCode};

// When the `mimalloc` feature is enabled, register mimalloc as the global allocator
// for the whole binary.
#[cfg(feature = "mimalloc")]
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

mod clitypes;
mod cmd;
mod config;
mod index;
mod select;
mod util;

/// Top-level usage text.
///
/// It is handed to `Docopt::new` in `main` to parse the command line, and it is
/// printed verbatim by the `help` command, so its wording and layout are part of
/// the program's runtime behaviour.
static USAGE: &str = r#"
Usage:
    qsv <command> [<args>...]
    qsv [options]

Options:
    --list               List all commands available.
    --envlist            List all qsv-relevant environment variables.
    -u, --update         Update qsv to the latest release from GitHub.
    -U, --updatenow      Update qsv to the latest release from GitHub without confirming.
    -h, --help           Display this message
    <command> -h         Display the command help message
    -v, --version        Print version info, mem allocator, features installed, 
                         max_jobs, num_cpus, build info then exit

* sponsored by datHere - Data Infrastructure Engineering
"#;

/// Top-level command-line arguments, deserialized by docopt from `USAGE`.
///
/// Field names follow docopt's naming convention: `arg_*` fields map to
/// positional `<...>` arguments and `flag_*` fields map to `--...` options.
#[derive(Deserialize)]
struct Args {
    /// The `<command>` to run; `None` when qsv is invoked without a command.
    arg_command:    Option<Command>,
    /// `--list`: print the installed commands.
    flag_list:      bool,
    /// `--envlist`: print the qsv-relevant environment variables.
    flag_envlist:   bool,
    /// `-u, --update`: check for and apply an update.
    flag_update:    bool,
    /// `-U, --updatenow`: same as `--update`; its value is also forwarded as the
    /// second argument of `util::qsv_check_for_update`.
    flag_updatenow: bool,
}

fn main() -> QsvExitCode {
    let mut enabled_commands = String::new();
    #[cfg(all(feature = "apply", not(feature = "lite")))]
    enabled_commands.push_str("    apply       Apply series of transformations to a column\n");

    enabled_commands.push_str(
        "    behead      Drop header from CSV file
    cat         Concatenate by row or column
    count       Count records
    dedup       Remove redundant rows
    diff        Create the difference between two CSVs
    enum        Add a new column enumerating CSV lines
    excel       Exports an Excel sheet to a CSV
    exclude     Excludes the records in one CSV from another
    explode     Explode rows based on some column separator
    extsort     Sort arbitrarily large text file\n",
    );

    #[cfg(all(feature = "fetch", not(feature = "lite")))]
    enabled_commands.push_str(
        "    fetch       Fetches data from web services for every row using HTTP Get.
    fetchpost   Fetches data from web services for every row using HTTP Post.\n",
    );

    enabled_commands.push_str(
        "    fill        Fill empty values
    fixlengths  Makes all records have same length
    flatten     Show one field per line
    fmt         Format CSV output (change field delimiter)\n",
    );

    #[cfg(all(feature = "foreach", not(feature = "lite")))]
    enabled_commands
        .push_str("    foreach     Loop over a CSV file to execute bash commands (*nix only)\n");

    enabled_commands.push_str("    frequency   Show frequency tables\n");

    #[cfg(all(feature = "generate", not(feature = "lite")))]
    enabled_commands.push_str("    generate    Generate test data by profiling a CSV\n");

    enabled_commands.push_str(
        "    headers     Show header names
    help        Show this usage message
    index       Create CSV index for faster access
    input       Read CSVs w/ special quoting, skipping, trimming & transcoding rules
    join        Join CSV files
    jsonl       Convert newline-delimited JSON files to CSV\n",
    );

    #[cfg(all(feature = "luau", not(feature = "lite")))]
    enabled_commands.push_str("    luau        Execute Luau script on CSV data\n");

    enabled_commands.push_str(
        "    partition   Partition CSV data based on a column value
    pseudo      Pseudonymise the values of a column\n",
    );

    #[cfg(all(feature = "python", not(feature = "lite")))]
    enabled_commands.push_str("    py          Evaluate a Python expression on CSV data\n");

    enabled_commands.push_str(
        "    rename      Rename the columns of CSV data efficiently
    replace     Replace patterns in CSV data
    reverse     Reverse rows of CSV data
    safenames   Modify a CSV's header names to db-safe names
    sample      Randomly sample CSV data
    schema      Generate JSON Schema from CSV data
    search      Search CSV data with a regex
    searchset   Search CSV data with a regex set
    select      Select, re-order, duplicate or drop columns
    slice       Slice records from CSV
    sniff       Quickly sniff CSV metadata
    sort        Sort CSV data in alphabetical, numerical, reverse or random order
    sortcheck   Check if a CSV is sorted
    split       Split CSV data into many files
    stats       Infer data types and compute summary statistics
    table       Align CSV data into columns
    tojsonl     Convert CSV to newline-delimited JSON\n",
    );

    #[cfg(all(feature = "to", not(feature = "lite")))]
    enabled_commands
        .push_str("    to          Convert CSVs to PostgreSQL/XLSX/Parquet/SQLite/Data Package\n");

    enabled_commands.push_str(
        "    transpose   Transpose rows/columns of CSV data
    validate    Validate CSV data for RFC4180-compliance or with JSON Schema",
    );
    let num_commands = enabled_commands.split('\n').count();

    let now = Instant::now();
    let qsv_args = util::init_logger();

    let args: Args = Docopt::new(USAGE)
        .and_then(|d| {
            d.options_first(true)
                .version(Some(util::version()))
                .deserialize()
        })
        .unwrap_or_else(|e| e.exit());
    if args.flag_list {
        wout!("Installed commands ({num_commands}):");
        wout!(
            "{enabled_commands}\n
sponsored by datHere - Data Infrastructure Engineering
        "
        );
        util::log_end(qsv_args, now);
        return QsvExitCode::Good;
    } else if args.flag_envlist {
        util::show_env_vars();
        util::log_end(qsv_args, now);
        return QsvExitCode::Good;
    }
    if args.flag_update || args.flag_updatenow {
        let update_checked = util::qsv_check_for_update(false, args.flag_updatenow);
        util::log_end(qsv_args, now);
        if update_checked.is_ok() {
            return QsvExitCode::Good;
        }
        return QsvExitCode::Bad;
    }
    match args.arg_command {
        None => {
            werr!(
                "qsv is a suite of CSV command line utilities.

Please choose one of the following {num_commands} commands:\n{enabled_commands}\n
sponsored by datHere - Data Infrastructure Engineering
"
            );
            _ = util::qsv_check_for_update(true, false);
            util::log_end(qsv_args, now);
            QsvExitCode::Good
        }
        Some(cmd) => match cmd.run() {
            Ok(()) => {
                util::log_end(qsv_args, now);
                QsvExitCode::Good
            }
            Err(CliError::Flag(err)) => {
                werr!("{err}");
                util::log_end(qsv_args, now);
                QsvExitCode::IncorrectUsage
            }
            Err(CliError::Csv(err)) => {
                werr!("{err}");
                util::log_end(qsv_args, now);
                QsvExitCode::Bad
            }
            Err(CliError::Io(ref err)) if err.kind() == io::ErrorKind::BrokenPipe => {
                werr!("Broken pipe: {err}");
                util::log_end(qsv_args, now);
                QsvExitCode::Abort
            }
            Err(CliError::Io(err)) => {
                werr!("{err}");
                util::log_end(qsv_args, now);
                QsvExitCode::Bad
            }
            Err(CliError::NoMatch()) => {
                util::log_end(qsv_args, now);
                QsvExitCode::Bad
            }
            Err(CliError::Other(msg)) => {
                werr!("{msg}");
                util::log_end(qsv_args, now);
                QsvExitCode::Bad
            }
        },
    }
}

/// The qsv subcommands that can appear as `<command>` on the command line.
///
/// With `rename_all = "lowercase"` each variant is deserialized from its
/// lowercased name (e.g. `ExtSort` from `extsort`). Variants behind a `#[cfg]`
/// only exist when the corresponding cargo feature is enabled and the `lite`
/// feature is not; `ForEach` additionally requires a unix target family.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "lowercase")]
enum Command {
    #[cfg(all(feature = "apply", not(feature = "lite")))]
    Apply,
    Behead,
    Cat,
    Count,
    Dedup,
    Diff,
    Enum,
    Excel,
    Exclude,
    Explode,
    ExtSort,
    #[cfg(all(feature = "fetch", not(feature = "lite")))]
    Fetch,
    #[cfg(all(feature = "fetch", not(feature = "lite")))]
    FetchPost,
    Fill,
    FixLengths,
    Flatten,
    Fmt,
    #[cfg(all(feature = "foreach", target_family = "unix", not(feature = "lite")))]
    ForEach,
    Frequency,
    #[cfg(all(feature = "generate", not(feature = "lite")))]
    Generate,
    Headers,
    Help,
    Index,
    Input,
    Join,
    Jsonl,
    #[cfg(all(feature = "luau", not(feature = "lite")))]
    Luau,
    Partition,
    Pseudo,
    #[cfg(all(feature = "python", not(feature = "lite")))]
    Py,
    Rename,
    Replace,
    Reverse,
    Safenames,
    Sample,
    Schema,
    Search,
    SearchSet,
    Select,
    Slice,
    Sniff,
    Sort,
    SortCheck,
    Split,
    Stats,
    Table,
    Transpose,
    #[cfg(all(feature = "to", not(feature = "lite")))]
    To,
    Tojsonl,
    Validate,
}

impl Command {
    /// Runs this subcommand, handing it the full process argument list.
    ///
    /// The command name as typed (the second process argument) must consist only
    /// of lowercase characters; otherwise a `CliError::Other` suggesting the
    /// lowercased spelling is returned and nothing is run. `Help` prints the
    /// top-level usage text and performs a best-effort update check; every other
    /// variant delegates to the `run` function of its `cmd` module.
    fn run(self) -> CliResult<()> {
        // Own the process arguments, then expose them as a slice of string slices.
        let raw_args: Vec<String> = env::args().collect();
        let arg_refs: Vec<&str> = raw_args.iter().map(String::as_str).collect();
        let argv: &[&str] = &arg_refs;

        // Reject the command as typed if any of its characters is not lowercase.
        let typed_command = argv[1];
        if typed_command.chars().any(|ch| !ch.is_lowercase()) {
            let suggestion = typed_command.to_lowercase();
            return Err(CliError::Other(format!(
                "qsv expects commands in lowercase. Did you mean '{suggestion}'?"
            )));
        }

        // Arms follow the declaration order of the enum; each cfg mirrors the one
        // on the corresponding variant.
        match self {
            #[cfg(all(feature = "apply", not(feature = "lite")))]
            Self::Apply => cmd::apply::run(argv),
            Self::Behead => cmd::behead::run(argv),
            Self::Cat => cmd::cat::run(argv),
            Self::Count => cmd::count::run(argv),
            Self::Dedup => cmd::dedup::run(argv),
            Self::Diff => cmd::diff::run(argv),
            Self::Enum => cmd::enumerate::run(argv),
            Self::Excel => cmd::excel::run(argv),
            Self::Exclude => cmd::exclude::run(argv),
            Self::Explode => cmd::explode::run(argv),
            Self::ExtSort => cmd::extsort::run(argv),
            #[cfg(all(feature = "fetch", not(feature = "lite")))]
            Self::Fetch => cmd::fetch::run(argv),
            #[cfg(all(feature = "fetch", not(feature = "lite")))]
            Self::FetchPost => cmd::fetchpost::run(argv),
            Self::Fill => cmd::fill::run(argv),
            Self::FixLengths => cmd::fixlengths::run(argv),
            Self::Flatten => cmd::flatten::run(argv),
            Self::Fmt => cmd::fmt::run(argv),
            #[cfg(all(feature = "foreach", target_family = "unix", not(feature = "lite")))]
            Self::ForEach => cmd::foreach::run(argv),
            Self::Frequency => cmd::frequency::run(argv),
            #[cfg(all(feature = "generate", not(feature = "lite")))]
            Self::Generate => cmd::generate::run(argv),
            Self::Headers => cmd::headers::run(argv),
            Self::Help => {
                wout!("{USAGE}");
                // Best effort: the outcome of the update check is ignored.
                _ = util::qsv_check_for_update(true, false);
                Ok(())
            }
            Self::Index => cmd::index::run(argv),
            Self::Input => cmd::input::run(argv),
            Self::Join => cmd::join::run(argv),
            Self::Jsonl => cmd::jsonl::run(argv),
            #[cfg(all(feature = "luau", not(feature = "lite")))]
            Self::Luau => cmd::luau::run(argv),
            Self::Partition => cmd::partition::run(argv),
            Self::Pseudo => cmd::pseudo::run(argv),
            #[cfg(all(feature = "python", not(feature = "lite")))]
            Self::Py => cmd::python::run(argv),
            Self::Rename => cmd::rename::run(argv),
            Self::Replace => cmd::replace::run(argv),
            Self::Reverse => cmd::reverse::run(argv),
            Self::Safenames => cmd::safenames::run(argv),
            Self::Sample => cmd::sample::run(argv),
            Self::Schema => cmd::schema::run(argv),
            Self::Search => cmd::search::run(argv),
            Self::SearchSet => cmd::searchset::run(argv),
            Self::Select => cmd::select::run(argv),
            Self::Slice => cmd::slice::run(argv),
            Self::Sniff => cmd::sniff::run(argv),
            Self::Sort => cmd::sort::run(argv),
            Self::SortCheck => cmd::sortcheck::run(argv),
            Self::Split => cmd::split::run(argv),
            Self::Stats => cmd::stats::run(argv),
            Self::Table => cmd::table::run(argv),
            Self::Transpose => cmd::transpose::run(argv),
            #[cfg(all(feature = "to", not(feature = "lite")))]
            Self::To => cmd::to::run(argv),
            Self::Tojsonl => cmd::tojsonl::run(argv),
            Self::Validate => cmd::validate::run(argv),
        }
    }
}