csv_managed/
cli.rs

1use std::path::PathBuf;
2
3use clap::{Args, Parser, Subcommand, ValueEnum};
4
5#[derive(Debug, Parser)]
6#[command(author, version, about = "Manage CSV files efficiently", long_about = None)]
7pub struct Cli {
8    #[command(subcommand)]
9    pub command: Commands,
10}
11
12#[derive(Debug, Subcommand)]
13pub enum Commands {
14    /// Create a -schema.yml file from explicit column definitions
15    Schema(SchemaArgs),
16    /// Create a B-Tree index (.idx) for one or more columns
17    Index(IndexArgs),
18    /// Transform a CSV file using sorting, filtering, projection, derivations, and schema-driven replacements
19    Process(ProcessArgs),
20    /// Append multiple CSV files into a single output
21    Append(AppendArgs),
22    /// Produce summary statistics for numeric columns or frequency counts via --frequency
23    Stats(StatsArgs),
24    // /// Join two CSV files on common columns
25    // Join(JoinArgs),
26    /// Install the csv-managed binary via cargo install
27    Install(InstallArgs),
28}
29
30#[derive(Debug, Args)]
31pub struct SchemaArgs {
32    /// Manual schema creation and shared options
33    #[command(subcommand)]
34    pub mode: Option<SchemaMode>,
35    /// Destination -schema.yml file path (alias --schema retained for compatibility)
36    #[arg(short = 'o', long = "output", alias = "schema", short_alias = 'm')]
37    pub output: Option<PathBuf>,
38    /// Column definitions using `name:type` syntax (comma-separated or repeatable)
39    #[arg(short = 'c', long = "column", action = clap::ArgAction::Append)]
40    pub columns: Vec<String>,
41    /// Value replacement directives using `column=value->replacement`
42    #[arg(long = "replace", action = clap::ArgAction::Append)]
43    pub replacements: Vec<String>,
44}
45
46#[derive(Debug, Subcommand)]
47pub enum SchemaMode {
48    /// Display inferred schema details without writing a file
49    Probe(SchemaProbeArgs),
50    /// Infer schema metadata and optionally persist a -schema.yml file
51    Infer(SchemaInferArgs),
52    /// Verify CSV files against a schema definition
53    Verify(SchemaVerifyArgs),
54    /// List column names and data types from a schema file
55    Columns(SchemaColumnsArgs),
56}
57
58#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq)]
59#[value(rename_all = "kebab-case")]
60pub enum NaPlaceholderBehavior {
61    Empty,
62    Fill,
63}
64
65#[derive(Debug, Args, Clone)]
66pub struct SchemaProbeArgs {
67    /// Input CSV file to inspect
68    #[arg(short = 'i', long = "input")]
69    pub input: PathBuf,
70    /// Number of rows to sample when inferring types (0 means full scan)
71    #[arg(long = "sample-rows", default_value_t = 2000)]
72    pub sample_rows: usize,
73    /// CSV delimiter character (supports ',', 'tab', ';', '|')
74    #[arg(long, value_parser = parse_delimiter)]
75    pub delimiter: Option<u8>,
76    /// Character encoding of the input file (defaults to utf-8)
77    #[arg(long = "input-encoding")]
78    pub input_encoding: Option<String>,
79    /// Emit column mapping templates to stdout after probing
80    #[arg(long = "mapping")]
81    pub mapping: bool,
82    /// Override inferred column types using `name:type`
83    #[arg(long = "override", action = clap::ArgAction::Append)]
84    pub overrides: Vec<String>,
85    /// Capture or validate a snapshot with header/type hash and sampled value summaries (writes if missing)
86    #[arg(long = "snapshot")]
87    pub snapshot: Option<PathBuf>,
88    /// How to treat NA-style placeholders (NA, N/A, #N/A, #NA)
89    #[arg(long = "na-behavior", value_enum, default_value = "empty")]
90    pub na_behavior: NaPlaceholderBehavior,
91    /// Replacement value used when --na-behavior=fill (defaults to empty string)
92    #[arg(long = "na-fill")]
93    pub na_fill: Option<String>,
94    /// Force header detection outcome (`true` treats first row as header, `false` treats it as data)
95    #[arg(long = "assume-header", value_name = "true|false")]
96    pub assume_header: Option<bool>,
97}
98
99#[derive(Debug, Args, Clone)]
100pub struct SchemaInferArgs {
101    #[command(flatten)]
102    pub probe: SchemaProbeArgs,
103    /// Destination -schema.yml file path (alias --schema retained for compatibility)
104    #[arg(short = 'o', long = "output", alias = "schema", short_alias = 'm')]
105    pub output: Option<PathBuf>,
106    /// Inject empty replace arrays into the generated schema as a template when inferring
107    #[arg(long = "replace-template")]
108    pub replace_template: bool,
109    /// Render the inference report and schema YAML to stdout instead of writing a file
110    #[arg(long = "preview")]
111    pub preview: bool,
112    /// Show a unified diff between an existing schema file and the inferred schema
113    #[arg(long = "diff")]
114    pub diff: Option<PathBuf>,
115}
116
117#[derive(Debug, Args, Clone)]
118pub struct SchemaVerifyArgs {
119    /// Schema file describing the expected structure
120    #[arg(short = 'm', long = "schema", alias = "meta")]
121    pub schema: PathBuf,
122    /// One or more CSV files to verify
123    #[arg(short = 'i', long = "input", required = true, action = clap::ArgAction::Append)]
124    pub inputs: Vec<PathBuf>,
125    /// CSV delimiter character
126    #[arg(long, value_parser = parse_delimiter)]
127    pub delimiter: Option<u8>,
128    /// Character encoding for input files (defaults to utf-8)
129    #[arg(long = "input-encoding")]
130    pub input_encoding: Option<String>,
131    /// Report invalid rows by summary (default) or detail. Append ':detail' and/or ':summary' and optionally a LIMIT value.
132    #[arg(long = "report-invalid", value_name = "OPTIONS", num_args = 0..=3)]
133    pub report_invalid: Option<Vec<String>>,
134}
135
136#[derive(Debug, Args)]
137pub struct IndexArgs {
138    /// Input CSV file to index
139    #[arg(short, long)]
140    pub input: PathBuf,
141    /// Output index file (.idx)
142    #[arg(short = 'o', long = "index")]
143    pub index: PathBuf,
144    /// Columns to include in a single ascending index (deprecated when --spec is used)
145    #[arg(short = 'C', long = "columns", value_delimiter = ',')]
146    pub columns: Vec<String>,
147    /// Repeatable index specifications such as `col_a:asc,col_b:desc` or `fast=col_a:asc`
148    #[arg(long = "spec", action = clap::ArgAction::Append)]
149    pub specs: Vec<String>,
150    /// Generate covering index variants by expanding column prefixes and direction combinations (use `|` to separate directions)
151    #[arg(long = "covering", action = clap::ArgAction::Append)]
152    pub coverings: Vec<String>,
153    /// Optional schema file describing column types
154    #[arg(short = 'm', long = "schema", alias = "meta")]
155    pub schema: Option<PathBuf>,
156    /// Limit number of rows to scan (useful for prototyping)
157    #[arg(long)]
158    pub limit: Option<usize>,
159    /// CSV delimiter character (supports ',', 'tab', ';', '|')
160    #[arg(long, value_parser = parse_delimiter)]
161    pub delimiter: Option<u8>,
162    /// Character encoding of the input file (defaults to utf-8)
163    #[arg(long = "input-encoding")]
164    pub input_encoding: Option<String>,
165}
166
167#[derive(Debug, Args)]
168pub struct ProcessArgs {
169    /// Input CSV file to process
170    #[arg(short = 'i', long = "input")]
171    pub input: PathBuf,
172    /// Output CSV file (stdout if omitted)
173    #[arg(short = 'o', long = "output")]
174    pub output: Option<PathBuf>,
175    /// Schema file to drive typed operations and apply value replacements
176    #[arg(short = 'm', long = "schema", alias = "meta")]
177    pub schema: Option<PathBuf>,
178    /// Existing index file to speed up operations
179    #[arg(short = 'x', long = "index")]
180    pub index: Option<PathBuf>,
181    /// Specific index variant name to use from the selected index file
182    #[arg(long = "index-variant")]
183    pub index_variant: Option<String>,
184    /// Sort directives of the form `column[:asc|desc]`
185    #[arg(long = "sort", action = clap::ArgAction::Append)]
186    pub sort: Vec<String>,
187    /// Restrict output to this comma-separated list of columns
188    #[arg(short = 'C', long = "columns", action = clap::ArgAction::Append)]
189    pub columns: Vec<String>,
190    /// Exclude this comma-separated list of columns from output
191    #[arg(long = "exclude-columns", action = clap::ArgAction::Append)]
192    pub exclude_columns: Vec<String>,
193    /// Additional derived columns using `name=expression`
194    #[arg(long = "derive", action = clap::ArgAction::Append)]
195    pub derives: Vec<String>,
196    /// Row-level filters such as `amount>=100` or `status = shipped`
197    #[arg(long = "filter", action = clap::ArgAction::Append)]
198    pub filters: Vec<String>,
199    /// Evalexpr-based filter expressions that must evaluate to truthy values
200    #[arg(long = "filter-expr", action = clap::ArgAction::Append)]
201    pub filter_exprs: Vec<String>,
202    /// Emit 1-based row numbers as the first column
203    #[arg(long = "row-numbers")]
204    pub row_numbers: bool,
205    /// Limit number of rows emitted
206    #[arg(long)]
207    pub limit: Option<usize>,
208    /// CSV delimiter character for reading input
209    #[arg(long, value_parser = parse_delimiter)]
210    pub delimiter: Option<u8>,
211    /// Delimiter to use for output (defaults to input delimiter)
212    #[arg(long = "output-delimiter", value_parser = parse_delimiter)]
213    pub output_delimiter: Option<u8>,
214    /// Character encoding of the input file (defaults to utf-8)
215    #[arg(long = "input-encoding")]
216    pub input_encoding: Option<String>,
217    /// Character encoding for the output file/stdout (defaults to utf-8)
218    #[arg(long = "output-encoding")]
219    pub output_encoding: Option<String>,
220    /// Normalize boolean columns in output
221    #[arg(long = "boolean-format", default_value = "original")]
222    pub boolean_format: BooleanFormat,
223    /// Render results as a preview table on stdout (disables --output and defaults the row limit)
224    #[arg(long = "preview")]
225    pub preview: bool,
226    /// Render output as an elastic table to stdout
227    #[arg(long = "table")]
228    pub table: bool,
229    /// Apply schema-defined datatype mappings before replacements (auto when available)
230    #[arg(long = "apply-mappings")]
231    pub apply_mappings: bool,
232    /// Skip schema-defined datatype mappings even if they exist
233    #[arg(long = "skip-mappings")]
234    pub skip_mappings: bool,
235}
236
237#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq, Default)]
238#[value(rename_all = "kebab-case")]
239pub enum BooleanFormat {
240    #[default]
241    Original,
242    TrueFalse,
243    OneZero,
244}
245
246#[derive(Debug, Args)]
247pub struct AppendArgs {
248    /// One or more CSV files to append
249    #[arg(short = 'i', long = "input", required = true, action = clap::ArgAction::Append)]
250    pub inputs: Vec<PathBuf>,
251    /// Destination CSV file (stdout if omitted)
252    #[arg(short = 'o', long = "output")]
253    pub output: Option<PathBuf>,
254    /// Schema file to verify against
255    #[arg(short = 'm', long = "schema", alias = "meta")]
256    pub schema: Option<PathBuf>,
257    /// CSV delimiter character
258    #[arg(long, value_parser = parse_delimiter)]
259    pub delimiter: Option<u8>,
260    /// Character encoding for input files (defaults to utf-8)
261    #[arg(long = "input-encoding")]
262    pub input_encoding: Option<String>,
263    /// Character encoding for the output file/stdout (defaults to utf-8)
264    #[arg(long = "output-encoding")]
265    pub output_encoding: Option<String>,
266}
267
268#[derive(Debug, Args)]
269pub struct StatsArgs {
270    /// Input CSV file to profile
271    #[arg(short = 'i', long = "input")]
272    pub input: PathBuf,
273    /// Schema file to drive typed operations
274    #[arg(short = 'm', long = "schema", alias = "meta")]
275    pub schema: Option<PathBuf>,
276    /// Columns to include (defaults to numeric columns)
277    #[arg(short = 'C', long = "columns", action = clap::ArgAction::Append)]
278    pub columns: Vec<String>,
279    /// Row-level filters such as `amount>=100` or `status = shipped`
280    #[arg(long = "filter", action = clap::ArgAction::Append)]
281    pub filters: Vec<String>,
282    /// Evalexpr-based filter expressions that must evaluate to truthy values
283    #[arg(long = "filter-expr", action = clap::ArgAction::Append)]
284    pub filter_exprs: Vec<String>,
285    /// CSV delimiter character
286    #[arg(long, value_parser = parse_delimiter)]
287    pub delimiter: Option<u8>,
288    /// Character encoding for input file (defaults to utf-8)
289    #[arg(long = "input-encoding")]
290    pub input_encoding: Option<String>,
291    /// Maximum rows to scan (0 = all)
292    #[arg(long, default_value_t = 0)]
293    pub limit: usize,
294    /// Emit distinct value counts instead of summary statistics
295    #[arg(long)]
296    pub frequency: bool,
297    /// Maximum distinct values to display per column when --frequency is used (0 = all)
298    #[arg(long, default_value_t = 0)]
299    pub top: usize,
300}
301
302#[derive(Debug, Args)]
303pub struct SchemaColumnsArgs {
304    /// Schema file describing the columns to list
305    #[arg(short = 'm', long = "schema", alias = "meta")]
306    pub schema: PathBuf,
307}
308
309#[derive(Debug, Clone, Copy, ValueEnum)]
310#[value(rename_all = "kebab-case")]
311pub enum JoinKind {
312    Inner,
313    Left,
314    Right,
315    Full,
316}
317
318#[derive(Debug, Args)]
319pub struct JoinArgs {
320    /// Left CSV input
321    #[arg(long = "left")]
322    pub left: PathBuf,
323    /// Right CSV input
324    #[arg(long = "right")]
325    pub right: PathBuf,
326    /// Output CSV file (stdout if omitted)
327    #[arg(short = 'o', long = "output")]
328    pub output: Option<PathBuf>,
329    /// Comma-separated key columns from the left file
330    #[arg(long = "left-key")]
331    pub left_key: String,
332    /// Comma-separated key columns from the right file
333    #[arg(long = "right-key")]
334    pub right_key: String,
335    /// Join type (inner, left, right, full)
336    #[arg(long = "type", value_enum, default_value = "inner")]
337    pub kind: JoinKind,
338    /// Schema for the left file
339    #[arg(long = "left-schema", alias = "left-meta")]
340    pub left_schema: Option<PathBuf>,
341    /// Schema for the right file
342    #[arg(long = "right-schema", alias = "right-meta")]
343    pub right_schema: Option<PathBuf>,
344    /// CSV delimiter character for inputs
345    #[arg(long = "delimiter", value_parser = parse_delimiter)]
346    pub delimiter: Option<u8>,
347    /// Character encoding for the left input file (defaults to utf-8)
348    #[arg(long = "left-encoding")]
349    pub left_encoding: Option<String>,
350    /// Character encoding for the right input file (defaults to utf-8)
351    #[arg(long = "right-encoding")]
352    pub right_encoding: Option<String>,
353    /// Character encoding for the output file/stdout (defaults to utf-8)
354    #[arg(long = "output-encoding")]
355    pub output_encoding: Option<String>,
356}
357
358#[derive(Debug, Args)]
359pub struct InstallArgs {
360    /// Install a specific published version
361    #[arg(long)]
362    pub version: Option<String>,
363    /// Force reinstallation even if already installed
364    #[arg(long)]
365    pub force: bool,
366    /// Use --locked to honour Cargo.lock for dependencies
367    #[arg(long)]
368    pub locked: bool,
369    /// Install into an alternate root directory
370    #[arg(long)]
371    pub root: Option<PathBuf>,
372}
373
/// Converts a delimiter given on the command line into a single byte.
///
/// Accepted spellings:
/// - `tab`, a literal tab character, or the two-character escape `\t`
/// - `comma` or `,`
/// - `pipe` or `|`
/// - `semicolon` or `;`
/// - any other single ASCII character, which is returned unchanged
///
/// # Errors
///
/// Returns a human-readable message when `value` is empty, is longer than one
/// character (and is not one of the names above), or is a single non-ASCII
/// character.
pub fn parse_delimiter(value: &str) -> Result<u8, String> {
    match value {
        // `"\\t"` is a backslash followed by `t`: that is what arrives when the
        // user types `--delimiter '\t'` in a shell, which does not expand the
        // escape inside single quotes. It used to be rejected as "too long".
        "tab" | "\t" | "\\t" => Ok(b'\t'),
        "comma" | "," => Ok(b','),
        "pipe" | "|" => Ok(b'|'),
        "semicolon" | ";" => Ok(b';'),
        other => {
            let mut chars = other.chars();
            // Look at the first two characters only: that is enough to tell
            // "empty", "exactly one" and "more than one" apart.
            match (chars.next(), chars.next()) {
                (None, _) => Err("Delimiter cannot be empty".to_string()),
                (Some(_), Some(_)) => Err("Delimiter must be a single character".to_string()),
                // The cast is lossless because ASCII code points are < 128.
                (Some(single), None) if single.is_ascii() => Ok(single as u8),
                (Some(_), None) => Err("Delimiter must be ASCII".to_string()),
            }
        }
    }
}