csv_managed/
cli.rs

1use std::path::PathBuf;
2
3use clap::{Args, Parser, Subcommand, ValueEnum};
4
// Root of the command-line interface; clap's `Parser` derive builds the argument parser
// from this struct.
// `author` and `version` are listed on `#[command]` without explicit values (presumably
// taken from the Cargo package metadata — confirm against the clap version in use).
// `long_about = None` is set explicitly, so only the short `about` string is configured here.
5#[derive(Debug, Parser)]
6#[command(author, version, about = "Manage CSV files efficiently", long_about = None)]
7pub struct Cli {
    // The subcommand chosen on the command line; `Commands` enumerates the available ones.
8    #[command(subcommand)]
9    pub command: Commands,
10}
11
// All subcommands exposed by the binary. Each variant carries the argument struct that is
// parsed for that subcommand.
// NOTE: with clap's derive API the `///` comments on the variants double as the help text
// shown to users, so treat them as user-facing strings when editing.
12#[derive(Debug, Subcommand)]
13pub enum Commands {
14    /// Probe a CSV file and infer column data types into a .schema file
15    Probe(ProbeArgs),
16    /// Create a .schema file from explicit column definitions
17    Schema(SchemaArgs),
18    /// Create a B-Tree index (.idx) for one or more columns
19    Index(IndexArgs),
20    /// Transform a CSV file using sorting, filtering, projection, derivations, and schema-driven replacements
21    Process(ProcessArgs),
22    /// Append multiple CSV files into a single output
23    Append(AppendArgs),
24    /// Verify one or more CSV files against a schema definition
25    Verify(VerifyArgs),
26    /// Preview the first few rows of a CSV file in a formatted table
27    Preview(PreviewArgs),
28    /// Produce summary statistics for numeric columns
29    Stats(StatsArgs),
30    /// Produce frequency counts for categorical columns
31    Frequency(FrequencyArgs),
32    /// Join two CSV files on common columns
33    Join(JoinArgs),
34    /// Install the csv-managed binary via cargo install
35    Install(InstallArgs),
36    /// List column names and data types from a schema file
37    Columns(ColumnsArgs),
38}
39
// Arguments for the `Probe` subcommand.
// `input` and `schema` are plain `PathBuf` fields with no default, which clap's derive
// treats as required arguments; everything else is optional or a boolean switch.
// The `///` comments below are user-facing help text under clap's derive API.
40#[derive(Debug, Args)]
41pub struct ProbeArgs {
42    /// Input CSV file to inspect
43    #[arg(short = 'i', long = "input")]
44    pub input: PathBuf,
45    /// Destination .schema file path
    // `--meta` is accepted as an alternative spelling of `--schema`.
46    #[arg(short = 'm', long = "schema", alias = "meta")]
47    pub schema: PathBuf,
48    /// Number of rows to sample when inferring types (0 means full scan)
49    #[arg(long, default_value_t = 2000)]
50    pub sample_rows: usize,
51    /// CSV delimiter character (supports ',', 'tab', ';', '|')
    // The raw string is converted to a single byte by `parse_delimiter`; `None` when omitted.
52    #[arg(long, value_parser = parse_delimiter)]
53    pub delimiter: Option<u8>,
54    /// Character encoding of the input file (defaults to utf-8)
    // Stored as the raw label; the utf-8 default is not applied in this struct.
55    #[arg(long = "input-encoding")]
56    pub input_encoding: Option<String>,
57    /// Emit column mapping templates to stdout after probing
58    #[arg(long = "mapping")]
59    pub mapping: bool,
60    /// Inject empty replace arrays into the generated schema as a template
    // The flag is spelled `--replace` even though the field is named `replace_template`.
61    #[arg(long = "replace")]
62    pub replace_template: bool,
63}
64
// Arguments for the `Schema` subcommand.
// Both list-valued options use `ArgAction::Append`, so each occurrence of the flag adds one
// entry. No `value_delimiter` is configured, so a comma-separated value arrives here as a
// single unsplit string (presumably split by the consumer — verify against the caller).
65#[derive(Debug, Args)]
66pub struct SchemaArgs {
67    /// Destination .schema file path
68    #[arg(short = 'o', long = "output")]
69    pub output: PathBuf,
70    /// Column definitions using `name:type` syntax (comma-separated or repeatable)
    // `required = true`: at least one `--column` must be supplied.
71    #[arg(short = 'c', long = "column", action = clap::ArgAction::Append, required = true)]
72    pub columns: Vec<String>,
73    /// Value replacement directives using `column=value->replacement`
    // Optional; the directive strings are stored verbatim and not parsed here.
74    #[arg(long = "replace", action = clap::ArgAction::Append)]
75    pub replacements: Vec<String>,
76}
77
// Arguments for the `Index` subcommand.
// Three ways of describing what to index are offered (`--columns`, `--spec`, `--combo`);
// all are collected as raw strings and none is marked required in this struct, so any
// "at least one" rule must be enforced by the consumer (not visible here).
78#[derive(Debug, Args)]
79pub struct IndexArgs {
80    /// Input CSV file to index
    // Bare `short, long`: clap derives the flag names from the field name (`-i` / `--input`).
81    #[arg(short, long)]
82    pub input: PathBuf,
83    /// Output index file (.idx)
    // Note the pairing: the short flag is `-o` but the long flag is `--index`, not `--output`.
84    #[arg(short = 'o', long = "index")]
85    pub index: PathBuf,
86    /// Columns to include in a single ascending index (deprecated when --spec is used)
    // `value_delimiter = ','` makes clap split `a,b,c` into separate entries.
87    #[arg(short = 'C', long = "columns", value_delimiter = ',')]
88    pub columns: Vec<String>,
89    /// Repeatable index specifications such as `col_a:asc,col_b:desc` or `fast=col_a:asc`
    // No value delimiter here: each `--spec` occurrence is kept as one unsplit string.
90    #[arg(long = "spec", action = clap::ArgAction::Append)]
91    pub specs: Vec<String>,
92    /// Generate index variants by expanding column prefixes and direction combinations (use `|` to separate directions)
93    #[arg(long = "combo", action = clap::ArgAction::Append)]
94    pub combos: Vec<String>,
95    /// Optional schema file describing column types
    // `--meta` is accepted as an alternative spelling of `--schema`.
96    #[arg(short = 'm', long = "schema", alias = "meta")]
97    pub schema: Option<PathBuf>,
98    /// Limit number of rows to scan (useful for prototyping)
99    #[arg(long)]
100    pub limit: Option<usize>,
101    /// CSV delimiter character (supports ',', 'tab', ';', '|')
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
102    #[arg(long, value_parser = parse_delimiter)]
103    pub delimiter: Option<u8>,
104    /// Character encoding of the input file (defaults to utf-8)
105    #[arg(long = "input-encoding")]
106    pub input_encoding: Option<String>,
107}
108
// Arguments for the `Process` subcommand.
// Only `input` is a non-optional field. The list-valued options (`--sort`, `--columns`,
// `--exclude-columns`, `--derive`, `--filter`) are repeatable via `ArgAction::Append` and
// are stored as raw strings; this struct does not parse or validate their mini-syntax.
// The `///` comments below are user-facing help text under clap's derive API.
109#[derive(Debug, Args)]
110pub struct ProcessArgs {
111    /// Input CSV file to process
112    #[arg(short = 'i', long = "input")]
113    pub input: PathBuf,
114    /// Output CSV file (stdout if omitted)
115    #[arg(short = 'o', long = "output")]
116    pub output: Option<PathBuf>,
117    /// Schema file to drive typed operations and apply value replacements
    // `--meta` is accepted as an alternative spelling of `--schema`.
118    #[arg(short = 'm', long = "schema", alias = "meta")]
119    pub schema: Option<PathBuf>,
120    /// Existing index file to speed up operations
121    #[arg(short = 'x', long = "index")]
122    pub index: Option<PathBuf>,
123    /// Specific index variant name to use from the selected index file
124    #[arg(long = "index-variant")]
125    pub index_variant: Option<String>,
126    /// Sort directives of the form `column[:asc|desc]`
127    #[arg(long = "sort", action = clap::ArgAction::Append)]
128    pub sort: Vec<String>,
129    /// Restrict output to this comma-separated list of columns
    // No `value_delimiter` is set, so a comma-separated value arrives as one unsplit string
    // (presumably split downstream — verify against the consumer).
130    #[arg(short = 'C', long = "columns", action = clap::ArgAction::Append)]
131    pub columns: Vec<String>,
132    /// Exclude this comma-separated list of columns from output
    // Same as `--columns`: values are not split by clap.
133    #[arg(long = "exclude-columns", action = clap::ArgAction::Append)]
134    pub exclude_columns: Vec<String>,
135    /// Additional derived columns using `name=expression`
136    #[arg(long = "derive", action = clap::ArgAction::Append)]
137    pub derives: Vec<String>,
138    /// Row-level filters such as `amount>=100` or `status = shipped`
139    #[arg(long = "filter", action = clap::ArgAction::Append)]
140    pub filters: Vec<String>,
141    /// Emit 1-based row numbers as the first column
142    #[arg(long = "row-numbers")]
143    pub row_numbers: bool,
144    /// Limit number of rows emitted
145    #[arg(long)]
146    pub limit: Option<usize>,
147    /// CSV delimiter character for reading input
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
148    #[arg(long, value_parser = parse_delimiter)]
149    pub delimiter: Option<u8>,
150    /// Delimiter to use for output (defaults to input delimiter)
    // Parsed with the same `parse_delimiter` rules; the fallback to the input delimiter is
    // not applied in this struct.
151    #[arg(long = "output-delimiter", value_parser = parse_delimiter)]
152    pub output_delimiter: Option<u8>,
153    /// Character encoding of the input file (defaults to utf-8)
154    #[arg(long = "input-encoding")]
155    pub input_encoding: Option<String>,
156    /// Character encoding for the output file/stdout (defaults to utf-8)
157    #[arg(long = "output-encoding")]
158    pub output_encoding: Option<String>,
159    /// Normalize boolean columns in output
    // The default string "original" is parsed into a `BooleanFormat` value.
160    #[arg(long = "boolean-format", default_value = "original")]
161    pub boolean_format: BooleanFormat,
162    /// Render output as an elastic table to stdout
163    #[arg(long = "table")]
164    pub table: bool,
165}
166
// Choices for an option typed as `BooleanFormat`.
// `rename_all = "kebab-case"` makes the command-line spellings `original`, `true-false`
// and `one-zero`. `Original` is the `Default` variant.
// How each variant affects rendered output is decided by the consumer, not in this file.
167#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq, Default)]
168#[value(rename_all = "kebab-case")]
169pub enum BooleanFormat {
170    #[default]
171    Original,
172    TrueFalse,
173    OneZero,
174}
175
// Arguments for the `Append` subcommand.
// `--input` is repeatable and required, so at least one input path must be given.
176#[derive(Debug, Args)]
177pub struct AppendArgs {
178    /// One or more CSV files to append
    // Each `-i/--input` occurrence adds one path to the list.
179    #[arg(short = 'i', long = "input", required = true, action = clap::ArgAction::Append)]
180    pub inputs: Vec<PathBuf>,
181    /// Destination CSV file (stdout if omitted)
182    #[arg(short = 'o', long = "output")]
183    pub output: Option<PathBuf>,
184    /// Schema file to verify against
    // `--meta` is accepted as an alternative spelling of `--schema`.
185    #[arg(short = 'm', long = "schema", alias = "meta")]
186    pub schema: Option<PathBuf>,
187    /// CSV delimiter character
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
188    #[arg(long, value_parser = parse_delimiter)]
189    pub delimiter: Option<u8>,
190    /// Character encoding for input files (defaults to utf-8)
    // A single encoding option covers every input file.
191    #[arg(long = "input-encoding")]
192    pub input_encoding: Option<String>,
193    /// Character encoding for the output file/stdout (defaults to utf-8)
194    #[arg(long = "output-encoding")]
195    pub output_encoding: Option<String>,
196}
197
// Arguments for the `Verify` subcommand.
// Unlike most other subcommands in this file, the schema is mandatory here (`PathBuf`,
// not `Option<PathBuf>`), and at least one `--input` must be supplied.
198#[derive(Debug, Args)]
199pub struct VerifyArgs {
200    /// Schema file describing the expected structure
    // `--meta` is accepted as an alternative spelling of `--schema`.
201    #[arg(short = 'm', long = "schema", alias = "meta")]
202    pub schema: PathBuf,
203    /// One or more CSV files to verify
204    #[arg(short = 'i', long = "input", required = true, action = clap::ArgAction::Append)]
205    pub inputs: Vec<PathBuf>,
206    /// CSV delimiter character
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
207    #[arg(long, value_parser = parse_delimiter)]
208    pub delimiter: Option<u8>,
209    /// Character encoding for input files (defaults to utf-8)
210    #[arg(long = "input-encoding")]
211    pub input_encoding: Option<String>,
212    /// Report invalid rows by summary (default) or detail. Append ':detail' and/or ':summary' and optionally a LIMIT value.
    // `num_args = 0..=3`: the flag may be given bare or followed by up to three values.
    // NOTE(review): with `Option<Vec<_>>`, clap is expected to yield `None` when the flag is
    // absent and `Some(vec![])` when it is given bare — confirm for the clap version in use.
    // The values are kept as raw strings; their interpretation happens elsewhere.
213    #[arg(long = "report-invalid", value_name = "OPTIONS", num_args = 0..=3)]
214    pub report_invalid: Option<Vec<String>>,
215}
216
// Arguments for the `Preview` subcommand.
// Only `--input` is a non-optional field; `--rows` falls back to 10 when omitted.
217#[derive(Debug, Args)]
218pub struct PreviewArgs {
219    /// Input CSV file to preview
220    #[arg(short = 'i', long = "input")]
221    pub input: PathBuf,
222    /// Number of rows to display
223    #[arg(long, default_value_t = 10)]
224    pub rows: usize,
225    /// CSV delimiter character
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
226    #[arg(long, value_parser = parse_delimiter)]
227    pub delimiter: Option<u8>,
228    /// Character encoding for input file (defaults to utf-8)
229    #[arg(long = "input-encoding")]
230    pub input_encoding: Option<String>,
231}
232
// Arguments for the `Stats` subcommand.
// `--columns` is repeatable and may be left empty; `--limit` is a plain `usize` that
// defaults to 0 rather than an `Option`, with 0 documented below as "scan everything".
233#[derive(Debug, Args)]
234pub struct StatsArgs {
235    /// Input CSV file to profile
236    #[arg(short = 'i', long = "input")]
237    pub input: PathBuf,
238    /// Schema file to drive typed operations
    // `--meta` is accepted as an alternative spelling of `--schema`.
239    #[arg(short = 'm', long = "schema", alias = "meta")]
240    pub schema: Option<PathBuf>,
241    /// Columns to include (defaults to numeric columns)
    // No `value_delimiter` is set: each `-C/--columns` occurrence is stored as given.
242    #[arg(short = 'C', long = "columns", action = clap::ArgAction::Append)]
243    pub columns: Vec<String>,
244    /// CSV delimiter character
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
245    #[arg(long, value_parser = parse_delimiter)]
246    pub delimiter: Option<u8>,
247    /// Character encoding for input file (defaults to utf-8)
248    #[arg(long = "input-encoding")]
249    pub input_encoding: Option<String>,
250    /// Maximum rows to scan (0 = all)
251    #[arg(long, default_value_t = 0)]
252    pub limit: usize,
253}
254
// Arguments for the `Frequency` subcommand.
// Same shape as `StatsArgs`, except that the numeric knob is `--top` (default 0, documented
// below as "show all distinct values") instead of a row limit.
255#[derive(Debug, Args)]
256pub struct FrequencyArgs {
257    /// Input CSV file to analyze
258    #[arg(short = 'i', long = "input")]
259    pub input: PathBuf,
260    /// Schema file to drive typed operations
    // `--meta` is accepted as an alternative spelling of `--schema`.
261    #[arg(short = 'm', long = "schema", alias = "meta")]
262    pub schema: Option<PathBuf>,
263    /// Columns to compute frequency counts for
    // Repeatable and not marked required; behaviour for an empty list is decided by the consumer.
264    #[arg(short = 'C', long = "columns", action = clap::ArgAction::Append)]
265    pub columns: Vec<String>,
266    /// CSV delimiter character
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
267    #[arg(long, value_parser = parse_delimiter)]
268    pub delimiter: Option<u8>,
269    /// Character encoding for input file (defaults to utf-8)
270    #[arg(long = "input-encoding")]
271    pub input_encoding: Option<String>,
272    /// Maximum distinct values to display per column (0 = all)
273    #[arg(long, default_value_t = 0)]
274    pub top: usize,
275}
276
// Arguments for the `Columns` subcommand: a single schema path and nothing else.
277#[derive(Debug, Args)]
278pub struct ColumnsArgs {
279    /// Schema file describing the columns to list
    // `--meta` is accepted as an alternative spelling of `--schema`.
280    #[arg(short = 'm', long = "schema", alias = "meta")]
281    pub schema: PathBuf,
282}
283
// Join flavours selectable through `JoinArgs::kind` (the `--type` option).
// `rename_all = "kebab-case"` makes the command-line spellings `inner`, `left`, `right`
// and `full`. No `Default` is derived; the default is supplied on the argument itself.
284#[derive(Debug, Clone, Copy, ValueEnum)]
285#[value(rename_all = "kebab-case")]
286pub enum JoinKind {
287    Inner,
288    Left,
289    Right,
290    Full,
291}
292
// Arguments for the `Join` subcommand.
// `left`, `right`, `left_key` and `right_key` are non-optional fields. Schemas and input
// encodings are configured per side, while a single `--delimiter` applies to both inputs.
293#[derive(Debug, Args)]
294pub struct JoinArgs {
295    /// Left CSV input
296    #[arg(long = "left")]
297    pub left: PathBuf,
298    /// Right CSV input
299    #[arg(long = "right")]
300    pub right: PathBuf,
301    /// Output CSV file (stdout if omitted)
302    #[arg(short = 'o', long = "output")]
303    pub output: Option<PathBuf>,
304    /// Comma-separated key columns from the left file
    // Kept as one raw string; splitting on commas is not done in this struct.
305    #[arg(long = "left-key")]
306    pub left_key: String,
307    /// Comma-separated key columns from the right file
    // Kept as one raw string, same as `left_key`.
308    #[arg(long = "right-key")]
309    pub right_key: String,
310    /// Join type (inner, left, right, full)
    // Exposed as `--type`; the field is named `kind` (presumably because `type` is a
    // reserved word in Rust). Defaults to `inner`.
311    #[arg(long = "type", value_enum, default_value = "inner")]
312    pub kind: JoinKind,
313    /// Schema for the left file
    // `--left-meta` is accepted as an alternative spelling.
314    #[arg(long = "left-schema", alias = "left-meta")]
315    pub left_schema: Option<PathBuf>,
316    /// Schema for the right file
    // `--right-meta` is accepted as an alternative spelling.
317    #[arg(long = "right-schema", alias = "right-meta")]
318    pub right_schema: Option<PathBuf>,
319    /// CSV delimiter character for inputs
    // Converted to a single byte by `parse_delimiter`; `None` when omitted.
320    #[arg(long = "delimiter", value_parser = parse_delimiter)]
321    pub delimiter: Option<u8>,
322    /// Character encoding for the left input file (defaults to utf-8)
323    #[arg(long = "left-encoding")]
324    pub left_encoding: Option<String>,
325    /// Character encoding for the right input file (defaults to utf-8)
326    #[arg(long = "right-encoding")]
327    pub right_encoding: Option<String>,
328    /// Character encoding for the output file/stdout (defaults to utf-8)
329    #[arg(long = "output-encoding")]
330    pub output_encoding: Option<String>,
331}
332
// Arguments for the `Install` subcommand. Every field is optional: two boolean switches
// and two optional values. How they are forwarded to `cargo install` is not visible here.
333#[derive(Debug, Args)]
334pub struct InstallArgs {
335    /// Install a specific published version
    // This is a subcommand-level `--version <VALUE>` option, distinct from the `version`
    // entry requested on the top-level `#[command]` attribute of `Cli`.
336    #[arg(long)]
337    pub version: Option<String>,
338    /// Force reinstallation even if already installed
339    #[arg(long)]
340    pub force: bool,
341    /// Use --locked to honour Cargo.lock for dependencies
342    #[arg(long)]
343    pub locked: bool,
344    /// Install into an alternate root directory
345    #[arg(long)]
346    pub root: Option<PathBuf>,
347}
348
/// Parses a delimiter argument into the single byte used as the CSV field separator.
///
/// Accepted spellings:
/// - the names `tab`, `comma`, `pipe` and `semicolon`, matched case-insensitively
///   (so `TAB` and `Comma` work as well);
/// - the two-character escape `\t` (a backslash followed by `t`), which is what a shell
///   passes for `--delimiter '\t'`;
/// - any single ASCII character, taken verbatim (this covers `,`, `;`, `|` and a literal
///   tab character).
///
/// # Errors
///
/// Returns a human-readable message when `value` is empty, when it has more than one
/// character and is not one of the spellings above, or when the character is not ASCII.
pub fn parse_delimiter(value: &str) -> Result<u8, String> {
    // Word forms of the delimiters; the single-character forms go through the generic path.
    const NAMED: [(&str, u8); 4] = [
        ("tab", b'\t'),
        ("comma", b','),
        ("pipe", b'|'),
        ("semicolon", b';'),
    ];

    if let Some(&(_, byte)) = NAMED
        .iter()
        .find(|(name, _)| value.eq_ignore_ascii_case(name))
    {
        return Ok(byte);
    }

    // A shell hands over `'\t'` as two characters; treat that escape as a tab instead of
    // rejecting it with "must be a single character".
    if value == "\\t" {
        return Ok(b'\t');
    }

    let mut chars = value.chars();
    let first = chars
        .next()
        .ok_or_else(|| "Delimiter cannot be empty".to_string())?;
    if chars.next().is_some() {
        return Err("Delimiter must be a single character".to_string());
    }
    if !first.is_ascii() {
        return Err("Delimiter must be ASCII".to_string());
    }
    // Lossless cast: `first` was just checked to be ASCII, so it fits in one byte.
    Ok(first as u8)
}