Skip to main content

datui_cli/
lib.rs

//! Shared CLI definitions for datui.
//!
//! Used by the main application, by the build script (manpage), and by the
//! `gen_docs` binary (command-line-options markdown).
5
6use clap::{CommandFactory, Parser, ValueEnum};
7use std::path::Path;
8
9/// File format for data files (used to bypass extension-based detection).
10/// When `--format` is not specified, format is auto-detected from the file extension.
11#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq)]
12pub enum FileFormat {
13    /// Parquet columnar format
14    Parquet,
15    /// Comma-separated values
16    Csv,
17    /// Tab-separated values
18    Tsv,
19    /// Pipe-separated values
20    Psv,
21    /// JSON array format
22    Json,
23    /// JSON Lines / NDJSON (one JSON object per line)
24    Jsonl,
25    /// Arrow IPC / Feather
26    Arrow,
27    /// Avro row format
28    Avro,
29    /// ORC columnar format
30    Orc,
31    /// Excel (.xls, .xlsx, .xlsm, .xlsb)
32    Excel,
33}
34
35impl FileFormat {
36    /// Detect file format from path extension. Returns None when extension is missing or unknown.
37    pub fn from_path(path: &Path) -> Option<Self> {
38        path.extension()
39            .and_then(|e| e.to_str())
40            .and_then(Self::from_extension)
41    }
42
43    /// Parse format from extension string (e.g. "parquet", "csv").
44    pub fn from_extension(ext: &str) -> Option<Self> {
45        match ext.to_lowercase().as_str() {
46            "parquet" => Some(Self::Parquet),
47            "csv" => Some(Self::Csv),
48            "tsv" => Some(Self::Tsv),
49            "psv" => Some(Self::Psv),
50            "json" => Some(Self::Json),
51            "jsonl" | "ndjson" => Some(Self::Jsonl),
52            "arrow" | "ipc" | "feather" => Some(Self::Arrow),
53            "avro" => Some(Self::Avro),
54            "orc" => Some(Self::Orc),
55            "xls" | "xlsx" | "xlsm" | "xlsb" => Some(Self::Excel),
56            _ => None,
57        }
58    }
59}
60
61/// Compression format for data files
62#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq)]
63pub enum CompressionFormat {
64    /// Gzip compression (.gz) - Most common, good balance of speed and compression
65    Gzip,
66    /// Zstandard compression (.zst) - Modern, fast compression with good ratios
67    Zstd,
68    /// Bzip2 compression (.bz2) - Good compression ratio, slower than gzip
69    Bzip2,
70    /// XZ compression (.xz) - Excellent compression ratio, slower than bzip2
71    Xz,
72}
73
74impl CompressionFormat {
75    /// Detect compression format from file extension
76    pub fn from_extension(path: &Path) -> Option<Self> {
77        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
78            match ext.to_lowercase().as_str() {
79                "gz" => Some(Self::Gzip),
80                "zst" | "zstd" => Some(Self::Zstd),
81                "bz2" | "bz" => Some(Self::Bzip2),
82                "xz" => Some(Self::Xz),
83                _ => None,
84            }
85        } else {
86            None
87        }
88    }
89
90    /// Get file extension for this compression format
91    pub fn extension(&self) -> &'static str {
92        match self {
93            Self::Gzip => "gz",
94            Self::Zstd => "zst",
95            Self::Bzip2 => "bz2",
96            Self::Xz => "xz",
97        }
98    }
99}
100
101/// Command-line arguments for datui
102#[derive(Clone, Parser, Debug)]
103#[command(
104    name = "datui",
105    version,
106    about = "Data Exploration in the Terminal",
107    long_about = include_str!("../long_about.txt")
108)]
109pub struct Args {
110    /// Path(s) to the data file(s) to open.
111    /// Multiple files of the same format are concatenated into one table (not required with --generate-config, --clear-cache, or --remove-templates)
112    #[arg(required_unless_present_any = ["generate_config", "clear_cache", "remove_templates"], num_args = 1.., value_name = "PATH")]
113    pub paths: Vec<std::path::PathBuf>,
114
115    /// Skip this many lines when reading a file
116    #[arg(long = "skip-lines")]
117    pub skip_lines: Option<usize>,
118
119    /// Skip this many rows when reading a file
120    #[arg(long = "skip-rows")]
121    pub skip_rows: Option<usize>,
122
123    /// Skip this many rows at the end of the file (e.g. to ignore vendor footer or trailing garbage)
124    #[arg(long = "skip-tail-rows", value_name = "N")]
125    pub skip_tail_rows: Option<usize>,
126
127    /// Specify that the file has no header
128    #[arg(long = "no-header")]
129    pub no_header: Option<bool>,
130
131    /// Specify the delimiter to use when reading a delimited text file
132    #[arg(long = "delimiter")]
133    pub delimiter: Option<u8>,
134
135    /// Treat these values as null when reading CSV. Use once per value; no "=" means all columns, COL=VAL means column COL only (first "=" separates column from value). Example: --null-value NA --null-value amount=
136    #[arg(long = "null-value", value_name = "VAL")]
137    pub null_value: Vec<String>,
138
139    /// Specify the compression format explicitly (gzip, zstd, bzip2, xz)
140    /// If not specified, compression is auto-detected from file extension.
141    #[arg(long = "compression", value_enum)]
142    pub compression: Option<CompressionFormat>,
143
144    /// Force file format (parquet, csv, tsv, psv, json, jsonl, arrow, avro, orc, excel).
145    /// By default format is auto-detected from the file extension. Use this for URLs or paths without an extension.
146    #[arg(long = "format", value_enum)]
147    pub format: Option<FileFormat>,
148
149    /// Enable debug mode to show operational information
150    #[arg(long = "debug", action)]
151    pub debug: bool,
152
153    /// Enable Hive-style partitioning for directory or glob paths; ignored for a single file
154    #[arg(long = "hive", action)]
155    pub hive: bool,
156
157    /// Infer Hive/partitioned Parquet schema from one file for faster load (default: true). Set to false to use full schema scan.
158    #[arg(long = "single-spine-schema", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
159    pub single_spine_schema: Option<bool>,
160
161    /// Try to parse CSV string columns as dates (e.g. YYYY-MM-DD, ISO datetime). Default: true
162    #[arg(long = "parse-dates", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
163    pub parse_dates: Option<bool>,
164
165    /// Trim whitespace and parse CSV string columns as date, datetime, time, duration, int, or float. Default: applied to all string columns. Use --parse-strings=COL (repeatable) to limit to specific columns, or --no-parse-strings to disable.
166    #[arg(long = "parse-strings", value_name = "COL", num_args = 0.., default_missing_value = "")]
167    pub parse_strings: Vec<String>,
168
169    /// Disable parse-strings for CSV (trim and type inference). Overrides config and default.
170    #[arg(long = "no-parse-strings", action)]
171    pub no_parse_strings: bool,
172
173    /// Decompress into memory. Default: decompress to temp file and use lazy scan
174    #[arg(long = "decompress-in-memory", default_missing_value = "true", num_args = 0..=1, value_parser = clap::value_parser!(bool))]
175    pub decompress_in_memory: Option<bool>,
176
177    /// Directory for decompression temp files (default: system temp, e.g. TMPDIR)
178    #[arg(long = "temp-dir", value_name = "DIR")]
179    pub temp_dir: Option<std::path::PathBuf>,
180
181    /// Excel sheet to load: 0-based index (e.g. 0) or sheet name (e.g. "Sales")
182    #[arg(long = "sheet", value_name = "SHEET")]
183    pub excel_sheet: Option<String>,
184
185    /// Clear all cache data and exit
186    #[arg(long = "clear-cache", action)]
187    pub clear_cache: bool,
188
189    /// Apply a template by name when starting the application
190    #[arg(long = "template")]
191    pub template: Option<String>,
192
193    /// Remove all templates and exit
194    #[arg(long = "remove-templates", action)]
195    pub remove_templates: bool,
196
197    /// When set, datasets with this many or more rows are sampled for analysis (faster, less memory).
198    /// Overrides config [performance] sampling_threshold. Use 0 to disable sampling (full dataset) for this run.
199    /// When omitted, config or full-dataset mode is used.
200    #[arg(long = "sampling-threshold", value_name = "N")]
201    pub sampling_threshold: Option<usize>,
202
203    /// Use Polars streaming engine for LazyFrame collect when available (default: true). Set to false to disable.
204    #[arg(long = "polars-streaming", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
205    pub polars_streaming: Option<bool>,
206
207    /// Apply workaround for Polars 0.52 pivot with Date/Datetime index (default: true). Set to false to test without it.
208    #[arg(long = "workaround-pivot-date-index", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
209    pub workaround_pivot_date_index: Option<bool>,
210
211    /// Number of pages to buffer ahead of the visible area (default: 3)
212    /// Larger values provide smoother scrolling but use more memory
213    #[arg(long = "pages-lookahead")]
214    pub pages_lookahead: Option<usize>,
215
216    /// Number of pages to buffer behind the visible area (default: 3)
217    /// Larger values provide smoother scrolling but use more memory
218    #[arg(long = "pages-lookback")]
219    pub pages_lookback: Option<usize>,
220
221    /// Display row numbers on the left side of the table
222    #[arg(long = "row-numbers", action)]
223    pub row_numbers: bool,
224
225    /// Starting index for row numbers (default: 1)
226    #[arg(long = "row-start-index")]
227    pub row_start_index: Option<usize>,
228
229    /// Colorize main table cells by column type (default: true). Set to false to disable.
230    #[arg(long = "column-colors", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
231    pub column_colors: Option<bool>,
232
233    /// Generate default configuration file at ~/.config/datui/config.toml
234    #[arg(long = "generate-config", action)]
235    pub generate_config: bool,
236
237    /// Force overwrite existing config file when using --generate-config
238    #[arg(long = "force", requires = "generate_config", action)]
239    pub force: bool,
240
241    /// S3-compatible endpoint URL (overrides config and AWS_ENDPOINT_URL). Example: http://localhost:9000
242    #[arg(long = "s3-endpoint-url", value_name = "URL")]
243    pub s3_endpoint_url: Option<String>,
244
245    /// S3 access key (overrides config and AWS_ACCESS_KEY_ID)
246    #[arg(long = "s3-access-key-id", value_name = "KEY")]
247    pub s3_access_key_id: Option<String>,
248
249    /// S3 secret key (overrides config and AWS_SECRET_ACCESS_KEY)
250    #[arg(long = "s3-secret-access-key", value_name = "SECRET")]
251    pub s3_secret_access_key: Option<String>,
252
253    /// S3 region (overrides config and AWS_REGION). Example: us-east-1
254    #[arg(long = "s3-region", value_name = "REGION")]
255    pub s3_region: Option<String>,
256}
257
/// Escape `|` and newlines for use in markdown table cells.
///
/// Each `|` becomes `\|` so it is not read as a column separator, and every line break
/// (`\n`, `\r`, or a `\r\n` pair counted once) becomes a single space so the cell stays on
/// one table row.
fn escape_table_cell(s: &str) -> String {
    let mut cell = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\r' => {
                // A CRLF pair is one line break: consume the `\n` so it yields one space.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                cell.push(' ');
            }
            '\n' => cell.push(' '),
            other => cell.push(other),
        }
    }
    cell
}
262
263/// Render command-line options as markdown.
264///
265/// Used by the gen_docs binary; output is written to stdout and then
266/// to `docs/reference/command-line-options.md` by the docs build process.
267pub fn render_options_markdown() -> String {
268    let mut cmd = Args::command();
269    cmd.build();
270
271    let mut out = String::from("# Command Line Options\n\n");
272
273    out.push_str("## Usage\n\n```\n");
274    let usage = cmd.render_usage();
275    out.push_str(&usage.to_string());
276    out.push_str("\n```\n\n");
277
278    out.push_str("## Options\n\n");
279    out.push_str("| Option | Description |\n");
280    out.push_str("|--------|-------------|\n");
281
282    for arg in cmd.get_arguments() {
283        let id = arg.get_id().as_ref().to_string();
284        if id == "help" || id == "version" {
285            continue;
286        }
287
288        let option_str = if arg.is_positional() {
289            let placeholder: String = arg
290                .get_value_names()
291                .map(|names| {
292                    names
293                        .iter()
294                        .map(|n: &clap::builder::Str| format!("<{}>", n.as_ref() as &str))
295                        .collect::<Vec<_>>()
296                        .join(" ")
297                })
298                .unwrap_or_default();
299            if arg.is_required_set() {
300                placeholder
301            } else {
302                format!("[{placeholder}]")
303            }
304        } else {
305            let mut parts = Vec::new();
306            if let Some(s) = arg.get_short() {
307                parts.push(format!("-{s}"));
308            }
309            if let Some(l) = arg.get_long() {
310                parts.push(format!("--{l}"));
311            }
312            let op = parts.join(", ");
313            let takes_val = arg.get_action().takes_values();
314            let placeholder: String = if takes_val {
315                arg.get_value_names()
316                    .map(|names| {
317                        names
318                            .iter()
319                            .map(|n: &clap::builder::Str| format!("<{}>", n.as_ref() as &str))
320                            .collect::<Vec<_>>()
321                            .join(" ")
322                    })
323                    .unwrap_or_default()
324            } else {
325                String::new()
326            };
327            if placeholder.is_empty() {
328                op
329            } else {
330                format!("{op} {placeholder}")
331            }
332        };
333
334        let help = arg
335            .get_help()
336            .map(|h| escape_table_cell(&h.to_string()))
337            .unwrap_or_else(|| "-".to_string());
338
339        out.push_str(&format!("| `{option_str}` | {help} |\n"));
340    }
341
342    out
343}
344
/// Unit tests for extension-based detection and the markdown rendering helpers.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compression_detection() {
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv.gz")),
            Some(CompressionFormat::Gzip)
        );
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv.zst")),
            Some(CompressionFormat::Zstd)
        );
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv.bz2")),
            Some(CompressionFormat::Bzip2)
        );
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv.xz")),
            Some(CompressionFormat::Xz)
        );
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv")),
            None
        );
        assert_eq!(CompressionFormat::from_extension(Path::new("file")), None);
    }

    #[test]
    fn test_compression_detection_aliases_and_case() {
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv.zstd")),
            Some(CompressionFormat::Zstd)
        );
        assert_eq!(
            CompressionFormat::from_extension(Path::new("file.csv.bz")),
            Some(CompressionFormat::Bzip2)
        );
        assert_eq!(
            CompressionFormat::from_extension(Path::new("FILE.CSV.GZ")),
            Some(CompressionFormat::Gzip)
        );
    }

    #[test]
    fn test_compression_extension() {
        assert_eq!(CompressionFormat::Gzip.extension(), "gz");
        assert_eq!(CompressionFormat::Zstd.extension(), "zst");
        assert_eq!(CompressionFormat::Bzip2.extension(), "bz2");
        assert_eq!(CompressionFormat::Xz.extension(), "xz");
    }

    #[test]
    fn test_compression_extension_round_trip() {
        // The canonical extension of every format must be detected as that same format.
        for format in [
            CompressionFormat::Gzip,
            CompressionFormat::Zstd,
            CompressionFormat::Bzip2,
            CompressionFormat::Xz,
        ] {
            let name = format!("data.csv.{}", format.extension());
            assert_eq!(
                CompressionFormat::from_extension(Path::new(&name)),
                Some(format)
            );
        }
    }

    #[test]
    fn test_file_format_from_path() {
        assert_eq!(
            FileFormat::from_path(Path::new("data.parquet")),
            Some(FileFormat::Parquet)
        );
        assert_eq!(
            FileFormat::from_path(Path::new("data.csv")),
            Some(FileFormat::Csv)
        );
        assert_eq!(
            FileFormat::from_path(Path::new("file.jsonl")),
            Some(FileFormat::Jsonl)
        );
        assert_eq!(FileFormat::from_path(Path::new("noext")), None);
        assert_eq!(
            FileFormat::from_path(Path::new("file.NDJSON")),
            Some(FileFormat::Jsonl)
        );
        // Only the final extension is inspected, so a compression suffix hides the format.
        assert_eq!(FileFormat::from_path(Path::new("data.csv.gz")), None);
    }

    #[test]
    fn test_file_format_from_extension() {
        assert_eq!(FileFormat::from_extension("tsv"), Some(FileFormat::Tsv));
        assert_eq!(FileFormat::from_extension("psv"), Some(FileFormat::Psv));
        assert_eq!(FileFormat::from_extension("json"), Some(FileFormat::Json));
        assert_eq!(FileFormat::from_extension("avro"), Some(FileFormat::Avro));
        assert_eq!(FileFormat::from_extension("orc"), Some(FileFormat::Orc));
        for ext in ["arrow", "ipc", "feather"] {
            assert_eq!(FileFormat::from_extension(ext), Some(FileFormat::Arrow));
        }
        for ext in ["xls", "xlsx", "xlsm", "XLSB"] {
            assert_eq!(FileFormat::from_extension(ext), Some(FileFormat::Excel));
        }
        assert_eq!(FileFormat::from_extension(""), None);
        assert_eq!(FileFormat::from_extension("txt"), None);
    }

    #[test]
    fn test_escape_table_cell() {
        assert_eq!(escape_table_cell("plain"), "plain");
        assert_eq!(escape_table_cell("a|b"), "a\\|b");
        assert_eq!(escape_table_cell("one\ntwo"), "one two");
        assert_eq!(escape_table_cell("one\rtwo"), "one two");
    }

    #[test]
    fn test_render_options_markdown_structure() {
        let md = render_options_markdown();
        assert!(md.starts_with("# Command Line Options\n\n"));
        assert!(md.contains("## Usage\n\n```\n"));
        assert!(md.contains("| Option | Description |\n|--------|-------------|\n"));
        // A switch renders without a placeholder; a valued option starts with its long name.
        assert!(md.contains("| `--debug` | "));
        assert!(md.contains("| `--format"));
    }
}