Skip to main content

datui_cli/
lib.rs

1//! Shared CLI definitions for datui.
2//!
3//! Used by the main application and by the build script (manpage) and
4//! gen_docs binary (command-line-options markdown).
5
6use clap::{CommandFactory, Parser, ValueEnum};
7use std::path::Path;
8
/// File format for data files (used to bypass extension-based detection).
/// When `--format` is not specified, format is auto-detected from the file extension.
///
/// Extension-based detection is implemented by [`FileFormat::from_path`] and
/// [`FileFormat::from_extension`]; each variant below lists what it stands for.
// `ValueEnum` lets this type back the `value_enum` argument `Args::format`.
// NOTE(review): clap's `ValueEnum` derive presumably surfaces each variant's `///`
// line as the help text for that value, so treat those lines as user-facing
// strings — confirm before rewording them.
#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq)]
pub enum FileFormat {
    /// Parquet columnar format
    Parquet,
    /// Comma-separated values
    Csv,
    /// Tab-separated values
    Tsv,
    /// Pipe-separated values
    Psv,
    /// JSON array format
    Json,
    /// JSON Lines / NDJSON (one JSON object per line)
    Jsonl,
    /// Arrow IPC / Feather
    Arrow,
    /// Avro row format
    Avro,
    /// ORC columnar format
    Orc,
    /// Excel (.xls, .xlsx, .xlsm, .xlsb)
    Excel,
}
34
35impl FileFormat {
36    /// Detect file format from path extension. Returns None when extension is missing or unknown.
37    pub fn from_path(path: &Path) -> Option<Self> {
38        path.extension()
39            .and_then(|e| e.to_str())
40            .and_then(Self::from_extension)
41    }
42
43    /// Parse format from extension string (e.g. "parquet", "csv").
44    pub fn from_extension(ext: &str) -> Option<Self> {
45        match ext.to_lowercase().as_str() {
46            "parquet" => Some(Self::Parquet),
47            "csv" => Some(Self::Csv),
48            "tsv" => Some(Self::Tsv),
49            "psv" => Some(Self::Psv),
50            "json" => Some(Self::Json),
51            "jsonl" | "ndjson" => Some(Self::Jsonl),
52            "arrow" | "ipc" | "feather" => Some(Self::Arrow),
53            "avro" => Some(Self::Avro),
54            "orc" => Some(Self::Orc),
55            "xls" | "xlsx" | "xlsm" | "xlsb" => Some(Self::Excel),
56            _ => None,
57        }
58    }
59}
60
/// Compression format for data files
///
/// [`CompressionFormat::from_extension`] detects a format from a path, and
/// [`CompressionFormat::extension`] returns the suffix used for each variant.
// `ValueEnum` lets this type back the `value_enum` argument `Args::compression`.
// NOTE(review): clap's `ValueEnum` derive presumably surfaces each variant's `///`
// line as the help text for that value, so treat those lines as user-facing
// strings — confirm before rewording them.
#[derive(Debug, Clone, Copy, ValueEnum, PartialEq, Eq)]
pub enum CompressionFormat {
    /// Gzip compression (.gz) - Most common, good balance of speed and compression
    Gzip,
    /// Zstandard compression (.zst) - Modern, fast compression with good ratios
    Zstd,
    /// Bzip2 compression (.bz2) - Good compression ratio, slower than gzip
    Bzip2,
    /// XZ compression (.xz) - Excellent compression ratio, slower than bzip2
    Xz,
}
73
74impl CompressionFormat {
75    /// Detect compression format from file extension
76    pub fn from_extension(path: &Path) -> Option<Self> {
77        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
78            match ext.to_lowercase().as_str() {
79                "gz" => Some(Self::Gzip),
80                "zst" | "zstd" => Some(Self::Zstd),
81                "bz2" | "bz" => Some(Self::Bzip2),
82                "xz" => Some(Self::Xz),
83                _ => None,
84            }
85        } else {
86            None
87        }
88    }
89
90    /// Get file extension for this compression format
91    pub fn extension(&self) -> &'static str {
92        match self {
93            Self::Gzip => "gz",
94            Self::Zstd => "zst",
95            Self::Bzip2 => "bz2",
96            Self::Xz => "xz",
97        }
98    }
99}
100
/// Command-line arguments for datui
// Parsed through clap's derive API. The `///` comment on each field is the help
// text clap shows for that option (and what `render_options_markdown` reads back
// via `get_help`), so reword those lines only when the user-facing help should change.
// Most tunables are `Option<_>`, so an omitted flag stays `None`; the help texts
// mention config overrides, so presumably the caller falls back to config/defaults
// in that case — verify against the caller.
#[derive(Parser, Debug)]
#[command(
    name = "datui",
    version,
    about = "Data Exploration in the Terminal",
    long_about = include_str!("../long_about.txt")
)]
pub struct Args {
    // Positional argument; it may be omitted only when one of the exit-style flags
    // named in `required_unless_present_any` is given.
    /// Path(s) to the data file(s) to open.
    /// Multiple files of the same format are concatenated into one table (not required with --generate-config, --clear-cache, or --remove-templates)
    #[arg(required_unless_present_any = ["generate_config", "clear_cache", "remove_templates"], num_args = 1.., value_name = "PATH")]
    pub paths: Vec<std::path::PathBuf>,

    // NOTE(review): how "lines" here differs from "rows" in --skip-rows is not
    // visible in this file — presumably raw text lines before the header vs. data
    // rows after it; confirm against the reader.
    /// Skip this many lines when reading a file
    #[arg(long = "skip-lines")]
    pub skip_lines: Option<usize>,

    /// Skip this many rows when reading a file
    #[arg(long = "skip-rows")]
    pub skip_rows: Option<usize>,

    // NOTE(review): typed `Option<bool>` rather than `bool`, so this presumably
    // expects an explicit value (`--no-header true`) instead of acting as a bare
    // switch — confirm this is intended.
    /// Specify that the file has no header
    #[arg(long = "no-header")]
    pub no_header: Option<bool>,

    // NOTE(review): typed `u8`, so clap presumably parses a numeric byte value
    // (e.g. 44 for ',') rather than the literal character — confirm.
    /// Specify the delimiter to use when reading a delimited text file
    #[arg(long = "delimiter")]
    pub delimiter: Option<u8>,

    // Repeatable: each occurrence appends one raw string; splitting COL=VAL is left
    // to whoever consumes this vector.
    /// Treat these values as null when reading CSV. Use once per value; no "=" means all columns, COL=VAL means column COL only (first "=" separates column from value). Example: --null-value NA --null-value amount=
    #[arg(long = "null-value", value_name = "VAL")]
    pub null_value: Vec<String>,

    /// Specify the compression format explicitly (gzip, zstd, bzip2, xz)
    /// If not specified, compression is auto-detected from file extension.
    #[arg(long = "compression", value_enum)]
    pub compression: Option<CompressionFormat>,

    /// Force file format (parquet, csv, tsv, psv, json, jsonl, arrow, avro, orc, excel).
    /// By default format is auto-detected from the file extension. Use this for URLs or paths without an extension.
    #[arg(long = "format", value_enum)]
    pub format: Option<FileFormat>,

    /// Enable debug mode to show operational information
    #[arg(long = "debug", action)]
    pub debug: bool,

    /// Enable Hive-style partitioning for directory or glob paths; ignored for a single file
    #[arg(long = "hive", action)]
    pub hive: bool,

    /// Infer Hive/partitioned Parquet schema from one file for faster load (default: true). Set to false to use full schema scan.
    #[arg(long = "single-spine-schema", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
    pub single_spine_schema: Option<bool>,

    /// Try to parse CSV string columns as dates (e.g. YYYY-MM-DD, ISO datetime). Default: true
    #[arg(long = "parse-dates", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
    pub parse_dates: Option<bool>,

    // `num_args = 0..=1` plus `default_missing_value = "true"`: a bare
    // `--decompress-in-memory` is read as true, and an explicit value may follow.
    /// Decompress into memory. Default: decompress to temp file and use lazy scan
    #[arg(long = "decompress-in-memory", default_missing_value = "true", num_args = 0..=1, value_parser = clap::value_parser!(bool))]
    pub decompress_in_memory: Option<bool>,

    /// Directory for decompression temp files (default: system temp, e.g. TMPDIR)
    #[arg(long = "temp-dir", value_name = "DIR")]
    pub temp_dir: Option<std::path::PathBuf>,

    // The field is named `excel_sheet` but the flag is `--sheet`; the value is kept
    // as a string so it can carry either an index or a name.
    /// Excel sheet to load: 0-based index (e.g. 0) or sheet name (e.g. "Sales")
    #[arg(long = "sheet", value_name = "SHEET")]
    pub excel_sheet: Option<String>,

    /// Clear all cache data and exit
    #[arg(long = "clear-cache", action)]
    pub clear_cache: bool,

    /// Apply a template by name when starting the application
    #[arg(long = "template")]
    pub template: Option<String>,

    /// Remove all templates and exit
    #[arg(long = "remove-templates", action)]
    pub remove_templates: bool,

    /// When set, datasets with this many or more rows are sampled for analysis (faster, less memory).
    /// Overrides config [performance] sampling_threshold. Use 0 to disable sampling (full dataset) for this run.
    /// When omitted, config or full-dataset mode is used.
    #[arg(long = "sampling-threshold", value_name = "N")]
    pub sampling_threshold: Option<usize>,

    /// Use Polars streaming engine for LazyFrame collect when available (default: true). Set to false to disable.
    #[arg(long = "polars-streaming", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
    pub polars_streaming: Option<bool>,

    /// Apply workaround for Polars 0.52 pivot with Date/Datetime index (default: true). Set to false to test without it.
    #[arg(long = "workaround-pivot-date-index", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
    pub workaround_pivot_date_index: Option<bool>,

    /// Number of pages to buffer ahead of the visible area (default: 3)
    /// Larger values provide smoother scrolling but use more memory
    #[arg(long = "pages-lookahead")]
    pub pages_lookahead: Option<usize>,

    /// Number of pages to buffer behind the visible area (default: 3)
    /// Larger values provide smoother scrolling but use more memory
    #[arg(long = "pages-lookback")]
    pub pages_lookback: Option<usize>,

    /// Display row numbers on the left side of the table
    #[arg(long = "row-numbers", action)]
    pub row_numbers: bool,

    /// Starting index for row numbers (default: 1)
    #[arg(long = "row-start-index")]
    pub row_start_index: Option<usize>,

    /// Colorize main table cells by column type (default: true). Set to false to disable.
    #[arg(long = "column-colors", value_name = "BOOL", value_parser = clap::value_parser!(bool))]
    pub column_colors: Option<bool>,

    /// Generate default configuration file at ~/.config/datui/config.toml
    #[arg(long = "generate-config", action)]
    pub generate_config: bool,

    // Only accepted together with --generate-config (`requires`).
    /// Force overwrite existing config file when using --generate-config
    #[arg(long = "force", requires = "generate_config", action)]
    pub force: bool,

    /// S3-compatible endpoint URL (overrides config and AWS_ENDPOINT_URL). Example: http://localhost:9000
    #[arg(long = "s3-endpoint-url", value_name = "URL")]
    pub s3_endpoint_url: Option<String>,

    /// S3 access key (overrides config and AWS_ACCESS_KEY_ID)
    #[arg(long = "s3-access-key-id", value_name = "KEY")]
    pub s3_access_key_id: Option<String>,

    /// S3 secret key (overrides config and AWS_SECRET_ACCESS_KEY)
    #[arg(long = "s3-secret-access-key", value_name = "SECRET")]
    pub s3_secret_access_key: Option<String>,

    /// S3 region (overrides config and AWS_REGION). Example: us-east-1
    #[arg(long = "s3-region", value_name = "REGION")]
    pub s3_region: Option<String>,
}
245
/// Make `s` safe to place inside a markdown table cell.
///
/// Every `|` becomes `\|`, and every `\n` or `\r` becomes a single space (so a
/// `\r\n` pair yields two spaces). All other characters are copied unchanged.
fn escape_table_cell(s: &str) -> String {
    let mut cell = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '|' => cell.push_str("\\|"),
            '\n' | '\r' => cell.push(' '),
            other => cell.push(other),
        }
    }
    cell
}
250
251/// Render command-line options as markdown.
252///
253/// Used by the gen_docs binary; output is written to stdout and then
254/// to `docs/reference/command-line-options.md` by the docs build process.
255pub fn render_options_markdown() -> String {
256    let mut cmd = Args::command();
257    cmd.build();
258
259    let mut out = String::from("# Command Line Options\n\n");
260
261    out.push_str("## Usage\n\n```\n");
262    let usage = cmd.render_usage();
263    out.push_str(&usage.to_string());
264    out.push_str("\n```\n\n");
265
266    out.push_str("## Options\n\n");
267    out.push_str("| Option | Description |\n");
268    out.push_str("|--------|-------------|\n");
269
270    for arg in cmd.get_arguments() {
271        let id = arg.get_id().as_ref().to_string();
272        if id == "help" || id == "version" {
273            continue;
274        }
275
276        let option_str = if arg.is_positional() {
277            let placeholder: String = arg
278                .get_value_names()
279                .map(|names| {
280                    names
281                        .iter()
282                        .map(|n: &clap::builder::Str| format!("<{}>", n.as_ref() as &str))
283                        .collect::<Vec<_>>()
284                        .join(" ")
285                })
286                .unwrap_or_default();
287            if arg.is_required_set() {
288                placeholder
289            } else {
290                format!("[{placeholder}]")
291            }
292        } else {
293            let mut parts = Vec::new();
294            if let Some(s) = arg.get_short() {
295                parts.push(format!("-{s}"));
296            }
297            if let Some(l) = arg.get_long() {
298                parts.push(format!("--{l}"));
299            }
300            let op = parts.join(", ");
301            let takes_val = arg.get_action().takes_values();
302            let placeholder: String = if takes_val {
303                arg.get_value_names()
304                    .map(|names| {
305                        names
306                            .iter()
307                            .map(|n: &clap::builder::Str| format!("<{}>", n.as_ref() as &str))
308                            .collect::<Vec<_>>()
309                            .join(" ")
310                    })
311                    .unwrap_or_default()
312            } else {
313                String::new()
314            };
315            if placeholder.is_empty() {
316                op
317            } else {
318                format!("{op} {placeholder}")
319            }
320        };
321
322        let help = arg
323            .get_help()
324            .map(|h| escape_table_cell(&h.to_string()))
325            .unwrap_or_else(|| "-".to_string());
326
327        out.push_str(&format!("| `{option_str}` | {help} |\n"));
328    }
329
330    out
331}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Compression is detected from the final extension; paths without a
    /// compression suffix (or without any extension) yield `None`.
    #[test]
    fn test_compression_detection() {
        let cases = [
            ("file.csv.gz", Some(CompressionFormat::Gzip)),
            ("file.csv.zst", Some(CompressionFormat::Zstd)),
            ("file.csv.bz2", Some(CompressionFormat::Bzip2)),
            ("file.csv.xz", Some(CompressionFormat::Xz)),
            ("file.csv", None),
            ("file", None),
        ];
        for (path, expected) in cases {
            assert_eq!(
                CompressionFormat::from_extension(Path::new(path)),
                expected,
                "compression detected for {path:?}"
            );
        }
    }

    /// Each compression format reports its expected extension.
    #[test]
    fn test_compression_extension() {
        let cases = [
            (CompressionFormat::Gzip, "gz"),
            (CompressionFormat::Zstd, "zst"),
            (CompressionFormat::Bzip2, "bz2"),
            (CompressionFormat::Xz, "xz"),
        ];
        for (format, expected) in cases {
            assert_eq!(format.extension(), expected, "extension for {format:?}");
        }
    }

    /// File format detection covers known extensions, a missing extension,
    /// and an upper-case extension.
    #[test]
    fn test_file_format_from_path() {
        let cases = [
            ("data.parquet", Some(FileFormat::Parquet)),
            ("data.csv", Some(FileFormat::Csv)),
            ("file.jsonl", Some(FileFormat::Jsonl)),
            ("noext", None),
            ("file.NDJSON", Some(FileFormat::Jsonl)),
        ];
        for (path, expected) in cases {
            assert_eq!(
                FileFormat::from_path(Path::new(path)),
                expected,
                "format detected for {path:?}"
            );
        }
    }
}