// alimentar/cli/mod.rs
1//! alimentar CLI - Data Loading, Distribution and Tooling
2//!
3//! Command-line interface for alimentar operations.
4
5use std::{path::PathBuf, process::ExitCode};
6
7use clap::{Parser, Subcommand};
8
9mod basic;
10mod drift;
11mod fed;
12mod hub;
13mod quality;
14mod registry;
15mod view;
16
17// Re-export subcommand enums
18pub use drift::DriftCommands;
19pub use fed::FedCommands;
20#[cfg(feature = "hf-hub")]
21pub use hub::HubCommands;
22pub use hub::ImportSource;
23pub use quality::QualityCommands;
24pub use registry::RegistryCommands;
25
/// alimentar - Data Loading, Distribution and Tooling in Pure Rust
// NOTE: the doc comment above doubles as clap's `about` text (the
// `#[command(... about ...)]` attribute pulls it in), so it is user-facing.
#[derive(Parser)]
#[command(name = "alimentar")]
#[command(author, version, about, long_about = None)]
struct Cli {
    // A subcommand is always required: the field is `Commands`, not
    // `Option<Commands>`, so clap rejects an invocation without one.
    #[command(subcommand)]
    command: Commands,
}
34
// Top-level subcommands dispatched by `run()`. The `///` doc comments here
// are user-facing: clap renders them verbatim as `--help` text. Variants
// behind `#[cfg(feature = ...)]` only exist when that crate feature is
// enabled, and their match arms in `run()` are gated the same way.
#[derive(Subcommand)]
enum Commands {
    /// Convert between data formats
    Convert {
        /// Input file path
        input: PathBuf,
        /// Output file path
        output: PathBuf,
    },
    /// Display dataset information
    Info {
        /// Path to dataset file
        path: PathBuf,
    },
    /// Display first N rows of a dataset
    Head {
        /// Path to dataset file
        path: PathBuf,
        /// Number of rows to display
        #[arg(short = 'n', long, default_value = "10")]
        rows: usize,
    },
    /// Display dataset schema
    Schema {
        /// Path to dataset file
        path: PathBuf,
    },
    /// Mix multiple datasets with weighted sampling
    Mix {
        /// Input files with optional weights (file:weight, e.g.,
        /// "data.parquet:0.8")
        #[arg(required = true)]
        inputs: Vec<String>,
        /// Output file path
        #[arg(short, long)]
        output: PathBuf,
        /// Random seed for reproducibility
        #[arg(short, long, default_value = "42")]
        seed: u64,
        /// Maximum total rows in output (0 = sum of all weighted inputs)
        #[arg(short = 'n', long, default_value = "0")]
        max_rows: usize,
    },
    /// Apply Fill-in-the-Middle (FIM) transform for code model training
    #[cfg(feature = "shuffle")]
    Fim {
        /// Input dataset file (Parquet/CSV/JSON)
        input: PathBuf,
        /// Output file path
        #[arg(short, long)]
        output: PathBuf,
        /// Column containing code text
        #[arg(long, default_value = "text")]
        column: String,
        /// FIM application rate (0.0-1.0)
        #[arg(long, default_value = "0.5")]
        rate: f64,
        /// FIM format: psm or spm
        #[arg(long, default_value = "psm")]
        format: String,
        /// Random seed for reproducibility
        #[arg(long, default_value = "42")]
        seed: u64,
    },
    /// Deduplicate dataset by text content (R-019)
    Dedup {
        /// Input dataset file
        input: PathBuf,
        /// Output file path
        #[arg(short, long)]
        output: PathBuf,
        /// Column to dedup on (auto-detected if not specified)
        #[arg(long)]
        column: Option<String>,
    },
    /// Filter dataset by text quality signals (R-022)
    #[command(name = "filter-text")]
    FilterText {
        /// Input dataset file
        input: PathBuf,
        /// Output file path
        #[arg(short, long)]
        output: PathBuf,
        /// Column containing text (auto-detected if not specified)
        #[arg(long)]
        column: Option<String>,
        /// Minimum composite quality score (0.0-1.0)
        #[arg(long, default_value = "0.4")]
        min_score: f64,
        /// Minimum document length in characters
        #[arg(long, default_value = "50")]
        min_length: usize,
        /// Maximum document length in characters
        #[arg(long, default_value = "1000000")]
        max_length: usize,
    },
    /// Interactive TUI viewer for datasets
    View {
        /// Path to dataset file (Parquet/Arrow/CSV/JSON)
        path: PathBuf,
        /// Initial search query
        #[arg(long)]
        search: Option<String>,
    },
    /// Import dataset from local files or HuggingFace Hub
    Import {
        // Nested subcommand: `import local ...` / `import hf ...` (the
        // `hf` source is itself gated on the "hf-hub" feature).
        #[command(subcommand)]
        source: ImportSource,
    },
    /// HuggingFace Hub commands (push/upload datasets)
    #[allow(clippy::doc_markdown)]
    #[cfg(feature = "hf-hub")]
    #[command(subcommand)]
    Hub(HubCommands),
    /// Registry commands for dataset sharing and discovery
    #[command(subcommand)]
    Registry(RegistryCommands),
    /// Data drift detection commands
    #[command(subcommand)]
    Drift(DriftCommands),
    /// Data quality checking commands
    #[command(subcommand)]
    Quality(QualityCommands),
    /// Federated split coordination commands
    #[command(subcommand)]
    Fed(FedCommands),
    /// Python doctest extraction commands
    #[cfg(feature = "doctest")]
    #[command(subcommand)]
    Doctest(DoctestCommands),
    /// Interactive REPL for data exploration
    #[cfg(feature = "repl")]
    Repl,
}
169
/// Python doctest extraction commands
// Compiled only with the "doctest" feature; `run()` dispatches these to
// `cmd_doctest_extract` / `cmd_doctest_merge` at the bottom of this file.
#[cfg(feature = "doctest")]
#[derive(Subcommand)]
pub enum DoctestCommands {
    /// Extract doctests from Python source files
    Extract {
        /// Input directory containing Python source files
        input: PathBuf,
        /// Output parquet file
        #[arg(short, long)]
        output: PathBuf,
        /// Source identifier (e.g., "cpython", "numpy")
        #[arg(short, long, default_value = "unknown")]
        source: String,
        /// Version string or git SHA
        #[arg(short, long, default_value = "unknown")]
        version: String,
    },
    /// Merge multiple doctest corpora into one
    Merge {
        /// Input parquet files to merge
        #[arg(required = true)]
        inputs: Vec<PathBuf>,
        /// Output parquet file
        #[arg(short, long)]
        output: PathBuf,
    },
}
198
199#[allow(clippy::too_many_lines)]
200/// Run the alimentar CLI.
201pub fn run() -> ExitCode {
202    let cli = Cli::parse();
203
204    let result = match cli.command {
205        Commands::Convert { input, output } => basic::cmd_convert(&input, &output),
206        Commands::Info { path } => basic::cmd_info(&path),
207        Commands::Head { path, rows } => basic::cmd_head(&path, rows),
208        Commands::Schema { path } => basic::cmd_schema(&path),
209        Commands::Mix {
210            inputs,
211            output,
212            seed,
213            max_rows,
214        } => basic::cmd_mix(&inputs, &output, seed, max_rows),
215        #[cfg(feature = "shuffle")]
216        Commands::Fim {
217            input,
218            output,
219            column,
220            rate,
221            format,
222            seed,
223        } => basic::cmd_fim(&input, &output, &column, rate, &format, seed),
224        Commands::Dedup {
225            input,
226            output,
227            column,
228        } => basic::cmd_dedup(&input, &output, column.as_deref()),
229        Commands::FilterText {
230            input,
231            output,
232            column,
233            min_score,
234            min_length,
235            max_length,
236        } => basic::cmd_filter_text(
237            &input,
238            &output,
239            column.as_deref(),
240            min_score,
241            min_length,
242            max_length,
243        ),
244        Commands::View { path, search } => view::cmd_view(&path, search.as_deref()),
245        Commands::Import { source } => match source {
246            ImportSource::Local {
247                input,
248                output,
249                format,
250            } => hub::cmd_import_local(&input, &output, format.as_deref()),
251            #[cfg(feature = "hf-hub")]
252            ImportSource::Hf {
253                repo_id,
254                output,
255                revision,
256                subset,
257                split,
258            } => hub::cmd_import_hf(&repo_id, &output, &revision, subset.as_deref(), &split),
259        },
260        #[cfg(feature = "hf-hub")]
261        Commands::Hub(hub_cmd) => match hub_cmd {
262            HubCommands::Push {
263                input,
264                repo_id,
265                path_in_repo,
266                message,
267                readme,
268                private,
269            } => hub::cmd_hub_push(
270                &input,
271                &repo_id,
272                path_in_repo.as_deref(),
273                &message,
274                readme.as_ref(),
275                private,
276            ),
277        },
278        Commands::Registry(registry_cmd) => dispatch_registry(registry_cmd),
279        Commands::Drift(drift_cmd) => dispatch_drift(drift_cmd),
280        Commands::Quality(quality_cmd) => dispatch_quality(quality_cmd),
281        Commands::Fed(fed_cmd) => dispatch_fed(fed_cmd),
282        #[cfg(feature = "doctest")]
283        Commands::Doctest(doctest_cmd) => match doctest_cmd {
284            DoctestCommands::Extract {
285                input,
286                output,
287                source,
288                version,
289            } => cmd_doctest_extract(&input, &output, &source, &version),
290            DoctestCommands::Merge { inputs, output } => cmd_doctest_merge(&inputs, &output),
291        },
292        #[cfg(feature = "repl")]
293        Commands::Repl => crate::repl::run(),
294    };
295
296    match result {
297        Ok(()) => ExitCode::SUCCESS,
298        Err(e) => {
299            eprintln!("Error: {}", e);
300            ExitCode::FAILURE
301        }
302    }
303}
304
305fn dispatch_registry(cmd: RegistryCommands) -> crate::error::Result<()> {
306    match cmd {
307        RegistryCommands::Init { path } => registry::cmd_registry_init(&path),
308        RegistryCommands::List { path } => registry::cmd_registry_list(&path),
309        RegistryCommands::Push {
310            input,
311            name,
312            version,
313            description,
314            license,
315            tags,
316            registry,
317        } => registry::cmd_registry_push(
318            &input,
319            &name,
320            &version,
321            &description,
322            &license,
323            &tags,
324            &registry,
325        ),
326        RegistryCommands::Pull {
327            name,
328            output,
329            version,
330            registry,
331        } => registry::cmd_registry_pull(&name, &output, version.as_deref(), &registry),
332        RegistryCommands::Search { query, path } => registry::cmd_registry_search(&query, &path),
333        RegistryCommands::ShowInfo { name, path } => registry::cmd_registry_show_info(&name, &path),
334        RegistryCommands::Delete {
335            name,
336            version,
337            path,
338        } => registry::cmd_registry_delete(&name, &version, &path),
339    }
340}
341
342fn dispatch_drift(cmd: DriftCommands) -> crate::error::Result<()> {
343    match cmd {
344        DriftCommands::Detect {
345            reference,
346            current,
347            tests,
348            alpha,
349            format,
350        } => drift::cmd_drift_detect(&reference, &current, &tests, alpha, &format),
351        DriftCommands::Report {
352            reference,
353            current,
354            output,
355        } => drift::cmd_drift_report(&reference, &current, output.as_ref()),
356        DriftCommands::Sketch {
357            input,
358            output,
359            sketch_type,
360            source,
361            format,
362        } => drift::cmd_drift_sketch(&input, &output, &sketch_type, source.as_deref(), &format),
363        DriftCommands::Merge {
364            sketches,
365            output,
366            format,
367        } => drift::cmd_drift_merge(&sketches, &output, &format),
368        DriftCommands::Compare {
369            reference,
370            current,
371            threshold,
372            format,
373        } => drift::cmd_drift_compare(&reference, &current, threshold, &format),
374    }
375}
376
377fn dispatch_quality(cmd: QualityCommands) -> crate::error::Result<()> {
378    match cmd {
379        QualityCommands::Check {
380            path,
381            null_threshold,
382            duplicate_threshold,
383            detect_outliers,
384            format,
385        } => quality::cmd_quality_check(
386            &path,
387            null_threshold,
388            duplicate_threshold,
389            detect_outliers,
390            &format,
391        ),
392        QualityCommands::Report { path, output } => {
393            quality::cmd_quality_report(&path, output.as_deref())
394        }
395        QualityCommands::Score {
396            path,
397            profile,
398            suggest,
399            json,
400            badge,
401        } => quality::cmd_quality_score(&path, &profile, suggest, json, badge),
402        QualityCommands::Profiles => quality::cmd_quality_profiles(),
403    }
404}
405
406fn dispatch_fed(cmd: FedCommands) -> crate::error::Result<()> {
407    match cmd {
408        FedCommands::Manifest {
409            input,
410            output,
411            node_id,
412            train_ratio,
413            seed,
414            format,
415        } => fed::cmd_fed_manifest(&input, &output, &node_id, train_ratio, seed, &format),
416        FedCommands::Plan {
417            manifests,
418            output,
419            strategy,
420            train_ratio,
421            seed,
422            stratify_column,
423            format,
424        } => fed::cmd_fed_plan(
425            &manifests,
426            &output,
427            &strategy,
428            train_ratio,
429            seed,
430            stratify_column.as_deref(),
431            &format,
432        ),
433        FedCommands::Split {
434            input,
435            plan,
436            node_id,
437            train_output,
438            test_output,
439            validation_output,
440        } => fed::cmd_fed_split(
441            &input,
442            &plan,
443            &node_id,
444            &train_output,
445            &test_output,
446            validation_output.as_ref(),
447        ),
448        FedCommands::Verify { manifests, format } => fed::cmd_fed_verify(&manifests, &format),
449    }
450}
451
452// =============================================================================
453// Doctest Commands
454// =============================================================================
455
/// Extract doctests from the Python sources under `input` and write the
/// resulting corpus to a Parquet file at `output`. `source` and `version`
/// tag the provenance of every extracted entry.
#[cfg(feature = "doctest")]
fn cmd_doctest_extract(
    input: &std::path::Path,
    output: &std::path::Path,
    source: &str,
    version: &str,
) -> crate::Result<()> {
    use crate::DocTestParser;

    // The parser walks a directory tree; a single file is a config error.
    if !input.is_dir() {
        let msg = format!("Input path must be a directory: {}", input.display());
        return Err(crate::Error::invalid_config(msg));
    }

    let extracted = DocTestParser::new().parse_directory(input, source, version)?;
    let count = extracted.len();

    println!(
        "Extracted {} doctests from {} ({})",
        count,
        source,
        version
    );

    // An empty corpus is not an error: warn and skip writing any output.
    if extracted.is_empty() {
        println!("Warning: No doctests found in {}", input.display());
        return Ok(());
    }

    extracted.to_dataset()?.to_parquet(output)?;

    println!("Wrote {} to {}", count, output.display());
    Ok(())
}
493
/// Concatenate several doctest Parquet corpora into a single Parquet file at
/// `output`, preserving every record batch from every input.
#[cfg(feature = "doctest")]
fn cmd_doctest_merge(inputs: &[PathBuf], output: &std::path::Path) -> crate::Result<()> {
    use crate::{dataset::Dataset, ArrowDataset};

    if inputs.is_empty() {
        return Err(crate::Error::invalid_config("No input files provided"));
    }

    // Gather all record batches across inputs, tracking the row total for
    // the summary message printed at the end.
    let mut batches = Vec::new();
    let mut row_count = 0;
    for path in inputs {
        let ds = ArrowDataset::from_parquet(path)?;
        row_count += ds.len();
        batches.extend(ds.iter().cloned());
    }

    if batches.is_empty() {
        return Err(crate::Error::invalid_config("No data found in input files"));
    }

    // Build the merged dataset and persist it.
    ArrowDataset::new(batches)?.to_parquet(output)?;

    println!(
        "Merged {} doctests from {} files to {}",
        row_count,
        inputs.len(),
        output.display()
    );
    Ok(())
}
529}