1use std::{path::PathBuf, process::ExitCode};
6
7use clap::{Parser, Subcommand};
8
9mod basic;
10mod drift;
11mod fed;
12mod hub;
13mod quality;
14mod registry;
15mod view;
16
17pub use drift::DriftCommands;
19pub use fed::FedCommands;
20#[cfg(feature = "hf-hub")]
21pub use hub::HubCommands;
22pub use hub::ImportSource;
23pub use quality::QualityCommands;
24pub use registry::RegistryCommands;
25
26#[derive(Parser)]
28#[command(name = "alimentar")]
29#[command(author, version, about, long_about = None)]
30struct Cli {
31 #[command(subcommand)]
32 command: Commands,
33}
34
35#[derive(Subcommand)]
36enum Commands {
37 Convert {
39 input: PathBuf,
41 output: PathBuf,
43 },
44 Info {
46 path: PathBuf,
48 },
49 Head {
51 path: PathBuf,
53 #[arg(short = 'n', long, default_value = "10")]
55 rows: usize,
56 },
57 Schema {
59 path: PathBuf,
61 },
62 Mix {
64 #[arg(required = true)]
67 inputs: Vec<String>,
68 #[arg(short, long)]
70 output: PathBuf,
71 #[arg(short, long, default_value = "42")]
73 seed: u64,
74 #[arg(short = 'n', long, default_value = "0")]
76 max_rows: usize,
77 },
78 #[cfg(feature = "shuffle")]
80 Fim {
81 input: PathBuf,
83 #[arg(short, long)]
85 output: PathBuf,
86 #[arg(long, default_value = "text")]
88 column: String,
89 #[arg(long, default_value = "0.5")]
91 rate: f64,
92 #[arg(long, default_value = "psm")]
94 format: String,
95 #[arg(long, default_value = "42")]
97 seed: u64,
98 },
99 Dedup {
101 input: PathBuf,
103 #[arg(short, long)]
105 output: PathBuf,
106 #[arg(long)]
108 column: Option<String>,
109 },
110 #[command(name = "filter-text")]
112 FilterText {
113 input: PathBuf,
115 #[arg(short, long)]
117 output: PathBuf,
118 #[arg(long)]
120 column: Option<String>,
121 #[arg(long, default_value = "0.4")]
123 min_score: f64,
124 #[arg(long, default_value = "50")]
126 min_length: usize,
127 #[arg(long, default_value = "1000000")]
129 max_length: usize,
130 },
131 View {
133 path: PathBuf,
135 #[arg(long)]
137 search: Option<String>,
138 },
139 Import {
141 #[command(subcommand)]
142 source: ImportSource,
143 },
144 #[allow(clippy::doc_markdown)]
146 #[cfg(feature = "hf-hub")]
147 #[command(subcommand)]
148 Hub(HubCommands),
149 #[command(subcommand)]
151 Registry(RegistryCommands),
152 #[command(subcommand)]
154 Drift(DriftCommands),
155 #[command(subcommand)]
157 Quality(QualityCommands),
158 #[command(subcommand)]
160 Fed(FedCommands),
161 #[cfg(feature = "doctest")]
163 #[command(subcommand)]
164 Doctest(DoctestCommands),
165 #[cfg(feature = "repl")]
167 Repl,
168}
169
// Subcommands of the `doctest` group; only compiled with the `doctest`
// feature. `Extract` is handled by `cmd_doctest_extract` and `Merge` by
// `cmd_doctest_merge`.
//
// Plain `//` comments are used deliberately: clap's derive macros turn `///`
// doc comments into help text, so adding them would change `--help` output.
#[cfg(feature = "doctest")]
#[derive(Subcommand)]
pub enum DoctestCommands {
    Extract {
        // Positional. `cmd_doctest_extract` rejects it unless it is a directory.
        input: PathBuf,
        // `-o` / `--output`; no default, so it must be supplied.
        #[arg(short, long)]
        output: PathBuf,
        // `-s` / `--source`; defaults to "unknown".
        #[arg(short, long, default_value = "unknown")]
        source: String,
        // `-v` / `--version`; defaults to "unknown".
        #[arg(short, long, default_value = "unknown")]
        version: String,
    },
    Merge {
        // Positional; at least one path must be supplied.
        #[arg(required = true)]
        inputs: Vec<PathBuf>,
        // `-o` / `--output`; no default, so it must be supplied.
        #[arg(short, long)]
        output: PathBuf,
    },
}
198
199#[allow(clippy::too_many_lines)]
200pub fn run() -> ExitCode {
202 let cli = Cli::parse();
203
204 let result = match cli.command {
205 Commands::Convert { input, output } => basic::cmd_convert(&input, &output),
206 Commands::Info { path } => basic::cmd_info(&path),
207 Commands::Head { path, rows } => basic::cmd_head(&path, rows),
208 Commands::Schema { path } => basic::cmd_schema(&path),
209 Commands::Mix {
210 inputs,
211 output,
212 seed,
213 max_rows,
214 } => basic::cmd_mix(&inputs, &output, seed, max_rows),
215 #[cfg(feature = "shuffle")]
216 Commands::Fim {
217 input,
218 output,
219 column,
220 rate,
221 format,
222 seed,
223 } => basic::cmd_fim(&input, &output, &column, rate, &format, seed),
224 Commands::Dedup {
225 input,
226 output,
227 column,
228 } => basic::cmd_dedup(&input, &output, column.as_deref()),
229 Commands::FilterText {
230 input,
231 output,
232 column,
233 min_score,
234 min_length,
235 max_length,
236 } => basic::cmd_filter_text(
237 &input,
238 &output,
239 column.as_deref(),
240 min_score,
241 min_length,
242 max_length,
243 ),
244 Commands::View { path, search } => view::cmd_view(&path, search.as_deref()),
245 Commands::Import { source } => match source {
246 ImportSource::Local {
247 input,
248 output,
249 format,
250 } => hub::cmd_import_local(&input, &output, format.as_deref()),
251 #[cfg(feature = "hf-hub")]
252 ImportSource::Hf {
253 repo_id,
254 output,
255 revision,
256 subset,
257 split,
258 } => hub::cmd_import_hf(&repo_id, &output, &revision, subset.as_deref(), &split),
259 },
260 #[cfg(feature = "hf-hub")]
261 Commands::Hub(hub_cmd) => match hub_cmd {
262 HubCommands::Push {
263 input,
264 repo_id,
265 path_in_repo,
266 message,
267 readme,
268 private,
269 } => hub::cmd_hub_push(
270 &input,
271 &repo_id,
272 path_in_repo.as_deref(),
273 &message,
274 readme.as_ref(),
275 private,
276 ),
277 },
278 Commands::Registry(registry_cmd) => dispatch_registry(registry_cmd),
279 Commands::Drift(drift_cmd) => dispatch_drift(drift_cmd),
280 Commands::Quality(quality_cmd) => dispatch_quality(quality_cmd),
281 Commands::Fed(fed_cmd) => dispatch_fed(fed_cmd),
282 #[cfg(feature = "doctest")]
283 Commands::Doctest(doctest_cmd) => match doctest_cmd {
284 DoctestCommands::Extract {
285 input,
286 output,
287 source,
288 version,
289 } => cmd_doctest_extract(&input, &output, &source, &version),
290 DoctestCommands::Merge { inputs, output } => cmd_doctest_merge(&inputs, &output),
291 },
292 #[cfg(feature = "repl")]
293 Commands::Repl => crate::repl::run(),
294 };
295
296 match result {
297 Ok(()) => ExitCode::SUCCESS,
298 Err(e) => {
299 eprintln!("Error: {}", e);
300 ExitCode::FAILURE
301 }
302 }
303}
304
305fn dispatch_registry(cmd: RegistryCommands) -> crate::error::Result<()> {
306 match cmd {
307 RegistryCommands::Init { path } => registry::cmd_registry_init(&path),
308 RegistryCommands::List { path } => registry::cmd_registry_list(&path),
309 RegistryCommands::Push {
310 input,
311 name,
312 version,
313 description,
314 license,
315 tags,
316 registry,
317 } => registry::cmd_registry_push(
318 &input,
319 &name,
320 &version,
321 &description,
322 &license,
323 &tags,
324 ®istry,
325 ),
326 RegistryCommands::Pull {
327 name,
328 output,
329 version,
330 registry,
331 } => registry::cmd_registry_pull(&name, &output, version.as_deref(), ®istry),
332 RegistryCommands::Search { query, path } => registry::cmd_registry_search(&query, &path),
333 RegistryCommands::ShowInfo { name, path } => registry::cmd_registry_show_info(&name, &path),
334 RegistryCommands::Delete {
335 name,
336 version,
337 path,
338 } => registry::cmd_registry_delete(&name, &version, &path),
339 }
340}
341
342fn dispatch_drift(cmd: DriftCommands) -> crate::error::Result<()> {
343 match cmd {
344 DriftCommands::Detect {
345 reference,
346 current,
347 tests,
348 alpha,
349 format,
350 } => drift::cmd_drift_detect(&reference, ¤t, &tests, alpha, &format),
351 DriftCommands::Report {
352 reference,
353 current,
354 output,
355 } => drift::cmd_drift_report(&reference, ¤t, output.as_ref()),
356 DriftCommands::Sketch {
357 input,
358 output,
359 sketch_type,
360 source,
361 format,
362 } => drift::cmd_drift_sketch(&input, &output, &sketch_type, source.as_deref(), &format),
363 DriftCommands::Merge {
364 sketches,
365 output,
366 format,
367 } => drift::cmd_drift_merge(&sketches, &output, &format),
368 DriftCommands::Compare {
369 reference,
370 current,
371 threshold,
372 format,
373 } => drift::cmd_drift_compare(&reference, ¤t, threshold, &format),
374 }
375}
376
377fn dispatch_quality(cmd: QualityCommands) -> crate::error::Result<()> {
378 match cmd {
379 QualityCommands::Check {
380 path,
381 null_threshold,
382 duplicate_threshold,
383 detect_outliers,
384 format,
385 } => quality::cmd_quality_check(
386 &path,
387 null_threshold,
388 duplicate_threshold,
389 detect_outliers,
390 &format,
391 ),
392 QualityCommands::Report { path, output } => {
393 quality::cmd_quality_report(&path, output.as_deref())
394 }
395 QualityCommands::Score {
396 path,
397 profile,
398 suggest,
399 json,
400 badge,
401 } => quality::cmd_quality_score(&path, &profile, suggest, json, badge),
402 QualityCommands::Profiles => quality::cmd_quality_profiles(),
403 }
404}
405
406fn dispatch_fed(cmd: FedCommands) -> crate::error::Result<()> {
407 match cmd {
408 FedCommands::Manifest {
409 input,
410 output,
411 node_id,
412 train_ratio,
413 seed,
414 format,
415 } => fed::cmd_fed_manifest(&input, &output, &node_id, train_ratio, seed, &format),
416 FedCommands::Plan {
417 manifests,
418 output,
419 strategy,
420 train_ratio,
421 seed,
422 stratify_column,
423 format,
424 } => fed::cmd_fed_plan(
425 &manifests,
426 &output,
427 &strategy,
428 train_ratio,
429 seed,
430 stratify_column.as_deref(),
431 &format,
432 ),
433 FedCommands::Split {
434 input,
435 plan,
436 node_id,
437 train_output,
438 test_output,
439 validation_output,
440 } => fed::cmd_fed_split(
441 &input,
442 &plan,
443 &node_id,
444 &train_output,
445 &test_output,
446 validation_output.as_ref(),
447 ),
448 FedCommands::Verify { manifests, format } => fed::cmd_fed_verify(&manifests, &format),
449 }
450}
451
/// Extracts doctests from a directory and writes them to a Parquet file.
///
/// `input` must be a directory; it is handed to `DocTestParser::parse_directory`
/// together with the `source` and `version` labels. The number of extracted
/// doctests is printed to stdout. When nothing was found, a warning is printed
/// and the function returns `Ok(())` without writing `output`.
///
/// # Errors
///
/// Returns an `invalid_config` error when `input` is not a directory, and
/// propagates any error from parsing, dataset conversion or the Parquet write.
#[cfg(feature = "doctest")]
fn cmd_doctest_extract(
    input: &std::path::Path,
    output: &std::path::Path,
    source: &str,
    version: &str,
) -> crate::Result<()> {
    use crate::DocTestParser;

    if !input.is_dir() {
        let message = format!("Input path must be a directory: {}", input.display());
        return Err(crate::Error::invalid_config(message));
    }

    let doc_parser = DocTestParser::new();
    let corpus = doc_parser.parse_directory(input, source, version)?;
    let doctest_count = corpus.len();

    println!(
        "Extracted {} doctests from {} ({})",
        doctest_count, source, version
    );

    // Nothing to write: report it and finish successfully.
    if corpus.is_empty() {
        println!("Warning: No doctests found in {}", input.display());
        return Ok(());
    }

    corpus.to_dataset()?.to_parquet(output)?;

    println!("Wrote {} to {}", doctest_count, output.display());
    Ok(())
}
493
/// Merges several Parquet files into a single Parquet file.
///
/// Every file in `inputs` is loaded with `ArrowDataset::from_parquet`; its
/// batches are collected in input order and its length is added to a running
/// total. The collected batches are turned into one `ArrowDataset` and written
/// to `output`, then a summary line is printed to stdout.
///
/// # Errors
///
/// Returns an `invalid_config` error when `inputs` is empty or when the inputs
/// yield no batches, and propagates any error from reading, constructing the
/// merged dataset or writing it.
#[cfg(feature = "doctest")]
fn cmd_doctest_merge(inputs: &[PathBuf], output: &std::path::Path) -> crate::Result<()> {
    use crate::{dataset::Dataset, ArrowDataset};

    if inputs.is_empty() {
        return Err(crate::Error::invalid_config("No input files provided"));
    }

    let mut row_count = 0;
    let mut collected = Vec::new();

    for source_path in inputs {
        let part = ArrowDataset::from_parquet(source_path)?;
        row_count += part.len();
        for batch in part.iter() {
            collected.push(batch.clone());
        }
    }

    if collected.is_empty() {
        return Err(crate::Error::invalid_config("No data found in input files"));
    }

    ArrowDataset::new(collected)?.to_parquet(output)?;

    println!(
        "Merged {} doctests from {} files to {}",
        row_count,
        inputs.len(),
        output.display()
    );
    Ok(())
}