use std::{path::PathBuf, process::ExitCode};
use clap::{Parser, Subcommand};
mod basic;
mod drift;
mod fed;
mod hub;
mod quality;
mod registry;
mod view;
pub use drift::DriftCommands;
pub use fed::FedCommands;
#[cfg(feature = "hf-hub")]
pub use hub::HubCommands;
pub use hub::ImportSource;
pub use quality::QualityCommands;
pub use registry::RegistryCommands;
// Root command-line parser for the `alimentar` binary; `run` obtains it via `Cli::parse()`.
// `author`, `version` and `about` are given without values, so clap's derive fills them in
// (per the clap derive reference, from the crate's Cargo metadata). `long_about = None`
// explicitly leaves the long description unset.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///` doc comments into
// user-visible help text, so adding them here would change the program's output.
#[derive(Parser)]
#[command(name = "alimentar")]
#[command(author, version, about, long_about = None)]
struct Cli {
// The selected top-level subcommand; `run` matches on it to pick a handler.
#[command(subcommand)]
command: Commands,
}
// Top-level subcommands of the CLI. `run` matches on this enum and forwards every variant to
// its handler function.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///` doc comments into
// user-visible help text, so adding them here would change the program's output.
// Fields without an `#[arg(...)]` short/long flag are positional arguments.
#[derive(Subcommand)]
enum Commands {
// Two positional paths, forwarded to `basic::cmd_convert(&input, &output)`.
Convert {
input: PathBuf,
output: PathBuf,
},
// One positional path, forwarded to `basic::cmd_info`.
Info {
path: PathBuf,
},
// One positional path plus a row count, forwarded to `basic::cmd_head(&path, rows)`.
Head {
path: PathBuf,
// `-n` / `--rows`, defaults to 10.
#[arg(short = 'n', long, default_value = "10")]
rows: usize,
},
// One positional path, forwarded to `basic::cmd_schema`.
Schema {
path: PathBuf,
},
// Forwarded to `basic::cmd_mix(&inputs, &output, seed, max_rows)`.
Mix {
// At least one positional input is required. Inputs are kept as `String`s rather than
// paths; how each string is interpreted is decided inside `basic::cmd_mix`.
#[arg(required = true)]
inputs: Vec<String>,
// `-o` / `--output` (no default, so it must be supplied).
#[arg(short, long)]
output: PathBuf,
// `-s` / `--seed`, defaults to 42.
#[arg(short, long, default_value = "42")]
seed: u64,
// `-n` / `--max-rows`, defaults to 0.
// NOTE(review): 0 presumably means "no limit" — confirm in `basic::cmd_mix`.
#[arg(short = 'n', long, default_value = "0")]
max_rows: usize,
},
// Only compiled with the `shuffle` feature. Forwarded to
// `basic::cmd_fim(&input, &output, &column, rate, &format, seed)`.
// NOTE(review): "FIM" presumably stands for fill-in-the-middle — confirm in `basic::cmd_fim`.
#[cfg(feature = "shuffle")]
Fim {
input: PathBuf,
// `-o` / `--output`.
#[arg(short, long)]
output: PathBuf,
// `--column`, defaults to "text".
#[arg(long, default_value = "text")]
column: String,
// `--rate`, defaults to 0.5.
#[arg(long, default_value = "0.5")]
rate: f64,
// `--format`, defaults to "psm"; the accepted values are validated by the handler, not here.
#[arg(long, default_value = "psm")]
format: String,
// `--seed`, defaults to 42 (long flag only, unlike `Mix`).
#[arg(long, default_value = "42")]
seed: u64,
},
// One positional path and an optional `--search` string, forwarded to
// `view::cmd_view(&path, search.as_deref())`.
View {
path: PathBuf,
#[arg(long)]
search: Option<String>,
},
// Nested subcommand group; the concrete source is an `ImportSource` (re-exported from `hub`).
Import {
#[command(subcommand)]
source: ImportSource,
},
// Nested subcommand group, only compiled with the `hf-hub` feature.
// `clippy::doc_markdown` only inspects doc comments, so the allow below has no effect while
// this variant carries none — presumably a leftover; TODO confirm before removing.
#[allow(clippy::doc_markdown)]
#[cfg(feature = "hf-hub")]
#[command(subcommand)]
Hub(HubCommands),
// Nested subcommand group handled by the `registry` module.
#[command(subcommand)]
Registry(RegistryCommands),
// Nested subcommand group handled by the `drift` module.
#[command(subcommand)]
Drift(DriftCommands),
// Nested subcommand group handled by the `quality` module.
#[command(subcommand)]
Quality(QualityCommands),
// Nested subcommand group handled by the `fed` module.
#[command(subcommand)]
Fed(FedCommands),
// Nested subcommand group, only compiled with the `doctest` feature; handled by the private
// `cmd_doctest_*` functions in this file.
#[cfg(feature = "doctest")]
#[command(subcommand)]
Doctest(DoctestCommands),
// Only compiled with the `repl` feature; takes no arguments and calls `crate::repl::run()`.
#[cfg(feature = "repl")]
Repl,
}
// Subcommands of the `Doctest` group; the whole enum only exists with the `doctest` feature.
// NOTE: plain `//` comments are used on purpose. clap's derive turns `///` doc comments into
// user-visible help text, so adding them here would change the program's output.
#[cfg(feature = "doctest")]
#[derive(Subcommand)]
pub enum DoctestCommands {
// Forwarded to `cmd_doctest_extract(&input, &output, &source, &version)`.
Extract {
// Positional path; the handler rejects it unless it is a directory.
input: PathBuf,
// `-o` / `--output`: path the handler writes the Parquet file to.
#[arg(short, long)]
output: PathBuf,
// `-s` / `--source`, defaults to "unknown"; passed through to the parser as a label.
#[arg(short, long, default_value = "unknown")]
source: String,
// `-v` / `--version`, defaults to "unknown"; passed through to the parser as a label.
#[arg(short, long, default_value = "unknown")]
version: String,
},
// Forwarded to `cmd_doctest_merge(&inputs, &output)`.
Merge {
// One or more positional Parquet files to combine (clap enforces at least one).
#[arg(required = true)]
inputs: Vec<PathBuf>,
// `-o` / `--output`: path of the merged Parquet file.
#[arg(short, long)]
output: PathBuf,
},
}
#[allow(clippy::too_many_lines)]
pub fn run() -> ExitCode {
let cli = Cli::parse();
let result = match cli.command {
Commands::Convert { input, output } => basic::cmd_convert(&input, &output),
Commands::Info { path } => basic::cmd_info(&path),
Commands::Head { path, rows } => basic::cmd_head(&path, rows),
Commands::Schema { path } => basic::cmd_schema(&path),
Commands::Mix {
inputs,
output,
seed,
max_rows,
} => basic::cmd_mix(&inputs, &output, seed, max_rows),
#[cfg(feature = "shuffle")]
Commands::Fim {
input,
output,
column,
rate,
format,
seed,
} => basic::cmd_fim(&input, &output, &column, rate, &format, seed),
Commands::View { path, search } => view::cmd_view(&path, search.as_deref()),
Commands::Import { source } => match source {
ImportSource::Local {
input,
output,
format,
} => hub::cmd_import_local(&input, &output, format.as_deref()),
#[cfg(feature = "hf-hub")]
ImportSource::Hf {
repo_id,
output,
revision,
subset,
split,
} => hub::cmd_import_hf(&repo_id, &output, &revision, subset.as_deref(), &split),
},
#[cfg(feature = "hf-hub")]
Commands::Hub(hub_cmd) => match hub_cmd {
HubCommands::Push {
input,
repo_id,
path_in_repo,
message,
readme,
private,
} => hub::cmd_hub_push(
&input,
&repo_id,
path_in_repo.as_deref(),
&message,
readme.as_ref(),
private,
),
},
Commands::Registry(registry_cmd) => match registry_cmd {
RegistryCommands::Init { path } => registry::cmd_registry_init(&path),
RegistryCommands::List { path } => registry::cmd_registry_list(&path),
RegistryCommands::Push {
input,
name,
version,
description,
license,
tags,
registry,
} => registry::cmd_registry_push(
&input,
&name,
&version,
&description,
&license,
&tags,
®istry,
),
RegistryCommands::Pull {
name,
output,
version,
registry,
} => registry::cmd_registry_pull(&name, &output, version.as_deref(), ®istry),
RegistryCommands::Search { query, path } => {
registry::cmd_registry_search(&query, &path)
}
RegistryCommands::ShowInfo { name, path } => {
registry::cmd_registry_show_info(&name, &path)
}
RegistryCommands::Delete {
name,
version,
path,
} => registry::cmd_registry_delete(&name, &version, &path),
},
Commands::Drift(drift_cmd) => match drift_cmd {
DriftCommands::Detect {
reference,
current,
tests,
alpha,
format,
} => drift::cmd_drift_detect(&reference, ¤t, &tests, alpha, &format),
DriftCommands::Report {
reference,
current,
output,
} => drift::cmd_drift_report(&reference, ¤t, output.as_ref()),
DriftCommands::Sketch {
input,
output,
sketch_type,
source,
format,
} => drift::cmd_drift_sketch(&input, &output, &sketch_type, source.as_deref(), &format),
DriftCommands::Merge {
sketches,
output,
format,
} => drift::cmd_drift_merge(&sketches, &output, &format),
DriftCommands::Compare {
reference,
current,
threshold,
format,
} => drift::cmd_drift_compare(&reference, ¤t, threshold, &format),
},
Commands::Quality(quality_cmd) => match quality_cmd {
QualityCommands::Check {
path,
null_threshold,
duplicate_threshold,
detect_outliers,
format,
} => quality::cmd_quality_check(
&path,
null_threshold,
duplicate_threshold,
detect_outliers,
&format,
),
QualityCommands::Report { path, output } => {
quality::cmd_quality_report(&path, output.as_ref())
}
QualityCommands::Score {
path,
profile,
suggest,
json,
badge,
} => quality::cmd_quality_score(&path, &profile, suggest, json, badge),
QualityCommands::Profiles => quality::cmd_quality_profiles(),
},
Commands::Fed(fed_cmd) => match fed_cmd {
FedCommands::Manifest {
input,
output,
node_id,
train_ratio,
seed,
format,
} => fed::cmd_fed_manifest(&input, &output, &node_id, train_ratio, seed, &format),
FedCommands::Plan {
manifests,
output,
strategy,
train_ratio,
seed,
stratify_column,
format,
} => fed::cmd_fed_plan(
&manifests,
&output,
&strategy,
train_ratio,
seed,
stratify_column.as_deref(),
&format,
),
FedCommands::Split {
input,
plan,
node_id,
train_output,
test_output,
validation_output,
} => fed::cmd_fed_split(
&input,
&plan,
&node_id,
&train_output,
&test_output,
validation_output.as_ref(),
),
FedCommands::Verify { manifests, format } => fed::cmd_fed_verify(&manifests, &format),
},
#[cfg(feature = "doctest")]
Commands::Doctest(doctest_cmd) => match doctest_cmd {
DoctestCommands::Extract {
input,
output,
source,
version,
} => cmd_doctest_extract(&input, &output, &source, &version),
DoctestCommands::Merge { inputs, output } => cmd_doctest_merge(&inputs, &output),
},
#[cfg(feature = "repl")]
Commands::Repl => crate::repl::run(),
};
match result {
Ok(()) => ExitCode::SUCCESS,
Err(e) => {
eprintln!("Error: {}", e);
ExitCode::FAILURE
}
}
}
/// Extracts doctests from a directory tree and stores them as a Parquet file.
///
/// `input` is scanned with `DocTestParser::parse_directory`, which also receives the `source`
/// and `version` labels. A summary line is printed to stdout. When at least one doctest was
/// found, the corpus is converted to a dataset and written to `output`; otherwise a warning is
/// printed and `output` is not written.
///
/// # Errors
///
/// Returns an invalid-config error when `input` is not a directory, and propagates any error
/// from parsing, dataset conversion or Parquet writing.
#[cfg(feature = "doctest")]
fn cmd_doctest_extract(
    input: &std::path::Path,
    output: &std::path::Path,
    source: &str,
    version: &str,
) -> crate::Result<()> {
    use crate::DocTestParser;

    // Only directories can be scanned; reject anything else before doing any work.
    if !input.is_dir() {
        let message = format!("Input path must be a directory: {}", input.display());
        return Err(crate::Error::invalid_config(message));
    }

    let extractor = DocTestParser::new();
    let doctests = extractor.parse_directory(input, source, version)?;
    let found = doctests.len();
    println!("Extracted {} doctests from {} ({})", found, source, version);

    if doctests.is_empty() {
        // Nothing to persist: say so and leave `output` alone.
        println!("Warning: No doctests found in {}", input.display());
    } else {
        doctests.to_dataset()?.to_parquet(output)?;
        println!("Wrote {} to {}", found, output.display());
    }

    Ok(())
}
/// Combines several doctest Parquet files into a single Parquet file.
///
/// Each path in `inputs` is loaded with `ArrowDataset::from_parquet`; its batches are collected
/// in input order and its length is added to a running total. The collected batches are wrapped
/// in a new `ArrowDataset`, written to `output`, and a summary line is printed to stdout.
///
/// # Errors
///
/// Returns an invalid-config error when `inputs` is empty or when the inputs yield no batches,
/// and propagates any error from reading, dataset construction or Parquet writing.
#[cfg(feature = "doctest")]
fn cmd_doctest_merge(inputs: &[PathBuf], output: &std::path::Path) -> crate::Result<()> {
    use crate::dataset::Dataset;
    use crate::ArrowDataset;

    if inputs.is_empty() {
        return Err(crate::Error::invalid_config("No input files provided"));
    }

    let mut row_count = 0;
    let mut collected = Vec::new();
    for path in inputs {
        let loaded = ArrowDataset::from_parquet(path)?;
        row_count += loaded.len();
        for record_batch in loaded.iter() {
            collected.push(record_batch.clone());
        }
    }

    // Files were readable but contributed no batches at all.
    if collected.is_empty() {
        return Err(crate::Error::invalid_config("No data found in input files"));
    }

    ArrowDataset::new(collected)?.to_parquet(output)?;

    let file_count = inputs.len();
    println!(
        "Merged {} doctests from {} files to {}",
        row_count,
        file_count,
        output.display()
    );
    Ok(())
}