use std::path::PathBuf;
use clap::Subcommand;
#[derive(Subcommand)]
// Where to import a dataset from.
// NOTE: plain `//` comments are used on purpose — clap turns `///` doc
// comments into CLI help text, and adding them would change `--help` output.
pub enum ImportSource {
    // Import from a file on the local filesystem (handled by `cmd_import_local`).
    Local {
        // Path to the input dataset file.
        input: PathBuf,
        #[arg(short, long)]
        // Destination path for the imported dataset.
        output: PathBuf,
        #[arg(short, long)]
        // Optional format override; `cmd_import_local` applies it as the file
        // extension when saving, then renames the result back to `output`.
        format: Option<String>,
    },
    #[allow(clippy::doc_markdown)]
    #[cfg(feature = "hf-hub")]
    // Import from the HuggingFace Hub (handled by `cmd_import_hf`;
    // only compiled with the `hf-hub` feature).
    Hf {
        // Hub repository identifier — presumably "owner/name"; confirm against HfDataset.
        repo_id: String,
        #[arg(short, long)]
        // Local destination path for the downloaded data.
        output: PathBuf,
        #[arg(short, long, default_value = "main")]
        // Git revision (branch, tag, or commit) to fetch; defaults to "main".
        revision: String,
        #[arg(short, long)]
        // Optional dataset subset/configuration name.
        subset: Option<String>,
        #[arg(long, default_value = "train")]
        // Dataset split to fetch; defaults to "train".
        split: String,
    },
}
pub(crate) fn cmd_import_local(
input: &PathBuf,
output: &PathBuf,
format: Option<&str>,
) -> crate::Result<()> {
use super::basic::{load_dataset, save_dataset};
use crate::Dataset;
if !input.exists() {
return Err(crate::Error::io(
std::io::Error::new(std::io::ErrorKind::NotFound, "Input file not found"),
input,
));
}
println!("Importing {}...", input.display());
let dataset = load_dataset(input)?;
if let Some(fmt) = format {
let forced_output = output.with_extension(fmt);
save_dataset(&dataset, &forced_output)?;
if forced_output != *output {
std::fs::rename(&forced_output, output).map_err(|e| crate::Error::io(e, output))?;
}
} else {
save_dataset(&dataset, output)?;
}
println!(
"Imported {} rows: {} -> {}",
dataset.len(),
input.display(),
output.display()
);
Ok(())
}
#[cfg(feature = "hf-hub")]
#[derive(Subcommand)]
// HuggingFace Hub subcommands (only compiled with the `hf-hub` feature).
// NOTE: plain `//` comments are used on purpose — clap turns `///` doc
// comments into CLI help text, and adding them would change `--help` output.
pub enum HubCommands {
    #[allow(clippy::doc_markdown)]
    // Upload a local dataset file to a Hub dataset repo (handled by `cmd_hub_push`).
    Push {
        // Local file to upload.
        input: PathBuf,
        // Target Hub repository identifier — presumably "owner/name"; confirm against HfPublisher.
        repo_id: String,
        #[arg(short, long)]
        // Destination path inside the repo; `cmd_hub_push` defaults this to
        // the input's file name, falling back to "data.parquet".
        path_in_repo: Option<String>,
        #[arg(short, long, default_value = "Upload via alimentar")]
        // Commit message for the upload.
        message: String,
        #[arg(long)]
        // Optional README file to upload alongside the data.
        readme: Option<PathBuf>,
        #[arg(long)]
        // Create/treat the repository as private.
        private: bool,
    },
}
#[cfg(feature = "hf-hub")]
/// Import a dataset from the HuggingFace Hub and download it to `output`.
///
/// Builds an [`HfDataset`] for `repo_id` at the given `revision` and `split`,
/// optionally narrowed to `subset`, then downloads the data to `output`.
///
/// # Errors
/// Propagates any builder or download failure from the hub client.
pub(crate) fn cmd_import_hf(
    repo_id: &str,
    output: &PathBuf,
    revision: &str,
    subset: Option<&str>,
    split: &str,
) -> crate::Result<()> {
    use crate::{dataset::Dataset, hf_hub::HfDataset};

    println!("Importing {} from HuggingFace Hub...", repo_id);

    // Shadowing instead of `mut`: apply the optional subset in one expression.
    let builder = HfDataset::builder(repo_id).revision(revision).split(split);
    let builder = if let Some(name) = subset {
        builder.subset(name)
    } else {
        builder
    };
    let dataset = builder.build()?;

    println!("Downloading to {}...", output.display());
    let data = dataset.download_to(output)?;

    println!(
        "Successfully imported {} ({} rows) to {}",
        repo_id,
        data.len(),
        output.display()
    );
    Ok(())
}
#[cfg(feature = "hf-hub")]
/// Print the data-quality advisory banner to stderr before a Hub push.
///
/// The banner text is kept as a single data table and emitted line by line,
/// byte-identical to the original sequence of `eprintln!` calls.
fn print_quality_warning() {
    const BANNER: &[&str] = &[
        "",
        "WARNING: Data quality is CRITICAL for ML datasets!",
        "Publishing low-quality data harms the ML community.",
        "",
        "Before publishing, verify quality with:",
        "  alimentar quality score <file.parquet>",
        "",
        "Minimum recommended: Grade B (85%)",
        "",
        "To improve quality, use:",
        "  aprender clean <input> -o <output>     # Clean data",
        "  entrenar augment <input> -o <output>   # Augment for training",
        "",
        "See: https://paiml.github.io/alimentar/hf-hub/publishing.html",
        "",
    ];
    for line in BANNER {
        eprintln!("{line}");
    }
}
#[cfg(feature = "hf-hub")]
/// Push a local dataset file (and optional README) to a HuggingFace Hub repo.
///
/// Prints a quality advisory first, verifies `input` exists, creates the
/// repository if needed, uploads the parquet file under `path_in_repo`
/// (defaulting to the input's file name, or "data.parquet" as a last resort),
/// and optionally uploads a README.
///
/// # Errors
/// Returns an I/O error if `input` is missing or the README cannot be read,
/// and propagates any publisher failure.
pub(crate) fn cmd_hub_push(
    input: &PathBuf,
    repo_id: &str,
    path_in_repo: Option<&str>,
    message: &str,
    readme: Option<&PathBuf>,
    private: bool,
) -> crate::Result<()> {
    use crate::hf_hub::HfPublisher;

    print_quality_warning();

    // Guard clause: fail fast on a missing input path.
    if !input.exists() {
        return Err(crate::Error::io(
            std::io::Error::new(std::io::ErrorKind::NotFound, "Input file not found"),
            input,
        ));
    }

    // Default the in-repo destination to the input's file name.
    let target_path = match path_in_repo {
        Some(p) => p.to_string(),
        None => input.file_name().map_or_else(
            || "data.parquet".to_string(),
            |name| name.to_string_lossy().into_owned(),
        ),
    };

    println!("Pushing {} to {}...", input.display(), repo_id);
    let publisher = HfPublisher::new(repo_id)
        .with_private(private)
        .with_commit_message(message);

    println!("Creating repository (if needed)...");
    publisher.create_repo_sync()?;

    println!("Uploading {}...", target_path);
    publisher.upload_parquet_file_sync(input, &target_path)?;

    if let Some(readme_path) = readme {
        println!("Uploading README.md...");
        let contents = std::fs::read_to_string(readme_path)
            .map_err(|e| crate::Error::io(e, readme_path))?;
        publisher.upload_readme_validated_sync(&contents)?;
    }

    println!(
        "Successfully pushed to https://huggingface.co/datasets/{} ({})",
        repo_id,
        if private { "private" } else { "public" }
    );
    Ok(())
}