alimentar/cli/hub.rs

//! HuggingFace Hub CLI commands.

use std::path::PathBuf;

use clap::Subcommand;

/// Import source options.
#[derive(Subcommand)]
pub enum ImportSource {
    /// Import from a local file (CSV, JSON, JSONL, Parquet, Arrow)
    Local {
        /// Input file or directory path
        input: PathBuf,
        /// Output file path (format inferred from extension)
        #[arg(short, long)]
        output: PathBuf,
        /// Force output format (csv, json, jsonl, parquet, arrow)
        #[arg(short, long)]
        format: Option<String>,
    },
    /// Import from HuggingFace Hub
    #[allow(clippy::doc_markdown)]
    #[cfg(feature = "hf-hub")]
    Hf {
        /// Dataset repository ID (e.g., "squad", "openai/gsm8k")
        repo_id: String,
        /// Output path for the downloaded dataset
        #[arg(short, long)]
        output: PathBuf,
        /// Git revision (branch, tag, or commit)
        #[arg(short, long, default_value = "main")]
        revision: String,
        /// Dataset subset/configuration
        #[arg(short, long)]
        subset: Option<String>,
        /// Data split (train, validation, test)
        #[arg(long, default_value = "train")]
        split: String,
    },
}
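
// A minimal wiring sketch (not part of this file), assuming a top-level
// clap parser nests these variants under an `import` subcommand; the
// `Cli` and `Commands` names are illustrative assumptions:
//
//     use clap::Parser;
//
//     #[derive(Parser)]
//     struct Cli {
//         #[command(subcommand)]
//         command: Commands,
//     }
//
//     #[derive(Subcommand)]
//     enum Commands {
//         /// Import a dataset from a local file or the Hub
//         #[command(subcommand)]
//         Import(ImportSource),
//     }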

/// Import a local dataset file, converting between formats.
pub(crate) fn cmd_import_local(
    input: &PathBuf,
    output: &PathBuf,
    format: Option<&str>,
) -> crate::Result<()> {
    use super::basic::{load_dataset, save_dataset};
    use crate::Dataset;

    if !input.exists() {
        return Err(crate::Error::io(
            std::io::Error::new(std::io::ErrorKind::NotFound, "Input file not found"),
            input,
        ));
    }

    println!("Importing {}...", input.display());
    let dataset = load_dataset(input)?;

    // Apply a format override by saving with the forced extension (so the
    // writer picks the right format), then renaming to the requested path.
    if let Some(fmt) = format {
        let forced_output = output.with_extension(fmt);
        save_dataset(&dataset, &forced_output)?;
        if forced_output != *output {
            std::fs::rename(&forced_output, output).map_err(|e| crate::Error::io(e, output))?;
        }
    } else {
        save_dataset(&dataset, output)?;
    }

    println!(
        "Imported {} rows: {} -> {}",
        dataset.len(),
        input.display(),
        output.display()
    );

    Ok(())
}
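
// Illustrative invocations, assuming the `Local` variant above is exposed
// as an `import local` subcommand (the exact CLI path is an assumption):
//
//     # Convert CSV to Parquet; format inferred from the output extension
//     alimentar import local data.csv -o data.parquet
//
//     # Force JSONL output regardless of the output file's extension
//     alimentar import local data.csv -o data.out --format jsonl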

/// HuggingFace Hub commands.
#[cfg(feature = "hf-hub")]
#[derive(Subcommand)]
pub enum HubCommands {
    /// Push (upload) a dataset to HuggingFace Hub
    #[allow(clippy::doc_markdown)]
    Push {
        /// Path to the parquet file to upload
        input: PathBuf,
        /// HuggingFace repository ID (e.g., "paiml/my-dataset")
        repo_id: String,
        /// Path in the repository (e.g., "data/train.parquet")
        #[arg(short, long)]
        path_in_repo: Option<String>,
        /// Commit message for the upload
        #[arg(short, long, default_value = "Upload via alimentar")]
        message: String,
        /// Path to README.md to upload as dataset card
        #[arg(long)]
        readme: Option<PathBuf>,
        /// Make the dataset private
        #[arg(long)]
        private: bool,
    },
}
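
// Hypothetical invocation, assuming this enum is mounted as a `hub`
// subcommand (so the variant becomes `hub push`; that mounting is an
// assumption, while the flags come from the field attributes above):
//
//     alimentar hub push train.parquet paiml/my-dataset \
//         --path-in-repo data/train.parquet \
//         --message "Initial upload" \
//         --readme README.md \
//         --private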

/// Import from HuggingFace Hub.
#[cfg(feature = "hf-hub")]
pub(crate) fn cmd_import_hf(
    repo_id: &str,
    output: &PathBuf,
    revision: &str,
    subset: Option<&str>,
    split: &str,
) -> crate::Result<()> {
    use crate::{dataset::Dataset, hf_hub::HfDataset};

    println!("Importing {} from HuggingFace Hub...", repo_id);

    let mut builder = HfDataset::builder(repo_id).revision(revision).split(split);

    if let Some(s) = subset {
        builder = builder.subset(s);
    }

    let dataset = builder.build()?;

    println!("Downloading to {}...", output.display());
    let data = dataset.download_to(output)?;

    println!(
        "Successfully imported {} ({} rows) to {}",
        repo_id,
        data.len(),
        output.display()
    );

    Ok(())
}
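
// Library-level sketch of the same flow, using only the builder methods
// called above; the repo id, subset, and output path are placeholders,
// and error handling is elided:
//
//     use crate::{dataset::Dataset, hf_hub::HfDataset};
//
//     let dataset = HfDataset::builder("openai/gsm8k")
//         .revision("main")
//         .split("train")
//         .subset("main") // optional; skip for datasets without configs
//         .build()?;
//     let data = dataset.download_to(&PathBuf::from("gsm8k.parquet"))?;
//     println!("Imported {} rows", data.len());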

/// Prints a quality warning for HuggingFace Hub uploads.
#[cfg(feature = "hf-hub")]
fn print_quality_warning() {
    eprintln!();
    eprintln!("WARNING: Data quality is CRITICAL for ML datasets!");
    eprintln!("Publishing low-quality data harms the ML community.");
    eprintln!();
    eprintln!("Before publishing, verify quality with:");
    eprintln!("  alimentar quality score <file.parquet>");
    eprintln!();
    eprintln!("Minimum recommended: Grade B (85%)");
    eprintln!();
    eprintln!("To improve quality, use:");
    eprintln!("  aprender clean <input> -o <output>     # Clean data");
    eprintln!("  entrenar augment <input> -o <output>   # Augment for training");
    eprintln!();
    eprintln!("See: https://paiml.github.io/alimentar/hf-hub/publishing.html");
    eprintln!();
}

/// Push (upload) a dataset to HuggingFace Hub.
#[cfg(feature = "hf-hub")]
pub(crate) fn cmd_hub_push(
    input: &PathBuf,
    repo_id: &str,
    path_in_repo: Option<&str>,
    message: &str,
    readme: Option<&PathBuf>,
    private: bool,
) -> crate::Result<()> {
    use crate::hf_hub::HfPublisher;

    // Display quality warning
    print_quality_warning();

    // Validate input file exists
    if !input.exists() {
        return Err(crate::Error::io(
            std::io::Error::new(std::io::ErrorKind::NotFound, "Input file not found"),
            input,
        ));
    }

    // Derive path_in_repo from filename if not specified
    let path_in_repo = path_in_repo.map(String::from).unwrap_or_else(|| {
        input
            .file_name()
            .map(|f| f.to_string_lossy().into_owned())
            .unwrap_or_else(|| "data.parquet".to_string())
    });

    println!("Pushing {} to {}...", input.display(), repo_id);

    let publisher = HfPublisher::new(repo_id)
        .with_private(private)
        .with_commit_message(message);

    // Create repo (idempotent - succeeds if already exists)
    println!("Creating repository (if needed)...");
    publisher.create_repo_sync()?;

    // Upload parquet file
    println!("Uploading {}...", path_in_repo);
    publisher.upload_parquet_file_sync(input, &path_in_repo)?;

    // Upload README if provided
    if let Some(readme_path) = readme {
        println!("Uploading README.md...");
        let readme_content =
            std::fs::read_to_string(readme_path).map_err(|e| crate::Error::io(e, readme_path))?;
        publisher.upload_readme_validated_sync(&readme_content)?;
    }

    let visibility = if private { "private" } else { "public" };
    println!(
        "Successfully pushed to https://huggingface.co/datasets/{} ({})",
        repo_id, visibility
    );

    Ok(())
}
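
// Standalone sketch of the publisher flow, restricted to the methods used
// above; the repo id and file path are placeholders, and token setup /
// error handling are not shown:
//
//     use crate::hf_hub::HfPublisher;
//
//     let publisher = HfPublisher::new("paiml/my-dataset")
//         .with_private(false)
//         .with_commit_message("Upload via alimentar");
//     publisher.create_repo_sync()?;
//     publisher.upload_parquet_file_sync(&PathBuf::from("train.parquet"), "data/train.parquet")?;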