use std::path::{Path, PathBuf};

use clap::Subcommand;
6
// Source selector for the dataset `import` command.
//
// NOTE(review): clap derives CLI help text from `///` doc comments, so plain
// `//` comments are used here to leave the generated `--help` output unchanged.
#[derive(Subcommand)]
pub enum ImportSource {
    // Import a dataset from a local file on disk.
    Local {
        // Path of the input file (positional argument).
        input: PathBuf,
        // Path the imported dataset is written to.
        #[arg(short, long)]
        output: PathBuf,
        // Optional format override; when set, `cmd_import_local` forces the
        // serialization format via the output file extension.
        #[arg(short, long)]
        format: Option<String>,
    },
    // Import a dataset from the HuggingFace Hub (only with the `hf-hub` feature).
    #[allow(clippy::doc_markdown)]
    #[cfg(feature = "hf-hub")]
    Hf {
        // Hub dataset identifier (positional argument).
        repo_id: String,
        // Path the downloaded dataset is written to.
        #[arg(short, long)]
        output: PathBuf,
        // Git revision (branch, tag, or commit) to fetch.
        #[arg(short, long, default_value = "main")]
        revision: String,
        // Optional dataset subset/configuration name.
        #[arg(short, long)]
        subset: Option<String>,
        // Dataset split to fetch.
        #[arg(long, default_value = "train")]
        split: String,
    },
}
41
42pub(crate) fn cmd_import_local(
44 input: &PathBuf,
45 output: &PathBuf,
46 format: Option<&str>,
47) -> crate::Result<()> {
48 use super::basic::{load_dataset, save_dataset};
49 use crate::Dataset;
50
51 if !input.exists() {
52 return Err(crate::Error::io(
53 std::io::Error::new(std::io::ErrorKind::NotFound, "Input file not found"),
54 input,
55 ));
56 }
57
58 println!("Importing {}...", input.display());
59 let dataset = load_dataset(input)?;
60
61 if let Some(fmt) = format {
63 let forced_output = output.with_extension(fmt);
64 save_dataset(&dataset, &forced_output)?;
65 if forced_output != *output {
66 std::fs::rename(&forced_output, output).map_err(|e| crate::Error::io(e, output))?;
67 }
68 } else {
69 save_dataset(&dataset, output)?;
70 }
71
72 println!(
73 "Imported {} rows: {} -> {}",
74 dataset.len(),
75 input.display(),
76 output.display()
77 );
78
79 Ok(())
80}
81
// Subcommands for interacting with the HuggingFace Hub.
//
// NOTE(review): clap derives CLI help text from `///` doc comments, so plain
// `//` comments are used here to leave the generated `--help` output unchanged.
#[cfg(feature = "hf-hub")]
#[derive(Subcommand)]
pub enum HubCommands {
    // Push a local dataset file to a Hub dataset repository.
    #[allow(clippy::doc_markdown)]
    Push {
        // Path of the local file to upload (positional argument).
        input: PathBuf,
        // Target Hub repository identifier (positional argument).
        repo_id: String,
        // Destination path inside the repository; `cmd_hub_push` falls back
        // to the input file name when omitted.
        #[arg(short, long)]
        path_in_repo: Option<String>,
        // Commit message used for the upload.
        #[arg(short, long, default_value = "Upload via alimentar")]
        message: String,
        // Optional local README.md to upload alongside the data.
        #[arg(long)]
        readme: Option<PathBuf>,
        // Create the repository as private instead of public.
        #[arg(long)]
        private: bool,
    },
}
107
/// Imports a dataset from the HuggingFace Hub and downloads it to `output`.
///
/// Builds an `HfDataset` for `repo_id` pinned to `revision` and `split`,
/// optionally narrowed to `subset`, then downloads it to `output` and
/// reports the row count.
///
/// `output` takes `&Path` rather than `&PathBuf` (clippy `ptr_arg`); call
/// sites passing `&PathBuf` continue to work via deref coercion.
/// NOTE(review): assumes `download_to` accepts `&Path` or
/// `impl AsRef<Path>` — confirm against `crate::hf_hub`.
///
/// # Errors
///
/// Propagates builder and download failures from the `hf-hub` layer.
#[cfg(feature = "hf-hub")]
pub(crate) fn cmd_import_hf(
    repo_id: &str,
    output: &Path,
    revision: &str,
    subset: Option<&str>,
    split: &str,
) -> crate::Result<()> {
    use crate::{dataset::Dataset, hf_hub::HfDataset};

    println!("Importing {} from HuggingFace Hub...", repo_id);

    let mut builder = HfDataset::builder(repo_id).revision(revision).split(split);

    // Subset is optional; only narrow the request when one was given.
    if let Some(s) = subset {
        builder = builder.subset(s);
    }

    let dataset = builder.build()?;

    println!("Downloading to {}...", output.display());
    let data = dataset.download_to(output)?;

    println!(
        "Successfully imported {} ({} rows) to {}",
        repo_id,
        data.len(),
        output.display()
    );

    Ok(())
}
141
/// Prints a fixed data-quality warning banner to stderr before publishing.
#[cfg(feature = "hf-hub")]
fn print_quality_warning() {
    // One entry per output line; empty entries render as blank lines.
    const BANNER: [&str; 15] = [
        "",
        "WARNING: Data quality is CRITICAL for ML datasets!",
        "Publishing low-quality data harms the ML community.",
        "",
        "Before publishing, verify quality with:",
        " alimentar quality score <file.parquet>",
        "",
        "Minimum recommended: Grade B (85%)",
        "",
        "To improve quality, use:",
        " aprender clean <input> -o <output> # Clean data",
        " entrenar augment <input> -o <output> # Augment for training",
        "",
        "See: https://paiml.github.io/alimentar/hf-hub/publishing.html",
        "",
    ];
    for line in BANNER {
        eprintln!("{}", line);
    }
}
161
/// Pushes a local dataset file to a HuggingFace Hub dataset repository.
///
/// Prints a quality warning, creates the repository if needed, uploads the
/// file, and optionally uploads a README. `path_in_repo` defaults to the
/// input file name, or `"data.parquet"` when `input` has no file name.
///
/// `input` takes `&Path` rather than `&PathBuf` (clippy `ptr_arg`); call
/// sites passing `&PathBuf` continue to work via deref coercion. `readme`
/// stays `Option<&PathBuf>` because `Option` does not coerce its contents,
/// so switching to `Option<&Path>` would break existing callers.
///
/// # Errors
///
/// Returns an I/O error when `input` does not exist or the README cannot be
/// read, and propagates repository-creation and upload failures.
#[cfg(feature = "hf-hub")]
pub(crate) fn cmd_hub_push(
    input: &Path,
    repo_id: &str,
    path_in_repo: Option<&str>,
    message: &str,
    readme: Option<&PathBuf>,
    private: bool,
) -> crate::Result<()> {
    use crate::hf_hub::HfPublisher;

    print_quality_warning();

    if !input.exists() {
        return Err(crate::Error::io(
            std::io::Error::new(std::io::ErrorKind::NotFound, "Input file not found"),
            input,
        ));
    }

    // Default the in-repo path to the local file name.
    let path_in_repo = path_in_repo.map(String::from).unwrap_or_else(|| {
        input
            .file_name()
            .map(|f| f.to_string_lossy().into_owned())
            .unwrap_or_else(|| "data.parquet".to_string())
    });

    println!("Pushing {} to {}...", input.display(), repo_id);

    let publisher = HfPublisher::new(repo_id)
        .with_private(private)
        .with_commit_message(message);

    println!("Creating repository (if needed)...");
    publisher.create_repo_sync()?;

    println!("Uploading {}...", path_in_repo);
    publisher.upload_parquet_file_sync(input, &path_in_repo)?;

    if let Some(readme_path) = readme {
        println!("Uploading README.md...");
        let readme_content =
            std::fs::read_to_string(readme_path).map_err(|e| crate::Error::io(e, readme_path))?;
        publisher.upload_readme_validated_sync(&readme_content)?;
    }

    let visibility = if private { "private" } else { "public" };
    println!(
        "Successfully pushed to https://huggingface.co/datasets/{} ({})",
        repo_id, visibility
    );

    Ok(())
}
222}