use clap::{Args, Subcommand};
use console::style;
use synth_claw::datasets::hf::{get_dataset_info, preview_dataset, search_datasets};
// Dataset subcommands dispatched by `run`; each variant wraps the parsed
// arguments for exactly one handler in this file.
// NOTE(review): plain `//` comments are used here on purpose. clap's derive
// macros pick up `///` doc comments as help text, so switching to `///` would
// change the CLI's `--help` output — do that deliberately, not by accident.
#[derive(Subcommand)]
pub enum DatasetsCommand {
// Search for datasets by query; handled by `search`.
Search(SearchArgs),
// Print metadata for one dataset; handled by `info`.
Info(InfoArgs),
// Print sample rows from one dataset split; handled by `preview`.
Preview(PreviewArgs),
}
// Arguments for `DatasetsCommand::Search`.
#[derive(Args)]
pub struct SearchArgs {
// Query text forwarded to `search_datasets`. It carries no `#[arg]`
// attribute, so clap treats it as a positional argument.
pub query: String,
// Result limit forwarded to `search_datasets`; defaults to 10 when the flag
// is omitted.
#[arg(short, long, default_value = "10")]
pub limit: usize,
}
// Arguments for `DatasetsCommand::Info`.
#[derive(Args)]
pub struct InfoArgs {
// Dataset identifier forwarded to `get_dataset_info`. It carries no `#[arg]`
// attribute, so clap treats it as a positional argument.
pub dataset: String,
}
// Arguments for `DatasetsCommand::Preview`.
#[derive(Args)]
pub struct PreviewArgs {
// Dataset identifier forwarded to `get_dataset_info` and `preview_dataset`.
// It carries no `#[arg]` attribute, so clap treats it as positional.
pub dataset: String,
// Optional subset forwarded to `preview_dataset` (as `None` when omitted).
// NOTE(review): presumably the dataset's configuration/subset name — confirm
// against `preview_dataset`.
#[arg(short, long)]
pub subset: Option<String>,
// Split to preview; defaults to "train". `preview` falls back to another
// split when this one is not reported for the dataset.
// The short flag is an explicit upper-case `-S`, presumably to avoid
// clashing with the derived short flag of `subset`.
#[arg(short = 'S', long, default_value = "train")]
pub split: String,
// Number of rows requested from `preview_dataset`; defaults to 5.
#[arg(short, long, default_value = "5")]
pub rows: usize,
}
pub async fn run(cmd: DatasetsCommand) -> anyhow::Result<()> {
match cmd {
DatasetsCommand::Search(args) => search(args).await,
DatasetsCommand::Info(args) => info(args).await,
DatasetsCommand::Preview(args) => preview(args).await,
}
}
async fn search(args: SearchArgs) -> anyhow::Result<()> {
println!(
"{} Searching for datasets matching: {}",
style("→").cyan().bold(),
style(&args.query).green()
);
let results = search_datasets(&args.query, args.limit).await?;
if results.is_empty() {
println!("{}", style("No datasets found").yellow());
return Ok(());
}
println!("\n{}", style("Results:").bold());
for result in results {
println!(
" {} {} (↓{} ♥{})",
style("•").cyan(),
style(&result.id).green(),
result.downloads,
result.likes
);
}
Ok(())
}
/// Fetches metadata for `args.dataset` and prints a summary.
///
/// Output, in order: a progress banner, the dataset name, the description
/// (only when present and non-empty), the total row count, the column names
/// joined with `", "`, and one line per split with its row count.
///
/// # Errors
///
/// Propagates any error returned by `get_dataset_info`.
async fn info(args: InfoArgs) -> anyhow::Result<()> {
    println!(
        "{} Getting info for: {}",
        style("→").cyan().bold(),
        style(&args.dataset).green()
    );

    let dataset = get_dataset_info(&args.dataset).await?;

    println!("\n{}", style("Dataset Info:").bold());
    println!(" Name: {}", style(&dataset.name).green());

    // Bind the description in the same step that checks it, instead of
    // testing the Option and then unwrapping it separately.
    if let Some(description) = dataset.description.as_ref().filter(|d| !d.is_empty()) {
        println!(" Description: {}", description);
    }

    println!(" Total rows: {}", dataset.num_rows);
    println!(" Columns: {}", dataset.columns.join(", "));

    println!("\n{}", style("Splits:").bold());
    for split in &dataset.splits {
        println!(" {} ({} rows)", style(&split.name).cyan(), split.num_rows);
    }

    Ok(())
}
/// Prints sample rows from one split of `args.dataset`.
///
/// The requested split is checked against the splits reported by
/// `get_dataset_info`. When it is not among them, a warning is printed and the
/// first reported split is used instead (or `"train"` if none are reported).
/// `args.rows` rows are then requested from `preview_dataset`, and every
/// returned row that is a JSON object is printed as `key: value` lines. Rows
/// that are not JSON objects print only their `Row N:` header.
///
/// # Errors
///
/// Propagates any error returned by `get_dataset_info` or `preview_dataset`.
async fn preview(args: PreviewArgs) -> anyhow::Result<()> {
    let info = get_dataset_info(&args.dataset).await?;

    let split = if info.splits.iter().any(|s| s.name == args.split) {
        args.split.clone()
    } else {
        let first_split = info
            .splits
            .first()
            .map(|s| s.name.clone())
            .unwrap_or_else(|| "train".to_string());

        // "train" is also the flag's default value, so a miss on it gets the
        // short warning; any other missing split also lists what is available.
        if args.split == "train" {
            println!(
                "{} Split '{}' not found, using '{}' instead",
                style("⚠").yellow().bold(),
                args.split,
                first_split
            );
        } else {
            let available = info
                .splits
                .iter()
                .map(|s| s.name.as_str())
                .collect::<Vec<_>>()
                .join(", ");
            println!(
                "{} Split '{}' not found. Available splits: {}",
                style("⚠").yellow().bold(),
                args.split,
                available
            );
            println!(" Using '{}' instead", first_split);
        }
        first_split
    };

    println!(
        "{} Previewing: {} (split: {}, rows: {})",
        style("→").cyan().bold(),
        style(&args.dataset).green(),
        split,
        args.rows
    );

    let rows = preview_dataset(&args.dataset, args.subset.as_deref(), &split, args.rows).await?;

    println!("\n{}", style("Preview:").bold());
    for (i, row) in rows.iter().enumerate() {
        println!("\n{}:", style(format!("Row {}", i + 1)).cyan().bold());
        if let Some(obj) = row.as_object() {
            for (key, value) in obj {
                println!(" {}: {}", style(key).yellow(), preview_cell_text(value));
            }
        }
    }

    Ok(())
}

/// Renders one JSON value as a single preview cell.
///
/// Strings longer than 100 bytes are cut to at most 100 bytes and suffixed
/// with `...`; shorter strings are shown as-is. Every other JSON value is
/// shown in its serialized form.
fn preview_cell_text(value: &serde_json::Value) -> String {
    const MAX_BYTES: usize = 100;

    match value {
        serde_json::Value::String(s) if s.len() > MAX_BYTES => {
            // Slicing a `str` in the middle of a multi-byte character panics,
            // so back off to the nearest char boundary at or below the limit.
            // Index 0 is always a boundary, so the loop terminates.
            let mut end = MAX_BYTES;
            while !s.is_char_boundary(end) {
                end -= 1;
            }
            format!("{}...", &s[..end])
        }
        serde_json::Value::String(s) => s.clone(),
        other => other.to_string(),
    }
}