Skip to main content

alimentar/cli/
registry.rs

1//! Registry CLI commands for dataset sharing and discovery.
2
3use std::path::{Path, PathBuf};
4
5use clap::Subcommand;
6
7use super::basic::load_dataset;
8use crate::{
9    backend::LocalBackend,
10    registry::{DatasetMetadata, Registry},
11    Dataset,
12};
13
14/// Registry commands for dataset sharing and discovery.
15#[derive(Subcommand)]
16pub enum RegistryCommands {
17    /// Initialize a new registry
18    Init {
19        /// Path to registry directory
20        #[arg(short, long, default_value = ".alimentar")]
21        path: PathBuf,
22    },
23    /// List all datasets in a registry
24    List {
25        /// Path to registry directory
26        #[arg(short, long, default_value = ".alimentar")]
27        path: PathBuf,
28    },
29    /// Push (publish) a dataset to the registry
30    Push {
31        /// Path to the dataset file (parquet)
32        input: PathBuf,
33        /// Dataset name in the registry
34        #[arg(short, long)]
35        name: String,
36        /// Dataset version (semver)
37        #[arg(short, long, default_value = "1.0.0")]
38        version: String,
39        /// Description of the dataset
40        #[arg(short, long, default_value = "")]
41        description: String,
42        /// License identifier (e.g., MIT, Apache-2.0)
43        #[arg(short, long, default_value = "")]
44        license: String,
45        /// Tags for the dataset (comma-separated)
46        #[arg(short, long, default_value = "")]
47        tags: String,
48        /// Path to registry directory
49        #[arg(long, default_value = ".alimentar")]
50        registry: PathBuf,
51    },
52    /// Pull (download) a dataset from the registry
53    Pull {
54        /// Dataset name
55        name: String,
56        /// Output path for the dataset
57        #[arg(short, long)]
58        output: PathBuf,
59        /// Specific version to pull (defaults to latest)
60        #[arg(short, long)]
61        version: Option<String>,
62        /// Path to registry directory
63        #[arg(long, default_value = ".alimentar")]
64        registry: PathBuf,
65    },
66    /// Search datasets by name or description
67    Search {
68        /// Search query
69        query: String,
70        /// Path to registry directory
71        #[arg(short, long, default_value = ".alimentar")]
72        path: PathBuf,
73    },
74    /// Show detailed info about a specific dataset
75    ShowInfo {
76        /// Dataset name
77        name: String,
78        /// Path to registry directory
79        #[arg(short, long, default_value = ".alimentar")]
80        path: PathBuf,
81    },
82    /// Delete a dataset version from the registry
83    Delete {
84        /// Dataset name
85        name: String,
86        /// Version to delete
87        #[arg(short, long)]
88        version: String,
89        /// Path to registry directory
90        #[arg(short, long, default_value = ".alimentar")]
91        path: PathBuf,
92    },
93}
94
95/// Create a registry with the given path.
96pub(crate) fn create_registry(path: &Path) -> crate::Result<Registry> {
97    // Ensure directory exists
98    if !path.exists() {
99        std::fs::create_dir_all(path).map_err(|e| crate::Error::io(e, path))?;
100    }
101    let backend = LocalBackend::new(path)?;
102    Ok(Registry::new(Box::new(backend)))
103}
104
105/// Initialize a new registry.
106pub(crate) fn cmd_registry_init(path: &Path) -> crate::Result<()> {
107    let registry = create_registry(path)?;
108    registry.init()?;
109    println!("Initialized registry at: {}", path.display());
110    Ok(())
111}
112
113/// List all datasets in a registry.
114pub(crate) fn cmd_registry_list(path: &Path) -> crate::Result<()> {
115    let registry = create_registry(path)?;
116    let datasets = registry.list()?;
117
118    if datasets.is_empty() {
119        println!("No datasets in registry.");
120        return Ok(());
121    }
122
123    println!("Datasets in registry:\n");
124    println!(
125        "{:<25} {:<12} {:<10} {:<15} DESCRIPTION",
126        "NAME", "LATEST", "VERSIONS", "ROWS"
127    );
128    println!("{}", "-".repeat(80));
129
130    for ds in datasets {
131        let desc = if ds.metadata.description.len() > 30 {
132            format!("{}...", &ds.metadata.description[..27])
133        } else {
134            ds.metadata.description.clone()
135        };
136        println!(
137            "{:<25} {:<12} {:<10} {:<15} {}",
138            ds.name,
139            ds.latest,
140            ds.versions.len(),
141            ds.num_rows,
142            desc
143        );
144    }
145
146    Ok(())
147}
148
149/// Push (publish) a dataset to the registry.
150#[allow(clippy::too_many_arguments)]
151pub(crate) fn cmd_registry_push(
152    input: &Path,
153    name: &str,
154    version: &str,
155    description: &str,
156    license: &str,
157    tags: &str,
158    registry_path: &Path,
159) -> crate::Result<()> {
160    let registry = create_registry(registry_path)?;
161
162    // Initialize if needed
163    registry.init()?;
164
165    // Load the dataset
166    let dataset = load_dataset(input)?;
167
168    // Parse tags
169    let tag_list: Vec<String> = if tags.is_empty() {
170        Vec::new()
171    } else {
172        tags.split(',').map(|s| s.trim().to_string()).collect()
173    };
174
175    // Create metadata
176    let metadata = DatasetMetadata {
177        description: description.to_string(),
178        license: license.to_string(),
179        tags: tag_list,
180        source: Some(input.display().to_string()),
181        citation: None,
182        sha256: None, // Computed during save, not at publish time
183    };
184
185    // Publish
186    registry.publish(name, version, &dataset, metadata)?;
187
188    println!(
189        "Published {}@{} ({} rows) to registry",
190        name,
191        version,
192        dataset.len()
193    );
194
195    Ok(())
196}
197
198/// Pull (download) a dataset from the registry.
199pub(crate) fn cmd_registry_pull(
200    name: &str,
201    output: &Path,
202    version: Option<&str>,
203    registry_path: &Path,
204) -> crate::Result<()> {
205    let registry = create_registry(registry_path)?;
206
207    // Pull the dataset
208    let dataset = registry.pull(name, version)?;
209
210    // Save to output
211    dataset.to_parquet(output)?;
212
213    let ver = version.unwrap_or("latest");
214    println!(
215        "Pulled {}@{} ({} rows) to {}",
216        name,
217        ver,
218        dataset.len(),
219        output.display()
220    );
221
222    Ok(())
223}
224
225/// Search datasets by name or description.
226pub(crate) fn cmd_registry_search(query: &str, path: &Path) -> crate::Result<()> {
227    let registry = create_registry(path)?;
228    let results = registry.search(query)?;
229
230    if results.is_empty() {
231        println!("No datasets found matching '{}'", query);
232        return Ok(());
233    }
234
235    println!("Search results for '{}':\n", query);
236    println!("{:<25} {:<12} {:<10} DESCRIPTION", "NAME", "LATEST", "ROWS");
237    println!("{}", "-".repeat(70));
238
239    for ds in results {
240        let desc = if ds.metadata.description.len() > 30 {
241            format!("{}...", &ds.metadata.description[..27])
242        } else {
243            ds.metadata.description.clone()
244        };
245        println!(
246            "{:<25} {:<12} {:<10} {}",
247            ds.name, ds.latest, ds.num_rows, desc
248        );
249    }
250
251    Ok(())
252}
253
254/// Show detailed info about a specific dataset.
255pub(crate) fn cmd_registry_show_info(name: &str, path: &Path) -> crate::Result<()> {
256    let registry = create_registry(path)?;
257    let info = registry.get_info(name)?;
258
259    println!("Dataset: {}", info.name);
260    println!("Latest: {}", info.latest);
261    println!("Versions: {}", info.versions.join(", "));
262    println!("Rows: {}", info.num_rows);
263    println!("Size: {} bytes", info.size_bytes);
264    println!();
265    println!("Description: {}", info.metadata.description);
266    println!("License: {}", info.metadata.license);
267    println!("Tags: {}", info.metadata.tags.join(", "));
268
269    if let Some(source) = &info.metadata.source {
270        println!("Source: {}", source);
271    }
272    if let Some(citation) = &info.metadata.citation {
273        println!("Citation: {}", citation);
274    }
275
276    println!();
277    println!("Schema:");
278    if let Some(fields) = info.schema.get("fields").and_then(|f| f.as_array()) {
279        for field in fields {
280            let name = field.get("name").and_then(|n| n.as_str()).unwrap_or("?");
281            let dtype = field
282                .get("data_type")
283                .and_then(|d| d.as_str())
284                .unwrap_or("?");
285            let nullable = field
286                .get("nullable")
287                .and_then(serde_json::Value::as_bool)
288                .unwrap_or(true);
289            let null_str = if nullable { "nullable" } else { "not null" };
290            println!("  - {} ({}) [{}]", name, dtype, null_str);
291        }
292    }
293
294    Ok(())
295}
296
297/// Delete a dataset version from the registry.
298pub(crate) fn cmd_registry_delete(name: &str, version: &str, path: &Path) -> crate::Result<()> {
299    let registry = create_registry(path)?;
300    registry.delete(name, version)?;
301    println!("Deleted {}@{} from registry", name, version);
302    Ok(())
303}
304
// Tests for the registry CLI commands. Setup and success-path steps use
// `.expect(..)` so a failing test prints the underlying error instead of a
// bare panic message or `assertion failed: result.is_ok()`.
#[cfg(test)]
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::uninlined_format_args,
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::redundant_clone,
    clippy::cast_lossless,
    clippy::redundant_closure_for_method_calls,
    clippy::too_many_lines,
    clippy::float_cmp,
    clippy::similar_names,
    clippy::needless_late_init,
    clippy::redundant_pattern_matching
)]
mod tests {
    use std::sync::Arc;

    use arrow::{
        array::{Int32Array, StringArray},
        datatypes::{DataType, Field, Schema},
    };

    use super::*;
    use crate::ArrowDataset;

    /// Create a scratch directory; the returned guard must be kept alive for
    /// as long as the test uses paths inside it.
    fn scratch_dir() -> tempfile::TempDir {
        tempfile::tempdir().expect("Should create temp dir")
    }

    /// Write a parquet file at `path` with `rows` rows and two non-nullable
    /// columns: `id` (0..rows) and `name` (`item_<id>`).
    fn create_test_parquet(path: &Path, rows: usize) {
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("name", DataType::Utf8, false),
        ]));

        let row_count = i32::try_from(rows).expect("Row count should fit in i32");
        let ids: Vec<i32> = (0..row_count).collect();
        let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();

        let batch = arrow::array::RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int32Array::from(ids)),
                Arc::new(StringArray::from(names)),
            ],
        )
        .expect("Should create batch");

        let dataset = ArrowDataset::from_batch(batch).expect("Should create dataset");

        dataset.to_parquet(path).expect("Should write parquet");
    }

    #[test]
    fn test_cmd_registry_init() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");

        cmd_registry_init(&registry_path).expect("Should init");
        assert!(registry_path.exists());
    }

    #[test]
    fn test_cmd_registry_list_empty() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");

        // Init first
        cmd_registry_init(&registry_path).expect("Should init");

        cmd_registry_list(&registry_path).expect("Should list empty registry");
    }

    #[test]
    fn test_cmd_registry_push_and_pull() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input = temp_dir.path().join("data.parquet");
        let output = temp_dir.path().join("pulled.parquet");

        // Create test data
        create_test_parquet(&input, 25);

        // Push
        cmd_registry_push(
            &input,
            "test-dataset",
            "1.0.0",
            "A test dataset",
            "MIT",
            "test,example",
            &registry_path,
        )
        .expect("Should push");

        // List should show the dataset
        cmd_registry_list(&registry_path).expect("Should list");

        // Pull
        cmd_registry_pull("test-dataset", &output, Some("1.0.0"), &registry_path)
            .expect("Should pull");
        assert!(output.exists());

        // Verify data
        let original = ArrowDataset::from_parquet(&input).expect("Should load original");
        let pulled = ArrowDataset::from_parquet(&output).expect("Should load pulled");
        assert_eq!(original.len(), pulled.len());
    }

    #[test]
    fn test_cmd_registry_search() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input = temp_dir.path().join("data.parquet");

        create_test_parquet(&input, 10);

        // Push with description
        cmd_registry_push(
            &input,
            "ml-dataset",
            "1.0.0",
            "Machine learning training data",
            "Apache-2.0",
            "ml,training",
            &registry_path,
        )
        .expect("Should push");

        // Search by name
        cmd_registry_search("ml", &registry_path).expect("Should search by name");

        // Search by description
        cmd_registry_search("machine", &registry_path).expect("Should search by description");
    }

    #[test]
    fn test_cmd_registry_show_info() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input = temp_dir.path().join("data.parquet");

        create_test_parquet(&input, 10);

        cmd_registry_push(
            &input,
            "info-test",
            "1.0.0",
            "Test description",
            "MIT",
            "test",
            &registry_path,
        )
        .expect("Should push");

        cmd_registry_show_info("info-test", &registry_path).expect("Should show info");
    }

    #[test]
    fn test_cmd_registry_delete() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input = temp_dir.path().join("data.parquet");

        create_test_parquet(&input, 10);

        // Push
        cmd_registry_push(
            &input,
            "delete-test",
            "1.0.0",
            "Will be deleted",
            "",
            "",
            &registry_path,
        )
        .expect("Should push");

        // Delete
        cmd_registry_delete("delete-test", "1.0.0", &registry_path).expect("Should delete");

        // Should no longer exist
        let result = cmd_registry_show_info("delete-test", &registry_path);
        assert!(result.is_err());
    }

    #[test]
    fn test_cmd_registry_pull_latest() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input1 = temp_dir.path().join("v1.parquet");
        let input2 = temp_dir.path().join("v2.parquet");
        let output = temp_dir.path().join("pulled.parquet");

        create_test_parquet(&input1, 10);
        create_test_parquet(&input2, 20);

        // Push v1
        cmd_registry_push(&input1, "versioned", "1.0.0", "V1", "", "", &registry_path)
            .expect("Should push v1");

        // Push v2
        cmd_registry_push(&input2, "versioned", "2.0.0", "V2", "", "", &registry_path)
            .expect("Should push v2");

        // Pull latest (no version specified)
        cmd_registry_pull("versioned", &output, None, &registry_path)
            .expect("Should pull latest");

        // Should be v2 (20 rows)
        let pulled = ArrowDataset::from_parquet(&output).expect("Should load");
        assert_eq!(pulled.len(), 20);
    }

    #[test]
    fn test_cmd_registry_search_no_results() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");

        // Init registry
        cmd_registry_init(&registry_path).expect("Should init");

        // Search for something that doesn't exist
        cmd_registry_search("nonexistent-dataset-xyz", &registry_path)
            .expect("Should search without results");
    }

    #[test]
    fn test_cmd_registry_push_with_long_description() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input = temp_dir.path().join("data.parquet");

        create_test_parquet(&input, 10);

        // Push with a very long description
        let long_desc = "This is a very long description that exceeds thirty characters and will be truncated in the list view";
        cmd_registry_push(
            &input,
            "long-desc-test",
            "1.0.0",
            long_desc,
            "MIT",
            "",
            &registry_path,
        )
        .expect("Should push");

        // List should truncate the description
        cmd_registry_list(&registry_path).expect("Should list");
    }

    #[test]
    fn test_cmd_registry_show_info_with_all_metadata() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let input = temp_dir.path().join("data.parquet");

        create_test_parquet(&input, 10);

        // Push with all metadata
        cmd_registry_push(
            &input,
            "full-metadata",
            "1.0.0",
            "Full metadata test",
            "Apache-2.0",
            "test,metadata,full",
            &registry_path,
        )
        .expect("Should push");

        cmd_registry_show_info("full-metadata", &registry_path).expect("Should show info");
    }

    #[test]
    fn test_create_registry_new_directory() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("new_registry_dir");

        // Directory doesn't exist yet
        assert!(!registry_path.exists());

        let result = create_registry(&registry_path);
        assert!(result.is_ok());

        // Directory should now exist
        assert!(registry_path.exists());
    }

    #[test]
    fn test_cmd_registry_delete_nonexistent() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");

        // Init registry
        cmd_registry_init(&registry_path).expect("Should init");

        // Try to delete something that doesn't exist
        let result = cmd_registry_delete("nonexistent", "1.0.0", &registry_path);
        assert!(result.is_err());
    }

    #[test]
    fn test_cmd_registry_pull_nonexistent() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let output = temp_dir.path().join("output.parquet");

        // Init registry
        cmd_registry_init(&registry_path).expect("Should init");

        // Try to pull something that doesn't exist
        let result = cmd_registry_pull("nonexistent", &output, None, &registry_path);
        assert!(result.is_err());
    }

    #[test]
    fn test_cmd_registry_search_with_data() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let data_path = temp_dir.path().join("data.parquet");

        create_test_parquet(&data_path, 20);

        // Push a dataset first
        cmd_registry_push(
            &data_path,
            "searchable-data",
            "1.0.0",
            "Dataset for search test",
            "MIT",
            "search,test",
            &registry_path,
        )
        .expect("Should push");

        cmd_registry_search("search", &registry_path).expect("Should search");
    }

    #[test]
    fn test_cmd_registry_search_empty_results() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");

        cmd_registry_init(&registry_path).expect("Should init");

        cmd_registry_search("nonexistent", &registry_path)
            .expect("Should search without results");
    }

    #[test]
    fn test_cmd_registry_show_info_basic() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let data_path = temp_dir.path().join("data.parquet");

        create_test_parquet(&data_path, 20);

        cmd_registry_push(
            &data_path,
            "info-dataset",
            "1.0.0",
            "Dataset for info test",
            "Apache-2.0",
            "info,test",
            &registry_path,
        )
        .expect("Should push");

        cmd_registry_show_info("info-dataset", &registry_path).expect("Should show info");
    }

    #[test]
    fn test_cmd_registry_show_info_not_found() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");

        cmd_registry_init(&registry_path).expect("Should init");

        let result = cmd_registry_show_info("nonexistent", &registry_path);
        assert!(result.is_err());
    }

    #[test]
    fn test_cmd_registry_delete_existing() {
        let temp_dir = scratch_dir();
        let registry_path = temp_dir.path().join("registry");
        let data_path = temp_dir.path().join("data.parquet");

        create_test_parquet(&data_path, 20);

        cmd_registry_push(
            &data_path,
            "delete-test",
            "1.0.0",
            "Dataset to delete",
            "MIT",
            "delete,test",
            &registry_path,
        )
        .expect("Should push");

        cmd_registry_delete("delete-test", "1.0.0", &registry_path).expect("Should delete");
    }
}