1mod index;
8
9pub use index::{DatasetInfo, DatasetMetadata, RegistryIndex};
10
11use crate::{
12 backend::StorageBackend,
13 dataset::{ArrowDataset, Dataset},
14 error::{Error, Result},
15};
16
17pub struct Registry {
31 backend: Box<dyn StorageBackend>,
32 index_path: String,
33}
34
35impl Registry {
36 pub fn new(backend: Box<dyn StorageBackend>) -> Self {
38 Self {
39 backend,
40 index_path: "registry-index.json".to_string(),
41 }
42 }
43
44 pub fn with_index_path(
46 backend: Box<dyn StorageBackend>,
47 index_path: impl Into<String>,
48 ) -> Self {
49 Self {
50 backend,
51 index_path: index_path.into(),
52 }
53 }
54
55 pub fn init(&self) -> Result<()> {
62 if self.backend.exists(&self.index_path)? {
63 return Ok(());
64 }
65
66 let index = RegistryIndex::new();
67 self.save_index(&index)
68 }
69
70 pub fn load_index(&self) -> Result<RegistryIndex> {
76 let data = self.backend.get(&self.index_path)?;
77 let index: RegistryIndex = serde_json::from_slice(&data)
78 .map_err(|e| Error::storage(format!("Failed to parse registry index: {e}")))?;
79 Ok(index)
80 }
81
82 fn save_index(&self, index: &RegistryIndex) -> Result<()> {
84 let data = serde_json::to_vec_pretty(index)
85 .map_err(|e| Error::storage(format!("Failed to serialize registry index: {e}")))?;
86 self.backend.put(&self.index_path, data.into())
87 }
88
89 pub fn list(&self) -> Result<Vec<DatasetInfo>> {
95 let index = self.load_index()?;
96 Ok(index.datasets)
97 }
98
99 pub fn get_info(&self, name: &str) -> Result<DatasetInfo> {
105 let index = self.load_index()?;
106 index
107 .datasets
108 .into_iter()
109 .find(|d| d.name == name)
110 .ok_or_else(|| Error::storage(format!("Dataset '{}' not found in registry", name)))
111 }
112
113 pub fn publish(
127 &self,
128 name: &str,
129 version: &str,
130 dataset: &ArrowDataset,
131 metadata: DatasetMetadata,
132 ) -> Result<()> {
133 if name.is_empty() {
135 return Err(Error::invalid_config("Dataset name cannot be empty"));
136 }
137 if version.is_empty() {
138 return Err(Error::invalid_config("Version cannot be empty"));
139 }
140
141 let data_path = format!("datasets/{}/{}/data.parquet", name, version);
143
144 let temp_dir = std::env::temp_dir();
146 let unique_id = format!("{}_{:?}", std::process::id(), std::thread::current().id());
147 let temp_path = temp_dir.join(format!("alimentar_publish_{}.parquet", unique_id));
148 dataset.to_parquet(&temp_path)?;
149
150 let parquet_data = std::fs::read(&temp_path).map_err(|e| Error::io(e, &temp_path))?;
152 self.backend.put(&data_path, parquet_data.into())?;
153
154 let _ = std::fs::remove_file(&temp_path);
156
157 let mut index = self.load_index().unwrap_or_else(|_| RegistryIndex::new());
159
160 let dataset_info = index.datasets.iter_mut().find(|d| d.name == name);
162
163 let size_bytes = self.backend.size(&data_path)?;
164 let num_rows = dataset.len();
165 let schema = dataset.schema();
166
167 if let Some(info) = dataset_info {
168 if !info.versions.contains(&version.to_string()) {
170 info.versions.push(version.to_string());
171 }
172 info.latest = version.to_string();
173 info.size_bytes = size_bytes;
174 info.num_rows = num_rows;
175 info.metadata = metadata;
176 } else {
177 let schema_json = schema_to_json(&schema);
180
181 index.datasets.push(DatasetInfo {
182 name: name.to_string(),
183 versions: vec![version.to_string()],
184 latest: version.to_string(),
185 size_bytes,
186 num_rows,
187 schema: schema_json,
188 metadata,
189 });
190 }
191
192 self.save_index(&index)
193 }
194
195 pub fn pull(&self, name: &str, version: Option<&str>) -> Result<ArrowDataset> {
206 let info = self.get_info(name)?;
207
208 let version = version.unwrap_or(&info.latest);
209
210 if !info.versions.contains(&version.to_string()) {
211 return Err(Error::storage(format!(
212 "Version '{}' not found for dataset '{}'. Available: {:?}",
213 version, name, info.versions
214 )));
215 }
216
217 let data_path = format!("datasets/{}/{}/data.parquet", name, version);
218
219 let data = self.backend.get(&data_path)?;
221
222 let temp_dir = std::env::temp_dir();
223 let unique_id = format!("{}_{:?}", std::process::id(), std::thread::current().id());
224 let temp_path = temp_dir.join(format!("alimentar_pull_{}.parquet", unique_id));
225
226 std::fs::write(&temp_path, &data).map_err(|e| Error::io(e, &temp_path))?;
227
228 let dataset = ArrowDataset::from_parquet(&temp_path)?;
229
230 let _ = std::fs::remove_file(&temp_path);
232
233 Ok(dataset)
234 }
235
236 pub fn search(&self, query: &str) -> Result<Vec<DatasetInfo>> {
242 let index = self.load_index()?;
243 let query_lower = query.to_lowercase();
244
245 let results: Vec<DatasetInfo> = index
246 .datasets
247 .into_iter()
248 .filter(|d| {
249 d.name.to_lowercase().contains(&query_lower)
250 || d.metadata.description.to_lowercase().contains(&query_lower)
251 })
252 .collect();
253
254 Ok(results)
255 }
256
257 pub fn search_tags(&self, tags: &[&str]) -> Result<Vec<DatasetInfo>> {
263 let index = self.load_index()?;
264
265 let results: Vec<DatasetInfo> = index
266 .datasets
267 .into_iter()
268 .filter(|d| {
269 tags.iter()
270 .any(|&tag| d.metadata.tags.iter().any(|t| t == tag))
271 })
272 .collect();
273
274 Ok(results)
275 }
276
277 pub fn delete(&self, name: &str, version: &str) -> Result<()> {
283 let mut index = self.load_index()?;
284
285 let dataset_idx = index
286 .datasets
287 .iter()
288 .position(|d| d.name == name)
289 .ok_or_else(|| Error::storage(format!("Dataset '{}' not found", name)))?;
290
291 let info = &mut index.datasets[dataset_idx];
292
293 if !info.versions.contains(&version.to_string()) {
294 return Err(Error::storage(format!(
295 "Version '{}' not found for dataset '{}'",
296 version, name
297 )));
298 }
299
300 let data_path = format!("datasets/{}/{}/data.parquet", name, version);
302 self.backend.delete(&data_path)?;
303
304 info.versions.retain(|v| v != version);
306
307 if info.versions.is_empty() {
308 index.datasets.remove(dataset_idx);
310 } else if info.latest == version {
311 info.latest = info.versions.last().cloned().unwrap_or_default();
313 }
314
315 self.save_index(&index)
316 }
317}
318
319impl std::fmt::Debug for Registry {
320 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
321 f.debug_struct("Registry")
322 .field("index_path", &self.index_path)
323 .finish_non_exhaustive()
324 }
325}
326
327fn schema_to_json(schema: &arrow::datatypes::SchemaRef) -> serde_json::Value {
329 let fields: Vec<serde_json::Value> = schema
330 .fields()
331 .iter()
332 .map(|field| {
333 serde_json::json!({
334 "name": field.name(),
335 "data_type": format!("{:?}", field.data_type()),
336 "nullable": field.is_nullable(),
337 })
338 })
339 .collect();
340
341 serde_json::json!({
342 "fields": fields,
343 })
344}
345
346#[cfg(test)]
347#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
348mod tests {
349 use std::sync::Arc;
350
351 use arrow::{
352 array::{Int32Array, StringArray},
353 datatypes::{DataType, Field, Schema},
354 record_batch::RecordBatch,
355 };
356
357 use super::*;
358 use crate::{backend::MemoryBackend, Dataset};
359
360 fn create_test_dataset(rows: usize) -> ArrowDataset {
361 let schema = Arc::new(Schema::new(vec![
362 Field::new("id", DataType::Int32, false),
363 Field::new("name", DataType::Utf8, false),
364 ]));
365
366 let ids: Vec<i32> = (0..rows as i32).collect();
367 let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();
368
369 let batch = RecordBatch::try_new(
370 schema,
371 vec![
372 Arc::new(Int32Array::from(ids)),
373 Arc::new(StringArray::from(names)),
374 ],
375 )
376 .ok()
377 .unwrap_or_else(|| panic!("Should create batch"));
378
379 ArrowDataset::from_batch(batch)
380 .ok()
381 .unwrap_or_else(|| panic!("Should create dataset"))
382 }
383
384 fn create_test_metadata() -> DatasetMetadata {
385 DatasetMetadata {
386 description: "Test dataset".to_string(),
387 license: "MIT".to_string(),
388 tags: vec!["test".to_string(), "example".to_string()],
389 source: Some("unit test".to_string()),
390 citation: None,
391 sha256: None,
392 }
393 }
394
395 #[test]
396 fn test_registry_init() {
397 let backend = MemoryBackend::new();
398 let registry = Registry::new(Box::new(backend));
399
400 let result = registry.init();
401 assert!(result.is_ok());
402
403 let result = registry.init();
405 assert!(result.is_ok());
406 }
407
408 #[test]
409 fn test_registry_publish_and_pull() {
410 let backend = MemoryBackend::new();
411 let registry = Registry::new(Box::new(backend));
412 registry
413 .init()
414 .ok()
415 .unwrap_or_else(|| panic!("Should init"));
416
417 let dataset = create_test_dataset(10);
418 let metadata = create_test_metadata();
419
420 let result = registry.publish("test-dataset", "1.0.0", &dataset, metadata);
422 assert!(result.is_ok());
423
424 let pulled = registry.pull("test-dataset", Some("1.0.0"));
426 assert!(pulled.is_ok());
427 let pulled = pulled.ok().unwrap_or_else(|| panic!("Should pull"));
428 assert_eq!(pulled.len(), 10);
429 }
430
431 #[test]
432 fn test_registry_list() {
433 let backend = MemoryBackend::new();
434 let registry = Registry::new(Box::new(backend));
435 registry
436 .init()
437 .ok()
438 .unwrap_or_else(|| panic!("Should init"));
439
440 let dataset = create_test_dataset(5);
441 let metadata = create_test_metadata();
442
443 registry
444 .publish("dataset-a", "1.0.0", &dataset, metadata.clone())
445 .ok()
446 .unwrap_or_else(|| panic!("Should publish"));
447 registry
448 .publish("dataset-b", "1.0.0", &dataset, metadata)
449 .ok()
450 .unwrap_or_else(|| panic!("Should publish"));
451
452 let list = registry.list();
453 assert!(list.is_ok());
454 let list = list.ok().unwrap_or_else(|| panic!("Should list"));
455 assert_eq!(list.len(), 2);
456 }
457
458 #[test]
459 fn test_registry_search() {
460 let backend = MemoryBackend::new();
461 let registry = Registry::new(Box::new(backend));
462 registry
463 .init()
464 .ok()
465 .unwrap_or_else(|| panic!("Should init"));
466
467 let dataset = create_test_dataset(5);
468
469 let metadata1 = DatasetMetadata {
470 description: "Machine learning dataset".to_string(),
471 license: "MIT".to_string(),
472 tags: vec!["ml".to_string()],
473 source: None,
474 citation: None,
475 sha256: None,
476 };
477
478 let metadata2 = DatasetMetadata {
479 description: "Natural language processing data".to_string(),
480 license: "Apache-2.0".to_string(),
481 tags: vec!["nlp".to_string()],
482 source: None,
483 citation: None,
484 sha256: None,
485 };
486
487 registry
488 .publish("ml-data", "1.0.0", &dataset, metadata1)
489 .ok()
490 .unwrap_or_else(|| panic!("Should publish"));
491 registry
492 .publish("nlp-data", "1.0.0", &dataset, metadata2)
493 .ok()
494 .unwrap_or_else(|| panic!("Should publish"));
495
496 let results = registry.search("ml");
498 assert!(results.is_ok());
499 let results = results.ok().unwrap_or_else(|| panic!("Should search"));
500 assert_eq!(results.len(), 1);
501 assert_eq!(results[0].name, "ml-data");
502
503 let results = registry.search("natural language");
505 assert!(results.is_ok());
506 let results = results.ok().unwrap_or_else(|| panic!("Should search"));
507 assert_eq!(results.len(), 1);
508 assert_eq!(results[0].name, "nlp-data");
509 }
510
511 #[test]
512 fn test_registry_search_tags() {
513 let backend = MemoryBackend::new();
514 let registry = Registry::new(Box::new(backend));
515 registry
516 .init()
517 .ok()
518 .unwrap_or_else(|| panic!("Should init"));
519
520 let dataset = create_test_dataset(5);
521 let metadata = DatasetMetadata {
522 description: "Test".to_string(),
523 license: "MIT".to_string(),
524 tags: vec!["ml".to_string(), "vision".to_string()],
525 source: None,
526 citation: None,
527 sha256: None,
528 };
529
530 registry
531 .publish("vision-data", "1.0.0", &dataset, metadata)
532 .ok()
533 .unwrap_or_else(|| panic!("Should publish"));
534
535 let results = registry.search_tags(&["vision"]);
536 assert!(results.is_ok());
537 let results = results.ok().unwrap_or_else(|| panic!("Should search"));
538 assert_eq!(results.len(), 1);
539
540 let results = registry.search_tags(&["audio"]);
541 assert!(results.is_ok());
542 let results = results.ok().unwrap_or_else(|| panic!("Should search"));
543 assert!(results.is_empty());
544 }
545
546 #[test]
547 fn test_registry_versioning() {
548 let backend = MemoryBackend::new();
549 let registry = Registry::new(Box::new(backend));
550 registry
551 .init()
552 .ok()
553 .unwrap_or_else(|| panic!("Should init"));
554
555 let dataset_v1 = create_test_dataset(10);
556 let dataset_v2 = create_test_dataset(20);
557 let metadata = create_test_metadata();
558
559 registry
560 .publish("versioned", "1.0.0", &dataset_v1, metadata.clone())
561 .ok()
562 .unwrap_or_else(|| panic!("Should publish v1"));
563 registry
564 .publish("versioned", "2.0.0", &dataset_v2, metadata)
565 .ok()
566 .unwrap_or_else(|| panic!("Should publish v2"));
567
568 let info = registry.get_info("versioned");
569 assert!(info.is_ok());
570 let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
571 assert_eq!(info.versions.len(), 2);
572 assert_eq!(info.latest, "2.0.0");
573
574 let v1 = registry.pull("versioned", Some("1.0.0"));
576 assert!(v1.is_ok());
577 let v1 = v1.ok().unwrap_or_else(|| panic!("Should pull v1"));
578 assert_eq!(v1.len(), 10);
579
580 let latest = registry.pull("versioned", None);
582 assert!(latest.is_ok());
583 let latest = latest.ok().unwrap_or_else(|| panic!("Should pull latest"));
584 assert_eq!(latest.len(), 20);
585 }
586
587 #[test]
588 fn test_registry_delete() {
589 let backend = MemoryBackend::new();
590 let registry = Registry::new(Box::new(backend));
591 registry
592 .init()
593 .ok()
594 .unwrap_or_else(|| panic!("Should init"));
595
596 let dataset = create_test_dataset(5);
597 let metadata = create_test_metadata();
598
599 registry
600 .publish("to-delete", "1.0.0", &dataset, metadata.clone())
601 .ok()
602 .unwrap_or_else(|| panic!("Should publish"));
603 registry
604 .publish("to-delete", "2.0.0", &dataset, metadata)
605 .ok()
606 .unwrap_or_else(|| panic!("Should publish"));
607
608 let result = registry.delete("to-delete", "1.0.0");
610 assert!(result.is_ok());
611
612 let info = registry.get_info("to-delete");
613 assert!(info.is_ok());
614 let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
615 assert_eq!(info.versions.len(), 1);
616 assert!(!info.versions.contains(&"1.0.0".to_string()));
617
618 let result = registry.delete("to-delete", "2.0.0");
620 assert!(result.is_ok());
621
622 let info = registry.get_info("to-delete");
623 assert!(info.is_err());
624 }
625
626 #[test]
627 fn test_registry_empty_name_error() {
628 let backend = MemoryBackend::new();
629 let registry = Registry::new(Box::new(backend));
630 registry
631 .init()
632 .ok()
633 .unwrap_or_else(|| panic!("Should init"));
634
635 let dataset = create_test_dataset(5);
636 let metadata = create_test_metadata();
637
638 let result = registry.publish("", "1.0.0", &dataset, metadata);
639 assert!(result.is_err());
640 }
641
642 #[test]
643 fn test_registry_not_found() {
644 let backend = MemoryBackend::new();
645 let registry = Registry::new(Box::new(backend));
646 registry
647 .init()
648 .ok()
649 .unwrap_or_else(|| panic!("Should init"));
650
651 let result = registry.pull("nonexistent", None);
652 assert!(result.is_err());
653 }
654
655 #[test]
656 fn test_registry_with_index_path() {
657 let backend = MemoryBackend::new();
658 let registry = Registry::with_index_path(Box::new(backend), "custom-index.json");
659 registry
660 .init()
661 .ok()
662 .unwrap_or_else(|| panic!("Should init"));
663
664 let dataset = create_test_dataset(5);
665 let metadata = create_test_metadata();
666
667 let result = registry.publish("test", "1.0.0", &dataset, metadata);
668 assert!(result.is_ok());
669
670 let list = registry.list();
671 assert!(list.is_ok());
672 assert_eq!(list.ok().unwrap_or_else(|| panic!("Should list")).len(), 1);
673 }
674
675 #[test]
676 fn test_registry_debug() {
677 let backend = MemoryBackend::new();
678 let registry = Registry::new(Box::new(backend));
679 let debug = format!("{:?}", registry);
680 assert!(debug.contains("Registry"));
681 assert!(debug.contains("index_path"));
682 }
683
684 #[test]
685 fn test_registry_delete_nonexistent_dataset() {
686 let backend = MemoryBackend::new();
687 let registry = Registry::new(Box::new(backend));
688 registry
689 .init()
690 .ok()
691 .unwrap_or_else(|| panic!("Should init"));
692
693 let result = registry.delete("nonexistent", "1.0.0");
694 assert!(result.is_err());
695 }
696
697 #[test]
698 fn test_registry_delete_nonexistent_version() {
699 let backend = MemoryBackend::new();
700 let registry = Registry::new(Box::new(backend));
701 registry
702 .init()
703 .ok()
704 .unwrap_or_else(|| panic!("Should init"));
705
706 let dataset = create_test_dataset(5);
707 let metadata = create_test_metadata();
708
709 registry
710 .publish("dataset", "1.0.0", &dataset, metadata)
711 .ok()
712 .unwrap_or_else(|| panic!("Should publish"));
713
714 let result = registry.delete("dataset", "2.0.0");
715 assert!(result.is_err());
716 }
717
718 #[test]
719 fn test_registry_delete_latest_updates_correctly() {
720 let backend = MemoryBackend::new();
721 let registry = Registry::new(Box::new(backend));
722 registry
723 .init()
724 .ok()
725 .unwrap_or_else(|| panic!("Should init"));
726
727 let dataset = create_test_dataset(5);
728 let metadata = create_test_metadata();
729
730 registry
731 .publish("multi-version", "1.0.0", &dataset, metadata.clone())
732 .ok()
733 .unwrap_or_else(|| panic!("Should publish"));
734 registry
735 .publish("multi-version", "2.0.0", &dataset, metadata)
736 .ok()
737 .unwrap_or_else(|| panic!("Should publish"));
738
739 let result = registry.delete("multi-version", "2.0.0");
741 assert!(result.is_ok());
742
743 let info = registry.get_info("multi-version");
744 assert!(info.is_ok());
745 let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
746 assert_eq!(info.latest, "1.0.0");
747 }
748
749 #[test]
750 fn test_registry_pull_nonexistent_version() {
751 let backend = MemoryBackend::new();
752 let registry = Registry::new(Box::new(backend));
753 registry
754 .init()
755 .ok()
756 .unwrap_or_else(|| panic!("Should init"));
757
758 let dataset = create_test_dataset(5);
759 let metadata = create_test_metadata();
760
761 registry
762 .publish("versioned-test", "1.0.0", &dataset, metadata)
763 .ok()
764 .unwrap_or_else(|| panic!("Should publish"));
765
766 let result = registry.pull("versioned-test", Some("9.9.9"));
767 assert!(result.is_err());
768 }
769
770 #[test]
771 fn test_registry_empty_version_error() {
772 let backend = MemoryBackend::new();
773 let registry = Registry::new(Box::new(backend));
774 registry
775 .init()
776 .ok()
777 .unwrap_or_else(|| panic!("Should init"));
778
779 let dataset = create_test_dataset(5);
780 let metadata = create_test_metadata();
781
782 let result = registry.publish("test", "", &dataset, metadata);
783 assert!(result.is_err());
784 }
785
786 #[test]
787 fn test_registry_publish_update_existing() {
788 let backend = MemoryBackend::new();
789 let registry = Registry::new(Box::new(backend));
790 registry
791 .init()
792 .ok()
793 .unwrap_or_else(|| panic!("Should init"));
794
795 let dataset1 = create_test_dataset(5);
796 let dataset2 = create_test_dataset(10);
797 let metadata = create_test_metadata();
798
799 registry
801 .publish("update-test", "1.0.0", &dataset1, metadata.clone())
802 .ok()
803 .unwrap_or_else(|| panic!("Should publish"));
804 registry
805 .publish("update-test", "1.0.0", &dataset2, metadata)
806 .ok()
807 .unwrap_or_else(|| panic!("Should publish update"));
808
809 let info = registry.get_info("update-test");
810 assert!(info.is_ok());
811 let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
812 assert_eq!(info.versions.len(), 1);
814 assert_eq!(info.num_rows, 10);
816 }
817
818 #[test]
821 fn test_registry_load_index_without_init() {
822 let backend = MemoryBackend::new();
823 let registry = Registry::new(Box::new(backend));
824
825 let result = registry.load_index();
827 assert!(result.is_err());
828 }
829
830 #[test]
831 fn test_registry_list_empty() {
832 let backend = MemoryBackend::new();
833 let registry = Registry::new(Box::new(backend));
834 registry
835 .init()
836 .ok()
837 .unwrap_or_else(|| panic!("Should init"));
838
839 let list = registry.list();
840 assert!(list.is_ok());
841 assert!(list.ok().unwrap_or_else(|| panic!("list")).is_empty());
842 }
843
844 #[test]
845 fn test_registry_search_no_results() {
846 let backend = MemoryBackend::new();
847 let registry = Registry::new(Box::new(backend));
848 registry
849 .init()
850 .ok()
851 .unwrap_or_else(|| panic!("Should init"));
852
853 let dataset = create_test_dataset(5);
854 let metadata = create_test_metadata();
855
856 registry
857 .publish("my-dataset", "1.0.0", &dataset, metadata)
858 .ok()
859 .unwrap_or_else(|| panic!("Should publish"));
860
861 let results = registry.search("nonexistent");
862 assert!(results.is_ok());
863 assert!(results.ok().unwrap_or_else(|| panic!("search")).is_empty());
864 }
865
866 #[test]
867 fn test_registry_search_case_insensitive() {
868 let backend = MemoryBackend::new();
869 let registry = Registry::new(Box::new(backend));
870 registry
871 .init()
872 .ok()
873 .unwrap_or_else(|| panic!("Should init"));
874
875 let dataset = create_test_dataset(5);
876 let metadata = DatasetMetadata {
877 description: "UPPERCASE DESCRIPTION".to_string(),
878 license: "MIT".to_string(),
879 tags: vec![],
880 source: None,
881 citation: None,
882 sha256: None,
883 };
884
885 registry
886 .publish("TestDataset", "1.0.0", &dataset, metadata)
887 .ok()
888 .unwrap_or_else(|| panic!("Should publish"));
889
890 let results = registry.search("testdataset");
892 assert_eq!(results.ok().unwrap_or_else(|| panic!("search")).len(), 1);
893
894 let results = registry.search("uppercase");
896 assert_eq!(results.ok().unwrap_or_else(|| panic!("search")).len(), 1);
897 }
898
899 #[test]
900 fn test_registry_search_tags_no_match() {
901 let backend = MemoryBackend::new();
902 let registry = Registry::new(Box::new(backend));
903 registry
904 .init()
905 .ok()
906 .unwrap_or_else(|| panic!("Should init"));
907
908 let dataset = create_test_dataset(5);
909 let metadata = DatasetMetadata {
910 description: "Test".to_string(),
911 license: "MIT".to_string(),
912 tags: vec!["ml".to_string(), "vision".to_string()],
913 source: None,
914 citation: None,
915 sha256: None,
916 };
917
918 registry
919 .publish("tagged-data", "1.0.0", &dataset, metadata)
920 .ok()
921 .unwrap_or_else(|| panic!("Should publish"));
922
923 let results = registry.search_tags(&["audio", "nlp"]);
924 assert!(results.is_ok());
925 assert!(results.ok().unwrap_or_else(|| panic!("search")).is_empty());
926 }
927
928 #[test]
929 fn test_registry_search_tags_multiple_matches() {
930 let backend = MemoryBackend::new();
931 let registry = Registry::new(Box::new(backend));
932 registry
933 .init()
934 .ok()
935 .unwrap_or_else(|| panic!("Should init"));
936
937 let dataset = create_test_dataset(5);
938
939 let metadata1 = DatasetMetadata {
940 description: "Dataset 1".to_string(),
941 license: "MIT".to_string(),
942 tags: vec!["ml".to_string()],
943 source: None,
944 citation: None,
945 sha256: None,
946 };
947
948 let metadata2 = DatasetMetadata {
949 description: "Dataset 2".to_string(),
950 license: "MIT".to_string(),
951 tags: vec!["ml".to_string(), "vision".to_string()],
952 source: None,
953 citation: None,
954 sha256: None,
955 };
956
957 registry
958 .publish("data-a", "1.0.0", &dataset, metadata1)
959 .ok()
960 .unwrap_or_else(|| panic!("publish a"));
961 registry
962 .publish("data-b", "1.0.0", &dataset, metadata2)
963 .ok()
964 .unwrap_or_else(|| panic!("publish b"));
965
966 let results = registry.search_tags(&["ml"]);
968 assert_eq!(results.ok().unwrap_or_else(|| panic!("search")).len(), 2);
969 }
970
971 #[test]
972 fn test_registry_get_info_schema_contains_fields() {
973 let backend = MemoryBackend::new();
974 let registry = Registry::new(Box::new(backend));
975 registry
976 .init()
977 .ok()
978 .unwrap_or_else(|| panic!("Should init"));
979
980 let dataset = create_test_dataset(10);
981 let metadata = create_test_metadata();
982
983 registry
984 .publish("schema-test", "1.0.0", &dataset, metadata)
985 .ok()
986 .unwrap_or_else(|| panic!("Should publish"));
987
988 let info = registry.get_info("schema-test");
989 assert!(info.is_ok());
990 let info = info.ok().unwrap_or_else(|| panic!("info"));
991
992 assert!(info.schema.is_object());
994 let fields = info.schema.get("fields");
995 assert!(fields.is_some());
996 }
997
998 #[test]
999 fn test_registry_multiple_versions_ordering() {
1000 let backend = MemoryBackend::new();
1001 let registry = Registry::new(Box::new(backend));
1002 registry
1003 .init()
1004 .ok()
1005 .unwrap_or_else(|| panic!("Should init"));
1006
1007 let dataset = create_test_dataset(5);
1008 let metadata = create_test_metadata();
1009
1010 registry
1012 .publish("ordered", "2.0.0", &dataset, metadata.clone())
1013 .ok()
1014 .unwrap_or_else(|| panic!("publish 2.0"));
1015 registry
1016 .publish("ordered", "1.0.0", &dataset, metadata.clone())
1017 .ok()
1018 .unwrap_or_else(|| panic!("publish 1.0"));
1019 registry
1020 .publish("ordered", "3.0.0", &dataset, metadata)
1021 .ok()
1022 .unwrap_or_else(|| panic!("publish 3.0"));
1023
1024 let info = registry.get_info("ordered");
1025 assert!(info.is_ok());
1026 let info = info.ok().unwrap_or_else(|| panic!("info"));
1027
1028 assert_eq!(info.latest, "3.0.0");
1030 assert_eq!(info.versions.len(), 3);
1031 }
1032
1033 #[test]
1034 fn test_registry_delete_middle_version() {
1035 let backend = MemoryBackend::new();
1036 let registry = Registry::new(Box::new(backend));
1037 registry
1038 .init()
1039 .ok()
1040 .unwrap_or_else(|| panic!("Should init"));
1041
1042 let dataset = create_test_dataset(5);
1043 let metadata = create_test_metadata();
1044
1045 registry
1046 .publish("three-versions", "1.0.0", &dataset, metadata.clone())
1047 .ok()
1048 .unwrap_or_else(|| panic!("publish 1.0"));
1049 registry
1050 .publish("three-versions", "2.0.0", &dataset, metadata.clone())
1051 .ok()
1052 .unwrap_or_else(|| panic!("publish 2.0"));
1053 registry
1054 .publish("three-versions", "3.0.0", &dataset, metadata)
1055 .ok()
1056 .unwrap_or_else(|| panic!("publish 3.0"));
1057
1058 registry
1060 .delete("three-versions", "2.0.0")
1061 .ok()
1062 .unwrap_or_else(|| panic!("delete"));
1063
1064 let info = registry.get_info("three-versions");
1065 assert!(info.is_ok());
1066 let info = info.ok().unwrap_or_else(|| panic!("info"));
1067
1068 assert_eq!(info.versions.len(), 2);
1069 assert!(!info.versions.contains(&"2.0.0".to_string()));
1070 assert_eq!(info.latest, "3.0.0");
1072 }
1073
1074 #[test]
1075 fn test_registry_pull_specific_version() {
1076 let backend = MemoryBackend::new();
1077 let registry = Registry::new(Box::new(backend));
1078 registry
1079 .init()
1080 .ok()
1081 .unwrap_or_else(|| panic!("Should init"));
1082
1083 let dataset_v1 = create_test_dataset(5);
1084 let dataset_v2 = create_test_dataset(15);
1085 let metadata = create_test_metadata();
1086
1087 registry
1088 .publish("versioned", "1.0.0", &dataset_v1, metadata.clone())
1089 .ok()
1090 .unwrap_or_else(|| panic!("publish v1"));
1091 registry
1092 .publish("versioned", "2.0.0", &dataset_v2, metadata)
1093 .ok()
1094 .unwrap_or_else(|| panic!("publish v2"));
1095
1096 let pulled = registry.pull("versioned", Some("1.0.0"));
1098 assert!(pulled.is_ok());
1099 let pulled = pulled.ok().unwrap_or_else(|| panic!("pulled"));
1100 assert_eq!(pulled.len(), 5);
1101 }
1102
1103 #[test]
1104 fn test_registry_init_idempotent() {
1105 let backend = MemoryBackend::new();
1106 let registry = Registry::new(Box::new(backend));
1107
1108 for _ in 0..5 {
1110 let result = registry.init();
1111 assert!(result.is_ok());
1112 }
1113
1114 let list = registry.list();
1116 assert!(list.is_ok());
1117 }
1118
1119 #[test]
1120 fn test_registry_metadata_preserved() {
1121 let backend = MemoryBackend::new();
1122 let registry = Registry::new(Box::new(backend));
1123 registry
1124 .init()
1125 .ok()
1126 .unwrap_or_else(|| panic!("Should init"));
1127
1128 let dataset = create_test_dataset(10);
1129 let metadata = DatasetMetadata {
1130 description: "Full metadata test".to_string(),
1131 license: "Apache-2.0".to_string(),
1132 tags: vec!["tag1".to_string(), "tag2".to_string(), "tag3".to_string()],
1133 source: Some("http://example.com/source".to_string()),
1134 citation: Some("Citation text here".to_string()),
1135 sha256: Some("abc123".to_string()),
1136 };
1137
1138 registry
1139 .publish("metadata-test", "1.0.0", &dataset, metadata.clone())
1140 .ok()
1141 .unwrap_or_else(|| panic!("publish"));
1142
1143 let info = registry.get_info("metadata-test");
1144 assert!(info.is_ok());
1145 let info = info.ok().unwrap_or_else(|| panic!("info"));
1146
1147 assert_eq!(info.metadata.description, metadata.description);
1148 assert_eq!(info.metadata.license, metadata.license);
1149 assert_eq!(info.metadata.tags, metadata.tags);
1150 assert_eq!(info.metadata.source, metadata.source);
1151 assert_eq!(info.metadata.citation, metadata.citation);
1152 assert_eq!(info.metadata.sha256, metadata.sha256);
1153 }
1154
1155 #[test]
1156 fn test_registry_size_bytes_updated() {
1157 let backend = MemoryBackend::new();
1158 let registry = Registry::new(Box::new(backend));
1159 registry
1160 .init()
1161 .ok()
1162 .unwrap_or_else(|| panic!("Should init"));
1163
1164 let dataset = create_test_dataset(100);
1165 let metadata = create_test_metadata();
1166
1167 registry
1168 .publish("size-test", "1.0.0", &dataset, metadata)
1169 .ok()
1170 .unwrap_or_else(|| panic!("publish"));
1171
1172 let info = registry.get_info("size-test");
1173 assert!(info.is_ok());
1174 let info = info.ok().unwrap_or_else(|| panic!("info"));
1175
1176 assert!(info.size_bytes > 0);
1178 }
1179
1180 #[test]
1181 fn test_schema_to_json_function() {
1182 let dataset = create_test_dataset(5);
1183 let schema = dataset.schema();
1184 let json = schema_to_json(&schema);
1185
1186 assert!(json.is_object());
1187 let fields = json.get("fields").and_then(|f| f.as_array());
1188 assert!(fields.is_some());
1189
1190 let fields = fields.unwrap_or_else(|| panic!("fields"));
1191 assert_eq!(fields.len(), 2); let first = &fields[0];
1195 assert_eq!(
1196 first
1197 .get("name")
1198 .and_then(|n: &serde_json::Value| n.as_str()),
1199 Some("id")
1200 );
1201 assert!(first.get("data_type").is_some());
1202 assert!(first.get("nullable").is_some());
1203 }
1204
1205 #[test]
1206 fn test_registry_concurrent_publish_different_datasets() {
1207 let backend = MemoryBackend::new();
1208 let registry = Registry::new(Box::new(backend));
1209 registry
1210 .init()
1211 .ok()
1212 .unwrap_or_else(|| panic!("Should init"));
1213
1214 let dataset = create_test_dataset(5);
1215 let metadata = create_test_metadata();
1216
1217 for i in 0..5 {
1219 registry
1220 .publish(
1221 &format!("dataset-{}", i),
1222 "1.0.0",
1223 &dataset,
1224 metadata.clone(),
1225 )
1226 .ok()
1227 .unwrap_or_else(|| panic!("publish"));
1228 }
1229
1230 let list = registry.list();
1231 assert!(list.is_ok());
1232 assert_eq!(list.ok().unwrap_or_else(|| panic!("list")).len(), 5);
1233 }
1234
/// Publishes three datasets, two of which have "learning" inside their name,
/// and expects `search("learning")` to return exactly those two.
#[test]
fn test_registry_search_partial_name_match() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` is called on the `Result` directly so a failure prints the
    // registry error instead of discarding it via `.ok()`.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);
    let metadata = create_test_metadata();

    for name in [
        "machine-learning-data",
        "deep-learning-data",
        "statistics-data",
    ] {
        registry
            .publish(name, "1.0.0", &dataset, metadata.clone())
            .unwrap();
    }

    let results = registry.search("learning").unwrap();
    assert_eq!(results.len(), 2);
    // Counting alone would also pass if the wrong pair came back; make sure
    // every hit really is one of the "learning" datasets.
    assert!(results.iter().all(|info| info.name.contains("learning")));
}
1263
/// Publishes two datasets whose metadata differs only in the description and
/// expects `search` to find each one by a word taken from its description.
#[test]
fn test_registry_search_by_description() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);

    // Both fixtures share every field except the description.
    let described = |description: &str| DatasetMetadata {
        description: description.to_string(),
        license: "MIT".to_string(),
        tags: vec![],
        source: None,
        citation: None,
        sha256: None,
    };

    registry
        .publish(
            "images",
            "1.0.0",
            &dataset,
            described("Image classification dataset"),
        )
        .unwrap();
    registry
        .publish(
            "text",
            "1.0.0",
            &dataset,
            described("Text sentiment analysis"),
        )
        .unwrap();

    let image_results = registry.search("classification").unwrap();
    assert_eq!(image_results.len(), 1);
    assert_eq!(image_results[0].name, "images");

    let text_results = registry.search("sentiment").unwrap();
    assert_eq!(text_results.len(), 1);
    assert_eq!(text_results[0].name, "text");
}
1308
/// Publishes one dataset carrying three tags and expects `search_tags` to
/// return it for each tag on its own as well as for a two-tag query.
#[test]
fn test_registry_search_tags_multiple_tags() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);

    let metadata = DatasetMetadata {
        description: "Multi-tag dataset".to_string(),
        license: "MIT".to_string(),
        tags: vec!["tag1".to_string(), "tag2".to_string(), "tag3".to_string()],
        source: None,
        citation: None,
        sha256: None,
    };

    registry
        .publish("tagged", "1.0.0", &dataset, metadata)
        .unwrap();

    for tag in ["tag1", "tag2", "tag3"] {
        let hits = registry.search_tags(&[tag]).unwrap();
        assert_eq!(hits.len(), 1, "tag `{tag}` should match the dataset");
    }

    assert_eq!(registry.search_tags(&["tag1", "tag2"]).unwrap().len(), 1);
}
1340
/// Publishes versions 1.0.0, 2.0.0 and 3.0.0 of one dataset, deletes 1.0.0,
/// and expects two versions to remain with `latest` still at 3.0.0.
#[test]
fn test_registry_delete_first_of_multiple_versions() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);
    let metadata = create_test_metadata();

    for version in ["1.0.0", "2.0.0", "3.0.0"] {
        registry
            .publish("versioned", version, &dataset, metadata.clone())
            .unwrap();
    }

    registry.delete("versioned", "1.0.0").unwrap();

    let info = registry.get_info("versioned").unwrap();
    assert_eq!(info.versions.len(), 2);
    // Compare against the literal directly; no temporary `String` needed.
    assert!(!info.versions.iter().any(|v| v == "1.0.0"));
    assert_eq!(info.latest, "3.0.0");
}
1372
/// Calls `publish` on a registry whose `init()` was never invoked and expects
/// the publish to succeed anyway.
#[test]
fn test_registry_publish_without_init() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    let dataset = create_test_dataset(5);
    let metadata = create_test_metadata();

    let result = registry.publish("no-init", "1.0.0", &dataset, metadata);
    // Include the error in the failure message so a regression is diagnosable.
    assert!(
        result.is_ok(),
        "publish without init failed: {:?}",
        result.err()
    );
}
1388
/// Publishes a dataset and expects the `schema` stored in its registry entry
/// to be a JSON object with a two-entry `fields` array.
#[test]
fn test_registry_get_info_contains_schema() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(10);
    let metadata = create_test_metadata();

    registry
        .publish("schema-check", "1.0.0", &dataset, metadata)
        .unwrap();

    let info = registry.get_info("schema-check").unwrap();

    assert!(info.schema.is_object());
    // One assertion covers both "fields is an array" and "it has 2 entries".
    let field_count = info
        .schema
        .get("fields")
        .and_then(|f| f.as_array())
        .map(|fields| fields.len());
    assert_eq!(field_count, Some(2));
}
1411
/// Publishes two versions of a dataset, deletes both, and expects the dataset
/// to be gone: `get_info` fails and `list` is empty.
#[test]
fn test_registry_delete_all_versions_removes_dataset() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);
    let metadata = create_test_metadata();

    registry
        .publish("temp", "1.0.0", &dataset, metadata.clone())
        .unwrap();
    registry
        .publish("temp", "2.0.0", &dataset, metadata)
        .unwrap();

    registry.delete("temp", "1.0.0").unwrap();
    registry.delete("temp", "2.0.0").unwrap();

    assert!(registry.get_info("temp").is_err());
    assert!(registry.list().unwrap().is_empty());
}
1442
/// Publishes a 10-row dataset as 1.0.0 and a 20-row dataset as 2.0.0, then
/// pulls with no version and expects the 20-row dataset back.
#[test]
fn test_registry_pull_uses_latest_when_none() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset_v1 = create_test_dataset(10);
    let dataset_v2 = create_test_dataset(20);
    let metadata = create_test_metadata();

    registry
        .publish("latest-test", "1.0.0", &dataset_v1, metadata.clone())
        .unwrap();
    registry
        .publish("latest-test", "2.0.0", &dataset_v2, metadata)
        .unwrap();

    let pulled = registry.pull("latest-test", None).unwrap();
    // 20 rows identifies the dataset published as 2.0.0.
    assert_eq!(pulled.len(), 20);
}
1466
/// Builds a registry through `with_index_path("my-custom-index.json")` and
/// expects init, publish and list to work with it.
///
/// NOTE(review): this does not inspect the backend, so it does not prove the
/// index was actually written under the custom path.
#[test]
fn test_registry_with_custom_index_path() {
    let registry =
        Registry::with_index_path(Box::new(MemoryBackend::new()), "my-custom-index.json");

    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);
    let metadata = create_test_metadata();

    registry
        .publish("custom-path", "1.0.0", &dataset, metadata)
        .unwrap();

    let list = registry.list().unwrap();
    assert_eq!(list.len(), 1);
}
1486
/// Publishes a dataset with every metadata field populated and expects the
/// description, license, tag count, source and citation to be readable back
/// through `get_info`.
///
/// NOTE(review): `sha256` is supplied but not asserted here; presumably the
/// registry may replace it on publish — confirm before adding a check.
#[test]
fn test_registry_metadata_all_fields() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);

    let metadata = DatasetMetadata {
        description: "Full metadata test".to_string(),
        license: "Apache-2.0".to_string(),
        tags: vec!["a".to_string(), "b".to_string(), "c".to_string()],
        source: Some("https://example.com/source".to_string()),
        citation: Some("Cite this dataset".to_string()),
        sha256: Some("abc123def456".to_string()),
    };

    // `metadata` is not used again, so it is moved rather than cloned.
    registry
        .publish("full-meta", "1.0.0", &dataset, metadata)
        .unwrap();

    let info = registry.get_info("full-meta").unwrap();
    assert_eq!(info.metadata.description, "Full metadata test");
    assert_eq!(info.metadata.license, "Apache-2.0");
    assert_eq!(info.metadata.tags.len(), 3);
    assert_eq!(
        info.metadata.source,
        Some("https://example.com/source".to_string())
    );
    assert_eq!(
        info.metadata.citation,
        Some("Cite this dataset".to_string())
    );
}
1522
/// Publishes version 1.0.0 twice with different data and expects the second
/// publish to replace the first: `num_rows` goes from 10 to 20 while the
/// version list still has a single entry.
#[test]
fn test_registry_republish_same_version() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset1 = create_test_dataset(10);
    let dataset2 = create_test_dataset(20);
    let metadata = create_test_metadata();

    registry
        .publish("republish", "1.0.0", &dataset1, metadata.clone())
        .unwrap();

    let info1 = registry.get_info("republish").unwrap();
    assert_eq!(info1.num_rows, 10);

    registry
        .publish("republish", "1.0.0", &dataset2, metadata)
        .unwrap();

    let info2 = registry.get_info("republish").unwrap();
    assert_eq!(info2.num_rows, 20);
    assert_eq!(info2.versions.len(), 1);
}
1553
/// Publishes a single dataset and expects `search("")` to return it.
#[test]
fn test_registry_search_empty_string() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);
    let metadata = create_test_metadata();

    registry
        .publish("test-data", "1.0.0", &dataset, metadata)
        .unwrap();

    let results = registry.search("").unwrap();
    assert_eq!(results.len(), 1);
}
1572
/// Publishes one tagged dataset and expects `search_tags` with an empty tag
/// list to return no results.
#[test]
fn test_registry_search_tags_empty_array() {
    let registry = Registry::new(Box::new(MemoryBackend::new()));
    // `.unwrap()` on the `Result` itself keeps the error visible on failure.
    registry.init().unwrap();

    let dataset = create_test_dataset(5);
    let metadata = DatasetMetadata {
        description: "Test".to_string(),
        license: "MIT".to_string(),
        tags: vec!["ml".to_string()],
        source: None,
        citation: None,
        sha256: None,
    };

    registry
        .publish("tagged-ds", "1.0.0", &dataset, metadata)
        .unwrap();

    let results = registry.search_tags(&[]).unwrap();
    assert!(results.is_empty());
}
1598
/// Builds a dataset from a schema with one non-nullable and one nullable
/// column and checks that `schema_to_json` reports the `nullable` flag of
/// each field accordingly.
#[test]
fn test_schema_to_json_with_nullable_fields() {
    let columns = vec![
        Field::new("required_id", DataType::Int32, false),
        Field::new("optional_name", DataType::Utf8, true),
    ];
    let schema = Arc::new(Schema::new(columns));

    let ids = Int32Array::from(vec![1, 2, 3]);
    let names = StringArray::from(vec!["a", "b", "c"]);
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(ids), Arc::new(names)],
    )
    .unwrap();

    let dataset = ArrowDataset::from_batch(batch).unwrap();

    let as_json = schema_to_json(&dataset.schema());
    let json_fields = as_json
        .get("fields")
        .and_then(|f| f.as_array())
        .unwrap();

    assert_eq!(json_fields.len(), 2);

    // Read both flags first, then assert, mirroring the field order above.
    let nullable_at = |idx: usize| {
        json_fields[idx]
            .get("nullable")
            .and_then(|v| v.as_bool())
    };
    let (required_flag, optional_flag) = (nullable_at(0), nullable_at(1));

    assert_eq!(required_flag, Some(false));
    assert_eq!(optional_flag, Some(true));
}
1630}