Skip to main content

alimentar/registry/
mod.rs

1//! Dataset registry for sharing and discovery.
2//!
3//! The registry provides a way to publish, discover, and retrieve datasets
4//! from various storage backends. It supports local registries, S3-based
5//! registries, and HTTP-based read-only registries.
6
7mod index;
8
9pub use index::{DatasetInfo, DatasetMetadata, RegistryIndex};
10
11use crate::{
12    backend::StorageBackend,
13    dataset::{ArrowDataset, Dataset},
14    error::{Error, Result},
15};
16
17/// A dataset registry for publishing and retrieving datasets.
18///
19/// The registry stores datasets along with metadata in a structured format,
20/// enabling discovery and versioning of datasets.
21///
22/// # Example
23///
24/// ```no_run
25/// use alimentar::{backend::MemoryBackend, registry::Registry};
26///
27/// let backend = MemoryBackend::new();
28/// let registry = Registry::new(Box::new(backend));
29/// ```
pub struct Registry {
    /// Storage backend through which the index and the dataset files are
    /// read and written.
    backend: Box<dyn StorageBackend>,
    /// Backend key under which the JSON registry index is stored.
    index_path: String,
}
34
35impl Registry {
36    /// Creates a new registry with the given storage backend.
37    pub fn new(backend: Box<dyn StorageBackend>) -> Self {
38        Self {
39            backend,
40            index_path: "registry-index.json".to_string(),
41        }
42    }
43
44    /// Creates a new registry with a custom index path.
45    pub fn with_index_path(
46        backend: Box<dyn StorageBackend>,
47        index_path: impl Into<String>,
48    ) -> Self {
49        Self {
50            backend,
51            index_path: index_path.into(),
52        }
53    }
54
55    /// Initializes the registry by creating an empty index if one doesn't
56    /// exist.
57    ///
58    /// # Errors
59    ///
60    /// Returns an error if the index cannot be created.
61    pub fn init(&self) -> Result<()> {
62        if self.backend.exists(&self.index_path)? {
63            return Ok(());
64        }
65
66        let index = RegistryIndex::new();
67        self.save_index(&index)
68    }
69
70    /// Loads the registry index.
71    ///
72    /// # Errors
73    ///
74    /// Returns an error if the index cannot be loaded or parsed.
75    pub fn load_index(&self) -> Result<RegistryIndex> {
76        let data = self.backend.get(&self.index_path)?;
77        let index: RegistryIndex = serde_json::from_slice(&data)
78            .map_err(|e| Error::storage(format!("Failed to parse registry index: {e}")))?;
79        Ok(index)
80    }
81
82    /// Saves the registry index.
83    fn save_index(&self, index: &RegistryIndex) -> Result<()> {
84        let data = serde_json::to_vec_pretty(index)
85            .map_err(|e| Error::storage(format!("Failed to serialize registry index: {e}")))?;
86        self.backend.put(&self.index_path, data.into())
87    }
88
89    /// Lists all available datasets in the registry.
90    ///
91    /// # Errors
92    ///
93    /// Returns an error if the index cannot be loaded.
94    pub fn list(&self) -> Result<Vec<DatasetInfo>> {
95        let index = self.load_index()?;
96        Ok(index.datasets)
97    }
98
99    /// Gets information about a specific dataset.
100    ///
101    /// # Errors
102    ///
103    /// Returns an error if the dataset is not found.
104    pub fn get_info(&self, name: &str) -> Result<DatasetInfo> {
105        let index = self.load_index()?;
106        index
107            .datasets
108            .into_iter()
109            .find(|d| d.name == name)
110            .ok_or_else(|| Error::storage(format!("Dataset '{}' not found in registry", name)))
111    }
112
113    /// Publishes a dataset to the registry.
114    ///
115    /// # Arguments
116    ///
117    /// * `name` - Unique name for the dataset
118    /// * `version` - Semantic version string (e.g., "1.0.0")
119    /// * `dataset` - The dataset to publish
120    /// * `metadata` - Metadata describing the dataset
121    ///
122    /// # Errors
123    ///
124    /// Returns an error if the dataset cannot be saved or the index cannot be
125    /// updated.
126    pub fn publish(
127        &self,
128        name: &str,
129        version: &str,
130        dataset: &ArrowDataset,
131        metadata: DatasetMetadata,
132    ) -> Result<()> {
133        // Validate inputs
134        if name.is_empty() {
135            return Err(Error::invalid_config("Dataset name cannot be empty"));
136        }
137        if version.is_empty() {
138            return Err(Error::invalid_config("Version cannot be empty"));
139        }
140
141        // Create the data path
142        let data_path = format!("datasets/{}/{}/data.parquet", name, version);
143
144        // Save the dataset as parquet (use thread ID for uniqueness in parallel tests)
145        let temp_dir = std::env::temp_dir();
146        let unique_id = format!("{}_{:?}", std::process::id(), std::thread::current().id());
147        let temp_path = temp_dir.join(format!("alimentar_publish_{}.parquet", unique_id));
148        dataset.to_parquet(&temp_path)?;
149
150        // Read the parquet file and upload
151        let parquet_data = std::fs::read(&temp_path).map_err(|e| Error::io(e, &temp_path))?;
152        self.backend.put(&data_path, parquet_data.into())?;
153
154        // Delete scratch file after upload completes
155        let _ = std::fs::remove_file(&temp_path);
156
157        // Update the index
158        let mut index = self.load_index().unwrap_or_else(|_| RegistryIndex::new());
159
160        // Find or create dataset info
161        let dataset_info = index.datasets.iter_mut().find(|d| d.name == name);
162
163        let size_bytes = self.backend.size(&data_path)?;
164        let num_rows = dataset.len();
165        let schema = dataset.schema();
166
167        if let Some(info) = dataset_info {
168            // Update existing dataset
169            if !info.versions.contains(&version.to_string()) {
170                info.versions.push(version.to_string());
171            }
172            info.latest = version.to_string();
173            info.size_bytes = size_bytes;
174            info.num_rows = num_rows;
175            info.metadata = metadata;
176        } else {
177            // Add new dataset
178            // Convert schema to a simple JSON representation
179            let schema_json = schema_to_json(&schema);
180
181            index.datasets.push(DatasetInfo {
182                name: name.to_string(),
183                versions: vec![version.to_string()],
184                latest: version.to_string(),
185                size_bytes,
186                num_rows,
187                schema: schema_json,
188                metadata,
189            });
190        }
191
192        self.save_index(&index)
193    }
194
195    /// Pulls a dataset from the registry.
196    ///
197    /// # Arguments
198    ///
199    /// * `name` - Name of the dataset
200    /// * `version` - Optional version (uses latest if not specified)
201    ///
202    /// # Errors
203    ///
204    /// Returns an error if the dataset cannot be found or loaded.
205    pub fn pull(&self, name: &str, version: Option<&str>) -> Result<ArrowDataset> {
206        let info = self.get_info(name)?;
207
208        let version = version.unwrap_or(&info.latest);
209
210        if !info.versions.contains(&version.to_string()) {
211            return Err(Error::storage(format!(
212                "Version '{}' not found for dataset '{}'. Available: {:?}",
213                version, name, info.versions
214            )));
215        }
216
217        let data_path = format!("datasets/{}/{}/data.parquet", name, version);
218
219        // Fetch data and write to per-thread scratch file
220        let data = self.backend.get(&data_path)?;
221
222        let temp_dir = std::env::temp_dir();
223        let unique_id = format!("{}_{:?}", std::process::id(), std::thread::current().id());
224        let temp_path = temp_dir.join(format!("alimentar_pull_{}.parquet", unique_id));
225
226        std::fs::write(&temp_path, &data).map_err(|e| Error::io(e, &temp_path))?;
227
228        let dataset = ArrowDataset::from_parquet(&temp_path)?;
229
230        // Delete scratch file after parquet load completes
231        let _ = std::fs::remove_file(&temp_path);
232
233        Ok(dataset)
234    }
235
236    /// Searches datasets by query string (matches name and description).
237    ///
238    /// # Errors
239    ///
240    /// Returns an error if the index cannot be loaded.
241    pub fn search(&self, query: &str) -> Result<Vec<DatasetInfo>> {
242        let index = self.load_index()?;
243        let query_lower = query.to_lowercase();
244
245        let results: Vec<DatasetInfo> = index
246            .datasets
247            .into_iter()
248            .filter(|d| {
249                d.name.to_lowercase().contains(&query_lower)
250                    || d.metadata.description.to_lowercase().contains(&query_lower)
251            })
252            .collect();
253
254        Ok(results)
255    }
256
257    /// Searches datasets by tags.
258    ///
259    /// # Errors
260    ///
261    /// Returns an error if the index cannot be loaded.
262    pub fn search_tags(&self, tags: &[&str]) -> Result<Vec<DatasetInfo>> {
263        let index = self.load_index()?;
264
265        let results: Vec<DatasetInfo> = index
266            .datasets
267            .into_iter()
268            .filter(|d| {
269                tags.iter()
270                    .any(|&tag| d.metadata.tags.iter().any(|t| t == tag))
271            })
272            .collect();
273
274        Ok(results)
275    }
276
277    /// Deletes a dataset version from the registry.
278    ///
279    /// # Errors
280    ///
281    /// Returns an error if the dataset cannot be deleted.
282    pub fn delete(&self, name: &str, version: &str) -> Result<()> {
283        let mut index = self.load_index()?;
284
285        let dataset_idx = index
286            .datasets
287            .iter()
288            .position(|d| d.name == name)
289            .ok_or_else(|| Error::storage(format!("Dataset '{}' not found", name)))?;
290
291        let info = &mut index.datasets[dataset_idx];
292
293        if !info.versions.contains(&version.to_string()) {
294            return Err(Error::storage(format!(
295                "Version '{}' not found for dataset '{}'",
296                version, name
297            )));
298        }
299
300        // Delete the data file
301        let data_path = format!("datasets/{}/{}/data.parquet", name, version);
302        self.backend.delete(&data_path)?;
303
304        // Update index
305        info.versions.retain(|v| v != version);
306
307        if info.versions.is_empty() {
308            // Remove dataset entirely if no versions left
309            index.datasets.remove(dataset_idx);
310        } else if info.latest == version {
311            // Update latest to most recent remaining version
312            info.latest = info.versions.last().cloned().unwrap_or_default();
313        }
314
315        self.save_index(&index)
316    }
317}
318
319impl std::fmt::Debug for Registry {
320    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
321        f.debug_struct("Registry")
322            .field("index_path", &self.index_path)
323            .finish_non_exhaustive()
324    }
325}
326
327/// Converts an Arrow schema to a JSON representation.
328fn schema_to_json(schema: &arrow::datatypes::SchemaRef) -> serde_json::Value {
329    let fields: Vec<serde_json::Value> = schema
330        .fields()
331        .iter()
332        .map(|field| {
333            serde_json::json!({
334                "name": field.name(),
335                "data_type": format!("{:?}", field.data_type()),
336                "nullable": field.is_nullable(),
337            })
338        })
339        .collect();
340
341    serde_json::json!({
342        "fields": fields,
343    })
344}
345
346#[cfg(test)]
347#[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
348mod tests {
349    use std::sync::Arc;
350
351    use arrow::{
352        array::{Int32Array, StringArray},
353        datatypes::{DataType, Field, Schema},
354        record_batch::RecordBatch,
355    };
356
357    use super::*;
358    use crate::{backend::MemoryBackend, Dataset};
359
360    fn create_test_dataset(rows: usize) -> ArrowDataset {
361        let schema = Arc::new(Schema::new(vec![
362            Field::new("id", DataType::Int32, false),
363            Field::new("name", DataType::Utf8, false),
364        ]));
365
366        let ids: Vec<i32> = (0..rows as i32).collect();
367        let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();
368
369        let batch = RecordBatch::try_new(
370            schema,
371            vec![
372                Arc::new(Int32Array::from(ids)),
373                Arc::new(StringArray::from(names)),
374            ],
375        )
376        .ok()
377        .unwrap_or_else(|| panic!("Should create batch"));
378
379        ArrowDataset::from_batch(batch)
380            .ok()
381            .unwrap_or_else(|| panic!("Should create dataset"))
382    }
383
384    fn create_test_metadata() -> DatasetMetadata {
385        DatasetMetadata {
386            description: "Test dataset".to_string(),
387            license: "MIT".to_string(),
388            tags: vec!["test".to_string(), "example".to_string()],
389            source: Some("unit test".to_string()),
390            citation: None,
391            sha256: None,
392        }
393    }
394
395    #[test]
396    fn test_registry_init() {
397        let backend = MemoryBackend::new();
398        let registry = Registry::new(Box::new(backend));
399
400        let result = registry.init();
401        assert!(result.is_ok());
402
403        // Second init should be idempotent
404        let result = registry.init();
405        assert!(result.is_ok());
406    }
407
408    #[test]
409    fn test_registry_publish_and_pull() {
410        let backend = MemoryBackend::new();
411        let registry = Registry::new(Box::new(backend));
412        registry
413            .init()
414            .ok()
415            .unwrap_or_else(|| panic!("Should init"));
416
417        let dataset = create_test_dataset(10);
418        let metadata = create_test_metadata();
419
420        // Publish
421        let result = registry.publish("test-dataset", "1.0.0", &dataset, metadata);
422        assert!(result.is_ok());
423
424        // Pull
425        let pulled = registry.pull("test-dataset", Some("1.0.0"));
426        assert!(pulled.is_ok());
427        let pulled = pulled.ok().unwrap_or_else(|| panic!("Should pull"));
428        assert_eq!(pulled.len(), 10);
429    }
430
431    #[test]
432    fn test_registry_list() {
433        let backend = MemoryBackend::new();
434        let registry = Registry::new(Box::new(backend));
435        registry
436            .init()
437            .ok()
438            .unwrap_or_else(|| panic!("Should init"));
439
440        let dataset = create_test_dataset(5);
441        let metadata = create_test_metadata();
442
443        registry
444            .publish("dataset-a", "1.0.0", &dataset, metadata.clone())
445            .ok()
446            .unwrap_or_else(|| panic!("Should publish"));
447        registry
448            .publish("dataset-b", "1.0.0", &dataset, metadata)
449            .ok()
450            .unwrap_or_else(|| panic!("Should publish"));
451
452        let list = registry.list();
453        assert!(list.is_ok());
454        let list = list.ok().unwrap_or_else(|| panic!("Should list"));
455        assert_eq!(list.len(), 2);
456    }
457
458    #[test]
459    fn test_registry_search() {
460        let backend = MemoryBackend::new();
461        let registry = Registry::new(Box::new(backend));
462        registry
463            .init()
464            .ok()
465            .unwrap_or_else(|| panic!("Should init"));
466
467        let dataset = create_test_dataset(5);
468
469        let metadata1 = DatasetMetadata {
470            description: "Machine learning dataset".to_string(),
471            license: "MIT".to_string(),
472            tags: vec!["ml".to_string()],
473            source: None,
474            citation: None,
475            sha256: None,
476        };
477
478        let metadata2 = DatasetMetadata {
479            description: "Natural language processing data".to_string(),
480            license: "Apache-2.0".to_string(),
481            tags: vec!["nlp".to_string()],
482            source: None,
483            citation: None,
484            sha256: None,
485        };
486
487        registry
488            .publish("ml-data", "1.0.0", &dataset, metadata1)
489            .ok()
490            .unwrap_or_else(|| panic!("Should publish"));
491        registry
492            .publish("nlp-data", "1.0.0", &dataset, metadata2)
493            .ok()
494            .unwrap_or_else(|| panic!("Should publish"));
495
496        // Search by name
497        let results = registry.search("ml");
498        assert!(results.is_ok());
499        let results = results.ok().unwrap_or_else(|| panic!("Should search"));
500        assert_eq!(results.len(), 1);
501        assert_eq!(results[0].name, "ml-data");
502
503        // Search by description
504        let results = registry.search("natural language");
505        assert!(results.is_ok());
506        let results = results.ok().unwrap_or_else(|| panic!("Should search"));
507        assert_eq!(results.len(), 1);
508        assert_eq!(results[0].name, "nlp-data");
509    }
510
511    #[test]
512    fn test_registry_search_tags() {
513        let backend = MemoryBackend::new();
514        let registry = Registry::new(Box::new(backend));
515        registry
516            .init()
517            .ok()
518            .unwrap_or_else(|| panic!("Should init"));
519
520        let dataset = create_test_dataset(5);
521        let metadata = DatasetMetadata {
522            description: "Test".to_string(),
523            license: "MIT".to_string(),
524            tags: vec!["ml".to_string(), "vision".to_string()],
525            source: None,
526            citation: None,
527            sha256: None,
528        };
529
530        registry
531            .publish("vision-data", "1.0.0", &dataset, metadata)
532            .ok()
533            .unwrap_or_else(|| panic!("Should publish"));
534
535        let results = registry.search_tags(&["vision"]);
536        assert!(results.is_ok());
537        let results = results.ok().unwrap_or_else(|| panic!("Should search"));
538        assert_eq!(results.len(), 1);
539
540        let results = registry.search_tags(&["audio"]);
541        assert!(results.is_ok());
542        let results = results.ok().unwrap_or_else(|| panic!("Should search"));
543        assert!(results.is_empty());
544    }
545
546    #[test]
547    fn test_registry_versioning() {
548        let backend = MemoryBackend::new();
549        let registry = Registry::new(Box::new(backend));
550        registry
551            .init()
552            .ok()
553            .unwrap_or_else(|| panic!("Should init"));
554
555        let dataset_v1 = create_test_dataset(10);
556        let dataset_v2 = create_test_dataset(20);
557        let metadata = create_test_metadata();
558
559        registry
560            .publish("versioned", "1.0.0", &dataset_v1, metadata.clone())
561            .ok()
562            .unwrap_or_else(|| panic!("Should publish v1"));
563        registry
564            .publish("versioned", "2.0.0", &dataset_v2, metadata)
565            .ok()
566            .unwrap_or_else(|| panic!("Should publish v2"));
567
568        let info = registry.get_info("versioned");
569        assert!(info.is_ok());
570        let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
571        assert_eq!(info.versions.len(), 2);
572        assert_eq!(info.latest, "2.0.0");
573
574        // Pull specific version
575        let v1 = registry.pull("versioned", Some("1.0.0"));
576        assert!(v1.is_ok());
577        let v1 = v1.ok().unwrap_or_else(|| panic!("Should pull v1"));
578        assert_eq!(v1.len(), 10);
579
580        // Pull latest
581        let latest = registry.pull("versioned", None);
582        assert!(latest.is_ok());
583        let latest = latest.ok().unwrap_or_else(|| panic!("Should pull latest"));
584        assert_eq!(latest.len(), 20);
585    }
586
587    #[test]
588    fn test_registry_delete() {
589        let backend = MemoryBackend::new();
590        let registry = Registry::new(Box::new(backend));
591        registry
592            .init()
593            .ok()
594            .unwrap_or_else(|| panic!("Should init"));
595
596        let dataset = create_test_dataset(5);
597        let metadata = create_test_metadata();
598
599        registry
600            .publish("to-delete", "1.0.0", &dataset, metadata.clone())
601            .ok()
602            .unwrap_or_else(|| panic!("Should publish"));
603        registry
604            .publish("to-delete", "2.0.0", &dataset, metadata)
605            .ok()
606            .unwrap_or_else(|| panic!("Should publish"));
607
608        // Delete one version
609        let result = registry.delete("to-delete", "1.0.0");
610        assert!(result.is_ok());
611
612        let info = registry.get_info("to-delete");
613        assert!(info.is_ok());
614        let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
615        assert_eq!(info.versions.len(), 1);
616        assert!(!info.versions.contains(&"1.0.0".to_string()));
617
618        // Delete last version removes dataset
619        let result = registry.delete("to-delete", "2.0.0");
620        assert!(result.is_ok());
621
622        let info = registry.get_info("to-delete");
623        assert!(info.is_err());
624    }
625
626    #[test]
627    fn test_registry_empty_name_error() {
628        let backend = MemoryBackend::new();
629        let registry = Registry::new(Box::new(backend));
630        registry
631            .init()
632            .ok()
633            .unwrap_or_else(|| panic!("Should init"));
634
635        let dataset = create_test_dataset(5);
636        let metadata = create_test_metadata();
637
638        let result = registry.publish("", "1.0.0", &dataset, metadata);
639        assert!(result.is_err());
640    }
641
642    #[test]
643    fn test_registry_not_found() {
644        let backend = MemoryBackend::new();
645        let registry = Registry::new(Box::new(backend));
646        registry
647            .init()
648            .ok()
649            .unwrap_or_else(|| panic!("Should init"));
650
651        let result = registry.pull("nonexistent", None);
652        assert!(result.is_err());
653    }
654
655    #[test]
656    fn test_registry_with_index_path() {
657        let backend = MemoryBackend::new();
658        let registry = Registry::with_index_path(Box::new(backend), "custom-index.json");
659        registry
660            .init()
661            .ok()
662            .unwrap_or_else(|| panic!("Should init"));
663
664        let dataset = create_test_dataset(5);
665        let metadata = create_test_metadata();
666
667        let result = registry.publish("test", "1.0.0", &dataset, metadata);
668        assert!(result.is_ok());
669
670        let list = registry.list();
671        assert!(list.is_ok());
672        assert_eq!(list.ok().unwrap_or_else(|| panic!("Should list")).len(), 1);
673    }
674
675    #[test]
676    fn test_registry_debug() {
677        let backend = MemoryBackend::new();
678        let registry = Registry::new(Box::new(backend));
679        let debug = format!("{:?}", registry);
680        assert!(debug.contains("Registry"));
681        assert!(debug.contains("index_path"));
682    }
683
684    #[test]
685    fn test_registry_delete_nonexistent_dataset() {
686        let backend = MemoryBackend::new();
687        let registry = Registry::new(Box::new(backend));
688        registry
689            .init()
690            .ok()
691            .unwrap_or_else(|| panic!("Should init"));
692
693        let result = registry.delete("nonexistent", "1.0.0");
694        assert!(result.is_err());
695    }
696
697    #[test]
698    fn test_registry_delete_nonexistent_version() {
699        let backend = MemoryBackend::new();
700        let registry = Registry::new(Box::new(backend));
701        registry
702            .init()
703            .ok()
704            .unwrap_or_else(|| panic!("Should init"));
705
706        let dataset = create_test_dataset(5);
707        let metadata = create_test_metadata();
708
709        registry
710            .publish("dataset", "1.0.0", &dataset, metadata)
711            .ok()
712            .unwrap_or_else(|| panic!("Should publish"));
713
714        let result = registry.delete("dataset", "2.0.0");
715        assert!(result.is_err());
716    }
717
718    #[test]
719    fn test_registry_delete_latest_updates_correctly() {
720        let backend = MemoryBackend::new();
721        let registry = Registry::new(Box::new(backend));
722        registry
723            .init()
724            .ok()
725            .unwrap_or_else(|| panic!("Should init"));
726
727        let dataset = create_test_dataset(5);
728        let metadata = create_test_metadata();
729
730        registry
731            .publish("multi-version", "1.0.0", &dataset, metadata.clone())
732            .ok()
733            .unwrap_or_else(|| panic!("Should publish"));
734        registry
735            .publish("multi-version", "2.0.0", &dataset, metadata)
736            .ok()
737            .unwrap_or_else(|| panic!("Should publish"));
738
739        // Latest is 2.0.0, delete it
740        let result = registry.delete("multi-version", "2.0.0");
741        assert!(result.is_ok());
742
743        let info = registry.get_info("multi-version");
744        assert!(info.is_ok());
745        let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
746        assert_eq!(info.latest, "1.0.0");
747    }
748
749    #[test]
750    fn test_registry_pull_nonexistent_version() {
751        let backend = MemoryBackend::new();
752        let registry = Registry::new(Box::new(backend));
753        registry
754            .init()
755            .ok()
756            .unwrap_or_else(|| panic!("Should init"));
757
758        let dataset = create_test_dataset(5);
759        let metadata = create_test_metadata();
760
761        registry
762            .publish("versioned-test", "1.0.0", &dataset, metadata)
763            .ok()
764            .unwrap_or_else(|| panic!("Should publish"));
765
766        let result = registry.pull("versioned-test", Some("9.9.9"));
767        assert!(result.is_err());
768    }
769
770    #[test]
771    fn test_registry_empty_version_error() {
772        let backend = MemoryBackend::new();
773        let registry = Registry::new(Box::new(backend));
774        registry
775            .init()
776            .ok()
777            .unwrap_or_else(|| panic!("Should init"));
778
779        let dataset = create_test_dataset(5);
780        let metadata = create_test_metadata();
781
782        let result = registry.publish("test", "", &dataset, metadata);
783        assert!(result.is_err());
784    }
785
786    #[test]
787    fn test_registry_publish_update_existing() {
788        let backend = MemoryBackend::new();
789        let registry = Registry::new(Box::new(backend));
790        registry
791            .init()
792            .ok()
793            .unwrap_or_else(|| panic!("Should init"));
794
795        let dataset1 = create_test_dataset(5);
796        let dataset2 = create_test_dataset(10);
797        let metadata = create_test_metadata();
798
799        // Publish same version twice (should update)
800        registry
801            .publish("update-test", "1.0.0", &dataset1, metadata.clone())
802            .ok()
803            .unwrap_or_else(|| panic!("Should publish"));
804        registry
805            .publish("update-test", "1.0.0", &dataset2, metadata)
806            .ok()
807            .unwrap_or_else(|| panic!("Should publish update"));
808
809        let info = registry.get_info("update-test");
810        assert!(info.is_ok());
811        let info = info.ok().unwrap_or_else(|| panic!("Should get info"));
812        // Versions should not be duplicated
813        assert_eq!(info.versions.len(), 1);
814        // Rows should be from second publish
815        assert_eq!(info.num_rows, 10);
816    }
817
818    // === Additional coverage tests for registry ===
819
820    #[test]
821    fn test_registry_load_index_without_init() {
822        let backend = MemoryBackend::new();
823        let registry = Registry::new(Box::new(backend));
824
825        // Trying to load without init should fail
826        let result = registry.load_index();
827        assert!(result.is_err());
828    }
829
830    #[test]
831    fn test_registry_list_empty() {
832        let backend = MemoryBackend::new();
833        let registry = Registry::new(Box::new(backend));
834        registry
835            .init()
836            .ok()
837            .unwrap_or_else(|| panic!("Should init"));
838
839        let list = registry.list();
840        assert!(list.is_ok());
841        assert!(list.ok().unwrap_or_else(|| panic!("list")).is_empty());
842    }
843
844    #[test]
845    fn test_registry_search_no_results() {
846        let backend = MemoryBackend::new();
847        let registry = Registry::new(Box::new(backend));
848        registry
849            .init()
850            .ok()
851            .unwrap_or_else(|| panic!("Should init"));
852
853        let dataset = create_test_dataset(5);
854        let metadata = create_test_metadata();
855
856        registry
857            .publish("my-dataset", "1.0.0", &dataset, metadata)
858            .ok()
859            .unwrap_or_else(|| panic!("Should publish"));
860
861        let results = registry.search("nonexistent");
862        assert!(results.is_ok());
863        assert!(results.ok().unwrap_or_else(|| panic!("search")).is_empty());
864    }
865
866    #[test]
867    fn test_registry_search_case_insensitive() {
868        let backend = MemoryBackend::new();
869        let registry = Registry::new(Box::new(backend));
870        registry
871            .init()
872            .ok()
873            .unwrap_or_else(|| panic!("Should init"));
874
875        let dataset = create_test_dataset(5);
876        let metadata = DatasetMetadata {
877            description: "UPPERCASE DESCRIPTION".to_string(),
878            license: "MIT".to_string(),
879            tags: vec![],
880            source: None,
881            citation: None,
882            sha256: None,
883        };
884
885        registry
886            .publish("TestDataset", "1.0.0", &dataset, metadata)
887            .ok()
888            .unwrap_or_else(|| panic!("Should publish"));
889
890        // Search lowercase
891        let results = registry.search("testdataset");
892        assert_eq!(results.ok().unwrap_or_else(|| panic!("search")).len(), 1);
893
894        // Search description lowercase
895        let results = registry.search("uppercase");
896        assert_eq!(results.ok().unwrap_or_else(|| panic!("search")).len(), 1);
897    }
898
899    #[test]
900    fn test_registry_search_tags_no_match() {
901        let backend = MemoryBackend::new();
902        let registry = Registry::new(Box::new(backend));
903        registry
904            .init()
905            .ok()
906            .unwrap_or_else(|| panic!("Should init"));
907
908        let dataset = create_test_dataset(5);
909        let metadata = DatasetMetadata {
910            description: "Test".to_string(),
911            license: "MIT".to_string(),
912            tags: vec!["ml".to_string(), "vision".to_string()],
913            source: None,
914            citation: None,
915            sha256: None,
916        };
917
918        registry
919            .publish("tagged-data", "1.0.0", &dataset, metadata)
920            .ok()
921            .unwrap_or_else(|| panic!("Should publish"));
922
923        let results = registry.search_tags(&["audio", "nlp"]);
924        assert!(results.is_ok());
925        assert!(results.ok().unwrap_or_else(|| panic!("search")).is_empty());
926    }
927
928    #[test]
929    fn test_registry_search_tags_multiple_matches() {
930        let backend = MemoryBackend::new();
931        let registry = Registry::new(Box::new(backend));
932        registry
933            .init()
934            .ok()
935            .unwrap_or_else(|| panic!("Should init"));
936
937        let dataset = create_test_dataset(5);
938
939        let metadata1 = DatasetMetadata {
940            description: "Dataset 1".to_string(),
941            license: "MIT".to_string(),
942            tags: vec!["ml".to_string()],
943            source: None,
944            citation: None,
945            sha256: None,
946        };
947
948        let metadata2 = DatasetMetadata {
949            description: "Dataset 2".to_string(),
950            license: "MIT".to_string(),
951            tags: vec!["ml".to_string(), "vision".to_string()],
952            source: None,
953            citation: None,
954            sha256: None,
955        };
956
957        registry
958            .publish("data-a", "1.0.0", &dataset, metadata1)
959            .ok()
960            .unwrap_or_else(|| panic!("publish a"));
961        registry
962            .publish("data-b", "1.0.0", &dataset, metadata2)
963            .ok()
964            .unwrap_or_else(|| panic!("publish b"));
965
966        // Both have 'ml' tag
967        let results = registry.search_tags(&["ml"]);
968        assert_eq!(results.ok().unwrap_or_else(|| panic!("search")).len(), 2);
969    }
970
971    #[test]
972    fn test_registry_get_info_schema_contains_fields() {
973        let backend = MemoryBackend::new();
974        let registry = Registry::new(Box::new(backend));
975        registry
976            .init()
977            .ok()
978            .unwrap_or_else(|| panic!("Should init"));
979
980        let dataset = create_test_dataset(10);
981        let metadata = create_test_metadata();
982
983        registry
984            .publish("schema-test", "1.0.0", &dataset, metadata)
985            .ok()
986            .unwrap_or_else(|| panic!("Should publish"));
987
988        let info = registry.get_info("schema-test");
989        assert!(info.is_ok());
990        let info = info.ok().unwrap_or_else(|| panic!("info"));
991
992        // Schema should be JSON with fields
993        assert!(info.schema.is_object());
994        let fields = info.schema.get("fields");
995        assert!(fields.is_some());
996    }
997
998    #[test]
999    fn test_registry_multiple_versions_ordering() {
1000        let backend = MemoryBackend::new();
1001        let registry = Registry::new(Box::new(backend));
1002        registry
1003            .init()
1004            .ok()
1005            .unwrap_or_else(|| panic!("Should init"));
1006
1007        let dataset = create_test_dataset(5);
1008        let metadata = create_test_metadata();
1009
1010        // Publish in non-sequential order
1011        registry
1012            .publish("ordered", "2.0.0", &dataset, metadata.clone())
1013            .ok()
1014            .unwrap_or_else(|| panic!("publish 2.0"));
1015        registry
1016            .publish("ordered", "1.0.0", &dataset, metadata.clone())
1017            .ok()
1018            .unwrap_or_else(|| panic!("publish 1.0"));
1019        registry
1020            .publish("ordered", "3.0.0", &dataset, metadata)
1021            .ok()
1022            .unwrap_or_else(|| panic!("publish 3.0"));
1023
1024        let info = registry.get_info("ordered");
1025        assert!(info.is_ok());
1026        let info = info.ok().unwrap_or_else(|| panic!("info"));
1027
1028        // Latest should be the last published
1029        assert_eq!(info.latest, "3.0.0");
1030        assert_eq!(info.versions.len(), 3);
1031    }
1032
1033    #[test]
1034    fn test_registry_delete_middle_version() {
1035        let backend = MemoryBackend::new();
1036        let registry = Registry::new(Box::new(backend));
1037        registry
1038            .init()
1039            .ok()
1040            .unwrap_or_else(|| panic!("Should init"));
1041
1042        let dataset = create_test_dataset(5);
1043        let metadata = create_test_metadata();
1044
1045        registry
1046            .publish("three-versions", "1.0.0", &dataset, metadata.clone())
1047            .ok()
1048            .unwrap_or_else(|| panic!("publish 1.0"));
1049        registry
1050            .publish("three-versions", "2.0.0", &dataset, metadata.clone())
1051            .ok()
1052            .unwrap_or_else(|| panic!("publish 2.0"));
1053        registry
1054            .publish("three-versions", "3.0.0", &dataset, metadata)
1055            .ok()
1056            .unwrap_or_else(|| panic!("publish 3.0"));
1057
1058        // Delete middle version
1059        registry
1060            .delete("three-versions", "2.0.0")
1061            .ok()
1062            .unwrap_or_else(|| panic!("delete"));
1063
1064        let info = registry.get_info("three-versions");
1065        assert!(info.is_ok());
1066        let info = info.ok().unwrap_or_else(|| panic!("info"));
1067
1068        assert_eq!(info.versions.len(), 2);
1069        assert!(!info.versions.contains(&"2.0.0".to_string()));
1070        // Latest should still be 3.0.0
1071        assert_eq!(info.latest, "3.0.0");
1072    }
1073
1074    #[test]
1075    fn test_registry_pull_specific_version() {
1076        let backend = MemoryBackend::new();
1077        let registry = Registry::new(Box::new(backend));
1078        registry
1079            .init()
1080            .ok()
1081            .unwrap_or_else(|| panic!("Should init"));
1082
1083        let dataset_v1 = create_test_dataset(5);
1084        let dataset_v2 = create_test_dataset(15);
1085        let metadata = create_test_metadata();
1086
1087        registry
1088            .publish("versioned", "1.0.0", &dataset_v1, metadata.clone())
1089            .ok()
1090            .unwrap_or_else(|| panic!("publish v1"));
1091        registry
1092            .publish("versioned", "2.0.0", &dataset_v2, metadata)
1093            .ok()
1094            .unwrap_or_else(|| panic!("publish v2"));
1095
1096        // Pull specific version 1.0.0 (not latest)
1097        let pulled = registry.pull("versioned", Some("1.0.0"));
1098        assert!(pulled.is_ok());
1099        let pulled = pulled.ok().unwrap_or_else(|| panic!("pulled"));
1100        assert_eq!(pulled.len(), 5);
1101    }
1102
1103    #[test]
1104    fn test_registry_init_idempotent() {
1105        let backend = MemoryBackend::new();
1106        let registry = Registry::new(Box::new(backend));
1107
1108        // Multiple inits should be safe
1109        for _ in 0..5 {
1110            let result = registry.init();
1111            assert!(result.is_ok());
1112        }
1113
1114        // Registry should still work
1115        let list = registry.list();
1116        assert!(list.is_ok());
1117    }
1118
1119    #[test]
1120    fn test_registry_metadata_preserved() {
1121        let backend = MemoryBackend::new();
1122        let registry = Registry::new(Box::new(backend));
1123        registry
1124            .init()
1125            .ok()
1126            .unwrap_or_else(|| panic!("Should init"));
1127
1128        let dataset = create_test_dataset(10);
1129        let metadata = DatasetMetadata {
1130            description: "Full metadata test".to_string(),
1131            license: "Apache-2.0".to_string(),
1132            tags: vec!["tag1".to_string(), "tag2".to_string(), "tag3".to_string()],
1133            source: Some("http://example.com/source".to_string()),
1134            citation: Some("Citation text here".to_string()),
1135            sha256: Some("abc123".to_string()),
1136        };
1137
1138        registry
1139            .publish("metadata-test", "1.0.0", &dataset, metadata.clone())
1140            .ok()
1141            .unwrap_or_else(|| panic!("publish"));
1142
1143        let info = registry.get_info("metadata-test");
1144        assert!(info.is_ok());
1145        let info = info.ok().unwrap_or_else(|| panic!("info"));
1146
1147        assert_eq!(info.metadata.description, metadata.description);
1148        assert_eq!(info.metadata.license, metadata.license);
1149        assert_eq!(info.metadata.tags, metadata.tags);
1150        assert_eq!(info.metadata.source, metadata.source);
1151        assert_eq!(info.metadata.citation, metadata.citation);
1152        assert_eq!(info.metadata.sha256, metadata.sha256);
1153    }
1154
1155    #[test]
1156    fn test_registry_size_bytes_updated() {
1157        let backend = MemoryBackend::new();
1158        let registry = Registry::new(Box::new(backend));
1159        registry
1160            .init()
1161            .ok()
1162            .unwrap_or_else(|| panic!("Should init"));
1163
1164        let dataset = create_test_dataset(100);
1165        let metadata = create_test_metadata();
1166
1167        registry
1168            .publish("size-test", "1.0.0", &dataset, metadata)
1169            .ok()
1170            .unwrap_or_else(|| panic!("publish"));
1171
1172        let info = registry.get_info("size-test");
1173        assert!(info.is_ok());
1174        let info = info.ok().unwrap_or_else(|| panic!("info"));
1175
1176        // Size should be > 0 for a dataset with data
1177        assert!(info.size_bytes > 0);
1178    }
1179
1180    #[test]
1181    fn test_schema_to_json_function() {
1182        let dataset = create_test_dataset(5);
1183        let schema = dataset.schema();
1184        let json = schema_to_json(&schema);
1185
1186        assert!(json.is_object());
1187        let fields = json.get("fields").and_then(|f| f.as_array());
1188        assert!(fields.is_some());
1189
1190        let fields = fields.unwrap_or_else(|| panic!("fields"));
1191        assert_eq!(fields.len(), 2); // id and name
1192
1193        // Check first field
1194        let first = &fields[0];
1195        assert_eq!(
1196            first
1197                .get("name")
1198                .and_then(|n: &serde_json::Value| n.as_str()),
1199            Some("id")
1200        );
1201        assert!(first.get("data_type").is_some());
1202        assert!(first.get("nullable").is_some());
1203    }
1204
1205    #[test]
1206    fn test_registry_concurrent_publish_different_datasets() {
1207        let backend = MemoryBackend::new();
1208        let registry = Registry::new(Box::new(backend));
1209        registry
1210            .init()
1211            .ok()
1212            .unwrap_or_else(|| panic!("Should init"));
1213
1214        let dataset = create_test_dataset(5);
1215        let metadata = create_test_metadata();
1216
1217        // Publish multiple different datasets
1218        for i in 0..5 {
1219            registry
1220                .publish(
1221                    &format!("dataset-{}", i),
1222                    "1.0.0",
1223                    &dataset,
1224                    metadata.clone(),
1225                )
1226                .ok()
1227                .unwrap_or_else(|| panic!("publish"));
1228        }
1229
1230        let list = registry.list();
1231        assert!(list.is_ok());
1232        assert_eq!(list.ok().unwrap_or_else(|| panic!("list")).len(), 5);
1233    }
1234
1235    // === Additional registry tests for coverage ===
1236
1237    #[test]
1238    fn test_registry_search_partial_name_match() {
1239        let backend = MemoryBackend::new();
1240        let registry = Registry::new(Box::new(backend));
1241        registry.init().ok().unwrap();
1242
1243        let dataset = create_test_dataset(5);
1244        let metadata = create_test_metadata();
1245
1246        registry
1247            .publish("machine-learning-data", "1.0.0", &dataset, metadata.clone())
1248            .ok()
1249            .unwrap();
1250        registry
1251            .publish("deep-learning-data", "1.0.0", &dataset, metadata.clone())
1252            .ok()
1253            .unwrap();
1254        registry
1255            .publish("statistics-data", "1.0.0", &dataset, metadata)
1256            .ok()
1257            .unwrap();
1258
1259        // Search by partial name
1260        let results = registry.search("learning").ok().unwrap();
1261        assert_eq!(results.len(), 2);
1262    }
1263
1264    #[test]
1265    fn test_registry_search_by_description() {
1266        let backend = MemoryBackend::new();
1267        let registry = Registry::new(Box::new(backend));
1268        registry.init().ok().unwrap();
1269
1270        let dataset = create_test_dataset(5);
1271
1272        let metadata1 = DatasetMetadata {
1273            description: "Image classification dataset".to_string(),
1274            license: "MIT".to_string(),
1275            tags: vec![],
1276            source: None,
1277            citation: None,
1278            sha256: None,
1279        };
1280
1281        let metadata2 = DatasetMetadata {
1282            description: "Text sentiment analysis".to_string(),
1283            license: "MIT".to_string(),
1284            tags: vec![],
1285            source: None,
1286            citation: None,
1287            sha256: None,
1288        };
1289
1290        registry
1291            .publish("images", "1.0.0", &dataset, metadata1)
1292            .ok()
1293            .unwrap();
1294        registry
1295            .publish("text", "1.0.0", &dataset, metadata2)
1296            .ok()
1297            .unwrap();
1298
1299        // Search by description content
1300        let image_results = registry.search("classification").ok().unwrap();
1301        assert_eq!(image_results.len(), 1);
1302        assert_eq!(image_results[0].name, "images");
1303
1304        let text_results = registry.search("sentiment").ok().unwrap();
1305        assert_eq!(text_results.len(), 1);
1306        assert_eq!(text_results[0].name, "text");
1307    }
1308
1309    #[test]
1310    fn test_registry_search_tags_multiple_tags() {
1311        let backend = MemoryBackend::new();
1312        let registry = Registry::new(Box::new(backend));
1313        registry.init().ok().unwrap();
1314
1315        let dataset = create_test_dataset(5);
1316
1317        let metadata = DatasetMetadata {
1318            description: "Multi-tag dataset".to_string(),
1319            license: "MIT".to_string(),
1320            tags: vec!["tag1".to_string(), "tag2".to_string(), "tag3".to_string()],
1321            source: None,
1322            citation: None,
1323            sha256: None,
1324        };
1325
1326        registry
1327            .publish("tagged", "1.0.0", &dataset, metadata)
1328            .ok()
1329            .unwrap();
1330
1331        // Any of the tags should match
1332        assert_eq!(registry.search_tags(&["tag1"]).ok().unwrap().len(), 1);
1333        assert_eq!(registry.search_tags(&["tag2"]).ok().unwrap().len(), 1);
1334        assert_eq!(registry.search_tags(&["tag3"]).ok().unwrap().len(), 1);
1335        assert_eq!(
1336            registry.search_tags(&["tag1", "tag2"]).ok().unwrap().len(),
1337            1
1338        );
1339    }
1340
1341    #[test]
1342    fn test_registry_delete_first_of_multiple_versions() {
1343        let backend = MemoryBackend::new();
1344        let registry = Registry::new(Box::new(backend));
1345        registry.init().ok().unwrap();
1346
1347        let dataset = create_test_dataset(5);
1348        let metadata = create_test_metadata();
1349
1350        registry
1351            .publish("versioned", "1.0.0", &dataset, metadata.clone())
1352            .ok()
1353            .unwrap();
1354        registry
1355            .publish("versioned", "2.0.0", &dataset, metadata.clone())
1356            .ok()
1357            .unwrap();
1358        registry
1359            .publish("versioned", "3.0.0", &dataset, metadata)
1360            .ok()
1361            .unwrap();
1362
1363        // Delete first version
1364        registry.delete("versioned", "1.0.0").ok().unwrap();
1365
1366        let info = registry.get_info("versioned").ok().unwrap();
1367        assert_eq!(info.versions.len(), 2);
1368        assert!(!info.versions.contains(&"1.0.0".to_string()));
1369        // Latest should still be 3.0.0
1370        assert_eq!(info.latest, "3.0.0");
1371    }
1372
1373    #[test]
1374    fn test_registry_publish_without_init() {
1375        let backend = MemoryBackend::new();
1376        let registry = Registry::new(Box::new(backend));
1377        // Don't call init()
1378
1379        let dataset = create_test_dataset(5);
1380        let metadata = create_test_metadata();
1381
1382        // Should still work because publish calls init internally via load_index
1383        let result = registry.publish("no-init", "1.0.0", &dataset, metadata);
1384        // This might fail or succeed depending on implementation
1385        // The current implementation creates index on first publish
1386        assert!(result.is_ok());
1387    }
1388
1389    #[test]
1390    fn test_registry_get_info_contains_schema() {
1391        let backend = MemoryBackend::new();
1392        let registry = Registry::new(Box::new(backend));
1393        registry.init().ok().unwrap();
1394
1395        let dataset = create_test_dataset(10);
1396        let metadata = create_test_metadata();
1397
1398        registry
1399            .publish("schema-check", "1.0.0", &dataset, metadata)
1400            .ok()
1401            .unwrap();
1402
1403        let info = registry.get_info("schema-check").ok().unwrap();
1404
1405        // Schema should be a JSON object with fields
1406        assert!(info.schema.is_object());
1407        let fields = info.schema.get("fields").and_then(|f| f.as_array());
1408        assert!(fields.is_some());
1409        assert_eq!(fields.unwrap().len(), 2); // id and name
1410    }
1411
1412    #[test]
1413    fn test_registry_delete_all_versions_removes_dataset() {
1414        let backend = MemoryBackend::new();
1415        let registry = Registry::new(Box::new(backend));
1416        registry.init().ok().unwrap();
1417
1418        let dataset = create_test_dataset(5);
1419        let metadata = create_test_metadata();
1420
1421        registry
1422            .publish("temp", "1.0.0", &dataset, metadata.clone())
1423            .ok()
1424            .unwrap();
1425        registry
1426            .publish("temp", "2.0.0", &dataset, metadata)
1427            .ok()
1428            .unwrap();
1429
1430        // Delete all versions
1431        registry.delete("temp", "1.0.0").ok().unwrap();
1432        registry.delete("temp", "2.0.0").ok().unwrap();
1433
1434        // Dataset should no longer exist
1435        let result = registry.get_info("temp");
1436        assert!(result.is_err());
1437
1438        // List should be empty
1439        let list = registry.list().ok().unwrap();
1440        assert!(list.is_empty());
1441    }
1442
1443    #[test]
1444    fn test_registry_pull_uses_latest_when_none() {
1445        let backend = MemoryBackend::new();
1446        let registry = Registry::new(Box::new(backend));
1447        registry.init().ok().unwrap();
1448
1449        let dataset_v1 = create_test_dataset(10);
1450        let dataset_v2 = create_test_dataset(20);
1451        let metadata = create_test_metadata();
1452
1453        registry
1454            .publish("latest-test", "1.0.0", &dataset_v1, metadata.clone())
1455            .ok()
1456            .unwrap();
1457        registry
1458            .publish("latest-test", "2.0.0", &dataset_v2, metadata)
1459            .ok()
1460            .unwrap();
1461
1462        // Pull without specifying version
1463        let pulled = registry.pull("latest-test", None).ok().unwrap();
1464        assert_eq!(pulled.len(), 20); // Should be v2 (20 rows)
1465    }
1466
1467    #[test]
1468    fn test_registry_with_custom_index_path() {
1469        let backend = MemoryBackend::new();
1470        let registry = Registry::with_index_path(Box::new(backend), "my-custom-index.json");
1471
1472        registry.init().ok().unwrap();
1473
1474        let dataset = create_test_dataset(5);
1475        let metadata = create_test_metadata();
1476
1477        registry
1478            .publish("custom-path", "1.0.0", &dataset, metadata)
1479            .ok()
1480            .unwrap();
1481
1482        // Should be able to list
1483        let list = registry.list().ok().unwrap();
1484        assert_eq!(list.len(), 1);
1485    }
1486
1487    #[test]
1488    fn test_registry_metadata_all_fields() {
1489        let backend = MemoryBackend::new();
1490        let registry = Registry::new(Box::new(backend));
1491        registry.init().ok().unwrap();
1492
1493        let dataset = create_test_dataset(5);
1494
1495        let metadata = DatasetMetadata {
1496            description: "Full metadata test".to_string(),
1497            license: "Apache-2.0".to_string(),
1498            tags: vec!["a".to_string(), "b".to_string(), "c".to_string()],
1499            source: Some("https://example.com/source".to_string()),
1500            citation: Some("Cite this dataset".to_string()),
1501            sha256: Some("abc123def456".to_string()),
1502        };
1503
1504        registry
1505            .publish("full-meta", "1.0.0", &dataset, metadata.clone())
1506            .ok()
1507            .unwrap();
1508
1509        let info = registry.get_info("full-meta").ok().unwrap();
1510        assert_eq!(info.metadata.description, "Full metadata test");
1511        assert_eq!(info.metadata.license, "Apache-2.0");
1512        assert_eq!(info.metadata.tags.len(), 3);
1513        assert_eq!(
1514            info.metadata.source,
1515            Some("https://example.com/source".to_string())
1516        );
1517        assert_eq!(
1518            info.metadata.citation,
1519            Some("Cite this dataset".to_string())
1520        );
1521    }
1522
1523    #[test]
1524    fn test_registry_republish_same_version() {
1525        let backend = MemoryBackend::new();
1526        let registry = Registry::new(Box::new(backend));
1527        registry.init().ok().unwrap();
1528
1529        let dataset1 = create_test_dataset(10);
1530        let dataset2 = create_test_dataset(20);
1531        let metadata = create_test_metadata();
1532
1533        // First publish
1534        registry
1535            .publish("republish", "1.0.0", &dataset1, metadata.clone())
1536            .ok()
1537            .unwrap();
1538
1539        let info1 = registry.get_info("republish").ok().unwrap();
1540        assert_eq!(info1.num_rows, 10);
1541
1542        // Republish same version with different data
1543        registry
1544            .publish("republish", "1.0.0", &dataset2, metadata)
1545            .ok()
1546            .unwrap();
1547
1548        let info2 = registry.get_info("republish").ok().unwrap();
1549        assert_eq!(info2.num_rows, 20);
1550        // Should only have 1 version, not 2
1551        assert_eq!(info2.versions.len(), 1);
1552    }
1553
1554    #[test]
1555    fn test_registry_search_empty_string() {
1556        let backend = MemoryBackend::new();
1557        let registry = Registry::new(Box::new(backend));
1558        registry.init().ok().unwrap();
1559
1560        let dataset = create_test_dataset(5);
1561        let metadata = create_test_metadata();
1562
1563        registry
1564            .publish("test-data", "1.0.0", &dataset, metadata)
1565            .ok()
1566            .unwrap();
1567
1568        // Empty search should match everything (contains "")
1569        let results = registry.search("").ok().unwrap();
1570        assert_eq!(results.len(), 1);
1571    }
1572
1573    #[test]
1574    fn test_registry_search_tags_empty_array() {
1575        let backend = MemoryBackend::new();
1576        let registry = Registry::new(Box::new(backend));
1577        registry.init().ok().unwrap();
1578
1579        let dataset = create_test_dataset(5);
1580        let metadata = DatasetMetadata {
1581            description: "Test".to_string(),
1582            license: "MIT".to_string(),
1583            tags: vec!["ml".to_string()],
1584            source: None,
1585            citation: None,
1586            sha256: None,
1587        };
1588
1589        registry
1590            .publish("tagged-ds", "1.0.0", &dataset, metadata)
1591            .ok()
1592            .unwrap();
1593
1594        // Empty tag array should return empty results
1595        let results = registry.search_tags(&[]).ok().unwrap();
1596        assert!(results.is_empty());
1597    }
1598
1599    #[test]
1600    fn test_schema_to_json_with_nullable_fields() {
1601        // Create schema with nullable and non-nullable fields
1602        let schema = Arc::new(Schema::new(vec![
1603            Field::new("required_id", DataType::Int32, false),
1604            Field::new("optional_name", DataType::Utf8, true),
1605        ]));
1606
1607        let batch = RecordBatch::try_new(
1608            Arc::clone(&schema),
1609            vec![
1610                Arc::new(Int32Array::from(vec![1, 2, 3])),
1611                Arc::new(StringArray::from(vec!["a", "b", "c"])),
1612            ],
1613        )
1614        .unwrap();
1615
1616        let dataset = ArrowDataset::from_batch(batch).unwrap();
1617
1618        let json = schema_to_json(&dataset.schema());
1619        let fields = json.get("fields").and_then(|f| f.as_array()).unwrap();
1620
1621        assert_eq!(fields.len(), 2);
1622
1623        // Check nullable values
1624        let field0_nullable = fields[0].get("nullable").and_then(|v| v.as_bool());
1625        let field1_nullable = fields[1].get("nullable").and_then(|v| v.as_bool());
1626
1627        assert_eq!(field0_nullable, Some(false));
1628        assert_eq!(field1_nullable, Some(true));
1629    }
1630}