Skip to main content

alimentar/registry/
index.rs

1//! Registry index format and metadata structures.
2
3use serde::{Deserialize, Serialize};
4use serde_json::Value;
5
/// Registry index containing all dataset information.
///
/// The index is stored as JSON and contains metadata for all datasets
/// in the registry. No serde rename attributes are applied, so the field
/// names of this struct are the JSON keys.
///
/// # Example JSON
///
/// The `{ ... }` values below are placeholders for the nested `schema` and
/// `metadata` objects; the snippet is illustrative, not parseable as-is.
///
/// ```json
/// {
///   "version": "1.0",
///   "datasets": [
///     {
///       "name": "mnist",
///       "versions": ["1.0.0", "1.0.1"],
///       "latest": "1.0.1",
///       "size_bytes": 11490000,
///       "num_rows": 70000,
///       "schema": { ... },
///       "metadata": { ... }
///     }
///   ]
/// }
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegistryIndex {
    /// Index format version. [`RegistryIndex::new`] initializes it to `"1.0"`.
    pub version: String,
    /// List of all datasets in the registry, one [`DatasetInfo`] per dataset.
    /// Name lookups (`find` / `find_mut`) scan this list front to back.
    pub datasets: Vec<DatasetInfo>,
}
36
37impl RegistryIndex {
38    /// Creates a new empty registry index.
39    pub fn new() -> Self {
40        Self {
41            version: "1.0".to_string(),
42            datasets: Vec::new(),
43        }
44    }
45
46    /// Returns the number of datasets in the index.
47    pub fn len(&self) -> usize {
48        self.datasets.len()
49    }
50
51    /// Returns true if the index contains no datasets.
52    pub fn is_empty(&self) -> bool {
53        self.datasets.is_empty()
54    }
55
56    /// Finds a dataset by name.
57    pub fn find(&self, name: &str) -> Option<&DatasetInfo> {
58        self.datasets.iter().find(|d| d.name == name)
59    }
60
61    /// Finds a dataset by name (mutable).
62    pub fn find_mut(&mut self, name: &str) -> Option<&mut DatasetInfo> {
63        self.datasets.iter_mut().find(|d| d.name == name)
64    }
65}
66
67impl Default for RegistryIndex {
68    fn default() -> Self {
69        Self::new()
70    }
71}
72
/// Information about a dataset in the registry.
///
/// One entry of [`RegistryIndex::datasets`]. The field names double as the
/// JSON keys, since no serde rename attributes are applied.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetInfo {
    /// Unique name of the dataset. This is the key `RegistryIndex::find`
    /// and `RegistryIndex::find_mut` compare against.
    pub name: String,
    /// List of available versions, stored as plain strings.
    pub versions: Vec<String>,
    /// Latest version tag.
    // NOTE(review): nothing in this file checks that `latest` is also an
    // entry of `versions` — confirm whether callers maintain that invariant.
    pub latest: String,
    /// Total size in bytes.
    pub size_bytes: u64,
    /// Number of rows in the dataset.
    pub num_rows: usize,
    /// Arrow schema as JSON, kept as an untyped `serde_json::Value`.
    pub schema: Value,
    /// Dataset metadata.
    pub metadata: DatasetMetadata,
}
91
92impl DatasetInfo {
93    /// Checks if a specific version exists.
94    pub fn has_version(&self, version: &str) -> bool {
95        self.versions.contains(&version.to_string())
96    }
97
98    /// Returns the number of versions available.
99    pub fn version_count(&self) -> usize {
100        self.versions.len()
101    }
102}
103
/// Metadata describing a dataset.
///
/// The derived `Default` yields empty strings, an empty tag list and `None`
/// for every optional field. Optional fields that are `None` are omitted
/// from the serialized output (`skip_serializing_if = "Option::is_none"`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Human-readable description.
    pub description: String,
    /// License identifier (e.g., "MIT", "Apache-2.0", "CC-BY-4.0").
    pub license: String,
    /// Tags for categorization.
    pub tags: Vec<String>,
    /// Original source URL. Left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source: Option<String>,
    /// Citation information. Left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub citation: Option<String>,
    /// SHA-256 hash of the dataset content (hex string). Left out of the
    /// serialized form when `None`.
    // NOTE(review): the value is stored as given; nothing in this file
    // validates that it is well-formed hex — confirm where that happens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sha256: Option<String>,
}
123
124impl DatasetMetadata {
125    /// Creates a new metadata builder.
126    pub fn builder() -> DatasetMetadataBuilder {
127        DatasetMetadataBuilder::default()
128    }
129
130    /// Creates metadata with just a description.
131    pub fn with_description(description: impl Into<String>) -> Self {
132        Self {
133            description: description.into(),
134            ..Default::default()
135        }
136    }
137}
138
/// Builder for constructing DatasetMetadata.
///
/// Created through `DatasetMetadata::builder()` or `Default`. Each field
/// mirrors the `DatasetMetadata` field of the same name and is moved into
/// the result unchanged by `build`.
#[derive(Debug, Default)]
pub struct DatasetMetadataBuilder {
    /// Pending description; empty until `description` is called.
    description: String,
    /// Pending license identifier; empty until `license` is called.
    license: String,
    /// Pending tags; appended to by `tag`, replaced wholesale by `tags`.
    tags: Vec<String>,
    /// Pending source URL; `None` until `source` is called.
    source: Option<String>,
    /// Pending citation text; `None` until `citation` is called.
    citation: Option<String>,
    /// Pending SHA-256 hash; `None` until `sha256` is called.
    sha256: Option<String>,
}
149
150impl DatasetMetadataBuilder {
151    /// Sets the description.
152    #[must_use]
153    pub fn description(mut self, description: impl Into<String>) -> Self {
154        self.description = description.into();
155        self
156    }
157
158    /// Sets the license.
159    #[must_use]
160    pub fn license(mut self, license: impl Into<String>) -> Self {
161        self.license = license.into();
162        self
163    }
164
165    /// Adds a tag.
166    #[must_use]
167    pub fn tag(mut self, tag: impl Into<String>) -> Self {
168        self.tags.push(tag.into());
169        self
170    }
171
172    /// Sets multiple tags.
173    #[must_use]
174    pub fn tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
175        self.tags = tags.into_iter().map(Into::into).collect();
176        self
177    }
178
179    /// Sets the source URL.
180    #[must_use]
181    pub fn source(mut self, source: impl Into<String>) -> Self {
182        self.source = Some(source.into());
183        self
184    }
185
186    /// Sets the citation.
187    #[must_use]
188    pub fn citation(mut self, citation: impl Into<String>) -> Self {
189        self.citation = Some(citation.into());
190        self
191    }
192
193    /// Sets the SHA-256 hash for data provenance.
194    #[must_use]
195    pub fn sha256(mut self, hash: impl Into<String>) -> Self {
196        self.sha256 = Some(hash.into());
197        self
198    }
199
200    /// Builds the metadata.
201    pub fn build(self) -> DatasetMetadata {
202        DatasetMetadata {
203            description: self.description,
204            license: self.license,
205            tags: self.tags,
206            source: self.source,
207            citation: self.citation,
208            sha256: self.sha256,
209        }
210    }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `DatasetInfo` fixture named `name` that offers `versions`.
    ///
    /// `latest` is the last entry of `versions` (empty when there are none);
    /// the remaining fields hold fixed placeholder values.
    fn dataset(name: &str, versions: &[&str]) -> DatasetInfo {
        DatasetInfo {
            name: name.to_string(),
            versions: versions.iter().map(|v| (*v).to_string()).collect(),
            latest: versions
                .last()
                .map(|v| (*v).to_string())
                .unwrap_or_default(),
            size_bytes: 1000,
            num_rows: 100,
            schema: serde_json::json!({}),
            metadata: DatasetMetadata::default(),
        }
    }

    /// Builds an index holding a single `"test"` dataset at version `1.0.0`.
    fn index_with_test_dataset() -> RegistryIndex {
        let mut index = RegistryIndex::new();
        index.datasets.push(dataset("test", &["1.0.0"]));
        index
    }

    #[test]
    fn test_registry_index_new() {
        let index = RegistryIndex::new();
        assert_eq!(index.version, "1.0");
        assert!(index.is_empty());
    }

    #[test]
    fn test_registry_index_find() {
        let index = index_with_test_dataset();

        assert!(index.find("test").is_some());
        assert!(index.find("nonexistent").is_none());
    }

    #[test]
    fn test_dataset_info_has_version() {
        let info = dataset("test", &["1.0.0", "2.0.0"]);

        assert!(info.has_version("1.0.0"));
        assert!(info.has_version("2.0.0"));
        assert!(!info.has_version("3.0.0"));
        assert_eq!(info.version_count(), 2);
    }

    #[test]
    fn test_metadata_builder() {
        let metadata = DatasetMetadata::builder()
            .description("A test dataset")
            .license("MIT")
            .tag("test")
            .tag("example")
            .source("https://example.com")
            .build();

        assert_eq!(metadata.description, "A test dataset");
        assert_eq!(metadata.license, "MIT");
        assert_eq!(metadata.tags, vec!["test", "example"]);
        assert_eq!(metadata.source, Some("https://example.com".to_string()));
        assert!(metadata.citation.is_none());
    }

    #[test]
    fn test_metadata_with_description() {
        let metadata = DatasetMetadata::with_description("Simple description");
        assert_eq!(metadata.description, "Simple description");
        assert!(metadata.license.is_empty());
    }

    #[test]
    fn test_registry_index_serialization() {
        let mut index = RegistryIndex::new();
        index.datasets.push(DatasetInfo {
            schema: serde_json::json!({"fields": []}),
            metadata: DatasetMetadata::builder()
                .description("Test dataset")
                .license("MIT")
                .build(),
            ..dataset("test", &["1.0.0"])
        });

        let json = serde_json::to_string(&index)
            .unwrap_or_else(|e| panic!("Should serialize: {}", e));
        let parsed: RegistryIndex = serde_json::from_str(&json)
            .unwrap_or_else(|e| panic!("Should deserialize: {}", e));

        // Verify the content survived the round trip, not merely that both
        // directions returned `Ok`.
        assert_eq!(parsed.version, index.version);
        assert_eq!(parsed.len(), 1);

        let restored = match parsed.find("test") {
            Some(found) => found,
            None => panic!("dataset `test` should survive the round trip"),
        };
        assert_eq!(restored.versions, vec!["1.0.0"]);
        assert_eq!(restored.latest, "1.0.0");
        assert_eq!(restored.size_bytes, 1000);
        assert_eq!(restored.num_rows, 100);
        assert_eq!(restored.schema, serde_json::json!({"fields": []}));
        assert_eq!(restored.metadata.description, "Test dataset");
        assert_eq!(restored.metadata.license, "MIT");

        // `None` optionals are skipped when serializing and come back as `None`.
        assert!(!json.contains("\"sha256\""));
        assert!(restored.metadata.source.is_none());
        assert!(restored.metadata.citation.is_none());
        assert!(restored.metadata.sha256.is_none());
    }

    #[test]
    fn test_registry_index_len() {
        let mut index = RegistryIndex::new();
        assert_eq!(index.len(), 0);

        index.datasets.push(dataset("test", &[]));
        assert_eq!(index.len(), 1);
    }

    #[test]
    fn test_metadata_builder_tags() {
        let metadata = DatasetMetadata::builder().tags(["a", "b", "c"]).build();

        assert_eq!(metadata.tags, vec!["a", "b", "c"]);

        // `tags` replaces whatever `tag` added earlier.
        let replaced = DatasetMetadata::builder().tag("old").tags(["new"]).build();
        assert_eq!(replaced.tags, vec!["new"]);
    }

    #[test]
    fn test_registry_index_find_mut() {
        let mut index = index_with_test_dataset();

        match index.find_mut("test") {
            Some(found) => found.size_bytes = 2000,
            None => panic!("dataset `test` should be present"),
        }
        assert_eq!(index.find("test").map(|d| d.size_bytes), Some(2000));

        assert!(index.find_mut("nonexistent").is_none());
    }

    #[test]
    fn test_registry_index_default() {
        let index = RegistryIndex::default();
        assert_eq!(index.version, "1.0");
        assert!(index.is_empty());
    }

    #[test]
    fn test_registry_index_clone() {
        let index = index_with_test_dataset();

        let cloned = index.clone();
        assert_eq!(cloned.len(), index.len());
        assert_eq!(cloned.version, index.version);
    }

    #[test]
    fn test_registry_index_debug() {
        let index = RegistryIndex::new();
        let debug = format!("{:?}", index);
        assert!(debug.contains("RegistryIndex"));
    }

    #[test]
    fn test_dataset_info_clone() {
        let info = dataset("test", &["1.0.0"]);
        let cloned = info.clone();
        assert_eq!(cloned.name, info.name);
    }

    #[test]
    fn test_dataset_info_debug() {
        let info = dataset("test", &[]);
        let debug = format!("{:?}", info);
        assert!(debug.contains("DatasetInfo"));
    }

    #[test]
    fn test_dataset_metadata_clone() {
        let metadata = DatasetMetadata {
            description: "desc".to_string(),
            license: "MIT".to_string(),
            tags: vec!["a".to_string()],
            source: Some("http://example.com".to_string()),
            citation: Some("citation".to_string()),
            sha256: Some("abc123".to_string()),
        };
        let cloned = metadata.clone();
        assert_eq!(cloned.description, metadata.description);
        assert_eq!(cloned.sha256, metadata.sha256);
    }

    #[test]
    fn test_dataset_metadata_debug() {
        let metadata = DatasetMetadata::default();
        let debug = format!("{:?}", metadata);
        assert!(debug.contains("DatasetMetadata"));
    }

    #[test]
    fn test_metadata_builder_all_fields() {
        let metadata = DatasetMetadata::builder()
            .description("Test")
            .license("MIT")
            .tag("tag1")
            .source("http://source.com")
            .citation("Citation text")
            .sha256("abc123def456")
            .build();

        assert_eq!(metadata.description, "Test");
        assert_eq!(metadata.license, "MIT");
        assert_eq!(metadata.tags, vec!["tag1"]);
        assert_eq!(metadata.source, Some("http://source.com".to_string()));
        assert_eq!(metadata.citation, Some("Citation text".to_string()));
        assert_eq!(metadata.sha256, Some("abc123def456".to_string()));
    }

    #[test]
    fn test_dataset_metadata_builder_debug() {
        let builder = DatasetMetadataBuilder::default();
        let debug = format!("{:?}", builder);
        assert!(debug.contains("DatasetMetadataBuilder"));
    }

    #[test]
    fn test_dataset_metadata_default() {
        let metadata = DatasetMetadata::default();
        assert!(metadata.description.is_empty());
        assert!(metadata.license.is_empty());
        assert!(metadata.tags.is_empty());
        assert!(metadata.source.is_none());
        assert!(metadata.citation.is_none());
        assert!(metadata.sha256.is_none());
    }
}