scirs2_datasets/
registry.rs

1//! Dataset registry system for managing dataset metadata and locations
2
3use crate::cache::RegistryEntry;
4use crate::error::{DatasetsError, Result};
5use std::collections::HashMap;
6
/// Lookup table of known datasets.
///
/// Each dataset name maps to the [`RegistryEntry`] describing where the
/// dataset can be fetched from and the checksum it is expected to have.
pub struct DatasetRegistry {
    /// Registry entries keyed by dataset name.
    entries: HashMap<String, RegistryEntry>,
}
12
13impl Default for DatasetRegistry {
14    fn default() -> Self {
15        let mut registry = Self::new();
16        registry.populate_default_datasets();
17        registry
18    }
19}
20
21impl DatasetRegistry {
22    /// Create a new empty registry
23    pub fn new() -> Self {
24        Self {
25            entries: HashMap::new(),
26        }
27    }
28
29    /// Register a new dataset with the given name and metadata
30    pub fn register(&mut self, name: String, entry: RegistryEntry) {
31        self.entries.insert(name, entry);
32    }
33
34    /// Get a registry entry by name
35    pub fn get(&self, name: &str) -> Option<&RegistryEntry> {
36        self.entries.get(name)
37    }
38
39    /// List all available dataset names
40    pub fn list_datasets(&self) -> Vec<String> {
41        self.entries.keys().cloned().collect()
42    }
43
44    /// Check if a dataset is registered
45    pub fn contains(&self, name: &str) -> bool {
46        self.entries.contains_key(name)
47    }
48
49    /// Populate the registry with default datasets
50    ///
51    /// Note: The SHA256 hashes below are placeholder values and must be updated
52    /// with actual hashes when the dataset files are available in the repository.
53    fn populate_default_datasets(&mut self) {
54        // Real-world datasets
55        self.register(
56            "california_housing".to_string(),
57            RegistryEntry {
58                url: "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main/california_housing.csv",
59                sha256: "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", // Placeholder - update with actual hash
60            },
61        );
62
63        self.register(
64            "wine".to_string(),
65            RegistryEntry {
66                url: "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main/wine.csv",
67                sha256: "d4e1c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b856", // Placeholder - update with actual hash
68            },
69        );
70
71        // Time series datasets
72        self.register(
73            "electrocardiogram".to_string(),
74            RegistryEntry {
75                url: "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main/electrocardiogram.json",
76                sha256: "a1b2c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b857", // Placeholder - update with actual hash
77            },
78        );
79
80        self.register(
81            "stock_market".to_string(),
82            RegistryEntry {
83                url: "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main/stock_market.json",
84                sha256: "f5e6c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b858", // Placeholder - update with actual hash
85            },
86        );
87
88        self.register(
89            "weather".to_string(),
90            RegistryEntry {
91                url:
92                    "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main/weather.json",
93                sha256: "b7c8c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b859", // Placeholder - update with actual hash
94            },
95        );
96    }
97}
98
99/// Get the global dataset registry
100pub fn get_registry() -> DatasetRegistry {
101    DatasetRegistry::default()
102}
103
/// Load a dataset by name.
///
/// Known names are dispatched to their dedicated loader. `force_download` is
/// forwarded only to the `crate::sample` loaders; the time series loaders are
/// called with fixed arguments.
///
/// # Errors
///
/// Returns `DatasetsError::Other` when `name` has no loader here: either the
/// name is registered but no loader is wired up for it, or the name is unknown
/// (in which case the message lists the registered names in sorted order).
/// Errors from the underlying loaders are returned unchanged.
#[cfg(feature = "download")]
pub fn load_dataset_by_name(name: &str, force_download: bool) -> Result<crate::utils::Dataset> {
    match name {
        "california_housing" => crate::sample::load_california_housing(force_download),
        "wine" => crate::sample::load_wine(force_download),
        "electrocardiogram" => crate::time_series::electrocardiogram(),
        "stock_market" => crate::time_series::stock_market(false), // Default to raw prices
        "weather" => crate::time_series::weather(None),            // Default to all features
        _ => {
            // The registry is only needed to build the error message, so it is
            // constructed here rather than on every successful call.
            let registry = get_registry();
            let message = if registry.contains(name) {
                format!(
                    "Dataset '{}' is registered but not yet implemented for loading",
                    name
                )
            } else {
                // Sort so the message does not depend on hash-map iteration order.
                let mut available = registry.list_datasets();
                available.sort_unstable();
                format!(
                    "Unknown dataset: '{}'. Available datasets: {:?}",
                    name, available
                )
            };
            Err(DatasetsError::Other(message))
        }
    }
}
131
132#[cfg(not(feature = "download"))]
133/// Load a dataset by name from the registry (stub for when download feature is disabled)
134pub fn load_dataset_by_name(_name: &str, _force_download: bool) -> Result<crate::utils::Dataset> {
135    Err(DatasetsError::Other(
136        "Download feature is not enabled. Recompile with --features download".to_string(),
137    ))
138}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// A registry made with `new()` starts out empty.
    #[test]
    fn test_registry_creation() {
        let registry = DatasetRegistry::new();
        assert!(registry.entries.is_empty());
    }

    /// The default registry contains every built-in dataset, each with an
    /// HTTPS URL and a hex digest of SHA-256 length.
    #[test]
    fn test_registry_default() {
        let registry = DatasetRegistry::default();
        assert!(!registry.entries.is_empty());

        let expected = [
            "california_housing",
            "wine",
            "electrocardiogram",
            "stock_market",
            "weather",
        ];
        for name in expected.iter() {
            assert!(
                registry.contains(name),
                "missing default dataset: {}",
                name
            );
            let entry = registry
                .get(name)
                .expect("contains() reported the dataset as registered");
            assert!(entry.url.starts_with("https://"));
            // A SHA-256 digest is 64 hexadecimal characters.
            assert_eq!(entry.sha256.len(), 64);
        }
    }

    /// Registering, looking up and listing a single custom entry.
    #[test]
    fn test_registry_operations() {
        let mut registry = DatasetRegistry::new();

        let entry = RegistryEntry {
            url: "https://example.com/test.csv",
            sha256: "abcd1234",
        };

        registry.register("test_dataset".to_string(), entry);

        assert!(registry.contains("test_dataset"));
        assert!(!registry.contains("nonexistent"));
        assert!(registry.get("nonexistent").is_none());

        let retrieved = registry
            .get("test_dataset")
            .expect("entry was registered just above");
        assert_eq!(retrieved.url, "https://example.com/test.csv");
        assert_eq!(retrieved.sha256, "abcd1234");

        let datasets = registry.list_datasets();
        assert_eq!(datasets.len(), 1);
        assert!(datasets.contains(&"test_dataset".to_string()));
    }

    /// `get_registry()` hands back a populated registry.
    #[test]
    fn test_get_registry() {
        let registry = get_registry();
        assert!(!registry.list_datasets().is_empty());
    }
}