scirs2_datasets/
sample.rs

1//! Sample datasets for testing and demonstration
2//!
3//! This module provides larger, real-world datasets that can be downloaded
4//! and loaded for testing and demonstration purposes.
5
6use crate::error::{DatasetsError, Result};
7use crate::utils::Dataset;
8
9#[cfg(feature = "download")]
10use crate::cache::download_data;
11#[cfg(feature = "download")]
12use crate::loaders;
13
/// Base URL (without a trailing slash) under which the dataset resources are hosted.
///
/// The functions in this file build resource URLs as `"{base}/{file name}"`, so the
/// base must not end in `/`; otherwise the joined URL contains an empty path segment
/// (`.../main//wine.csv`).
// In this file the constant is only read by functions gated on the `download`
// feature, so it is unused (and would trigger a warning) when the feature is off.
#[allow(dead_code)]
const DATASET_BASE_URL: &str = "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main";
17
/// Load the California Housing dataset.
///
/// Fetches `california_housing.csv` via `download_data`, stages the bytes in a scratch
/// file inside the system temporary directory, parses that file with
/// `loaders::load_csv` and attaches feature names and a description to the result.
/// The scratch file is removed again on both the success and the failure path.
///
/// # Arguments
///
/// * `force_download` - Forwarded unchanged to `download_data` (force a new download
///   instead of using the cache)
///
/// # Errors
///
/// Returns an error if the download fails, if the scratch file cannot be written
/// (`DatasetsError::IoError`), or if the CSV loader fails.
#[cfg(feature = "download")]
pub fn load_california_housing(force_download: bool) -> Result<Dataset> {
    // Join base and file name with exactly one `/`, whether or not the base ends in one.
    let url = format!(
        "{}/california_housing.csv",
        DATASET_BASE_URL.trim_end_matches('/')
    );

    // Download or load from cache
    let data = download_data(&url, force_download)?;

    // The CSV loader reads from a path, so stage the bytes in a scratch file.
    // NOTE(review): the file name is fixed, so concurrent callers share this path.
    let temp_path = std::env::temp_dir().join("scirs2_california_housing.csv");

    // `fs::write` creates/truncates the file, writes every byte and closes the handle,
    // so no handle of ours is still open when the loader reads the file or when it is
    // removed below.
    if let Err(err) = std::fs::write(&temp_path, &data) {
        // A partially written file may exist; do not leave it behind.
        std::fs::remove_file(&temp_path).ok();
        return Err(DatasetsError::IoError(err));
    }

    // Parse first, clean up second, propagate third: this way a loader failure
    // no longer leaks the scratch file.
    // (`true` / `Some(8)` are presumably "has header" and the target column index.)
    let loaded = loaders::load_csv(&temp_path, true, Some(8));

    // Best-effort removal; a failure here must not mask the loader's result.
    std::fs::remove_file(&temp_path).ok();

    let dataset = loaded?;

    // Add metadata
    let feature_names = vec![
        "MedInc".to_string(),
        "HouseAge".to_string(),
        "AveRooms".to_string(),
        "AveBedrms".to_string(),
        "Population".to_string(),
        "AveOccup".to_string(),
        "Latitude".to_string(),
        "Longitude".to_string(),
    ];

    let description = "California Housing dataset
    
The data was derived from the 1990 U.S. census, using one row per census block group.
A block group is the smallest geographical unit for which the U.S. Census Bureau 
publishes sample data.

Features:
- MedInc: median income in block group
- HouseAge: median house age in block group
- AveRooms: average number of rooms per household
- AveBedrms: average number of bedrooms per household
- Population: block group population
- AveOccup: average number of household members
- Latitude: block group latitude
- Longitude: block group longitude

Target: Median house value for California districts, expressed in hundreds of thousands of dollars.

This dataset is useful for regression tasks."
        .to_string();

    Ok(dataset
        .with_feature_names(feature_names)
        .with_description(description))
}
80
81// Stub when download feature is not enabled
82#[cfg(not(feature = "download"))]
83/// Loads the California Housing dataset
84///
85/// This is a stub implementation when the download feature is not enabled.
86/// It returns an error informing the user to enable the download feature.
87///
88/// # Arguments
89///
90/// * `_force_download` - If true, force a new download instead of using cache
91///
92/// # Returns
93///
94/// * An error indicating that the download feature is not enabled
95pub fn load_california_housing(_force_download: bool) -> Result<Dataset> {
96    Err(DatasetsError::Other(
97        "Download feature is not enabled. Recompile with --features download".to_string(),
98    ))
99}
100
/// Load the Wine dataset.
///
/// Fetches `wine.csv` via `download_data`, stages the bytes in a scratch file inside
/// the system temporary directory, parses that file with `loaders::load_csv` and
/// attaches feature names, target names and a description to the result.
/// The scratch file is removed again on both the success and the failure path.
///
/// # Arguments
///
/// * `force_download` - Forwarded unchanged to `download_data` (force a new download
///   instead of using the cache)
///
/// # Errors
///
/// Returns an error if the download fails, if the scratch file cannot be written
/// (`DatasetsError::IoError`), or if the CSV loader fails.
#[cfg(feature = "download")]
pub fn load_wine(force_download: bool) -> Result<Dataset> {
    // Join base and file name with exactly one `/`, whether or not the base ends in one.
    let url = format!("{}/wine.csv", DATASET_BASE_URL.trim_end_matches('/'));

    // Download or load from cache
    let data = download_data(&url, force_download)?;

    // The CSV loader reads from a path, so stage the bytes in a scratch file.
    // NOTE(review): the file name is fixed, so concurrent callers share this path.
    let temp_path = std::env::temp_dir().join("scirs2_wine.csv");

    // `fs::write` creates/truncates the file, writes every byte and closes the handle,
    // so no handle of ours is still open when the loader reads the file or when it is
    // removed below.
    if let Err(err) = std::fs::write(&temp_path, &data) {
        // A partially written file may exist; do not leave it behind.
        std::fs::remove_file(&temp_path).ok();
        return Err(DatasetsError::IoError(err));
    }

    // Parse first, clean up second, propagate third: this way a loader failure
    // no longer leaks the scratch file.
    // (`true` / `Some(0)` are presumably "has header" and the target column index.)
    let loaded = loaders::load_csv(&temp_path, true, Some(0));

    // Best-effort removal; a failure here must not mask the loader's result.
    std::fs::remove_file(&temp_path).ok();

    let dataset = loaded?;

    // Add metadata
    let feature_names = vec![
        "alcohol".to_string(),
        "malic_acid".to_string(),
        "ash".to_string(),
        "alcalinity_of_ash".to_string(),
        "magnesium".to_string(),
        "total_phenols".to_string(),
        "flavanoids".to_string(),
        "nonflavanoid_phenols".to_string(),
        "proanthocyanins".to_string(),
        "color_intensity".to_string(),
        "hue".to_string(),
        "od280_od315_of_diluted_wines".to_string(),
        "proline".to_string(),
    ];

    let target_names = vec![
        "class_0".to_string(),
        "class_1".to_string(),
        "class_2".to_string(),
    ];

    let description = "Wine Recognition dataset
    
The data is the results of a chemical analysis of wines grown in the same region in Italy
but derived from three different cultivars. The analysis determined the quantities of
13 constituents found in each of the three types of wines.

Features: Various chemical properties of the wine

Target: Class of wine (0, 1, or 2)

This dataset is useful for classification tasks."
        .to_string();

    Ok(dataset
        .with_feature_names(feature_names)
        .with_target_names(target_names)
        .with_description(description))
}
167
168// Stub when download feature is not enabled
169#[cfg(not(feature = "download"))]
170/// Loads the Wine dataset
171///
172/// This is a stub implementation when the download feature is not enabled.
173/// It returns an error informing the user to enable the download feature.
174///
175/// # Arguments
176///
177/// * `_force_download` - If true, force a new download instead of using cache
178///
179/// # Returns
180///
181/// * An error indicating that the download feature is not enabled
182pub fn load_wine(_force_download: bool) -> Result<Dataset> {
183    Err(DatasetsError::Other(
184        "Download feature is not enabled. Recompile with --features download".to_string(),
185    ))
186}
187
/// Sample data fetcher - retrieves a list of available datasets.
///
/// Downloads `datasets_index.txt` from the dataset repository and returns its
/// non-blank lines, each trimmed of surrounding whitespace, in file order.
///
/// # Errors
///
/// Propagates any error from `download_data`, and returns
/// `DatasetsError::InvalidFormat` when the downloaded index is not valid UTF-8.
#[cfg(feature = "download")]
pub fn get_available_datasets() -> Result<Vec<String>> {
    // Join base and file name with exactly one `/`, whether or not the base ends in one.
    let url = format!(
        "{}/datasets_index.txt",
        DATASET_BASE_URL.trim_end_matches('/')
    );

    // `true` sits in the force-download position, so the index is re-fetched on
    // every call rather than served from cache (presumably to keep the list current).
    let data = download_data(&url, true)?;

    // Parse the list of datasets
    let content = String::from_utf8(data).map_err(|e| {
        DatasetsError::InvalidFormat(format!("Failed to parse datasets index: {}", e))
    })?;

    // Trim and filter on borrowed slices first so that blank lines never allocate.
    let datasets = content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .map(str::to_owned)
        .collect();

    Ok(datasets)
}
209
210// Stub when download feature is not enabled
211#[cfg(not(feature = "download"))]
212/// Retrieves a list of available datasets
213///
214/// This is a stub implementation when the download feature is not enabled.
215/// It returns an error informing the user to enable the download feature.
216///
217/// # Returns
218///
219/// * An error indicating that the download feature is not enabled
220pub fn get_available_datasets() -> Result<Vec<String>> {
221    Err(DatasetsError::Other(
222        "Download feature is not enabled. Recompile with --features download".to_string(),
223    ))
224}