scirs2_datasets/
sample.rs

1//! Sample datasets for testing and demonstration
2//!
3//! This module provides larger, real-world datasets that can be downloaded
4//! and loaded for testing and demonstration purposes.
5
6use crate::error::{DatasetsError, Result};
7use crate::utils::Dataset;
8
9#[cfg(feature = "download")]
10use crate::cache::download_data;
11#[cfg(feature = "download")]
12use crate::loaders;
13
/// Base URL for dataset resources.
///
/// Deliberately written without a trailing slash: every user in this module
/// appends `/<file name>` itself, so a trailing slash here would produce a
/// `main//<file name>` path in the request URL.
// Only referenced by the `download`-gated functions, hence the lint allowance.
#[allow(dead_code)]
const DATASET_BASE_URL: &str = "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main";
17
/// Load the California Housing dataset.
///
/// Fetches `california_housing.csv` through `download_data`, stages the bytes
/// in a file inside the system temporary directory, and parses that file with
/// the CSV loader (header row enabled, column index 8 used as the target).
/// The resulting dataset is annotated with the eight feature names and a
/// textual description.
///
/// # Arguments
///
/// * `force_download` - Forwarded unchanged to `download_data`; if true, a new
///   download is forced instead of using the cache.
///
/// # Errors
///
/// Propagates any error from `download_data` or the CSV loader, and returns
/// `DatasetsError::IoError` if the temporary file cannot be written.
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn load_california_housing(force_download: bool) -> Result<Dataset> {
    let url = format!("{DATASET_BASE_URL}/california_housing.csv");

    // Download or load from cache
    let data = download_data(&url, force_download)?;

    // The CSV loader works on a path, so stage the bytes in a temporary file.
    // `fs::write` creates, writes and closes the file in one step, which
    // guarantees the handle is released before the loader opens the file.
    let temppath = std::env::temp_dir().join("scirs2_california_housing.csv");
    std::fs::write(&temppath, &data).map_err(DatasetsError::IoError)?;

    let config = loaders::CsvConfig::new()
        .with_header(true)
        .with_target_column(Some(8));
    let loaded = loaders::load_csv(&temppath, config);

    // Best-effort cleanup. This runs before the loader result is propagated so
    // that a parse failure does not leave the temporary file behind.
    std::fs::remove_file(&temppath).ok();

    let dataset = loaded?;

    // Add metadata
    let featurenames: Vec<String> = [
        "MedInc",
        "HouseAge",
        "AveRooms",
        "AveBedrms",
        "Population",
        "AveOccup",
        "Latitude",
        "Longitude",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    let description = "California Housing dataset
    
The data was derived from the 1990 U.S. census, using one row per census block group.
A block group is the smallest geographical unit for which the U.S. Census Bureau 
publishes sample data.

Features:
- MedInc: median income in block group
- HouseAge: median house age in block group
- AveRooms: average number of rooms per household
- AveBedrms: average number of bedrooms per household
- Population: block group population
- AveOccup: average number of household members
- Latitude: block group latitude
- Longitude: block group longitude

Target: Median house value for California districts, expressed in hundreds of thousands of dollars.

This dataset is useful for regression tasks."
        .to_string();

    Ok(dataset
        .with_featurenames(featurenames)
        .with_description(description))
}
84
85// Stub when download feature is not enabled
86#[cfg(not(feature = "download"))]
87/// Loads the California Housing dataset
88///
89/// This is a stub implementation when the download feature is not enabled.
90/// It returns an error informing the user to enable the download feature.
91///
92/// # Arguments
93///
94/// * `_force_download` - If true, force a new download instead of using cache
95///
96/// # Returns
97///
98/// * An error indicating that the download feature is not enabled
99#[allow(dead_code)]
100pub fn load_california_housing(_forcedownload: bool) -> Result<Dataset> {
101    Err(DatasetsError::Other(
102        "Download feature is not enabled. Recompile with --features _download".to_string(),
103    ))
104}
105
/// Load the Wine dataset.
///
/// Fetches `wine.csv` through `download_data`, stages the bytes in a file
/// inside the system temporary directory, and parses that file with
/// `loaders::load_csv_legacy`. The resulting dataset is annotated with thirteen
/// feature names, three target class names and a textual description.
///
/// # Arguments
///
/// * `force_download` - Forwarded unchanged to `download_data`; if true, a new
///   download is forced instead of using the cache.
///
/// # Errors
///
/// Propagates any error from `download_data` or the CSV loader, and returns
/// `DatasetsError::IoError` if the temporary file cannot be written.
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn load_wine(force_download: bool) -> Result<Dataset> {
    let url = format!("{DATASET_BASE_URL}/wine.csv");

    // Download or load from cache
    let data = download_data(&url, force_download)?;

    // The CSV loader works on a path, so stage the bytes in a temporary file.
    // `fs::write` creates, writes and closes the file in one step, which
    // guarantees the handle is released before the loader opens the file.
    let temppath = std::env::temp_dir().join("scirs2_wine.csv");
    std::fs::write(&temppath, &data).map_err(DatasetsError::IoError)?;

    // NOTE(review): the arguments presumably mean "has header" and "target
    // column index 0" — confirm against `load_csv_legacy`.
    let loaded = loaders::load_csv_legacy(&temppath, true, Some(0));

    // Best-effort cleanup. This runs before the loader result is propagated so
    // that a parse failure does not leave the temporary file behind.
    std::fs::remove_file(&temppath).ok();

    let dataset = loaded?;

    // Add metadata
    let featurenames: Vec<String> = [
        "alcohol",
        "malic_acid",
        "ash",
        "alcalinity_of_ash",
        "magnesium",
        "total_phenols",
        "flavanoids",
        "nonflavanoid_phenols",
        "proanthocyanins",
        "color_intensity",
        "hue",
        "od280_od315_of_diluted_wines",
        "proline",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    let targetnames: Vec<String> = ["class_0", "class_1", "class_2"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    let description = "Wine Recognition dataset
    
The data is the results of a chemical analysis of wines grown in the same region in Italy
but derived from three different cultivars. The analysis determined the quantities of
13 constituents found in each of the three types of wines.

Features: Various chemical properties of the wine

Target: Class of wine (0, 1, or 2)

This dataset is useful for classification tasks."
        .to_string();

    Ok(dataset
        .with_featurenames(featurenames)
        .with_targetnames(targetnames)
        .with_description(description))
}
173
174// Stub when download feature is not enabled
175#[cfg(not(feature = "download"))]
176/// Loads the Wine dataset
177///
178/// This is a stub implementation when the download feature is not enabled.
179/// It returns an error informing the user to enable the download feature.
180///
181/// # Arguments
182///
183/// * `_force_download` - If true, force a new download instead of using cache
184///
185/// # Returns
186///
187/// * An error indicating that the download feature is not enabled
188#[allow(dead_code)]
189pub fn load_wine(_forcedownload: bool) -> Result<Dataset> {
190    Err(DatasetsError::Other(
191        "Download feature is not enabled. Recompile with --features _download".to_string(),
192    ))
193}
194
/// Fetch the names of the datasets listed in the remote index.
///
/// Retrieves `datasets_index.txt` via `download_data` (always passing `true`
/// as the force flag), decodes it as UTF-8 and returns one entry per
/// non-blank line, with surrounding whitespace trimmed.
///
/// # Errors
///
/// Propagates any error from `download_data`, and returns
/// `DatasetsError::InvalidFormat` when the index is not valid UTF-8.
#[cfg(feature = "download")]
#[allow(dead_code)]
pub fn get_available_datasets() -> Result<Vec<String>> {
    let index_url = format!("{DATASET_BASE_URL}/datasets_index.txt");
    let raw_index = download_data(&index_url, true)?;

    // Decode the raw bytes; a non-UTF-8 index is reported as a format error.
    let text = match String::from_utf8(raw_index) {
        Ok(text) => text,
        Err(e) => {
            return Err(DatasetsError::InvalidFormat(format!(
                "Failed to parse datasets index: {e}"
            )));
        }
    };

    // One dataset name per line; blank lines are skipped.
    let mut names = Vec::new();
    for line in text.lines() {
        let name = line.trim();
        if !name.is_empty() {
            names.push(name.to_string());
        }
    }

    Ok(names)
}
217
218// Stub when download feature is not enabled
219#[cfg(not(feature = "download"))]
220/// Retrieves a list of available datasets
221///
222/// This is a stub implementation when the download feature is not enabled.
223/// It returns an error informing the user to enable the download feature.
224///
225/// # Returns
226///
227/// * An error indicating that the download feature is not enabled
228#[allow(dead_code)]
229pub fn get_available_datasets() -> Result<Vec<String>> {
230    Err(DatasetsError::Other(
231        "Download feature is not enabled. Recompile with --features download".to_string(),
232    ))
233}