scirs2_datasets/
sample.rs

1//! Sample datasets for testing and demonstration
2//!
3//! This module provides larger, real-world datasets that can be downloaded
4//! and loaded for testing and demonstration purposes.
5
6use crate::error::{DatasetsError, Result};
7use crate::utils::Dataset;
8
/// Base URL under which the sample dataset files are hosted.
///
/// Deliberately written WITHOUT a trailing `/`: every user in this module
/// joins it as `format!("{}/<file>", DATASET_BASE_URL)`, so a trailing slash
/// here would produce a malformed `...//<file>` path.
// Only the functions gated on `feature = "download"` read this constant, so it
// is unused (and would warn) in builds without that feature.
#[allow(dead_code)]
const DATASET_BASE_URL: &str = "https://raw.githubusercontent.com/cool-japan/scirs-datasets/main";
12
/// Load the California Housing dataset.
///
/// The CSV is fetched through `download_data`, staged in a file inside the
/// system temporary directory, parsed with `loaders::load_csv`, and the
/// resulting dataset is annotated with its eight feature names and a textual
/// description. The staging file is removed again before returning, whether
/// or not parsing succeeded.
///
/// # Arguments
///
/// * `force_download` - Forwarded unchanged to `download_data`; presumably
///   `true` bypasses any cached copy (confirm against that helper).
///
/// # Errors
///
/// * Propagates any error returned by `download_data` or `loaders::load_csv`.
/// * Returns `DatasetsError::IoError` if the staging file cannot be written.
#[cfg(feature = "download")]
pub fn load_california_housing(force_download: bool) -> Result<Dataset> {
    // NOTE(review): `download_data` and `loaders` are not imported in the
    // visible part of this file; confirm they are in scope when the
    // `download` feature is enabled.

    // Strip any trailing '/' from the base before joining so the URL never
    // contains a doubled slash, regardless of how the constant is written.
    let url = format!(
        "{}/california_housing.csv",
        DATASET_BASE_URL.trim_end_matches('/')
    );

    let data = download_data(&url, force_download)?;

    // Stage the bytes on disk because the CSV loader is given a path.
    let temp_path = std::env::temp_dir().join("scirs2_california_housing.csv");

    // `fs::write` creates/truncates, writes everything and closes the handle,
    // so the loader never reads a file that is still open for writing.
    if let Err(err) = std::fs::write(&temp_path, &data) {
        // Best-effort: do not leave a partially written file behind.
        std::fs::remove_file(&temp_path).ok();
        return Err(DatasetsError::IoError(err));
    }

    // `true` / `Some(8)` are presumably the has-header flag and the target
    // column index (the column after the eight features) — TODO confirm
    // against `loaders::load_csv`.
    let loaded = loaders::load_csv(&temp_path, true, Some(8));

    // Clean up BEFORE propagating a parse error; otherwise a failed load
    // would leak the staging file. Removal itself stays best-effort.
    std::fs::remove_file(&temp_path).ok();
    let dataset = loaded?;

    let feature_names: Vec<String> = [
        "MedInc",
        "HouseAge",
        "AveRooms",
        "AveBedrms",
        "Population",
        "AveOccup",
        "Latitude",
        "Longitude",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    // Whitespace inside this literal (including trailing spaces) is runtime
    // text and is kept exactly as it was.
    let description = "California Housing dataset
    
The data was derived from the 1990 U.S. census, using one row per census block group.
A block group is the smallest geographical unit for which the U.S. Census Bureau 
publishes sample data.

Features:
- MedInc: median income in block group
- HouseAge: median house age in block group
- AveRooms: average number of rooms per household
- AveBedrms: average number of bedrooms per household
- Population: block group population
- AveOccup: average number of household members
- Latitude: block group latitude
- Longitude: block group longitude

Target: Median house value for California districts, expressed in hundreds of thousands of dollars.

This dataset is useful for regression tasks."
        .to_string();

    Ok(dataset
        .with_feature_names(feature_names)
        .with_description(description))
}
77
78// Stub when download feature is not enabled
79#[cfg(not(feature = "download"))]
80/// Loads the California Housing dataset
81///
82/// This is a stub implementation when the download feature is not enabled.
83/// It returns an error informing the user to enable the download feature.
84///
85/// # Arguments
86///
87/// * `_force_download` - If true, force a new download instead of using cache
88///
89/// # Returns
90///
91/// * An error indicating that the download feature is not enabled
92pub fn load_california_housing(_force_download: bool) -> Result<Dataset> {
93    Err(DatasetsError::Other(
94        "Download feature is not enabled. Recompile with --features download".to_string(),
95    ))
96}
97
/// Load the Wine recognition dataset.
///
/// The CSV is fetched through `download_data`, staged in a file inside the
/// system temporary directory, parsed with `loaders::load_csv`, and the
/// resulting dataset is annotated with its thirteen feature names, three
/// target class names and a textual description. The staging file is removed
/// again before returning, whether or not parsing succeeded.
///
/// # Arguments
///
/// * `force_download` - Forwarded unchanged to `download_data`; presumably
///   `true` bypasses any cached copy (confirm against that helper).
///
/// # Errors
///
/// * Propagates any error returned by `download_data` or `loaders::load_csv`.
/// * Returns `DatasetsError::IoError` if the staging file cannot be written.
#[cfg(feature = "download")]
pub fn load_wine(force_download: bool) -> Result<Dataset> {
    // NOTE(review): `download_data` and `loaders` are not imported in the
    // visible part of this file; confirm they are in scope when the
    // `download` feature is enabled.

    // Strip any trailing '/' from the base before joining so the URL never
    // contains a doubled slash, regardless of how the constant is written.
    let url = format!("{}/wine.csv", DATASET_BASE_URL.trim_end_matches('/'));

    let data = download_data(&url, force_download)?;

    // Stage the bytes on disk because the CSV loader is given a path.
    let temp_path = std::env::temp_dir().join("scirs2_wine.csv");

    // `fs::write` creates/truncates, writes everything and closes the handle,
    // so the loader never reads a file that is still open for writing.
    if let Err(err) = std::fs::write(&temp_path, &data) {
        // Best-effort: do not leave a partially written file behind.
        std::fs::remove_file(&temp_path).ok();
        return Err(DatasetsError::IoError(err));
    }

    // `true` / `Some(0)` are presumably the has-header flag and the target
    // column index (class label in the first column) — TODO confirm against
    // `loaders::load_csv`.
    let loaded = loaders::load_csv(&temp_path, true, Some(0));

    // Clean up BEFORE propagating a parse error; otherwise a failed load
    // would leak the staging file. Removal itself stays best-effort.
    std::fs::remove_file(&temp_path).ok();
    let dataset = loaded?;

    let feature_names: Vec<String> = [
        "alcohol",
        "malic_acid",
        "ash",
        "alcalinity_of_ash",
        "magnesium",
        "total_phenols",
        "flavanoids",
        "nonflavanoid_phenols",
        "proanthocyanins",
        "color_intensity",
        "hue",
        "od280_od315_of_diluted_wines",
        "proline",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    let target_names: Vec<String> = ["class_0", "class_1", "class_2"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    // Whitespace inside this literal is runtime text and is kept exactly as
    // it was.
    let description = "Wine Recognition dataset
    
The data is the results of a chemical analysis of wines grown in the same region in Italy
but derived from three different cultivars. The analysis determined the quantities of
13 constituents found in each of the three types of wines.

Features: Various chemical properties of the wine

Target: Class of wine (0, 1, or 2)

This dataset is useful for classification tasks."
        .to_string();

    Ok(dataset
        .with_feature_names(feature_names)
        .with_target_names(target_names)
        .with_description(description))
}
166
167// Stub when download feature is not enabled
168#[cfg(not(feature = "download"))]
169/// Loads the Wine dataset
170///
171/// This is a stub implementation when the download feature is not enabled.
172/// It returns an error informing the user to enable the download feature.
173///
174/// # Arguments
175///
176/// * `_force_download` - If true, force a new download instead of using cache
177///
178/// # Returns
179///
180/// * An error indicating that the download feature is not enabled
181pub fn load_wine(_force_download: bool) -> Result<Dataset> {
182    Err(DatasetsError::Other(
183        "Download feature is not enabled. Recompile with --features download".to_string(),
184    ))
185}
186
/// Retrieve the list of dataset names published in the remote index file.
///
/// Fetches `datasets_index.txt` from the dataset base URL, decodes it as
/// UTF-8 and returns one entry per non-blank line, with surrounding
/// whitespace trimmed.
///
/// # Errors
///
/// * Propagates any error returned by `download_data`.
/// * Returns `DatasetsError::InvalidFormat` if the index is not valid UTF-8.
#[cfg(feature = "download")]
pub fn get_available_datasets() -> Result<Vec<String>> {
    // NOTE(review): `download_data` is not imported in the visible part of
    // this file; confirm it is in scope when the `download` feature is enabled.

    // Strip any trailing '/' from the base before joining so the URL never
    // contains a doubled slash, regardless of how the constant is written.
    let url = format!(
        "{}/datasets_index.txt",
        DATASET_BASE_URL.trim_end_matches('/')
    );

    // `true` is passed where the dataset loaders pass `force_download`, so the
    // index is presumably always re-fetched rather than served from a cache.
    let data = download_data(&url, true)?;

    let content = String::from_utf8(data).map_err(|e| {
        DatasetsError::InvalidFormat(format!("Failed to parse datasets index: {}", e))
    })?;

    // Trim first, drop blank lines, and only then allocate owned strings.
    let datasets = content
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .map(String::from)
        .collect();

    Ok(datasets)
}
208
209// Stub when download feature is not enabled
210#[cfg(not(feature = "download"))]
211/// Retrieves a list of available datasets
212///
213/// This is a stub implementation when the download feature is not enabled.
214/// It returns an error informing the user to enable the download feature.
215///
216/// # Returns
217///
218/// * An error indicating that the download feature is not enabled
219pub fn get_available_datasets() -> Result<Vec<String>> {
220    Err(DatasetsError::Other(
221        "Download feature is not enabled. Recompile with --features download".to_string(),
222    ))
223}