scirs2_datasets/time_series.rs

//! Time series datasets.
//!
//! This module provides access to common time series datasets, including:
//! - Electrocardiogram (ECG) dataset
//! - Stock market dataset
//! - Weather dataset
//!
//! These datasets are designed for testing time series analysis algorithms,
//! signal processing, and forecasting methods.
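//!
//! # Examples
//!
//! A minimal usage sketch. The data file is fetched via the cache module, which
//! may download it on first use, so network access can be required:
//!
//! ```ignore
//! use scirs2_datasets::time_series::electrocardiogram;
//!
//! let ecg = electrocardiogram().unwrap();
//! println!("ECG data shape: ({}, {})", ecg.n_samples(), ecg.n_features());
//! ```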

use crate::cache::{fetch_data, RegistryEntry};
use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;
use scirs2_core::ndarray::{Array1, Array2};
use serde::Deserialize;
use std::collections::HashMap;
use std::fs;

// Registry mapping files to their SHA256 hashes and URLs
lazy_static::lazy_static! {
    static ref REGISTRY: HashMap<&'static str, RegistryEntry> = {
        let mut registry = HashMap::new();

        // ECG dataset
        registry.insert("ecg.dat", RegistryEntry {
            sha256: "f20ad3365fb9b7f845d0e5c48b6fe67081377ee466c3a220b7f69f35c8958baf",
            url: "https://raw.githubusercontent.com/scipy/dataset-ecg/main/ecg.dat",
        });

        // Stock market dataset
        registry.insert("stock_market.csv", RegistryEntry {
            sha256: "e6d5392bd79e82e3f6d7fe171d8c2fafae84b1a4e9e95a532ec252caa3053dc9",
            url: "https://raw.githubusercontent.com/scirs/datasets/main/stock_market.csv",
        });

        // Weather dataset
        registry.insert("weather.csv", RegistryEntry {
            sha256: "f8bdaef6d968c1eddb0c0c7cf9c245b07d60ffe3a7d8e5ed8953f5750ee0f610",
            url: "https://raw.githubusercontent.com/scirs/datasets/main/weather.csv",
        });

        registry
    };
}

/// Load an electrocardiogram as an example of a 1-D signal.
///
/// The returned signal is a 5-minute-long electrocardiogram (ECG), a medical
/// recording of the heart's electrical activity, sampled at 360 Hz.
///
/// # Returns
///
/// A Dataset containing:
/// - `data`: The electrocardiogram in millivolts (mV) sampled at 360 Hz (as a column vector)
/// - No target values
/// - Metadata including sampling rate
///
/// # Examples
///
/// ```
/// use scirs2_datasets::time_series::electrocardiogram;
///
/// let ecg = electrocardiogram().unwrap();
/// println!("ECG data shape: ({}, {})", ecg.n_samples(), ecg.n_features());
/// ```
#[allow(dead_code)]
pub fn electrocardiogram() -> Result<Dataset> {
    // Fetch the ECG data file
    let ecg_file = match fetch_data("ecg.dat", REGISTRY.get("ecg.dat")) {
        Ok(path) => path,
        Err(e) => {
            return Err(DatasetsError::LoadingError(format!(
                "Failed to fetch ECG data: {e}"
            )))
        }
    };

    // Read the raw file contents
    let ecg_data = match fs::read(ecg_file) {
        Ok(data) => data,
        Err(e) => {
            return Err(DatasetsError::LoadingError(format!(
                "Failed to read ECG data: {e}"
            )))
        }
    };

    // Parse the binary data: little-endian 16-bit unsigned integers.
    // Any trailing odd byte is ignored.
    let mut ecg_values = Vec::with_capacity(ecg_data.len() / 2);
    for chunk in ecg_data.chunks_exact(2) {
        ecg_values.push(u16::from_le_bytes([chunk[0], chunk[1]]));
    }

    // Convert raw ADC output to mV: (ecg - adc_zero) / adc_gain,
    // following SciPy's conversion with adc_zero = 1024 and adc_gain = 200
    let ecg_values = ecg_values
        .into_iter()
        .map(|x| (x as f64 - 1024.0) / 200.0)
        .collect::<Vec<f64>>();

    let ecg_array = Array1::from_vec(ecg_values);

    // Get the length before reshaping to avoid a borrow after move
    let len = ecg_array.len();

    // Convert the 1D array to a 2D column vector; reshaping `len` elements
    // into (len, 1) cannot fail, so the unwrap is safe
    let data = ecg_array.into_shape_with_order((len, 1)).unwrap();

    // Create the dataset
    let mut dataset = Dataset::new(data, None);
    dataset = dataset
        .with_featurenames(vec!["ecg".to_string()])
        .with_description("Electrocardiogram (ECG) data, 5 minutes sampled at 360 Hz".to_string())
        .with_metadata("sampling_rate", "360")
        .with_metadata("units", "mV")
        .with_metadata("duration", "5 minutes");

    Ok(dataset)
}

/// Stock market price data structure for parsing CSV
#[derive(Debug, Deserialize)]
struct StockPrice {
    date: String,
    open: f64,
    #[allow(dead_code)]
    high: f64,
    #[allow(dead_code)]
    low: f64,
    close: f64,
    #[allow(dead_code)]
    volume: f64,
    symbol: String,
}

/// Load stock market data for multiple companies.
///
/// This dataset contains historical daily price data for multiple companies,
/// which can be used for financial time series analysis.
///
/// # Parameters
///
/// * `returns` - If true, returns daily price changes (close - open) instead of absolute prices.
///
/// # Returns
///
/// A Dataset containing:
/// - `data`: Price data for multiple stocks over time
/// - No target values
/// - Feature names corresponding to stock symbols
/// - Metadata including date range and symbols
///
/// # Examples
///
/// ```ignore
/// use scirs2_datasets::time_series::stock_market;
///
/// let stock_data = stock_market(true).unwrap(); // Get price changes
/// println!("Stock data shape: ({}, {})", stock_data.n_samples(), stock_data.n_features());
/// ```
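///
/// Passing `false` keeps the absolute daily closing prices instead of the
/// changes, a sketch of the alternative call:
///
/// ```ignore
/// use scirs2_datasets::time_series::stock_market;
///
/// let prices = stock_market(false).unwrap(); // Closing price per symbol per day
/// println!("Price matrix shape: ({}, {})", prices.n_samples(), prices.n_features());
/// ```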
#[allow(dead_code)]
pub fn stock_market(returns: bool) -> Result<Dataset> {
    // Fetch the stock market data file
    let stock_file = match fetch_data("stock_market.csv", REGISTRY.get("stock_market.csv")) {
        Ok(path) => path,
        Err(e) => {
            return Err(DatasetsError::LoadingError(format!(
                "Failed to fetch stock market data: {e}"
            )))
        }
    };

    // Read and parse the CSV file
    let file_content = match fs::read_to_string(&stock_file) {
        Ok(content) => content,
        Err(e) => {
            return Err(DatasetsError::LoadingError(format!(
                "Failed to read stock market data: {e}"
            )))
        }
    };

    let mut reader = csv::Reader::from_reader(file_content.as_bytes());
    let records: Result<Vec<StockPrice>> = reader
        .deserialize()
        .map(|result| {
            result.map_err(|e| DatasetsError::LoadingError(format!("CSV parsing error: {e}")))
        })
        .collect();

    let records = records?;
    if records.is_empty() {
        return Err(DatasetsError::LoadingError(
            "Stock market data is empty".to_string(),
        ));
    }

    // Extract unique symbols and dates
    let mut symbols = Vec::new();
    let mut dates = Vec::new();
    for record in &records {
        if !symbols.contains(&record.symbol) {
            symbols.push(record.symbol.clone());
        }
        if !dates.contains(&record.date) {
            dates.push(record.date.clone());
        }
    }

    symbols.sort();
    dates.sort();

    // Create a mapping of (date, symbol) to price data
    let mut date_symbol_map = HashMap::new();
    for record in &records {
        date_symbol_map.insert((record.date.clone(), record.symbol.clone()), record);
    }

    // Create the data matrix (dates x symbols)
    let mut data = Array2::zeros((dates.len(), symbols.len()));

    for (i, date) in dates.iter().enumerate() {
        for (j, symbol) in symbols.iter().enumerate() {
            if let Some(record) = date_symbol_map.get(&(date.clone(), symbol.clone())) {
                data[[i, j]] = if returns {
                    record.close - record.open
                } else {
                    record.close
                };
            }
        }
    }

    // Create the dataset
    let mut dataset = Dataset::new(data, None);
    dataset = dataset
        .with_featurenames(symbols.clone())
        .with_description(format!(
            "Stock market data for {} companies from {} to {}",
            symbols.len(),
            dates.first().unwrap_or(&"unknown".to_string()),
            dates.last().unwrap_or(&"unknown".to_string())
        ))
        .with_metadata("n_symbols", &symbols.len().to_string())
        .with_metadata(
            "start_date",
            dates.first().unwrap_or(&"unknown".to_string()),
        )
        .with_metadata("end_date", dates.last().unwrap_or(&"unknown".to_string()))
        .with_metadata("data_type", if returns { "returns" } else { "prices" });

    Ok(dataset)
}

/// Weather observation data structure for parsing CSV
#[derive(Debug, Deserialize)]
struct WeatherObservation {
    date: String,
    temperature: f64,
    humidity: f64,
    pressure: f64,
    wind_speed: f64,
    precipitation: f64,
    location: String,
}

/// Load weather time series data from multiple locations.
///
/// This dataset contains daily weather measurements (temperature, humidity, pressure,
/// wind speed, and precipitation) for multiple locations.
///
/// # Parameters
///
/// * `feature` - Which weather feature to use as the primary data. Options are:
///   "temperature", "humidity", "pressure", "wind_speed", or "precipitation".
///   If None, returns all features.
///
/// # Returns
///
/// A Dataset containing:
/// - `data`: Weather data over time (dates x locations for a single feature, or
///   dates x (locations * features) when all features are returned)
/// - No target values
/// - Feature names corresponding to locations or weather measurements
/// - Metadata including date range and locations
///
/// # Examples
///
/// ```ignore
/// use scirs2_datasets::time_series::weather;
///
/// // Get temperature data for all locations
/// let temp_data = weather(Some("temperature")).unwrap();
/// println!("Temperature data shape: ({}, {})", temp_data.n_samples(), temp_data.n_features());
///
/// // Get all weather features
/// let all_weather = weather(None).unwrap();
/// println!("All weather data shape: ({}, {})", all_weather.n_samples(), all_weather.n_features());
/// ```
#[allow(dead_code)]
pub fn weather(feature: Option<&str>) -> Result<Dataset> {
    // Validate the feature parameter
    let valid_features = vec![
        "temperature",
        "humidity",
        "pressure",
        "wind_speed",
        "precipitation",
    ];

    if let Some(f) = feature {
        if !valid_features.contains(&f) {
            return Err(DatasetsError::InvalidFormat(format!(
                "Invalid feature: {f}. Valid features are: {valid_features:?}"
            )));
        }
    }

    // Fetch the weather data file
    let weather_file = match fetch_data("weather.csv", REGISTRY.get("weather.csv")) {
        Ok(path) => path,
        Err(e) => {
            return Err(DatasetsError::LoadingError(format!(
                "Failed to fetch weather data: {e}"
            )))
        }
    };

    // Read and parse the CSV file
    let file_content = match fs::read_to_string(&weather_file) {
        Ok(content) => content,
        Err(e) => {
            return Err(DatasetsError::LoadingError(format!(
                "Failed to read weather data: {e}"
            )))
        }
    };

    let mut reader = csv::Reader::from_reader(file_content.as_bytes());
    let records: Result<Vec<WeatherObservation>> = reader
        .deserialize()
        .map(|result| {
            result.map_err(|e| DatasetsError::LoadingError(format!("CSV parsing error: {e}")))
        })
        .collect();

    let records = records?;
    if records.is_empty() {
        return Err(DatasetsError::LoadingError(
            "Weather data is empty".to_string(),
        ));
    }

    // Extract unique locations and dates
    let mut locations = Vec::new();
    let mut dates = Vec::new();
    for record in &records {
        if !locations.contains(&record.location) {
            locations.push(record.location.clone());
        }
        if !dates.contains(&record.date) {
            dates.push(record.date.clone());
        }
    }

    locations.sort();
    dates.sort();

    // Create a mapping of (date, location) to weather data
    let mut date_location_map = HashMap::new();
    for record in &records {
        date_location_map.insert((record.date.clone(), record.location.clone()), record);
    }

    let mut dataset = match feature {
        Some(feat) => {
            // Single feature mode - create a 2D matrix (dates x locations)
            let mut data = Array2::zeros((dates.len(), locations.len()));

            for (i, date) in dates.iter().enumerate() {
                for (j, location) in locations.iter().enumerate() {
                    if let Some(record) = date_location_map.get(&(date.clone(), location.clone())) {
                        data[[i, j]] = match feat {
                            "temperature" => record.temperature,
                            "humidity" => record.humidity,
                            "pressure" => record.pressure,
                            "wind_speed" => record.wind_speed,
                            "precipitation" => record.precipitation,
                            _ => 0.0, // Should never happen due to validation above
                        };
                    }
                }
            }

            // Create the dataset
            let mut ds = Dataset::new(data, None);

            // Feature names are location names in this case
            ds = ds
                .with_featurenames(locations.clone())
                .with_description(format!(
                    "Weather {} data for {} locations from {} to {}",
                    feat,
                    locations.len(),
                    dates.first().unwrap_or(&"unknown".to_string()),
                    dates.last().unwrap_or(&"unknown".to_string())
                ))
                .with_metadata("feature", feat)
                .with_metadata("n_locations", &locations.len().to_string())
                .with_metadata(
                    "start_date",
                    dates.first().unwrap_or(&"unknown".to_string()),
                )
                .with_metadata("end_date", dates.last().unwrap_or(&"unknown".to_string()));

            ds
        }
423            // All features mode - create a 2D matrix (dates x (features*locations))
424            // Each location will have multiple columns, one for each _feature
425            let n_features = valid_features.len();
426            let mut data = Array2::zeros((dates.len(), n_features * locations.len()));
427
428            for (i, date) in dates.iter().enumerate() {
429                for (j, location) in locations.iter().enumerate() {
430                    if let Some(record) = date_location_map.get(&(date.clone(), location.clone())) {
431                        // Calculate base column index for this location
432                        let base_col = j * n_features;
433
434                        // Fill in all features for this location and date
435                        data[[i, base_col]] = record.temperature;
436                        data[[i, base_col + 1]] = record.humidity;
437                        data[[i, base_col + 2]] = record.pressure;
438                        data[[i, base_col + 3]] = record.wind_speed;
439                        data[[i, base_col + 4]] = record.precipitation;
440                    }
441                }
442            }
443
444            // Create _feature names: for each location, add each _feature
445            let mut featurenames = Vec::with_capacity(n_features * locations.len());
446            for location in &locations {
447                for feat in &valid_features {
448                    featurenames.push(format!("{location}_{feat}"));
449                }
450            }
451
452            // Create the dataset
453            let mut ds = Dataset::new(data, None);
454            ds = ds
455                .with_featurenames(featurenames)
456                .with_description(format!(
457                    "Weather data (all features) for {} locations from {} to {}",
458                    locations.len(),
459                    dates.first().unwrap_or(&"unknown".to_string()),
460                    dates.last().unwrap_or(&"unknown".to_string())
461                ))
462                .with_metadata("features", &valid_features.join(","))
463                .with_metadata("n_locations", &locations.len().to_string())
464                .with_metadata(
465                    "start_date",
466                    dates.first().unwrap_or(&"unknown".to_string()),
467                )
468                .with_metadata("end_date", dates.last().unwrap_or(&"unknown".to_string()));
469
470            ds
471        }
472    };
473
474    // Add locations metadata
475    dataset = dataset.with_metadata("locations", &locations.join(","));
476
477    Ok(dataset)
478}
479
// The fetch_data function is now provided by the cache module
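
// A minimal offline sanity-check sketch for the registry above: every dataset
// file referenced in this module should have an entry with a 64-character
// SHA256 string and a non-empty URL. These assertions do not download anything.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn registry_contains_all_dataset_files() {
        for name in ["ecg.dat", "stock_market.csv", "weather.csv"] {
            let entry = REGISTRY
                .get(name)
                .unwrap_or_else(|| panic!("missing registry entry for {name}"));
            assert!(!entry.url.is_empty(), "URL for {name} should not be empty");
            assert_eq!(entry.sha256.len(), 64, "SHA256 for {name} should be 64 hex chars");
        }
    }
}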