use std::collections::HashMap;
use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::random::prelude::*;
use scirs2_core::random::{Distribution, Uniform};
use serde::{Deserialize, Serialize};
use crate::cache::DatasetCache;
use crate::error::{DatasetsError, Result};
use crate::external::ExternalClient;
use crate::utils::Dataset;
/// Settings describing how a domain-specific data source is accessed.
///
/// The type is serde-serializable; `DomainConfig::default()` leaves `base_url` and
/// `api_key` unset and fills in a default format list and default quality filters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DomainConfig {
/// Optional base URL; `None` in the default configuration.
pub base_url: Option<String>,
/// Optional API key; `None` in the default configuration.
pub api_key: Option<String>,
/// File-format names (the default lists "csv", "fits" and "hdf5").
/// NOTE(review): presumably ordered by preference — nothing in this file reads the order.
pub preferred_formats: Vec<String>,
/// Quality thresholds associated with this configuration; see `QualityFilters`.
pub quality_filters: QualityFilters,
}
/// Default configuration: no URL, no API key, the formats "csv", "fits" and "hdf5",
/// and the default quality filters.
impl Default for DomainConfig {
    fn default() -> Self {
        let preferred_formats: Vec<String> = ["csv", "fits", "hdf5"]
            .iter()
            .map(|name| String::from(*name))
            .collect();
        let quality_filters = QualityFilters::default();

        Self {
            base_url: None,
            api_key: None,
            preferred_formats,
            quality_filters,
        }
    }
}
/// Optional thresholds describing the minimum quality expected from a dataset.
///
/// Every field is optional. Nothing in this file applies the filters, so the exact
/// comparison semantics are not visible here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityFilters {
/// Minimum number of samples (default `Some(100)`).
pub min_samples: Option<usize>,
/// Maximum share of missing values (default `Some(0.1)`).
/// NOTE(review): despite the name, the default looks like a fraction, not a percentage — confirm.
pub max_missing_percent: Option<f64>,
/// Minimum completeness (default `Some(0.9)`); presumably a fraction in [0, 1] — confirm.
pub min_completeness: Option<f64>,
/// Earliest acceptable year (default `Some(2000)`).
pub min_year: Option<u32>,
}
/// Default thresholds: at least 100 samples, at most 0.1 missing, at least 0.9
/// completeness, and nothing older than the year 2000.
impl Default for QualityFilters {
    fn default() -> Self {
        let min_samples = Some(100);
        let max_missing_percent = Some(0.1);
        let min_completeness = Some(0.9);
        let min_year = Some(2000);

        Self {
            min_samples,
            max_missing_percent,
            min_completeness,
            min_year,
        }
    }
}
pub mod astronomy {
use super::*;
/// Loader for astronomy datasets.
///
/// Every loader visible in this module produces synthetically generated data; the
/// `client` and `cache` fields are constructed in `new` but not read by any method here.
pub struct StellarDatasets {
// Held for future use; no method in this module reads it, hence the lint allowance.
#[allow(dead_code)]
client: ExternalClient,
// Cache rooted in the directory chosen by `new`; currently unused as well.
#[allow(dead_code)]
cache: DatasetCache,
}
impl StellarDatasets {
/// Creates the astronomy loader with a cache under `<platform cache dir>/scirs2-datasets`.
///
/// # Errors
/// Returns `DatasetsError::Other` when the platform cache directory cannot be determined,
/// and forwards any error produced by `ExternalClient::new`.
pub fn new() -> Result<Self> {
    let missing_cache_dir =
        || DatasetsError::Other("Could not determine cache directory".to_string());
    let platform_cache = crate::platform_dirs::cache_dir().ok_or_else(missing_cache_dir)?;
    let cachedir = platform_cache.join("scirs2-datasets");

    let client = ExternalClient::new()?;
    let cache = DatasetCache::new(cachedir);
    Ok(Self { client, cache })
}
/// Returns a synthetic stellar catalog labelled "hipparcos" with 118,218 stars.
pub fn load_hipparcos_catalog(&self) -> Result<Dataset> {
    let star_count: usize = 118_218;
    self.load_synthetic_stellar_data("hipparcos", star_count)
}
/// Returns a synthetic stellar catalog labelled "gaia_dr3" with 50,000 stars.
pub fn load_gaia_dr3_sample(&self) -> Result<Dataset> {
    let star_count: usize = 50_000;
    self.load_synthetic_stellar_data("gaia_dr3", star_count)
}
/// Returns a synthetic exoplanet catalog with 5,000 planets.
pub fn load_exoplanet_catalog(&self) -> Result<Dataset> {
    let planet_count: usize = 5_000;
    self.load_synthetic_exoplanet_data(planet_count)
}
/// Returns a synthetic supernova catalog with 1,000 events.
pub fn load_supernova_photometry(&self) -> Result<Dataset> {
    let event_count: usize = 1_000;
    self.load_synthetic_supernova_data(event_count)
}
/// Generates a synthetic stellar catalog with `nstars` rows and eight features per star.
///
/// Columns, in order: right ascension, declination, apparent magnitude, B-V colour,
/// parallax (floored at 0.1), two proper-motion components and radial velocity.
/// The target is a spectral-class index 0..=6 (O, B, A, F, G, K, M) derived from the
/// B-V colour alone. `catalog` is only used to label the dataset description.
///
/// Changes from the previous revision: the description no longer contains the stray
/// `_stars` token, and the parallax column is labelled in milliarcseconds, which matches
/// the scale of the generated values (mean 10) and the mas/year proper motions.
///
/// # Errors
/// Returns `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(nstars, 8)`.
fn load_synthetic_stellar_data(&self, catalog: &str, nstars: usize) -> Result<Dataset> {
    use scirs2_core::random::{Distribution, Normal};

    const N_FEATURES: usize = 8;
    // Exclusive upper B-V bounds for classes O, B, A, F, G and K; anything redder is M.
    const CLASS_UPPER_BOUNDS: [f64; 6] = [-0.3, -0.1, 0.2, 0.5, 0.8, 1.2];

    let mut rng = thread_rng();
    let mut data = Vec::with_capacity(nstars * N_FEATURES);
    let mut spectral_classes = Vec::with_capacity(nstars);

    let ra_dist = scirs2_core::random::Uniform::new(0.0, 360.0).expect("Operation failed");
    let dec_dist = scirs2_core::random::Uniform::new(-90.0, 90.0).expect("Operation failed");
    let magnitude_dist = Normal::new(8.0, 3.0).expect("Operation failed");
    let color_dist = Normal::new(0.5, 0.3).expect("Operation failed");
    let parallax_dist = Normal::new(10.0, 5.0).expect("Operation failed");
    let proper_motion_dist = Normal::new(0.0, 50.0).expect("Operation failed");
    let radial_velocity_dist = Normal::new(0.0, 30.0).expect("Operation failed");

    for _ in 0..nstars {
        let ra = ra_dist.sample(&mut rng);
        let dec = dec_dist.sample(&mut rng);
        let magnitude = magnitude_dist.sample(&mut rng);
        let color: f64 = color_dist.sample(&mut rng);
        let parallax: f64 = parallax_dist.sample(&mut rng);
        let pm_ra = proper_motion_dist.sample(&mut rng);
        let pm_dec = proper_motion_dist.sample(&mut rng);
        let radial_velocity = radial_velocity_dist.sample(&mut rng);

        data.extend([
            ra,
            dec,
            magnitude,
            color,
            // A normal draw can be non-positive; keep parallaxes strictly positive.
            parallax.max(0.1),
            pm_ra,
            pm_dec,
            radial_velocity,
        ]);

        // The first bound the colour falls below selects the class; none means M (6).
        let spectral_class = CLASS_UPPER_BOUNDS
            .iter()
            .position(|&bound| color < bound)
            .unwrap_or(CLASS_UPPER_BOUNDS.len());
        spectral_classes.push(spectral_class as f64);
    }

    let data_array = Array2::from_shape_vec((nstars, N_FEATURES), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(spectral_classes);

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(vec![
            "ra".to_string(),
            "dec".to_string(),
            "magnitude".to_string(),
            "color_bv".to_string(),
            "parallax".to_string(),
            "pm_ra".to_string(),
            "pm_dec".to_string(),
            "radial_velocity".to_string(),
        ]),
        targetnames: Some(vec![
            "O".to_string(),
            "B".to_string(),
            "A".to_string(),
            "F".to_string(),
            "G".to_string(),
            "K".to_string(),
            "M".to_string(),
        ]),
        feature_descriptions: Some(vec![
            "Right Ascension (degrees)".to_string(),
            "Declination (degrees)".to_string(),
            "Apparent magnitude (visual)".to_string(),
            "B-V color index".to_string(),
            "Parallax (milliarcseconds)".to_string(),
            "Proper motion RA (mas/year)".to_string(),
            "Proper motion Dec (mas/year)".to_string(),
            "Radial velocity (km/s)".to_string(),
        ]),
        description: Some(format!(
            "Synthetic {catalog} stellar catalog with {nstars} stars"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
/// Generates a synthetic exoplanet catalog with `nplanets` rows and six features.
///
/// Columns, in order: orbital period, planet radius, planet mass, stellar mass
/// (floored at 0.1), stellar temperature and stellar metallicity. The target is a
/// planet-type index 0..=4 (Rocky, Super-Earth, Sub-Neptune, Neptune, Jupiter)
/// derived from the planet radius alone.
///
/// Change from the previous revision: the description no longer contains the stray
/// `_planets` token.
///
/// # Errors
/// Returns `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(nplanets, 6)`.
fn load_synthetic_exoplanet_data(&self, nplanets: usize) -> Result<Dataset> {
    use scirs2_core::random::{Distribution, LogNormal, Normal};

    const N_FEATURES: usize = 6;
    // Exclusive upper radius bounds for the first four classes; larger radii are class 4.
    const RADIUS_UPPER_BOUNDS: [f64; 4] = [1.25, 2.0, 4.0, 11.0];

    let mut rng = thread_rng();
    let mut data = Vec::with_capacity(nplanets * N_FEATURES);
    let mut planet_types = Vec::with_capacity(nplanets);

    let period_dist = LogNormal::new(1.0, 1.5).expect("Operation failed");
    let radius_dist = LogNormal::new(0.0, 0.8).expect("Operation failed");
    let mass_dist = LogNormal::new(1.0, 1.2).expect("Operation failed");
    let stellar_mass_dist = Normal::new(1.0, 0.3).expect("Operation failed");
    let stellar_temp_dist = Normal::new(5800.0, 1000.0).expect("Operation failed");
    let metallicity_dist = Normal::new(0.0, 0.3).expect("Operation failed");

    for _ in 0..nplanets {
        let period = period_dist.sample(&mut rng);
        let radius: f64 = radius_dist.sample(&mut rng);
        let mass = mass_dist.sample(&mut rng);
        let stellar_mass: f64 = stellar_mass_dist.sample(&mut rng);
        let stellar_temp = stellar_temp_dist.sample(&mut rng);
        let metallicity = metallicity_dist.sample(&mut rng);

        data.extend([
            period,
            radius,
            mass,
            // A normal draw can be non-positive; keep stellar masses strictly positive.
            stellar_mass.max(0.1),
            stellar_temp,
            metallicity,
        ]);

        // The first bound the radius falls below selects the class; none means class 4.
        let planet_type = RADIUS_UPPER_BOUNDS
            .iter()
            .position(|&bound| radius < bound)
            .unwrap_or(RADIUS_UPPER_BOUNDS.len());
        planet_types.push(planet_type as f64);
    }

    let data_array = Array2::from_shape_vec((nplanets, N_FEATURES), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(planet_types);

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(vec![
            "period".to_string(),
            "radius".to_string(),
            "mass".to_string(),
            "stellar_mass".to_string(),
            "stellar_temp".to_string(),
            "metallicity".to_string(),
        ]),
        targetnames: Some(vec![
            "Rocky".to_string(),
            "Super-Earth".to_string(),
            "Sub-Neptune".to_string(),
            "Neptune".to_string(),
            "Jupiter".to_string(),
        ]),
        feature_descriptions: Some(vec![
            "Orbital period (days)".to_string(),
            "Planet radius (Earth radii)".to_string(),
            "Planet mass (Earth masses)".to_string(),
            "Stellar mass (Solar masses)".to_string(),
            "Stellar temperature (K)".to_string(),
            "Stellar metallicity [Fe/H]".to_string(),
        ]),
        description: Some(format!(
            "Synthetic exoplanet catalog with {nplanets} planets"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
/// Generates a synthetic supernova catalog with `nsupernovae` rows and ten features.
///
/// Each event is assigned a type index 0..=3 (Type Ia, Ib/c, II-P, II-L) which fixes the
/// mean peak magnitude, decline rate, colour and host mass; Gaussian noise is added to
/// those four columns and the remaining six columns are drawn uniformly.
///
/// Changes from the previous revision:
/// * Types are drawn according to `TYPE_PROBS` (70% / 15% / 10% / 5%). The table was
///   previously declared but unused, so types were uniformly distributed.
/// * The noise distributions are built once instead of once per event.
/// * The first column is described as an absolute magnitude, which is what values
///   around -19 represent.
///
/// # Errors
/// Returns `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(nsupernovae, 10)`.
fn load_synthetic_supernova_data(&self, nsupernovae: usize) -> Result<Dataset> {
    use scirs2_core::random::{Distribution, Normal};

    const N_FEATURES: usize = 10;
    // Relative frequency of each supernova type, indexed by type id.
    const TYPE_PROBS: [f64; 4] = [0.7, 0.15, 0.10, 0.05];

    let mut rng = thread_rng();
    let mut data = Vec::with_capacity(nsupernovae * N_FEATURES);
    let mut sn_types = Vec::with_capacity(nsupernovae);

    let peak_noise = Normal::new(0.0, 0.3).expect("Operation failed");
    let decline_noise = Normal::new(0.0, 0.2).expect("Operation failed");
    let color_noise = Normal::new(0.0, 0.1).expect("Operation failed");
    let host_noise = Normal::new(0.0, 0.5).expect("Operation failed");

    for _ in 0..nsupernovae {
        // Inverse-CDF draw over TYPE_PROBS; the last type absorbs any rounding slack.
        let draw = rng.random::<f64>();
        let mut cumulative = 0.0_f64;
        let mut sn_type = TYPE_PROBS.len() - 1;
        for (idx, &prob) in TYPE_PROBS.iter().enumerate() {
            cumulative += prob;
            if draw < cumulative {
                sn_type = idx;
                break;
            }
        }

        let (peak_mag, decline_rate, color_evolution, host_mass) = match sn_type {
            0 => (-19.3, 1.1, 0.2, 10.5),
            1 => (-18.5, 1.8, 0.5, 9.8),
            2 => (-16.8, 0.8, 0.3, 9.2),
            _ => (-17.5, 1.2, 0.4, 9.0),
        };

        data.push(peak_mag + peak_noise.sample(&mut rng));
        data.push(decline_rate + decline_noise.sample(&mut rng));
        data.push(color_evolution + color_noise.sample(&mut rng));
        data.push(host_mass + host_noise.sample(&mut rng));
        data.push(rng.random_range(0.01..0.3));
        data.push(rng.random_range(20.0..200.0));
        data.push(rng.random_range(0.7..1.3));
        data.push(rng.random_range(0.0..0.5));
        data.push(rng.random_range(15.0..22.0));
        data.push(rng.random_range(-90.0..90.0));
        sn_types.push(sn_type as f64);
    }

    let data_array = Array2::from_shape_vec((nsupernovae, N_FEATURES), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(sn_types);

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(vec![
            "peak_magnitude".to_string(),
            "decline_rate".to_string(),
            "color_max".to_string(),
            "host_mass".to_string(),
            "redshift".to_string(),
            "duration".to_string(),
            "stretch".to_string(),
            "color_excess".to_string(),
            "discovery_mag".to_string(),
            "galactic_lat".to_string(),
        ]),
        targetnames: Some(vec![
            "Type Ia".to_string(),
            "Type Ib/c".to_string(),
            "Type II-P".to_string(),
            "Type II-L".to_string(),
        ]),
        feature_descriptions: Some(vec![
            "Peak absolute magnitude".to_string(),
            "Magnitude decline rate (mag/day)".to_string(),
            "Maximum color index".to_string(),
            "Host galaxy stellar mass (log10 M_sun)".to_string(),
            "Cosmological redshift".to_string(),
            "Light curve duration (days)".to_string(),
            "Light curve stretch factor".to_string(),
            "Host galaxy color excess E(B-V)".to_string(),
            "Discovery magnitude".to_string(),
            "Galactic latitude (degrees)".to_string(),
        ]),
        description: Some(format!(
            "Synthetic supernova catalog with {nsupernovae} events"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
}
}
pub mod genomics {
use super::*;
/// Loader for genomics datasets.
///
/// The loaders visible in this module generate synthetic data; the `client` and `cache`
/// fields are constructed in `new` but not read by any method here.
pub struct GenomicsDatasets {
// Held for future use; no method in this module reads it, hence the lint allowance.
#[allow(dead_code)]
client: ExternalClient,
// Cache rooted in the directory chosen by `new`; currently unused as well.
#[allow(dead_code)]
cache: DatasetCache,
}
impl GenomicsDatasets {
/// Creates the genomics loader with a cache under `<platform cache dir>/scirs2-datasets`.
///
/// # Errors
/// Returns `DatasetsError::Other` when the platform cache directory cannot be determined,
/// and forwards any error produced by `ExternalClient::new`.
pub fn new() -> Result<Self> {
    let platform_cache = crate::platform_dirs::cache_dir();
    let cachedir = platform_cache
        .map(|root| root.join("scirs2-datasets"))
        .ok_or_else(|| {
            DatasetsError::Other("Could not determine cache directory".to_string())
        })?;

    let client = ExternalClient::new()?;
    let cache = DatasetCache::new(cachedir);
    Ok(Self { client, cache })
}
/// Generates a synthetic log-expression matrix of shape `(n_samples, ngenes)`.
///
/// Samples cycle through five conditions (`sample_idx % 5`), which are also the target
/// labels. The first `ngenes / 10` genes are scaled by the condition's effect; all other
/// genes are unaffected. Each cell is `ln(base * effect * noise)` with a log-normal
/// base level and multiplicative Gaussian noise centred on 1.
///
/// Changes from the previous revision:
/// * The noise factor is floored at `MIN_NOISE_FACTOR`. A `Normal(1.0, 0.2)` draw can be
///   zero or negative, which previously produced `-inf`/`NaN` entries after `ln()`.
/// * The distributions are built once instead of once per cell.
/// * The description no longer contains the stray `_samples` / `_genes` tokens.
///
/// # Errors
/// Returns `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(n_samples, ngenes)`.
pub fn load_gene_expression(&self, n_samples: usize, ngenes: usize) -> Result<Dataset> {
    use scirs2_core::random::{Distribution, LogNormal, Normal};

    // Multiplicative effect of each condition on the responsive genes.
    const CONDITION_EFFECTS: [f64; 5] = [1.0, 2.5, 0.4, 1.8, 0.7];
    // Lower bound for the noise factor so the logarithm below stays finite.
    const MIN_NOISE_FACTOR: f64 = 0.01;

    let mut rng = thread_rng();
    let mut data = Vec::with_capacity(n_samples * ngenes);
    let mut phenotypes = Vec::with_capacity(n_samples);

    let base_expr_dist = LogNormal::new(5.0, 2.0).expect("Operation failed");
    let noise_dist = Normal::new(1.0, 0.2).expect("Operation failed");
    // Only the leading tenth of the genes responds to the condition.
    let n_responsive = ngenes / 10;

    for sample_idx in 0..n_samples {
        let condition = sample_idx % CONDITION_EFFECTS.len();
        let base_effect = CONDITION_EFFECTS[condition];

        for gene_idx in 0..ngenes {
            let base_expr: f64 = base_expr_dist.sample(&mut rng);
            let gene_effect = if gene_idx < n_responsive {
                base_effect
            } else {
                1.0
            };
            let noise: f64 = noise_dist.sample(&mut rng);
            let expression = base_expr * gene_effect * noise.max(MIN_NOISE_FACTOR);
            data.push(expression.ln());
        }
        phenotypes.push(condition as f64);
    }

    let data_array = Array2::from_shape_vec((n_samples, ngenes), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(phenotypes);

    let featurenames: Vec<String> = (0..ngenes).map(|i| format!("GENE_{i:06}")).collect();
    // Built before `featurenames` is moved into the dataset, avoiding a clone.
    let feature_descriptions: Vec<String> = featurenames
        .iter()
        .map(|name| format!("Expression level of {name}"))
        .collect();

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(featurenames),
        targetnames: Some(vec![
            "Control".to_string(),
            "Treatment_A".to_string(),
            "Treatment_B".to_string(),
            "Disease_X".to_string(),
            "Disease_Y".to_string(),
        ]),
        feature_descriptions: Some(feature_descriptions),
        description: Some(format!(
            "Synthetic gene expression data: {n_samples} samples × {ngenes} genes"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
/// Generates `nsequences` random DNA strings of `sequence_length` bases and returns their
/// normalised 3-mer frequencies (64 features per sequence).
///
/// Sequences cycle through three types (`seq_idx % 3`), which are also the target labels:
/// type 0 draws G/C with probability 0.6, type 1 draws A/T with probability 0.6, and
/// type 2 draws all four bases uniformly.
///
/// # Errors
/// Returns `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(nsequences, 64)`.
pub fn load_dnasequences(
    &self,
    nsequences: usize,
    sequence_length: usize,
) -> Result<Dataset> {
    let mut rng = thread_rng();
    let nucleotides = ['A', 'T', 'G', 'C'];
    let k = 3;
    let kmers = Self::generate_kmers(k);

    let mut data = Vec::new();
    let mut sequence_types = Vec::with_capacity(nsequences);

    for seq_idx in 0..nsequences {
        let seq_type = seq_idx % 3;
        let mut sequence = String::with_capacity(sequence_length);

        for _ in 0..sequence_length {
            let base = match seq_type {
                0 | 1 => {
                    // The favoured pair is picked with probability 0.6, then one of its
                    // two members with probability 0.5 each (two draws per position).
                    let (favoured, other) = if seq_type == 0 {
                        (('G', 'C'), ('A', 'T'))
                    } else {
                        (('A', 'T'), ('G', 'C'))
                    };
                    let pair = if rng.random::<f64>() < 0.6 {
                        favoured
                    } else {
                        other
                    };
                    if rng.random::<f64>() < 0.5 {
                        pair.0
                    } else {
                        pair.1
                    }
                }
                _ => nucleotides[rng.sample(Uniform::new(0, 4).expect("Operation failed"))],
            };
            sequence.push(base);
        }

        // Featurise immediately; the raw sequence is not needed afterwards.
        data.extend(Self::count_kmers(&sequence, k, &kmers));
        sequence_types.push(seq_type as f64);
    }

    let n_features = 4_usize.pow(k as u32);
    let data_array = Array2::from_shape_vec((nsequences, n_features), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(sequence_types);

    let kmer_descriptions = kmers
        .iter()
        .map(|kmer| format!("Frequency of {k}-mer: {kmer}"))
        .collect();

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(kmers),
        targetnames: Some(vec![
            "GC-rich".to_string(),
            "AT-rich".to_string(),
            "Random".to_string(),
        ]),
        feature_descriptions: Some(kmer_descriptions),
        description: Some(format!(
            "DNA sequences: {nsequences} seqs × {k}-mer features"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
/// Enumerates every string of length `k` over the alphabet A, T, G, C.
///
/// The result has `4^k` entries ordered with the first character varying slowest and
/// each position cycling through A, T, G, C. `k == 0` yields a single empty string.
fn generate_kmers(k: usize) -> Vec<String> {
    const ALPHABET: [char; 4] = ['A', 'T', 'G', 'C'];

    // Start from the empty prefix and lengthen every prefix by one base per round.
    let mut kmers = vec![String::new()];
    for _ in 0..k {
        let mut extended = Vec::with_capacity(kmers.len() * ALPHABET.len());
        for prefix in &kmers {
            for &base in ALPHABET.iter() {
                let mut longer = String::with_capacity(prefix.len() + 1);
                longer.push_str(prefix);
                longer.push(base);
                extended.push(longer);
            }
        }
        kmers = extended;
    }
    kmers
}
/// Counts the occurrences of each entry of `kmers` among the length-`k` windows of
/// `sequence` and normalises the counts to sum to 1.
///
/// The result has one entry per element of `kmers`, in the same order. Windows that do
/// not match any listed k-mer are ignored. If no window matches, all entries are 0.0.
///
/// Changes from the previous revision:
/// * A sequence shorter than `k` now yields all zeros. It used to panic, because the
///   saturating subtraction still produced one out-of-range window.
/// * Windows are taken with checked slicing, so input containing multi-byte characters
///   no longer panics on a non-character boundary; such windows are skipped.
fn count_kmers(sequence: &str, k: usize, kmers: &[String]) -> Vec<f64> {
    let mut counts = vec![0.0; kmers.len()];
    if sequence.len() < k {
        // Not a single full window fits.
        return counts;
    }

    let kmer_to_idx: HashMap<&str, usize> = kmers
        .iter()
        .enumerate()
        .map(|(idx, kmer)| (kmer.as_str(), idx))
        .collect();

    for start in 0..=sequence.len() - k {
        let matched = sequence
            .get(start..start + k)
            .and_then(|window| kmer_to_idx.get(window));
        if let Some(&idx) = matched {
            counts[idx] += 1.0;
        }
    }

    let total: f64 = counts.iter().sum();
    if total > 0.0 {
        for count in &mut counts {
            *count /= total;
        }
    }
    counts
}
}
}
pub mod climate {
use super::*;
/// Loader for climate and atmospheric datasets.
///
/// The loaders visible in this module generate synthetic data; the `client` and `cache`
/// fields are constructed in `new` but not read by any method here.
pub struct ClimateDatasets {
// Held for future use; no method in this module reads it, hence the lint allowance.
#[allow(dead_code)]
client: ExternalClient,
// Cache rooted in the directory chosen by `new`; currently unused as well.
#[allow(dead_code)]
cache: DatasetCache,
}
impl ClimateDatasets {
/// Creates the climate loader with a cache under `<platform cache dir>/scirs2-datasets`.
///
/// # Errors
/// Returns `DatasetsError::Other` when the platform cache directory cannot be determined,
/// and forwards any error produced by `ExternalClient::new`.
pub fn new() -> Result<Self> {
    let mut cachedir = crate::platform_dirs::cache_dir().ok_or_else(|| {
        DatasetsError::Other("Could not determine cache directory".to_string())
    })?;
    cachedir = cachedir.join("scirs2-datasets");

    let client = ExternalClient::new()?;
    let cache = DatasetCache::new(cachedir);
    Ok(Self { client, cache })
}
/// Simulates daily weather for `n_stations` stations over `n_years` years (365 days each)
/// and returns eight summary features per station.
///
/// Stations cycle through five climate zones (`station_idx % 5`), which are also the
/// target labels. Daily temperature is a cosine seasonal cycle plus Gaussian noise; each
/// day has a 30% chance of precipitation. Only per-station aggregates are stored.
///
/// Changes from the previous revision:
/// * `n_years == 0` is rejected. It previously produced `NaN` / infinite features.
/// * `annual_precipitation` and `precipitation_days` are averaged per year, matching
///   their names and descriptions. They used to be totals over the whole simulation.
/// * Daily values are aggregated on the fly and the noise distributions are built once.
/// * The description no longer contains the stray `_stations` / `_years` tokens.
///
/// # Errors
/// Returns `DatasetsError::Other` when `n_years` is zero and
/// `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(n_stations, 8)`.
pub fn load_temperature_timeseries(
    &self,
    n_stations: usize,
    n_years: usize,
) -> Result<Dataset> {
    use scirs2_core::random::{Distribution, Normal};

    const DAYS_PER_YEAR: usize = 365;
    const N_FEATURES: usize = 8;

    if n_years == 0 {
        return Err(DatasetsError::Other(
            "n_years must be at least 1 to simulate a temperature time series".to_string(),
        ));
    }

    let mut rng = thread_rng();
    let total_days = n_years * DAYS_PER_YEAR;
    let years = n_years as f64;

    let temp_noise = Normal::new(0.0, 2.0).expect("Operation failed");
    let humidity_noise = Normal::new(0.0, 5.0).expect("Operation failed");

    let mut data = Vec::with_capacity(n_stations * N_FEATURES);
    let mut climate_zones = Vec::with_capacity(n_stations);

    for station_idx in 0..n_stations {
        let zone = station_idx % 5;
        climate_zones.push(zone as f64);

        // (base temperature, seasonal amplitude, nominal annual precipitation, humidity)
        let (base_temp, temp_amplitude, annual_precip, humidity) = match zone {
            0 => (25.0, 5.0, 2000.0, 80.0),
            1 => (15.0, 15.0, 800.0, 60.0),
            2 => (-5.0, 20.0, 400.0, 70.0),
            3 => (5.0, 8.0, 200.0, 40.0),
            _ => (-10.0, 25.0, 300.0, 75.0),
        };

        let mut temp_sum = 0.0_f64;
        let mut max_temp = f64::NEG_INFINITY;
        let mut min_temp = f64::INFINITY;
        let mut total_precip = 0.0_f64;
        let mut precip_days = 0_usize;

        for day in 0..total_days {
            let year_progress = (day % DAYS_PER_YEAR) as f64 / DAYS_PER_YEAR as f64;
            let phase = year_progress * 2.0 * std::f64::consts::PI;

            let temp: f64 =
                base_temp + temp_amplitude * phase.cos() + temp_noise.sample(&mut rng);
            temp_sum += temp;
            max_temp = max_temp.max(temp);
            min_temp = min_temp.min(temp);

            // Zones 0 and 1 have a seasonal precipitation cycle; the others are flat.
            let seasonal_precip_factor = match zone {
                0 => 1.0 + 0.3 * (phase + std::f64::consts::PI).cos(),
                1 => 1.0 + 0.2 * phase.sin(),
                _ => 1.0,
            };
            if rng.random::<f64>() < 0.3 {
                let amount: f64 = rng.random_range(0.0..20.0) * seasonal_precip_factor;
                total_precip += amount;
                if amount > 0.0 {
                    precip_days += 1;
                }
            }
        }

        let mean_temp = temp_sum / total_days as f64;
        let temp_range = max_temp - min_temp;
        let avg_humidity = humidity + humidity_noise.sample(&mut rng);
        let wind_speed = rng.random_range(2.0..15.0);

        data.extend([
            mean_temp,
            temp_range,
            total_precip / years,
            precip_days as f64 / years,
            avg_humidity,
            wind_speed,
            base_temp,
            annual_precip / 365.0,
        ]);
    }

    let data_array = Array2::from_shape_vec((n_stations, N_FEATURES), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(climate_zones);

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(vec![
            "mean_temperature".to_string(),
            "temperature_range".to_string(),
            "annual_precipitation".to_string(),
            "precipitation_days".to_string(),
            "avg_humidity".to_string(),
            "avg_wind_speed".to_string(),
            "latitude_proxy".to_string(),
            "daily_precip_avg".to_string(),
        ]),
        targetnames: Some(vec![
            "Tropical".to_string(),
            "Temperate".to_string(),
            "Continental".to_string(),
            "Desert".to_string(),
            "Arctic".to_string(),
        ]),
        feature_descriptions: Some(vec![
            "Mean annual temperature (°C)".to_string(),
            "Temperature range (max-min, °C)".to_string(),
            "Total annual precipitation (mm)".to_string(),
            "Number of precipitation days per year".to_string(),
            "Average humidity (%)".to_string(),
            "Average wind speed (m/s)".to_string(),
            "Latitude proxy (normalized)".to_string(),
            "Average daily precipitation (mm/day)".to_string(),
        ]),
        description: Some(format!(
            "Climate data: {n_stations} stations × {n_years} years"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
/// Generates `nmeasurements` synthetic air-quality records with twelve features each.
///
/// A per-record pollution level in `[0, 1)` shifts the log-normal pollutant
/// concentrations (PM2.5, PM10, NO2, SO2, O3, CO). Weather columns are drawn
/// independently, and visibility is derived from PM2.5. The regression target is the
/// index computed by `calculate_aqi` from the six pollutant columns.
///
/// Changes from the previous revision:
/// * The description no longer contains the stray `_measurements` token.
/// * The `expect` messages on the log-normal constructors now name the real failure;
///   they used to say "Failed to create array".
/// * Loop-invariant distributions are built once, and rows are appended from an array
///   instead of a freshly allocated `Vec`.
///
/// # Errors
/// Returns `DatasetsError::FormatError` if the flat buffer cannot be reshaped to
/// `(nmeasurements, 12)`.
pub fn load_atmospheric_chemistry(&self, nmeasurements: usize) -> Result<Dataset> {
    use scirs2_core::random::{Distribution, LogNormal, Normal};

    const N_FEATURES: usize = 12;

    let mut rng = thread_rng();
    let mut data = Vec::with_capacity(nmeasurements * N_FEATURES);
    let mut air_quality_index = Vec::with_capacity(nmeasurements);

    let temperature_dist = Normal::new(20.0, 10.0).expect("Operation failed");
    let pressure_dist = Normal::new(1013.0, 15.0).expect("Operation failed");

    for _ in 0..nmeasurements {
        let base_pollution = rng.random_range(0.0..1.0);

        let pm25: f64 = LogNormal::new(2.0 + base_pollution, 0.5)
            .expect("invalid LogNormal parameters")
            .sample(&mut rng);
        let pm10 = pm25 * rng.random_range(1.5..2.5);
        let no2 = LogNormal::new(3.0 + base_pollution * 0.5, 0.3)
            .expect("invalid LogNormal parameters")
            .sample(&mut rng);
        let so2 = LogNormal::new(1.0 + base_pollution * 0.3, 0.4)
            .expect("invalid LogNormal parameters")
            .sample(&mut rng);
        // Ozone is the only pollutant that decreases with the pollution level.
        let o3 = LogNormal::new(4.0 - base_pollution * 0.2, 0.2)
            .expect("invalid LogNormal parameters")
            .sample(&mut rng);
        let co = LogNormal::new(0.5 + base_pollution * 0.4, 0.3)
            .expect("invalid LogNormal parameters")
            .sample(&mut rng);

        let temperature = temperature_dist.sample(&mut rng);
        let humidity = rng.random_range(30.0..90.0);
        let wind_speed = rng.random_range(0.5..12.0);
        let pressure = pressure_dist.sample(&mut rng);
        // Visibility drops with the log of PM2.5 and is floored at 1.
        let visibility = (50.0 - pm25.ln() * 5.0).max(1.0);
        let uv_index = rng.random_range(0.0..12.0);

        data.extend([
            pm25,
            pm10,
            no2,
            so2,
            o3,
            co,
            temperature,
            humidity,
            wind_speed,
            pressure,
            visibility,
            uv_index,
        ]);

        air_quality_index.push(Self::calculate_aqi(pm25, pm10, no2, so2, o3, co));
    }

    let data_array = Array2::from_shape_vec((nmeasurements, N_FEATURES), data)
        .map_err(|e| DatasetsError::FormatError(e.to_string()))?;
    let target = Array1::from_vec(air_quality_index);

    Ok(Dataset {
        data: data_array,
        target: Some(target),
        featurenames: Some(vec![
            "pm2_5".to_string(),
            "pm10".to_string(),
            "no2".to_string(),
            "so2".to_string(),
            "o3".to_string(),
            "co".to_string(),
            "temperature".to_string(),
            "humidity".to_string(),
            "wind_speed".to_string(),
            "pressure".to_string(),
            "visibility".to_string(),
            "uv_index".to_string(),
        ]),
        targetnames: None,
        feature_descriptions: Some(vec![
            "PM2.5 concentration (µg/m³)".to_string(),
            "PM10 concentration (µg/m³)".to_string(),
            "NO2 concentration (µg/m³)".to_string(),
            "SO2 concentration (µg/m³)".to_string(),
            "O3 concentration (µg/m³)".to_string(),
            "CO concentration (µg/m³)".to_string(),
            "Temperature (°C)".to_string(),
            "Relative humidity (%)".to_string(),
            "Wind speed (m/s)".to_string(),
            "Atmospheric pressure (hPa)".to_string(),
            "Visibility (km)".to_string(),
            "UV index".to_string(),
        ]),
        description: Some(format!(
            "Atmospheric chemistry measurements: {nmeasurements} samples"
        )),
        metadata: std::collections::HashMap::new(),
    })
}
/// Computes a simplified air-quality index from six pollutant concentrations.
///
/// Each pollutant is scaled as `concentration / reference * 100`, capped at 300, and the
/// largest of the six sub-indices (never below 0) is returned.
#[allow(clippy::too_many_arguments)]
fn calculate_aqi(pm25: f64, pm10: f64, no2: f64, so2: f64, o3: f64, co: f64) -> f64 {
    const SUB_INDEX_CAP: f64 = 300.0;

    // (measured concentration, reference concentration that maps to an index of 100)
    let readings = [
        (pm25, 35.0),
        (pm10, 150.0),
        (no2, 100.0),
        (so2, 75.0),
        (o3, 120.0),
        (co, 9.0),
    ];

    let mut worst = 0.0f64;
    for &(concentration, reference) in readings.iter() {
        let sub_index = (concentration / reference * 100.0).min(SUB_INDEX_CAP);
        worst = worst.max(sub_index);
    }
    worst
}
}
}
pub mod convenience {
use super::astronomy::StellarDatasets;
use super::climate::ClimateDatasets;
use super::genomics::GenomicsDatasets;
use super::*;
/// Loads the synthetic Hipparcos-style stellar classification dataset.
///
/// # Errors
/// Forwards any error from `StellarDatasets::new` or the catalog loader.
pub fn load_stellar_classification() -> Result<Dataset> {
    StellarDatasets::new()?.load_hipparcos_catalog()
}
/// Loads the synthetic exoplanet catalog.
///
/// # Errors
/// Forwards any error from `StellarDatasets::new` or the catalog loader.
pub fn load_exoplanets() -> Result<Dataset> {
    StellarDatasets::new()?.load_exoplanet_catalog()
}
/// Loads synthetic gene-expression data, defaulting to 200 samples and 1000 genes.
///
/// # Errors
/// Forwards any error from `GenomicsDatasets::new` or the loader.
pub fn load_gene_expression(
    n_samples: Option<usize>,
    ngenes: Option<usize>,
) -> Result<Dataset> {
    let loader = GenomicsDatasets::new()?;
    let sample_count = n_samples.unwrap_or(200);
    let gene_count = ngenes.unwrap_or(1000);
    loader.load_gene_expression(sample_count, gene_count)
}
/// Loads synthetic climate summaries, defaulting to 100 stations and 10 years.
///
/// # Errors
/// Forwards any error from `ClimateDatasets::new` or the loader.
pub fn load_climate_data(
    _n_stations: Option<usize>,
    n_years: Option<usize>,
) -> Result<Dataset> {
    let loader = ClimateDatasets::new()?;
    let station_count = _n_stations.unwrap_or(100);
    let year_count = n_years.unwrap_or(10);
    loader.load_temperature_timeseries(station_count, year_count)
}
/// Loads synthetic atmospheric-chemistry measurements, defaulting to 1000 records.
///
/// # Errors
/// Forwards any error from `ClimateDatasets::new` or the loader.
pub fn load_atmospheric_chemistry(_nmeasurements: Option<usize>) -> Result<Dataset> {
    let loader = ClimateDatasets::new()?;
    let measurement_count = _nmeasurements.unwrap_or(1000);
    loader.load_atmospheric_chemistry(measurement_count)
}
/// Lists the available datasets as `(domain, dataset name)` pairs, grouped by domain.
pub fn list_domain_datasets() -> Vec<(&'static str, &'static str)> {
    const ENTRIES: [(&str, &str); 8] = [
        ("astronomy", "stellar_classification"),
        ("astronomy", "exoplanets"),
        ("astronomy", "supernovae"),
        ("astronomy", "gaia_dr3"),
        ("genomics", "gene_expression"),
        ("genomics", "dnasequences"),
        ("climate", "temperature_timeseries"),
        ("climate", "atmospheric_chemistry"),
    ];
    ENTRIES.to_vec()
}
}
// Smoke tests for the convenience loaders: each checks the dataset shape and that a
// target vector is present. The unused `scirs2_core::random::Uniform` import was removed.
#[cfg(test)]
mod tests {
    use super::convenience::*;

    /// The Hipparcos-style catalog is large and has eight features.
    #[test]
    fn test_load_stellar_classification() {
        let dataset = load_stellar_classification().expect("Operation failed");
        assert!(dataset.n_samples() > 1000);
        assert_eq!(dataset.n_features(), 8);
        assert!(dataset.target.is_some());
    }

    /// The exoplanet catalog has six features.
    #[test]
    fn test_load_exoplanets() {
        let dataset = load_exoplanets().expect("Operation failed");
        assert!(dataset.n_samples() > 100);
        assert_eq!(dataset.n_features(), 6);
        assert!(dataset.target.is_some());
    }

    /// Requested sample and gene counts determine the matrix shape.
    #[test]
    fn test_load_gene_expression() {
        let dataset = load_gene_expression(Some(50), Some(100)).expect("Operation failed");
        assert_eq!(dataset.n_samples(), 50);
        assert_eq!(dataset.n_features(), 100);
        assert!(dataset.target.is_some());
    }

    /// One row per station, eight summary features each.
    #[test]
    fn test_load_climate_data() {
        let dataset = load_climate_data(Some(20), Some(5)).expect("Operation failed");
        assert_eq!(dataset.n_samples(), 20);
        assert_eq!(dataset.n_features(), 8);
        assert!(dataset.target.is_some());
    }

    /// One row per measurement, twelve features each.
    #[test]
    fn test_load_atmospheric_chemistry() {
        let dataset = load_atmospheric_chemistry(Some(100)).expect("Operation failed");
        assert_eq!(dataset.n_samples(), 100);
        assert_eq!(dataset.n_features(), 12);
        assert!(dataset.target.is_some());
    }

    /// The listing is non-empty and covers all three domains.
    #[test]
    fn test_list_domain_datasets() {
        let datasets = list_domain_datasets();
        assert!(!datasets.is_empty());
        assert!(datasets.iter().any(|(domain_, _)| *domain_ == "astronomy"));
        assert!(datasets.iter().any(|(domain_, _)| *domain_ == "genomics"));
        assert!(datasets.iter().any(|(domain_, _)| *domain_ == "climate"));
    }
}