use crate::error::{DatasetsError, Result};
use crate::utils::Dataset;
use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::random::prelude::*;
use scirs2_core::random::rand_distributions::Distribution;
fn create_rng(randomseed: Option<u64>) -> StdRng {
match randomseed {
Some(seed) => StdRng::seed_from_u64(seed),
None => {
let mut r = thread_rng();
StdRng::seed_from_u64(r.next_u64())
}
}
}
pub fn make_biclusters(
shape: (usize, usize),
n_clusters: usize,
noise: f64,
shuffle: bool,
random_state: Option<u64>,
) -> Result<Dataset> {
let (n_rows, n_cols) = shape;
if n_rows == 0 || n_cols == 0 {
return Err(DatasetsError::InvalidFormat(
"shape dimensions must be > 0".to_string(),
));
}
if n_clusters == 0 {
return Err(DatasetsError::InvalidFormat(
"n_clusters must be > 0".to_string(),
));
}
if noise < 0.0 {
return Err(DatasetsError::InvalidFormat(
"noise must be >= 0".to_string(),
));
}
let mut rng = create_rng(random_state);
let normal = scirs2_core::random::Normal::new(0.0, noise.max(1e-30)).map_err(|e| {
DatasetsError::ComputationError(format!("Failed to create normal dist: {e}"))
})?;
let mut data = Array2::zeros((n_rows, n_cols));
if noise > 0.0 {
for i in 0..n_rows {
for j in 0..n_cols {
data[[i, j]] = normal.sample(&mut rng);
}
}
}
let mut row_labels = Array1::zeros(n_rows);
let rows_per_cluster = n_rows / n_clusters;
let cols_per_cluster = n_cols / n_clusters;
for c in 0..n_clusters {
let row_start = c * rows_per_cluster;
let row_end = if c == n_clusters - 1 {
n_rows
} else {
(c + 1) * rows_per_cluster
};
let col_start = c * cols_per_cluster;
let col_end = if c == n_clusters - 1 {
n_cols
} else {
(c + 1) * cols_per_cluster
};
let signal = 5.0 + 3.0 * rng.random::<f64>();
for i in row_start..row_end {
row_labels[i] = c as f64;
for j in col_start..col_end {
data[[i, j]] += signal;
}
}
}
if shuffle {
for i in (1..n_rows).rev() {
let j = rng.random_range(0..=i);
if i != j {
for col in 0..n_cols {
let tmp = data[[i, col]];
data[[i, col]] = data[[j, col]];
data[[j, col]] = tmp;
}
let tmp = row_labels[i];
row_labels[i] = row_labels[j];
row_labels[j] = tmp;
}
}
for j in (1..n_cols).rev() {
let k = rng.random_range(0..=j);
if j != k {
for i in 0..n_rows {
let tmp = data[[i, j]];
data[[i, j]] = data[[i, k]];
data[[i, k]] = tmp;
}
}
}
}
let feature_names: Vec<String> = (0..n_cols).map(|j| format!("col_{j}")).collect();
let dataset = Dataset::new(data, Some(row_labels))
.with_featurenames(feature_names)
.with_description(format!(
"Bicluster matrix ({n_rows}x{n_cols}) with {n_clusters} embedded biclusters"
))
.with_metadata("generator", "make_biclusters")
.with_metadata("n_clusters", &n_clusters.to_string())
.with_metadata("noise", &noise.to_string());
Ok(dataset)
}
pub fn make_checkerboard(
shape: (usize, usize),
n_clusters: (usize, usize),
noise: f64,
random_state: Option<u64>,
) -> Result<Dataset> {
let (n_rows, n_cols) = shape;
let (n_row_clusters, n_col_clusters) = n_clusters;
if n_rows == 0 || n_cols == 0 {
return Err(DatasetsError::InvalidFormat(
"shape dimensions must be > 0".to_string(),
));
}
if n_row_clusters == 0 || n_col_clusters == 0 {
return Err(DatasetsError::InvalidFormat(
"cluster dimensions must be > 0".to_string(),
));
}
if noise < 0.0 {
return Err(DatasetsError::InvalidFormat(
"noise must be >= 0".to_string(),
));
}
let mut rng = create_rng(random_state);
let normal = scirs2_core::random::Normal::new(0.0, noise.max(1e-30)).map_err(|e| {
DatasetsError::ComputationError(format!("Failed to create normal dist: {e}"))
})?;
let mut data = Array2::zeros((n_rows, n_cols));
let mut row_labels = Array1::zeros(n_rows);
let rows_per_cluster = n_rows / n_row_clusters;
let cols_per_cluster = n_cols / n_col_clusters;
for i in 0..n_rows {
let row_cluster = (i / rows_per_cluster).min(n_row_clusters - 1);
row_labels[i] = row_cluster as f64;
for j in 0..n_cols {
let col_cluster = (j / cols_per_cluster).min(n_col_clusters - 1);
let base = if (row_cluster + col_cluster) % 2 == 0 {
5.0
} else {
-5.0
};
data[[i, j]] = base;
if noise > 0.0 {
data[[i, j]] += normal.sample(&mut rng);
}
}
}
let feature_names: Vec<String> = (0..n_cols).map(|j| format!("col_{j}")).collect();
let dataset = Dataset::new(data, Some(row_labels))
.with_featurenames(feature_names)
.with_description(format!(
"Checkerboard bicluster matrix ({n_rows}x{n_cols}) with ({n_row_clusters}x{n_col_clusters}) pattern"
))
.with_metadata("generator", "make_checkerboard")
.with_metadata("n_row_clusters", &n_row_clusters.to_string())
.with_metadata("n_col_clusters", &n_col_clusters.to_string())
.with_metadata("noise", &noise.to_string());
Ok(dataset)
}
pub fn make_spd_matrix(n_dim: usize, random_state: Option<u64>) -> Result<Dataset> {
if n_dim == 0 {
return Err(DatasetsError::InvalidFormat(
"n_dim must be > 0".to_string(),
));
}
let mut rng = create_rng(random_state);
let normal = scirs2_core::random::Normal::new(0.0, 1.0).map_err(|e| {
DatasetsError::ComputationError(format!("Failed to create normal dist: {e}"))
})?;
let mut b_mat = Array2::zeros((n_dim, n_dim));
for i in 0..n_dim {
for j in 0..n_dim {
b_mat[[i, j]] = normal.sample(&mut rng);
}
}
let mut matrix = Array2::zeros((n_dim, n_dim));
for i in 0..n_dim {
for j in 0..=i {
let mut sum = 0.0;
for k in 0..n_dim {
sum += b_mat[[i, k]] * b_mat[[j, k]];
}
matrix[[i, j]] = sum;
matrix[[j, i]] = sum;
}
}
let alpha = 0.1;
for i in 0..n_dim {
matrix[[i, i]] += alpha;
}
let diag = Array1::from_vec((0..n_dim).map(|i| matrix[[i, i]]).collect());
let feature_names: Vec<String> = (0..n_dim).map(|j| format!("col_{j}")).collect();
let dataset = Dataset::new(matrix, Some(diag))
.with_featurenames(feature_names)
.with_description(format!(
"Random SPD matrix ({n_dim}x{n_dim}), A = B*B^T + {alpha}*I"
))
.with_metadata("generator", "make_spd_matrix")
.with_metadata("n_dim", &n_dim.to_string());
Ok(dataset)
}
pub fn make_sparse_spd_matrix(
n_dim: usize,
alpha: f64,
density: f64,
random_state: Option<u64>,
) -> Result<Dataset> {
if n_dim == 0 {
return Err(DatasetsError::InvalidFormat(
"n_dim must be > 0".to_string(),
));
}
if alpha < 0.0 {
return Err(DatasetsError::InvalidFormat(
"alpha must be >= 0".to_string(),
));
}
if density <= 0.0 || density > 1.0 {
return Err(DatasetsError::InvalidFormat(
"density must be in (0, 1]".to_string(),
));
}
let mut rng = create_rng(random_state);
let normal = scirs2_core::random::Normal::new(0.0, 1.0).map_err(|e| {
DatasetsError::ComputationError(format!("Failed to create normal dist: {e}"))
})?;
let uniform = scirs2_core::random::Uniform::new(0.0, 1.0).map_err(|e| {
DatasetsError::ComputationError(format!("Failed to create uniform dist: {e}"))
})?;
let mut lower = Array2::zeros((n_dim, n_dim));
for i in 0..n_dim {
let v: f64 = normal.sample(&mut rng);
lower[[i, i]] = 1.0 + v.abs(); }
for i in 1..n_dim {
for j in 0..i {
if uniform.sample(&mut rng) < density {
lower[[i, j]] = normal.sample(&mut rng);
}
}
}
let mut matrix = Array2::zeros((n_dim, n_dim));
for i in 0..n_dim {
for j in 0..=i {
let mut sum = 0.0;
for k in 0..=j {
sum += lower[[i, k]] * lower[[j, k]];
}
matrix[[i, j]] = sum;
matrix[[j, i]] = sum;
}
}
for i in 0..n_dim {
matrix[[i, i]] += alpha;
}
let eps = 1e-14;
let nnz: usize = matrix.iter().filter(|&&v: &&f64| v.abs() > eps).count();
let diag = Array1::from_vec((0..n_dim).map(|i| matrix[[i, i]]).collect());
let feature_names: Vec<String> = (0..n_dim).map(|j| format!("col_{j}")).collect();
let dataset = Dataset::new(matrix, Some(diag))
.with_featurenames(feature_names)
.with_description(format!(
"Sparse SPD matrix ({n_dim}x{n_dim}, factor density={density})"
))
.with_metadata("generator", "make_sparse_spd_matrix")
.with_metadata("n_dim", &n_dim.to_string())
.with_metadata("alpha", &alpha.to_string())
.with_metadata("density", &density.to_string())
.with_metadata("nnz", &nnz.to_string());
Ok(dataset)
}
pub fn make_sparse_coded_signal(
n_samples: usize,
n_features: usize,
n_components: usize,
n_nonzero_coefs: usize,
random_state: Option<u64>,
) -> Result<Dataset> {
if n_samples == 0 {
return Err(DatasetsError::InvalidFormat(
"n_samples must be > 0".to_string(),
));
}
if n_features == 0 {
return Err(DatasetsError::InvalidFormat(
"n_features must be > 0".to_string(),
));
}
if n_components == 0 {
return Err(DatasetsError::InvalidFormat(
"n_components must be > 0".to_string(),
));
}
if n_nonzero_coefs == 0 || n_nonzero_coefs > n_components {
return Err(DatasetsError::InvalidFormat(format!(
"n_nonzero_coefs ({n_nonzero_coefs}) must be in [1, n_components ({n_components})]"
)));
}
let mut rng = create_rng(random_state);
let normal = scirs2_core::random::Normal::new(0.0, 1.0).map_err(|e| {
DatasetsError::ComputationError(format!("Failed to create normal dist: {e}"))
})?;
let mut dictionary = Array2::zeros((n_features, n_components));
for j in 0..n_components {
for i in 0..n_features {
dictionary[[i, j]] = normal.sample(&mut rng);
}
let norm: f64 = (0..n_features)
.map(|i| dictionary[[i, j]] * dictionary[[i, j]])
.sum::<f64>()
.sqrt();
if norm > 1e-15 {
for i in 0..n_features {
dictionary[[i, j]] /= norm;
}
}
}
let mut code = Array2::zeros((n_components, n_samples));
for s in 0..n_samples {
let mut indices: Vec<usize> = (0..n_components).collect();
for k in 0..n_nonzero_coefs {
let swap_idx = rng.random_range(k..n_components);
indices.swap(k, swap_idx);
}
for k in 0..n_nonzero_coefs {
code[[indices[k], s]] = normal.sample(&mut rng);
}
}
let mut data = Array2::zeros((n_samples, n_features));
for s in 0..n_samples {
for f in 0..n_features {
let mut val = 0.0;
for k in 0..n_components {
val += code[[k, s]] * dictionary[[f, k]];
}
data[[s, f]] = val;
}
}
let mut dict_flat = Vec::with_capacity(n_features * n_components);
for i in 0..n_features {
for j in 0..n_components {
dict_flat.push(dictionary[[i, j]]);
}
}
let target = Array1::from_vec(dict_flat);
let feature_names: Vec<String> = (0..n_features).map(|j| format!("signal_{j}")).collect();
let dataset = Dataset::new(data, Some(target))
.with_featurenames(feature_names)
.with_description(format!(
"Sparse coded signal: {n_samples} samples, {n_features} features, \
{n_components} dictionary atoms, {n_nonzero_coefs} non-zero coefficients per sample. \
Target stores the flattened dictionary ({n_features}x{n_components})."
))
.with_metadata("generator", "make_sparse_coded_signal")
.with_metadata("n_components", &n_components.to_string())
.with_metadata("n_nonzero_coefs", &n_nonzero_coefs.to_string());
Ok(dataset)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_biclusters_basic() {
let ds = make_biclusters((100, 50), 3, 0.5, false, Some(42)).expect("ok");
assert_eq!(ds.n_samples(), 100);
assert_eq!(ds.n_features(), 50);
assert!(ds.target.is_some());
}
#[test]
fn test_biclusters_structure_no_noise() {
let ds = make_biclusters((90, 60), 3, 0.0, false, Some(42)).expect("ok");
let mut block_sum = 0.0;
for i in 0..30 {
for j in 0..20 {
block_sum += ds.data[[i, j]];
}
}
let block_mean: f64 = block_sum / (30.0 * 20.0);
let mut off_block_sum = 0.0;
for i in 0..30 {
for j in 20..40 {
off_block_sum += ds.data[[i, j]];
}
}
let off_block_mean: f64 = off_block_sum / (30.0 * 20.0);
assert!(
block_mean > 4.0,
"Block mean should be elevated, got {block_mean}"
);
assert!(
off_block_mean.abs() < 0.01,
"Off-block mean should be ~0, got {off_block_mean}"
);
}
#[test]
fn test_biclusters_labels() {
let ds = make_biclusters((60, 30), 3, 0.0, false, Some(42)).expect("ok");
let target = ds.target.as_ref().expect("target");
for i in 0..20 {
assert_eq!(target[i], 0.0);
}
for i in 20..40 {
assert_eq!(target[i], 1.0);
}
for i in 40..60 {
assert_eq!(target[i], 2.0);
}
}
#[test]
fn test_biclusters_shuffle() {
let ds = make_biclusters((60, 30), 3, 0.0, true, Some(42)).expect("ok");
let target = ds.target.as_ref().expect("target");
let mut is_sorted = true;
for i in 1..60 {
if target[i] < target[i - 1] {
is_sorted = false;
break;
}
}
assert!(!is_sorted, "Labels should be shuffled");
}
#[test]
fn test_biclusters_validation() {
assert!(make_biclusters((0, 10), 2, 0.0, false, None).is_err());
assert!(make_biclusters((10, 0), 2, 0.0, false, None).is_err());
assert!(make_biclusters((10, 10), 0, 0.0, false, None).is_err());
assert!(make_biclusters((10, 10), 2, -1.0, false, None).is_err());
}
#[test]
fn test_checkerboard_basic() {
let ds = make_checkerboard((100, 80), (4, 5), 0.0, Some(42)).expect("ok");
assert_eq!(ds.n_samples(), 100);
assert_eq!(ds.n_features(), 80);
}
#[test]
fn test_checkerboard_pattern_noiseless() {
let ds = make_checkerboard((40, 40), (2, 2), 0.0, Some(42)).expect("ok");
assert!(
(ds.data[[0, 0]] - 5.0).abs() < 1e-10,
"Top-left should be +5, got {}",
ds.data[[0, 0]]
);
assert!(
(ds.data[[0, 20]] - (-5.0)).abs() < 1e-10,
"Top-right should be -5, got {}",
ds.data[[0, 20]]
);
assert!(
(ds.data[[20, 0]] - (-5.0)).abs() < 1e-10,
"Bottom-left should be -5, got {}",
ds.data[[20, 0]]
);
assert!(
(ds.data[[20, 20]] - 5.0).abs() < 1e-10,
"Bottom-right should be +5, got {}",
ds.data[[20, 20]]
);
}
#[test]
fn test_checkerboard_validation() {
assert!(make_checkerboard((0, 10), (2, 2), 0.0, None).is_err());
assert!(make_checkerboard((10, 10), (0, 2), 0.0, None).is_err());
assert!(make_checkerboard((10, 10), (2, 2), -1.0, None).is_err());
}
#[test]
fn test_spd_matrix_basic() {
let ds = make_spd_matrix(20, Some(42)).expect("ok");
assert_eq!(ds.n_samples(), 20);
assert_eq!(ds.n_features(), 20);
}
#[test]
fn test_spd_matrix_symmetric() {
let n = 15;
let ds = make_spd_matrix(n, Some(42)).expect("ok");
for i in 0..n {
for j in 0..n {
assert!(
(ds.data[[i, j]] - ds.data[[j, i]]).abs() < 1e-10,
"Not symmetric at ({i},{j})"
);
}
}
}
#[test]
fn test_spd_matrix_positive_definite() {
let n = 10;
let ds = make_spd_matrix(n, Some(42)).expect("ok");
let mut rng = StdRng::seed_from_u64(999);
let normal = scirs2_core::random::Normal::new(0.0, 1.0).expect("ok");
for _ in 0..20 {
let x: Vec<f64> = (0..n).map(|_| normal.sample(&mut rng)).collect();
let mut xtax = 0.0;
for i in 0..n {
for j in 0..n {
xtax += x[i] * ds.data[[i, j]] * x[j];
}
}
assert!(xtax > 0.0, "x^T A x should be positive, got {xtax}");
}
}
#[test]
fn test_spd_matrix_validation() {
assert!(make_spd_matrix(0, None).is_err());
}
#[test]
fn test_sparse_spd_matrix_basic() {
let ds = make_sparse_spd_matrix(30, 0.1, 0.3, Some(42)).expect("ok");
assert_eq!(ds.n_samples(), 30);
assert_eq!(ds.n_features(), 30);
}
#[test]
fn test_sparse_spd_matrix_symmetric() {
let n = 20;
let ds = make_sparse_spd_matrix(n, 0.1, 0.2, Some(42)).expect("ok");
for i in 0..n {
for j in 0..n {
assert!(
(ds.data[[i, j]] - ds.data[[j, i]]).abs() < 1e-10,
"Not symmetric at ({i},{j})"
);
}
}
}
#[test]
fn test_sparse_spd_matrix_positive_definite() {
let n = 10;
let ds = make_sparse_spd_matrix(n, 0.5, 0.3, Some(42)).expect("ok");
let mut rng = StdRng::seed_from_u64(999);
let normal = scirs2_core::random::Normal::new(0.0, 1.0).expect("ok");
for _ in 0..20 {
let x: Vec<f64> = (0..n).map(|_| normal.sample(&mut rng)).collect();
let mut xtax = 0.0;
for i in 0..n {
for j in 0..n {
xtax += x[i] * ds.data[[i, j]] * x[j];
}
}
assert!(xtax > 0.0, "x^T A x should be positive, got {xtax}");
}
}
#[test]
fn test_sparse_spd_matrix_validation() {
assert!(make_sparse_spd_matrix(0, 0.1, 0.3, None).is_err());
assert!(make_sparse_spd_matrix(10, -0.1, 0.3, None).is_err());
assert!(make_sparse_spd_matrix(10, 0.1, 0.0, None).is_err());
assert!(make_sparse_spd_matrix(10, 0.1, 1.5, None).is_err());
}
#[test]
fn test_sparse_coded_signal_basic() {
let ds = make_sparse_coded_signal(50, 20, 10, 3, Some(42)).expect("ok");
assert_eq!(ds.n_samples(), 50);
assert_eq!(ds.n_features(), 20);
assert!(ds.target.is_some());
let target = ds.target.as_ref().expect("target");
assert_eq!(target.len(), 20 * 10);
}
#[test]
fn test_sparse_coded_signal_dictionary_normalized() {
let n_features = 15;
let n_components = 8;
let ds = make_sparse_coded_signal(30, n_features, n_components, 2, Some(42)).expect("ok");
let target = ds.target.as_ref().expect("target");
for j in 0..n_components {
let mut norm_sq = 0.0;
for i in 0..n_features {
let val = target[i * n_components + j];
norm_sq += val * val;
}
assert!(
(norm_sq.sqrt() - 1.0).abs() < 1e-10,
"Dictionary atom {j} should have unit norm, got {}",
norm_sq.sqrt()
);
}
}
#[test]
fn test_sparse_coded_signal_reproducibility() {
let ds1 = make_sparse_coded_signal(20, 10, 5, 2, Some(42)).expect("ok");
let ds2 = make_sparse_coded_signal(20, 10, 5, 2, Some(42)).expect("ok");
for i in 0..20 {
for j in 0..10 {
assert!(
(ds1.data[[i, j]] - ds2.data[[i, j]]).abs() < 1e-12,
"Reproducibility failed at ({i},{j})"
);
}
}
}
#[test]
fn test_sparse_coded_signal_validation() {
assert!(make_sparse_coded_signal(0, 10, 5, 2, None).is_err());
assert!(make_sparse_coded_signal(10, 0, 5, 2, None).is_err());
assert!(make_sparse_coded_signal(10, 10, 0, 2, None).is_err());
assert!(make_sparse_coded_signal(10, 10, 5, 0, None).is_err());
assert!(make_sparse_coded_signal(10, 10, 5, 6, None).is_err());
}
}