use faer::Mat;
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::{Rng, SeedableRng};
pub fn generate_swiss_roll(n_samples: usize, noise: f64, seed: u64) -> Mat<f64> {
let mut rng = StdRng::seed_from_u64(seed);
let mut data = Mat::<f64>::zeros(n_samples, 3);
for i in 0..n_samples {
let t = 1.5 * std::f64::consts::PI * (1.0 + 2.0 * rng.random::<f64>());
let height = 21.0 * rng.random::<f64>();
let noise_x = rng.random_range(-noise..noise);
let noise_y = rng.random_range(-noise..noise);
let noise_z = rng.random_range(-noise..noise);
data[(i, 0)] = t * t.cos() + noise_x;
data[(i, 1)] = height + noise_y;
data[(i, 2)] = t * t.sin() + noise_z;
}
data
}
pub fn generate_swiss_roll_biased(
n_samples: usize,
noise: f64,
density_bias: f64,
seed: u64,
) -> (Mat<f64>, Vec<f64>) {
let mut rng = StdRng::seed_from_u64(seed);
let mut data = Mat::<f64>::zeros(n_samples, 3);
let mut t_values = Vec::with_capacity(n_samples);
for i in 0..n_samples {
let u: f64 = rng.random();
let u_biased = u.powf(density_bias);
let t = 1.5 * std::f64::consts::PI * (1.0 + 2.0 * u_biased);
let height = 21.0 * rng.random::<f64>();
let noise_x = rng.random_range(-noise..noise);
let noise_y = rng.random_range(-noise..noise);
let noise_z = rng.random_range(-noise..noise);
data[(i, 0)] = t * t.cos() + noise_x;
data[(i, 1)] = height + noise_y;
data[(i, 2)] = t * t.sin() + noise_z;
t_values.push(t);
}
(data, t_values)
}
pub fn generate_clustered_data(
n_samples: usize,
dim: usize,
n_clusters: usize,
seed: u64,
) -> (Mat<f64>, Vec<usize>) {
let mut rng = StdRng::seed_from_u64(seed);
let mut data = Mat::<f64>::zeros(n_samples, dim);
let mut centres = Vec::with_capacity(n_clusters);
let mut cluster_stds = Vec::new();
for _ in 0..n_clusters {
let centre: Vec<f64> = (0..dim).map(|_| rng.random_range(-7.5..7.5)).collect();
centres.push(centre);
cluster_stds.push(rng.random_range(0.5..2.5));
}
let mut cluster_assignments = Vec::new();
for cluster_idx in 0..n_clusters {
let weight = rng.random_range(0.5..2.5);
let n_in_cluster = ((n_samples as f64 * weight) / (n_clusters as f64 * 1.25)) as usize;
cluster_assignments.extend(vec![cluster_idx; n_in_cluster]);
}
while cluster_assignments.len() < n_samples {
cluster_assignments.push(rng.random_range(0..n_clusters));
}
cluster_assignments.shuffle(&mut rng);
cluster_assignments.truncate(n_samples);
for (i, &cluster_idx) in cluster_assignments.iter().enumerate() {
let centre = ¢res[cluster_idx];
let std = cluster_stds[cluster_idx];
for j in 0..dim {
let u1: f64 = rng.random();
let u2: f64 = rng.random();
let noise = (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos();
data[(i, j)] = centre[j] + noise * std;
}
}
(data, cluster_assignments)
}
#[derive(Default, Clone, Debug)]
pub enum TrajectoryTopology {
#[default]
DeepBifurcation,
Linear,
Comb,
}
pub fn parse_topology(s: &str) -> Option<TrajectoryTopology> {
match s.to_lowercase().as_str() {
"bifurcation" => Some(TrajectoryTopology::DeepBifurcation),
"linear" => Some(TrajectoryTopology::Linear),
"combination" => Some(TrajectoryTopology::Comb),
_ => None,
}
}
#[derive(Clone, Debug)]
pub struct BranchSpec {
pub parent: Option<usize>,
pub split_at: f64,
pub length: f64,
}
pub fn generate_example_branches(topology: &TrajectoryTopology) -> Vec<BranchSpec> {
match topology {
TrajectoryTopology::DeepBifurcation => vec![
BranchSpec {
parent: None,
split_at: 0.0,
length: 0.75,
}, BranchSpec {
parent: Some(0),
split_at: 1.0,
length: 0.5,
}, BranchSpec {
parent: Some(1),
split_at: 1.0,
length: 3.0,
}, BranchSpec {
parent: Some(1),
split_at: 1.0,
length: 1.0,
}, BranchSpec {
parent: Some(3),
split_at: 1.0,
length: 1.5,
}, BranchSpec {
parent: Some(3),
split_at: 1.0,
length: 2.5,
}, ],
TrajectoryTopology::Linear => vec![
BranchSpec {
parent: None,
split_at: 0.0,
length: 1.0,
}, BranchSpec {
parent: Some(0),
split_at: 1.0,
length: 2.0,
}, BranchSpec {
parent: Some(1),
split_at: 1.0,
length: 3.0,
}, ],
TrajectoryTopology::Comb => vec![
BranchSpec {
parent: None,
split_at: 0.0,
length: 5.0,
}, BranchSpec {
parent: Some(0),
split_at: 0.2,
length: 2.0,
}, BranchSpec {
parent: Some(0),
split_at: 0.5,
length: 2.5,
}, BranchSpec {
parent: Some(0),
split_at: 0.8,
length: 3.0,
}, ],
}
}
fn box_muller(rng: &mut StdRng) -> f64 {
let u1: f64 = rng.random();
let u2: f64 = rng.random();
(-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos()
}
pub fn generate_trajectory(
n_samples: usize,
branches: &[BranchSpec],
dim: usize,
noise: f64,
seed: u64,
) -> (Mat<f64>, Vec<usize>) {
assert!(dim >= branches.len(), "dim must be >= number of branches");
let mut rng = StdRng::seed_from_u64(seed);
let n_branches = branches.len();
let n_per_branch = n_samples / n_branches;
let n_bg = 3usize;
let bg_weight = 0.15;
let bg_dirs: Vec<Vec<f64>> = (0..n_bg)
.map(|_| {
let mut v: Vec<f64> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
let norm = v.iter().map(|x| x * x).sum::<f64>().sqrt();
v.iter_mut().for_each(|x| *x /= norm);
v
})
.collect();
let mut dirs: Vec<Vec<f64>> = Vec::with_capacity(n_branches);
let mut curve_dirs: Vec<Vec<f64>> = Vec::with_capacity(n_branches);
let mut starts: Vec<Vec<f64>> = Vec::with_capacity(n_branches);
for spec in branches.iter() {
let mut dir: Vec<f64> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
for prev in &dirs {
let dot: f64 = dir.iter().zip(prev).map(|(a, b)| a * b).sum();
for j in 0..dim {
dir[j] -= dot * prev[j];
}
}
let norm = dir.iter().map(|x| x * x).sum::<f64>().sqrt();
let mut dir: Vec<f64> = dir.iter().map(|x| x / norm).collect();
if let Some(p) = spec.parent {
let alpha = 0.4f64;
for j in 0..dim {
dir[j] = alpha * dirs[p][j] + (1.0 - alpha) * dir[j];
}
let norm = dir.iter().map(|x| x * x).sum::<f64>().sqrt();
dir = dir.iter().map(|x| x / norm).collect();
}
let mut c_dir: Vec<f64> = (0..dim).map(|_| rng.random_range(-1.0..1.0)).collect();
let dot_c: f64 = c_dir.iter().zip(&dir).map(|(c, d)| c * d).sum();
for j in 0..dim {
c_dir[j] -= dot_c * dir[j];
}
let norm_c = c_dir.iter().map(|x| x * x).sum::<f64>().sqrt();
let c_dir: Vec<f64> = c_dir.iter().map(|x| x / norm_c).collect();
let start = match spec.parent {
None => vec![0.0; dim],
Some(p) => {
let t_parent = spec.split_at * branches[p].length;
let parent_curve_amt = (t_parent / branches[p].length * std::f64::consts::PI).sin()
* (branches[p].length * 0.3);
(0..dim)
.map(|j| {
starts[p][j] + t_parent * dirs[p][j] + parent_curve_amt * curve_dirs[p][j]
})
.collect()
}
};
dirs.push(dir);
curve_dirs.push(c_dir);
starts.push(start);
}
let mut data = Mat::<f64>::zeros(n_samples, dim);
let mut assignments = Vec::with_capacity(n_samples);
let mut idx = 0;
let transition_window = 0.3f64;
for (b, spec) in branches.iter().enumerate() {
let count = if b == n_branches - 1 {
n_samples - idx
} else {
n_per_branch
};
for _ in 0..count {
let u: f64 = rng.random();
let t = spec.length * u.powf(2.5);
let blend = if spec.parent.is_some() {
let frac = t / spec.length;
if frac < transition_window {
(1.0 - frac / transition_window).powi(2)
} else {
0.0
}
} else {
0.0
};
let mut clean_pos = vec![0.0; dim];
let curve_amt = (t / spec.length * std::f64::consts::PI).sin() * (spec.length * 0.3);
for j in 0..dim {
clean_pos[j] = starts[b][j] + t * dirs[b][j] + curve_amt * curve_dirs[b][j];
}
if blend > 0.0 {
if let Some(p) = spec.parent {
let t_parent = spec.split_at * branches[p].length;
let parent_curve_amt = (t_parent / branches[p].length * std::f64::consts::PI)
.sin()
* (branches[p].length * 0.3);
for j in 0..dim {
let parent_pos = starts[p][j]
+ t_parent * dirs[p][j]
+ parent_curve_amt * curve_dirs[p][j];
clean_pos[j] = blend * parent_pos + (1.0 - blend) * clean_pos[j];
}
}
}
let local_noise = noise * (1.0 + t / spec.length);
for j in 0..dim {
let noise_val = local_noise * box_muller(&mut rng);
data[(idx, j)] = clean_pos[j] + noise_val;
}
let dot: f64 = (0..dim)
.map(|j| (data[(idx, j)] - clean_pos[j]) * dirs[b][j])
.sum();
for j in 0..dim {
data[(idx, j)] -= dot * dirs[b][j];
}
for bg in &bg_dirs {
let coeff = bg_weight * box_muller(&mut rng);
for j in 0..dim {
data[(idx, j)] += coeff * bg[j];
}
}
assignments.push(b);
idx += 1;
}
}
(data, assignments)
}
#[allow(clippy::too_many_arguments)]
pub fn generate_hierarchical_clusters(
n_samples: usize,
dim: usize,
n_supergroups: usize,
n_subclusts: usize,
supergroup_spread: f64,
subcluster_spread: f64,
point_std: f64,
seed: u64,
) -> (Mat<f64>, Vec<usize>, Vec<usize>) {
let mut rng = StdRng::seed_from_u64(seed);
let total_subclusters = n_supergroups * n_subclusts;
let n_per_subcluster = n_samples / total_subclusters;
let supergroup_centres: Vec<Vec<f64>> = (0..n_supergroups)
.map(|_| {
(0..dim)
.map(|_| rng.random_range(-supergroup_spread..supergroup_spread))
.collect()
})
.collect();
let mut subcluster_centres: Vec<(usize, Vec<f64>)> = Vec::with_capacity(total_subclusters);
for sg in 0..n_supergroups {
let sc = &supergroup_centres[sg];
for _ in 0..n_subclusts {
let centre = sc
.iter()
.map(|&c| c + rng.random_range(-subcluster_spread..subcluster_spread))
.collect();
subcluster_centres.push((sg, centre));
}
}
let actual_n = n_per_subcluster * total_subclusters;
let mut data = Mat::<f64>::zeros(actual_n, dim);
let mut supergroup_labels = Vec::with_capacity(actual_n);
let mut subcluster_labels = Vec::with_capacity(actual_n);
let mut idx = 0;
for (sc_idx, (sg_idx, centre)) in subcluster_centres.iter().enumerate() {
for _ in 0..n_per_subcluster {
for j in 0..dim {
let u1: f64 = rng.random();
let u2: f64 = rng.random();
let noise = (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos();
data[(idx, j)] = centre[j] + noise * point_std;
}
supergroup_labels.push(*sg_idx);
subcluster_labels.push(sc_idx);
idx += 1;
}
}
(data, supergroup_labels, subcluster_labels)
}