datasynth_core/diffusion/
utils.rs1use rand::SeedableRng;
2use rand_chacha::ChaCha8Rng;
3use rand_distr::{Distribution, Normal};
4
5pub fn add_gaussian_noise(x: &[f64], variance: f64, rng: &mut ChaCha8Rng) -> Vec<f64> {
7 let std_dev = variance.sqrt();
8 if let Ok(normal) = Normal::new(0.0, std_dev) {
9 x.iter().map(|&v| v + normal.sample(rng)).collect()
10 } else {
11 x.to_vec()
12 }
13}
14
/// Standardizes each feature column to zero mean and unit standard deviation.
///
/// Returns `(normalized, means, stds)`. The number of features is taken from
/// the first row. Means and standard deviations are population statistics
/// (the divisor is the number of rows), and every standard deviation is
/// floored at `1e-8` so constant columns do not cause a division by zero.
///
/// Rows are not required to have equal length: a shorter row simply
/// contributes fewer terms to the sums (the divisor stays the row count), and
/// any value beyond the first row's width is copied through unchanged.
///
/// An empty `data` slice yields three empty vectors.
pub fn normalize_features(data: &[Vec<f64>]) -> (Vec<Vec<f64>>, Vec<f64>, Vec<f64>) {
    let width = match data.first() {
        Some(first_row) => first_row.len(),
        None => return (Vec::new(), Vec::new(), Vec::new()),
    };
    let row_count = data.len() as f64;

    // Column sums -> column means. `zip` stops at the shorter side, so columns
    // past `width` are ignored and short rows add fewer terms.
    let mut means = vec![0.0; width];
    for row in data {
        for (total, &value) in means.iter_mut().zip(row) {
            *total += value;
        }
    }
    means.iter_mut().for_each(|mean| *mean /= row_count);

    // Sum of squared deviations -> population standard deviation.
    let mut stds = vec![0.0; width];
    for row in data {
        for ((total, &mean), &value) in stds.iter_mut().zip(&means).zip(row) {
            *total += (value - mean).powi(2);
        }
    }
    for spread in stds.iter_mut() {
        // Floor keeps the later division well-defined for constant columns.
        *spread = (*spread / row_count).sqrt().max(1e-8);
    }

    let mut normalized: Vec<Vec<f64>> = Vec::with_capacity(data.len());
    for row in data {
        let scaled: Vec<f64> = row
            .iter()
            .enumerate()
            .map(|(column, &value)| match (means.get(column), stds.get(column)) {
                (Some(&mean), Some(&spread)) => (value - mean) / spread,
                // Column outside the fitted width: leave untouched.
                _ => value,
            })
            .collect();
        normalized.push(scaled);
    }

    (normalized, means, stds)
}
70
/// Reverses a per-column standardization by computing
/// `value * stds[j] + means[j]` for every column `j`.
///
/// Only columns that have both a mean and a standard deviation are
/// transformed; values in any further columns are copied through unchanged.
/// Row lengths are preserved.
pub fn denormalize_features(data: &[Vec<f64>], means: &[f64], stds: &[f64]) -> Vec<Vec<f64>> {
    // Number of leading columns for which both statistics are available.
    let covered = means.len().min(stds.len());

    let mut restored = Vec::with_capacity(data.len());
    for row in data {
        let rescaled = row
            .iter()
            .zip(stds.iter().zip(means))
            .map(|(&value, (&spread, &mean))| value * spread + mean);
        // Columns without statistics are passed through as-is.
        let untouched = row.iter().skip(covered).copied();
        restored.push(rescaled.chain(untouched).collect::<Vec<f64>>());
    }
    restored
}
88
/// Clamps every value in `data` to the inclusive range `[min, max]`, in place.
///
/// # Panics
///
/// Panics (via `f64::clamp`) if `min > max` or either bound is NaN, provided
/// `data` contains at least one value.
pub fn clip_values(data: &mut [Vec<f64>], min: f64, max: f64) {
    data.iter_mut()
        .flat_map(|row| row.iter_mut())
        .for_each(|cell| *cell = cell.clamp(min, max));
}
97
98pub fn generate_noise(n_samples: usize, n_features: usize, seed: u64) -> Vec<Vec<f64>> {
100 let mut rng = ChaCha8Rng::seed_from_u64(seed);
101 if let Ok(normal) = Normal::new(0.0, 1.0) {
102 (0..n_samples)
103 .map(|_| (0..n_features).map(|_| normal.sample(&mut rng)).collect())
104 .collect()
105 } else {
106 vec![vec![0.0; n_features]; n_samples]
107 }
108}
109
#[cfg(test)]
mod tests {
    use super::*;

    /// Low-variance noise keeps the length and stays close to the input.
    #[test]
    fn test_add_gaussian_noise() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let input = vec![1.0, 2.0, 3.0];
        let output = add_gaussian_noise(&input, 0.01, &mut rng);
        assert_eq!(output.len(), 3);
        assert!(input
            .iter()
            .zip(output.iter())
            .all(|(before, after)| (before - after).abs() < 1.0));
    }

    /// Normalizing and then denormalizing reproduces the original values.
    #[test]
    fn test_normalize_denormalize_roundtrip() {
        let original = vec![vec![10.0, 20.0], vec![12.0, 22.0], vec![14.0, 24.0]];
        let (scaled, means, stds) = normalize_features(&original);
        let restored = denormalize_features(&scaled, &means, &stds);

        let pairs = original.iter().flatten().zip(restored.iter().flatten());
        for (o, r) in pairs {
            assert!((o - r).abs() < 1e-10, "Roundtrip failed: {} vs {}", o, r);
        }
    }

    /// The first normalized column averages to (approximately) zero.
    #[test]
    fn test_normalize_zero_mean() {
        let samples = vec![vec![10.0, 20.0], vec![20.0, 40.0]];
        let (normalized, _means, _stds) = normalize_features(&samples);
        let first_column_total: f64 = normalized.iter().map(|row| row[0]).sum();
        let mean = first_column_total / normalized.len() as f64;
        assert!(
            mean.abs() < 1e-10,
            "Normalized mean should be ~0, got {}",
            mean
        );
    }

    /// Values outside the bounds are pulled to the nearest bound.
    #[test]
    fn test_clip_values() {
        let mut grid = vec![vec![-5.0, 10.0, 0.5]];
        clip_values(&mut grid, 0.0, 1.0);
        assert_eq!(grid, vec![vec![0.0, 1.0, 0.5]]);
    }

    /// The generated matrix has the requested number of rows and columns.
    #[test]
    fn test_generate_noise_shape() {
        let matrix = generate_noise(100, 5, 42);
        assert_eq!(matrix.len(), 100);
        assert_eq!(matrix[0].len(), 5);
    }

    /// Empty input produces empty outputs rather than panicking.
    #[test]
    fn test_normalize_empty() {
        let (normalized, means, stds) = normalize_features(&[]);
        assert!(normalized.is_empty());
        assert!(means.is_empty());
        assert!(stds.is_empty());
    }
}