1use ghostflow_core::Tensor;
4
5pub struct IncrementalPCA {
7 pub n_components: usize,
8 pub batch_size: Option<usize>,
9 pub whiten: bool,
10 components_: Option<Vec<Vec<f32>>>,
11 mean_: Option<Vec<f32>>,
12 var_: Option<Vec<f32>>,
13 singular_values_: Option<Vec<f32>>,
14 explained_variance_: Option<Vec<f32>>,
15 explained_variance_ratio_: Option<Vec<f32>>,
16 n_samples_seen_: usize,
17 n_features_: usize,
18}
19
20impl IncrementalPCA {
21 pub fn new(n_components: usize) -> Self {
22 IncrementalPCA {
23 n_components,
24 batch_size: None,
25 whiten: false,
26 components_: None,
27 mean_: None,
28 var_: None,
29 singular_values_: None,
30 explained_variance_: None,
31 explained_variance_ratio_: None,
32 n_samples_seen_: 0,
33 n_features_: 0,
34 }
35 }
36
37 pub fn batch_size(mut self, size: usize) -> Self {
38 self.batch_size = Some(size);
39 self
40 }
41
42 pub fn whiten(mut self, w: bool) -> Self {
43 self.whiten = w;
44 self
45 }
46
47 pub fn partial_fit(&mut self, x: &Tensor) {
49 let x_data = x.data_f32();
50 let n_samples = x.dims()[0];
51 let n_features = x.dims()[1];
52
53 if self.n_samples_seen_ == 0 {
54 self.n_features_ = n_features;
55 self.mean_ = Some(vec![0.0; n_features]);
56 self.var_ = Some(vec![0.0; n_features]);
57 }
58
59 let mean = self.mean_.as_mut().unwrap();
60 let var = self.var_.as_mut().unwrap();
61
62 let old_n = self.n_samples_seen_ as f32;
64 let new_n = (self.n_samples_seen_ + n_samples) as f32;
65
66 let batch_mean: Vec<f32> = (0..n_features)
68 .map(|j| (0..n_samples).map(|i| x_data[i * n_features + j]).sum::<f32>() / n_samples as f32)
69 .collect();
70
71 let batch_var: Vec<f32> = (0..n_features)
72 .map(|j| {
73 let m = batch_mean[j];
74 (0..n_samples).map(|i| (x_data[i * n_features + j] - m).powi(2)).sum::<f32>() / n_samples as f32
75 })
76 .collect();
77
78 for j in 0..n_features {
80 let delta = batch_mean[j] - mean[j];
81 let new_mean = mean[j] + delta * n_samples as f32 / new_n;
82
83 let m_a = var[j] * old_n;
85 let m_b = batch_var[j] * n_samples as f32;
86 let m2 = m_a + m_b + delta * delta * old_n * n_samples as f32 / new_n;
87
88 mean[j] = new_mean;
89 var[j] = m2 / new_n;
90 }
91
92 let x_centered: Vec<f32> = (0..n_samples)
94 .flat_map(|i| (0..n_features).map(|j| x_data[i * n_features + j] - mean[j]).collect::<Vec<_>>())
95 .collect();
96
97 if self.components_.is_none() {
99 let (_u, s, vt) = self.svd(&x_centered, n_samples, n_features);
101
102 let n_comp = self.n_components.min(n_samples).min(n_features);
103 self.components_ = Some(vt.into_iter().take(n_comp).collect());
104 self.singular_values_ = Some(s.into_iter().take(n_comp).collect());
105 } else {
106 let components = self.components_.as_ref().unwrap();
108 let singular_values = self.singular_values_.as_ref().unwrap();
109 let n_comp = components.len();
110
111 let mut combined = Vec::with_capacity((n_comp + n_samples) * n_features);
114
115 for k in 0..n_comp {
117 for j in 0..n_features {
118 combined.push(singular_values[k] * components[k][j]);
119 }
120 }
121
122 combined.extend_from_slice(&x_centered);
124
125 let (_, s, vt) = self.svd(&combined, n_comp + n_samples, n_features);
127
128 let new_n_comp = self.n_components.min(n_comp + n_samples).min(n_features);
129 self.components_ = Some(vt.into_iter().take(new_n_comp).collect());
130 self.singular_values_ = Some(s.into_iter().take(new_n_comp).collect());
131 }
132
133 self.n_samples_seen_ += n_samples;
134
135 self.update_explained_variance();
137 }
138
139 fn svd(&self, x: &[f32], n_rows: usize, n_cols: usize) -> (Vec<Vec<f32>>, Vec<f32>, Vec<Vec<f32>>) {
140 let mut xtx = vec![0.0f32; n_cols * n_cols];
142 for i in 0..n_cols {
143 for j in 0..n_cols {
144 for k in 0..n_rows {
145 xtx[i * n_cols + j] += x[k * n_cols + i] * x[k * n_cols + j];
146 }
147 }
148 }
149
150 let n_components = self.n_components.min(n_rows).min(n_cols);
152 let mut eigenvalues = Vec::new();
153 let mut eigenvectors = Vec::new();
154
155 for _ in 0..n_components {
156 let mut v = vec![1.0f32; n_cols];
157 normalize(&mut v);
158
159 for _ in 0..100 {
160 let mut av = vec![0.0f32; n_cols];
161 for i in 0..n_cols {
162 for j in 0..n_cols {
163 av[i] += xtx[i * n_cols + j] * v[j];
164 }
165 }
166
167 let norm: f32 = av.iter().map(|&x| x * x).sum::<f32>().sqrt();
168 if norm < 1e-10 { break; }
169
170 let prev_v = v.clone();
171 for (vi, avi) in v.iter_mut().zip(av.iter()) {
172 *vi = *avi / norm;
173 }
174
175 let diff: f32 = v.iter().zip(prev_v.iter()).map(|(&a, &b)| (a - b).abs()).sum();
176 if diff < 1e-6 { break; }
177 }
178
179 let mut eigenvalue = 0.0f32;
181 for i in 0..n_cols {
182 let mut av_i = 0.0f32;
183 for j in 0..n_cols {
184 av_i += xtx[i * n_cols + j] * v[j];
185 }
186 eigenvalue += v[i] * av_i;
187 }
188
189 eigenvalues.push(eigenvalue.max(0.0).sqrt());
190 eigenvectors.push(v.clone());
191
192 for i in 0..n_cols {
194 for j in 0..n_cols {
195 xtx[i * n_cols + j] -= eigenvalue * v[i] * v[j];
196 }
197 }
198 }
199
200 let mut u = Vec::new();
202 for k in 0..eigenvalues.len() {
203 if eigenvalues[k] > 1e-10 {
204 let mut u_k = vec![0.0f32; n_rows];
205 for i in 0..n_rows {
206 for j in 0..n_cols {
207 u_k[i] += x[i * n_cols + j] * eigenvectors[k][j];
208 }
209 u_k[i] /= eigenvalues[k];
210 }
211 u.push(u_k);
212 }
213 }
214
215 (u, eigenvalues, eigenvectors)
216 }
217
218 fn update_explained_variance(&mut self) {
219 if let Some(ref sv) = self.singular_values_ {
220 let n = self.n_samples_seen_ as f32;
221 let explained_var: Vec<f32> = sv.iter().map(|&s| s * s / (n - 1.0).max(1.0)).collect();
222 let total_var: f32 = explained_var.iter().sum();
223 let explained_ratio: Vec<f32> = explained_var.iter()
224 .map(|&v| v / total_var.max(1e-10))
225 .collect();
226
227 self.explained_variance_ = Some(explained_var);
228 self.explained_variance_ratio_ = Some(explained_ratio);
229 }
230 }
231
232 pub fn fit(&mut self, x: &Tensor) {
234 let batch_size = self.batch_size.unwrap_or(x.dims()[0]);
235 let n_samples = x.dims()[0];
236 let n_features = x.dims()[1];
237 let x_data = x.data_f32();
238
239 self.n_samples_seen_ = 0;
241 self.components_ = None;
242 self.mean_ = None;
243 self.var_ = None;
244
245 let mut start = 0;
247 while start < n_samples {
248 let end = (start + batch_size).min(n_samples);
249 let batch_data: Vec<f32> = x_data[start * n_features..end * n_features].to_vec();
250 let batch = Tensor::from_slice(&batch_data, &[end - start, n_features]).unwrap();
251 self.partial_fit(&batch);
252 start = end;
253 }
254 }
255
256 pub fn transform(&self, x: &Tensor) -> Tensor {
257 let x_data = x.data_f32();
258 let n_samples = x.dims()[0];
259 let n_features = x.dims()[1];
260
261 let components = self.components_.as_ref().expect("Model not fitted");
262 let mean = self.mean_.as_ref().unwrap();
263 let n_components = components.len();
264
265 let mut result = vec![0.0f32; n_samples * n_components];
266
267 for i in 0..n_samples {
268 for k in 0..n_components {
269 for j in 0..n_features {
270 result[i * n_components + k] +=
271 (x_data[i * n_features + j] - mean[j]) * components[k][j];
272 }
273
274 if self.whiten {
275 if let Some(ref sv) = self.singular_values_ {
276 let scale = (self.n_samples_seen_ as f32 - 1.0).sqrt() / sv[k].max(1e-10);
277 result[i * n_components + k] *= scale;
278 }
279 }
280 }
281 }
282
283 Tensor::from_slice(&result, &[n_samples, n_components]).unwrap()
284 }
285
286 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
287 self.fit(x);
288 self.transform(x)
289 }
290
291 pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
292 let x_data = x.data_f32();
293 let n_samples = x.dims()[0];
294 let n_components = x.dims()[1];
295
296 let components = self.components_.as_ref().expect("Model not fitted");
297 let mean = self.mean_.as_ref().unwrap();
298 let n_features = self.n_features_;
299
300 let mut result = vec![0.0f32; n_samples * n_features];
301
302 for i in 0..n_samples {
303 for j in 0..n_features {
304 result[i * n_features + j] = mean[j];
305 for k in 0..n_components {
306 let mut val = x_data[i * n_components + k];
307
308 if self.whiten {
309 if let Some(ref sv) = self.singular_values_ {
310 val *= sv[k] / (self.n_samples_seen_ as f32 - 1.0).sqrt().max(1e-10);
311 }
312 }
313
314 result[i * n_features + j] += val * components[k][j];
315 }
316 }
317 }
318
319 Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
320 }
321
322 pub fn explained_variance_ratio(&self) -> Option<&Vec<f32>> {
323 self.explained_variance_ratio_.as_ref()
324 }
325
326 pub fn n_samples_seen(&self) -> usize {
327 self.n_samples_seen_
328 }
329}
330
331fn normalize(v: &mut [f32]) {
332 let norm: f32 = v.iter().map(|&x| x * x).sum::<f32>().sqrt();
333 if norm > 1e-10 {
334 for vi in v.iter_mut() { *vi /= norm; }
335 }
336}
337
338pub struct MiniBatchSparsePCA {
340 pub n_components: usize,
341 pub alpha: f32,
342 pub batch_size: usize,
343 pub max_iter: usize,
344 components_: Option<Vec<Vec<f32>>>,
345 mean_: Option<Vec<f32>>,
346 n_iter_: usize,
347}
348
349impl MiniBatchSparsePCA {
350 pub fn new(n_components: usize) -> Self {
351 MiniBatchSparsePCA {
352 n_components,
353 alpha: 1.0,
354 batch_size: 100,
355 max_iter: 100,
356 components_: None,
357 mean_: None,
358 n_iter_: 0,
359 }
360 }
361
362 pub fn alpha(mut self, a: f32) -> Self { self.alpha = a; self }
363 pub fn batch_size(mut self, b: usize) -> Self { self.batch_size = b; self }
364
365 pub fn fit(&mut self, x: &Tensor) {
366 use rand::prelude::*;
367
368 let x_data = x.data_f32();
369 let n_samples = x.dims()[0];
370 let n_features = x.dims()[1];
371
372 let mean: Vec<f32> = (0..n_features)
374 .map(|j| (0..n_samples).map(|i| x_data[i * n_features + j]).sum::<f32>() / n_samples as f32)
375 .collect();
376
377 let mut rng = thread_rng();
379 let mut components: Vec<Vec<f32>> = (0..self.n_components)
380 .map(|_| {
381 let mut v: Vec<f32> = (0..n_features).map(|_| rng.gen::<f32>() - 0.5).collect();
382 normalize(&mut v);
383 v
384 })
385 .collect();
386
387 let mut indices: Vec<usize> = (0..n_samples).collect();
389
390 for iter in 0..self.max_iter {
391 indices.shuffle(&mut rng);
392
393 for batch_start in (0..n_samples).step_by(self.batch_size) {
394 let batch_end = (batch_start + self.batch_size).min(n_samples);
395 let batch_indices = &indices[batch_start..batch_end];
396
397 let batch: Vec<f32> = batch_indices.iter()
399 .flat_map(|&i| (0..n_features).map(|j| x_data[i * n_features + j] - mean[j]).collect::<Vec<_>>())
400 .collect();
401 let batch_size = batch_indices.len();
402
403 for k in 0..self.n_components {
405 let mut grad = vec![0.0f32; n_features];
407
408 for i in 0..batch_size {
409 let mut proj = 0.0f32;
411 for j in 0..n_features {
412 proj += batch[i * n_features + j] * components[k][j];
413 }
414
415 for j in 0..n_features {
417 grad[j] += proj * batch[i * n_features + j];
418 }
419 }
420
421 let lr = 0.01 / (iter + 1) as f32;
423 for j in 0..n_features {
424 let g = grad[j] / batch_size as f32;
425 components[k][j] += lr * g;
426
427 let sign = components[k][j].signum();
429 components[k][j] = (components[k][j].abs() - lr * self.alpha).max(0.0) * sign;
430 }
431
432 normalize(&mut components[k]);
433 }
434 }
435
436 self.n_iter_ = iter + 1;
437 }
438
439 self.components_ = Some(components);
440 self.mean_ = Some(mean);
441 }
442
443 pub fn transform(&self, x: &Tensor) -> Tensor {
444 let x_data = x.data_f32();
445 let n_samples = x.dims()[0];
446 let n_features = x.dims()[1];
447
448 let components = self.components_.as_ref().expect("Model not fitted");
449 let mean = self.mean_.as_ref().unwrap();
450
451 let mut result = vec![0.0f32; n_samples * self.n_components];
452
453 for i in 0..n_samples {
454 for k in 0..self.n_components {
455 for j in 0..n_features {
456 result[i * self.n_components + k] +=
457 (x_data[i * n_features + j] - mean[j]) * components[k][j];
458 }
459 }
460 }
461
462 Tensor::from_slice(&result, &[n_samples, self.n_components]).unwrap()
463 }
464
465 pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
466 self.fit(x);
467 self.transform(x)
468 }
469}
470
471#[cfg(test)]
472mod tests {
473 use super::*;
474
475 #[test]
476 fn test_incremental_pca() {
477 let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0,
478 4.0, 5.0, 6.0,
479 7.0, 8.0, 9.0,
480 10.0, 11.0, 12.0,
481 ], &[4, 3]).unwrap();
482
483 let mut ipca = IncrementalPCA::new(2).batch_size(2);
484 let result = ipca.fit_transform(&x);
485 assert_eq!(result.dims(), &[4, 2]);
486 }
487
488 #[test]
489 fn test_incremental_pca_partial_fit() {
490 let mut ipca = IncrementalPCA::new(2);
491
492 let batch1 = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
493 let batch2 = Tensor::from_slice(&[7.0f32, 8.0, 9.0, 10.0, 11.0, 12.0], &[2, 3]).unwrap();
494
495 ipca.partial_fit(&batch1);
496 ipca.partial_fit(&batch2);
497
498 assert_eq!(ipca.n_samples_seen(), 4);
499 }
500}
501
502