// ghostflow_ml/decomposition_incremental.rs

//! Incremental Decomposition - IncrementalPCA, MiniBatchSparsePCA
2
use ghostflow_core::Tensor;
4
5/// Incremental PCA - for large datasets that don't fit in memory
/// Incremental PCA - for large datasets that don't fit in memory
pub struct IncrementalPCA {
    /// Number of principal components to retain.
    pub n_components: usize,
    /// Rows per chunk used by `fit`; `None` means fit in a single batch.
    pub batch_size: Option<usize>,
    /// If true, `transform` rescales projections toward unit variance.
    pub whiten: bool,
    // Principal axes; one row of length `n_features_` per component.
    components_: Option<Vec<Vec<f32>>>,
    // Running per-feature mean over all samples seen so far.
    mean_: Option<Vec<f32>>,
    // Running per-feature biased (1/n) variance over all samples seen.
    var_: Option<Vec<f32>>,
    // Singular values paired with `components_`.
    singular_values_: Option<Vec<f32>>,
    // Variance explained by each retained component.
    explained_variance_: Option<Vec<f32>>,
    // Normalized form of `explained_variance_`.
    explained_variance_ratio_: Option<Vec<f32>>,
    // Total samples consumed across all `partial_fit` calls.
    n_samples_seen_: usize,
    // Feature count, fixed by the first batch.
    n_features_: usize,
}
19
20impl IncrementalPCA {
21    pub fn new(n_components: usize) -> Self {
22        IncrementalPCA {
23            n_components,
24            batch_size: None,
25            whiten: false,
26            components_: None,
27            mean_: None,
28            var_: None,
29            singular_values_: None,
30            explained_variance_: None,
31            explained_variance_ratio_: None,
32            n_samples_seen_: 0,
33            n_features_: 0,
34        }
35    }
36
37    pub fn batch_size(mut self, size: usize) -> Self {
38        self.batch_size = Some(size);
39        self
40    }
41
42    pub fn whiten(mut self, w: bool) -> Self {
43        self.whiten = w;
44        self
45    }
46
47    /// Partial fit on a batch of data
48    pub fn partial_fit(&mut self, x: &Tensor) {
49        let x_data = x.data_f32();
50        let n_samples = x.dims()[0];
51        let n_features = x.dims()[1];
52
53        if self.n_samples_seen_ == 0 {
54            self.n_features_ = n_features;
55            self.mean_ = Some(vec![0.0; n_features]);
56            self.var_ = Some(vec![0.0; n_features]);
57        }
58
59        let mean = self.mean_.as_mut().unwrap();
60        let var = self.var_.as_mut().unwrap();
61
62        // Update mean and variance incrementally (Welford's algorithm)
63        let old_n = self.n_samples_seen_ as f32;
64        let new_n = (self.n_samples_seen_ + n_samples) as f32;
65
66        // Compute batch statistics
67        let batch_mean: Vec<f32> = (0..n_features)
68            .map(|j| (0..n_samples).map(|i| x_data[i * n_features + j]).sum::<f32>() / n_samples as f32)
69            .collect();
70
71        let batch_var: Vec<f32> = (0..n_features)
72            .map(|j| {
73                let m = batch_mean[j];
74                (0..n_samples).map(|i| (x_data[i * n_features + j] - m).powi(2)).sum::<f32>() / n_samples as f32
75            })
76            .collect();
77
78        // Update global statistics
79        for j in 0..n_features {
80            let delta = batch_mean[j] - mean[j];
81            let new_mean = mean[j] + delta * n_samples as f32 / new_n;
82            
83            // Combined variance
84            let m_a = var[j] * old_n;
85            let m_b = batch_var[j] * n_samples as f32;
86            let m2 = m_a + m_b + delta * delta * old_n * n_samples as f32 / new_n;
87            
88            mean[j] = new_mean;
89            var[j] = m2 / new_n;
90        }
91
92        // Center the batch data
93        let x_centered: Vec<f32> = (0..n_samples)
94            .flat_map(|i| (0..n_features).map(|j| x_data[i * n_features + j] - mean[j]).collect::<Vec<_>>())
95            .collect();
96
97        // Incremental SVD update
98        if self.components_.is_none() {
99            // First batch: compute SVD directly
100            let (_u, s, vt) = self.svd(&x_centered, n_samples, n_features);
101            
102            let n_comp = self.n_components.min(n_samples).min(n_features);
103            self.components_ = Some(vt.into_iter().take(n_comp).collect());
104            self.singular_values_ = Some(s.into_iter().take(n_comp).collect());
105        } else {
106            // Subsequent batches: incremental update
107            let components = self.components_.as_ref().unwrap();
108            let singular_values = self.singular_values_.as_ref().unwrap();
109            let n_comp = components.len();
110
111            // Project old components scaled by singular values
112            // Combine with new data for SVD
113            let mut combined = Vec::with_capacity((n_comp + n_samples) * n_features);
114            
115            // Add scaled components
116            for k in 0..n_comp {
117                for j in 0..n_features {
118                    combined.push(singular_values[k] * components[k][j]);
119                }
120            }
121            
122            // Add centered batch data
123            combined.extend_from_slice(&x_centered);
124
125            // SVD on combined matrix
126            let (_, s, vt) = self.svd(&combined, n_comp + n_samples, n_features);
127            
128            let new_n_comp = self.n_components.min(n_comp + n_samples).min(n_features);
129            self.components_ = Some(vt.into_iter().take(new_n_comp).collect());
130            self.singular_values_ = Some(s.into_iter().take(new_n_comp).collect());
131        }
132
133        self.n_samples_seen_ += n_samples;
134
135        // Update explained variance
136        self.update_explained_variance();
137    }
138
139    fn svd(&self, x: &[f32], n_rows: usize, n_cols: usize) -> (Vec<Vec<f32>>, Vec<f32>, Vec<Vec<f32>>) {
140        // Compute X^T X for eigendecomposition
141        let mut xtx = vec![0.0f32; n_cols * n_cols];
142        for i in 0..n_cols {
143            for j in 0..n_cols {
144                for k in 0..n_rows {
145                    xtx[i * n_cols + j] += x[k * n_cols + i] * x[k * n_cols + j];
146                }
147            }
148        }
149
150        // Power iteration for top eigenvectors
151        let n_components = self.n_components.min(n_rows).min(n_cols);
152        let mut eigenvalues = Vec::new();
153        let mut eigenvectors = Vec::new();
154
155        for _ in 0..n_components {
156            let mut v = vec![1.0f32; n_cols];
157            normalize(&mut v);
158
159            for _ in 0..100 {
160                let mut av = vec![0.0f32; n_cols];
161                for i in 0..n_cols {
162                    for j in 0..n_cols {
163                        av[i] += xtx[i * n_cols + j] * v[j];
164                    }
165                }
166
167                let norm: f32 = av.iter().map(|&x| x * x).sum::<f32>().sqrt();
168                if norm < 1e-10 { break; }
169
170                let prev_v = v.clone();
171                for (vi, avi) in v.iter_mut().zip(av.iter()) {
172                    *vi = *avi / norm;
173                }
174
175                let diff: f32 = v.iter().zip(prev_v.iter()).map(|(&a, &b)| (a - b).abs()).sum();
176                if diff < 1e-6 { break; }
177            }
178
179            // Compute eigenvalue
180            let mut eigenvalue = 0.0f32;
181            for i in 0..n_cols {
182                let mut av_i = 0.0f32;
183                for j in 0..n_cols {
184                    av_i += xtx[i * n_cols + j] * v[j];
185                }
186                eigenvalue += v[i] * av_i;
187            }
188
189            eigenvalues.push(eigenvalue.max(0.0).sqrt());
190            eigenvectors.push(v.clone());
191
192            // Deflate
193            for i in 0..n_cols {
194                for j in 0..n_cols {
195                    xtx[i * n_cols + j] -= eigenvalue * v[i] * v[j];
196                }
197            }
198        }
199
200        // Compute U = X V S^-1
201        let mut u = Vec::new();
202        for k in 0..eigenvalues.len() {
203            if eigenvalues[k] > 1e-10 {
204                let mut u_k = vec![0.0f32; n_rows];
205                for i in 0..n_rows {
206                    for j in 0..n_cols {
207                        u_k[i] += x[i * n_cols + j] * eigenvectors[k][j];
208                    }
209                    u_k[i] /= eigenvalues[k];
210                }
211                u.push(u_k);
212            }
213        }
214
215        (u, eigenvalues, eigenvectors)
216    }
217
218    fn update_explained_variance(&mut self) {
219        if let Some(ref sv) = self.singular_values_ {
220            let n = self.n_samples_seen_ as f32;
221            let explained_var: Vec<f32> = sv.iter().map(|&s| s * s / (n - 1.0).max(1.0)).collect();
222            let total_var: f32 = explained_var.iter().sum();
223            let explained_ratio: Vec<f32> = explained_var.iter()
224                .map(|&v| v / total_var.max(1e-10))
225                .collect();
226
227            self.explained_variance_ = Some(explained_var);
228            self.explained_variance_ratio_ = Some(explained_ratio);
229        }
230    }
231
232    /// Fit the model with all data at once (convenience method)
233    pub fn fit(&mut self, x: &Tensor) {
234        let batch_size = self.batch_size.unwrap_or(x.dims()[0]);
235        let n_samples = x.dims()[0];
236        let n_features = x.dims()[1];
237        let x_data = x.data_f32();
238
239        // Reset state
240        self.n_samples_seen_ = 0;
241        self.components_ = None;
242        self.mean_ = None;
243        self.var_ = None;
244
245        // Process in batches
246        let mut start = 0;
247        while start < n_samples {
248            let end = (start + batch_size).min(n_samples);
249            let batch_data: Vec<f32> = x_data[start * n_features..end * n_features].to_vec();
250            let batch = Tensor::from_slice(&batch_data, &[end - start, n_features]).unwrap();
251            self.partial_fit(&batch);
252            start = end;
253        }
254    }
255
256    pub fn transform(&self, x: &Tensor) -> Tensor {
257        let x_data = x.data_f32();
258        let n_samples = x.dims()[0];
259        let n_features = x.dims()[1];
260
261        let components = self.components_.as_ref().expect("Model not fitted");
262        let mean = self.mean_.as_ref().unwrap();
263        let n_components = components.len();
264
265        let mut result = vec![0.0f32; n_samples * n_components];
266
267        for i in 0..n_samples {
268            for k in 0..n_components {
269                for j in 0..n_features {
270                    result[i * n_components + k] += 
271                        (x_data[i * n_features + j] - mean[j]) * components[k][j];
272                }
273
274                if self.whiten {
275                    if let Some(ref sv) = self.singular_values_ {
276                        let scale = (self.n_samples_seen_ as f32 - 1.0).sqrt() / sv[k].max(1e-10);
277                        result[i * n_components + k] *= scale;
278                    }
279                }
280            }
281        }
282
283        Tensor::from_slice(&result, &[n_samples, n_components]).unwrap()
284    }
285
286    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
287        self.fit(x);
288        self.transform(x)
289    }
290
291    pub fn inverse_transform(&self, x: &Tensor) -> Tensor {
292        let x_data = x.data_f32();
293        let n_samples = x.dims()[0];
294        let n_components = x.dims()[1];
295
296        let components = self.components_.as_ref().expect("Model not fitted");
297        let mean = self.mean_.as_ref().unwrap();
298        let n_features = self.n_features_;
299
300        let mut result = vec![0.0f32; n_samples * n_features];
301
302        for i in 0..n_samples {
303            for j in 0..n_features {
304                result[i * n_features + j] = mean[j];
305                for k in 0..n_components {
306                    let mut val = x_data[i * n_components + k];
307                    
308                    if self.whiten {
309                        if let Some(ref sv) = self.singular_values_ {
310                            val *= sv[k] / (self.n_samples_seen_ as f32 - 1.0).sqrt().max(1e-10);
311                        }
312                    }
313                    
314                    result[i * n_features + j] += val * components[k][j];
315                }
316            }
317        }
318
319        Tensor::from_slice(&result, &[n_samples, n_features]).unwrap()
320    }
321
322    pub fn explained_variance_ratio(&self) -> Option<&Vec<f32>> {
323        self.explained_variance_ratio_.as_ref()
324    }
325
326    pub fn n_samples_seen(&self) -> usize {
327        self.n_samples_seen_
328    }
329}
330
/// Scale `v` in place to unit Euclidean length; vectors with near-zero
/// norm (<= 1e-10) are left untouched to avoid dividing by ~0.
fn normalize(v: &mut [f32]) {
    let magnitude = v.iter().fold(0.0f32, |acc, &x| acc + x * x).sqrt();
    if magnitude > 1e-10 {
        v.iter_mut().for_each(|x| *x /= magnitude);
    }
}
337
338/// Mini-Batch Sparse PCA
/// Mini-Batch Sparse PCA
pub struct MiniBatchSparsePCA {
    /// Number of sparse components to extract.
    pub n_components: usize,
    /// L1 sparsity strength used in the soft-thresholding step.
    pub alpha: f32,
    /// Rows per mini-batch during `fit`.
    pub batch_size: usize,
    /// Number of passes (epochs) over the data.
    pub max_iter: usize,
    // Learned sparse components; one row of length n_features each.
    components_: Option<Vec<Vec<f32>>>,
    // Per-feature mean computed by `fit`, used to center data.
    mean_: Option<Vec<f32>>,
    // Number of epochs actually run by the last `fit`.
    n_iter_: usize,
}
348
349impl MiniBatchSparsePCA {
350    pub fn new(n_components: usize) -> Self {
351        MiniBatchSparsePCA {
352            n_components,
353            alpha: 1.0,
354            batch_size: 100,
355            max_iter: 100,
356            components_: None,
357            mean_: None,
358            n_iter_: 0,
359        }
360    }
361
362    pub fn alpha(mut self, a: f32) -> Self { self.alpha = a; self }
363    pub fn batch_size(mut self, b: usize) -> Self { self.batch_size = b; self }
364
365    pub fn fit(&mut self, x: &Tensor) {
366        use rand::prelude::*;
367        
368        let x_data = x.data_f32();
369        let n_samples = x.dims()[0];
370        let n_features = x.dims()[1];
371
372        // Compute mean
373        let mean: Vec<f32> = (0..n_features)
374            .map(|j| (0..n_samples).map(|i| x_data[i * n_features + j]).sum::<f32>() / n_samples as f32)
375            .collect();
376
377        // Initialize components randomly
378        let mut rng = thread_rng();
379        let mut components: Vec<Vec<f32>> = (0..self.n_components)
380            .map(|_| {
381                let mut v: Vec<f32> = (0..n_features).map(|_| rng.gen::<f32>() - 0.5).collect();
382                normalize(&mut v);
383                v
384            })
385            .collect();
386
387        // Mini-batch optimization
388        let mut indices: Vec<usize> = (0..n_samples).collect();
389
390        for iter in 0..self.max_iter {
391            indices.shuffle(&mut rng);
392
393            for batch_start in (0..n_samples).step_by(self.batch_size) {
394                let batch_end = (batch_start + self.batch_size).min(n_samples);
395                let batch_indices = &indices[batch_start..batch_end];
396
397                // Extract and center batch
398                let batch: Vec<f32> = batch_indices.iter()
399                    .flat_map(|&i| (0..n_features).map(|j| x_data[i * n_features + j] - mean[j]).collect::<Vec<_>>())
400                    .collect();
401                let batch_size = batch_indices.len();
402
403                // Update each component
404                for k in 0..self.n_components {
405                    // Compute gradient
406                    let mut grad = vec![0.0f32; n_features];
407                    
408                    for i in 0..batch_size {
409                        // Project sample onto component
410                        let mut proj = 0.0f32;
411                        for j in 0..n_features {
412                            proj += batch[i * n_features + j] * components[k][j];
413                        }
414
415                        // Gradient of reconstruction error
416                        for j in 0..n_features {
417                            grad[j] += proj * batch[i * n_features + j];
418                        }
419                    }
420
421                    // Update with soft thresholding (L1)
422                    let lr = 0.01 / (iter + 1) as f32;
423                    for j in 0..n_features {
424                        let g = grad[j] / batch_size as f32;
425                        components[k][j] += lr * g;
426                        
427                        // Soft thresholding
428                        let sign = components[k][j].signum();
429                        components[k][j] = (components[k][j].abs() - lr * self.alpha).max(0.0) * sign;
430                    }
431
432                    normalize(&mut components[k]);
433                }
434            }
435
436            self.n_iter_ = iter + 1;
437        }
438
439        self.components_ = Some(components);
440        self.mean_ = Some(mean);
441    }
442
443    pub fn transform(&self, x: &Tensor) -> Tensor {
444        let x_data = x.data_f32();
445        let n_samples = x.dims()[0];
446        let n_features = x.dims()[1];
447
448        let components = self.components_.as_ref().expect("Model not fitted");
449        let mean = self.mean_.as_ref().unwrap();
450
451        let mut result = vec![0.0f32; n_samples * self.n_components];
452
453        for i in 0..n_samples {
454            for k in 0..self.n_components {
455                for j in 0..n_features {
456                    result[i * self.n_components + k] += 
457                        (x_data[i * n_features + j] - mean[j]) * components[k][j];
458                }
459            }
460        }
461
462        Tensor::from_slice(&result, &[n_samples, self.n_components]).unwrap()
463    }
464
465    pub fn fit_transform(&mut self, x: &Tensor) -> Tensor {
466        self.fit(x);
467        self.transform(x)
468    }
469}
470
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: batched fit_transform yields the expected output shape
    // (4 samples projected onto 2 components).
    #[test]
    fn test_incremental_pca() {
        let x = Tensor::from_slice(&[1.0f32, 2.0, 3.0,
            4.0, 5.0, 6.0,
            7.0, 8.0, 9.0,
            10.0, 11.0, 12.0,
        ], &[4, 3]).unwrap();

        let mut ipca = IncrementalPCA::new(2).batch_size(2);
        let result = ipca.fit_transform(&x);
        assert_eq!(result.dims(), &[4, 2]);
    }

    // partial_fit must accumulate the sample count across batches.
    #[test]
    fn test_incremental_pca_partial_fit() {
        let mut ipca = IncrementalPCA::new(2);

        let batch1 = Tensor::from_slice(&[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0], &[2, 3]).unwrap();
        let batch2 = Tensor::from_slice(&[7.0f32, 8.0, 9.0, 10.0, 11.0, 12.0], &[2, 3]).unwrap();

        ipca.partial_fit(&batch1);
        ipca.partial_fit(&batch2);

        assert_eq!(ipca.n_samples_seen(), 4);
    }
}
501
502