sklears_semi_supervised/deep_learning/ladder_networks.rs

//! Ladder Networks for Deep Semi-Supervised Learning
//!
//! This module implements Ladder Networks, a deep learning architecture that
//! combines supervised learning with unsupervised learning through lateral
//! connections and denoising objectives. Ladder networks (Rasmus et al., 2015)
//! achieve strong performance on semi-supervised learning tasks by learning
//! hierarchical representations at multiple levels.
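//!
//! As implemented in this module, training minimizes a combined objective: a
//! supervised cross-entropy term on the labeled samples plus a weighted sum of
//! per-layer reconstruction (denoising) costs (see `compute_total_cost`),
//!
//! `C = lambda_supervised * CE(y, y_hat) + lambda_unsupervised * sum_l w_l * MSE(z_l, z_hat_l)`
//!
//! where the `w_l` are the configurable `denoising_cost_weights`.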

use scirs2_core::ndarray_ext::{Array1, Array2, ArrayView1, ArrayView2, Axis};
use scirs2_core::random::Random;
use sklears_core::{
    error::{Result as SklResult, SklearsError},
    traits::{Estimator, Fit, Predict, PredictProba, Untrained},
    types::Float,
};

/// Ladder Networks for Deep Semi-Supervised Learning
///
/// Ladder Networks are neural networks that combine supervised and unsupervised
/// learning objectives. They use lateral connections between encoder and decoder
/// paths to enable effective learning from both labeled and unlabeled data.
///
/// The architecture consists of:
/// - An encoder path that applies noise and nonlinearities
/// - A decoder path that reconstructs clean representations
/// - Lateral connections from the encoder that guide the decoder's reconstruction
/// - Multiple reconstruction costs at different layers
///
/// # Parameters
///
/// * `layer_sizes` - Sizes of all layers, from input to output
/// * `noise_std` - Scale (nominal standard deviation) of the noise added to each layer
/// * `lambda_unsupervised` - Weight for the unsupervised reconstruction loss
/// * `lambda_supervised` - Weight for the supervised classification loss
/// * `denoising_cost_weights` - Weights for the denoising costs at each layer (the first entry applies to the input layer)
/// * `learning_rate` - Learning rate for optimization
/// * `max_iter` - Maximum number of training iterations
/// * `batch_size` - Size of mini-batches for training
///
/// # Examples
///
/// ```rust,ignore
/// use sklears_semi_supervised::LadderNetworks;
/// use sklears_core::traits::{Predict, Fit};
/// use scirs2_core::array;
///
/// let X = array![[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]];
/// let y = array![0, 1, -1, -1]; // -1 indicates unlabeled
///
/// let ln = LadderNetworks::new()
///     .layer_sizes(vec![2, 4, 2])
///     .noise_std(0.3)
///     .lambda_unsupervised(1.0)
///     .lambda_supervised(1.0);
/// let fitted = ln.fit(&X.view(), &y.view()).unwrap();
/// let predictions = fitted.predict(&X.view()).unwrap();
/// ```
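///
/// # Notes
///
/// This is a simplified variant of the ladder network: gradients are estimated
/// with central finite differences rather than backpropagation (see
/// `compute_gradients`), and the decoder performs a plain top-down
/// reconstruction without per-layer lateral combinator functions. It is
/// intended for small demonstration problems rather than large datasets.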
#[derive(Debug, Clone)]
pub struct LadderNetworks<S = Untrained> {
    state: S,
    layer_sizes: Vec<usize>,
    noise_std: f64,
    lambda_unsupervised: f64,
    lambda_supervised: f64,
    denoising_cost_weights: Vec<f64>,
    learning_rate: f64,
    max_iter: usize,
    batch_size: usize,
    beta1: f64,
    beta2: f64,
    epsilon: f64,
    random_state: Option<u64>,
}

impl LadderNetworks<Untrained> {
    /// Create a new LadderNetworks instance
    pub fn new() -> Self {
        Self {
            state: Untrained,
            layer_sizes: vec![10, 6, 4, 2], // Default architecture
            noise_std: 0.3,
            lambda_unsupervised: 1.0,
            lambda_supervised: 1.0,
            denoising_cost_weights: vec![1000.0, 10.0, 0.1, 0.1],
            learning_rate: 0.002,
            max_iter: 100,
            batch_size: 32,
            beta1: 0.9,
            beta2: 0.999,
            epsilon: 1e-8,
            random_state: None,
        }
    }

    /// Set the layer sizes (the input and output sizes are adjusted automatically during `fit`)
    pub fn layer_sizes(mut self, sizes: Vec<usize>) -> Self {
        self.layer_sizes = sizes;
        self
    }

    /// Set the noise standard deviation
    pub fn noise_std(mut self, std: f64) -> Self {
        self.noise_std = std;
        self
    }

    /// Set the unsupervised loss weight
    pub fn lambda_unsupervised(mut self, lambda: f64) -> Self {
        self.lambda_unsupervised = lambda;
        self
    }

    /// Set the supervised loss weight
    pub fn lambda_supervised(mut self, lambda: f64) -> Self {
        self.lambda_supervised = lambda;
        self
    }

    /// Set the denoising cost weights for each layer
    pub fn denoising_cost_weights(mut self, weights: Vec<f64>) -> Self {
        self.denoising_cost_weights = weights;
        self
    }

    /// Set the learning rate
    pub fn learning_rate(mut self, lr: f64) -> Self {
        self.learning_rate = lr;
        self
    }

    /// Set the maximum number of iterations
    pub fn max_iter(mut self, max_iter: usize) -> Self {
        self.max_iter = max_iter;
        self
    }

    /// Set the batch size
    pub fn batch_size(mut self, batch_size: usize) -> Self {
        self.batch_size = batch_size;
        self
    }

    /// Set Adam optimizer beta1 parameter
    pub fn beta1(mut self, beta1: f64) -> Self {
        self.beta1 = beta1;
        self
    }

    /// Set Adam optimizer beta2 parameter
    pub fn beta2(mut self, beta2: f64) -> Self {
        self.beta2 = beta2;
        self
    }

    /// Set the random state for reproducibility (stored but not yet applied to
    /// the simplified noise generator used in this implementation)
    pub fn random_state(mut self, seed: u64) -> Self {
        self.random_state = Some(seed);
        self
    }

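    /// Initialize encoder and decoder parameters with a Xavier-style scheme:
    /// each weight is drawn uniformly from (-sigma, sigma) with
    /// sigma = sqrt(2 / (fan_in + fan_out)); all biases start at zero.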
    fn initialize_weights(&self, input_size: usize) -> LadderWeights {
        let mut layer_sizes = self.layer_sizes.clone();
        layer_sizes[0] = input_size; // Set input size

        let n_layers = layer_sizes.len();
        let mut encoder_weights = Vec::with_capacity(n_layers - 1);
        let mut encoder_biases = Vec::with_capacity(n_layers - 1);
        let mut decoder_weights = Vec::with_capacity(n_layers - 1);
        let mut decoder_biases = Vec::with_capacity(n_layers - 1);

        // Xavier initialization
        let mut rng = Random::default();
        for layer in 0..(n_layers - 1) {
            let fan_in = layer_sizes[layer];
            let fan_out = layer_sizes[layer + 1];
            let xavier_std = (2.0 / (fan_in + fan_out) as f64).sqrt();

            // Encoder weights (bottom-up)
            let mut w_enc = Array2::zeros((fan_in, fan_out));
            for row in 0..fan_in {
                for col in 0..fan_out {
                    w_enc[[row, col]] = rng.random_range(-3.0..3.0) / 3.0 * xavier_std;
                }
            }
            let b_enc = Array1::zeros(fan_out);
            encoder_weights.push(w_enc);
            encoder_biases.push(b_enc);

            // Decoder weights (top-down)
            let mut w_dec = Array2::zeros((fan_out, fan_in));
            for row in 0..fan_out {
                for col in 0..fan_in {
                    w_dec[[row, col]] = rng.random_range(-3.0..3.0) / 3.0 * xavier_std;
                }
            }
            let b_dec = Array1::zeros(fan_in);
            decoder_weights.push(w_dec);
            decoder_biases.push(b_dec);
        }

        LadderWeights {
            encoder_weights,
            encoder_biases,
            decoder_weights,
            decoder_biases,
            layer_sizes,
        }
    }

    fn add_noise(&self, x: &Array2<f64>) -> Array2<f64> {
        let mut rng = Random::default();
        let mut noise = Array2::zeros(x.dim());
        for i in 0..x.nrows() {
            for j in 0..x.ncols() {
                // Uniform noise in [-noise_std, noise_std); a simple stand-in
                // for the Gaussian corruption used in the original ladder network.
                noise[[i, j]] = rng.random_range(-3.0..3.0) / 3.0 * self.noise_std;
            }
        }
        x + &noise
    }

    fn relu(&self, x: &Array2<f64>) -> Array2<f64> {
        x.mapv(|v| v.max(0.0))
    }

    fn relu_derivative(&self, x: &Array2<f64>) -> Array2<f64> {
        x.mapv(|v| if v > 0.0 { 1.0 } else { 0.0 })
    }

    fn softmax(&self, x: &Array2<f64>) -> Array2<f64> {
        let mut result = Array2::zeros(x.dim());
        for (i, row) in x.axis_iter(Axis(0)).enumerate() {
            let max_val = row.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
            let exp_row: Array1<f64> = row.mapv(|v| (v - max_val).exp());
            let sum_exp: f64 = exp_row.sum();
            let softmax_row = exp_row / sum_exp;
            result.row_mut(i).assign(&softmax_row);
        }
        result
    }

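    /// Bottom-up (encoder) pass. Returns the clean and noise-corrupted
    /// activations for every layer (the input is stored at index 0), plus the
    /// pre-activation values for each weighted layer.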
    fn forward_encoder(
        &self,
        x: &Array2<f64>,
        weights: &LadderWeights,
        add_noise: bool,
    ) -> EncoderOutput {
        let mut activations = Vec::new();
        let mut noisy_activations = Vec::new();
        let mut pre_activations = Vec::new();

        let mut current = x.clone();
        activations.push(current.clone());

        if add_noise {
            current = self.add_noise(&current);
        }
        noisy_activations.push(current.clone());

        for i in 0..weights.encoder_weights.len() {
            // Linear transformation
            let z = current.dot(&weights.encoder_weights[i]) + &weights.encoder_biases[i];
            pre_activations.push(z.clone());

            // Apply activation function (ReLU for hidden layers, linear for output)
            current = if i == weights.encoder_weights.len() - 1 {
                z.clone() // Linear for output layer
            } else {
                self.relu(&z)
            };

            activations.push(current.clone());

            // Add noise to hidden layers
            if add_noise && i < weights.encoder_weights.len() - 1 {
                current = self.add_noise(&current);
            }
            noisy_activations.push(current.clone());
        }

        EncoderOutput {
            activations,
            noisy_activations,
            pre_activations,
        }
    }

    fn forward_decoder(
        &self,
        top_activation: &Array2<f64>,
        weights: &LadderWeights,
    ) -> Vec<Array2<f64>> {
        // Simplified top-down pass: plain reconstruction without the per-layer
        // lateral (skip) connections and denoising combinators of the full
        // ladder architecture.
        let mut decoder_outputs = Vec::new();
        let mut current = top_activation.clone();

        // Start from the top layer and work down
        for i in (0..weights.decoder_weights.len()).rev() {
            current = current.dot(&weights.decoder_weights[i]) + &weights.decoder_biases[i];

            // Apply activation function for hidden layers
            if i > 0 {
                current = self.relu(&current);
            }

            decoder_outputs.push(current.clone());
        }

        decoder_outputs.reverse(); // Reverse to match layer order
        decoder_outputs
    }

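    /// Weighted sum of per-layer mean-squared reconstruction errors,
    /// `sum_l denoising_cost_weights[l] * mean((clean_l - reconstructed_l)^2)`,
    /// where layer index 0 corresponds to the input layer.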
    fn compute_denoising_cost(
        &self,
        clean_activations: &[Array2<f64>],
        reconstructed_activations: &[Array2<f64>],
    ) -> f64 {
        let mut total_cost = 0.0;

        for (layer_idx, (clean, reconstructed)) in clean_activations
            .iter()
            .zip(reconstructed_activations.iter())
            .enumerate()
        {
            if layer_idx < self.denoising_cost_weights.len() {
                let diff = clean - reconstructed;
                let mse = diff.mapv(|x| x * x).mean().unwrap();
                total_cost += self.denoising_cost_weights[layer_idx] * mse;
            }
        }

        total_cost
    }

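    /// Cross-entropy classification loss averaged over all samples:
    /// `-(1/N) * sum_i sum_c targets[i, c] * ln(predictions[i, c])`.
    /// Rows for unlabeled samples are all zeros and therefore contribute nothing.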
    fn compute_supervised_cost(&self, predictions: &Array2<f64>, targets: &Array2<f64>) -> f64 {
        // Cross-entropy loss
        let epsilon = 1e-15;
        let clipped_predictions = predictions.mapv(|x| x.max(epsilon).min(1.0 - epsilon));
        let log_predictions = clipped_predictions.mapv(|x| x.ln());

        let mut cost = 0.0;
        for i in 0..targets.nrows() {
            for j in 0..targets.ncols() {
                cost -= targets[[i, j]] * log_predictions[[i, j]];
            }
        }

        cost / targets.nrows() as f64
    }

    fn create_target_matrix(&self, y: &Array1<i32>, classes: &[i32]) -> Array2<f64> {
        let n_samples = y.len();
        let n_classes = classes.len();
        let mut targets = Array2::zeros((n_samples, n_classes));

        for (i, &label) in y.iter().enumerate() {
            if label != -1 {
                if let Some(class_idx) = classes.iter().position(|&c| c == label) {
                    targets[[i, class_idx]] = 1.0;
                }
            }
        }

        targets
    }

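    /// One Adam step over all encoder and decoder parameters:
    /// `m <- beta1 * m + (1 - beta1) * g`, `v <- beta2 * v + (1 - beta2) * g^2`,
    /// then `theta <- theta - learning_rate * m_hat / (sqrt(v_hat) + epsilon)`
    /// with bias-corrected `m_hat = m / (1 - beta1^t)` and `v_hat = v / (1 - beta2^t)`.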
    fn update_weights_adam(
        &self,
        weights: &mut LadderWeights,
        gradients: &LadderWeights,
        momentum: &mut LadderWeights,
        velocity: &mut LadderWeights,
        iteration: usize,
    ) {
        let beta1_t = self.beta1.powi(iteration as i32);
        let beta2_t = self.beta2.powi(iteration as i32);
        // Bias-corrected moment estimates are formed below, so the raw learning
        // rate is applied directly (correcting twice would distort the step size).
        let lr = self.learning_rate;

        // Update encoder weights
        for i in 0..weights.encoder_weights.len() {
            // Update momentum and velocity
            momentum.encoder_weights[i] = self.beta1 * &momentum.encoder_weights[i]
                + (1.0 - self.beta1) * &gradients.encoder_weights[i];
            velocity.encoder_weights[i] = self.beta2 * &velocity.encoder_weights[i]
                + (1.0 - self.beta2) * gradients.encoder_weights[i].mapv(|x| x * x);

            // Update weights
            let momentum_corrected = &momentum.encoder_weights[i] / (1.0 - beta1_t);
            let velocity_corrected = &velocity.encoder_weights[i] / (1.0 - beta2_t);
            let update = momentum_corrected / velocity_corrected.mapv(|x| x.sqrt() + self.epsilon);
            weights.encoder_weights[i] = &weights.encoder_weights[i] - lr * update;

            // Update biases
            momentum.encoder_biases[i] = self.beta1 * &momentum.encoder_biases[i]
                + (1.0 - self.beta1) * &gradients.encoder_biases[i];
            velocity.encoder_biases[i] = self.beta2 * &velocity.encoder_biases[i]
                + (1.0 - self.beta2) * gradients.encoder_biases[i].mapv(|x| x * x);

            let momentum_corrected_b = &momentum.encoder_biases[i] / (1.0 - beta1_t);
            let velocity_corrected_b = &velocity.encoder_biases[i] / (1.0 - beta2_t);
            let update_b =
                momentum_corrected_b / velocity_corrected_b.mapv(|x| x.sqrt() + self.epsilon);
            weights.encoder_biases[i] = &weights.encoder_biases[i] - lr * update_b;
        }

        // Update decoder weights (similar process)
        for i in 0..weights.decoder_weights.len() {
            momentum.decoder_weights[i] = self.beta1 * &momentum.decoder_weights[i]
                + (1.0 - self.beta1) * &gradients.decoder_weights[i];
            velocity.decoder_weights[i] = self.beta2 * &velocity.decoder_weights[i]
                + (1.0 - self.beta2) * gradients.decoder_weights[i].mapv(|x| x * x);

            let momentum_corrected = &momentum.decoder_weights[i] / (1.0 - beta1_t);
            let velocity_corrected = &velocity.decoder_weights[i] / (1.0 - beta2_t);
            let update = momentum_corrected / velocity_corrected.mapv(|x| x.sqrt() + self.epsilon);
            weights.decoder_weights[i] = &weights.decoder_weights[i] - lr * update;

            momentum.decoder_biases[i] = self.beta1 * &momentum.decoder_biases[i]
                + (1.0 - self.beta1) * &gradients.decoder_biases[i];
            velocity.decoder_biases[i] = self.beta2 * &velocity.decoder_biases[i]
                + (1.0 - self.beta2) * gradients.decoder_biases[i].mapv(|x| x * x);

            let momentum_corrected_b = &momentum.decoder_biases[i] / (1.0 - beta1_t);
            let velocity_corrected_b = &velocity.decoder_biases[i] / (1.0 - beta2_t);
            let update_b =
                momentum_corrected_b / velocity_corrected_b.mapv(|x| x.sqrt() + self.epsilon);
            weights.decoder_biases[i] = &weights.decoder_biases[i] - lr * update_b;
        }
    }

    fn compute_gradients(
        &self,
        x: &Array2<f64>,
        targets: &Array2<f64>,
        weights: &LadderWeights,
    ) -> SklResult<LadderWeights> {
        // Initialize gradients
        let mut gradient_weights = LadderWeights {
            encoder_weights: weights
                .encoder_weights
                .iter()
                .map(|w| Array2::zeros(w.dim()))
                .collect(),
            encoder_biases: weights
                .encoder_biases
                .iter()
                .map(|b| Array1::zeros(b.len()))
                .collect(),
            decoder_weights: weights
                .decoder_weights
                .iter()
                .map(|w| Array2::zeros(w.dim()))
                .collect(),
            decoder_biases: weights
                .decoder_biases
                .iter()
                .map(|b| Array1::zeros(b.len()))
                .collect(),
            layer_sizes: weights.layer_sizes.clone(),
        };

        // Simplified gradient computation (in practice, this would use automatic
        // differentiation). Gradients are approximated with central finite
        // differences; only the encoder weight matrices are perturbed, so the
        // encoder biases and decoder parameters keep zero gradients.
        let delta = 1e-5;

        // Compute gradients for encoder weights
        for i in 0..weights.encoder_weights.len() {
            for j in 0..weights.encoder_weights[i].nrows() {
                for k in 0..weights.encoder_weights[i].ncols() {
                    // Perturb weight
                    let mut weights_plus = weights.clone();
                    let mut weights_minus = weights.clone();
                    weights_plus.encoder_weights[i][[j, k]] += delta;
                    weights_minus.encoder_weights[i][[j, k]] -= delta;

                    // Compute costs
                    let cost_plus = self.compute_total_cost(x, targets, &weights_plus)?;
                    let cost_minus = self.compute_total_cost(x, targets, &weights_minus)?;

                    // Approximate gradient (central difference)
                    gradient_weights.encoder_weights[i][[j, k]] =
                        (cost_plus - cost_minus) / (2.0 * delta);
                }
            }
        }

        Ok(gradient_weights)
    }

    fn compute_total_cost(
        &self,
        x: &Array2<f64>,
        targets: &Array2<f64>,
        weights: &LadderWeights,
    ) -> SklResult<f64> {
        // Forward pass
        let clean_encoder_output = self.forward_encoder(x, weights, false);
        let noisy_encoder_output = self.forward_encoder(x, weights, true);
        let decoder_outputs =
            self.forward_decoder(noisy_encoder_output.activations.last().unwrap(), weights);

        // Supervised cost
        let predictions = self.softmax(noisy_encoder_output.activations.last().unwrap());
        let supervised_cost = self.compute_supervised_cost(&predictions, targets);

        // Unsupervised denoising cost
        let denoising_cost =
            self.compute_denoising_cost(&clean_encoder_output.activations, &decoder_outputs);

        let total_cost =
            self.lambda_supervised * supervised_cost + self.lambda_unsupervised * denoising_cost;
        Ok(total_cost)
    }
}

#[derive(Debug, Clone)]
pub struct LadderWeights {
    /// Encoder (bottom-up) weight matrices, one per layer transition
    pub encoder_weights: Vec<Array2<f64>>,
    /// Encoder bias vectors
    pub encoder_biases: Vec<Array1<f64>>,
    /// Decoder (top-down) weight matrices, one per layer transition
    pub decoder_weights: Vec<Array2<f64>>,
    /// Decoder bias vectors
    pub decoder_biases: Vec<Array1<f64>>,
    /// Layer sizes, from input to output
    pub layer_sizes: Vec<usize>,
}

#[derive(Debug)]
pub struct EncoderOutput {
    /// Clean activations for each layer (input at index 0)
    pub activations: Vec<Array2<f64>>,
    /// Noise-corrupted activations for each layer
    pub noisy_activations: Vec<Array2<f64>>,
    /// Pre-activation values for each weighted layer
    pub pre_activations: Vec<Array2<f64>>,
}

impl Default for LadderNetworks<Untrained> {
    fn default() -> Self {
        Self::new()
    }
}

impl Estimator for LadderNetworks<Untrained> {
    type Config = ();
    type Error = SklearsError;
    type Float = Float;

    fn config(&self) -> &Self::Config {
        &()
    }
}

impl Fit<ArrayView2<'_, Float>, ArrayView1<'_, i32>> for LadderNetworks<Untrained> {
    type Fitted = LadderNetworks<LadderNetworksTrained>;

    #[allow(non_snake_case)]
    fn fit(self, X: &ArrayView2<'_, Float>, y: &ArrayView1<'_, i32>) -> SklResult<Self::Fitted> {
        let X = X.to_owned();
        let y = y.to_owned();
        let (_n_samples, n_features) = X.dim();

        // Identify classes
        let mut classes = std::collections::HashSet::new();
        for &label in y.iter() {
            if label != -1 {
                classes.insert(label);
            }
        }

        if classes.is_empty() {
            return Err(SklearsError::InvalidInput(
                "No labeled samples provided".to_string(),
            ));
        }

        // Sort for a deterministic class order
        let mut classes: Vec<i32> = classes.into_iter().collect();
        classes.sort_unstable();
        let n_classes = classes.len();

        // Adjust the architecture before initializing weights: the input layer
        // must match the number of features and the output layer the number of classes
        let mut this = self;
        this.layer_sizes[0] = n_features;
        let last_idx = this.layer_sizes.len() - 1;
        this.layer_sizes[last_idx] = n_classes;

        // Initialize weights
        let mut weights = this.initialize_weights(n_features);

        // Initialize Adam optimizer state
        let mut momentum = weights.clone();
        let mut velocity = weights.clone();

        // Zero initialize momentum and velocity
        for i in 0..momentum.encoder_weights.len() {
            momentum.encoder_weights[i].fill(0.0);
            momentum.encoder_biases[i].fill(0.0);
            velocity.encoder_weights[i].fill(0.0);
            velocity.encoder_biases[i].fill(0.0);
        }
        for i in 0..momentum.decoder_weights.len() {
            momentum.decoder_weights[i].fill(0.0);
            momentum.decoder_biases[i].fill(0.0);
            velocity.decoder_weights[i].fill(0.0);
            velocity.decoder_biases[i].fill(0.0);
        }

        // Create target matrix
        let targets = this.create_target_matrix(&y, &classes);

        // Training loop (simplified for demonstration)
        for iteration in 1..=this.max_iter {
            // For simplicity, we train on the entire dataset;
            // in practice, you'd use mini-batches.

            let total_cost = this.compute_total_cost(&X, &targets, &weights)?;

            if iteration % 10 == 0 {
                println!("Iteration {}: Total cost = {:.6}", iteration, total_cost);
            }

            // Compute gradients (simplified - in practice use automatic differentiation)
            let gradients = this.compute_gradients(&X, &targets, &weights)?;

            // Update weights using Adam
            this.update_weights_adam(
                &mut weights,
                &gradients,
                &mut momentum,
                &mut velocity,
                iteration,
            );
        }

        // Final forward pass to get label distributions
        let final_encoder_output = this.forward_encoder(&X, &weights, false);
        let final_predictions = this.softmax(final_encoder_output.activations.last().unwrap());

        Ok(LadderNetworks {
            state: LadderNetworksTrained {
                X_train: X,
                y_train: y,
                classes: Array1::from(classes),
                weights,
                label_distributions: final_predictions,
            },
            layer_sizes: this.layer_sizes,
            noise_std: this.noise_std,
            lambda_unsupervised: this.lambda_unsupervised,
            lambda_supervised: this.lambda_supervised,
            denoising_cost_weights: this.denoising_cost_weights,
            learning_rate: this.learning_rate,
            max_iter: this.max_iter,
            batch_size: this.batch_size,
            beta1: this.beta1,
            beta2: this.beta2,
            epsilon: this.epsilon,
            random_state: this.random_state,
        })
    }
}

impl LadderNetworks<LadderNetworksTrained> {
    fn forward_encoder(
        &self,
        x: &Array2<f64>,
        weights: &LadderWeights,
        add_noise: bool,
    ) -> EncoderOutput {
        let mut activations = Vec::new();
        let mut noisy_activations = Vec::new();
        let mut pre_activations = Vec::new();

        let mut current = x.clone();
        activations.push(current.clone());

        if add_noise {
            current = self.add_noise(&current);
        }
        noisy_activations.push(current.clone());

        for i in 0..weights.encoder_weights.len() {
            // Linear transformation
            let z = current.dot(&weights.encoder_weights[i]) + &weights.encoder_biases[i];
            pre_activations.push(z.clone());

            // Apply activation function (ReLU for hidden layers, linear for output)
            current = if i == weights.encoder_weights.len() - 1 {
                z.clone() // Linear for output layer
            } else {
                self.relu(&z)
            };

            activations.push(current.clone());

            // Add noise to hidden layers
            if add_noise && i < weights.encoder_weights.len() - 1 {
                current = self.add_noise(&current);
            }
            noisy_activations.push(current.clone());
        }

        EncoderOutput {
            activations,
            noisy_activations,
            pre_activations,
        }
    }

    fn add_noise(&self, x: &Array2<f64>) -> Array2<f64> {
        let mut rng = Random::default();
        let mut noise = Array2::zeros(x.dim());
        for i in 0..x.nrows() {
            for j in 0..x.ncols() {
                noise[[i, j]] = rng.random_range(-3.0..3.0) / 3.0 * self.noise_std;
            }
        }
        x + &noise
    }

    fn relu(&self, x: &Array2<f64>) -> Array2<f64> {
        x.mapv(|v| v.max(0.0))
    }

    fn softmax(&self, x: &Array2<f64>) -> Array2<f64> {
        let mut result = Array2::zeros(x.dim());
        for (i, row) in x.axis_iter(Axis(0)).enumerate() {
            let max_val = row.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
            let exp_row: Array1<f64> = row.mapv(|v| (v - max_val).exp());
            let sum_exp: f64 = exp_row.sum();
            let softmax_row = exp_row / sum_exp;
            result.row_mut(i).assign(&softmax_row);
        }
        result
    }
}

impl Predict<ArrayView2<'_, Float>, Array1<i32>> for LadderNetworks<LadderNetworksTrained> {
    #[allow(non_snake_case)]
    fn predict(&self, X: &ArrayView2<'_, Float>) -> SklResult<Array1<i32>> {
        let X = X.to_owned();
        let encoder_output = self.forward_encoder(&X, &self.state.weights, false);
        let predictions = self.softmax(encoder_output.activations.last().unwrap());

        let mut result = Array1::zeros(X.nrows());
        for i in 0..X.nrows() {
            let max_idx = predictions
                .row(i)
                .iter()
                .enumerate()
                .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
                .unwrap()
                .0;
            result[i] = self.state.classes[max_idx];
        }

        Ok(result)
    }
}

impl PredictProba<ArrayView2<'_, Float>, Array2<f64>> for LadderNetworks<LadderNetworksTrained> {
    #[allow(non_snake_case)]
    fn predict_proba(&self, X: &ArrayView2<'_, Float>) -> SklResult<Array2<f64>> {
        let X = X.to_owned();
        let encoder_output = self.forward_encoder(&X, &self.state.weights, false);
        let predictions = self.softmax(encoder_output.activations.last().unwrap());
        Ok(predictions)
    }
}

/// Trained state for LadderNetworks
#[derive(Debug, Clone)]
pub struct LadderNetworksTrained {
    /// Training feature matrix
    pub X_train: Array2<f64>,
    /// Training labels (-1 marks unlabeled samples)
    pub y_train: Array1<i32>,
    /// Distinct class labels seen during fitting
    pub classes: Array1<i32>,
    /// Learned encoder and decoder weights
    pub weights: LadderWeights,
    /// Class probability estimates for the training samples
    pub label_distributions: Array2<f64>,
}

#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::array;

    #[test]
    #[allow(non_snake_case)]
    fn test_ladder_networks_basic() {
        let X = array![
            [1.0, 2.0, 0.5],
            [2.0, 3.0, 1.0],
            [3.0, 4.0, 1.5],
            [4.0, 5.0, 2.0],
            [5.0, 6.0, 2.5],
            [6.0, 7.0, 3.0]
        ];
        let y = array![0, 1, 0, 1, -1, -1]; // -1 indicates unlabeled

        let ln = LadderNetworks::new()
            .layer_sizes(vec![3, 4, 2])
            .noise_std(0.1)
            .lambda_unsupervised(0.5)
            .lambda_supervised(1.0)
            .max_iter(5); // Reduced for testing
        let fitted = ln.fit(&X.view(), &y.view()).unwrap();

        let predictions = fitted.predict(&X.view()).unwrap();
        assert_eq!(predictions.len(), 6);

        let probas = fitted.predict_proba(&X.view()).unwrap();
        assert_eq!(probas.dim(), (6, 2));

        // Check that probabilities sum to approximately 1
        for i in 0..6 {
            let sum: f64 = probas.row(i).sum();
            assert!((sum - 1.0).abs() < 0.1); // Allow some numerical error
        }
    }

    #[test]
    fn test_ladder_networks_initialization() {
        let ln = LadderNetworks::new()
            .layer_sizes(vec![4, 6, 3])
            .noise_std(0.2);

        let weights = ln.initialize_weights(4);

        // Check dimensions
        assert_eq!(weights.layer_sizes, vec![4, 6, 3]);
        assert_eq!(weights.encoder_weights.len(), 2);
        assert_eq!(weights.encoder_weights[0].dim(), (4, 6));
        assert_eq!(weights.encoder_weights[1].dim(), (6, 3));

        assert_eq!(weights.decoder_weights.len(), 2);
        assert_eq!(weights.decoder_weights[0].dim(), (6, 4));
        assert_eq!(weights.decoder_weights[1].dim(), (3, 6));
    }

    #[test]
    fn test_ladder_networks_noise_addition() {
        let ln = LadderNetworks::new().noise_std(0.1);
        let x = array![[1.0, 2.0], [3.0, 4.0]];

        let noisy_x = ln.add_noise(&x);

        // Check dimensions are preserved
        assert_eq!(noisy_x.dim(), x.dim());

        // Check that noise was added (values should be different)
        let diff = (&noisy_x - &x).mapv(|v| v.abs()).sum();
        assert!(diff > 0.0);
    }

    #[test]
    fn test_ladder_networks_activations() {
        let ln = LadderNetworks::new();

        // Test ReLU
        let x = array![[-1.0, 0.0, 1.0, 2.0]];
        let relu_result = ln.relu(&x);
        let expected = array![[0.0, 0.0, 1.0, 2.0]];

        for i in 0..x.ncols() {
            assert!((relu_result[[0, i]] - expected[[0, i]]).abs() < 1e-10);
        }

        // Test ReLU derivative
        let relu_deriv = ln.relu_derivative(&x);
        let expected_deriv = array![[0.0, 0.0, 1.0, 1.0]];

        for i in 0..x.ncols() {
            assert!((relu_deriv[[0, i]] - expected_deriv[[0, i]]).abs() < 1e-10);
        }
    }

    #[test]
    fn test_ladder_networks_softmax() {
        let ln = LadderNetworks::new();
        let x = array![[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]];

        let softmax_result = ln.softmax(&x);

        // Check dimensions
        assert_eq!(softmax_result.dim(), x.dim());

        // Check that each row sums to 1
        for i in 0..x.nrows() {
            let sum: f64 = softmax_result.row(i).sum();
            assert!((sum - 1.0).abs() < 1e-10);
        }

        // Check that all values are positive
        for i in 0..x.nrows() {
            for j in 0..x.ncols() {
                assert!(softmax_result[[i, j]] > 0.0);
            }
        }
    }

    #[test]
    fn test_ladder_networks_forward_encoder() {
        let ln = LadderNetworks::new();
        let weights = ln.initialize_weights(2);
        let x = array![[1.0, 2.0], [3.0, 4.0]];

        let encoder_output = ln.forward_encoder(&x, &weights, false);

        // Check that we have activations for each layer
        assert_eq!(encoder_output.activations.len(), weights.layer_sizes.len());
        assert_eq!(
            encoder_output.noisy_activations.len(),
            weights.layer_sizes.len()
        );

        // Check input layer
        assert_eq!(encoder_output.activations[0].dim(), x.dim());
    }

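    // Hand-computed sanity checks for the private loss helpers; the expected
    // values follow directly from the formulas used in
    // `compute_supervised_cost` and `compute_denoising_cost`.
    #[test]
    fn test_ladder_networks_cost_helpers() {
        let ln = LadderNetworks::new().denoising_cost_weights(vec![2.0]);

        // Cross-entropy for a single labeled sample of class 0: -ln(0.8)
        let predictions = array![[0.8, 0.2]];
        let targets = array![[1.0, 0.0]];
        let ce = ln.compute_supervised_cost(&predictions, &targets);
        assert!((ce - (-(0.8f64).ln())).abs() < 1e-10);

        // Denoising cost for one layer with weight 2.0: MSE = 0.25, cost = 0.5
        let clean = vec![array![[1.0, 1.0]]];
        let reconstructed = vec![array![[0.5, 0.5]]];
        let cost = ln.compute_denoising_cost(&clean, &reconstructed);
        assert!((cost - 0.5).abs() < 1e-10);
    }
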
    #[test]
    fn test_ladder_networks_target_matrix() {
        let ln = LadderNetworks::new();
        let y = array![0, 1, -1, 0]; // -1 indicates unlabeled
        let classes = vec![0, 1];

        let targets = ln.create_target_matrix(&y, &classes);

        // Check dimensions
        assert_eq!(targets.dim(), (4, 2));

        // Check labeled samples
        assert_eq!(targets[[0, 0]], 1.0); // First sample, class 0
        assert_eq!(targets[[0, 1]], 0.0);
        assert_eq!(targets[[1, 0]], 0.0); // Second sample, class 1
        assert_eq!(targets[[1, 1]], 1.0);

        // Check unlabeled sample (should be all zeros)
        assert_eq!(targets[[2, 0]], 0.0);
        assert_eq!(targets[[2, 1]], 0.0);
    }
958}