aprender-core 0.29.3

Next-generation machine learning library in pure Rust
//! Model Interpretability and Explainability.
//!
//! This module provides tools for understanding model predictions through
//! feature attribution and explanation methods.
//!
//! # Methods
//!
//! - **SHAP (`SHapley` Additive exPlanations)**: Computes feature importance using
//!   Shapley values from cooperative game theory.
//! - **Permutation Importance**: Measures feature importance by shuffling features
//!   and measuring prediction change.
//! - **Feature Contributions**: Decomposes predictions into per-feature contributions.
//!
//! # Example
//!
//! ```ignore
//! use aprender::interpret::ShapExplainer;
//!
//! // Create an explainer from background data and a prediction function
//! let explainer = ShapExplainer::new(&background_data, |x| model.predict(x));
//!
//! // Explain a prediction
//! let shap_values = explainer.explain_with_model(&sample, |x| model.predict(x));
//!
//! // shap_values[i] = contribution of feature i to the prediction
//! ```
//!
//! # References
//!
//! - Lundberg, S. M., & Lee, S. I. (2017). A Unified Approach to Interpreting
//!   Model Predictions. `NeurIPS`.
//! - Ribeiro, M. T., et al. (2016). "Why Should I Trust You?": Explaining the
//!   Predictions of Any Classifier (LIME). KDD.

use crate::primitives::Vector;

/// Trait for model explainers.
///
/// Explainers compute feature attributions/importance scores for predictions.
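///
/// # Example
///
/// A minimal sketch of an implementor; `MeanExplainer` is a hypothetical type
/// used only for illustration, not part of the crate:
///
/// ```ignore
/// use aprender::interpret::Explainer;
/// use aprender::primitives::Vector;
///
/// struct MeanExplainer {
///     baseline: f32,
/// }
///
/// impl Explainer for MeanExplainer {
///     fn explain(&self, sample: &Vector<f32>) -> Vector<f32> {
///         // Attribute to each feature its deviation from the baseline.
///         let vals: Vec<f32> = sample
///             .as_slice()
///             .iter()
///             .map(|&v| v - self.baseline)
///             .collect();
///         Vector::from_slice(&vals)
///     }
///
///     fn expected_value(&self) -> f32 {
///         self.baseline
///     }
/// }
/// ```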
pub trait Explainer {
    /// Explain a single prediction.
    ///
    /// Returns a vector of feature contributions where:
    /// - Positive values indicate features that increase the prediction
    /// - Negative values indicate features that decrease the prediction
    /// - Sum of contributions + `expected_value` ≈ prediction
    ///
    /// # Arguments
    ///
    /// * `sample` - Input sample to explain
    ///
    /// # Returns
    ///
    /// Vector of attribution values (one per feature)
    fn explain(&self, sample: &Vector<f32>) -> Vector<f32>;

    /// Get the expected value (baseline prediction).
    ///
    /// This is the model's average prediction over the background dataset.
    fn expected_value(&self) -> f32;
}

/// SHAP (`SHapley` Additive exPlanations) values for feature attribution.
///
/// Computes exact or approximate Shapley values to explain model predictions.
/// Uses kernel-based approximation for efficiency.
///
/// # Properties
///
/// SHAP values satisfy three important properties:
/// 1. **Local accuracy**: `prediction = expected_value + sum(shap_values)`
/// 2. **Missingness**: Features not present have zero attribution
/// 3. **Consistency**: If a feature's contribution increases, its SHAP value won't decrease
///
/// # Example
///
/// ```ignore
/// use aprender::interpret::ShapExplainer;
///
/// let explainer = ShapExplainer::new(&background_data, |x| model.predict(x));
/// let shap_values = explainer.explain_with_model(&sample, |x| model.predict(x));
///
/// // Top contributing features
/// let mut importance: Vec<_> = shap_values.iter().enumerate().collect();
/// importance.sort_by(|a, b| b.1.abs().total_cmp(&a.1.abs()));
/// ```
#[derive(Debug)]
pub struct ShapExplainer {
    /// Background data for computing expected values
    background: Vec<Vector<f32>>,
    /// Expected prediction (baseline)
    expected_value: f32,
    /// Number of samples for Monte Carlo approximation
    n_samples: usize,
    /// Number of features
    n_features: usize,
}

impl ShapExplainer {
    /// Create a new SHAP explainer.
    ///
    /// # Arguments
    ///
    /// * `background` - Background dataset for computing expected values
    /// * `model_fn` - Function that makes predictions given input
    ///
    /// # Example
    ///
    /// ```ignore
    /// let background = vec![sample1, sample2, sample3];
    /// let explainer = ShapExplainer::new(&background, |x| model.predict(x));
    /// ```
    pub fn new<F>(background: &[Vector<f32>], model_fn: F) -> Self
    where
        F: Fn(&Vector<f32>) -> f32,
    {
        assert!(!background.is_empty(), "Background data cannot be empty");

        let n_features = background[0].len();

        // Compute expected value as mean prediction over background
        let expected_value: f32 =
            background.iter().map(&model_fn).sum::<f32>() / background.len() as f32;

        Self {
            background: background.to_vec(),
            expected_value,
            n_samples: 100, // Default number of samples
            n_features,
        }
    }

    /// Set the number of samples for Monte Carlo approximation.
    ///
    /// Higher values give more accurate SHAP values but take longer.
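    ///
    /// # Example
    ///
    /// A sketch of the builder call (`model` and `background` assumed in scope):
    ///
    /// ```ignore
    /// let explainer = ShapExplainer::new(&background, |x| model.predict(x))
    ///     .with_n_samples(500);
    /// ```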
    #[must_use]
    pub fn with_n_samples(mut self, n_samples: usize) -> Self {
        self.n_samples = n_samples;
        self
    }

    /// Compute approximate SHAP values via single-feature marginal contributions.
    ///
    /// Uses a simplified approach:
    /// 1. For each feature, swap the sample's value into each background sample
    ///    and measure the change in the model's prediction
    /// 2. Average these marginal contributions over the background dataset
    /// 3. Rescale so contributions sum to `prediction - expected_value`
    ///    (local accuracy)
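    ///
    /// # Example
    ///
    /// A sketch of a call (`model` and `sample` assumed in scope; the closure
    /// can be any `Fn(&Vector<f32>) -> f32`):
    ///
    /// ```ignore
    /// let shap_values = explainer.explain_with_model(&sample, |x| model.predict(x));
    ///
    /// // Local accuracy: baseline + contributions ≈ prediction
    /// let reconstructed = explainer.expected_value() + shap_values.sum();
    /// assert!((reconstructed - model.predict(&sample)).abs() < 1e-4);
    /// ```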
    pub fn explain_with_model<F>(&self, sample: &Vector<f32>, model_fn: F) -> Vector<f32>
    where
        F: Fn(&Vector<f32>) -> f32,
    {
        assert_eq!(
            sample.len(),
            self.n_features,
            "Sample must have {} features",
            self.n_features
        );

        let n = self.n_features;
        let mut shap_values = vec![0.0f32; n];

        // For each feature, estimate marginal contribution
        for feature_idx in 0..n {
            let mut contribution = 0.0f32;
            let mut count = 0;

            // Monte Carlo sampling over coalitions and background
            for bg_sample in &self.background {
                // Prediction with feature from sample
                let mut x_with = bg_sample.clone();
                x_with[feature_idx] = sample[feature_idx];

                // Prediction without feature (using background value)
                let x_without = bg_sample.clone();

                let pred_with = model_fn(&x_with);
                let pred_without = model_fn(&x_without);

                contribution += pred_with - pred_without;
                count += 1;
            }

            shap_values[feature_idx] = contribution / count.max(1) as f32;
        }

        // Normalize to satisfy local accuracy
        let sum_shap: f32 = shap_values.iter().sum();
        let prediction = model_fn(sample);
        let target_sum = prediction - self.expected_value;

        if sum_shap.abs() > 1e-8 {
            let scale = target_sum / sum_shap;
            for v in &mut shap_values {
                *v *= scale;
            }
        }

        Vector::from_slice(&shap_values)
    }

    /// Get the background dataset.
    #[must_use]
    pub fn background(&self) -> &[Vector<f32>] {
        &self.background
    }

    /// Get the expected value.
    #[must_use]
    pub fn expected_value(&self) -> f32 {
        self.expected_value
    }

    /// Get the number of features.
    #[must_use]
    pub fn n_features(&self) -> usize {
        self.n_features
    }
}

/// Permutation feature importance.
///
/// Measures feature importance by shuffling each feature and measuring
/// the decrease in model performance.
///
/// # Properties
///
/// - **Model-agnostic**: Works with any model
/// - **Fast**: `O(n_features * n_samples)` predictions
/// - **Simple interpretation**: Importance = performance drop when feature is shuffled
///
/// # Example
///
/// ```ignore
/// use aprender::interpret::PermutationImportance;
///
/// let importance = PermutationImportance::compute(
///     |x| model.predict(x),
///     &x_test,
///     &y_test,
///     |pred, true_val| (pred - true_val).powi(2), // MSE
/// );
///
/// // importance[i] = how much error increases when feature i is shuffled
/// ```
#[derive(Debug, Clone)]
pub struct PermutationImportance {
    /// Importance scores for each feature
    pub importance: Vector<f32>,
    /// Baseline score (without shuffling)
    pub baseline_score: f32,
}

impl PermutationImportance {
    /// Compute permutation importance.
    ///
    /// # Arguments
    ///
    /// * `predict_fn` - Function that predicts given input
    /// * `x` - Feature matrix (samples as vectors)
    /// * `y` - True labels
    /// * `score_fn` - Per-sample loss where lower is better (e.g., squared error)
    ///
    /// # Returns
    ///
    /// `PermutationImportance` with importance scores for each feature
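    ///
    /// # Example
    ///
    /// A sketch using squared error as the loss (`model`, `x_test`, and
    /// `y_test` assumed in scope):
    ///
    /// ```ignore
    /// let pi = PermutationImportance::compute(
    ///     |x| model.predict(x),
    ///     &x_test,
    ///     &y_test,
    ///     |pred, truth| (pred - truth).powi(2),
    /// );
    ///
    /// // Features whose shuffling hurts the most come first.
    /// for idx in pi.ranking().into_iter().take(3) {
    ///     println!("feature {idx}: loss +{:.4} when shuffled", pi.scores()[idx]);
    /// }
    /// ```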
    pub fn compute<P, S>(predict_fn: P, x: &[Vector<f32>], y: &[f32], score_fn: S) -> Self
    where
        P: Fn(&Vector<f32>) -> f32,
        S: Fn(f32, f32) -> f32,
    {
        assert!(!x.is_empty(), "Data cannot be empty");
        assert_eq!(x.len(), y.len(), "X and y must have same length");

        let n_samples = x.len();
        let n_features = x[0].len();

        // Compute baseline score
        let baseline_score: f32 = x
            .iter()
            .zip(y.iter())
            .map(|(xi, &yi)| score_fn(predict_fn(xi), yi))
            .sum::<f32>()
            / n_samples as f32;

        let mut importance = vec![0.0f32; n_features];

        // For each feature, shuffle and measure score drop
        for feature_idx in 0..n_features {
            let mut total_shuffled_score = 0.0f32;

            // Simple permutation: use a circular shift
            for (i, xi) in x.iter().enumerate() {
                // Create shuffled sample (use next sample's feature value)
                let mut xi_shuffled = xi.clone();
                let shuffled_idx = (i + 1) % n_samples;
                xi_shuffled[feature_idx] = x[shuffled_idx][feature_idx];

                let pred = predict_fn(&xi_shuffled);
                total_shuffled_score += score_fn(pred, y[i]);
            }

            let shuffled_score = total_shuffled_score / n_samples as f32;
            importance[feature_idx] = shuffled_score - baseline_score;
        }

        Self {
            importance: Vector::from_slice(&importance),
            baseline_score,
        }
    }

    /// Get importance scores.
    #[must_use]
    pub fn scores(&self) -> &Vector<f32> {
        &self.importance
    }

    /// Get feature ranking (indices sorted by importance, descending).
    #[must_use]
    pub fn ranking(&self) -> Vec<usize> {
        let mut indices: Vec<usize> = (0..self.importance.len()).collect();
        indices.sort_by(|&a, &b| {
            self.importance[b]
                .abs()
                .partial_cmp(&self.importance[a].abs())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        indices
    }
}

/// Feature contribution analysis.
///
/// Decomposes a prediction into per-feature contributions for interpretable
/// models (e.g., linear models, tree ensembles).
///
/// For a linear model: `prediction = bias + sum(weight[i] * feature[i])`
/// Each term `weight[i] * feature[i]` is a feature contribution.
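///
/// # Example
///
/// A sketch with a two-feature linear model (weights and inputs chosen
/// arbitrarily for illustration):
///
/// ```ignore
/// use aprender::interpret::FeatureContributions;
/// use aprender::primitives::Vector;
///
/// let weights = Vector::from_slice(&[0.5, -2.0]);
/// let features = Vector::from_slice(&[4.0, 0.5]);
/// let fc = FeatureContributions::from_linear(&weights, &features, 1.0);
///
/// // contributions = [2.0, -1.0], prediction = 1.0 + 2.0 - 1.0 = 2.0
/// assert!(fc.verify_sum(1e-6));
/// assert_eq!(fc.top_features(1)[0].0, 0); // feature 0 dominates by |value|
/// ```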
#[derive(Debug, Clone)]
pub struct FeatureContributions {
    /// Contribution of each feature to the prediction
    pub contributions: Vector<f32>,
    /// Bias/intercept term
    pub bias: f32,
    /// Total prediction
    pub prediction: f32,
}

impl FeatureContributions {
    /// Create feature contributions for a linear model.
    ///
    /// # Arguments
    ///
    /// * `weights` - Model weights (coefficients)
    /// * `features` - Input features
    /// * `bias` - Model bias/intercept
    #[must_use]
    pub fn from_linear(weights: &Vector<f32>, features: &Vector<f32>, bias: f32) -> Self {
        assert_eq!(
            weights.len(),
            features.len(),
            "Weights and features must have same length"
        );

        let contributions: Vec<f32> = weights
            .as_slice()
            .iter()
            .zip(features.as_slice().iter())
            .map(|(&w, &f)| w * f)
            .collect();

        let prediction = bias + contributions.iter().sum::<f32>();

        Self {
            contributions: Vector::from_slice(&contributions),
            bias,
            prediction,
        }
    }

    /// Create from pre-computed contributions.
    #[must_use]
    pub fn new(contributions: Vector<f32>, bias: f32) -> Self {
        let prediction = bias + contributions.sum();
        Self {
            contributions,
            bias,
            prediction,
        }
    }

    /// Get top contributing features.
    ///
    /// Returns indices of features with highest absolute contributions.
    #[must_use]
    pub fn top_features(&self, k: usize) -> Vec<(usize, f32)> {
        let mut indexed: Vec<(usize, f32)> = self
            .contributions
            .as_slice()
            .iter()
            .copied()
            .enumerate()
            .collect();
        indexed.sort_by(|a, b| {
            b.1.abs()
                .partial_cmp(&a.1.abs())
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        indexed.truncate(k);
        indexed
    }

    /// Verify that contributions sum to prediction.
    ///
    /// Returns true if `bias + sum(contributions) ≈ prediction`.
    #[must_use]
    pub fn verify_sum(&self, tolerance: f32) -> bool {
        let reconstructed = self.bias + self.contributions.sum();
        (reconstructed - self.prediction).abs() < tolerance
    }
}

/// Integrated Gradients for neural network attribution.
///
/// Computes attributions by integrating gradients along a path from
/// a baseline to the input.
///
/// ```text
/// IG_i(x) = (x_i - x'_i) * ∫₀¹ (∂F(x' + α(x - x')) / ∂x_i) dα
/// ```
///
/// where x' is a baseline (typically zeros).
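///
/// # Example
///
/// A sketch of the Riemann-sum approximation of the integral above, using a
/// central-difference gradient; the `integrated_gradient` helper is
/// illustrative only, not the crate's API:
///
/// ```ignore
/// fn integrated_gradient(
///     f: impl Fn(&[f32]) -> f32,
///     x: &[f32],
///     baseline: &[f32],
///     i: usize,
///     n_steps: usize,
/// ) -> f32 {
///     let mut grad_sum = 0.0f32;
///     for step in 1..=n_steps {
///         let alpha = step as f32 / n_steps as f32;
///         // Point on the straight-line path: x' + alpha * (x - x')
///         let mut point: Vec<f32> = baseline
///             .iter()
///             .zip(x)
///             .map(|(&b, &xi)| b + alpha * (xi - b))
///             .collect();
///         // Central-difference estimate of ∂F/∂x_i at this point.
///         let eps = 1e-3;
///         point[i] += eps;
///         let hi = f(&point);
///         point[i] -= 2.0 * eps;
///         let lo = f(&point);
///         grad_sum += (hi - lo) / (2.0 * eps);
///     }
///     (x[i] - baseline[i]) * grad_sum / n_steps as f32
/// }
/// ```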
///
/// # References
///
/// - Sundararajan, M., et al. (2017). Axiomatic Attribution for Deep Networks.
#[derive(Debug)]
pub struct IntegratedGradients {
    /// Number of steps for numerical integration
    n_steps: usize,
}

include!("integrated_gradients.rs");
include!("counterfactual.rs");