aprender-core 0.29.3

A next-generation machine learning library written in pure Rust.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
/// Compute block fitness for a candidate block `[l, r]` in sorted data,
/// returning the total fitness including the prior penalty.
///
/// Used by `bayesian_blocks_edges` dynamic programming.
fn bayesian_blocks_block_fitness(
    sorted_data: &[f32],
    best_fitness: &[f32],
    l: usize,
    r: usize,
    ncp_prior: f32,
) -> f32 {
    // Number of points in the candidate block and its value span
    // (clamped away from zero so identical values don't divide badly).
    let count = (r - l + 1) as f32;
    let span = (sorted_data[r] - sorted_data[l]).max(1e-10);

    // Uniform-density preference: blocks whose values are tightly packed
    // relative to their point count score higher (less negative).
    let density_score = -span / count.sqrt();

    // Best fitness of the partition ending just before this block
    // (zero when the block starts at the beginning of the data).
    let prefix = if l > 0 { best_fitness[l - 1] } else { 0.0 };

    // Total fitness: prefix + this block's score, minus the change-point prior.
    prefix + density_score - ncp_prior
}

/// Fill the dynamic programming table for Bayesian Blocks, returning
/// `(best_fitness, last_change_point)` arrays.
///
/// Follows the O(n^2) recursion of Scargle et al. (2013): for each right
/// edge `r`, pick the left edge `l` maximizing the fitness of block
/// `[l, r]` plus the best fitness of the prefix ending at `l - 1`.
fn bayesian_blocks_dp(sorted_data: &[f32], ncp_prior: f32) -> (Vec<f32>, Vec<usize>) {
    let n = sorted_data.len();
    let mut best_fitness = vec![0.0_f32; n];
    let mut last_change_point = vec![0_usize; n];

    // Start at r = 0 so the base case best_fitness[0] is the fitness of the
    // single-element block [0, 0] (roughly -ncp_prior). Previously the loop
    // started at r = 1, leaving best_fitness[0] at its 0.0 initializer, which
    // waived the prior penalty for any partition whose second block starts at
    // index 1 and biased the DP toward a spurious early change point.
    for r in 0..n {
        let mut max_fitness = f32::NEG_INFINITY;
        let mut best_cp = 0;

        for l in 0..=r {
            let fitness =
                bayesian_blocks_block_fitness(sorted_data, &best_fitness, l, r, ncp_prior);
            if fitness > max_fitness {
                max_fitness = fitness;
                best_cp = l;
            }
        }

        best_fitness[r] = max_fitness;
        last_change_point[r] = best_cp;
    }

    (best_fitness, last_change_point)
}

/// Backtrack the DP table to recover the list of change point indices.
fn bayesian_blocks_backtrack(last_change_point: &[usize]) -> Vec<usize> {
    let mut change_points = Vec::new();
    let mut current = last_change_point.len() - 1;

    // Walk backwards from the final cell: each cell stores the left edge of
    // the optimal block ending there. A left edge of 0 means the remaining
    // prefix is a single block, so we stop without recording it.
    while current > 0 {
        match last_change_point[current] {
            0 => break,
            cp => {
                change_points.push(cp);
                current = cp - 1;
            }
        }
    }

    // Collected newest-first during the walk; return in ascending order.
    change_points.reverse();
    change_points
}

/// Convert change-point indices and sorted data into a strictly-increasing
/// vector of bin edges (with a small margin around the data range).
///
/// Precondition: `sorted_data` is non-empty and sorted ascending.
/// Change-point indices outside `(0, n)` are ignored.
fn bayesian_blocks_edges_from_change_points(
    sorted_data: &[f32],
    change_points: &[usize],
) -> Vec<f32> {
    let n = sorted_data.len();
    let data_min = sorted_data[0];
    let data_max = sorted_data[n - 1];
    let range = data_max - data_min;
    let margin = range * 0.001; // 0.1% margin around the data range

    // First edge sits just below the minimum, interior edges are the
    // midpoints between the samples straddling each change point, and the
    // last edge sits just above the maximum.
    let mut edges = Vec::with_capacity(change_points.len() + 2);
    edges.push(data_min - margin);
    for &cp in change_points {
        if cp > 0 && cp < n {
            edges.push(f32::midpoint(sorted_data[cp - 1], sorted_data[cp]));
        }
    }
    edges.push(data_max + margin);

    // Sort BEFORE deduplicating: `dedup` only removes *adjacent* duplicates,
    // so the previous dedup-then-sort ordering was ineffective in general.
    edges.sort_by(|a, b| {
        a.partial_cmp(b)
            .expect("f32 values should be comparable (not NaN)")
    });
    // Enforce strictly-increasing edges (also removes exact duplicates).
    // `dedup_by` drops `curr` when the predicate holds against the previous
    // kept element — O(n), unlike the former remove-in-a-loop O(n^2) scan.
    edges.dedup_by(|curr, prev| *curr <= *prev);

    // Degenerate case (edges collapsed, e.g. zero range): fall back to a
    // single bin spanning the whole data range.
    if edges.len() < 2 {
        return vec![data_min - margin, data_max + margin];
    }

    edges
}

impl<'a> DescriptiveStats<'a> {
    /// Create a new `DescriptiveStats` instance from a data vector.
    ///
    /// # Arguments
    /// * `data` - Reference to a `Vector<f32>` containing the data
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0]);
    /// let stats = DescriptiveStats::new(&data);
    /// ```
    #[must_use]
    pub fn new(data: &'a Vector<f32>) -> Self {
        Self { data }
    }

    /// Compute quantile using linear interpolation (R-7 method).
    ///
    /// Uses the method from Hyndman & Fan (1996) commonly used in
    /// statistical packages (R, `NumPy`, Pandas).
    ///
    /// # Performance
    /// Uses `QuickSelect` (`select_nth_unstable`) for O(n) average-case
    /// performance instead of full sort O(n log n). This is a Toyota Way
    /// Muda elimination optimization (Floyd & Rivest 1975).
    ///
    /// # Arguments
    /// * `q` - Quantile value in [0, 1]
    ///
    /// # Returns
    /// Interpolated quantile value
    ///
    /// # Errors
    /// Returns error if:
    /// - Data vector is empty
    /// - Quantile q is not in [0, 1]
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
    /// let stats = DescriptiveStats::new(&data);
    ///
    /// assert_eq!(stats.quantile(0.5).expect("median should be computable for valid data"), 3.0); // median
    /// assert_eq!(stats.quantile(0.0).expect("min quantile should be computable for valid data"), 1.0); // min
    /// assert_eq!(stats.quantile(1.0).expect("max quantile should be computable for valid data"), 5.0); // max
    /// ```
    pub fn quantile(&self, q: f64) -> Result<f32, String> {
        // Validation
        if self.data.is_empty() {
            return Err("Cannot compute quantile of empty vector".to_string());
        }
        if !(0.0..=1.0).contains(&q) {
            return Err(format!("Quantile must be in [0, 1], got {q}"));
        }

        let n = self.data.len();

        // Edge cases: single element
        if n == 1 {
            return Ok(self.data.as_slice()[0]);
        }

        // R-7 method: h = (n - 1) * q
        // Position in sorted array (0-indexed)
        let h = (n - 1) as f64 * q;
        let h_floor = h.floor() as usize;
        let h_ceil = h.ceil() as usize;

        // Toyota Way: Use QuickSelect for O(n) instead of full sort O(n log n)
        // This is Muda elimination (waste of unnecessary sorting)
        let mut working_copy = self.data.as_slice().to_vec();

        // Handle edge quantiles (q = 0.0 or q = 1.0) and any h landing
        // exactly on an integer position: a single selection suffices.
        if h_floor == h_ceil {
            // Exact index, no interpolation needed
            working_copy.select_nth_unstable_by(h_floor, |a, b| {
                a.partial_cmp(b)
                    .expect("f32 values should be comparable (not NaN)")
            });
            return Ok(working_copy[h_floor]);
        }

        // For interpolation we need both the floor and ceil order statistics.
        // Two QuickSelect passes keep the overall average case at O(n).
        working_copy.select_nth_unstable_by(h_floor, |a, b| {
            a.partial_cmp(b)
                .expect("f32 values should be comparable (not NaN)")
        });
        let lower = working_copy[h_floor];

        // `select_nth_unstable_by` is order-agnostic: rerunning it on the
        // partially-partitioned buffer still places the h_ceil-th order
        // statistic at index h_ceil, so this second pass is correct.
        working_copy.select_nth_unstable_by(h_ceil, |a, b| {
            a.partial_cmp(b)
                .expect("f32 values should be comparable (not NaN)")
        });
        let upper = working_copy[h_ceil];

        // Linear interpolation between the two bracketing order statistics
        let fraction = h - h_floor as f64;
        let result = lower + (fraction as f32) * (upper - lower);

        Ok(result)
    }

    /// Compute multiple percentiles efficiently (single sort).
    ///
    /// When computing multiple quantiles, it's more efficient to sort once
    /// and then index into the sorted array. This is O(n log n) amortized.
    ///
    /// # Arguments
    /// * `percentiles` - Slice of percentile values (0-100)
    ///
    /// # Returns
    /// Vector of percentile values in the same order as input
    ///
    /// # Errors
    /// Returns error if the data vector is empty or any percentile is
    /// outside [0, 100].
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
    /// let stats = DescriptiveStats::new(&data);
    /// let p = stats.percentiles(&[25.0, 50.0, 75.0]).expect("percentiles should be computable for valid data");
    /// assert_eq!(p, vec![2.0, 3.0, 4.0]);
    /// ```
    pub fn percentiles(&self, percentiles: &[f64]) -> Result<Vec<f32>, String> {
        // Validate inputs
        if self.data.is_empty() {
            return Err("Cannot compute percentiles of empty vector".to_string());
        }
        for &p in percentiles {
            if !(0.0..=100.0).contains(&p) {
                return Err(format!("Percentile must be in [0, 100], got {p}"));
            }
        }

        // For multiple quantiles, full sort is optimal
        let mut sorted = self.data.as_slice().to_vec();
        sorted.sort_by(|a, b| {
            a.partial_cmp(b)
                .expect("f32 values should be comparable (not NaN)")
        });

        let n = sorted.len();
        let mut results = Vec::with_capacity(percentiles.len());

        // Same R-7 interpolation as `quantile`, but indexing the shared
        // sorted buffer instead of re-selecting per percentile.
        for &p in percentiles {
            let q = p / 100.0;
            let h = (n - 1) as f64 * q;
            let h_floor = h.floor() as usize;
            let h_ceil = h.ceil() as usize;

            let value = if h_floor == h_ceil {
                sorted[h_floor]
            } else {
                let fraction = h - h_floor as f64;
                sorted[h_floor] + (fraction as f32) * (sorted[h_ceil] - sorted[h_floor])
            };

            results.push(value);
        }

        Ok(results)
    }

    /// Compute five-number summary: min, Q1, median, Q3, max.
    ///
    /// This is the foundation for box plots and outlier detection.
    ///
    /// # Errors
    /// Returns error if the data vector is empty.
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
    /// let stats = DescriptiveStats::new(&data);
    /// let summary = stats.five_number_summary().expect("five-number summary should be computable for valid data");
    ///
    /// assert_eq!(summary.min, 1.0);
    /// assert_eq!(summary.q1, 2.0);
    /// assert_eq!(summary.median, 3.0);
    /// assert_eq!(summary.q3, 4.0);
    /// assert_eq!(summary.max, 5.0);
    /// ```
    pub fn five_number_summary(&self) -> Result<FiveNumberSummary, String> {
        if self.data.is_empty() {
            return Err("Cannot compute summary of empty vector".to_string());
        }

        // Use percentiles for efficiency (single sort)
        let values = self.percentiles(&[0.0, 25.0, 50.0, 75.0, 100.0])?;

        Ok(FiveNumberSummary {
            min: values[0],
            q1: values[1],
            median: values[2],
            q3: values[3],
            max: values[4],
        })
    }

    /// Compute interquartile range (IQR = Q3 - Q1).
    ///
    /// The IQR is a measure of statistical dispersion, being equal to the
    /// difference between 75th and 25th percentiles.
    ///
    /// # Errors
    /// Returns error if the data vector is empty.
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 3.0, 4.0, 5.0]);
    /// let stats = DescriptiveStats::new(&data);
    /// assert_eq!(stats.iqr().expect("IQR should be computable for valid data"), 2.0);
    /// ```
    pub fn iqr(&self) -> Result<f32, String> {
        let summary = self.five_number_summary()?;
        Ok(summary.q3 - summary.q1)
    }

    /// Compute histogram with automatic bin selection (Freedman-Diaconis rule).
    ///
    /// Uses Freedman-Diaconis rule: `bin_width` = 2 * IQR / n^(1/3)
    /// This is optimal for unimodal, symmetric distributions.
    ///
    /// # Errors
    /// Returns error if the underlying histogram computation fails
    /// (e.g. empty data).
    ///
    /// # Examples
    /// ```
    /// use aprender::stats::DescriptiveStats;
    /// use trueno::Vector;
    ///
    /// let data = Vector::from_slice(&[1.0, 2.0, 2.0, 3.0, 5.0]);
    /// let stats = DescriptiveStats::new(&data);
    /// let hist = stats.histogram_auto().expect("histogram should be computable for valid data");
    /// assert_eq!(hist.bins.len(), hist.counts.len() + 1);
    /// ```
    pub fn histogram_auto(&self) -> Result<Histogram, String> {
        self.histogram_method(BinMethod::FreedmanDiaconis)
    }

    /// Compute optimal histogram bin edges using Bayesian Blocks algorithm.
    ///
    /// This implements the Bayesian Blocks algorithm (Scargle et al., 2013) which
    /// finds optimal change points in the data using dynamic programming.
    ///
    /// # Returns
    /// Vector of bin edges (sorted, strictly increasing)
    ///
    /// # Errors
    /// Returns error if the data vector is empty.
    fn bayesian_blocks_edges(&self) -> Result<Vec<f32>, String> {
        if self.data.is_empty() {
            return Err("Cannot compute Bayesian Blocks on empty data".to_string());
        }

        let n = self.data.len();

        // Handle edge cases: a single sample gets a unit-width bin around it
        if n == 1 {
            let val = self.data.as_slice()[0];
            return Ok(vec![val - 0.5, val + 0.5]);
        }

        // Sort data (Bayesian Blocks requires sorted data)
        let mut sorted_data: Vec<f32> = self.data.as_slice().to_vec();
        sorted_data.sort_by(|a, b| {
            a.partial_cmp(b)
                .expect("f32 values should be comparable (not NaN)")
        });

        // Handle all same values: zero range, so return a unit-width bin
        if sorted_data[0] == sorted_data[n - 1] {
            let val = sorted_data[0];
            return Ok(vec![val - 0.5, val + 0.5]);
        }

        // Prior on number of change points (ncp_prior)
        // Following Scargle et al. (2013), we use a prior that penalizes too many blocks
        // but allows detection of significant changes. Lower value = more blocks.
        let ncp_prior = 0.5_f32;

        // Fill DP table via extracted helper
        let (_best_fitness, last_change_point) = bayesian_blocks_dp(&sorted_data, ncp_prior);

        // Backtrack to find change points
        let change_points = bayesian_blocks_backtrack(&last_change_point);

        // Convert change points to bin edges
        Ok(bayesian_blocks_edges_from_change_points(
            &sorted_data,
            &change_points,
        ))
    }
}