aprender-core 0.30.0

Next-generation machine learning library in pure Rust

impl DESearch {
    /// Create DE search with n iterations.
    ///
    /// Defaults: mutation factor F = 0.8, crossover rate CR = 0.9,
    /// rand/1/bin strategy, seed 42, and auto-sized population
    /// (population_size = 0; see `initialize`).
    #[must_use]
    pub fn new(n_iter: usize) -> Self {
        Self {
            n_iter,
            population_size: 0, // Auto
            seed: 42,
            strategy: DEStrategy::Rand1Bin,
            use_jade: false,
            population: Vec::new(),
            fitness: Vec::new(),
            best_idx: 0,
            param_order: Vec::new(),
            param_bounds: Vec::new(),
            trials_generated: 0,
            initialized: false,
            mutation_factor: 0.8,
            crossover_rate: 0.9,
        }
    }

    /// Set population size (0 = auto-select based on dimension).
    #[must_use]
    pub fn with_population_size(mut self, size: usize) -> Self {
        self.population_size = size;
        self
    }

    /// Set random seed for reproducibility.
    #[must_use]
    pub fn with_seed(mut self, seed: u64) -> Self {
        self.seed = seed;
        self
    }

    /// Set mutation strategy.
    #[must_use]
    pub fn with_strategy(mut self, strategy: DEStrategy) -> Self {
        self.strategy = strategy;
        self
    }

    /// Enable JADE adaptive parameters.
    #[must_use]
    pub fn with_jade(mut self) -> Self {
        self.use_jade = true;
        self
    }

    /// Set mutation factor F (default: 0.8).
    #[must_use]
    pub fn with_mutation_factor(mut self, f: f64) -> Self {
        self.mutation_factor = f;
        self
    }

    /// Set crossover rate CR (default: 0.9).
    #[must_use]
    pub fn with_crossover_rate(mut self, cr: f64) -> Self {
        self.crossover_rate = cr;
        self
    }

    /// Remaining trials to generate.
    #[must_use]
    pub fn remaining(&self) -> usize {
        self.n_iter.saturating_sub(self.trials_generated)
    }

    /// Initialize population from search space.
    ///
    /// Encodes every hyperparameter as an `f64` coordinate:
    /// integers and categorical choices become rounded indices, and
    /// continuous log-scale params are sampled uniformly in log space.
    /// Bounds tuples are `(low, high, is_int, is_log)`.
    fn initialize<P: ParamKey>(&mut self, space: &SearchSpace<P>) {
        // Extract parameter info in deterministic order
        self.param_order.clear();
        self.param_bounds.clear();

        // Sort by the Debug representation of the key so the coordinate
        // order is stable regardless of HashMap iteration order; must match
        // the ordering used in `vector_to_trial`.
        let mut params: Vec<_> = space.params.iter().collect();
        params.sort_by(|a, b| format!("{:?}", a.0).cmp(&format!("{:?}", b.0)));

        for (key, hyper) in params {
            let key_str = format!("{key:?}");
            let bounds = match hyper {
                HyperParam::Continuous { low, high, log_scale } => (*low, *high, false, *log_scale),
                HyperParam::Integer { low, high } => (*low as f64, *high as f64, true, false),
                // Categorical is encoded as an index into `choices`.
                // NOTE(review): underflows if `choices` is empty — presumably
                // SearchSpace guarantees non-empty choice lists; confirm.
                HyperParam::Categorical { choices } => (0.0, (choices.len() - 1) as f64, true, false),
            };
            self.param_order.push(key_str);
            self.param_bounds.push(bounds);
        }

        let dim = self.param_bounds.len();
        // Auto population size: 10 * dimension, clamped to [20, 100].
        let pop_size = if self.population_size == 0 {
            (10 * dim).clamp(20, 100)
        } else {
            self.population_size
        };

        // Initialize population with random values
        let mut rng = XorShift64::new(self.seed);
        self.population = (0..pop_size)
            .map(|_| {
                self.param_bounds
                    .iter()
                    .map(|(low, high, is_int, is_log)| {
                        let val = if *is_log {
                            // Uniform in log space, then map back.
                            // NOTE(review): `low.ln()` is NaN/-inf for low <= 0;
                            // assumes log-scale bounds are strictly positive — confirm.
                            let log_low = low.ln();
                            let log_high = high.ln();
                            (log_low + rng.gen_f64() * (log_high - log_low)).exp()
                        } else {
                            *low + rng.gen_f64() * (*high - *low)
                        };
                        if *is_int {
                            val.round()
                        } else {
                            val
                        }
                    })
                    .collect()
            })
            .collect();

        // INFINITY marks "not yet evaluated" (DE minimizes).
        self.fitness = vec![f64::INFINITY; pop_size];
        self.initialized = true;
    }

    /// Convert parameter vector to Trial.
    ///
    /// Re-derives the same deterministic (Debug-string sorted) parameter
    /// order used in `initialize`, so `vec[i]` lines up with coordinate i.
    fn vector_to_trial<P: ParamKey>(vec: &[f64], space: &SearchSpace<P>) -> Trial<P> {
        let mut values = HashMap::new();
        let mut params: Vec<_> = space.params.iter().collect();
        params.sort_by(|a, b| format!("{:?}", a.0).cmp(&format!("{:?}", b.0)));

        for (i, (key, hyper)) in params.iter().enumerate() {
            let val = vec[i];
            let param_value = match hyper {
                HyperParam::Continuous { .. } => ParamValue::Float(val),
                HyperParam::Integer { .. } => ParamValue::Int(val.round() as i64),
                HyperParam::Categorical { choices } => {
                    // Clamp rounded index so out-of-range encodings stay valid.
                    let idx = (val.round() as usize).min(choices.len() - 1);
                    choices[idx].clone()
                }
            };
            values.insert(**key, param_value);
        }

        Trial { values }
    }

    /// Clip value to bounds.
    ///
    /// Clamps coordinate `idx` into `[low, high]` and rounds if the
    /// parameter is integer/categorical-encoded.
    fn clip(&self, val: f64, idx: usize) -> f64 {
        let (low, high, is_int, _) = self.param_bounds[idx];
        let clipped = val.clamp(low, high);
        if is_int {
            clipped.round()
        } else {
            clipped
        }
    }

    /// Generate a mutant vector by applying a per-dimension formula.
    fn mutate_vector(&self, dim: usize, formula: impl Fn(usize) -> f64) -> Vec<f64> {
        (0..dim).map(formula).collect()
    }

    /// Select `count` distinct random indices, all different from `exclude`.
    ///
    /// NOTE(review): rejection-samples until `count` distinct indices are
    /// found, so it loops forever when `pop_size <= count`; callers use
    /// count = 3 or 5 with pop_size >= 20 (auto-sizing), but a manually set
    /// tiny population would hang — consider guarding.
    fn select_distinct_indices(rng: &mut XorShift64, pop_size: usize, exclude: usize, count: usize) -> Vec<usize> {
        let mut indices = Vec::with_capacity(count);
        while indices.len() < count {
            let idx = rng.gen_usize(pop_size);
            if idx != exclude && !indices.contains(&idx) {
                indices.push(idx);
            }
        }
        indices
    }
}

impl<P: ParamKey> SearchStrategy<P> for DESearch {
    /// Suggest up to `n` trials, bounded by the remaining budget and the
    /// population size. Lazily initializes the population on first call.
    ///
    /// NOTE(review): always returns the FIRST `n` population members, so
    /// repeated calls within one generation re-suggest the same individuals;
    /// presumably callers evaluate a full batch then call `update` — confirm.
    fn suggest(&mut self, space: &SearchSpace<P>, n: usize) -> Vec<Trial<P>> {
        if !self.initialized {
            self.initialize(space);
        }

        let n = n.min(self.remaining()).min(self.population.len());

        // Return current population members as trials
        let trials: Vec<Trial<P>> = self.population[..n]
            .iter()
            .map(|vec| Self::vector_to_trial(vec, space))
            .collect();

        self.trials_generated += trials.len();
        trials
    }

    /// Record evaluation results (results[i] maps to population[i]) and
    /// evolve the population one generation using the configured strategy.
    fn update(&mut self, results: &[TrialResult<P>]) {
        if results.is_empty() || !self.initialized {
            return;
        }

        // Update fitness for evaluated individuals
        // Note: AutoML uses higher=better, DE uses lower=better
        for (i, result) in results.iter().enumerate() {
            if i < self.fitness.len() {
                // Negate score since DE minimizes
                self.fitness[i] = -result.score;
            }
        }

        // Update best (lowest negated score = highest original score);
        // NaN-safe via partial_cmp fallback to Equal.
        self.best_idx = self
            .fitness
            .iter()
            .enumerate()
            .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
            .map_or(0, |(i, _)| i);

        // Evolve population for next generation
        let pop_size = self.population.len();
        let dim = self.param_bounds.len();
        // Re-seed per generation from (seed + trials generated) so evolution
        // is deterministic but varies across generations.
        let mut rng = XorShift64::new(self.seed.wrapping_add(self.trials_generated as u64));

        let mut new_population = self.population.clone();

        #[allow(clippy::needless_range_loop)]
        for i in 0..pop_size {
            // Three distinct donors (a, b, c), all != i.
            let indices = Self::select_distinct_indices(&mut rng, pop_size, i, 3);
            let (a, b, c) = (indices[0], indices[1], indices[2]);
            let f = self.mutation_factor;
            let pop = &self.population;
            let best = self.best_idx;

            // Mutation based on strategy
            let mutant: Vec<f64> = match self.strategy {
                DEStrategy::Rand1Bin => self.mutate_vector(dim, |j| {
                    pop[a][j] + f * (pop[b][j] - pop[c][j])
                }),
                DEStrategy::Best1Bin => self.mutate_vector(dim, |j| {
                    pop[best][j] + f * (pop[a][j] - pop[b][j])
                }),
                DEStrategy::CurrentToBest1Bin => self.mutate_vector(dim, |j| {
                    pop[i][j] + f * (pop[best][j] - pop[i][j]) + f * (pop[a][j] - pop[b][j])
                }),
                DEStrategy::Rand2Bin => {
                    // Needs five donors; draws a fresh distinct set.
                    let more = Self::select_distinct_indices(&mut rng, pop_size, i, 5);
                    let (ra, rb, rc, d, e) = (more[0], more[1], more[2], more[3], more[4]);
                    self.mutate_vector(dim, |j| {
                        pop[ra][j] + f * (pop[rb][j] - pop[rc][j]) + f * (pop[d][j] - pop[e][j])
                    })
                }
            };

            // Crossover: binomial, with j_rand forcing at least one mutant gene.
            let j_rand = rng.gen_usize(dim);
            let trial: Vec<f64> = (0..dim)
                .map(|j| {
                    let use_mutant = j == j_rand || rng.gen_f64() < self.crossover_rate;
                    let val = if use_mutant {
                        mutant[j]
                    } else {
                        self.population[i][j]
                    };
                    self.clip(val, j)
                })
                .collect();

            // Selection will happen on next update
            // For now, just replace with trial (we'll get actual fitness next round)
            // NOTE(review): the evolved trial vector is only kept for slots that
            // were never evaluated (fitness still INFINITY); evaluated members
            // discard their trial entirely, which differs from classic DE greedy
            // selection (compare trial vs. parent fitness) — confirm intent.
            if self.fitness[i] == f64::INFINITY {
                // Not yet evaluated, keep trial
                new_population[i] = trial;
            }
            // Otherwise keep current (greedy selection happens implicitly via fitness)
        }

        self.population = new_population;
    }
}

/// Active Learning search optimizer.
///
/// Wraps any base search strategy and adds uncertainty-based stopping.
/// Implements the "Pull System" from Lean manufacturing - only generates
/// samples while uncertainty is high (Settles, 2009).
///
/// # Muda Elimination (Waste Reduction)
///
/// Traditional batch generation ("Push System") produces many redundant samples.
/// Active Learning stops when confidence saturates, eliminating overproduction.
///
/// # Example
///
/// ```
/// use aprender::automl::{ActiveLearningSearch, RandomSearch, SearchSpace, SearchStrategy};
/// use aprender::automl::params::RandomForestParam as RF;
///
/// let space = SearchSpace::new()
///     .add_continuous(RF::NEstimators, 10.0, 500.0);
///
/// let base = RandomSearch::new(1000).with_seed(42);
/// let mut search = ActiveLearningSearch::new(base)
///     .with_uncertainty_threshold(0.1)  // Stop when uncertainty < 0.1
///     .with_min_samples(10);            // Need at least 10 samples
///
/// // Pull system: generate until confident
/// while !search.should_stop() {
///     let trials = search.suggest(&space, 5);
///     if trials.is_empty() { break; }
///     // ... evaluate trials ...
///     // search.update(&results);
/// }
/// ```
#[derive(Debug, Clone)]
pub struct ActiveLearningSearch<S> {
    /// Base search strategy. `S` is unconstrained here; trait bounds live on
    /// the impl blocks that need them.
    base: S,
    /// Uncertainty threshold for stopping (default: 0.1).
    uncertainty_threshold: f64,
    /// Minimum samples before stopping is allowed (default: 10).
    min_samples: usize,
    /// Collected scores for uncertainty estimation.
    scores: Vec<f64>,
    /// Current uncertainty estimate; `f64::INFINITY` until at least two
    /// scores have been collected.
    current_uncertainty: f64,
}

impl<S> ActiveLearningSearch<S> {
    /// Create active learning wrapper around base strategy.
    ///
    /// Defaults: uncertainty threshold 0.1, minimum 10 samples, and an
    /// initial uncertainty of infinity (i.e. "completely unknown").
    #[must_use]
    pub fn new(base: S) -> Self {
        Self {
            base,
            uncertainty_threshold: 0.1,
            min_samples: 10,
            scores: Vec::new(),
            current_uncertainty: f64::INFINITY,
        }
    }

    /// Set uncertainty threshold for stopping.
    ///
    /// When estimated uncertainty drops below this threshold, `should_stop()` returns true.
    #[must_use]
    pub fn with_uncertainty_threshold(mut self, threshold: f64) -> Self {
        self.uncertainty_threshold = threshold;
        self
    }

    /// Set minimum samples before stopping is considered.
    #[must_use]
    pub fn with_min_samples(mut self, min: usize) -> Self {
        self.min_samples = min;
        self
    }

    /// Check if optimization should stop due to low uncertainty.
    ///
    /// Returns true when:
    /// 1. At least `min_samples` have been evaluated
    /// 2. Uncertainty is below `uncertainty_threshold`
    #[must_use]
    pub fn should_stop(&self) -> bool {
        self.scores.len() >= self.min_samples
            && self.current_uncertainty < self.uncertainty_threshold
    }

    /// Get current uncertainty estimate.
    ///
    /// Uses coefficient of variation (`std_dev` / mean) as uncertainty metric.
    /// Returns infinity if not enough samples.
    #[must_use]
    pub fn uncertainty(&self) -> f64 {
        self.current_uncertainty
    }

    /// Compute uncertainty from collected scores.
    ///
    /// Uses coefficient of variation: σ / μ
    /// - Low CV = consistent scores = low uncertainty
    /// - High CV = variable scores = high uncertainty
    ///
    /// With fewer than two samples the uncertainty stays infinite; when the
    /// mean is ~0 the raw standard deviation is used to avoid division by zero.
    fn compute_uncertainty(&mut self) {
        if self.scores.len() < 2 {
            self.current_uncertainty = f64::INFINITY;
            return;
        }

        let n = self.scores.len() as f64;
        let mean = self.scores.iter().sum::<f64>() / n;
        // Population variance (divide by n) — the scores ARE the full set
        // of observations, not a sample from a larger population.
        let variance = self.scores.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
        let std_dev = variance.sqrt();

        self.current_uncertainty = if mean.abs() < 1e-10 {
            // Avoid division by zero - if mean is ~0, use std dev directly
            std_dev
        } else {
            // Coefficient of variation
            std_dev / mean.abs()
        };
    }

    /// Get number of samples collected.
    #[must_use]
    pub fn sample_count(&self) -> usize {
        self.scores.len()
    }
}