1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
use std::{fmt::Debug, iter, ops::RangeInclusive};
use fastrand::Rng;
use num_traits::{FromPrimitive, One, Zero};
use crate::{
iter::{Triple, Triples},
kde::KernelDensityEstimator,
kernel::Kernel,
optimizer::trial::{Trial, Trials},
range::CopyRange,
traits::{
loopback::{SelfMul, SelfSub},
shortcuts::Multiplicative,
},
Density,
Sample,
};
mod trial;
/// ✨ Hyperparameter optimizer.
///
/// # Generic parameters
///
/// - [`KInit`]: kernel type of the initial (prior) estimator component
/// - [`P`]: type of parameter that is optimized
/// - [`M`]: value of the target function, the less – the better
#[derive(Debug)]
pub struct Optimizer<KInit, P, M> {
pub cutoff: f64,
pub n_candidates: usize,
pub bandwidth: P,
range: RangeInclusive<P>,
init_kernel: KInit,
rng: Rng,
good_trials: Trials<P, M>,
bad_trials: Trials<P, M>,
}
impl<KInit, P, M> Optimizer<KInit, P, M> {
/// Construct the new optimizer.
///
/// Here begins your adventure!
///
/// # Parameters
///
/// - `range`: parameter search range, [`Optimizer`] will clamp random samples to this range
/// - `init_kernel`: your prior belief about which values of the searched parameter is more optimal
/// - `kernel`: kernel for the trial components
pub fn new(range: RangeInclusive<P>, init_kernel: KInit, rng: Rng) -> Self
where
P: One,
{
Self {
range,
init_kernel,
rng,
cutoff: 0.1,
n_candidates: 25,
bandwidth: P::one(),
good_trials: Trials::new(),
bad_trials: Trials::new(),
}
}
/// Set the ratio of «good» trials.
#[must_use]
pub fn cutoff(mut self, cutoff: impl Into<f64>) -> Self {
self.cutoff = cutoff.into();
self
}
/// Set the number of candidates to choose the next trial from the acquisition function[^1].
///
/// Sampling from the acquisition function is cheaper than evaluating the target cost function,
/// so the more candidates – the more effective is the optimization step.
///
/// However, the acquisition function is an approximation of a potential gain,
/// so the fewer candidates – the more precise is the optimization step.
///
/// The number of candidates is therefore a tradeoff.
///
/// [^1]: Acquisition function is basically a ratio
/// between the «good» KDE and «bad» KDE at the same point.
#[must_use]
pub fn n_candidates(mut self, n_candidates: impl Into<usize>) -> Self {
self.n_candidates = n_candidates.into();
self
}
/// Set the bandwidth multiplier for the estimator kernels.
///
/// Standard deviation of the kernel is the distance from the point to its furthest neighbour,
/// multiplied by this coefficient.
///
/// The default multiplier is [`P::one`]. Lower bandwidth approximates the density better,
/// however, is also prone to over-fitting. Higher bandwidth avoid over-fitting better,
/// but is also smoother and less precise.
#[must_use]
pub fn bandwidth(mut self, bandwidth: impl Into<P>) -> Self {
self.bandwidth = bandwidth.into();
self
}
/// Provide the information about the trial, or in other words, «fit» the optimizer on the sample.
///
/// Normally, you'll call your target function on parameters supplied by [`Optimizer::new_trial`],
/// and feed back the results. But you also can feed it with any arbitrary parameters.
///
/// # Parameters
///
/// - `parameter`: the target function parameter
/// - `metric`: the target function metric
#[allow(
clippy::cast_possible_truncation,
clippy::cast_precision_loss,
clippy::cast_sign_loss,
clippy::missing_panics_doc
)]
pub fn feed_back(&mut self, parameter: P, metric: M)
where
P: Copy + Debug + Ord,
M: Debug + Ord,
{
let trial = Trial { metric, parameter };
let n_expected_good_trials = {
// `+ 1` is for this new trial.
let n_total_trials = self.good_trials.len() + self.bad_trials.len() + 1;
(self.cutoff * n_total_trials as f64).round() as usize
};
// This uses an algorithm similar to the median tracking using two heaps,
// only I'm tracking the `cutoff` quantile, and I have the B-tree sets instead of heaps.
if self
.good_trials
.worst()
.is_some_and(|worst_good_trial| &trial <= worst_good_trial)
{
// New trial is not worse than the worst good trial, so it belongs to the good trials:
if self.good_trials.insert(trial) {
// Re-balance:
while self.good_trials.len() > n_expected_good_trials {
self.bad_trials
.insert(self.good_trials.pop_worst().unwrap());
}
}
}
// Otherwise, it belongs to the bad trials:
else if self.bad_trials.insert(trial) {
// Re-balance:
while self.good_trials.len() < n_expected_good_trials {
self.good_trials.insert(self.bad_trials.pop_best().unwrap());
}
}
// Verify the invariant:
#[cfg(debug_assertions)]
{
if let (Some(worst_good), Some(best_bad)) =
(self.good_trials.worst(), self.bad_trials.best())
{
assert!(
worst_good <= best_bad,
"the worst good trial `{worst_good:?}` must not be worse than the best bad trial `{best_bad:?}`",
);
}
}
}
/// Construct the kernel for the triple of adjacent trials.
fn construct_kernel<K>(triple: Triple<P>, bounds: RangeInclusive<P>, bandwidth: P) -> K
where
K: Kernel<Param = P>,
P: Copy + Ord + SelfSub + SelfMul,
{
match triple {
Triple::Full(left, location, right) => {
// For the middle point we take the maximum of the distances to the left and right neighbors:
Kernel::new(
location,
bandwidth * (right - location).max(location - left),
)
}
Triple::LeftMiddle(left, location) => {
// For the left-middle pair: the maximum between them and to the right bound:
K::new(
location,
bandwidth * (location - left).max(*bounds.end() - location),
)
}
Triple::MiddleRight(location, right) => {
// Similar, but to the left bound:
K::new(
location,
bandwidth * (right - location).max(location - *bounds.start()),
)
}
Triple::Left(location) | Triple::Middle(location) | Triple::Right(location) => {
// Maximum between the distances to the bounds:
K::new(
location,
bandwidth * (*bounds.end() - location).max(location - *bounds.start()),
)
}
}
}
/// Construct a [`KernelDensityEstimator`] from the trials.
fn construct_kde<K>(
parameters: impl Iterator<Item = P> + Clone,
bounds: RangeInclusive<P>,
bandwidth: P,
) -> KernelDensityEstimator<impl Iterator<Item = K> + Clone>
where
P: Copy + Ord + SelfSub + SelfMul,
K: Copy + Kernel<Param = P>,
{
KernelDensityEstimator(
Triples::new(parameters)
.map(move |triple| Self::construct_kernel(triple, bounds.copy(), bandwidth)),
)
}
/// Generate a parameter value for a new trial.
///
/// After evaluating the target function with this parameter,
/// you'd better feed the metric back with [`Optimizer::feed_back`].
///
/// # Type parameters
///
/// - [`K`]: kernel type
/// - [`D`]: kernel density type
///
/// # Panics
///
/// This method may panic if a random or calculated number cannot be converted to
/// the parameter or density type.
#[allow(clippy::cast_precision_loss)]
pub fn new_trial<K>(&mut self) -> P
where
KInit: Copy + Density<Param = P> + Sample<Param = P>,
KInit::Output: Copy + Debug + Ord + Multiplicative + FromPrimitive + Zero,
K: Copy
+ Kernel<Param = P>
+ Sample<Param = P>
+ Density<Param = P, Output = <KInit as Density>::Output>,
P: Copy + Ord + SelfSub + SelfMul,
{
// Abandon hope, all ye who enter here!
// Okay… Slow breath in… and out…
// First, construct the KDEs:
let good_kde = Self::construct_kde::<K>(
self.good_trials.iter_parameters(),
self.range.copy(),
self.bandwidth,
);
let bad_kde = Self::construct_kde::<K>(
self.bad_trials.iter_parameters(),
self.range.copy(),
self.bandwidth,
);
// Now, sample candidates:
let candidates = iter::from_fn(|| {
if self.good_trials.len() < 2 || self.rng.usize(0..=self.good_trials.len()) == 0 {
// Select from the first component, if the good KDE is empty or with probability `1 / (n + 1)`.
Some(self.init_kernel.sample(&mut self.rng))
} else {
// Select normally from the good KDE:
Some(
good_kde
.sample(&mut self.rng)
.expect("KDE should return a sample"),
)
}
});
// Clamp them to the bounds:
let valid_candidates = candidates
.map(|parameter| num_traits::clamp(parameter, *self.range.start(), *self.range.end()));
// Filter out tried ones:
let new_candidates = valid_candidates
.filter(|parameter| !self.good_trials.contains(parameter))
.filter(|parameter| !self.bad_trials.contains(parameter));
// Calculate the acquisition function:
let evaluated_candidates = new_candidates
.map(|parameter| {
// Use weighted average of the initial component and KDE:
let init_density = self.init_kernel.density(parameter);
let l = (init_density
+ good_kde.density(parameter)
* K::Output::from_usize(self.good_trials.len()).unwrap())
/ K::Output::from_usize(self.good_trials.len() + 1).unwrap();
assert!(
l >= K::Output::zero(),
"«good» density should not be negative: {l:?}"
);
let g = (init_density
+ bad_kde.density(parameter)
* K::Output::from_usize(self.bad_trials.len()).unwrap())
/ K::Output::from_usize(self.bad_trials.len() + 1).unwrap();
assert!(
g > K::Output::zero(),
"«bad» density should be positive: {g:?}"
);
(parameter, l / g)
})
.take(self.n_candidates);
// Take the best one by the acquisition function value:
evaluated_candidates
.max_by_key(|(_, acquisition)| *acquisition)
.expect("there should be at least one candidate")
.0
}
/// Get the best trial.
pub fn best_trial(&self) -> Option<&Trial<P, M>>
where
P: Ord,
M: Ord,
{
self.good_trials.best().or_else(|| self.bad_trials.best())
}
}