// gazefilter_arrsac/lib.rs
1#![no_std]
2
3extern crate alloc;
4use core::cmp::Reverse;
5
6use alloc::{vec, vec::Vec};
7use rand_core::RngCore;
8use sample_consensus::{Consensus, Estimator, Model};
9
/// The ARRSAC algorithm for sample consensus.
///
/// Don't forget to shuffle your input data points to avoid bias before
/// using this consensus process. It will not shuffle your data for you.
/// If you do not shuffle, the output will be biased towards data at the beginning
/// of the inputs.
#[derive(Debug, Clone)]
pub struct Arrsac<R> {
    /// Number of random hypotheses generated during the initialization phase.
    initialization_hypotheses: usize,
    /// Number of leading data blocks used to estimate epsilon and delta.
    initialization_blocks: usize,
    /// Maximum candidate hypotheses retained; effectively halved on each block
    /// (`max_candidate_hypotheses >> block`).
    max_candidate_hypotheses: usize,
    /// Number of estimation attempts run per processed block.
    estimations_per_block: usize,
    /// Number of data points evaluated per block.
    block_size: usize,
    /// SPRT rejection threshold; a model whose likelihood ratio exceeds this is rejected.
    likelihood_ratio_threshold: f32,
    /// Residual threshold separating inliers from outliers of a model.
    inlier_threshold: f64,
    /// Random number generator used to draw sample indices.
    rng: R,
    /// Scratch buffer of drawn sample indices, reused across estimations.
    random_samples: Vec<u32>,
}
28
29impl<R> Arrsac<R>
30where
31 R: RngCore,
32{
33 /// `rng` should have the same properties you would want for a Monte Carlo simulation.
34 /// It should generate random numbers quickly without having any discernable patterns.
35 ///
36 /// The `inlier_threshold` is the one parameter that is always specific to your dataset.
37 /// This must be set to the threshold in which a data point's residual is considered an inlier.
38 /// Some of the other parameters may need to be configured based on the amount of data,
39 /// such as `block_size`, `likelihood_ratio_threshold`, and `block_size`. However,
40 /// `inlier_threshold` has to be set based on the residual function used with the model.
41 ///
42 /// `initial_epsilon` must be higher than `initial_delta`. If you modify these values,
43 /// you need to make sure that within one `block_size` the `likelihood_ratio_threshold`
44 /// can be reached and a model can be rejected. Basically, make sure that
45 /// `((1.0 - delta) / (1.0 - epsilon))^block_size >>> likelihood_ratio_threshold`.
46 /// This must be done to ensure outlier models are rejected during the initial generation
47 /// phase, which only processes `block_size` datapoints.
48 ///
49 /// `initial_epsilon` should also be as large as you can set it where it is still relatively
50 /// pessimistic. This is so that we can more easily reject a model early in the process
51 /// to compute an updated value for delta during the adaptive process. This may not be possible
52 /// and will depend on your data.
53 pub fn new(inlier_threshold: f64, rng: R) -> Self {
54 Self {
55 initialization_hypotheses: 256,
56 initialization_blocks: 4,
57 max_candidate_hypotheses: 64,
58 estimations_per_block: 64,
59 block_size: 64,
60 likelihood_ratio_threshold: 1e3,
61 inlier_threshold,
62 rng,
63 random_samples: vec![],
64 }
65 }
66
67 /// Number of models generated in the initial step when epsilon and delta are being estimated.
68 ///
69 /// Default: `256`
70 #[must_use]
71 pub fn initialization_hypotheses(self, initialization_hypotheses: usize) -> Self {
72 Self {
73 initialization_hypotheses,
74 ..self
75 }
76 }
77
78 /// Number of data blocks used to compute the initial estimate of delta and epsilon
79 /// before proceeding with regular block processing. This is used instead of
80 /// an initial epsilon and delta, which were suggested by the paper.
81 ///
82 /// Default: `4`
83 #[must_use]
84 pub fn initialization_blocks(self, initialization_blocks: usize) -> Self {
85 Self {
86 initialization_blocks,
87 ..self
88 }
89 }
90
91 /// Maximum number of best hypotheses to retain during block processing
92 ///
93 /// This number is halved on each block such that on block `n` the number of
94 /// hypotheses retained is `max_candidate_hypotheses >> n`.
95 ///
96 /// Default: `64`
97 #[must_use]
98 pub fn max_candidate_hypotheses(self, max_candidate_hypotheses: usize) -> Self {
99 Self {
100 max_candidate_hypotheses,
101 ..self
102 }
103 }
104
105 /// Number of estmations (may generate multiple hypotheses) that will be ran
106 /// for each block of data evaluated
107 ///
108 /// Default: `64`
109 #[must_use]
110 pub fn estimations_per_block(self, estimations_per_block: usize) -> Self {
111 Self {
112 estimations_per_block,
113 ..self
114 }
115 }
116
117 /// Number of data points evaluated before more hypotheses are generated
118 ///
119 /// Default: `64`
120 #[must_use]
121 pub fn block_size(self, block_size: usize) -> Self {
122 Self { block_size, ..self }
123 }
124
125 /// Once a model reaches this level of unlikelihood, it is rejected. Set this
126 /// higher to make it less restrictive, usually at the cost of more execution time.
127 ///
128 /// Increasing this will make it more likely to find a good result.
129 ///
130 /// Decreasing this will speed up execution.
131 ///
132 /// This ratio is not exposed as a parameter in the original paper, but is instead computed
133 /// recursively for a few iterations. It is roughly equivalent to the **reciprocal** of the
134 /// **probability of rejecting a good model**. You can use that to control the probability
135 /// that a good model is rejected.
136 ///
137 /// Default: `1e3`
138 #[must_use]
139 pub fn likelihood_ratio_threshold(self, likelihood_ratio_threshold: f32) -> Self {
140 Self {
141 likelihood_ratio_threshold,
142 ..self
143 }
144 }
145
146 /// Residual threshold for determining if a data point is an inlier or an outlier of a model
147 #[must_use]
148 pub fn inlier_threshold(self, inlier_threshold: f64) -> Self {
149 Self {
150 inlier_threshold,
151 ..self
152 }
153 }
154
155 /// Adapted from algorithm 3 from "A Comparative Analysis of RANSAC Techniques Leading to Adaptive
156 /// Real-Time Random Sample Consensus", but it was effectively rewritten to avoid the need for
157 /// initial epsilon and delta.
158 ///
159 /// Returns the initial models (and their num inliers) sorted by decreasing inliers
160 /// and `delta` in that order.
161 fn initial_hypotheses<E, Data>(
162 &mut self,
163 estimator: &E,
164 data: impl Iterator<Item = Data> + Clone,
165 ) -> (Vec<(E::Model, usize)>, f32)
166 where
167 E: Estimator<Data>,
168 {
169 assert!(
170 self.initialization_blocks > 0,
171 "ARRSAC must have at least 1 initialization block"
172 );
173 // NOTE: This whole function is different than that specified in the ARRSAC paper.
174 // The reason is that you needed to provide a good initial guess for epsilon and delta
175 // otherwise it could lead to delta exceeding epsilon or situations where models could no
176 // longer be rejected or were almost always rejected. This solution is an imperfect compomise
177 // that assumes that delta will be roughly equal to the inlier ratio of the worst model generated,
178 // which both assumes that the worst model is an outlier and that it is actually representative of the
179 // population. The assumption of epsilon also assumes that the best model is an inlier, but is an
180 // otherwise good initial guess. The other caveat with this approach is that a sufficiently large
181 // set of initial datapoints is required to be able to accurately determine epsilon and delta.
182 // Therefore a new paremeter is added to separate the normal blocks from the initial generation set.
183 let mut hypotheses = vec![];
184 // We don't want more than `block_size` data points to be used to evaluate models initially.
185 let initial_datapoints = core::cmp::min(
186 self.initialization_blocks * self.block_size,
187 data.clone().count(),
188 );
189 // Generate the initial batch of random hypotheses and count their inliers and outliers.
190 for _ in 0..self.initialization_hypotheses {
191 for model in self.generate_random_hypotheses(estimator, data.clone()) {
192 let inliers = self.count_inliers(data.clone().take(initial_datapoints), &model);
193 hypotheses.push((model, inliers));
194 }
195 }
196
197 // Bail early when no hypothesis was found.
198 // This will cause execution to terminate.
199 if hypotheses.is_empty() {
200 return (hypotheses, 0.0);
201 }
202
203 // Sort the hypotheses by their inliers.
204 hypotheses.sort_unstable_by_key(|&(_, inliers)| Reverse(inliers));
205
206 // Compute epsilon and delta using the best and worst model generated.
207 let epsilon = hypotheses
208 .first()
209 .map(|&(_, inliers)| inliers as f32 / initial_datapoints as f32)
210 .unwrap_or_default();
211 let delta = hypotheses
212 .last()
213 .map(|&(_, inliers)| if inliers < E::MIN_SAMPLES {E::MIN_SAMPLES} else {inliers} as f32 / initial_datapoints as f32)
214 .unwrap_or_default();
215
216 if epsilon < delta {
217 // If epsilon is less than delta, then better hypotheses will get rejected and worse accepted,
218 // which is counter to what we want. In this case, we had a bad initialization, so clear the hypotheses.
219 // This will cause execution to terminate.
220 hypotheses.clear();
221 return (hypotheses, delta);
222 }
223
224 // Populate hypotheses with hypotheses generated from the inliers of the best hypothesis.
225 // This will use the initialization datapoints and filter with SPRT.
226 self.populate_hypotheses_sprt(
227 estimator,
228 &mut hypotheses,
229 delta,
230 data,
231 initial_datapoints,
232 self.initialization_hypotheses,
233 );
234
235 // Sort the hypotheses by their inliers.
236 hypotheses.sort_unstable_by_key(|&(_, inliers)| Reverse(inliers));
237
238 // Filter down the hypotheses to just the best ones.
239 hypotheses.truncate(self.max_candidate_hypotheses >> (self.initialization_blocks - 1));
240
241 (hypotheses, delta)
242 }
243
244 /// Populates `self.random_samples` using a len.
245 fn populate_samples(&mut self, num: usize, len: usize) {
246 // We can generate no hypotheses if the amout of data is too low.
247 if len < num {
248 panic!("cannot use arrsac without having enough samples");
249 }
250 let len = len as u32;
251 // Threshold generation below adapted from randomize::RandRangeU32.
252 let threshold = len.wrapping_neg() % len;
253 self.random_samples.clear();
254 for _ in 0..num {
255 loop {
256 let mul = u64::from(self.rng.next_u32()).wrapping_mul(u64::from(len));
257 if mul as u32 >= threshold {
258 let s = (mul >> 32) as u32;
259 if !self.random_samples.contains(&s) {
260 self.random_samples.push(s);
261 break;
262 }
263 }
264 }
265 }
266 }
267
268 fn populate_hypotheses_sprt<E, Data>(
269 &mut self,
270 estimator: &E,
271 hypotheses: &mut Vec<(E::Model, usize)>,
272 delta: f32,
273 data: impl Iterator<Item = Data> + Clone,
274 num_checked: usize,
275 num_hypotheses: usize,
276 ) where
277 E: Estimator<Data>,
278 {
279 // Update epsilon using the best model.
280 // Since epsilon can only increase and delta is fixed, we can be sure that these ratios
281 // will still be valid (epsilon > delta).
282 let epsilon = hypotheses[0].1 as f32 / num_checked as f32;
283 // Create the likelihood ratios for inliers and outliers.
284 let positive_likelihood_ratio = delta / epsilon;
285 let negative_likelihood_ratio = (1.0 - delta) / (1.0 - epsilon);
286 // Generate the list of inliers for the best model.
287 let mut inliers = self.inliers(data.clone().take(num_checked), &hypotheses[0].0);
288 if inliers.len() <= E::MIN_SAMPLES {
289 // If we don't have enough samples to generate more models, then we should expand the inliers to
290 // the entire dataset.
291 inliers = self.inliers(data.clone().take(num_checked), &hypotheses[0].0);
292 }
293 // We generate hypotheses until we reach the initial num hypotheses.
294 // We can't count the number generated because it could generate 0 hypotheses
295 // and then the loop would continue indefinitely.
296 let mut random_hypotheses = Vec::new();
297 for _ in 0..num_hypotheses {
298 random_hypotheses.extend(self.generate_random_hypotheses_subset(
299 estimator,
300 data.clone(),
301 &inliers,
302 ));
303 for model in random_hypotheses.drain(..) {
304 if let Some(inliers) = self.asprt(
305 data.clone().take(num_checked),
306 &model,
307 positive_likelihood_ratio,
308 negative_likelihood_ratio,
309 E::MIN_SAMPLES,
310 ) {
311 hypotheses.push((model, inliers));
312 }
313 }
314 }
315 }
316
317 /// Generates as many hypotheses as one call to `Estimator::estimate()` returns from all data.
318 fn generate_random_hypotheses<E, Data>(
319 &mut self,
320 estimator: &E,
321 data: impl Iterator<Item = Data> + Clone,
322 ) -> E::ModelIter
323 where
324 E: Estimator<Data>,
325 {
326 self.populate_samples(E::MIN_SAMPLES, data.clone().count());
327 estimator.estimate(
328 self.random_samples
329 .iter()
330 .map(|&ix| data.clone().nth(ix as usize).unwrap()),
331 )
332 }
333
334 /// Generates as many hypotheses as one call to `Estimator::estimate()` returns from a subset of the data.
335 fn generate_random_hypotheses_subset<E, Data>(
336 &mut self,
337 estimator: &E,
338 data: impl Iterator<Item = Data> + Clone,
339 subset: &[usize],
340 ) -> E::ModelIter
341 where
342 E: Estimator<Data>,
343 {
344 self.populate_samples(E::MIN_SAMPLES, subset.len());
345 estimator.estimate(
346 core::mem::take(&mut self.random_samples)
347 .iter()
348 .map(|&ix| data.clone().nth(subset[ix as usize]).unwrap()),
349 )
350 }
351
352 /// Algorithm 1 in "Randomized RANSAC with Sequential Probability Ratio Test".
353 ///
354 /// This tests if a model is accepted. Returns `Some(inliers)` if accepted or `None` if rejected.
355 ///
356 /// `inlier_threshold` - The model residual error threshold between inliers and outliers
357 /// `positive_likelihood_ratio` - `δ / ε`
358 /// `negative_likelihood_ratio` - `(1 - δ) / (1 - ε)`
359 fn asprt<Data, M: Model<Data>>(
360 &self,
361 data: impl Iterator<Item = Data>,
362 model: &M,
363 positive_likelihood_ratio: f32,
364 negative_likelihood_ratio: f32,
365 minimum_samples: usize,
366 ) -> Option<usize> {
367 let mut likelihood_ratio = 1.0;
368 let mut inliers = 0;
369 for data in data {
370 likelihood_ratio *= if model.residual(&data) < self.inlier_threshold {
371 inliers += 1;
372 positive_likelihood_ratio
373 } else {
374 negative_likelihood_ratio
375 };
376
377 if likelihood_ratio > self.likelihood_ratio_threshold || likelihood_ratio.is_nan() {
378 return None;
379 }
380 }
381
382 (inliers >= minimum_samples).then_some(inliers)
383 }
384
385 /// Determines the number of inliers a model has.
386 fn count_inliers<Data, M: Model<Data>>(
387 &self,
388 data: impl Iterator<Item = Data>,
389 model: &M,
390 ) -> usize {
391 data.filter(|data| model.residual(data) < self.inlier_threshold)
392 .count()
393 }
394
395 /// Gets indices of inliers for a model.
396 fn inliers<Data, M: Model<Data>>(
397 &self,
398 data: impl Iterator<Item = Data>,
399 model: &M,
400 ) -> Vec<usize> {
401 data.enumerate()
402 .filter(|(_, data)| model.residual(data) < self.inlier_threshold)
403 .map(|(ix, _)| ix)
404 .collect()
405 }
406}
407
408impl<E, R, Data> Consensus<E, Data> for Arrsac<R>
409where
410 E: Estimator<Data>,
411 R: RngCore,
412{
413 type Inliers = Vec<usize>;
414
415 fn model<I>(&mut self, estimator: &E, data: I) -> Option<E::Model>
416 where
417 I: Iterator<Item = Data> + Clone,
418 {
419 self.model_inliers(estimator, data).map(|(model, _)| model)
420 }
421
422 fn model_inliers<I>(&mut self, estimator: &E, data: I) -> Option<(E::Model, Self::Inliers)>
423 where
424 I: Iterator<Item = Data> + Clone,
425 {
426 // Don't do anything if we don't have enough data.
427 if data.clone().count() < E::MIN_SAMPLES {
428 return None;
429 }
430 // Generate the initial set of hypotheses. This also gets us an estimate of delta.
431 let (mut hypotheses, delta) = self.initial_hypotheses(estimator, data.clone());
432
433 // If there are no initial hypotheses then initialization failed, so exit early.
434 if hypotheses.is_empty() {
435 return None;
436 }
437
438 // Gradually increase how many datapoints we are evaluating until we evaluate them all.
439 // This starts at the first block that was not evaluated in initial_hypotheses.
440 'outer: for block in self.initialization_blocks.. {
441 let samples_up_to_beginning_of_block = block * self.block_size;
442 let samples_up_to_end_of_block = samples_up_to_beginning_of_block + self.block_size;
443 // Score hypotheses with samples.
444 for sample in samples_up_to_beginning_of_block..samples_up_to_end_of_block {
445 // Score the hypotheses with the new datapoint.
446 let new_datapoint = if let Some(datapoint) = data.clone().nth(sample) {
447 datapoint
448 } else {
449 // We reached the last datapoint, so break out of the outer loop.
450 break 'outer;
451 };
452 for (hypothesis, inlier_count) in hypotheses.iter_mut() {
453 if hypothesis.residual(&new_datapoint) < self.inlier_threshold {
454 *inlier_count += 1;
455 }
456 }
457 }
458 // Sort the hypotheses by their inliers to find the best.
459 hypotheses.sort_unstable_by_key(|&(_, inliers)| Reverse(inliers));
460 // Populate hypotheses with hypotheses that pass SPRT.
461 self.populate_hypotheses_sprt(
462 estimator,
463 &mut hypotheses,
464 delta,
465 data.clone(),
466 samples_up_to_end_of_block,
467 self.estimations_per_block,
468 );
469 // This will retain at least half of the hypotheses each time
470 // and gradually decrease as the number of samples we are evaluating increases.
471 // NOTE:
472 // The paper says to use a peculiar formula that just results in doing
473 // this basic right shift below, but as written it contained some apparent errors in
474 // where it was ran. This seems to be the correct location to do this.
475 hypotheses.sort_unstable_by_key(|&(_, inliers)| Reverse(inliers));
476 hypotheses.truncate(self.max_candidate_hypotheses >> block);
477 if hypotheses.len() <= 1 {
478 break 'outer;
479 }
480 }
481 hypotheses
482 .into_iter()
483 .max_by_key(|&(_, inliers)| inliers)
484 .map(|(model, _)| {
485 let inliers = self.inliers(data.clone(), &model);
486 (model, inliers)
487 })
488 }
489}