hotcoco 0.4.0

Rust implementation of pycocotools — COCO dataset API for object detection, segmentation, and keypoint evaluation
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
use std::collections::{HashMap, HashSet};

use rand::Rng;
use rand::SeedableRng;
use rand::rngs::SmallRng;
use rayon::prelude::*;
use serde::Serialize;

use super::COCOeval;
use super::accumulate::accumulate_impl;
use super::summarize::{MetricDef, build_metric_defs, per_cat_ap_static, summarize_impl};

/// Options for pairwise model comparison.
#[derive(Debug, Clone)]
pub struct CompareOpts {
    /// Number of bootstrap samples for confidence intervals. 0 = no bootstrap.
    pub n_bootstrap: usize,
    /// Random seed for reproducibility.
    pub seed: u64,
    /// Confidence level for bootstrap intervals (e.g. 0.95 for 95% CI).
    pub confidence: f64,
}

impl Default for CompareOpts {
    fn default() -> Self {
        Self {
            n_bootstrap: 0,
            seed: 42,
            confidence: 0.95,
        }
    }
}

/// Bootstrap confidence interval for a metric difference.
#[derive(Debug, Clone, Serialize)]
pub struct BootstrapCI {
    /// Lower bound of the confidence interval.
    pub lower: f64,
    /// Upper bound of the confidence interval.
    pub upper: f64,
    /// Confidence level (e.g. 0.95).
    pub confidence: f64,
    /// Fraction of bootstrap samples where delta > 0.
    pub prob_positive: f64,
    /// Standard error of the delta across bootstrap samples.
    pub std_err: f64,
}

/// Per-category AP comparison entry.
#[derive(Debug, Clone, Serialize)]
pub struct CategoryDelta {
    /// COCO category ID.
    pub cat_id: u64,
    /// Human-readable category name.
    pub cat_name: String,
    /// AP for model A.
    pub ap_a: f64,
    /// AP for model B.
    pub ap_b: f64,
    /// Delta (B - A).
    pub delta: f64,
}

/// Full pairwise comparison result.
#[derive(Debug, Clone, Serialize)]
pub struct ComparisonResult {
    /// Metric names in canonical display order (from the evaluation mode's MetricDef vec).
    pub metric_keys: Vec<String>,
    /// All summary metrics for model A.
    pub metrics_a: HashMap<String, f64>,
    /// All summary metrics for model B.
    pub metrics_b: HashMap<String, f64>,
    /// Per-metric delta (B - A).
    pub deltas: HashMap<String, f64>,
    /// Bootstrap CIs on summary metric deltas. `None` if bootstrap disabled.
    pub ci: Option<HashMap<String, BootstrapCI>>,
    /// Per-category AP comparison, sorted by delta ascending (worst regressions first).
    pub per_category: Vec<CategoryDelta>,
    /// Number of bootstrap samples used (0 if disabled).
    pub n_bootstrap: usize,
    /// Number of shared images in the comparison.
    pub num_images: usize,
}

/// Compare two evaluations on the same dataset.
///
/// Both evaluators must have had [`evaluate()`](COCOeval::evaluate) called and must
/// use the same `eval_mode` and `iou_type`. Accumulation and summarization are
/// performed internally on the shared image set — callers do not need to call
/// `accumulate()` or `summarize()` first.
///
/// When `opts.n_bootstrap > 0`, bootstrap confidence intervals are computed on
/// the summary metric deltas by resampling images with replacement and
/// re-accumulating for each sample. This is parallelized with rayon.
pub fn compare(
    eval_a: &COCOeval,
    eval_b: &COCOeval,
    opts: &CompareOpts,
) -> crate::error::Result<ComparisonResult> {
    // --- Validation ---
    if eval_a.eval_imgs.is_empty() {
        return Err("evaluate() must be called on eval_a before compare()".into());
    }
    if eval_b.eval_imgs.is_empty() {
        return Err("evaluate() must be called on eval_b before compare()".into());
    }
    if eval_a.eval_mode != eval_b.eval_mode {
        return Err(format!(
            "eval_mode mismatch: {:?} vs {:?}",
            eval_a.eval_mode, eval_b.eval_mode
        )
        .into());
    }
    if eval_a.params.iou_type != eval_b.params.iou_type {
        return Err(format!(
            "iou_type mismatch: {:?} vs {:?}",
            eval_a.params.iou_type, eval_b.params.iou_type
        )
        .into());
    }
    if opts.confidence <= 0.0 || opts.confidence >= 1.0 {
        return Err(format!("confidence must be in (0, 1), got {}", opts.confidence).into());
    }

    // --- Shared image set ---
    let imgs_a: HashSet<u64> = eval_a.params.img_ids.iter().copied().collect();
    let imgs_b: HashSet<u64> = eval_b.params.img_ids.iter().copied().collect();
    let shared_set: HashSet<u64> = imgs_a.intersection(&imgs_b).copied().collect();
    if shared_set.is_empty() {
        return Err("no shared images between eval_a and eval_b".into());
    }
    let num_images = shared_set.len();
    let shared_sorted: Vec<u64> = {
        let mut v: Vec<u64> = shared_set.iter().copied().collect();
        v.sort_unstable();
        v
    };

    // --- Accumulate + summarize on shared images ---
    let metrics = build_metric_defs(&eval_a.params, eval_a.eval_mode);
    let metric_keys: Vec<&str> = metrics.iter().map(|m| m.name).collect();

    let acc_a = accumulate_impl(&eval_a.eval_imgs, &eval_a.params, Some(&shared_set));
    let stats_a = summarize_impl(
        &acc_a,
        &eval_a.params,
        eval_a.eval_mode,
        &eval_a.freq_groups,
        &metrics,
    );

    let acc_b = accumulate_impl(&eval_b.eval_imgs, &eval_b.params, Some(&shared_set));
    let stats_b = summarize_impl(
        &acc_b,
        &eval_b.params,
        eval_b.eval_mode,
        &eval_b.freq_groups,
        &metrics,
    );

    // --- Metric maps ---
    let metrics_a = stats_to_map(&metric_keys, &stats_a);
    let metrics_b = stats_to_map(&metric_keys, &stats_b);

    let deltas: HashMap<String, f64> = metric_keys
        .iter()
        .zip(stats_a.iter().zip(stats_b.iter()))
        .map(|(&k, (&a, &b))| {
            let d = if a >= 0.0 && b >= 0.0 { b - a } else { 0.0 };
            (k.to_string(), d)
        })
        .collect();

    // --- Per-category AP ---
    let per_cat_a = per_cat_ap_static(&acc_a, &eval_a.params);
    let per_cat_b = per_cat_ap_static(&acc_b, &eval_b.params);

    let mut per_category: Vec<CategoryDelta> = eval_a
        .params
        .cat_ids
        .iter()
        .enumerate()
        .filter_map(|(i, &cat_id)| {
            let ap_a = per_cat_a.get(i).copied().unwrap_or(-1.0);
            let ap_b = per_cat_b.get(i).copied().unwrap_or(-1.0);
            // Skip categories with no data in either model
            if ap_a < 0.0 && ap_b < 0.0 {
                return None;
            }
            let cat_name = eval_a
                .coco_gt
                .get_cat(cat_id)
                .map_or_else(|| cat_id.to_string(), |c| c.name.clone());
            let delta = match (ap_a >= 0.0, ap_b >= 0.0) {
                (true, true) => ap_b - ap_a,
                (false, true) => ap_b,
                (true, false) => -ap_a,
                (false, false) => unreachable!("both < 0 filtered above"),
            };
            Some(CategoryDelta {
                cat_id,
                cat_name,
                ap_a: if ap_a >= 0.0 { ap_a } else { -1.0 },
                ap_b: if ap_b >= 0.0 { ap_b } else { -1.0 },
                delta,
            })
        })
        .collect();

    // Sort by delta ascending (worst regressions first)
    per_category.sort_by(|a, b| {
        a.delta
            .partial_cmp(&b.delta)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // --- Bootstrap ---
    let ci = if opts.n_bootstrap > 0 {
        Some(bootstrap_compare(
            eval_a,
            eval_b,
            &shared_sorted,
            opts,
            &metrics,
            &metric_keys,
        ))
    } else {
        None
    };

    Ok(ComparisonResult {
        metric_keys: metric_keys.iter().map(|&k| k.to_string()).collect(),
        metrics_a,
        metrics_b,
        deltas,
        ci,
        per_category,
        n_bootstrap: opts.n_bootstrap,
        num_images,
    })
}

/// Run bootstrap resampling to compute confidence intervals on metric deltas.
///
/// For each sample, draws `shared_img_ids.len()` image IDs with replacement,
/// deduplicates into a HashSet, and re-accumulates both evaluators on that
/// subset. The deduplication means each sample contains ~63% of the original
/// images — this is standard practice for detection evaluation bootstrap since
/// `accumulate_impl` treats images as present/absent (not weighted).
fn bootstrap_compare(
    eval_a: &COCOeval,
    eval_b: &COCOeval,
    shared_img_ids: &[u64],
    opts: &CompareOpts,
    metrics: &[MetricDef],
    metric_keys: &[&str],
) -> HashMap<String, BootstrapCI> {
    let n = shared_img_ids.len();

    // Each bootstrap sample produces a delta vector
    let all_deltas: Vec<Vec<f64>> = (0..opts.n_bootstrap)
        .into_par_iter()
        .map(|i| {
            let mut rng = SmallRng::seed_from_u64(opts.seed.wrapping_add(i as u64));
            let sample: HashSet<u64> = (0..n)
                .map(|_| shared_img_ids[rng.random_range(0..n)])
                .collect();

            let acc_a = accumulate_impl(&eval_a.eval_imgs, &eval_a.params, Some(&sample));
            let stats_a = summarize_impl(
                &acc_a,
                &eval_a.params,
                eval_a.eval_mode,
                &eval_a.freq_groups,
                metrics,
            );

            let acc_b = accumulate_impl(&eval_b.eval_imgs, &eval_b.params, Some(&sample));
            let stats_b = summarize_impl(
                &acc_b,
                &eval_b.params,
                eval_b.eval_mode,
                &eval_b.freq_groups,
                metrics,
            );

            stats_a
                .iter()
                .zip(stats_b.iter())
                .map(|(&a, &b)| if a >= 0.0 && b >= 0.0 { b - a } else { 0.0 })
                .collect()
        })
        .collect();

    // Compute CIs from the sampled deltas
    let alpha = 1.0 - opts.confidence;
    let nb = opts.n_bootstrap;

    metric_keys
        .iter()
        .enumerate()
        .map(|(m, &name)| {
            let mut samples: Vec<f64> = all_deltas.iter().map(|d| d[m]).collect();
            samples.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

            let lo_idx = ((alpha / 2.0) * nb as f64).floor() as usize;
            let hi_idx = ((1.0 - alpha / 2.0) * nb as f64).ceil() as usize;

            let lower = samples[lo_idx.min(nb - 1)];
            let upper = samples[hi_idx.min(nb - 1)];

            let pos_count = samples.iter().filter(|&&x| x > 0.0).count();
            let prob_positive = pos_count as f64 / nb as f64;

            let mean: f64 = samples.iter().sum::<f64>() / nb as f64;
            let variance = if nb > 1 {
                samples.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (nb - 1) as f64
            } else {
                0.0
            };

            (
                name.to_string(),
                BootstrapCI {
                    lower,
                    upper,
                    confidence: opts.confidence,
                    prob_positive,
                    std_err: variance.sqrt(),
                },
            )
        })
        .collect()
}

fn stats_to_map(metric_keys: &[&str], stats: &[f64]) -> HashMap<String, f64> {
    metric_keys
        .iter()
        .zip(stats.iter())
        .map(|(&k, &v)| (k.to_string(), v))
        .collect()
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::COCO;
    use crate::params::IouType;
    use std::path::PathBuf;

    fn fixtures_dir() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures")
    }

    fn make_eval() -> COCOeval {
        let gt = COCO::new(&fixtures_dir().join("gt.json")).unwrap();
        let dt = gt.load_res(&fixtures_dir().join("dt.json")).unwrap();
        let mut ev = COCOeval::new(gt, dt, IouType::Bbox);
        ev.evaluate();
        ev
    }

    #[test]
    fn test_compare_identical_models() {
        let ev_a = make_eval();
        let ev_b = make_eval();
        let result = compare(&ev_a, &ev_b, &CompareOpts::default()).unwrap();

        assert_eq!(result.num_images, ev_a.params.img_ids.len());
        for (key, delta) in &result.deltas {
            assert!(
                delta.abs() < 1e-10,
                "expected zero delta for {key}, got {delta}"
            );
        }
        assert!(result.ci.is_none());
    }

    #[test]
    fn test_compare_with_bootstrap() {
        let ev_a = make_eval();
        let ev_b = make_eval();
        let opts = CompareOpts {
            n_bootstrap: 50,
            seed: 42,
            confidence: 0.95,
        };
        let result = compare(&ev_a, &ev_b, &opts).unwrap();

        assert!(result.ci.is_some());
        let ci = result.ci.as_ref().unwrap();
        assert!(!ci.is_empty());

        // For identical models, CIs should be tight around zero
        for (key, boot_ci) in ci {
            assert!(
                boot_ci.lower.abs() < 0.1 && boot_ci.upper.abs() < 0.1,
                "expected tight CI for {key}, got [{}, {}]",
                boot_ci.lower,
                boot_ci.upper
            );
            assert_eq!(boot_ci.confidence, 0.95);
        }
    }

    #[test]
    fn test_bootstrap_seed_reproducibility() {
        let ev_a = make_eval();
        let ev_b = make_eval();
        let opts = CompareOpts {
            n_bootstrap: 20,
            seed: 123,
            confidence: 0.95,
        };

        let r1 = compare(&ev_a, &ev_b, &opts).unwrap();
        let r2 = compare(&ev_a, &ev_b, &opts).unwrap();

        let ci1 = r1.ci.as_ref().unwrap();
        let ci2 = r2.ci.as_ref().unwrap();
        for key in ci1.keys() {
            assert_eq!(
                ci1[key].lower, ci2[key].lower,
                "CI lower mismatch for {key}"
            );
            assert_eq!(
                ci1[key].upper, ci2[key].upper,
                "CI upper mismatch for {key}"
            );
        }
    }

    #[test]
    fn test_compare_before_evaluate_errors() {
        let gt = COCO::new(&fixtures_dir().join("gt.json")).unwrap();
        let dt = gt.load_res(&fixtures_dir().join("dt.json")).unwrap();
        let ev_a = COCOeval::new(gt, dt, IouType::Bbox);
        // ev_a has not called evaluate(), so compare should fail
        let gt2 = COCO::new(&fixtures_dir().join("gt.json")).unwrap();
        let dt2 = gt2.load_res(&fixtures_dir().join("dt.json")).unwrap();
        let ev_b = COCOeval::new(gt2, dt2, IouType::Bbox);

        let err = compare(&ev_a, &ev_b, &CompareOpts::default()).unwrap_err();
        assert!(err.to_string().contains("evaluate()"));
    }

    #[test]
    fn test_compare_per_category() {
        let ev_a = make_eval();
        let ev_b = make_eval();
        let result = compare(&ev_a, &ev_b, &CompareOpts::default()).unwrap();

        // Per-category deltas should all be zero for identical models
        for cat in &result.per_category {
            assert!(
                cat.delta.abs() < 1e-10,
                "expected zero delta for {}, got {}",
                cat.cat_name,
                cat.delta
            );
            assert!(cat.ap_a >= 0.0 || cat.ap_b >= 0.0);
        }
    }

    #[test]
    fn test_compare_invalid_confidence() {
        let ev_a = make_eval();
        let ev_b = make_eval();
        let opts = CompareOpts {
            confidence: 1.5,
            ..Default::default()
        };
        let err = compare(&ev_a, &ev_b, &opts).unwrap_err();
        assert!(err.to_string().contains("confidence"));
    }
}