1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
//! Dedupe invariants and false-positive sanity tests.
//!
//! The crawler's frontier uses a growable bloom filter backed by a bounded
//! exact-recent set. These tests lock the contract:
//! * first insert of a key returns true (newly seen);
//! * second insert of the same key returns false (dupe);
//! * fp rate stays close to configured when exercised at scale.
use crawlex::frontier::Dedupe;
#[test]
fn first_insert_is_new_second_is_dupe() {
let d = Dedupe::new(1000, 0.001);
assert!(d.insert_if_new("https://example.com/a"));
assert!(!d.insert_if_new("https://example.com/a"));
}
#[test]
fn distinct_keys_are_all_new() {
let d = Dedupe::new(1000, 0.001);
for i in 0..500 {
let key = format!("https://example.com/p/{i}");
assert!(d.insert_if_new(&key), "key {i} unexpectedly marked dupe");
}
}
#[test]
fn exact_recent_wipe_does_not_corrupt_bloom() {
// exact_cap is 100_000. Fill past it so the wipe happens, then verify
// keys inserted before the wipe are still deduped by the bloom.
let d = Dedupe::new(200_000, 0.01);
let known = "https://pre-wipe.example/zero";
assert!(d.insert_if_new(known));
for i in 0..100_500 {
let _ = d.insert_if_new(&format!("https://bulk.example/{i}"));
}
// The known key survived in the bloom even though exact_recent was
// cleared — second attempt must still be dupe.
assert!(!d.insert_if_new(known), "bloom lost a pre-wipe key");
}
#[test]
fn false_positive_rate_stays_reasonable() {
// With fp_rate 0.01 and expected 10_000 we insert 10_000 distinct keys
// then query 10_000 different ones and assert <2% false positive.
// Loose bound so the test stays stable; the real fp rate should sit
// below 1% but we don't want flaky CI.
let d = Dedupe::new(10_000, 0.01);
for i in 0..10_000 {
let _ = d.insert_if_new(&format!("A-{i}"));
}
let mut fps = 0usize;
for i in 0..10_000 {
// Insert B-i which is different from all A-i entries; a `false`
// return means the bloom incorrectly thought we'd seen it.
if !d.insert_if_new(&format!("B-{i}")) {
fps += 1;
}
}
let rate = fps as f64 / 10_000.0;
assert!(
rate < 0.02,
"fp rate {:.4} exceeded 2% ceiling (raw {})",
rate,
fps
);
}