Skip to main content

zer_blocking/keys/
document.rs

1use zer_core::{record::Record, schema::Schema};
2
3use crate::normalize::normalize_digits_only;
4use super::BlockingKey;
5
6// ── DocumentSuffixKey ─────────────────────────────────────────────────────────
7
/// Blocking key built from the trailing characters of a document number.
///
/// The field value is reduced to its ASCII alphanumeric characters,
/// uppercased, and the last `suffix_len` characters are emitted as the key.
/// Values with fewer than `suffix_len` characters left after cleaning
/// produce no key.
///
/// Useful for matching passport or ID numbers that may be entered with
/// different prefix conventions or formatting (e.g. "P-NL-AB123456" vs
/// "AB123456"), while the suffix (serial part) stays stable.
///
/// Key format: `"SUFFIX"` (uppercase, ASCII alphanumeric only)
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentSuffixKey {
    /// Name of the record field that holds the document number.
    field: String,
    /// Number of trailing characters of the cleaned value kept as the key.
    suffix_len: usize,
}

impl DocumentSuffixKey {
    /// Creates a key that reads `field` and keeps its last `suffix_len`
    /// cleaned characters.
    ///
    /// `suffix_len = 6` is a reasonable default for European ID numbers.
    pub fn new(field: &str, suffix_len: usize) -> Self {
        Self { field: field.into(), suffix_len }
    }
}
27
28impl BlockingKey for DocumentSuffixKey {
29    fn name(&self) -> &str { "document_suffix" }
30
31    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
32        let cow = record.field_as_str(&self.field);
33        let raw = match cow.as_deref() {
34            Some(s) => s,
35            None    => return vec![],
36        };
37        let clean: String = raw
38            .chars()
39            .filter(|c| c.is_ascii_alphanumeric())
40            .collect::<String>()
41            .to_ascii_uppercase();
42        if clean.len() < self.suffix_len {
43            return vec![];
44        }
45        let suffix = &clean[clean.len() - self.suffix_len..];
46        vec![suffix.to_string()]
47    }
48}
49
50// ── DocumentDigitSuffixKey ────────────────────────────────────────────────────
51
/// Variant that strips ALL non-digit characters before taking the suffix.
///
/// Intended for purely numeric document identifiers (BSN, fiscal numbers)
/// where alphabetic characters are noise or country-code prefixes.
/// Values with fewer than `suffix_len` digits produce no key.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentDigitSuffixKey {
    /// Name of the record field that holds the document number.
    field: String,
    /// Number of trailing digits kept as the key.
    suffix_len: usize,
}

impl DocumentDigitSuffixKey {
    /// Creates a key that reads `field` and keeps its last `suffix_len` digits.
    pub fn new(field: &str, suffix_len: usize) -> Self {
        Self { field: field.into(), suffix_len }
    }
}
66
67impl BlockingKey for DocumentDigitSuffixKey {
68    fn name(&self) -> &str { "document_digit_suffix" }
69
70    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
71        let cow = record.field_as_str(&self.field);
72        let raw = match cow.as_deref() {
73            Some(s) => s,
74            None    => return vec![],
75        };
76        let digits = normalize_digits_only(raw);
77        if digits.len() < self.suffix_len {
78            return vec![];
79        }
80        let suffix = &digits[digits.len() - self.suffix_len..];
81        vec![suffix.to_string()]
82    }
83}
84
85// ── Tests ─────────────────────────────────────────────────────────────────────
86
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{record::FieldValue, schema::{FieldKind, SchemaBuilder}};

    /// Minimal schema containing only the document-number field.
    fn schema() -> Schema {
        SchemaBuilder::new()
            .field("document_nummer", FieldKind::Id)
            .build()
            .unwrap()
    }

    /// Record with the given id and a textual document number.
    fn rec(id: u64, doc: &str) -> Record {
        Record::new(id).insert("document_nummer", FieldValue::Text(doc.into()))
    }

    // ── DocumentSuffixKey

    #[test]
    fn suffix_key_strips_non_alphanum() {
        let schema = schema();
        let key    = DocumentSuffixKey::new("document_nummer", 6);
        let r      = rec(1, "P-NL-AB123456");
        let keys   = key.extract(&r, &schema);
        assert_eq!(keys, vec!["123456"]);
    }

    #[test]
    fn suffix_key_strips_non_alphanum_and_uppercases() {
        let schema = schema();
        // An 8-character suffix reaches into the letters, so lowercase input
        // actually exercises the uppercasing step.
        let key    = DocumentSuffixKey::new("document_nummer", 8);
        let r      = rec(1, "p-nl-ab123456");
        let keys   = key.extract(&r, &schema);
        assert_eq!(keys, vec!["AB123456"]);
    }

    #[test]
    fn suffix_key_same_serial_different_prefix_collide() {
        let schema = schema();
        let key    = DocumentSuffixKey::new("document_nummer", 6);

        let r1 = rec(1, "P-NL-AB123456");
        let r2 = rec(2, "AB123456");
        assert_eq!(key.extract(&r1, &schema), key.extract(&r2, &schema));
    }

    #[test]
    fn suffix_key_too_short_returns_empty() {
        let schema = schema();
        let key    = DocumentSuffixKey::new("document_nummer", 6);
        let r      = rec(1, "AB12"); // only 4 chars after stripping
        assert!(key.extract(&r, &schema).is_empty());
    }

    #[test]
    fn suffix_key_missing_field_returns_empty() {
        let schema = schema();
        let key    = DocumentSuffixKey::new("document_nummer", 6);
        assert!(key.extract(&Record::new(1), &schema).is_empty());
    }

    // ── DocumentDigitSuffixKey

    #[test]
    fn digit_suffix_strips_all_letters() {
        let schema = schema();
        let key    = DocumentDigitSuffixKey::new("document_nummer", 4);
        let r      = rec(1, "BSN-12345678");
        let keys   = key.extract(&r, &schema);
        assert_eq!(keys, vec!["5678"]);
    }

    #[test]
    fn digit_suffix_same_number_different_format_collide() {
        let schema = schema();
        let key    = DocumentDigitSuffixKey::new("document_nummer", 6);

        let r1 = rec(1, "123-45-6789");
        let r2 = rec(2, "123456789");
        assert_eq!(key.extract(&r1, &schema), key.extract(&r2, &schema));
    }

    #[test]
    fn digit_suffix_too_short_returns_empty() {
        let schema = schema();
        let key    = DocumentDigitSuffixKey::new("document_nummer", 4);
        let r      = rec(1, "BSN-12"); // only 2 digits after stripping
        assert!(key.extract(&r, &schema).is_empty());
    }

    #[test]
    fn digit_suffix_missing_field_returns_empty() {
        let schema = schema();
        let key    = DocumentDigitSuffixKey::new("document_nummer", 4);
        assert!(key.extract(&Record::new(1), &schema).is_empty());
    }
}