use zer_core::{record::Record, schema::Schema};
use crate::normalize::normalize_digits_only;
use super::BlockingKey;
/// Blocking key derived from the trailing characters of a document
/// identifier stored in a single record field.
pub struct DocumentSuffixKey {
    /// Name of the record field the identifier is read from.
    field: String,
    /// Number of trailing characters of the cleaned value that form the key.
    suffix_len: usize,
}

impl DocumentSuffixKey {
    /// Creates an extractor that reads `field` and keeps the last
    /// `suffix_len` characters of its cleaned value.
    pub fn new(field: &str, suffix_len: usize) -> Self {
        let field = field.to_owned();
        Self { field, suffix_len }
    }
}
impl BlockingKey for DocumentSuffixKey {
    /// Identifier of this blocking key.
    fn name(&self) -> &str {
        "document_suffix"
    }

    /// Returns the last `suffix_len` characters of the configured field after
    /// discarding everything except ASCII letters and digits and upper-casing
    /// what remains.
    ///
    /// No key is produced when:
    /// - `field_as_str` yields no value for the configured field,
    /// - the cleaned value is shorter than `suffix_len`, or
    /// - `suffix_len` is zero.
    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
        // A zero-length suffix would hand the same empty key to every record
        // that has the field, so it carries no discriminating information.
        if self.suffix_len == 0 {
            return Vec::new();
        }

        let value = record.field_as_str(&self.field);
        let raw = match value.as_deref() {
            Some(text) => text,
            None => return Vec::new(),
        };

        // Upper-case while collecting so only one string is allocated.
        let clean: String = raw
            .chars()
            .filter(|c| c.is_ascii_alphanumeric())
            .map(|c| c.to_ascii_uppercase())
            .collect();

        // `clean` is pure ASCII, so byte offsets and character counts agree
        // and slicing at `start` cannot split a character.
        match clean.len().checked_sub(self.suffix_len) {
            Some(start) => vec![clean[start..].to_string()],
            None => Vec::new(),
        }
    }
}
/// Blocking key derived from the trailing characters of the digits-only form
/// of a document identifier stored in a single record field.
pub struct DocumentDigitSuffixKey {
    /// Name of the record field the identifier is read from.
    field: String,
    /// Length of the suffix taken from the digits-only value.
    suffix_len: usize,
}

impl DocumentDigitSuffixKey {
    /// Creates an extractor that reads `field` and keeps the last
    /// `suffix_len` characters of its digits-only form.
    pub fn new(field: &str, suffix_len: usize) -> Self {
        Self {
            field: String::from(field),
            suffix_len,
        }
    }
}
impl BlockingKey for DocumentDigitSuffixKey {
    /// Identifier of this blocking key.
    fn name(&self) -> &str {
        "document_digit_suffix"
    }

    /// Returns the last `suffix_len` characters of the configured field after
    /// it has been passed through `normalize_digits_only`.
    ///
    /// No key is produced when:
    /// - `field_as_str` yields no value for the configured field,
    /// - the normalized value is shorter than `suffix_len`, or
    /// - `suffix_len` is zero.
    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
        // A zero-length suffix would hand the same empty key to every record
        // that has the field, so it carries no discriminating information.
        if self.suffix_len == 0 {
            return Vec::new();
        }

        let value = record.field_as_str(&self.field);
        let raw = match value.as_deref() {
            Some(text) => text,
            None => return Vec::new(),
        };

        let digits = normalize_digits_only(raw);

        // `checked_sub` covers the "too short" case. The checked `get` avoids
        // a panic if `start` is not on a character boundary.
        // NOTE(review): presumably `normalize_digits_only` returns only ASCII
        // digits, in which case `get` always succeeds — confirm.
        digits
            .len()
            .checked_sub(self.suffix_len)
            .and_then(|start| digits.get(start..))
            .map(|suffix| vec![suffix.to_string()])
            .unwrap_or_default()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Field name shared by every fixture in this module.
    const DOC_FIELD: &str = "document_nummer";

    /// Builds a schema containing only the document-number id field.
    fn doc_schema() -> Schema {
        let builder = SchemaBuilder::new().field(DOC_FIELD, FieldKind::Id);
        builder.build().unwrap()
    }

    /// Builds a record whose document-number field holds `doc` as text.
    fn doc_record(id: u64, doc: &str) -> Record {
        let value = FieldValue::Text(doc.into());
        Record::new(id).insert(DOC_FIELD, value)
    }

    #[test]
    fn suffix_key_strips_non_alphanum_and_uppercases() {
        let extractor = DocumentSuffixKey::new(DOC_FIELD, 6);
        let record = doc_record(1, "P-NL-AB123456");

        let produced = extractor.extract(&record, &doc_schema());

        assert_eq!(produced, vec!["123456"]);
    }

    #[test]
    fn suffix_key_same_serial_different_prefix_collide() {
        let extractor = DocumentSuffixKey::new(DOC_FIELD, 6);
        let layout = doc_schema();

        let prefixed = extractor.extract(&doc_record(1, "P-NL-AB123456"), &layout);
        let bare = extractor.extract(&doc_record(2, "AB123456"), &layout);

        assert_eq!(prefixed, bare);
    }

    #[test]
    fn suffix_key_too_short_returns_empty() {
        let extractor = DocumentSuffixKey::new(DOC_FIELD, 6);
        let record = doc_record(1, "AB12");

        let produced = extractor.extract(&record, &doc_schema());

        assert!(produced.is_empty());
    }

    #[test]
    fn suffix_key_missing_field_returns_empty() {
        let extractor = DocumentSuffixKey::new(DOC_FIELD, 6);
        // A record that never had the field inserted.
        let blank = Record::new(1);

        let produced = extractor.extract(&blank, &doc_schema());

        assert!(produced.is_empty());
    }

    #[test]
    fn digit_suffix_strips_all_letters() {
        let extractor = DocumentDigitSuffixKey::new(DOC_FIELD, 4);
        let record = doc_record(1, "BSN-12345678");

        let produced = extractor.extract(&record, &doc_schema());

        assert_eq!(produced, vec!["5678"]);
    }

    #[test]
    fn digit_suffix_same_number_different_format_collide() {
        let extractor = DocumentDigitSuffixKey::new(DOC_FIELD, 6);
        let layout = doc_schema();

        let dashed = extractor.extract(&doc_record(1, "123-45-6789"), &layout);
        let plain = extractor.extract(&doc_record(2, "123456789"), &layout);

        assert_eq!(dashed, plain);
    }
}