use std::sync::OnceLock;
use regex::Regex;
/// Removes stray spaces and tabs that OCR tends to inject into e-mail addresses.
///
/// Two passes run in sequence:
///
/// 1. Spaces/tabs directly before and/or after an `@` are dropped whenever a
///    non-whitespace character sits on both sides of the gap.
/// 2. Spaces/tabs that follow a `.` at the end of a whitespace-free run
///    containing `@` are dropped, re-joining a split domain such as
///    `example. invalid`.
///
/// Only spaces and tabs are ever removed; newlines are never bridged. The
/// result is always returned as an owned `String`, even when nothing changed.
pub(crate) fn normalize_ocr_artifacts(text: &str) -> String {
    // Pass 1: close the gap around the `@` separator.
    let separator_fixed = email_separator_regex().replace_all(text, "$pre@$post");

    // Pass 2: close the gap after a dot in the part following the `@`. This
    // must see the output of pass 1, because its pattern requires the `@` to
    // already be attached to the surrounding characters.
    let domain_fixed = email_domain_dot_regex().replace_all(&separator_fixed, "$pre$post");

    domain_fixed.into_owned()
}
/// Returns the process-wide pattern for an `@` with optional spaces/tabs
/// around it.
///
/// The pattern captures the single non-whitespace character before the gap as
/// `pre` and the single non-whitespace character after it as `post`. Because
/// the whitespace on both sides is optional, it also matches an `@` that is
/// already tightly attached.
///
/// The regex is compiled on first use and cached for later calls.
///
/// # Panics
///
/// Panics on first use if the hard-coded pattern fails to compile.
fn email_separator_regex() -> &'static Regex {
    const PATTERN: &str = r"(?P<pre>\S)[ \t]*@[ \t]*(?P<post>\S)";
    static SEPARATOR: OnceLock<Regex> = OnceLock::new();

    SEPARATOR.get_or_init(|| {
        let compiled = Regex::new(PATTERN);
        compiled.expect("static email-separator pattern compiles")
    })
}
/// Returns the process-wide pattern for a space/tab gap that follows a dot
/// after an `@`.
///
/// `pre` captures a whitespace-free run that contains `@` and ends in `.`;
/// `post` captures the first non-whitespace character after one or more
/// spaces/tabs.
///
/// NOTE(review): the pattern cannot tell a split domain from a sentence-ending
/// period, so `a@b.com. Thanks` also matches (and would be joined by a caller
/// that replaces with `$pre$post`). Once a match consumes `post`, the rest of
/// that address no longer contains `@`, so only the first gap per address is
/// matched in a single pass.
///
/// The regex is compiled on first use and cached for later calls.
///
/// # Panics
///
/// Panics on first use if the hard-coded pattern fails to compile.
fn email_domain_dot_regex() -> &'static Regex {
    const PATTERN: &str = r"(?P<pre>\S+@\S+\.)[ \t]+(?P<post>\S)";
    static DOMAIN_DOT: OnceLock<Regex> = OnceLock::new();

    DOMAIN_DOT.get_or_init(|| {
        let compiled = Regex::new(PATTERN);
        compiled.expect("static email-domain-dot pattern compiles")
    })
}
#[cfg(test)]
mod tests {
    //! Behavioral tests for `normalize_ocr_artifacts`.

    use super::*;

    /// A space between the local part and `@` is removed.
    #[test]
    fn collapses_space_before_at() {
        assert_eq!(
            normalize_ocr_artifacts("Email: jane.doe @example.invalid"),
            "Email: jane.doe@example.invalid"
        );
    }

    /// A space between `@` and the domain is removed.
    #[test]
    fn collapses_space_after_at() {
        assert_eq!(
            normalize_ocr_artifacts("Email: jane.doe@ example.invalid"),
            "Email: jane.doe@example.invalid"
        );
    }

    /// Spaces on both sides of `@` are removed together.
    #[test]
    fn collapses_spaces_around_at() {
        assert_eq!(
            normalize_ocr_artifacts("Email: jane.doe @ example.invalid"),
            "Email: jane.doe@example.invalid"
        );
    }

    /// Tabs are treated the same as spaces.
    #[test]
    fn collapses_tabs_around_at() {
        assert_eq!(
            normalize_ocr_artifacts("jane.doe\t@\texample.invalid"),
            "jane.doe@example.invalid"
        );
    }

    /// A space after a dot in the domain is removed.
    #[test]
    fn collapses_space_after_domain_dot() {
        assert_eq!(
            normalize_ocr_artifacts("Email: jane.doe@example. invalid"),
            "Email: jane.doe@example.invalid"
        );
    }

    /// Input without artifacts comes back unchanged.
    #[test]
    fn already_clean_passthrough() {
        let s = "Email: jane.doe@example.invalid\nPhone: +1-555-0142";
        assert_eq!(normalize_ocr_artifacts(s), s);
    }

    /// Empty input yields an empty string.
    #[test]
    fn empty_input_passthrough() {
        assert_eq!(normalize_ocr_artifacts(""), "");
    }

    /// Newlines next to `@` are not bridged; only spaces and tabs are removed.
    #[test]
    fn leaves_newline_adjacent_at_untouched() {
        let s = "alpha\n@\nbeta";
        assert_eq!(normalize_ocr_artifacts(s), s);
    }

    /// The normalizer cannot tell a social-style `@handle` from a split e-mail
    /// address, so a word followed by ` @handle` is joined as well. This test
    /// pins that known behavior. (It was previously named
    /// `leaves_leading_at_handle_untouched`, which contradicted its assertion.)
    #[test]
    fn collapses_space_before_at_handle() {
        let s = "ping @handle now";
        assert_eq!(normalize_ocr_artifacts(s), "ping@handle now");
    }

    /// Every occurrence in the input is normalized, not just the first.
    #[test]
    fn handles_multiple_occurrences() {
        let s = "a @b and c@ d and e @ f";
        assert_eq!(normalize_ocr_artifacts(s), "a@b and c@d and e@f");
    }

    /// Whitespace that is not adjacent to an `@` is left alone.
    #[test]
    fn preserves_non_at_whitespace() {
        let s = "Bill to: Jane Doe \nPhone: +1 555 0142";
        assert_eq!(normalize_ocr_artifacts(s), s);
    }
}