Skip to main content

edgeparse_core/utils/
text_normalizer.rs

//! Text normalization for PDF-extracted text.
//!
//! Handles ligature decomposition, soft-hyphen removal, zero-width
//! character stripping, and whitespace normalization.
5
6use unicode_normalization::UnicodeNormalization;
7
8/// Normalize extracted PDF text: decompose ligatures, strip zero-width
9/// characters, remove soft hyphens, collapse whitespace, and apply NFC.
10pub fn normalize_pdf_text(text: &str) -> String {
11    let mut result = String::with_capacity(text.len());
12    for ch in text.chars() {
13        if let Some(replacement) = decompose_ligature(ch) {
14            result.push_str(replacement);
15        } else if is_ignorable(ch) {
16            // Skip zero-width and soft-hyphen characters
17        } else {
18            result.push(ch);
19        }
20    }
21    // NFC normalize after ligature decomposition
22    result.nfc().collect()
23}
24
/// Map a typographic ligature to the plain letters it stands for.
///
/// Returns `Some(letters)` for the ligatures listed below and `None` for
/// every other character.
fn decompose_ligature(ch: char) -> Option<&'static str> {
    let letters = match ch {
        // Latin f-ligatures.
        '\u{FB00}' => "ff",
        '\u{FB01}' => "fi",
        '\u{FB02}' => "fl",
        '\u{FB03}' => "ffi",
        '\u{FB04}' => "ffl",
        // Both the "long s + t" and the "st" ligature become plain "st".
        '\u{FB05}' | '\u{FB06}' => "st",
        // Dutch IJ / ij.
        '\u{0132}' => "IJ",
        '\u{0133}' => "ij",
        // OE / oe ligature.
        '\u{0152}' => "OE",
        '\u{0153}' => "oe",
        // AE / ae (Æ / æ).
        '\u{00C6}' => "AE",
        '\u{00E6}' => "ae",
        _ => return None,
    };
    Some(letters)
}
44
/// Report whether `ch` is one of the invisible characters that are dropped
/// from extracted text.
fn is_ignorable(ch: char) -> bool {
    const IGNORABLE: [char; 7] = [
        '\u{00AD}', // soft hyphen
        '\u{200B}', // zero-width space
        '\u{200C}', // zero-width non-joiner
        '\u{200D}', // zero-width joiner
        '\u{FEFF}', // byte-order mark / zero-width no-break space
        '\u{2060}', // word joiner
        '\u{FFFC}', // object replacement character
    ];
    IGNORABLE.contains(&ch)
}
58
/// Squeeze every run of whitespace down to a single ASCII space and drop
/// leading and trailing whitespace.
///
/// Whitespace is whatever `char::is_whitespace` accepts, so tabs, newlines
/// and Unicode spaces are all treated alike.
pub fn collapse_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut words = text.split_whitespace();
    // The first word is written bare; each later word is preceded by one space.
    if let Some(first) = words.next() {
        out.push_str(first);
        for word in words {
            out.push(' ');
            out.push_str(word);
        }
    }
    out
}
80
81/// Remove all diacritical marks from text (NFC → NFD → strip combining marks → NFC).
82/// Useful for accent-insensitive searching.
83pub fn strip_diacritics(text: &str) -> String {
84    text.nfd()
85        .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
86        .nfc()
87        .collect()
88}
89
/// Replace typographic punctuation and special spaces with ASCII stand-ins.
///
/// - single curly quotes (U+2018..=U+201B) become `'`
/// - double curly quotes (U+201C..=U+201F) become `"`
/// - en dash becomes `-`, em dash becomes `--`
/// - horizontal ellipsis becomes `...`
/// - non-breaking, en, em and thin spaces become a plain space
///
/// All other characters are copied through unchanged.
pub fn normalize_typography(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        let ascii = match ch {
            '\u{2018}'..='\u{201B}' => "'",
            '\u{201C}'..='\u{201F}' => "\"",
            '\u{2013}' => "-",   // en dash
            '\u{2014}' => "--",  // em dash
            '\u{2026}' => "...", // ellipsis
            // non-breaking, en, em and thin space
            '\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2009}' => " ",
            _ => {
                out.push(ch);
                continue;
            }
        };
        out.push_str(ascii);
    }
    out
}
107
108/// Full normalization pipeline: ligatures + ignorables + typography + whitespace + NFC.
109pub fn full_normalize(text: &str) -> String {
110    let step1 = normalize_pdf_text(text);
111    let step2 = normalize_typography(&step1);
112    collapse_whitespace(&step2)
113}
114
#[cfg(test)]
mod tests {
    use super::*;

    /// Presentation-form f-ligatures expand to their component letters.
    #[test]
    fn test_ligature_decomposition() {
        let cases = [
            ("e\u{FB03}cient", "efficient"),
            ("\u{FB01}le", "file"),
            ("\u{FB02}oor", "floor"),
            ("\u{FB00}ect", "ffect"),
            ("\u{FB04}e", "ffle"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(normalize_pdf_text(input), expected);
        }
    }

    /// Soft hyphens vanish without leaving a gap.
    #[test]
    fn test_soft_hyphen_removal() {
        let cleaned = normalize_pdf_text("con\u{00AD}tin\u{00AD}ue");
        assert_eq!(cleaned, "continue");
    }

    /// Zero-width characters are dropped; real spaces survive.
    #[test]
    fn test_zero_width_removal() {
        let cleaned = normalize_pdf_text("he\u{200B}llo\u{FEFF} world");
        assert_eq!(cleaned, "hello world");
    }

    /// A base letter followed by a combining accent is composed by NFC.
    #[test]
    fn test_nfc_after_decomposition() {
        let composed = normalize_pdf_text("e\u{0301}");
        assert_eq!(composed, "é");
    }

    /// Whitespace runs collapse to one space; the ends are trimmed.
    #[test]
    fn test_collapse_whitespace() {
        let cases = [
            ("  hello   world  ", "hello world"),
            ("a\n\n  b\tc", "a b c"),
            ("", ""),
            ("   ", ""),
        ];
        for &(input, expected) in &cases {
            assert_eq!(collapse_whitespace(input), expected);
        }
    }

    /// Plain ASCII text passes through untouched.
    #[test]
    fn test_plain_text_unchanged() {
        let plain = "Hello World";
        assert_eq!(normalize_pdf_text(plain), plain);
    }

    /// Accented letters lose their marks.
    #[test]
    fn test_strip_diacritics() {
        let cases = [
            ("café", "cafe"),
            ("über", "uber"),
            ("naïve", "naive"),
            ("résumé", "resume"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(strip_diacritics(input), expected);
        }
    }

    /// Curly quotes, dashes and ellipses map to ASCII.
    #[test]
    fn test_normalize_typography() {
        let cases = [
            ("\u{201C}hello\u{201D}", "\"hello\""),
            ("it\u{2019}s", "it's"),
            ("a\u{2013}b", "a-b"),
            ("wait\u{2026}", "wait..."),
        ];
        for &(input, expected) in &cases {
            assert_eq!(normalize_typography(input), expected);
        }
    }

    /// The full pipeline applies every stage in sequence.
    #[test]
    fn test_full_normalize() {
        let raw = "\u{FB01}le\u{00AD}name\u{200B} \u{201C}test\u{201D}  \u{2026}";
        assert_eq!(full_normalize(raw), "filename \"test\" ...");
    }
}