// edgeparse_core/utils/text_normalizer.rs
use unicode_normalization::UnicodeNormalization;

8pub fn normalize_pdf_text(text: &str) -> String {
11 let mut result = String::with_capacity(text.len());
12 for ch in text.chars() {
13 if let Some(replacement) = decompose_ligature(ch) {
14 result.push_str(replacement);
15 } else if is_ignorable(ch) {
16 } else {
18 result.push(ch);
19 }
20 }
21 result.nfc().collect()
23}
/// Returns the plain-letter expansion of a ligature or digraph character.
///
/// Covers the Latin presentation-form ligatures U+FB00..=U+FB06 as well as
/// the IJ, OE and AE letters in both cases. Any other character yields
/// `None`, meaning "no replacement".
fn decompose_ligature(ch: char) -> Option<&'static str> {
    match ch {
        '\u{FB00}' => Some("ff"),
        '\u{FB01}' => Some("fi"),
        '\u{FB02}' => Some("fl"),
        '\u{FB03}' => Some("ffi"),
        '\u{FB04}' => Some("ffl"),
        // U+FB05 (long s + t) and U+FB06 (s + t) both expand to "st".
        '\u{FB05}' | '\u{FB06}' => Some("st"),
        '\u{0132}' => Some("IJ"),
        '\u{0133}' => Some("ij"),
        '\u{0152}' => Some("OE"),
        '\u{0153}' => Some("oe"),
        '\u{00C6}' => Some("AE"),
        '\u{00E6}' => Some("ae"),
        _ => None,
    }
}
/// Reports whether `ch` is an invisible character that `normalize_pdf_text`
/// drops from its output.
///
/// The set is fixed; every other character returns `false`.
fn is_ignorable(ch: char) -> bool {
    matches!(
        ch,
        '\u{00AD}'   // soft hyphen
        | '\u{200B}' // zero width space
        | '\u{200C}' // zero width non-joiner
        | '\u{200D}' // zero width joiner
        | '\u{FEFF}' // zero width no-break space / byte order mark
        | '\u{2060}' // word joiner
        | '\u{FFFC}' // object replacement character
    )
}
/// Collapses every run of whitespace into a single ASCII space and trims
/// leading and trailing whitespace.
///
/// "Whitespace" is the Unicode `White_Space` property, the same definition
/// `char::is_whitespace` uses, so tabs, newlines and no-break spaces all
/// count. An empty or all-whitespace input produces an empty string.
pub fn collapse_whitespace(text: &str) -> String {
    // The output can never be longer than the input.
    let mut result = String::with_capacity(text.len());
    // `split_whitespace` yields only the non-empty words between whitespace
    // runs, so trimming and collapsing both fall out of re-joining them.
    for word in text.split_whitespace() {
        if !result.is_empty() {
            result.push(' ');
        }
        result.push_str(word);
    }
    result
}
81pub fn strip_diacritics(text: &str) -> String {
84 text.nfd()
85 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
86 .nfc()
87 .collect()
88}
/// Replaces typographic punctuation and spaces with plain ASCII equivalents.
///
/// | Input                                   | Output  |
/// |-----------------------------------------|---------|
/// | single quotes U+2018..=U+201B           | `'`     |
/// | double quotes U+201C..=U+201F           | `"`     |
/// | en dash U+2013                          | `-`     |
/// | em dash U+2014                          | `--`    |
/// | horizontal ellipsis U+2026              | `...`   |
/// | no-break space U+00A0                   | space   |
/// | en / em / thin space U+2002, 2003, 2009 | space   |
///
/// Every other character is copied through unchanged.
pub fn normalize_typography(text: &str) -> String {
    // The input length is a starting estimate; em dashes and ellipses expand.
    let mut result = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => result.push('\''),
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => result.push('"'),
            '\u{2013}' => result.push('-'),
            '\u{2014}' => result.push_str("--"),
            '\u{2026}' => result.push_str("..."),
            '\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2009}' => result.push(' '),
            _ => result.push(ch),
        }
    }
    result
}
108pub fn full_normalize(text: &str) -> String {
110 let step1 = normalize_pdf_text(text);
111 let step2 = normalize_typography(&step1);
112 collapse_whitespace(&step2)
113}
/// Unit tests for the text normalization helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    /// Presentation-form ligatures expand to their component letters.
    #[test]
    fn test_ligature_decomposition() {
        assert_eq!(normalize_pdf_text("e\u{FB03}cient"), "efficient");
        assert_eq!(normalize_pdf_text("\u{FB01}le"), "file");
        assert_eq!(normalize_pdf_text("\u{FB02}oor"), "floor");
        assert_eq!(normalize_pdf_text("\u{FB00}ect"), "ffect");
        assert_eq!(normalize_pdf_text("\u{FB04}e"), "ffle");
    }

    /// Soft hyphens (U+00AD) are removed without leaving a gap.
    #[test]
    fn test_soft_hyphen_removal() {
        assert_eq!(normalize_pdf_text("con\u{00AD}tin\u{00AD}ue"), "continue");
    }

    /// Zero-width characters are removed; ordinary spaces are kept.
    #[test]
    fn test_zero_width_removal() {
        assert_eq!(
            normalize_pdf_text("he\u{200B}llo\u{FEFF} world"),
            "hello world"
        );
    }

    /// A base letter plus combining acute accent is recomposed by NFC.
    #[test]
    fn test_nfc_after_decomposition() {
        assert_eq!(normalize_pdf_text("e\u{0301}"), "é");
    }

    /// Whitespace runs collapse to one space and the ends are trimmed.
    #[test]
    fn test_collapse_whitespace() {
        assert_eq!(collapse_whitespace(" hello world "), "hello world");
        assert_eq!(collapse_whitespace("a\n\n b\tc"), "a b c");
        assert_eq!(collapse_whitespace(""), "");
        assert_eq!(collapse_whitespace(" "), "");
    }

    /// Text with nothing to normalize passes through unchanged.
    #[test]
    fn test_plain_text_unchanged() {
        assert_eq!(normalize_pdf_text("Hello World"), "Hello World");
    }

    /// Accented letters are reduced to their base letters.
    #[test]
    fn test_strip_diacritics() {
        assert_eq!(strip_diacritics("café"), "cafe");
        assert_eq!(strip_diacritics("über"), "uber");
        assert_eq!(strip_diacritics("naïve"), "naive");
        assert_eq!(strip_diacritics("résumé"), "resume");
    }

    /// Typographic quotes, dashes and ellipses map to ASCII.
    #[test]
    fn test_normalize_typography() {
        assert_eq!(normalize_typography("\u{201C}hello\u{201D}"), "\"hello\"");
        assert_eq!(normalize_typography("it\u{2019}s"), "it's");
        assert_eq!(normalize_typography("a\u{2013}b"), "a-b");
        assert_eq!(normalize_typography("wait\u{2026}"), "wait...");
    }

    /// The full pipeline applies all three stages in sequence.
    #[test]
    fn test_full_normalize() {
        let input = "\u{FB01}le\u{00AD}name\u{200B} \u{201C}test\u{201D} \u{2026}";
        let result = full_normalize(input);
        assert_eq!(result, "filename \"test\" ...");
    }
}