1use unicode_normalization::UnicodeNormalization;
2use unicode_segmentation::UnicodeSegmentation;
3
/// Result of [`normalize_text`]: the cleaned string plus a flag recording
/// whether it had to be cut short to honour the caller's byte limit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedText {
    /// The NFKC-normalised, whitespace-collapsed, trimmed text.
    pub text: String,
    /// `true` when the text was truncated at a grapheme boundary.
    pub truncated: bool,
}
10
impl NormalizedText {
    /// Returns `true` when the text was shortened to fit the byte limit.
    #[must_use]
    pub fn is_truncated(&self) -> bool {
        self.truncated
    }
}
17
18#[must_use]
21pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
22 let limit = limit.max(1);
23 let normalised = input.nfkc().collect::<String>();
24
25 let mut cleaned = String::with_capacity(normalised.len());
26 let mut last_was_space = false;
27 let mut last_was_newline = false;
28
29 for mut ch in normalised.chars() {
30 if ch == '\r' {
31 ch = '\n';
32 }
33 if ch == '\t' {
34 ch = ' ';
35 }
36 if ch.is_control() && ch != '\n' {
37 continue;
38 }
39 if ch == '\n' {
40 if last_was_newline {
41 continue;
42 }
43 while cleaned.ends_with(' ') {
44 cleaned.pop();
45 }
46 cleaned.push('\n');
47 last_was_newline = true;
48 last_was_space = false;
49 } else if ch.is_whitespace() {
50 if last_was_space || cleaned.ends_with('\n') {
51 continue;
52 }
53 cleaned.push(' ');
54 last_was_space = true;
55 last_was_newline = false;
56 } else {
57 cleaned.push(ch);
58 last_was_space = false;
59 last_was_newline = false;
60 }
61 }
62
63 let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
64 if trimmed.is_empty() {
65 return None;
66 }
67
68 let mut truncated = false;
69 let mut out = String::new();
70 let mut consumed = 0usize;
71
72 for grapheme in trimmed.graphemes(true) {
73 let next = consumed + grapheme.len();
74 if next > limit {
75 truncated = true;
76 break;
77 }
78 out.push_str(grapheme);
79 consumed = next;
80 }
81
82 if out.is_empty() {
83 if let Some(first) = trimmed.graphemes(true).next() {
86 out.push_str(first);
87 truncated = true;
88 }
89 }
90
91 Some(NormalizedText {
92 text: out,
93 truncated,
94 })
95}
96
97#[must_use]
100pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
101 if s.len() <= limit {
102 return s.len();
103 }
104
105 let mut end = 0usize;
106 for (idx, grapheme) in s.grapheme_indices(true) {
107 let next = idx + grapheme.len();
108 if next > limit {
109 break;
110 }
111 end = next;
112 }
113
114 if end == 0 {
115 s.graphemes(true).next().map(|g| g.len()).unwrap_or(0)
116 } else {
117 end
118 }
119}
120
/// Heuristically repairs words that PDF text extraction split with spurious
/// spaces (e.g. `"lo n ger"` -> `"longer"`, `"man ager"` -> `"manager"`).
///
/// The input is split on whitespace; adjacent purely-alphabetic tokens are
/// merged when they look like fragments of one word (orphan single letters,
/// short non-dictionary fragments, likely suffixes). Common English words and
/// the standalone words "a"/"I" are protected from merging, so ordinary prose
/// passes through unchanged. Whitespace between surviving tokens is
/// normalised to a single space.
#[must_use]
pub fn fix_pdf_spacing(input: &str) -> String {
    // Quick reject: nothing mergeable in very short or space-free input.
    if input.len() < 3 || !input.contains(' ') {
        return input.to_string();
    }

    // Single letters that are legitimate English words and must stay alone.
    const VALID_SINGLE_CHARS: &[char] = &['a', 'i', 'A', 'I'];

    // Short dictionary words that should never be treated as fragments.
    const COMMON_WORDS: &[&str] = &[
        "a", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in", "is", "it", "me", "my",
        "no", "of", "on", "or", "so", "to", "up", "us", "we", "am", "are", "can", "did", "for",
        "get", "got", "had", "has", "her", "him", "his", "its", "let", "may", "nor", "not", "now",
        "off", "old", "one", "our", "out", "own", "ran", "run", "saw", "say", "see", "set", "she",
        "the", "too", "two", "use", "was", "way", "who", "why", "yet", "you", "all", "and", "any",
        "but", "few", "how", "man", "new", "per", "put", "via",
    ];

    // Case-insensitive membership test against COMMON_WORDS.
    fn is_common(w: &str) -> bool {
        let lower = w.to_ascii_lowercase();
        COMMON_WORDS.contains(&lower.as_str())
    }

    // Exactly one character, and that character is a legitimate standalone word.
    fn is_standalone_single(w: &str) -> bool {
        let mut chars = w.chars();
        matches!(
            (chars.next(), chars.next()),
            (Some(c), None) if VALID_SINGLE_CHARS.contains(&c)
        )
    }

    // Non-empty and made up entirely of alphabetic characters.
    fn all_alpha(w: &str) -> bool {
        !w.is_empty() && w.chars().all(char::is_alphabetic)
    }

    // Number of alphabetic characters in the token.
    fn letters(w: &str) -> usize {
        w.chars().filter(|c| c.is_alphabetic()).count()
    }

    // A stray single letter that is not a real standalone word ("n", "C", ...).
    fn is_orphan(w: &str) -> bool {
        all_alpha(w) && letters(w) == 1 && !is_standalone_single(w)
    }

    // A 2-3 letter alphabetic token that is not a dictionary word ("lo", "ger").
    fn is_fragment(w: &str) -> bool {
        let n = letters(w);
        (2..=3).contains(&n) && all_alpha(w) && !is_common(w)
    }

    // A 4-letter alphabetic non-dictionary token; plausibly a word tail ("ager").
    fn is_suffix_like(w: &str) -> bool {
        letters(w) == 4 && all_alpha(w) && !is_common(w)
    }

    // Decides whether `left` and `right` should start a merged word.
    fn starts_merge(left: &str, right: &str) -> bool {
        if !(all_alpha(left) && all_alpha(right)) {
            return false;
        }
        // An orphan letter on either side always glues to its neighbour.
        if is_orphan(left) || is_orphan(right) {
            return true;
        }
        let right_frag = is_fragment(right);
        let right_suffix = is_suffix_like(right);
        // Two fragments, or a fragment followed by a suffix-like tail.
        if is_fragment(left) && (right_frag || right_suffix) {
            return true;
        }
        // "a"/"I" followed by a short non-dictionary token is likely a split word.
        if is_standalone_single(left) && letters(right) <= 3 && !is_common(right) {
            return true;
        }
        // Short dictionary word followed by a fragment/suffix ("man" + "ager").
        is_common(left) && letters(left) <= 3 && (right_frag || right_suffix)
    }

    // Decides whether `next` should be appended to the in-progress merge.
    fn continues_merge(acc: &str, next: &str, saw_fragment: bool) -> bool {
        if !saw_fragment || !all_alpha(next) {
            return false;
        }
        let n = letters(next);
        n <= 3 || (n == 4 && !is_common(next) && letters(acc) <= 5)
    }

    let words: Vec<&str> = input.split_whitespace().collect();
    if words.len() < 2 {
        return input.to_string();
    }

    let mut result: Vec<String> = Vec::with_capacity(words.len());
    let mut idx = 0;

    while idx < words.len() {
        let current = words[idx];

        match words.get(idx + 1) {
            Some(&following) if starts_merge(current, following) => {
                // Track whether the merge involved genuine fragments; only
                // then may further tokens be absorbed.
                let mut saw_fragment = is_fragment(current)
                    || is_fragment(following)
                    || is_orphan(current)
                    || is_orphan(following)
                    || (is_standalone_single(current) && letters(following) <= 3);

                let mut merged = format!("{current}{following}");
                idx += 2;

                // Keep absorbing tokens while they still look like pieces
                // of the same word.
                while let Some(&extra) = words.get(idx) {
                    if !continues_merge(&merged, extra, saw_fragment) {
                        break;
                    }
                    if is_fragment(extra) || is_orphan(extra) {
                        saw_fragment = true;
                    }
                    merged.push_str(extra);
                    idx += 1;
                }

                result.push(merged);
            }
            _ => {
                result.push(current.to_string());
                idx += 1;
            }
        }
    }

    result.join(" ")
}
291
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fixes_pdf_spacing_single_chars() {
        // Orphan single letters glue onto their neighbours.
        assert_eq!(fix_pdf_spacing("lo n ger"), "longer");
        assert_eq!(fix_pdf_spacing("n o"), "no");
        // Heuristic merge: accept either the fully joined word or at least
        // the leading fragment being preserved.
        let result = fix_pdf_spacing("rep o rted");
        assert!(
            result == "reported" || result.contains("rep"),
            "got: {}",
            result
        );
    }

    #[test]
    fn fixes_pdf_spacing_preserves_normal_text() {
        // Ordinary prose made of common/long words must pass through unchanged.
        assert_eq!(
            fix_pdf_spacing("The manager reported to the supervisor"),
            "The manager reported to the supervisor"
        );
        assert_eq!(
            fix_pdf_spacing("The manager reported"),
            "The manager reported"
        );
        // A common short word followed by a suffix-like tail still merges.
        assert_eq!(fix_pdf_spacing("man ager"), "manager");
        // Standalone "I" and "a" next to dictionary words are protected.
        assert_eq!(fix_pdf_spacing("I am a person"), "I am a person");
        // Long words must survive even when a neighbour gets merged.
        let result = fix_pdf_spacing("older do cuments");
        assert!(result.contains("older"), "got: {}", result);
        assert_eq!(fix_pdf_spacing("These references"), "These references");
    }

    #[test]
    fn fixes_pdf_spacing_two_letter_fragments() {
        // Two non-dictionary fragments join; chains of common words do not.
        assert_eq!(fix_pdf_spacing("lo ng"), "long");
        assert_eq!(fix_pdf_spacing("to be or"), "to be or");
    }

    #[test]
    fn fixes_pdf_spacing_real_pdf_artifacts() {
        // Patterns observed in real PDF extractions: scattered letters of a name.
        assert_eq!(fix_pdf_spacing("C hlo e"), "Chloe");
        assert_eq!(fix_pdf_spacing("C hlo e Nguyen"), "Chloe Nguyen");
        // Greedy fragment chaining can over-merge adjacent words; this pins
        // the current (accepted) behaviour.
        assert_eq!(fix_pdf_spacing("n o lo n ger"), "nolonger");
    }

    #[test]
    fn fixes_pdf_spacing_fragment_chains() {
        // Ambiguous cases: accept either a merge or the fragments surviving.
        let result = fix_pdf_spacing("A va Martin");
        assert!(
            result.contains("va") || result.contains("Ava"),
            "got: {}",
            result
        );
        let result = fix_pdf_spacing("emp lo yee");
        assert!(
            result == "employee" || result.contains("emp"),
            "got: {}",
            result
        );
    }

    #[test]
    fn normalises_control_and_whitespace() {
        // Tabs -> spaces, vertical tab dropped, CRLF -> single newline,
        // runs of spaces collapsed, outer whitespace trimmed.
        let input = " Hello\tWorld \u{000B} test\r\nnext";
        let result = normalize_text(input, 128).expect("normalized");
        assert_eq!(result.text, "Hello World test\nnext");
        assert!(!result.truncated);
    }

    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        // "a" + combining acute composes to the 2-byte "á" under NFKC;
        // a 3-byte limit therefore keeps "áb" and reports truncation.
        let input = "a\u{0301}bcd";
        let result = normalize_text(input, 3).expect("normalized");
        assert_eq!(result.text, "áb");
        assert!(result.truncated);
    }

    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        // The flag emoji is one 8-byte grapheme cluster; even with a 4-byte
        // limit the boundary must fall after the whole cluster.
        let s = "🇮🇳hello";
        let idx = truncate_at_grapheme_boundary(s, 4);
        assert!(idx >= 4);
        assert_eq!(&s[..idx], "🇮🇳");
    }
}