1use unicode_normalization::UnicodeNormalization;
2use unicode_segmentation::UnicodeSegmentation;
3
/// Result of [`normalize_text`]: the cleaned string together with a flag
/// recording whether it had to be shortened to fit the caller's byte limit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedText {
    /// The normalized (and possibly truncated) text.
    pub text: String,
    /// True when the text was cut to satisfy the length limit.
    pub truncated: bool,
}

impl NormalizedText {
    /// Reports whether the normalized text was shortened during truncation.
    #[must_use]
    pub fn is_truncated(&self) -> bool {
        let Self { truncated, .. } = self;
        *truncated
    }
}
17
18#[must_use]
21pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
22 let limit = limit.max(1);
23 let normalised = input.nfkc().collect::<String>();
24
25 let mut cleaned = String::with_capacity(normalised.len());
26 let mut last_was_space = false;
27 let mut last_was_newline = false;
28
29 for mut ch in normalised.chars() {
30 if ch == '\r' {
31 ch = '\n';
32 }
33 if ch == '\t' {
34 ch = ' ';
35 }
36 if ch.is_control() && ch != '\n' {
37 continue;
38 }
39 if ch == '\n' {
40 if last_was_newline {
41 continue;
42 }
43 while cleaned.ends_with(' ') {
44 cleaned.pop();
45 }
46 cleaned.push('\n');
47 last_was_newline = true;
48 last_was_space = false;
49 } else if ch.is_whitespace() {
50 if last_was_space || cleaned.ends_with('\n') {
51 continue;
52 }
53 cleaned.push(' ');
54 last_was_space = true;
55 last_was_newline = false;
56 } else {
57 cleaned.push(ch);
58 last_was_space = false;
59 last_was_newline = false;
60 }
61 }
62
63 let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
64 if trimmed.is_empty() {
65 return None;
66 }
67
68 let mut truncated = false;
69 let mut out = String::new();
70 let mut consumed = 0usize;
71
72 for grapheme in trimmed.graphemes(true) {
73 let next = consumed + grapheme.len();
74 if next > limit {
75 truncated = true;
76 break;
77 }
78 out.push_str(grapheme);
79 consumed = next;
80 }
81
82 if out.is_empty() {
83 if let Some(first) = trimmed.graphemes(true).next() {
86 out.push_str(first);
87 truncated = true;
88 }
89 }
90
91 Some(NormalizedText {
92 text: out,
93 truncated,
94 })
95}
96
97#[must_use]
100pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
101 if s.len() <= limit {
102 return s.len();
103 }
104
105 let mut end = 0usize;
106 for (idx, grapheme) in s.grapheme_indices(true) {
107 let next = idx + grapheme.len();
108 if next > limit {
109 break;
110 }
111 end = next;
112 }
113
114 if end == 0 {
115 s.graphemes(true).next().map_or(0, str::len)
116 } else {
117 end
118 }
119}
120
/// Heuristically re-joins words that PDF text extraction split with bogus
/// spaces (e.g. "lo n ger" -> "longer", "man ager" -> "manager").
///
/// The heuristic merges orphaned single letters, short alphabetic fragments
/// that are not common English words, and likely word suffixes, while
/// leaving ordinary prose (common words, long words, non-alphabetic tokens)
/// untouched. Inputs shorter than 3 bytes or containing no space are
/// returned unchanged.
#[must_use]
pub fn fix_pdf_spacing(input: &str) -> String {
    if input.len() < 3 || !input.contains(' ') {
        return input.to_string();
    }

    // Single letters that are legitimate standalone English words.
    const STANDALONE_LETTERS: &[char] = &['a', 'i', 'A', 'I'];

    // Short English words that must never be glued onto a neighbour.
    const COMMON_WORDS: &[&str] = &[
        "a", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in", "is", "it", "me", "my",
        "no", "of", "on", "or", "so", "to", "up", "us", "we", "am", "are", "can", "did", "for",
        "get", "got", "had", "has", "her", "him", "his", "its", "let", "may", "nor", "not", "now",
        "off", "old", "one", "our", "out", "own", "ran", "run", "saw", "say", "see", "set", "she",
        "the", "too", "two", "use", "was", "way", "who", "why", "yet", "you", "all", "and", "any",
        "but", "few", "how", "man", "new", "per", "put", "via",
    ];

    // Case-insensitive membership in the common-word list.
    fn common(w: &str) -> bool {
        let lower = w.to_ascii_lowercase();
        COMMON_WORDS.contains(&lower.as_str())
    }

    // Exactly one of the letters that may stand alone ("a", "I", …).
    fn standalone(w: &str) -> bool {
        w.len() == 1 && w.chars().all(|c| STANDALONE_LETTERS.contains(&c))
    }

    // Non-empty and composed only of alphabetic characters.
    fn all_alpha(w: &str) -> bool {
        !w.is_empty() && w.chars().all(char::is_alphabetic)
    }

    // Count of alphabetic characters in the token.
    fn letters(w: &str) -> usize {
        w.chars().filter(|c| c.is_alphabetic()).count()
    }

    // A lone letter that is NOT a valid standalone word — almost certainly
    // an extraction artifact.
    fn stray_letter(w: &str) -> bool {
        letters(w) == 1 && all_alpha(w) && !standalone(w)
    }

    // 2–3 letters, alphabetic, and not a known common word.
    fn fragment(w: &str) -> bool {
        let n = letters(w);
        (2..=3).contains(&n) && all_alpha(w) && !common(w)
    }

    // Exactly 4 letters and not a common word — plausibly a torn-off suffix.
    fn suffix_like(w: &str) -> bool {
        letters(w) == 4 && all_alpha(w) && !common(w)
    }

    // Decide whether `cur` and `nxt` should start a merged run.
    fn begins_merge(cur: &str, nxt: &str) -> bool {
        if !(all_alpha(cur) && all_alpha(nxt)) {
            return false;
        }

        // A stray letter on either side always glues to its neighbour.
        if stray_letter(cur) || stray_letter(nxt) {
            return true;
        }

        // Fragment followed by another suspicious short piece.
        if fragment(cur) && (fragment(nxt) || stray_letter(nxt) || suffix_like(nxt)) {
            return true;
        }

        // "A va" style: a valid single letter before an uncommon short run.
        if standalone(cur) && letters(nxt) <= 3 && !common(nxt) {
            return true;
        }

        // "man ager" style: short common word before a fragment/suffix.
        common(cur) && letters(cur) <= 3 && (fragment(nxt) || suffix_like(nxt))
    }

    // Decide whether the in-progress merge should absorb the next token.
    fn extends_merge(cur: &str, nxt: &str, saw_fragment: bool) -> bool {
        if !saw_fragment || !all_alpha(nxt) {
            return false;
        }
        match letters(nxt) {
            0..=3 => true,
            4 => !common(nxt) && letters(cur) <= 5,
            _ => false,
        }
    }

    let words: Vec<&str> = input.split_whitespace().collect();
    if words.len() < 2 {
        return input.to_string();
    }

    let mut out: Vec<String> = Vec::with_capacity(words.len());
    let mut i = 0;

    while i < words.len() {
        let cur = words[i];

        match words.get(i + 1).copied() {
            Some(nxt) if begins_merge(cur, nxt) => {
                // Track whether the run contained a fragment/stray letter;
                // only such runs are allowed to keep growing.
                let mut saw_fragment = fragment(cur)
                    || fragment(nxt)
                    || stray_letter(cur)
                    || stray_letter(nxt)
                    || (standalone(cur) && letters(nxt) <= 3);

                let mut merged = format!("{cur}{nxt}");
                i += 2;

                while let Some(&candidate) = words.get(i) {
                    if !extends_merge(&merged, candidate, saw_fragment) {
                        break;
                    }
                    if fragment(candidate) || stray_letter(candidate) {
                        saw_fragment = true;
                    }
                    merged.push_str(candidate);
                    i += 1;
                }

                out.push(merged);
            }
            _ => {
                out.push(cur.to_string());
                i += 1;
            }
        }
    }

    out.join(" ")
}
290
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fixes_pdf_spacing_single_chars() {
        assert_eq!(fix_pdf_spacing("lo n ger"), "longer");
        assert_eq!(fix_pdf_spacing("n o"), "no");

        let fixed = fix_pdf_spacing("rep o rted");
        assert!(
            fixed == "reported" || fixed.contains("rep"),
            "got: {}",
            fixed
        );
    }

    #[test]
    fn fixes_pdf_spacing_preserves_normal_text() {
        assert_eq!(
            fix_pdf_spacing("The manager reported to the supervisor"),
            "The manager reported to the supervisor"
        );
        assert_eq!(
            fix_pdf_spacing("The manager reported"),
            "The manager reported"
        );
        assert_eq!(fix_pdf_spacing("man ager"), "manager");
        assert_eq!(fix_pdf_spacing("I am a person"), "I am a person");

        let fixed = fix_pdf_spacing("older do cuments");
        assert!(fixed.contains("older"), "got: {}", fixed);

        assert_eq!(fix_pdf_spacing("These references"), "These references");
    }

    #[test]
    fn fixes_pdf_spacing_two_letter_fragments() {
        assert_eq!(fix_pdf_spacing("lo ng"), "long");
        assert_eq!(fix_pdf_spacing("to be or"), "to be or");
    }

    #[test]
    fn fixes_pdf_spacing_real_pdf_artifacts() {
        assert_eq!(fix_pdf_spacing("C hlo e"), "Chloe");
        assert_eq!(fix_pdf_spacing("C hlo e Nguyen"), "Chloe Nguyen");
        assert_eq!(fix_pdf_spacing("n o lo n ger"), "nolonger");
    }

    #[test]
    fn fixes_pdf_spacing_fragment_chains() {
        let fixed = fix_pdf_spacing("A va Martin");
        assert!(
            fixed.contains("va") || fixed.contains("Ava"),
            "got: {}",
            fixed
        );

        let fixed = fix_pdf_spacing("emp lo yee");
        assert!(
            fixed == "employee" || fixed.contains("emp"),
            "got: {}",
            fixed
        );
    }

    #[test]
    fn normalises_control_and_whitespace() {
        let raw = " Hello\tWorld \u{000B} test\r\nnext";
        let normalized = normalize_text(raw, 128).expect("normalized");
        assert_eq!(normalized.text, "Hello World test\nnext");
        assert!(!normalized.truncated);
    }

    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        let raw = "a\u{0301}bcd";
        let normalized = normalize_text(raw, 3).expect("normalized");
        assert_eq!(normalized.text, "áb");
        assert!(normalized.truncated);
    }

    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        let s = "🇮🇳hello";
        let idx = truncate_at_grapheme_boundary(s, 4);
        assert!(idx >= 4);
        assert_eq!(&s[..idx], "🇮🇳");
    }
}