//! Text normalisation and PDF-spacing repair utilities (memvid_core/text.rs).
1use unicode_normalization::UnicodeNormalization;
2use unicode_segmentation::UnicodeSegmentation;
3
4/// Normalised text with truncation metadata.
5#[derive(Debug, Clone, PartialEq, Eq)]
6pub struct NormalizedText {
7    pub text: String,
8    pub truncated: bool,
9}
10
11impl NormalizedText {
12    #[must_use]
13    pub fn is_truncated(&self) -> bool {
14        self.truncated
15    }
16}
17
18/// Normalise text (NFKC), strip control characters, compact whitespace and
19/// truncate at grapheme boundaries.
20#[must_use]
21pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
22    let limit = limit.max(1);
23    let normalised = input.nfkc().collect::<String>();
24
25    let mut cleaned = String::with_capacity(normalised.len());
26    let mut last_was_space = false;
27    let mut last_was_newline = false;
28
29    for mut ch in normalised.chars() {
30        if ch == '\r' {
31            ch = '\n';
32        }
33        if ch == '\t' {
34            ch = ' ';
35        }
36        if ch.is_control() && ch != '\n' {
37            continue;
38        }
39        if ch == '\n' {
40            if last_was_newline {
41                continue;
42            }
43            while cleaned.ends_with(' ') {
44                cleaned.pop();
45            }
46            cleaned.push('\n');
47            last_was_newline = true;
48            last_was_space = false;
49        } else if ch.is_whitespace() {
50            if last_was_space || cleaned.ends_with('\n') {
51                continue;
52            }
53            cleaned.push(' ');
54            last_was_space = true;
55            last_was_newline = false;
56        } else {
57            cleaned.push(ch);
58            last_was_space = false;
59            last_was_newline = false;
60        }
61    }
62
63    let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
64    if trimmed.is_empty() {
65        return None;
66    }
67
68    let mut truncated = false;
69    let mut out = String::new();
70    let mut consumed = 0usize;
71
72    for grapheme in trimmed.graphemes(true) {
73        let next = consumed + grapheme.len();
74        if next > limit {
75            truncated = true;
76            break;
77        }
78        out.push_str(grapheme);
79        consumed = next;
80    }
81
82    if out.is_empty() {
83        // Fallback: include the first grapheme even if it exceeds the limit so
84        // we never return empty text for non-empty input.
85        if let Some(first) = trimmed.graphemes(true).next() {
86            out.push_str(first);
87            truncated = true;
88        }
89    }
90
91    Some(NormalizedText {
92        text: out,
93        truncated,
94    })
95}
96
97/// Return the byte index at which a string should be truncated while
98/// preserving grapheme boundaries.
99#[must_use]
100pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
101    if s.len() <= limit {
102        return s.len();
103    }
104
105    let mut end = 0usize;
106    for (idx, grapheme) in s.grapheme_indices(true) {
107        let next = idx + grapheme.len();
108        if next > limit {
109            break;
110        }
111        end = next;
112    }
113
114    if end == 0 {
115        s.graphemes(true).next().map_or(0, str::len)
116    } else {
117        end
118    }
119}
120
/// Fix spurious character-level spacing from PDF extraction.
///
/// Some PDF extractors produce text like "man ager" instead of "manager"
/// or "sup erviso r" instead of "supervisor". This function detects and
/// fixes these patterns.
///
/// Strategy: Detect short fragment runs that likely represent a single word
/// (e.g. "emp lo yee") and join them while preserving normal text.
///
/// NOTE(review): the word lists and length thresholds below are tuned for
/// English text; behaviour on other languages is best-effort.
#[must_use]
pub fn fix_pdf_spacing(input: &str) -> String {
    // Fast path: if input has no spaces or is very short, return as-is
    if input.len() < 3 || !input.contains(' ') {
        return input.to_string();
    }

    // Single-char words that are valid English and should NOT be joined
    const VALID_SINGLE_CHARS: &[char] = &['a', 'i', 'A', 'I'];

    // Common short words that should NOT be joined with neighbors
    const COMMON_WORDS: &[&str] = &[
        "a", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in", "is", "it", "me", "my",
        "no", "of", "on", "or", "so", "to", "up", "us", "we", "am", "are", "can", "did", "for",
        "get", "got", "had", "has", "her", "him", "his", "its", "let", "may", "nor", "not", "now",
        "off", "old", "one", "our", "out", "own", "ran", "run", "saw", "say", "see", "set", "she",
        "the", "too", "two", "use", "was", "way", "who", "why", "yet", "you", "all", "and", "any",
        "but", "few", "how", "man", "new", "per", "put", "via",
    ];

    // True when `s` is (case-insensitively) one of the common short words above.
    fn is_common_word(s: &str) -> bool {
        let lower = s.to_ascii_lowercase();
        COMMON_WORDS.contains(&lower.as_str())
    }

    // True for a standalone "a"/"i" in either case — valid one-letter words.
    fn is_valid_single_char(s: &str) -> bool {
        s.len() == 1
            && s.chars()
                .next()
                .is_some_and(|c| VALID_SINGLE_CHARS.contains(&c))
    }

    // True when every char is alphabetic (empty strings are rejected).
    fn is_purely_alpha(s: &str) -> bool {
        !s.is_empty() && s.chars().all(char::is_alphabetic)
    }

    // Number of alphabetic chars in `s`.
    fn alpha_len(s: &str) -> usize {
        s.chars().filter(|c| c.is_alphabetic()).count()
    }

    // A lone letter that is not a valid standalone word ("n", "x", ...).
    fn is_orphan(word: &str) -> bool {
        alpha_len(word) == 1 && is_purely_alpha(word) && !is_valid_single_char(word)
    }

    fn is_short_fragment(word: &str) -> bool {
        let len = alpha_len(word);
        // 2-3 letter non-common words are definitely fragments
        (2..=3).contains(&len) && is_purely_alpha(word) && !is_common_word(word)
    }

    fn is_likely_suffix(word: &str) -> bool {
        let len = alpha_len(word);
        // 4-letter non-common words that look like word suffixes
        // e.g., "ager" from "manager", "ment" from "engagement"
        len == 4 && is_purely_alpha(word) && !is_common_word(word)
    }

    // Decide whether `word` should begin a merge run with `next`.
    // Rules 1-5 below are ordered from strongest to weakest signal.
    fn should_start_merge(word: &str, next: &str) -> bool {
        if !is_purely_alpha(word) || !is_purely_alpha(next) {
            return false;
        }

        let word_len = alpha_len(word);
        let next_len = alpha_len(next);
        let word_common = is_common_word(word);
        let next_common = is_common_word(next);

        let word_orphan = is_orphan(word);
        let next_orphan = is_orphan(next);
        let word_fragment = is_short_fragment(word);
        let next_fragment = is_short_fragment(next);
        let next_suffix = is_likely_suffix(next);

        // Rule 1: Current word is an orphan (single non-I/a char) - strong signal of PDF break
        if word_orphan {
            return true;
        }

        // Rule 2: Next word is an orphan - also strong signal
        if next_orphan {
            return true;
        }

        // Rule 3: Current word is a fragment AND next is also fragment/orphan/suffix
        // This prevents "older" + "do" from merging (older is not a fragment)
        if word_fragment && (next_fragment || next_orphan || next_suffix) {
            return true;
        }

        // Rule 4: Valid single char (A/I) followed by short non-common fragment
        // Handles "A va" -> "Ava"
        if is_valid_single_char(word) && next_len <= 3 && !next_common {
            return true;
        }

        // Rule 5: Short common word (2-3 chars) followed by fragment or suffix
        // Handles "man ager" -> "manager", "in di" type patterns
        // But NOT "older do" (older is 5 chars, not short common)
        if word_common && word_len <= 3 && (next_fragment || next_suffix) {
            return true;
        }

        false
    }

    // Decide whether an in-progress merge (`current`) should absorb `next`.
    // Only extends runs that contained at least one fragment/orphan so far.
    fn should_continue_merge(current: &str, next: &str, had_short_fragment: bool) -> bool {
        if !had_short_fragment || !is_purely_alpha(next) {
            return false;
        }

        let next_len = alpha_len(next);
        if next_len <= 3 {
            return true;
        }

        // Allow one 4-letter tail ("rted" in "rep o rted") while the merged
        // prefix is still short enough to plausibly be a word stem.
        if next_len == 4 && !is_common_word(next) && alpha_len(current) <= 5 {
            return true;
        }

        false
    }

    let words: Vec<&str> = input.split_whitespace().collect();
    if words.len() < 2 {
        return input.to_string();
    }

    let mut output: Vec<String> = Vec::with_capacity(words.len());
    let mut i = 0;

    while i < words.len() {
        let word = words[i];

        if i + 1 < words.len() && should_start_merge(word, words[i + 1]) {
            let mut merged = String::from(word);
            // Track whether the run has shown fragment evidence; this gates
            // should_continue_merge so complete words don't keep absorbing.
            let mut had_short_fragment = is_short_fragment(word)
                || is_short_fragment(words[i + 1])
                || is_orphan(word)
                || is_orphan(words[i + 1])
                || (is_valid_single_char(word) && alpha_len(words[i + 1]) <= 3);

            merged.push_str(words[i + 1]);
            i += 2;

            // Greedily extend the run while the continuation rules allow it.
            while i < words.len() && should_continue_merge(&merged, words[i], had_short_fragment) {
                if is_short_fragment(words[i]) || is_orphan(words[i]) {
                    had_short_fragment = true;
                }
                merged.push_str(words[i]);
                i += 1;
            }

            output.push(merged);
        } else {
            output.push(word.to_string());
            i += 1;
        }
    }

    output.join(" ")
}
290
#[cfg(test)]
mod tests {
    use super::*;

    // The fix_pdf_spacing tests below are coarse sanity checks only: with the
    // default symspell_cleanup feature, the SymSpell pass in
    // symspell_cleanup.rs does the heavy lifting and carries its own tests.

    #[test]
    fn fixes_pdf_spacing_single_chars() {
        // Orphan single letters are glued onto their neighbours.
        assert_eq!(fix_pdf_spacing("lo n ger"), "longer");
        // Two adjacent orphans merge into one word.
        assert_eq!(fix_pdf_spacing("n o"), "no");
        // Best-effort heuristic; SymSpell resolves the trickier splits.
        let got = fix_pdf_spacing("rep o rted");
        assert!(
            got == "reported" || got.contains("rep"),
            "got: {}",
            got
        );
    }

    #[test]
    fn fixes_pdf_spacing_preserves_normal_text() {
        // Ordinary English sentences must pass through untouched.
        let sentence = "The manager reported to the supervisor";
        assert_eq!(fix_pdf_spacing(sentence), sentence);
        let shorter = "The manager reported";
        assert_eq!(fix_pdf_spacing(shorter), shorter);
        // A classic PDF fragment split is repaired.
        assert_eq!(fix_pdf_spacing("man ager"), "manager");
        // Legitimate one-letter words ("I", "a") stay separate.
        assert_eq!(fix_pdf_spacing("I am a person"), "I am a person");
        // A complete word like "older" must not be swallowed into a merge;
        // "do" is a common word, so the heuristic may leave the rest alone.
        let got = fix_pdf_spacing("older do cuments");
        assert!(got.contains("older"), "got: {}", got);
        assert_eq!(fix_pdf_spacing("These references"), "These references");
    }

    #[test]
    fn fixes_pdf_spacing_two_letter_fragments() {
        // Adjacent two-letter non-words are joined...
        assert_eq!(fix_pdf_spacing("lo ng"), "long");
        // ...while common two-letter words are left apart.
        assert_eq!(fix_pdf_spacing("to be or"), "to be or");
    }

    #[test]
    fn fixes_pdf_spacing_real_pdf_artifacts() {
        // Observed pattern: "C" merges with "hlo", then the trailing "e".
        assert_eq!(fix_pdf_spacing("C hlo e"), "Chloe");
        // Same pattern followed by an ordinary surname.
        assert_eq!(fix_pdf_spacing("C hlo e Nguyen"), "Chloe Nguyen");
        // Orphans scattered through a phrase all collapse together.
        assert_eq!(fix_pdf_spacing("n o lo n ger"), "nolonger");
    }

    #[test]
    fn fixes_pdf_spacing_fragment_chains() {
        // Approximate outcomes only — SymSpell is the authoritative fixer.
        let got = fix_pdf_spacing("A va Martin");
        assert!(
            got.contains("va") || got.contains("Ava"),
            "got: {}",
            got
        );
        let got = fix_pdf_spacing("emp lo yee");
        assert!(
            got == "employee" || got.contains("emp"),
            "got: {}",
            got
        );
    }

    #[test]
    fn normalises_control_and_whitespace() {
        // Tab -> space, vertical tab dropped, CRLF collapsed to one newline,
        // and surrounding whitespace trimmed.
        let normalized =
            normalize_text(" Hello\tWorld \u{000B} test\r\nnext", 128).expect("normalized");
        assert_eq!(normalized.text, "Hello World test\nnext");
        assert!(!normalized.truncated);
    }

    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        // NFKC composes a + combining acute into one 2-byte grapheme.
        let normalized = normalize_text("a\u{0301}bcd", 3).expect("normalized");
        assert_eq!(normalized.text, "áb");
        assert!(normalized.truncated);
    }

    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        // The regional-indicator flag is a single 8-byte grapheme, wider than
        // the limit, so it is kept whole.
        let s = "🇮🇳hello";
        let idx = truncate_at_grapheme_boundary(s, 4);
        assert!(idx >= 4);
        assert_eq!(&s[..idx], "🇮🇳");
    }
}
395}