//! memvid_core/text.rs — text normalisation and PDF-spacing repair utilities.

use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;

/// Outcome of text normalisation: the cleaned string together with a flag
/// recording whether the byte-limit truncation dropped any content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedText {
    pub text: String,
    pub truncated: bool,
}

impl NormalizedText {
    /// Returns `true` when the normalised text was cut short by the limit.
    #[must_use]
    pub fn is_truncated(&self) -> bool {
        self.truncated
    }
}
17
18/// Normalise text (NFKC), strip control characters, compact whitespace and
19/// truncate at grapheme boundaries.
20#[must_use]
21pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
22    let limit = limit.max(1);
23    let normalised = input.nfkc().collect::<String>();
24
25    let mut cleaned = String::with_capacity(normalised.len());
26    let mut last_was_space = false;
27    let mut last_was_newline = false;
28
29    for mut ch in normalised.chars() {
30        if ch == '\r' {
31            ch = '\n';
32        }
33        if ch == '\t' {
34            ch = ' ';
35        }
36        if ch.is_control() && ch != '\n' {
37            continue;
38        }
39        if ch == '\n' {
40            if last_was_newline {
41                continue;
42            }
43            while cleaned.ends_with(' ') {
44                cleaned.pop();
45            }
46            cleaned.push('\n');
47            last_was_newline = true;
48            last_was_space = false;
49        } else if ch.is_whitespace() {
50            if last_was_space || cleaned.ends_with('\n') {
51                continue;
52            }
53            cleaned.push(' ');
54            last_was_space = true;
55            last_was_newline = false;
56        } else {
57            cleaned.push(ch);
58            last_was_space = false;
59            last_was_newline = false;
60        }
61    }
62
63    let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
64    if trimmed.is_empty() {
65        return None;
66    }
67
68    let mut truncated = false;
69    let mut out = String::new();
70    let mut consumed = 0usize;
71
72    for grapheme in trimmed.graphemes(true) {
73        let next = consumed + grapheme.len();
74        if next > limit {
75            truncated = true;
76            break;
77        }
78        out.push_str(grapheme);
79        consumed = next;
80    }
81
82    if out.is_empty() {
83        // Fallback: include the first grapheme even if it exceeds the limit so
84        // we never return empty text for non-empty input.
85        if let Some(first) = trimmed.graphemes(true).next() {
86            out.push_str(first);
87            truncated = true;
88        }
89    }
90
91    Some(NormalizedText {
92        text: out,
93        truncated,
94    })
95}
96
97/// Return the byte index at which a string should be truncated while
98/// preserving grapheme boundaries.
99#[must_use]
100pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
101    if s.len() <= limit {
102        return s.len();
103    }
104
105    let mut end = 0usize;
106    for (idx, grapheme) in s.grapheme_indices(true) {
107        let next = idx + grapheme.len();
108        if next > limit {
109            break;
110        }
111        end = next;
112    }
113
114    if end == 0 {
115        s.graphemes(true).next().map(|g| g.len()).unwrap_or(0)
116    } else {
117        end
118    }
119}
120
/// Fix spurious character-level spacing from PDF extraction.
///
/// Some PDF extractors produce text like "man ager" instead of "manager"
/// or "sup erviso r" instead of "supervisor". This function detects and
/// fixes these patterns.
///
/// Strategy: Detect short fragment runs that likely represent a single word
/// (e.g. "emp lo yee") and join them while preserving normal text.
#[must_use]
pub fn fix_pdf_spacing(input: &str) -> String {
    // Fast path: nothing to repair when there are no spaces or the text is tiny.
    if input.len() < 3 || !input.contains(' ') {
        return input.to_string();
    }

    // The only one-letter tokens that are legitimate English words.
    const LEGIT_SINGLES: &[char] = &['a', 'i', 'A', 'I'];

    // Short words that are real English and must never be glued to a neighbour.
    const STOP_LIST: &[&str] = &[
        "a", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in", "is", "it", "me", "my",
        "no", "of", "on", "or", "so", "to", "up", "us", "we", "am", "are", "can", "did", "for",
        "get", "got", "had", "has", "her", "him", "his", "its", "let", "may", "nor", "not", "now",
        "off", "old", "one", "our", "out", "own", "ran", "run", "saw", "say", "see", "set", "she",
        "the", "too", "two", "use", "was", "way", "who", "why", "yet", "you", "all", "and", "any",
        "but", "few", "how", "man", "new", "per", "put", "via",
    ];

    // Case-insensitive membership test against the stop list.
    fn in_stop_list(token: &str) -> bool {
        let lower = token.to_ascii_lowercase();
        STOP_LIST.contains(&lower.as_str())
    }

    // Exactly one byte long and one of the legitimate single-letter words.
    fn legit_single(token: &str) -> bool {
        token.len() == 1
            && token
                .chars()
                .next()
                .is_some_and(|c| LEGIT_SINGLES.contains(&c))
    }

    // Non-empty and composed entirely of alphabetic characters.
    fn all_alpha(token: &str) -> bool {
        !token.is_empty() && token.chars().all(char::is_alphabetic)
    }

    // Number of alphabetic characters in the token.
    fn letter_count(token: &str) -> usize {
        token.chars().filter(|c| c.is_alphabetic()).count()
    }

    // A lone letter that is not a valid English word ("n", "C", ...).
    fn orphan(token: &str) -> bool {
        letter_count(token) == 1 && all_alpha(token) && !legit_single(token)
    }

    // 2-3 letter alphabetic tokens outside the stop list are likely fragments.
    fn fragment(token: &str) -> bool {
        let n = letter_count(token);
        (2..=3).contains(&n) && all_alpha(token) && !in_stop_list(token)
    }

    // 4-letter non-common tokens that look like word tails,
    // e.g. "ager" from "manager", "ment" from "engagement".
    fn suffix_like(token: &str) -> bool {
        letter_count(token) == 4 && all_alpha(token) && !in_stop_list(token)
    }

    // Decide whether `a` and the following token `b` should start a merge run.
    fn merge_begins(a: &str, b: &str) -> bool {
        if !all_alpha(a) || !all_alpha(b) {
            return false;
        }
        // Rule 1/2: an orphan letter on either side is a strong PDF-break signal.
        // Rule 3: fragment followed by fragment/orphan/suffix — but a complete
        //         word like "older" never starts a merge.
        // Rule 4: valid single char (A/I) followed by a short uncommon piece
        //         ("A va" -> "Ava").
        // Rule 5: short common word followed by a fragment or suffix
        //         ("man ager" -> "manager"), but not "older do".
        orphan(a)
            || orphan(b)
            || (fragment(a) && (fragment(b) || orphan(b) || suffix_like(b)))
            || (legit_single(a) && letter_count(b) <= 3 && !in_stop_list(b))
            || (in_stop_list(a) && letter_count(a) <= 3 && (fragment(b) || suffix_like(b)))
    }

    // Decide whether an in-progress merge `acc` should absorb `next` too.
    fn merge_continues(acc: &str, next: &str, saw_fragment: bool) -> bool {
        if !saw_fragment || !all_alpha(next) {
            return false;
        }
        let n = letter_count(next);
        n <= 3 || (n == 4 && !in_stop_list(next) && letter_count(acc) <= 5)
    }

    let tokens: Vec<&str> = input.split_whitespace().collect();
    if tokens.len() < 2 {
        return input.to_string();
    }

    let mut repaired: Vec<String> = Vec::with_capacity(tokens.len());
    let mut idx = 0;

    while idx < tokens.len() {
        let here = tokens[idx];

        match tokens.get(idx + 1).copied() {
            Some(next) if merge_begins(here, next) => {
                // Seed the merge with the pair, then keep absorbing short
                // pieces while the fragment evidence holds.
                let mut saw_fragment = fragment(here)
                    || fragment(next)
                    || orphan(here)
                    || orphan(next)
                    || (legit_single(here) && letter_count(next) <= 3);
                let mut joined = format!("{here}{next}");
                idx += 2;

                while let Some(&piece) = tokens.get(idx) {
                    if !merge_continues(&joined, piece, saw_fragment) {
                        break;
                    }
                    saw_fragment |= fragment(piece) || orphan(piece);
                    joined.push_str(piece);
                    idx += 1;
                }

                repaired.push(joined);
            }
            _ => {
                repaired.push(here.to_string());
                idx += 1;
            }
        }
    }

    repaired.join(" ")
}
291
#[cfg(test)]
mod tests {
    use super::*;

    // Note: The heuristic fix_pdf_spacing tests are basic sanity checks.
    // When symspell_cleanup feature is enabled (default), the SymSpell-based
    // cleanup in symspell_cleanup.rs provides better results and has its own tests.

    #[test]
    fn fixes_pdf_spacing_single_chars() {
        // Single orphan chars get joined with adjacent words
        assert_eq!(fix_pdf_spacing("lo n ger"), "longer");
        assert_eq!(fix_pdf_spacing("n o"), "no"); // both single chars, both orphans
        // These are best-effort heuristics - SymSpell handles complex cases better
        let result = fix_pdf_spacing("rep o rted");
        assert!(
            result == "reported" || result.contains("rep"),
            "got: {}",
            result
        );
    }

    #[test]
    fn fixes_pdf_spacing_preserves_normal_text() {
        // Normal English text should be preserved
        assert_eq!(
            fix_pdf_spacing("The manager reported to the supervisor"),
            "The manager reported to the supervisor"
        );
        // Valid words should not be merged across word boundaries
        assert_eq!(
            fix_pdf_spacing("The manager reported"),
            "The manager reported"
        );
        // "man ager" is a common PDF fragment split
        assert_eq!(fix_pdf_spacing("man ager"), "manager");
        // Valid single chars (a, I) should stay separate
        assert_eq!(fix_pdf_spacing("I am a person"), "I am a person");
        // Complete words should NOT be joined with fragments
        // "older" is a complete word, should not merge with "do cuments"
        // Note: "do" is a common word so heuristic may not join it - SymSpell handles this better
        let result = fix_pdf_spacing("older do cuments");
        assert!(result.contains("older"), "got: {}", result);
        assert_eq!(fix_pdf_spacing("These references"), "These references");
    }

    #[test]
    fn fixes_pdf_spacing_two_letter_fragments() {
        // Two-letter non-word fragments get joined together
        assert_eq!(fix_pdf_spacing("lo ng"), "long");
        // But common 2-letter words stay separate
        assert_eq!(fix_pdf_spacing("to be or"), "to be or");
    }

    #[test]
    fn fixes_pdf_spacing_real_pdf_artifacts() {
        // Real example: single char "C" joins with "hlo", then "e" joins
        assert_eq!(fix_pdf_spacing("C hlo e"), "Chloe");
        // With a real name after
        assert_eq!(fix_pdf_spacing("C hlo e Nguyen"), "Chloe Nguyen");
        // Real patterns with orphans throughout
        assert_eq!(fix_pdf_spacing("n o lo n ger"), "nolonger");
    }

    #[test]
    fn fixes_pdf_spacing_fragment_chains() {
        // These are best handled by SymSpell - heuristics are approximate
        let result = fix_pdf_spacing("A va Martin");
        assert!(
            result.contains("va") || result.contains("Ava"),
            "got: {}",
            result
        );
        let result = fix_pdf_spacing("emp lo yee");
        assert!(
            result == "employee" || result.contains("emp"),
            "got: {}",
            result
        );
    }

    #[test]
    fn normalises_control_and_whitespace() {
        // Tabs become spaces, vertical tab is dropped, \r\n collapses to \n.
        let input = " Hello\tWorld \u{000B} test\r\nnext";
        let result = normalize_text(input, 128).expect("normalized");
        assert_eq!(result.text, "Hello World test\nnext");
        assert!(!result.truncated);
    }

    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        let input = "a\u{0301}bcd"; // "á" decomposed plus letters.
        let result = normalize_text(input, 3).expect("normalized");
        assert_eq!(result.text, "áb");
        assert!(result.truncated);
    }

    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        let s = "🇮🇳hello"; // flag is 8 bytes, 1 grapheme.
        let idx = truncate_at_grapheme_boundary(s, 4);
        assert!(idx >= 4);
        assert_eq!(&s[..idx], "🇮🇳");
    }
}
396}