memvid_core/
text.rs

1use unicode_normalization::UnicodeNormalization;
2use unicode_segmentation::UnicodeSegmentation;
3
/// Outcome of text normalisation: the cleaned-up string plus a flag recording
/// whether the source text exceeded the requested size limit.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct NormalizedText {
    /// The normalised text.
    pub text: String,
    /// `true` when the source text did not fit within the requested limit.
    pub truncated: bool,
}

impl NormalizedText {
    /// Returns the value of the [`truncated`](Self::truncated) flag.
    #[must_use]
    pub fn is_truncated(&self) -> bool {
        self.truncated
    }
}
17
18/// Normalise text (NFKC), strip control characters, compact whitespace and
19/// truncate at grapheme boundaries.
20#[must_use]
21pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
22    let limit = limit.max(1);
23    let normalised = input.nfkc().collect::<String>();
24
25    let mut cleaned = String::with_capacity(normalised.len());
26    let mut last_was_space = false;
27    let mut last_was_newline = false;
28
29    for mut ch in normalised.chars() {
30        if ch == '\r' {
31            ch = '\n';
32        }
33        if ch == '\t' {
34            ch = ' ';
35        }
36        if ch.is_control() && ch != '\n' {
37            continue;
38        }
39        if ch == '\n' {
40            if last_was_newline {
41                continue;
42            }
43            while cleaned.ends_with(' ') {
44                cleaned.pop();
45            }
46            cleaned.push('\n');
47            last_was_newline = true;
48            last_was_space = false;
49        } else if ch.is_whitespace() {
50            if last_was_space || cleaned.ends_with('\n') {
51                continue;
52            }
53            cleaned.push(' ');
54            last_was_space = true;
55            last_was_newline = false;
56        } else {
57            cleaned.push(ch);
58            last_was_space = false;
59            last_was_newline = false;
60        }
61    }
62
63    let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
64    if trimmed.is_empty() {
65        return None;
66    }
67
68    let mut truncated = false;
69    let mut out = String::new();
70    let mut consumed = 0usize;
71
72    for grapheme in trimmed.graphemes(true) {
73        let next = consumed + grapheme.len();
74        if next > limit {
75            truncated = true;
76            break;
77        }
78        out.push_str(grapheme);
79        consumed = next;
80    }
81
82    if out.is_empty() {
83        // Fallback: include the first grapheme even if it exceeds the limit so
84        // we never return empty text for non-empty input.
85        if let Some(first) = trimmed.graphemes(true).next() {
86            out.push_str(first);
87            truncated = true;
88        }
89    }
90
91    Some(NormalizedText {
92        text: out,
93        truncated,
94    })
95}
96
97/// Return the byte index at which a string should be truncated while
98/// preserving grapheme boundaries.
99#[must_use]
100pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
101    if s.len() <= limit {
102        return s.len();
103    }
104
105    let mut end = 0usize;
106    for (idx, grapheme) in s.grapheme_indices(true) {
107        let next = idx + grapheme.len();
108        if next > limit {
109            break;
110        }
111        end = next;
112    }
113
114    if end == 0 {
115        s.graphemes(true).next().map(|g| g.len()).unwrap_or(0)
116    } else {
117        end
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Tabs, carriage returns, a vertical tab and repeated spaces collapse into
    /// single separators, and the surrounding whitespace is trimmed.
    #[test]
    fn normalises_control_and_whitespace() {
        let NormalizedText { text, truncated } =
            normalize_text(" Hello\tWorld \u{000B} test\r\nnext", 128).expect("normalized");
        assert_eq!(text, "Hello World test\nnext");
        assert!(!truncated);
    }

    /// The decomposed "a" + combining acute composes to U+00E1 (2 bytes) under
    /// NFKC, so a 3-byte limit leaves room for exactly one more ASCII letter.
    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        let outcome = normalize_text("a\u{0301}bcd", 3).expect("normalized");
        assert!(outcome.truncated);
        assert_eq!(outcome.text, "\u{00E1}b");
    }

    /// A flag is one grapheme spanning 8 bytes; a 4-byte limit must not split
    /// it, so the whole flag is kept even though it exceeds the limit.
    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        let flagged = "🇮🇳hello";
        let cut = truncate_at_grapheme_boundary(flagged, 4);
        assert!(cut >= 4);
        assert_eq!(&flagged[..cut], "🇮🇳");
    }
}