1use unicode_normalization::UnicodeNormalization;
2use unicode_segmentation::UnicodeSegmentation;
3
/// Outcome of a text normalisation pass: the resulting text plus a flag
/// recording whether it was shortened.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct NormalizedText {
    /// The normalised text.
    pub text: String,
    /// `true` when the text was cut short.
    pub truncated: bool,
}

impl NormalizedText {
    /// Returns the value of the [`truncated`](Self::truncated) field.
    #[must_use]
    pub fn is_truncated(&self) -> bool {
        let Self { truncated, .. } = self;
        *truncated
    }
}
17
18#[must_use]
21pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
22 let limit = limit.max(1);
23 let normalised = input.nfkc().collect::<String>();
24
25 let mut cleaned = String::with_capacity(normalised.len());
26 let mut last_was_space = false;
27 let mut last_was_newline = false;
28
29 for mut ch in normalised.chars() {
30 if ch == '\r' {
31 ch = '\n';
32 }
33 if ch == '\t' {
34 ch = ' ';
35 }
36 if ch.is_control() && ch != '\n' {
37 continue;
38 }
39 if ch == '\n' {
40 if last_was_newline {
41 continue;
42 }
43 while cleaned.ends_with(' ') {
44 cleaned.pop();
45 }
46 cleaned.push('\n');
47 last_was_newline = true;
48 last_was_space = false;
49 } else if ch.is_whitespace() {
50 if last_was_space || cleaned.ends_with('\n') {
51 continue;
52 }
53 cleaned.push(' ');
54 last_was_space = true;
55 last_was_newline = false;
56 } else {
57 cleaned.push(ch);
58 last_was_space = false;
59 last_was_newline = false;
60 }
61 }
62
63 let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
64 if trimmed.is_empty() {
65 return None;
66 }
67
68 let mut truncated = false;
69 let mut out = String::new();
70 let mut consumed = 0usize;
71
72 for grapheme in trimmed.graphemes(true) {
73 let next = consumed + grapheme.len();
74 if next > limit {
75 truncated = true;
76 break;
77 }
78 out.push_str(grapheme);
79 consumed = next;
80 }
81
82 if out.is_empty() {
83 if let Some(first) = trimmed.graphemes(true).next() {
86 out.push_str(first);
87 truncated = true;
88 }
89 }
90
91 Some(NormalizedText {
92 text: out,
93 truncated,
94 })
95}
96
97#[must_use]
100pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
101 if s.len() <= limit {
102 return s.len();
103 }
104
105 let mut end = 0usize;
106 for (idx, grapheme) in s.grapheme_indices(true) {
107 let next = idx + grapheme.len();
108 if next > limit {
109 break;
110 }
111 end = next;
112 }
113
114 if end == 0 {
115 s.graphemes(true).next().map(|g| g.len()).unwrap_or(0)
116 } else {
117 end
118 }
119}
120
#[cfg(test)]
mod tests {
    use super::{normalize_text, truncate_at_grapheme_boundary, NormalizedText};

    /// Tabs, a vertical tab, repeated spaces and a CRLF pair are all collapsed,
    /// and the surrounding whitespace is trimmed.
    #[test]
    fn normalises_control_and_whitespace() {
        let NormalizedText { text, truncated } =
            normalize_text(" Hello\tWorld \u{000B} test\r\nnext", 128).expect("normalized");

        assert_eq!(text, "Hello World test\nnext");
        assert!(!truncated);
    }

    /// The combining sequence composes under NFKC, and a 3-byte limit keeps
    /// exactly the composed letter plus one ASCII character.
    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        let NormalizedText { text, truncated } =
            normalize_text("a\u{0301}bcd", 3).expect("normalized");

        assert_eq!(text, "áb");
        assert!(truncated);
    }

    /// A flag emoji is one grapheme cluster wider than the limit; it is kept
    /// whole rather than being split.
    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        let flag_then_ascii = "🇮🇳hello";
        let cut = truncate_at_grapheme_boundary(flag_then_ascii, 4);

        assert!(cut >= 4);
        assert_eq!(&flag_then_ascii[..cut], "🇮🇳");
    }
}