1use unicode_general_category::{GeneralCategory, get_general_category};
7use unicode_normalization::UnicodeNormalization;
8
/// Characters that `text_clean` treats as line breaks; each one is rewritten
/// to a plain `'\n'` there.
const NEWLINES: &[char] = &[
    '\u{000A}', // LINE FEED
    '\u{000B}', // LINE TABULATION (vertical tab)
    '\u{000C}', // FORM FEED
    '\u{000D}', // CARRIAGE RETURN
    '\u{0085}', // NEXT LINE
    '\u{2028}', // LINE SEPARATOR
    '\u{2029}', // PARAGRAPH SEPARATOR
];
19
20fn is_c_category(c: char) -> bool {
22 matches!(
23 get_general_category(c),
24 GeneralCategory::Control
25 | GeneralCategory::Format
26 | GeneralCategory::Unassigned
27 | GeneralCategory::PrivateUse
28 | GeneralCategory::Surrogate
29 )
30}
31
32fn is_cmp_category(c: char) -> bool {
34 matches!(
35 get_general_category(c),
36 GeneralCategory::Control
38 | GeneralCategory::Format
39 | GeneralCategory::Unassigned
40 | GeneralCategory::PrivateUse
41 | GeneralCategory::Surrogate
42 | GeneralCategory::NonspacingMark
44 | GeneralCategory::SpacingMark
45 | GeneralCategory::EnclosingMark
46 | GeneralCategory::ConnectorPunctuation
48 | GeneralCategory::DashPunctuation
49 | GeneralCategory::OpenPunctuation
50 | GeneralCategory::ClosePunctuation
51 | GeneralCategory::InitialPunctuation
52 | GeneralCategory::FinalPunctuation
53 | GeneralCategory::OtherPunctuation
54 )
55}
56
57pub fn text_clean(text: &str) -> String {
63 let text: String = text.nfkc().collect();
65
66 let mut cleaned = String::with_capacity(text.len());
68 let mut chars = text.chars().peekable();
69 while let Some(c) = chars.next() {
70 if NEWLINES.contains(&c) {
71 if c == '\r' && chars.peek() == Some(&'\n') {
73 chars.next();
74 }
75 cleaned.push('\n');
76 } else if is_c_category(c) {
77 } else {
79 cleaned.push(c);
80 }
81 }
82
83 let mut result_lines: Vec<&str> = Vec::new();
85 let mut prev_empty = false;
86 for line in cleaned.split('\n') {
87 let is_empty = line.trim().is_empty();
88 if is_empty {
89 if prev_empty {
90 continue;
91 }
92 prev_empty = true;
93 } else {
94 prev_empty = false;
95 }
96 result_lines.push(line);
97 }
98
99 result_lines.join("\n").trim().to_string()
101}
102
/// Flattens `text` onto a single line.
///
/// The input is split on runs of Unicode whitespace (which includes line
/// breaks) and the resulting words are joined with a single ASCII space, so
/// leading/trailing whitespace disappears and inner runs collapse to one space.
pub fn text_remove_newlines(text: &str) -> String {
    let mut flattened = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !flattened.is_empty() {
            flattened.push(' ');
        }
        flattened.push_str(word);
    }
    flattened
}
110
/// Truncates `text` to at most `nbytes` bytes of UTF-8 and trims the result.
///
/// If the byte limit falls inside a multi-byte character, that character is
/// dropped entirely so the output is always valid UTF-8. Leading and trailing
/// whitespace is removed after truncation.
pub fn text_trim(text: &str, nbytes: usize) -> String {
    let mut end = nbytes.min(text.len());
    // Step back to the nearest character boundary; index 0 is always a
    // boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    text[..end].trim().to_owned()
}
127
128pub fn text_collapse(text: &str) -> String {
134 let nfd_lower: String = text.nfd().collect::<String>().to_lowercase();
136
137 let filtered: String = nfd_lower
139 .chars()
140 .filter(|&c| !c.is_whitespace() && !is_cmp_category(c))
141 .collect();
142
143 filtered.nfkc().collect()
145}
146
147pub(crate) fn multi_hash_blake3(data: &[u8]) -> String {
152 let digest = blake3::hash(data);
153 let mut result = Vec::with_capacity(34);
154 result.push(0x1e); result.push(0x20); result.extend_from_slice(digest.as_bytes());
157 hex::encode(result)
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    // ---- text_clean ----

    #[test]
    fn test_text_clean_nfkc_normalization() {
        // NFKC folds the double-struck capital H into a plain 'H'.
        assert!(text_clean("ℍ").contains('H'));
    }

    #[test]
    fn test_text_clean_removes_control_chars() {
        // Tab is a control character and is dropped, not replaced by a space.
        assert_eq!(text_clean("hello\tworld"), "helloworld");
    }

    #[test]
    fn test_text_clean_preserves_newlines() {
        assert_eq!(text_clean("hello\nworld"), "hello\nworld");
    }

    #[test]
    fn test_text_clean_collapses_empty_lines() {
        assert_eq!(text_clean("a\n\n\nb"), "a\n\nb");
    }

    #[test]
    fn test_text_clean_strips_whitespace() {
        assert_eq!(text_clean("  hello  "), "hello");
    }

    #[test]
    fn test_text_clean_handles_crlf() {
        // CRLF counts as one line break; a lone CR is a line break as well.
        assert_eq!(text_clean("a\r\nb"), "a\nb");
        assert_eq!(text_clean("a\rb"), "a\nb");
    }

    #[test]
    fn test_text_clean_empty() {
        assert_eq!(text_clean(""), "");
    }

    // ---- text_remove_newlines ----

    #[test]
    fn test_text_remove_newlines() {
        assert_eq!(text_remove_newlines("hello\nworld"), "hello world");
    }

    #[test]
    fn test_text_remove_newlines_collapses_spaces() {
        // Runs of spaces must actually be present for this test to exercise
        // the collapsing behaviour.
        assert_eq!(text_remove_newlines("a  b   c"), "a b c");
        assert_eq!(text_remove_newlines("  a \n b  "), "a b");
    }

    // ---- text_trim ----

    #[test]
    fn test_text_trim_no_truncation() {
        assert_eq!(text_trim("hello", 10), "hello");
    }

    #[test]
    fn test_text_trim_exact() {
        assert_eq!(text_trim("hello", 5), "hello");
    }

    #[test]
    fn test_text_trim_truncates() {
        assert_eq!(text_trim("hello world", 5), "hello");
    }

    #[test]
    fn test_text_trim_unicode_boundary() {
        // U+00E9 is two bytes in UTF-8; cutting after one byte must drop the
        // whole character. The escape keeps the test independent of how the
        // source file itself is normalized.
        assert_eq!(text_trim("\u{00E9}", 1), "");
    }

    #[test]
    fn test_text_trim_strips() {
        assert_eq!(text_trim("hello ", 6), "hello");
        // Truncation that ends on whitespace is trimmed afterwards.
        assert_eq!(text_trim("hello world", 6), "hello");
    }

    // ---- text_collapse ----

    #[test]
    fn test_text_collapse_basic() {
        assert_eq!(text_collapse("Hello World"), "helloworld");
    }

    #[test]
    fn test_text_collapse_strips_accents() {
        assert_eq!(text_collapse("café"), "cafe");
    }

    #[test]
    fn test_text_collapse_strips_punctuation() {
        assert_eq!(text_collapse("hello, world!"), "helloworld");
    }

    #[test]
    fn test_text_collapse_empty() {
        assert_eq!(text_collapse(""), "");
    }

    // ---- multi_hash_blake3 ----

    #[test]
    fn test_multi_hash_blake3_empty() {
        assert_eq!(
            multi_hash_blake3(b""),
            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
        );
    }

    #[test]
    fn test_multi_hash_blake3_hello_world() {
        assert_eq!(
            multi_hash_blake3(b"hello world"),
            "1e20d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
        );
    }
}