1#[cfg(feature = "text-processing")]
7use unicode_general_category::{GeneralCategory, get_general_category};
8#[cfg(feature = "text-processing")]
9use unicode_normalization::UnicodeNormalization;
10
/// Characters that [`text_clean`] treats as line breaks and rewrites to `'\n'`.
#[cfg(feature = "text-processing")]
const NEWLINES: &[char] = &[
    '\n',       // U+000A LINE FEED
    '\u{000B}', // U+000B LINE TABULATION (vertical tab)
    '\u{000C}', // U+000C FORM FEED
    '\r',       // U+000D CARRIAGE RETURN
    '\u{0085}', // U+0085 NEXT LINE
    '\u{2028}', // U+2028 LINE SEPARATOR
    '\u{2029}', // U+2029 PARAGRAPH SEPARATOR
];
22
/// Reports whether `c` belongs to one of the Unicode "Other" (C) general
/// categories: Control, Format, Unassigned, PrivateUse or Surrogate.
#[cfg(feature = "text-processing")]
fn is_c_category(c: char) -> bool {
    use GeneralCategory::{Control, Format, PrivateUse, Surrogate, Unassigned};

    let category = get_general_category(c);
    matches!(
        category,
        Control | Format | Unassigned | PrivateUse | Surrogate
    )
}
35
/// Reports whether `c` belongs to a Unicode "Other" (C), "Mark" (M) or
/// "Punctuation" (P) general category.
///
/// [`text_collapse`] uses this predicate to decide which characters to drop.
#[cfg(feature = "text-processing")]
fn is_cmp_category(c: char) -> bool {
    use GeneralCategory::{
        ClosePunctuation, ConnectorPunctuation, Control, DashPunctuation, EnclosingMark,
        FinalPunctuation, Format, InitialPunctuation, NonspacingMark, OpenPunctuation,
        OtherPunctuation, PrivateUse, SpacingMark, Surrogate, Unassigned,
    };

    // Look the category up once and test it against each major class in turn.
    let category = get_general_category(c);

    // C: control, format, unassigned, private-use and surrogate code points.
    let is_other = matches!(
        category,
        Control | Format | Unassigned | PrivateUse | Surrogate
    );
    // M: combining marks.
    let is_mark = matches!(category, NonspacingMark | SpacingMark | EnclosingMark);
    // P: every punctuation subcategory.
    let is_punctuation = matches!(
        category,
        ConnectorPunctuation
            | DashPunctuation
            | OpenPunctuation
            | ClosePunctuation
            | InitialPunctuation
            | FinalPunctuation
            | OtherPunctuation
    );

    is_other || is_mark || is_punctuation
}
61
/// Cleans `text` and returns the result as a new `String`.
///
/// Steps, in order:
/// 1. Apply NFKC normalization.
/// 2. Rewrite every character listed in `NEWLINES` to `'\n'`, treating a
///    `"\r\n"` pair as a single line break, and drop every other character in
///    a Unicode "Other" (C) category (this includes tabs).
/// 3. Collapse each run of consecutive whitespace-only lines to its first line.
/// 4. Trim leading and trailing whitespace.
#[cfg(feature = "text-processing")]
pub fn text_clean(text: &str) -> String {
    let normalized: String = text.nfkc().collect();

    // Pass 1: unify line breaks and strip C-category characters.
    let mut unified = String::with_capacity(normalized.len());
    let mut previous_was_cr = false;
    for ch in normalized.chars() {
        // Remember whether the character just before this one was '\r', and
        // record the same fact about `ch` for the next iteration.
        let follows_cr = std::mem::replace(&mut previous_was_cr, ch == '\r');
        if follows_cr && ch == '\n' {
            // Second half of a CRLF pair; the '\r' already emitted the break.
            continue;
        }
        if NEWLINES.contains(&ch) {
            unified.push('\n');
        } else if !is_c_category(ch) {
            unified.push(ch);
        }
    }

    // Pass 2: keep a whitespace-only line only when the line before it was
    // not whitespace-only as well.
    let mut last_blank = false;
    let kept: Vec<&str> = unified
        .split('\n')
        .filter(|line| {
            let blank = line.trim().is_empty();
            let keep = !(blank && last_blank);
            last_blank = blank;
            keep
        })
        .collect();

    kept.join("\n").trim().to_owned()
}
108
/// Flattens `text` onto a single line.
///
/// The input is split on runs of Unicode whitespace (which includes line
/// breaks) and the resulting words are joined with a single ASCII space.
/// Leading and trailing whitespace therefore disappears, and an input that is
/// empty or whitespace-only yields an empty string.
pub fn text_remove_newlines(text: &str) -> String {
    let mut joined = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means this is not the
        // first word and needs a separator in front of it.
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(word);
    }
    joined
}
116
/// Limits `text` to at most `nbytes` bytes of UTF-8, then trims whitespace.
///
/// The cut never splits a multi-byte character: if `nbytes` falls inside one,
/// the whole character is dropped. Leading and trailing whitespace is removed
/// from the (possibly shortened) text before it is returned, so the result
/// may be shorter than `nbytes`.
pub fn text_trim(text: &str, nbytes: usize) -> String {
    // Start at the requested limit (capped at the text length) and step back
    // to the nearest character boundary. Offset 0 is always a boundary, so
    // the loop terminates.
    let mut cut = nbytes.min(text.len());
    while !text.is_char_boundary(cut) {
        cut -= 1;
    }
    text[..cut].trim().to_owned()
}
133
/// Collapses `text` into a lowercase string with no whitespace, marks,
/// punctuation or "Other"-category characters.
///
/// Steps, in order:
/// 1. Decompose with NFD, so accents become separate combining marks.
/// 2. Lowercase the decomposed text.
/// 3. Drop whitespace and every character accepted by `is_cmp_category`.
/// 4. Recompose the remaining characters with NFKC.
#[cfg(feature = "text-processing")]
pub fn text_collapse(text: &str) -> String {
    let decomposed: String = text.nfd().collect();
    let lowered = decomposed.to_lowercase();

    lowered
        .chars()
        .filter(|&ch| !(ch.is_whitespace() || is_cmp_category(ch)))
        .nfkc()
        .collect()
}
153
154pub(crate) fn multi_hash_blake3(data: &[u8]) -> String {
159 let digest = blake3::hash(data);
160 let mut result = Vec::with_capacity(34);
161 result.push(0x1e); result.push(0x20); result.extend_from_slice(digest.as_bytes());
164 hex::encode(result)
165}
166
/// Unit tests for the text utilities and the BLAKE3 multihash helper.
///
/// Tests for functions behind the `text-processing` feature are gated on the
/// same feature so the module still builds when it is disabled.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- text_clean ----

    /// NFKC maps the double-struck capital H (U+210D) to a plain `H`.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_nfkc_normalization() {
        assert!(text_clean("ℍ").contains('H'));
    }

    /// A tab is a control character and is removed, not replaced by a space.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_removes_control_chars() {
        assert_eq!(text_clean("hello\tworld"), "helloworld");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_preserves_newlines() {
        assert_eq!(text_clean("hello\nworld"), "hello\nworld");
    }

    /// Two consecutive empty lines collapse to a single empty line.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_collapses_empty_lines() {
        assert_eq!(text_clean("a\n\n\nb"), "a\n\nb");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_strips_whitespace() {
        assert_eq!(text_clean(" hello "), "hello");
    }

    /// A CRLF pair counts as one line break.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_handles_crlf() {
        assert_eq!(text_clean("a\r\nb"), "a\nb");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_empty() {
        assert_eq!(text_clean(""), "");
    }

    // ---- text_remove_newlines ----

    #[test]
    fn test_text_remove_newlines() {
        assert_eq!(text_remove_newlines("hello\nworld"), "hello world");
    }

    /// Runs of whitespace, including mixed spaces, tabs and newlines, become
    /// one space, and leading/trailing whitespace is dropped.
    #[test]
    fn test_text_remove_newlines_collapses_spaces() {
        assert_eq!(text_remove_newlines("a  b   c"), "a b c");
        assert_eq!(text_remove_newlines("  a\t\tb \n c  "), "a b c");
    }

    // ---- text_trim ----

    #[test]
    fn test_text_trim_no_truncation() {
        assert_eq!(text_trim("hello", 10), "hello");
    }

    #[test]
    fn test_text_trim_exact() {
        assert_eq!(text_trim("hello", 5), "hello");
    }

    #[test]
    fn test_text_trim_truncates() {
        assert_eq!(text_trim("hello world", 5), "hello");
    }

    /// `é` is two bytes in UTF-8, so a one-byte limit must drop it entirely
    /// instead of splitting it.
    #[test]
    fn test_text_trim_unicode_boundary() {
        assert_eq!(text_trim("é", 1), "");
    }

    #[test]
    fn test_text_trim_strips() {
        assert_eq!(text_trim("hello ", 6), "hello");
    }

    // ---- text_collapse ----

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_basic() {
        assert_eq!(text_collapse("Hello World"), "helloworld");
    }

    /// NFD splits `é` into `e` plus a combining mark, and the mark is removed.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_strips_accents() {
        assert_eq!(text_collapse("café"), "cafe");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_strips_punctuation() {
        assert_eq!(text_collapse("hello, world!"), "helloworld");
    }

    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_empty() {
        assert_eq!(text_collapse(""), "");
    }

    // ---- multi_hash_blake3 ----

    /// Header bytes `1e20` followed by the BLAKE3 digest of the empty input.
    #[test]
    fn test_multi_hash_blake3_empty() {
        assert_eq!(
            multi_hash_blake3(b""),
            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
        );
    }

    #[test]
    fn test_multi_hash_blake3_hello_world() {
        assert_eq!(
            multi_hash_blake3(b"hello world"),
            "1e20d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
        );
    }
}