Skip to main content

iscc_lib/
utils.rs

1//! Text normalization and hashing utilities for ISCC code generation.
2//!
3//! Provides text cleaning, trimming, collapsing, and BLAKE3 multihash functions
4//! ported from `iscc-core` `code_meta.py` and `utils.py`.
5
6#[cfg(feature = "text-processing")]
7use unicode_general_category::{GeneralCategory, get_general_category};
8#[cfg(feature = "text-processing")]
9use unicode_normalization::UnicodeNormalization;
10
/// Code points that `text_clean` recognizes as line breaks.
///
/// Membership in this list is checked before the Unicode "C" category test,
/// so the control characters listed here are not stripped with the others.
#[cfg(feature = "text-processing")]
const NEWLINES: &[char] = &[
    '\n',       // U+000A LINE FEED
    '\u{000B}', // U+000B VERTICAL TAB
    '\u{000C}', // U+000C FORM FEED
    '\r',       // U+000D CARRIAGE RETURN
    '\u{0085}', // U+0085 NEXT LINE
    '\u{2028}', // U+2028 LINE SEPARATOR
    '\u{2029}', // U+2029 PARAGRAPH SEPARATOR
];
22
/// Report whether `c` falls into one of the Unicode "C" general categories.
///
/// The categories covered are control, format, unassigned, private use and
/// surrogate.
#[cfg(feature = "text-processing")]
fn is_c_category(c: char) -> bool {
    // Look the category up once, then test it against the "C" group.
    let category = get_general_category(c);
    matches!(
        category,
        GeneralCategory::Control
            | GeneralCategory::Format
            | GeneralCategory::Surrogate
            | GeneralCategory::PrivateUse
            | GeneralCategory::Unassigned
    )
}
35
/// Report whether `c` falls into a Unicode "C", "M", or "P" general category.
///
/// "C" covers control, format, unassigned, private-use and surrogate code
/// points; "M" covers the three mark categories; "P" covers the seven
/// punctuation categories.
#[cfg(feature = "text-processing")]
fn is_cmp_category(c: char) -> bool {
    // A single lookup feeds all three group tests below.
    let category = get_general_category(c);

    // C: control, format, unassigned, private use, surrogate.
    let is_other = matches!(
        category,
        GeneralCategory::Control
            | GeneralCategory::Format
            | GeneralCategory::Unassigned
            | GeneralCategory::PrivateUse
            | GeneralCategory::Surrogate
    );

    // M: nonspacing, spacing and enclosing marks.
    let is_mark = matches!(
        category,
        GeneralCategory::NonspacingMark
            | GeneralCategory::SpacingMark
            | GeneralCategory::EnclosingMark
    );

    // P: every punctuation subcategory.
    let is_punctuation = matches!(
        category,
        GeneralCategory::ConnectorPunctuation
            | GeneralCategory::DashPunctuation
            | GeneralCategory::OpenPunctuation
            | GeneralCategory::ClosePunctuation
            | GeneralCategory::InitialPunctuation
            | GeneralCategory::FinalPunctuation
            | GeneralCategory::OtherPunctuation
    );

    is_other || is_mark || is_punctuation
}
61
/// Produce a display-ready version of `text`.
///
/// Processing steps, in order:
///
/// 1. NFKC-normalize the input.
/// 2. Replace every character listed in `NEWLINES` with `\n`, counting a
///    `\r\n` pair as a single line break, and drop all remaining characters
///    in a Unicode "C" category.
/// 3. Within each run of empty or whitespace-only lines keep only the first
///    line of the run.
/// 4. Strip leading and trailing whitespace from the joined result.
#[cfg(feature = "text-processing")]
pub fn text_clean(text: &str) -> String {
    let normalized: String = text.nfkc().collect();

    // Pass 1: unify line breaks and discard control characters.
    let mut unified = String::with_capacity(normalized.len());
    let mut after_cr = false;
    for ch in normalized.chars() {
        // A line feed directly following a carriage return belongs to the
        // line break that was already emitted for the `\r`.
        let completes_crlf = after_cr && ch == '\n';
        after_cr = ch == '\r';
        if completes_crlf {
            continue;
        }

        if NEWLINES.contains(&ch) {
            unified.push('\n');
        } else if !is_c_category(ch) {
            unified.push(ch);
        }
    }

    // Pass 2: drop a blank line whenever the line before it was blank too.
    let mut previous_blank = false;
    let kept: Vec<&str> = unified
        .split('\n')
        .filter(|line| {
            let blank = line.trim().is_empty();
            let keep = !(blank && previous_blank);
            previous_blank = blank;
            keep
        })
        .collect();

    // Pass 3: reassemble and trim the outer whitespace.
    kept.join("\n").trim().to_owned()
}
108
/// Flatten `text` into a single line with single-space separators.
///
/// The input is split on runs of Unicode whitespace (which includes line
/// breaks) and the resulting words are joined with one space each. Leading
/// and trailing whitespace therefore disappears, and whitespace-only input
/// yields an empty string.
pub fn text_remove_newlines(text: &str) -> String {
    let mut flattened = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes
        // this one and a separator is needed.
        if !flattened.is_empty() {
            flattened.push(' ');
        }
        flattened.push_str(word);
    }
    flattened
}
116
/// Cut `text` down to at most `nbytes` bytes of UTF-8, then trim whitespace.
///
/// The cut never lands inside a multi-byte character: when byte `nbytes`
/// falls in the middle of one, that character is left out completely.
/// Leading and trailing whitespace is removed from whatever prefix remains,
/// including when no truncation was necessary.
pub fn text_trim(text: &str, nbytes: usize) -> String {
    // Start at the requested limit (capped to the text length) and walk back
    // to the nearest character boundary. Offset 0 is always a boundary, so
    // the loop cannot underflow.
    let mut end = nbytes.min(text.len());
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    text[..end].trim().to_owned()
}
133
/// Reduce `text` to a simplified form intended for similarity hashing.
///
/// The input is NFD-normalized and lowercased; whitespace and every character
/// in a Unicode "C", "M", or "P" category are then removed, and what remains
/// is NFKC-normalized.
#[cfg(feature = "text-processing")]
pub fn text_collapse(text: &str) -> String {
    // Lowercasing is done on the whole decomposed string rather than
    // character by character.
    let lowered = text.nfd().collect::<String>().to_lowercase();

    lowered
        .chars()
        // Discard whitespace together with control/mark/punctuation characters.
        .filter(|ch| !(ch.is_whitespace() || is_cmp_category(*ch)))
        .nfkc()
        .collect()
}
153
154/// Compute a BLAKE3 hash with multihash prefix.
155///
156/// Returns a hex-encoded string with the BLAKE3 multicodec prefix (0x1e)
157/// and digest length (0x20 = 32 bytes).
158pub(crate) fn multi_hash_blake3(data: &[u8]) -> String {
159    let digest = blake3::hash(data);
160    let mut result = Vec::with_capacity(34);
161    result.push(0x1e); // BLAKE3 multicodec
162    result.push(0x20); // 32 bytes length
163    result.extend_from_slice(digest.as_bytes());
164    hex::encode(result)
165}
166
#[cfg(test)]
mod tests {
    //! Unit tests for the text utilities and the BLAKE3 multihash helper.
    //!
    //! Tests for `text_clean` and `text_collapse` are only compiled when the
    //! `text-processing` feature is enabled, matching the functions they cover.

    use super::*;

    // ---- text_clean tests ----

    /// ℍ (U+210D) is expected to become a plain `H` under NFKC.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_nfkc_normalization() {
        let cleaned = text_clean("ℍ");
        assert!(cleaned.contains('H'));
    }

    /// A tab is a control character and is removed without a replacement.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_removes_control_chars() {
        let cleaned = text_clean("hello\tworld");
        assert_eq!(cleaned, "helloworld");
    }

    /// A single line feed between words is left in place.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_preserves_newlines() {
        let cleaned = text_clean("hello\nworld");
        assert_eq!(cleaned, "hello\nworld");
    }

    /// Two consecutive empty lines shrink to one.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_collapses_empty_lines() {
        let cleaned = text_clean("a\n\n\nb");
        assert_eq!(cleaned, "a\n\nb");
    }

    /// Surrounding spaces are trimmed.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_strips_whitespace() {
        let cleaned = text_clean("  hello  ");
        assert_eq!(cleaned, "hello");
    }

    /// A `\r\n` pair produces exactly one `\n`.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_handles_crlf() {
        let cleaned = text_clean("a\r\nb");
        assert_eq!(cleaned, "a\nb");
    }

    /// Empty input stays empty.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_clean_empty() {
        let cleaned = text_clean("");
        assert_eq!(cleaned, "");
    }

    // ---- text_remove_newlines tests ----

    /// A line feed between words turns into a single space.
    #[test]
    fn test_text_remove_newlines() {
        let flattened = text_remove_newlines("hello\nworld");
        assert_eq!(flattened, "hello world");
    }

    /// Runs of spaces shrink to one space each.
    #[test]
    fn test_text_remove_newlines_collapses_spaces() {
        let flattened = text_remove_newlines("a  b   c");
        assert_eq!(flattened, "a b c");
    }

    // ---- text_trim tests ----

    /// A limit above the text length leaves the text intact.
    #[test]
    fn test_text_trim_no_truncation() {
        let trimmed = text_trim("hello", 10);
        assert_eq!(trimmed, "hello");
    }

    /// A limit equal to the text length leaves the text intact.
    #[test]
    fn test_text_trim_exact() {
        let trimmed = text_trim("hello", 5);
        assert_eq!(trimmed, "hello");
    }

    /// Bytes beyond the limit are cut off.
    #[test]
    fn test_text_trim_truncates() {
        let trimmed = text_trim("hello world", 5);
        assert_eq!(trimmed, "hello");
    }

    /// "é" occupies two bytes (C3 A9), so a one-byte limit must drop it whole.
    #[test]
    fn test_text_trim_unicode_boundary() {
        let trimmed = text_trim("é", 1);
        assert_eq!(trimmed, "");
    }

    /// Trailing whitespace is stripped even when nothing is truncated.
    #[test]
    fn test_text_trim_strips() {
        let trimmed = text_trim("hello ", 6);
        assert_eq!(trimmed, "hello");
    }

    // ---- text_collapse tests ----

    /// Case is folded and the space between words is removed.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_basic() {
        let collapsed = text_collapse("Hello World");
        assert_eq!(collapsed, "helloworld");
    }

    /// NFD splits the accent off as a mark, which is then filtered out.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_strips_accents() {
        let collapsed = text_collapse("café");
        assert_eq!(collapsed, "cafe");
    }

    /// Punctuation and whitespace are both removed.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_strips_punctuation() {
        let collapsed = text_collapse("hello, world!");
        assert_eq!(collapsed, "helloworld");
    }

    /// Empty input stays empty.
    #[cfg(feature = "text-processing")]
    #[test]
    fn test_text_collapse_empty() {
        let collapsed = text_collapse("");
        assert_eq!(collapsed, "");
    }

    // ---- multi_hash_blake3 tests ----

    /// Expected multihash for empty input.
    #[test]
    fn test_multi_hash_blake3_empty() {
        const EXPECTED: &str =
            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262";
        assert_eq!(multi_hash_blake3(b""), EXPECTED);
    }

    /// Expected multihash for the bytes of "hello world".
    #[test]
    fn test_multi_hash_blake3_hello_world() {
        const EXPECTED: &str =
            "1e20d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24";
        assert_eq!(multi_hash_blake3(b"hello world"), EXPECTED);
    }
}