Skip to main content

iscc_lib/
utils.rs

1//! Text normalization and hashing utilities for ISCC code generation.
2//!
3//! Provides text cleaning, trimming, collapsing, and BLAKE3 multihash functions
4//! ported from `iscc-core` `code_meta.py` and `utils.py`.
5
6use unicode_general_category::{GeneralCategory, get_general_category};
7use unicode_normalization::UnicodeNormalization;
8
/// Newline-like characters that `text_clean` keeps (as `\n`) even though
/// they would otherwise be dropped together with the control characters.
const NEWLINES: &[char] = &[
    '\n',       // U+000A LINE FEED
    '\x0B',     // U+000B VERTICAL TAB
    '\x0C',     // U+000C FORM FEED
    '\r',       // U+000D CARRIAGE RETURN
    '\u{85}',   // U+0085 NEXT LINE
    '\u{2028}', // U+2028 LINE SEPARATOR
    '\u{2029}', // U+2029 PARAGRAPH SEPARATOR
];
19
20/// Check if a character belongs to a Unicode "C" (control/format/etc) category.
21fn is_c_category(c: char) -> bool {
22    matches!(
23        get_general_category(c),
24        GeneralCategory::Control
25            | GeneralCategory::Format
26            | GeneralCategory::Unassigned
27            | GeneralCategory::PrivateUse
28            | GeneralCategory::Surrogate
29    )
30}
31
32/// Check if a character belongs to Unicode "C", "M", or "P" categories.
33fn is_cmp_category(c: char) -> bool {
34    matches!(
35        get_general_category(c),
36        // C: Control categories
37        GeneralCategory::Control
38            | GeneralCategory::Format
39            | GeneralCategory::Unassigned
40            | GeneralCategory::PrivateUse
41            | GeneralCategory::Surrogate
42            // M: Mark categories
43            | GeneralCategory::NonspacingMark
44            | GeneralCategory::SpacingMark
45            | GeneralCategory::EnclosingMark
46            // P: Punctuation categories
47            | GeneralCategory::ConnectorPunctuation
48            | GeneralCategory::DashPunctuation
49            | GeneralCategory::OpenPunctuation
50            | GeneralCategory::ClosePunctuation
51            | GeneralCategory::InitialPunctuation
52            | GeneralCategory::FinalPunctuation
53            | GeneralCategory::OtherPunctuation
54    )
55}
56
57/// Clean and normalize text for display.
58///
59/// Applies NFKC normalization, removes control characters (except newlines),
60/// normalizes `\r\n` to `\n`, collapses consecutive empty lines to at most
61/// one, and strips leading/trailing whitespace.
62pub fn text_clean(text: &str) -> String {
63    // 1. NFKC normalize
64    let text: String = text.nfkc().collect();
65
66    // 2. Remove control chars except newlines, normalizing all newlines to \n
67    let mut cleaned = String::with_capacity(text.len());
68    let mut chars = text.chars().peekable();
69    while let Some(c) = chars.next() {
70        if NEWLINES.contains(&c) {
71            // Handle \r\n as a single newline
72            if c == '\r' && chars.peek() == Some(&'\n') {
73                chars.next();
74            }
75            cleaned.push('\n');
76        } else if is_c_category(c) {
77            // Skip control characters
78        } else {
79            cleaned.push(c);
80        }
81    }
82
83    // 3. Split on \n, collapse consecutive empty/whitespace-only lines
84    let mut result_lines: Vec<&str> = Vec::new();
85    let mut prev_empty = false;
86    for line in cleaned.split('\n') {
87        let is_empty = line.trim().is_empty();
88        if is_empty {
89            if prev_empty {
90                continue;
91            }
92            prev_empty = true;
93        } else {
94            prev_empty = false;
95        }
96        result_lines.push(line);
97    }
98
99    // 4. Join with \n and strip leading/trailing whitespace
100    result_lines.join("\n").trim().to_string()
101}
102
/// Flatten text onto a single line.
///
/// Every run of whitespace (including newlines) becomes exactly one space,
/// and leading/trailing whitespace is dropped.
pub fn text_remove_newlines(text: &str) -> String {
    let mut joined = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes.
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(word);
    }
    joined
}
110
/// Limit text to at most `nbytes` bytes of UTF-8, then trim whitespace.
///
/// The cut is placed on the last character boundary at or before `nbytes`,
/// so a multi-byte character straddling the limit is dropped as a whole.
/// Leading and trailing whitespace is stripped from what remains.
pub fn text_trim(text: &str, nbytes: usize) -> String {
    // Step back from the byte limit to the nearest character boundary.
    // Offset 0 and `text.len()` are always boundaries, so this terminates.
    let mut end = nbytes.min(text.len());
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    text[..end].trim().to_string()
}
127
128/// Normalize and simplify text for similarity hashing.
129///
130/// Applies NFD normalization, lowercasing, removes whitespace and characters
131/// in Unicode categories C (control), M (mark), and P (punctuation), then
132/// recombines with NFKC normalization.
133pub fn text_collapse(text: &str) -> String {
134    // 1. NFD normalize and lowercase
135    let nfd_lower: String = text.nfd().collect::<String>().to_lowercase();
136
137    // 2. Filter: keep chars that are NOT whitespace AND NOT in C/M/P categories
138    let filtered: String = nfd_lower
139        .chars()
140        .filter(|&c| !c.is_whitespace() && !is_cmp_category(c))
141        .collect();
142
143    // 3. NFKC normalize the filtered result
144    filtered.nfkc().collect()
145}
146
147/// Compute a BLAKE3 hash with multihash prefix.
148///
149/// Returns a hex-encoded string with the BLAKE3 multicodec prefix (0x1e)
150/// and digest length (0x20 = 32 bytes).
151pub(crate) fn multi_hash_blake3(data: &[u8]) -> String {
152    let digest = blake3::hash(data);
153    let mut result = Vec::with_capacity(34);
154    result.push(0x1e); // BLAKE3 multicodec
155    result.push(0x20); // 32 bytes length
156    result.extend_from_slice(digest.as_bytes());
157    hex::encode(result)
158}
159
/// Unit tests for the text normalization and hashing helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- text_clean tests ----

    #[test]
    fn test_text_clean_nfkc_normalization() {
        // ℍ (U+210D) normalizes to plain "H" under NFKC. Compare the whole
        // output so a surviving un-normalized character cannot slip through.
        assert_eq!(text_clean("ℍ"), "H");
    }

    #[test]
    fn test_text_clean_removes_control_chars() {
        // TAB is a category "C" character and is not listed in NEWLINES.
        assert_eq!(text_clean("hello\tworld"), "helloworld");
    }

    #[test]
    fn test_text_clean_preserves_newlines() {
        assert_eq!(text_clean("hello\nworld"), "hello\nworld");
    }

    #[test]
    fn test_text_clean_collapses_empty_lines() {
        assert_eq!(text_clean("a\n\n\nb"), "a\n\nb");
    }

    #[test]
    fn test_text_clean_strips_whitespace() {
        assert_eq!(text_clean("  hello  "), "hello");
    }

    #[test]
    fn test_text_clean_handles_crlf() {
        assert_eq!(text_clean("a\r\nb"), "a\nb");
    }

    #[test]
    fn test_text_clean_handles_lone_cr() {
        // A carriage return without a following line feed is still a newline.
        assert_eq!(text_clean("a\rb"), "a\nb");
    }

    #[test]
    fn test_text_clean_empty() {
        assert_eq!(text_clean(""), "");
    }

    // ---- text_remove_newlines tests ----

    #[test]
    fn test_text_remove_newlines() {
        assert_eq!(text_remove_newlines("hello\nworld"), "hello world");
    }

    #[test]
    fn test_text_remove_newlines_collapses_spaces() {
        assert_eq!(text_remove_newlines("a  b   c"), "a b c");
    }

    #[test]
    fn test_text_remove_newlines_empty() {
        assert_eq!(text_remove_newlines(""), "");
    }

    // ---- text_trim tests ----

    #[test]
    fn test_text_trim_no_truncation() {
        assert_eq!(text_trim("hello", 10), "hello");
    }

    #[test]
    fn test_text_trim_exact() {
        assert_eq!(text_trim("hello", 5), "hello");
    }

    #[test]
    fn test_text_trim_truncates() {
        assert_eq!(text_trim("hello world", 5), "hello");
    }

    #[test]
    fn test_text_trim_unicode_boundary() {
        // "é" is 2 bytes in UTF-8 (C3 A9). Truncating at 1 byte should drop it.
        assert_eq!(text_trim("é", 1), "");
    }

    #[test]
    fn test_text_trim_unicode_boundary_keeps_valid_prefix() {
        // "aé" is 3 bytes; a 2-byte limit splits "é", so only "a" remains.
        assert_eq!(text_trim("aé", 2), "a");
    }

    #[test]
    fn test_text_trim_strips() {
        assert_eq!(text_trim("hello ", 6), "hello");
    }

    // ---- text_collapse tests ----

    #[test]
    fn test_text_collapse_basic() {
        assert_eq!(text_collapse("Hello World"), "helloworld");
    }

    #[test]
    fn test_text_collapse_strips_accents() {
        // NFD decomposes accented chars, then M-category marks are filtered
        assert_eq!(text_collapse("café"), "cafe");
    }

    #[test]
    fn test_text_collapse_strips_punctuation() {
        assert_eq!(text_collapse("hello, world!"), "helloworld");
    }

    #[test]
    fn test_text_collapse_empty() {
        assert_eq!(text_collapse(""), "");
    }

    // ---- multi_hash_blake3 tests ----

    #[test]
    fn test_multi_hash_blake3_empty() {
        assert_eq!(
            multi_hash_blake3(b""),
            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
        );
    }

    #[test]
    fn test_multi_hash_blake3_hello_world() {
        assert_eq!(
            multi_hash_blake3(b"hello world"),
            "1e20d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"
        );
    }
}