ucm_core/normalize.rs

//! Content normalization for deterministic hashing.
//!
//! Normalization ensures that semantically equivalent content produces
//! identical hashes, regardless of superficial differences like whitespace
//! or Unicode representation.

use crate::content::{Cell, Code, Column, Content, Math, Media, MediaSource, Row, Table, Text};
use unicode_normalization::UnicodeNormalization;

/// Normalization configuration
#[derive(Debug, Clone, Copy, Default)]
pub struct NormalizationConfig {
    /// Unicode normalization form
    pub unicode_form: UnicodeForm,
    /// Whitespace handling
    pub whitespace: WhitespaceNorm,
    /// Line ending normalization
    pub line_endings: LineEndingNorm,
}

/// Unicode normalization form (per TR15)
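///
/// # Example
///
/// A minimal sketch of composition under the default `NFC` form, assuming the
/// same `ucm_core` crate paths used in this module's other doc examples:
/// ```
/// use ucm_core::normalize::{normalize_text, NormalizationConfig};
///
/// // "e" followed by U+0301 (combining acute accent) composes to U+00E9 under NFC.
/// let normalized = normalize_text("e\u{0301}", NormalizationConfig::default());
/// assert_eq!(normalized, "\u{00E9}");
/// ```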
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum UnicodeForm {
    /// Canonical Decomposition, followed by Canonical Composition (default)
    #[default]
    NFC,
    /// Canonical Decomposition
    NFD,
    /// Compatibility Decomposition, followed by Canonical Composition
    NFKC,
    /// Compatibility Decomposition
    NFKD,
}

/// Whitespace normalization strategy
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum WhitespaceNorm {
    /// Collapse runs of whitespace to single space (default for text)
    #[default]
    Collapse,
    /// Preserve whitespace exactly (for code)
    Preserve,
    /// Trim leading/trailing only
    Trim,
}

/// Line ending normalization
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum LineEndingNorm {
    /// Unix-style \n (default)
    #[default]
    LF,
    /// Windows-style \r\n
    CRLF,
    /// Preserve original
    Preserve,
}

/// Normalize content for hashing.
///
/// This function produces a canonical string representation of any content
/// type that can be used for deterministic ID generation.
///
/// # Example
/// ```
/// use ucm_core::content::{Content, TextFormat};
/// use ucm_core::normalize::normalize_content;
///
/// let content = Content::Text(ucm_core::content::Text {
///     text: "  Hello   World  ".to_string(),
///     format: TextFormat::Plain,
/// });
///
/// let normalized = normalize_content(&content);
/// assert_eq!(normalized, "Hello World");
/// ```
pub fn normalize_content(content: &Content) -> String {
    match content {
        Content::Text(text) => normalize_text_content(text),
        Content::Code(code) => normalize_code_content(code),
        Content::Table(table) => normalize_table_content(table),
        Content::Math(math) => normalize_math_content(math),
        Content::Media(media) => normalize_media_content(media),
        Content::Json { value, .. } => canonical_json(value),
        Content::Binary {
            data, mime_type, ..
        } => {
            format!("{}:{}", mime_type, hex::encode(sha256_hash(data)))
        }
        Content::Composite { layout, children } => {
            let children_str: Vec<String> = children.iter().map(|id| id.to_string()).collect();
            format!("{:?}:[{}]", layout, children_str.join(","))
        }
    }
}

fn normalize_text_content(text: &Text) -> String {
    normalize_text(
        &text.text,
        NormalizationConfig {
            whitespace: WhitespaceNorm::Collapse,
            ..Default::default()
        },
    )
}

fn normalize_code_content(code: &Code) -> String {
    // Code preserves whitespace but normalizes line endings
    let config = NormalizationConfig {
        whitespace: WhitespaceNorm::Preserve,
        line_endings: LineEndingNorm::LF,
        ..Default::default()
    };
    format!(
        "{}:{}",
        code.language.to_lowercase(),
        normalize_text(&code.source, config)
    )
}

fn normalize_table_content(table: &Table) -> String {
    let columns: Vec<String> = table.columns.iter().map(normalize_column).collect();

    let rows: Vec<String> = table.rows.iter().map(normalize_row).collect();

    format!("columns:[{}],rows:[{}]", columns.join(","), rows.join(","))
}

fn normalize_column(column: &Column) -> String {
    let name = normalize_text(&column.name, NormalizationConfig::default());
    match &column.data_type {
        Some(dt) => format!("{}:{:?}", name, dt),
        None => name,
    }
}

fn normalize_row(row: &Row) -> String {
    let cells: Vec<String> = row.cells.iter().map(normalize_cell).collect();
    format!("[{}]", cells.join(","))
}

fn normalize_cell(cell: &Cell) -> String {
    match cell {
        Cell::Null => "null".to_string(),
        Cell::Text(s) => format!("\"{}\"", normalize_text(s, NormalizationConfig::default())),
        Cell::Number(n) => {
            // Normalize floating point representation
            if n.fract() == 0.0 {
                format!("{:.0}", n)
            } else {
                format!("{}", n)
            }
        }
        Cell::Boolean(b) => b.to_string(),
        Cell::Date(s) => format!("d:{}", s),
        Cell::DateTime(s) => format!("dt:{}", s),
        Cell::Json(v) => canonical_json(v),
    }
}

fn normalize_math_content(math: &Math) -> String {
    let normalized_expr = normalize_text(
        &math.expression,
        NormalizationConfig {
            whitespace: WhitespaceNorm::Collapse,
            ..Default::default()
        },
    );
    format!("{:?}:{}", math.format, normalized_expr)
}

fn normalize_media_content(media: &Media) -> String {
    let source = match &media.source {
        MediaSource::Url(url) => format!("url:{}", url),
        MediaSource::Base64(data) => format!("b64:{}", &data[..data.len().min(32)]),
        MediaSource::Reference(id) => format!("ref:{}", id),
        MediaSource::External(ext) => {
            format!("ext:{}:{}:{}", ext.provider, ext.bucket, ext.key)
        }
    };

    match &media.content_hash {
        Some(hash) => format!(
            "{:?}:{}:hash:{}",
            media.media_type,
            source,
            hex::encode(hash)
        ),
        None => format!("{:?}:{}", media.media_type, source),
    }
}

/// Normalize a text string according to configuration.
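///
/// # Example
///
/// A small sketch of the line-ending and whitespace steps, assuming the same
/// `ucm_core` crate paths used in the `normalize_content` example above:
/// ```
/// use ucm_core::normalize::{
///     normalize_text, LineEndingNorm, NormalizationConfig, WhitespaceNorm,
/// };
///
/// let config = NormalizationConfig {
///     whitespace: WhitespaceNorm::Preserve,
///     line_endings: LineEndingNorm::LF,
///     ..Default::default()
/// };
///
/// // CRLF and lone CR both become LF; whitespace is otherwise untouched.
/// assert_eq!(normalize_text("a\r\nb\rc", config), "a\nb\nc");
/// ```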
pub fn normalize_text(text: &str, config: NormalizationConfig) -> String {
    // Step 1: Unicode normalization
    let unicode_normalized = match config.unicode_form {
        UnicodeForm::NFC => text.nfc().collect::<String>(),
        UnicodeForm::NFD => text.nfd().collect::<String>(),
        UnicodeForm::NFKC => text.nfkc().collect::<String>(),
        UnicodeForm::NFKD => text.nfkd().collect::<String>(),
    };

    // Step 2: Line ending normalization
    let line_normalized = match config.line_endings {
        LineEndingNorm::LF => unicode_normalized.replace("\r\n", "\n").replace('\r', "\n"),
        LineEndingNorm::CRLF => unicode_normalized
            .replace("\r\n", "\n")
            .replace('\r', "\n")
            .replace('\n', "\r\n"),
        LineEndingNorm::Preserve => unicode_normalized,
    };

    // Step 3: Whitespace normalization
    match config.whitespace {
        WhitespaceNorm::Collapse => line_normalized
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" "),
        WhitespaceNorm::Trim => line_normalized.trim().to_string(),
        WhitespaceNorm::Preserve => line_normalized,
    }
}

/// Canonical JSON serialization (RFC 8785).
///
/// - Object keys sorted lexicographically
/// - No whitespace
/// - Numbers in canonical form
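///
/// # Example
///
/// A minimal illustration of key ordering and compact output, assuming
/// `serde_json` is available to doctests as it is to the unit tests below:
/// ```
/// use ucm_core::normalize::canonical_json;
///
/// let value = serde_json::json!({"b": [1.0, 2], "a": null});
/// // Keys are sorted lexicographically and insignificant whitespace is removed.
/// assert_eq!(canonical_json(&value), r#"{"a":null,"b":[1,2]}"#);
/// ```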
pub fn canonical_json(value: &serde_json::Value) -> String {
    match value {
        serde_json::Value::Object(map) => {
            let mut pairs: Vec<_> = map.iter().collect();
            pairs.sort_by(|a, b| a.0.cmp(b.0));
            let inner: Vec<String> = pairs
                .iter()
                .map(|(k, v)| format!("\"{}\":{}", escape_json_string(k), canonical_json(v)))
                .collect();
            format!("{{{}}}", inner.join(","))
        }
        serde_json::Value::Array(arr) => {
            let inner: Vec<String> = arr.iter().map(canonical_json).collect();
            format!("[{}]", inner.join(","))
        }
        serde_json::Value::String(s) => format!("\"{}\"", escape_json_string(s)),
        serde_json::Value::Number(n) => {
            // Canonical number representation
            if let Some(i) = n.as_i64() {
                i.to_string()
            } else if let Some(f) = n.as_f64() {
                if f.fract() == 0.0 && f.abs() < 1e15 {
                    format!("{:.0}", f)
                } else {
                    format!("{}", f)
                }
            } else {
                n.to_string()
            }
        }
        serde_json::Value::Bool(b) => b.to_string(),
        serde_json::Value::Null => "null".to_string(),
    }
}

/// Escape a string for JSON output
fn escape_json_string(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '"' => result.push_str("\\\""),
            '\\' => result.push_str("\\\\"),
            '\n' => result.push_str("\\n"),
            '\r' => result.push_str("\\r"),
            '\t' => result.push_str("\\t"),
            c if c.is_control() => {
                result.push_str(&format!("\\u{:04x}", c as u32));
            }
            c => result.push(c),
        }
    }
    result
}

/// Compute SHA256 hash of data
fn sha256_hash(data: &[u8]) -> [u8; 32] {
    use sha2::{Digest, Sha256};
    let mut hasher = Sha256::new();
    hasher.update(data);
    let result = hasher.finalize();
    let mut hash = [0u8; 32];
    hash.copy_from_slice(&result);
    hash
}

/// Check if a character is CJK (Chinese, Japanese, Korean)
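///
/// # Example
///
/// A quick illustration, assuming the same `ucm_core` crate path as above:
/// ```
/// use ucm_core::normalize::is_cjk_character;
///
/// assert!(is_cjk_character('中'));  // CJK Unified Ideograph
/// assert!(!is_cjk_character('a')); // ASCII is outside every CJK range
/// ```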
pub fn is_cjk_character(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}' |   // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}' |   // CJK Extension A
        '\u{F900}'..='\u{FAFF}' |   // CJK Compatibility Ideographs
        '\u{3040}'..='\u{309F}' |   // Hiragana
        '\u{30A0}'..='\u{30FF}' |   // Katakana
        '\u{AC00}'..='\u{D7AF}'     // Hangul Syllables
    )
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::content::TextFormat;

    #[test]
    fn test_normalize_text_whitespace() {
        let result = normalize_text("  hello   world  ", NormalizationConfig::default());
        assert_eq!(result, "hello world");
    }

    #[test]
    fn test_normalize_text_preserve() {
        let config = NormalizationConfig {
            whitespace: WhitespaceNorm::Preserve,
            ..Default::default()
        };
        let result = normalize_text("  hello   world  ", config);
        assert_eq!(result, "  hello   world  ");
    }

    #[test]
    fn test_normalize_line_endings() {
        let config = NormalizationConfig {
            line_endings: LineEndingNorm::LF,
            whitespace: WhitespaceNorm::Preserve,
            ..Default::default()
        };
        let result = normalize_text("line1\r\nline2\rline3", config);
        assert_eq!(result, "line1\nline2\nline3");
    }

    #[test]
    fn test_canonical_json_sorted_keys() {
        let json = serde_json::json!({"b": 1, "a": 2});
        let canonical = canonical_json(&json);
        assert_eq!(canonical, "{\"a\":2,\"b\":1}");
    }

    #[test]
    fn test_canonical_json_nested() {
        let json = serde_json::json!({"outer": {"b": 1, "a": 2}});
        let canonical = canonical_json(&json);
        assert_eq!(canonical, "{\"outer\":{\"a\":2,\"b\":1}}");
    }

    #[test]
    fn test_normalize_content_text() {
        let content = Content::Text(Text {
            text: "  Hello   World  ".to_string(),
            format: TextFormat::Plain,
        });
        let normalized = normalize_content(&content);
        assert_eq!(normalized, "Hello World");
    }

    #[test]
    fn test_normalize_content_code() {
        let content = Content::Code(Code {
            language: "Rust".to_string(),
            source: "fn main() {\n    println!(\"hello\");\n}".to_string(),
            highlights: vec![],
        });
        let normalized = normalize_content(&content);
        assert!(normalized.starts_with("rust:"));
    }

    #[test]
    fn test_is_cjk() {
        assert!(is_cjk_character('中'));
        assert!(is_cjk_character('あ'));
        assert!(is_cjk_character('한'));
        assert!(!is_cjk_character('a'));
        assert!(!is_cjk_character('1'));
380    }
381}