infiniloom_engine/embedding/
normalizer.rs

1//! Content normalization for deterministic, cross-platform hashing
2//!
3//! This module ensures that the same code produces the same hash regardless of:
4//! - Operating system (Windows CRLF, Unix LF, old Mac CR)
5//! - Unicode representation (macOS NFD vs Linux NFC)
6//! - Trailing whitespace differences
7//! - Leading/trailing blank lines
8//!
9//! # Normalization Steps
10//!
11//! 1. **Unicode NFC normalization**: Converts decomposed characters (NFD) to composed form (NFC)
12//!    - Example: "é" (e + combining accent) becomes "é" (single character)
13//!    - Critical for macOS which often produces NFD in file names and content
14//!
15//! 2. **Line ending normalization**: CRLF (Windows) and CR (old Mac) → LF (Unix)
16//!    - Ensures cross-platform consistency
17//!
//! 3. **Trailing whitespace removal**: Strips trailing whitespace (spaces, tabs, and any other Unicode whitespace) from line ends
19//!    - Editors often differ in trailing whitespace handling
20//!
21//! 4. **Blank line trimming**: Removes leading and trailing blank lines
22//!    - Keeps internal blank lines (they're semantically meaningful)
23//!
24//! 5. **Indentation preservation**: Internal indentation is kept intact
25//!    - Critical for Python and other indentation-sensitive languages
26
27use unicode_normalization::UnicodeNormalization;
28
29/// Normalize content for deterministic, cross-platform hashing
30///
31/// # Guarantees
32///
33/// - Identical output on Windows, Linux, macOS
34/// - Same code with different line endings produces same output
35/// - Unicode-safe: NFD and NFC representations produce same output
36/// - Preserves semantic structure (internal indentation, blank lines)
37///
38/// # Example
39///
40/// ```
41/// use infiniloom_engine::embedding::normalize_for_hash;
42///
43/// let unix = "fn foo() {\n    bar();\n}";
44/// let windows = "fn foo() {\r\n    bar();\r\n}";
45///
46/// assert_eq!(normalize_for_hash(unix), normalize_for_hash(windows));
47/// ```
48pub fn normalize_for_hash(content: &str) -> String {
49    // Step 1: Unicode NFC normalization
50    // This ensures "café" (NFD: e + combining accent) equals "café" (NFC: single char)
51    let unicode_normalized: String = content.nfc().collect();
52
53    // Step 2: Normalize line endings (optimize for common case - no \r)
54    let line_normalized = if unicode_normalized.contains('\r') {
55        unicode_normalized.replace("\r\n", "\n").replace('\r', "\n")
56    } else {
57        unicode_normalized
58    };
59
60    // Step 3: Process lines - trim trailing whitespace only
61    let lines: Vec<&str> = line_normalized
62        .lines()
63        .map(|line| line.trim_end()) // Remove trailing whitespace only
64        .collect();
65
66    // Step 4: Remove leading blank lines
67    let start = lines.iter().position(|l| !l.is_empty()).unwrap_or(0);
68
69    // Step 5: Remove trailing blank lines
70    let end = lines
71        .iter()
72        .rposition(|l| !l.is_empty())
73        .map(|i| i + 1)
74        .unwrap_or(0);
75
76    // Handle empty content
77    if start >= end {
78        return String::new();
79    }
80
81    // Join the trimmed lines with LF
82    lines[start..end].join("\n")
83}
84
/// Cheap pre-check for whether content could change under normalization.
///
/// Returns `true` when any of the following is detected:
///
/// - a non-ASCII byte (the text might not be in NFC form),
/// - a carriage return (`\r\n` or lone `\r` line endings),
/// - a leading or trailing `\n` (leading/trailing blank lines, or a final newline),
/// - a line that ends in whitespace (this also covers whitespace-only lines).
///
/// This is a heuristic that errs on the side of `true`: for example, any
/// non-ASCII text is reported even if it is already composed. A `false`
/// result means none of the conditions above were found.
///
/// Intended as an early-exit optimization when processing many files; no
/// allocation is performed.
#[inline]
pub fn needs_normalization(content: &str) -> bool {
    // Any non-ASCII byte could belong to a sequence that NFC would rewrite;
    // any CR means the line endings are not plain LF.
    if !content.is_ascii() || content.contains('\r') {
        return true;
    }

    // A newline at either edge means a leading blank line or a trailing
    // newline/blank line. `str::lines` does not report a final terminator,
    // so the edges are checked on the raw string.
    if content.starts_with('\n') || content.ends_with('\n') {
        return true;
    }

    // Trailing whitespace on any line. With the edge checks above there is no
    // need to inspect the first/last line for emptiness separately.
    content
        .lines()
        .any(|line| line.ends_with(char::is_whitespace))
}
136
137/// Normalize a single line (without line ending changes)
138///
139/// Useful for processing content line by line.
140#[inline]
141pub fn normalize_line(line: &str) -> String {
142    line.nfc().collect::<String>().trim_end().to_string()
143}
144
145/// Check if content is already normalized
146///
147/// Returns `true` if `normalize_for_hash(content) == content`.
148/// More expensive than `needs_normalization` but more accurate.
149pub fn is_normalized(content: &str) -> bool {
150    normalize_for_hash(content) == content
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// Decomposed and precomposed spellings normalize to the same text.
    #[test]
    fn test_unicode_nfc() {
        let decomposed = "cafe\u{0301}"; // 'e' followed by a combining acute accent
        let precomposed = "caf\u{00E9}"; // single precomposed character

        assert_eq!(normalize_for_hash(decomposed), normalize_for_hash(precomposed));
    }

    /// LF, CRLF, CR and trailing-space variants of one snippet all agree.
    #[test]
    fn test_cross_platform_line_endings() {
        let expected = normalize_for_hash("fn foo() {\n    bar();\n}");

        let variants = [
            "fn foo() {\r\n    bar();\r\n}",   // Windows
            "fn foo() {\r    bar();\r}",       // classic Mac
            "fn foo() {   \n    bar();   \n}", // trailing whitespace
        ];
        for variant in variants {
            assert_eq!(normalize_for_hash(variant), expected);
        }
    }

    /// Leading whitespace inside the content survives normalization.
    #[test]
    fn test_preserves_indentation() {
        let normalized = normalize_for_hash("def foo():\n    if True:\n        return 1");

        for fragment in ["    if True:", "        return"] {
            assert!(normalized.contains(fragment));
        }
    }

    /// Blank lines before the first non-blank line are dropped.
    #[test]
    fn test_removes_leading_blank_lines() {
        let padded = normalize_for_hash("\n\n\nfn foo() {}");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines after the last non-blank line are dropped.
    #[test]
    fn test_removes_trailing_blank_lines() {
        let padded = normalize_for_hash("fn foo() {}\n\n\n");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// A blank line between two statements is kept.
    #[test]
    fn test_preserves_internal_blank_lines() {
        let normalized = normalize_for_hash("fn foo() {\n    let x = 1;\n\n    let y = 2;\n}");

        assert!(normalized.contains("\n\n"));
    }

    /// Inputs without any non-blank line collapse to the empty string.
    #[test]
    fn test_empty_content() {
        for blank in ["", "\n\n\n", "   \n   \n   "] {
            assert_eq!(normalize_for_hash(blank), "");
        }
    }

    /// A clean single line passes through unchanged.
    #[test]
    fn test_single_line() {
        let source = "fn foo() {}";
        assert_eq!(normalize_for_hash(source), source);
    }

    /// Trailing spaces on a single line are removed.
    #[test]
    fn test_single_line_with_trailing_whitespace() {
        assert_eq!(normalize_for_hash("fn foo() {}   "), "fn foo() {}");
    }

    /// The quick check flags each kind of non-canonical input and accepts clean input.
    #[test]
    fn test_needs_normalization() {
        let flagged = [
            "fn foo() {\r\n}", // carriage return
            "café",            // non-ASCII
            "foo   \nbar",     // trailing whitespace
            "\nfoo",           // leading blank line
            "foo\n",           // trailing newline
        ];
        for sample in flagged {
            assert!(needs_normalization(sample));
        }

        // Already in canonical form
        assert!(!needs_normalization("fn foo() {\n    bar();\n}"));
    }

    /// `is_normalized` agrees with what normalization would do.
    #[test]
    fn test_is_normalized() {
        assert!(is_normalized("fn foo() {\n    bar();\n}"));

        for dirty in ["fn foo() {\r\n    bar();\r\n}", "fn foo() {}   "] {
            assert!(!is_normalized(dirty));
        }
    }

    /// Per-line normalization trims the right side only and composes Unicode.
    #[test]
    fn test_normalize_line() {
        let cases = [
            ("foo   ", "foo"),
            ("  foo  ", "  foo"),
            ("cafe\u{0301}", "caf\u{00E9}"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_line(input), expected);
        }
    }

    /// CRLF, LF and CR mixed in one input all become LF.
    #[test]
    fn test_mixed_line_endings() {
        let normalized = normalize_for_hash("line1\r\nline2\nline3\rline4");

        assert!(!normalized.contains('\r'));
        assert!(normalized.contains("line1\nline2\nline3\nline4"));
    }

    /// Tab indentation is not touched.
    #[test]
    fn test_tabs_preserved() {
        let normalized = normalize_for_hash("fn foo() {\n\tbar();\n}");

        assert!(normalized.contains('\t'));
    }

    /// Non-ASCII identifiers remain present in the output.
    #[test]
    fn test_unicode_identifiers() {
        let normalized = normalize_for_hash("let αβγ = 42;");
        assert!(normalized.contains("αβγ"));
    }

    /// Emoji in comments remain present in the output.
    #[test]
    fn test_emoji_preserved() {
        let normalized = normalize_for_hash("// 🎉 Success!\nfn celebrate() {}");
        assert!(normalized.contains("🎉"));
    }

    /// Repeated calls on the same input give the same output.
    #[test]
    fn test_deterministic_multiple_calls() {
        let content = "fn foo() {\r\n    bar();   \r\n}";

        let runs: Vec<String> = (0..3).map(|_| normalize_for_hash(content)).collect();

        assert_eq!(runs[0], runs[1]);
        assert_eq!(runs[1], runs[2]);
    }

    /// Normalizing already-normalized output changes nothing.
    #[test]
    fn test_idempotent() {
        let first_pass = normalize_for_hash("fn foo() {\r\n    bar();   \r\n}");
        let second_pass = normalize_for_hash(&first_pass);
        let third_pass = normalize_for_hash(&second_pass);

        assert_eq!(first_pass, second_pass);
        assert_eq!(second_pass, third_pass);
    }
}