infiniloom_engine/embedding/
normalizer.rs

1//! Content normalization for deterministic, cross-platform hashing
2//!
3//! This module ensures that the same code produces the same hash regardless of:
4//! - Operating system (Windows CRLF, Unix LF, old Mac CR)
5//! - Unicode representation (macOS NFD vs Linux NFC)
6//! - Trailing whitespace differences
7//! - Leading/trailing blank lines
8//!
9//! # Normalization Steps
10//!
11//! 1. **Unicode NFC normalization**: Converts decomposed characters (NFD) to composed form (NFC)
12//!    - Example: "é" (e + combining accent) becomes "é" (single character)
13//!    - Critical for macOS which often produces NFD in file names and content
14//!
15//! 2. **Line ending normalization**: CRLF (Windows) and CR (old Mac) → LF (Unix)
16//!    - Ensures cross-platform consistency
17//!
//! 3. **Trailing whitespace removal**: Strips trailing whitespace from line ends
//!    (spaces and tabs, plus any other Unicode whitespace, since `str::trim_end` is used)
//!    - Editors often differ in trailing whitespace handling
20//!
21//! 4. **Blank line trimming**: Removes leading and trailing blank lines
22//!    - Keeps internal blank lines (they're semantically meaningful)
23//!
24//! 5. **Indentation preservation**: Internal indentation is kept intact
25//!    - Critical for Python and other indentation-sensitive languages
26
27use unicode_normalization::UnicodeNormalization;
28
29/// Normalize content for deterministic, cross-platform hashing
30///
31/// # Guarantees
32///
33/// - Identical output on Windows, Linux, macOS
34/// - Same code with different line endings produces same output
35/// - Unicode-safe: NFD and NFC representations produce same output
36/// - Preserves semantic structure (internal indentation, blank lines)
37///
38/// # Example
39///
40/// ```
41/// use infiniloom_engine::embedding::normalize_for_hash;
42///
43/// let unix = "fn foo() {\n    bar();\n}";
44/// let windows = "fn foo() {\r\n    bar();\r\n}";
45///
46/// assert_eq!(normalize_for_hash(unix), normalize_for_hash(windows));
47/// ```
48pub fn normalize_for_hash(content: &str) -> String {
49    // Step 1: Unicode NFC normalization
50    // This ensures "café" (NFD: e + combining accent) equals "café" (NFC: single char)
51    let unicode_normalized: String = content.nfc().collect();
52
53    // Step 2: Normalize line endings (optimize for common case - no \r)
54    let line_normalized = if unicode_normalized.contains('\r') {
55        unicode_normalized
56            .replace("\r\n", "\n")
57            .replace('\r', "\n")
58    } else {
59        unicode_normalized
60    };
61
62    // Step 3: Process lines - trim trailing whitespace only
63    let lines: Vec<&str> = line_normalized
64        .lines()
65        .map(|line| line.trim_end()) // Remove trailing whitespace only
66        .collect();
67
68    // Step 4: Remove leading blank lines
69    let start = lines.iter().position(|l| !l.is_empty()).unwrap_or(0);
70
71    // Step 5: Remove trailing blank lines
72    let end = lines
73        .iter()
74        .rposition(|l| !l.is_empty())
75        .map(|i| i + 1)
76        .unwrap_or(0);
77
78    // Handle empty content
79    if start >= end {
80        return String::new();
81    }
82
83    // Join the trimmed lines with LF
84    lines[start..end].join("\n")
85}
86
/// Fast check if content needs normalization
///
/// Returns `true` if the content might produce different hashes without normalization.
/// This is a quick heuristic check - it may return `true` for some content that
/// wouldn't actually change after normalization (for example, any non-ASCII text
/// is reported even when it is already in NFC).
///
/// The function reports `true` when any of the following holds:
///
/// - the content starts or ends with `\n` (leading blank line / trailing newline),
/// - the content contains a carriage return or any non-ASCII byte,
/// - some line carries trailing whitespace.
///
/// Use this for early-exit optimization when processing many files.
#[inline]
pub fn needs_normalization(content: &str) -> bool {
    // Constant-time boundary checks go first. They also cover every case in
    // which the first or last element yielded by `.lines()` would be empty,
    // so no separate pass over collected lines is required.
    if content.starts_with('\n') || content.ends_with('\n') {
        return true;
    }

    // Carriage returns mean CRLF / classic-Mac line endings; a non-ASCII
    // byte means the text could be in a decomposed Unicode form.
    if !content.is_ascii() || content.contains('\r') {
        return true;
    }

    // `trim_end` returns a prefix of the line, so comparing lengths is enough
    // to detect trailing whitespace (this also catches whitespace-only lines).
    content
        .lines()
        .any(|line| line.trim_end().len() != line.len())
}
138
139/// Normalize a single line (without line ending changes)
140///
141/// Useful for processing content line by line.
142#[inline]
143pub fn normalize_line(line: &str) -> String {
144    line.nfc().collect::<String>().trim_end().to_string()
145}
146
147/// Check if content is already normalized
148///
149/// Returns `true` if `normalize_for_hash(content) == content`.
150/// More expensive than `needs_normalization` but more accurate.
151pub fn is_normalized(content: &str) -> bool {
152    normalize_for_hash(content) == content
153}
154
#[cfg(test)]
mod tests {
    use super::*;

    /// Decomposed and precomposed spellings must normalize to the same text.
    #[test]
    fn test_unicode_nfc() {
        // "e" followed by a combining acute accent (NFD)
        let decomposed = "cafe\u{0301}";
        // precomposed "é" (NFC)
        let composed = "caf\u{00E9}";

        let from_decomposed = normalize_for_hash(decomposed);
        let from_composed = normalize_for_hash(composed);
        assert_eq!(from_decomposed, from_composed);
    }

    /// CRLF, classic-Mac CR and trailing spaces all collapse to the LF form.
    #[test]
    fn test_cross_platform_line_endings() {
        let expected = normalize_for_hash("fn foo() {\n    bar();\n}");

        let variants = [
            // Windows
            "fn foo() {\r\n    bar();\r\n}",
            // Classic Mac
            "fn foo() {\r    bar();\r}",
            // Trailing whitespace
            "fn foo() {   \n    bar();   \n}",
        ];

        for &variant in &variants {
            assert_eq!(normalize_for_hash(variant), expected);
        }
    }

    /// Leading indentation inside the content is left alone.
    #[test]
    fn test_preserves_indentation() {
        let out = normalize_for_hash("def foo():\n    if True:\n        return 1");

        for &needle in &["    if True:", "        return"] {
            assert!(out.contains(needle));
        }
    }

    /// Blank lines before the first real line are dropped.
    #[test]
    fn test_removes_leading_blank_lines() {
        let padded = normalize_for_hash("\n\n\nfn foo() {}");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines after the last real line are dropped.
    #[test]
    fn test_removes_trailing_blank_lines() {
        let padded = normalize_for_hash("fn foo() {}\n\n\n");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// A blank line between two statements survives normalization.
    #[test]
    fn test_preserves_internal_blank_lines() {
        let out = normalize_for_hash("fn foo() {\n    let x = 1;\n\n    let y = 2;\n}");

        assert!(out.contains("\n\n"));
    }

    /// Empty and whitespace-only inputs all normalize to the empty string.
    #[test]
    fn test_empty_content() {
        for &blank in &["", "\n\n\n", "   \n   \n   "] {
            assert_eq!(normalize_for_hash(blank), "");
        }
    }

    /// A clean single line passes through unchanged.
    #[test]
    fn test_single_line() {
        let src = "fn foo() {}";
        let out = normalize_for_hash(src);
        assert_eq!(out, src);
    }

    /// Trailing spaces on a single line are stripped.
    #[test]
    fn test_single_line_with_trailing_whitespace() {
        let out = normalize_for_hash("fn foo() {}   ");
        assert_eq!(out, "fn foo() {}");
    }

    /// The quick heuristic flags dirty input and accepts clean input.
    #[test]
    fn test_needs_normalization() {
        let dirty = [
            "fn foo() {\r\n}", // carriage return
            "café",            // non-ASCII
            "foo   \nbar",     // trailing whitespace
            "\nfoo",           // leading blank line
            "foo\n",           // trailing newline
        ];
        for &input in &dirty {
            assert!(needs_normalization(input));
        }

        // Already in canonical form
        assert!(!needs_normalization("fn foo() {\n    bar();\n}"));
    }

    /// The exact check agrees with what normalization would change.
    #[test]
    fn test_is_normalized() {
        assert!(is_normalized("fn foo() {\n    bar();\n}"));

        for &dirty in &["fn foo() {\r\n    bar();\r\n}", "fn foo() {}   "] {
            assert!(!is_normalized(dirty));
        }
    }

    /// Per-line normalization trims the right side and composes to NFC.
    #[test]
    fn test_normalize_line() {
        let cases = [
            ("foo   ", "foo"),
            ("  foo  ", "  foo"),
            ("cafe\u{0301}", "caf\u{00E9}"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(normalize_line(input), expected);
        }
    }

    /// A mix of CRLF, LF and CR ends up as pure LF.
    #[test]
    fn test_mixed_line_endings() {
        let out = normalize_for_hash("line1\r\nline2\nline3\rline4");

        assert!(out.contains("line1\nline2\nline3\nline4"));
        assert!(!out.contains('\r'));
    }

    /// Tab indentation is not touched.
    #[test]
    fn test_tabs_preserved() {
        let out = normalize_for_hash("fn foo() {\n\tbar();\n}");

        assert!(out.contains('\t'));
    }

    /// Non-ASCII identifiers come through intact.
    #[test]
    fn test_unicode_identifiers() {
        let out = normalize_for_hash("let αβγ = 42;");
        assert!(out.contains("αβγ"));
    }

    /// Emoji in comments or strings come through intact.
    #[test]
    fn test_emoji_preserved() {
        let out = normalize_for_hash("// 🎉 Success!\nfn celebrate() {}");
        assert!(out.contains("🎉"));
    }

    /// Repeated calls on the same input give the same output.
    #[test]
    fn test_deterministic_multiple_calls() {
        let src = "fn foo() {\r\n    bar();   \r\n}";

        let baseline = normalize_for_hash(src);
        for _ in 0..2 {
            assert_eq!(normalize_for_hash(src), baseline);
        }
    }

    /// Normalizing already-normalized output changes nothing.
    #[test]
    fn test_idempotent() {
        let src = "fn foo() {\r\n    bar();   \r\n}";

        let pass1 = normalize_for_hash(src);
        let pass2 = normalize_for_hash(&pass1);
        let pass3 = normalize_for_hash(&pass2);

        assert_eq!(pass1, pass2);
        assert_eq!(pass2, pass3);
    }
}
323}