infiniloom_engine/embedding/
normalizer.rs

1//! Content normalization for deterministic, cross-platform hashing
2//!
3//! This module ensures that the same code produces the same hash regardless of:
4//! - Operating system (Windows CRLF, Unix LF, old Mac CR)
5//! - Unicode representation (macOS NFD vs Linux NFC)
6//! - Trailing whitespace differences
7//! - Leading/trailing blank lines
8//!
9//! # Normalization Steps
10//!
11//! 1. **Unicode NFC normalization**: Converts decomposed characters (NFD) to composed form (NFC)
12//!    - Example: "é" (e + combining accent) becomes "é" (single character)
13//!    - Critical for macOS which often produces NFD in file names and content
14//!
15//! 2. **Line ending normalization**: CRLF (Windows) and CR (old Mac) → LF (Unix)
16//!    - Ensures cross-platform consistency
17//!
18//! 3. **Trailing whitespace removal**: Strips spaces/tabs from line ends
19//!    - Editors often differ in trailing whitespace handling
20//!
21//! 4. **Blank line trimming**: Removes leading and trailing blank lines
22//!    - Keeps internal blank lines (they're semantically meaningful)
23//!
24//! 5. **Indentation preservation**: Internal indentation is kept intact
25//!    - Critical for Python and other indentation-sensitive languages
26
27use unicode_normalization::UnicodeNormalization;
28
29/// Normalize content for deterministic, cross-platform hashing
30///
31/// # Guarantees
32///
33/// - Identical output on Windows, Linux, macOS
34/// - Same code with different line endings produces same output
35/// - Unicode-safe: NFD and NFC representations produce same output
36/// - Preserves semantic structure (internal indentation, blank lines)
37///
38/// # Example
39///
40/// ```
41/// use infiniloom_engine::embedding::normalize_for_hash;
42///
43/// let unix = "fn foo() {\n    bar();\n}";
44/// let windows = "fn foo() {\r\n    bar();\r\n}";
45///
46/// assert_eq!(normalize_for_hash(unix), normalize_for_hash(windows));
47/// ```
48pub fn normalize_for_hash(content: &str) -> String {
49    // Step 1: Unicode NFC normalization
50    // This ensures "café" (NFD: e + combining accent) equals "café" (NFC: single char)
51    let unicode_normalized: String = content.nfc().collect();
52
53    // Step 2: Normalize line endings (optimize for common case - no \r)
54    let line_normalized = if unicode_normalized.contains('\r') {
55        unicode_normalized.replace("\r\n", "\n").replace('\r', "\n")
56    } else {
57        unicode_normalized
58    };
59
60    // Step 3: Process lines - trim trailing whitespace only
61    let lines: Vec<&str> = line_normalized
62        .lines()
63        .map(|line| line.trim_end()) // Remove trailing whitespace only
64        .collect();
65
66    // Step 4: Remove leading blank lines
67    let start = lines.iter().position(|l| !l.is_empty()).unwrap_or(0);
68
69    // Step 5: Remove trailing blank lines
70    let end = lines
71        .iter()
72        .rposition(|l| !l.is_empty())
73        .map_or(0, |i| i + 1);
74
75    // Handle empty content
76    if start >= end {
77        return String::new();
78    }
79
80    // Join the trimmed lines with LF
81    lines[start..end].join("\n")
82}
83
/// Fast check if content needs normalization
///
/// Returns `true` when any of the following is present:
///
/// - a non-ASCII byte (the text *might* not be in NFC form),
/// - a carriage return (Windows CRLF or old Mac CR line endings),
/// - a leading `\n` (blank first line) or a trailing `\n`
///   (`normalize_for_hash` never emits a trailing newline),
/// - trailing whitespace on any line.
///
/// This is a quick heuristic: it may return `true` for content that would
/// not actually change after normalization (for example, non-ASCII text
/// that is already NFC). It performs no allocation.
///
/// Use this for early-exit optimization when processing many files.
#[inline]
pub fn needs_normalization(content: &str) -> bool {
    // Any non-ASCII byte could belong to a sequence that NFC would rewrite.
    if !content.is_ascii() {
        return true;
    }

    // Carriage returns mean CRLF or lone-CR line endings.
    if content.contains('\r') {
        return true;
    }

    // With '\r' ruled out, an empty first line can only come from a leading
    // '\n', and a trailing newline / trailing blank lines always end in '\n'.
    // `.lines()` hides the final empty segment, so check the raw text.
    if content.starts_with('\n') || content.ends_with('\n') {
        return true;
    }

    // Whitespace-only lines (including a whitespace-only first or last line)
    // are caught here too, since trimming shortens them.
    content.lines().any(|line| line != line.trim_end())
}
135
136/// Normalize a single line (without line ending changes)
137///
138/// Useful for processing content line by line.
139#[inline]
140pub(super) fn normalize_line(line: &str) -> String {
141    line.nfc().collect::<String>().trim_end().to_owned()
142}
143
144/// Check if content is already normalized
145///
146/// Returns `true` if `normalize_for_hash(content) == content`.
147/// More expensive than `needs_normalization` but more accurate.
148pub(super) fn is_normalized(content: &str) -> bool {
149    normalize_for_hash(content) == content
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Decomposed and precomposed spellings must normalize identically.
    #[test]
    fn test_unicode_nfc() {
        let decomposed = "cafe\u{0301}"; // 'e' followed by combining acute
        let precomposed = "caf\u{00E9}"; // single code point é

        let from_decomposed = normalize_for_hash(decomposed);
        let from_precomposed = normalize_for_hash(precomposed);
        assert_eq!(from_decomposed, from_precomposed);
    }

    /// LF, CRLF, lone CR and trailing-whitespace variants all converge.
    #[test]
    fn test_cross_platform_line_endings() {
        let expected = normalize_for_hash("fn foo() {\n    bar();\n}");

        let variants = [
            "fn foo() {\r\n    bar();\r\n}",     // Windows
            "fn foo() {\r    bar();\r}",         // classic Mac
            "fn foo() {   \n    bar();   \n}",   // trailing whitespace
        ];
        for variant in variants.iter() {
            assert_eq!(normalize_for_hash(variant), expected);
        }
    }

    /// Leading whitespace inside the content is significant and must survive.
    #[test]
    fn test_preserves_indentation() {
        let output = normalize_for_hash("def foo():\n    if True:\n        return 1");

        for fragment in ["    if True:", "        return"].iter() {
            assert!(output.contains(fragment));
        }
    }

    /// Blank lines before the first real line are dropped.
    #[test]
    fn test_removes_leading_blank_lines() {
        let padded = normalize_for_hash("\n\n\nfn foo() {}");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines after the last real line are dropped.
    #[test]
    fn test_removes_trailing_blank_lines() {
        let padded = normalize_for_hash("fn foo() {}\n\n\n");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines between real lines are kept.
    #[test]
    fn test_preserves_internal_blank_lines() {
        let output = normalize_for_hash("fn foo() {\n    let x = 1;\n\n    let y = 2;\n}");

        assert!(output.contains("\n\n"));
    }

    /// Empty, newline-only and whitespace-only inputs all become "".
    #[test]
    fn test_empty_content() {
        for blank in ["", "\n\n\n", "   \n   \n   "].iter() {
            assert_eq!(normalize_for_hash(blank), "");
        }
    }

    /// A clean single line passes through unchanged.
    #[test]
    fn test_single_line() {
        let source = "fn foo() {}";
        let output = normalize_for_hash(source);
        assert_eq!(output, source);
    }

    /// A single line loses only its trailing whitespace.
    #[test]
    fn test_single_line_with_trailing_whitespace() {
        let output = normalize_for_hash("fn foo() {}   ");
        assert_eq!(output, "fn foo() {}");
    }

    /// The heuristic flags each kind of non-canonical input and accepts clean input.
    #[test]
    fn test_needs_normalization() {
        let dirty = [
            "fn foo() {\r\n}", // carriage return
            "café",            // non-ASCII
            "foo   \nbar",     // trailing whitespace
            "\nfoo",           // leading blank line
            "foo\n",           // trailing newline
        ];
        for input in dirty.iter() {
            assert!(needs_normalization(input));
        }

        // Already canonical: must not be flagged.
        assert!(!needs_normalization("fn foo() {\n    bar();\n}"));
    }

    /// The exact check agrees with normalization on clean and dirty inputs.
    #[test]
    fn test_is_normalized() {
        assert!(is_normalized("fn foo() {\n    bar();\n}"));

        for input in ["fn foo() {\r\n    bar();\r\n}", "fn foo() {}   "].iter() {
            assert!(!is_normalized(input));
        }
    }

    /// Per-line normalization trims the right side and composes Unicode.
    #[test]
    fn test_normalize_line() {
        let cases = [
            ("foo   ", "foo"),
            ("  foo  ", "  foo"),
            ("cafe\u{0301}", "caf\u{00E9}"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize_line(input), *expected);
        }
    }

    /// CRLF, LF and CR may be mixed freely within one input.
    #[test]
    fn test_mixed_line_endings() {
        let output = normalize_for_hash("line1\r\nline2\nline3\rline4");

        assert!(!output.contains('\r'));
        assert!(output.contains("line1\nline2\nline3\nline4"));
    }

    /// Tab indentation is not touched.
    #[test]
    fn test_tabs_preserved() {
        let output = normalize_for_hash("fn foo() {\n\tbar();\n}");

        assert!(output.contains('\t'));
    }

    /// Non-ASCII identifiers survive normalization.
    #[test]
    fn test_unicode_identifiers() {
        let output = normalize_for_hash("let αβγ = 42;");
        assert!(output.contains("αβγ"));
    }

    /// Emoji in comments or strings survive normalization.
    #[test]
    fn test_emoji_preserved() {
        let output = normalize_for_hash("// 🎉 Success!\nfn celebrate() {}");
        assert!(output.contains("🎉"));
    }

    /// Repeated calls on the same input give the same output.
    #[test]
    fn test_deterministic_multiple_calls() {
        let input = "fn foo() {\r\n    bar();   \r\n}";

        let first = normalize_for_hash(input);
        let second = normalize_for_hash(input);
        let third = normalize_for_hash(input);

        assert_eq!(first, second);
        assert_eq!(second, third);
    }

    /// Normalizing already-normalized output changes nothing.
    #[test]
    fn test_idempotent() {
        let pass1 = normalize_for_hash("fn foo() {\r\n    bar();   \r\n}");
        let pass2 = normalize_for_hash(&pass1);
        let pass3 = normalize_for_hash(&pass2);

        assert_eq!(pass1, pass2);
        assert_eq!(pass2, pass3);
    }
}