infiniloom_engine/embedding/
normalizer.rs1use unicode_normalization::UnicodeNormalization;
28
29pub fn normalize_for_hash(content: &str) -> String {
49 let unicode_normalized: String = content.nfc().collect();
52
53 let line_normalized = if unicode_normalized.contains('\r') {
55 unicode_normalized
56 .replace("\r\n", "\n")
57 .replace('\r', "\n")
58 } else {
59 unicode_normalized
60 };
61
62 let lines: Vec<&str> = line_normalized
64 .lines()
65 .map(|line| line.trim_end()) .collect();
67
68 let start = lines.iter().position(|l| !l.is_empty()).unwrap_or(0);
70
71 let end = lines
73 .iter()
74 .rposition(|l| !l.is_empty())
75 .map(|i| i + 1)
76 .unwrap_or(0);
77
78 if start >= end {
80 return String::new();
81 }
82
83 lines[start..end].join("\n")
85}
86
/// Cheap, conservative check for whether `content` might change under
/// normalization.
///
/// Returns `true` when any of the following holds:
/// - the text contains a carriage return (`\r`),
/// - the text contains any non-ASCII byte (reported even if the text is
///   already in NFC; this check does not inspect the Unicode form),
/// - the text starts or ends with `\n`,
/// - some line carries trailing whitespace.
///
/// Returns `false` otherwise, including for the empty string.
#[inline]
pub fn needs_normalization(content: &str) -> bool {
    // Carriage returns and non-ASCII bytes are flagged without further inspection.
    if content.contains('\r') || !content.is_ascii() {
        return true;
    }

    // With no `\r` present, a blank first or last line can only come from a
    // leading or trailing `\n`, so these two checks cover blank edge lines.
    if content.starts_with('\n') || content.ends_with('\n') {
        return true;
    }

    // Trailing whitespace on any line (this also catches whitespace-only lines).
    content.lines().any(|line| line != line.trim_end())
}
138
139#[inline]
143pub fn normalize_line(line: &str) -> String {
144 line.nfc().collect::<String>().trim_end().to_string()
145}
146
147pub fn is_normalized(content: &str) -> bool {
152 normalize_for_hash(content) == content
153}
154
#[cfg(test)]
mod tests {
    use super::*;

    /// Decomposed and precomposed spellings must normalize to the same text.
    #[test]
    fn test_unicode_nfc() {
        let decomposed = normalize_for_hash("cafe\u{0301}");
        let precomposed = normalize_for_hash("caf\u{00E9}");

        assert_eq!(decomposed, precomposed);
    }

    /// Windows, classic Mac and trailing-whitespace variants all collapse to
    /// the Unix form.
    #[test]
    fn test_cross_platform_line_endings() {
        let expected = normalize_for_hash("fn foo() {\n bar();\n}");

        let variants = [
            "fn foo() {\r\n bar();\r\n}",
            "fn foo() {\r bar();\r}",
            "fn foo() { \n bar(); \n}",
        ];
        for variant in variants {
            assert_eq!(normalize_for_hash(variant), expected);
        }
    }

    /// Leading whitespace on a line is significant and must survive.
    #[test]
    fn test_preserves_indentation() {
        let out = normalize_for_hash("def foo():\n if True:\n return 1");

        assert!(out.contains(" if True:"));
        assert!(out.contains(" return"));
    }

    #[test]
    fn test_removes_leading_blank_lines() {
        let padded = normalize_for_hash("\n\n\nfn foo() {}");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    #[test]
    fn test_removes_trailing_blank_lines() {
        let padded = normalize_for_hash("fn foo() {}\n\n\n");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines between code lines are kept.
    #[test]
    fn test_preserves_internal_blank_lines() {
        let out = normalize_for_hash("fn foo() {\n let x = 1;\n\n let y = 2;\n}");

        assert!(out.contains("\n\n"));
    }

    /// Empty and whitespace-only inputs all normalize to the empty string.
    #[test]
    fn test_empty_content() {
        for blank in ["", "\n\n\n", " \n \n "] {
            assert_eq!(normalize_for_hash(blank), "");
        }
    }

    #[test]
    fn test_single_line() {
        let source = "fn foo() {}";

        assert_eq!(normalize_for_hash(source), source);
    }

    #[test]
    fn test_single_line_with_trailing_whitespace() {
        let out = normalize_for_hash("fn foo() {} ");

        assert_eq!(out, "fn foo() {}");
    }

    #[test]
    fn test_needs_normalization() {
        // CR line ending, non-ASCII text, trailing whitespace, leading
        // newline, trailing newline.
        let dirty = ["fn foo() {\r\n}", "café", "foo \nbar", "\nfoo", "foo\n"];
        for sample in dirty {
            assert!(needs_normalization(sample));
        }

        assert!(!needs_normalization("fn foo() {\n bar();\n}"));
    }

    #[test]
    fn test_is_normalized() {
        assert!(is_normalized("fn foo() {\n bar();\n}"));

        for dirty in ["fn foo() {\r\n bar();\r\n}", "fn foo() {} "] {
            assert!(!is_normalized(dirty));
        }
    }

    #[test]
    fn test_normalize_line() {
        let cases = [
            ("foo ", "foo"),
            (" foo ", " foo"),
            ("cafe\u{0301}", "caf\u{00E9}"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_line(input), expected);
        }
    }

    /// A single input may mix CRLF, LF and CR endings.
    #[test]
    fn test_mixed_line_endings() {
        let out = normalize_for_hash("line1\r\nline2\nline3\rline4");

        assert!(!out.contains('\r'));
        assert!(out.contains("line1\nline2\nline3\nline4"));
    }

    #[test]
    fn test_tabs_preserved() {
        let out = normalize_for_hash("fn foo() {\n\tbar();\n}");

        assert!(out.contains('\t'));
    }

    #[test]
    fn test_unicode_identifiers() {
        let out = normalize_for_hash("let αβγ = 42;");

        assert!(out.contains("αβγ"));
    }

    #[test]
    fn test_emoji_preserved() {
        let out = normalize_for_hash("// 🎉 Success!\nfn celebrate() {}");

        assert!(out.contains("🎉"));
    }

    /// Repeated calls on the same input give the same output.
    #[test]
    fn test_deterministic_multiple_calls() {
        let input = "fn foo() {\r\n bar(); \r\n}";

        let runs: Vec<String> = (0..3).map(|_| normalize_for_hash(input)).collect();

        assert_eq!(runs[0], runs[1]);
        assert_eq!(runs[1], runs[2]);
    }

    /// Normalizing already-normalized output is a no-op.
    #[test]
    fn test_idempotent() {
        let first_pass = normalize_for_hash("fn foo() {\r\n bar(); \r\n}");
        let second_pass = normalize_for_hash(&first_pass);
        let third_pass = normalize_for_hash(&second_pass);

        assert_eq!(first_pass, second_pass);
        assert_eq!(second_pass, third_pass);
    }
}