// infiniloom_engine/embedding/normalizer.rs
use unicode_normalization::UnicodeNormalization;
28
29pub fn normalize_for_hash(content: &str) -> String {
49 let unicode_normalized: String = content.nfc().collect();
52
53 let line_normalized = if unicode_normalized.contains('\r') {
55 unicode_normalized.replace("\r\n", "\n").replace('\r', "\n")
56 } else {
57 unicode_normalized
58 };
59
60 let lines: Vec<&str> = line_normalized
62 .lines()
63 .map(|line| line.trim_end()) .collect();
65
66 let start = lines.iter().position(|l| !l.is_empty()).unwrap_or(0);
68
69 let end = lines
71 .iter()
72 .rposition(|l| !l.is_empty())
73 .map(|i| i + 1)
74 .unwrap_or(0);
75
76 if start >= end {
78 return String::new();
79 }
80
81 lines[start..end].join("\n")
83}
84
/// Cheap pre-check that reports whether `content` might not be in normalized form.
///
/// Returns `true` when any of the following holds:
/// - the content contains a carriage return (`\r`),
/// - the content contains a non-ASCII byte (it may then need Unicode normalization),
/// - the content starts or ends with a newline (leading/trailing blank line
///   or a trailing line terminator),
/// - some line carries trailing whitespace.
///
/// The check is conservative: `true` does not guarantee that normalization
/// would change the content (non-ASCII text may already be composed).
/// Empty content returns `false`.
#[inline]
pub fn needs_normalization(content: &str) -> bool {
    if content.contains('\r') || !content.is_ascii() {
        return true;
    }

    // With no `\r` present, these two checks already cover every case of an
    // empty first or last line, so no separate inspection of the line list
    // is needed.
    if content.starts_with('\n') || content.ends_with('\n') {
        return true;
    }

    content.lines().any(|line| line != line.trim_end())
}
136
137#[inline]
141pub fn normalize_line(line: &str) -> String {
142 line.nfc().collect::<String>().trim_end().to_string()
143}
144
145pub fn is_normalized(content: &str) -> bool {
150 normalize_for_hash(content) == content
151}
152
#[cfg(test)]
mod tests {
    use super::*;

    /// Decomposed and precomposed spellings normalize to the same composed string.
    #[test]
    fn test_unicode_nfc() {
        let nfd = "cafe\u{0301}";
        let nfc = "caf\u{00E9}";

        assert_eq!(normalize_for_hash(nfd), normalize_for_hash(nfc));
        // The shared result must be the composed form, not merely "equal".
        assert_eq!(normalize_for_hash(nfd), nfc);
    }

    /// LF, CRLF, lone CR and trailing whitespace all collapse to the LF form.
    #[test]
    fn test_cross_platform_line_endings() {
        let unix = "fn foo() {\n bar();\n}";
        let windows = "fn foo() {\r\n bar();\r\n}";
        let mac_classic = "fn foo() {\r bar();\r}";
        let trailing_ws = "fn foo() { \n bar(); \n}";

        let normalized = normalize_for_hash(unix);
        assert_eq!(normalized, unix);
        assert_eq!(normalize_for_hash(windows), normalized);
        assert_eq!(normalize_for_hash(mac_classic), normalized);
        assert_eq!(normalize_for_hash(trailing_ws), normalized);
    }

    /// Leading whitespace on a line is significant and must survive.
    #[test]
    fn test_preserves_indentation() {
        let python = "def foo():\n if True:\n return 1";
        let normalized = normalize_for_hash(python);

        assert!(normalized.contains(" if True:"));
        assert!(normalized.contains(" return"));
        assert_eq!(normalized, python);
    }

    #[test]
    fn test_removes_leading_blank_lines() {
        let with_leading = "\n\n\nfn foo() {}";
        let without = "fn foo() {}";

        assert_eq!(normalize_for_hash(with_leading), normalize_for_hash(without));
        assert_eq!(normalize_for_hash(with_leading), without);
    }

    #[test]
    fn test_removes_trailing_blank_lines() {
        let with_trailing = "fn foo() {}\n\n\n";
        let without = "fn foo() {}";

        assert_eq!(normalize_for_hash(with_trailing), normalize_for_hash(without));
        assert_eq!(normalize_for_hash(with_trailing), without);
    }

    /// Only blank lines at the edges are dropped; interior ones are kept.
    #[test]
    fn test_preserves_internal_blank_lines() {
        let code = "fn foo() {\n let x = 1;\n\n let y = 2;\n}";
        let normalized = normalize_for_hash(code);

        assert!(normalized.contains("\n\n"));
        assert_eq!(normalized, code);
    }

    #[test]
    fn test_empty_content() {
        assert_eq!(normalize_for_hash(""), "");
        assert_eq!(normalize_for_hash("\n\n\n"), "");
        assert_eq!(normalize_for_hash(" \n \n "), "");
    }

    #[test]
    fn test_single_line() {
        let line = "fn foo() {}";
        assert_eq!(normalize_for_hash(line), line);
    }

    #[test]
    fn test_single_line_with_trailing_whitespace() {
        let with_ws = "fn foo() {} ";
        let without = "fn foo() {}";
        assert_eq!(normalize_for_hash(with_ws), without);
    }

    #[test]
    fn test_needs_normalization() {
        // Carriage return.
        assert!(needs_normalization("fn foo() {\r\n}"));
        // Non-ASCII content.
        assert!(needs_normalization("café"));
        // Trailing whitespace on a line.
        assert!(needs_normalization("foo \nbar"));
        // Leading / trailing newline.
        assert!(needs_normalization("\nfoo"));
        assert!(needs_normalization("foo\n"));

        // Already clean.
        assert!(!needs_normalization("fn foo() {\n bar();\n}"));
        assert!(!needs_normalization(""));
    }

    #[test]
    fn test_is_normalized() {
        assert!(is_normalized("fn foo() {\n bar();\n}"));
        assert!(!is_normalized("fn foo() {\r\n bar();\r\n}"));
        assert!(!is_normalized("fn foo() {} "));
    }

    /// Whenever the cheap pre-check says "clean", full normalization must be a no-op.
    #[test]
    fn test_clean_content_is_already_normalized() {
        for sample in ["", "a", "a\nb", "a\n\nb", "fn foo() {\n bar();\n}", "\tx\n  y"] {
            assert!(!needs_normalization(sample), "unexpectedly flagged: {sample:?}");
            assert!(is_normalized(sample), "not normalized: {sample:?}");
        }
    }

    #[test]
    fn test_normalize_line() {
        assert_eq!(normalize_line("foo "), "foo");
        assert_eq!(normalize_line(" foo "), " foo");
        assert_eq!(normalize_line("cafe\u{0301}"), "caf\u{00E9}");
    }

    #[test]
    fn test_mixed_line_endings() {
        let mixed = "line1\r\nline2\nline3\rline4";
        let normalized = normalize_for_hash(mixed);

        assert!(!normalized.contains('\r'));
        assert_eq!(normalized, "line1\nline2\nline3\nline4");
    }

    #[test]
    fn test_tabs_preserved() {
        let with_tabs = "fn foo() {\n\tbar();\n}";
        let normalized = normalize_for_hash(with_tabs);

        assert!(normalized.contains('\t'));
        assert_eq!(normalized, with_tabs);
    }

    #[test]
    fn test_unicode_identifiers() {
        let code = "let αβγ = 42;";
        let normalized = normalize_for_hash(code);
        assert!(normalized.contains("αβγ"));
    }

    #[test]
    fn test_emoji_preserved() {
        let code = "// 🎉 Success!\nfn celebrate() {}";
        let normalized = normalize_for_hash(code);
        assert!(normalized.contains("🎉"));
    }

    #[test]
    fn test_deterministic_multiple_calls() {
        let content = "fn foo() {\r\n bar(); \r\n}";

        let result1 = normalize_for_hash(content);
        let result2 = normalize_for_hash(content);
        let result3 = normalize_for_hash(content);

        assert_eq!(result1, result2);
        assert_eq!(result2, result3);
        assert_eq!(result1, "fn foo() {\n bar();\n}");
    }

    /// Normalizing already-normalized output must not change it again.
    #[test]
    fn test_idempotent() {
        let content = "fn foo() {\r\n bar(); \r\n}";
        let once = normalize_for_hash(content);
        let twice = normalize_for_hash(&once);
        let thrice = normalize_for_hash(&twice);

        assert_eq!(once, twice);
        assert_eq!(twice, thrice);
        assert!(is_normalized(&once));
    }
}