infiniloom_engine/embedding/
normalizer.rs1use unicode_normalization::UnicodeNormalization;
28
29pub fn normalize_for_hash(content: &str) -> String {
49 let unicode_normalized: String = content.nfc().collect();
52
53 let line_normalized = if unicode_normalized.contains('\r') {
55 unicode_normalized.replace("\r\n", "\n").replace('\r', "\n")
56 } else {
57 unicode_normalized
58 };
59
60 let lines: Vec<&str> = line_normalized
62 .lines()
63 .map(|line| line.trim_end()) .collect();
65
66 let start = lines.iter().position(|l| !l.is_empty()).unwrap_or(0);
68
69 let end = lines
71 .iter()
72 .rposition(|l| !l.is_empty())
73 .map_or(0, |i| i + 1);
74
75 if start >= end {
77 return String::new();
78 }
79
80 lines[start..end].join("\n")
82}
83
/// Cheap, allocation-free pre-check for whether `content` may need normalizing.
///
/// Returns `true` when at least one of the following holds:
/// - the text starts or ends with `\n` (leading/trailing blank line or
///   trailing newline),
/// - the text contains a carriage return (`\r`),
/// - the text contains any non-ASCII byte (conservative: such text is always
///   flagged, without checking whether it is already in NFC),
/// - some line ends in whitespace.
///
/// Returns `false` otherwise, including for the empty string.
#[inline]
pub fn needs_normalization(content: &str) -> bool {
    // Constant-time checks first so common cases exit without scanning.
    if content.starts_with('\n') || content.ends_with('\n') {
        return true;
    }

    if content.contains('\r') || !content.is_ascii() {
        return true;
    }

    // No separate "first/last line is empty" test is needed: with `\r` ruled
    // out, `lines()` can only yield an empty first line when the text starts
    // with `\n`, and an empty last line when it ends with `\n` — both of which
    // have already returned above.
    content.lines().any(|line| line != line.trim_end())
}
135
136#[inline]
140pub(super) fn normalize_line(line: &str) -> String {
141 line.nfc().collect::<String>().trim_end().to_owned()
142}
143
144pub(super) fn is_normalized(content: &str) -> bool {
149 normalize_for_hash(content) == content
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Decomposed and precomposed spellings of the same text normalize equally.
    #[test]
    fn test_unicode_nfc() {
        let decomposed = normalize_for_hash("cafe\u{0301}");
        let precomposed = normalize_for_hash("caf\u{00E9}");

        assert_eq!(decomposed, precomposed);
    }

    /// LF, CRLF, lone-CR, and trailing-whitespace variants share one canonical form.
    #[test]
    fn test_cross_platform_line_endings() {
        let expected = normalize_for_hash("fn foo() {\n bar();\n}");

        let windows = "fn foo() {\r\n bar();\r\n}";
        let mac_classic = "fn foo() {\r bar();\r}";
        let trailing_ws = "fn foo() { \n bar(); \n}";

        for variant in [windows, mac_classic, trailing_ws].iter() {
            assert_eq!(normalize_for_hash(variant), expected);
        }
    }

    /// Leading whitespace on a line is never stripped.
    #[test]
    fn test_preserves_indentation() {
        let output = normalize_for_hash("def foo():\n if True:\n return 1");

        assert!(output.contains(" if True:"));
        assert!(output.contains(" return"));
    }

    /// Blank lines before the first real line are dropped.
    #[test]
    fn test_removes_leading_blank_lines() {
        let padded = normalize_for_hash("\n\n\nfn foo() {}");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines after the last real line are dropped.
    #[test]
    fn test_removes_trailing_blank_lines() {
        let padded = normalize_for_hash("fn foo() {}\n\n\n");
        let bare = normalize_for_hash("fn foo() {}");

        assert_eq!(padded, bare);
    }

    /// Blank lines between real lines survive normalization.
    #[test]
    fn test_preserves_internal_blank_lines() {
        let output = normalize_for_hash("fn foo() {\n let x = 1;\n\n let y = 2;\n}");

        assert!(output.contains("\n\n"));
    }

    /// Empty, newline-only, and whitespace-only input all normalize to "".
    #[test]
    fn test_empty_content() {
        for blank in ["", "\n\n\n", " \n \n "].iter() {
            assert_eq!(normalize_for_hash(blank), "");
        }
    }

    /// An already-clean single line is returned unchanged.
    #[test]
    fn test_single_line() {
        let source = "fn foo() {}";

        assert_eq!(normalize_for_hash(source), source);
    }

    /// Trailing whitespace is stripped even when there is only one line.
    #[test]
    fn test_single_line_with_trailing_whitespace() {
        let output = normalize_for_hash("fn foo() {} ");

        assert_eq!(output, "fn foo() {}");
    }

    /// The quick pre-check flags every kind of non-canonical input and
    /// passes clean ASCII text.
    #[test]
    fn test_needs_normalization() {
        let dirty = [
            "fn foo() {\r\n}",
            "café",
            "foo \nbar",
            "\nfoo",
            "foo\n",
        ];
        for sample in dirty.iter() {
            assert!(needs_normalization(sample));
        }

        assert!(!needs_normalization("fn foo() {\n bar();\n}"));
    }

    /// `is_normalized` accepts canonical text and rejects CRLF or trailing whitespace.
    #[test]
    fn test_is_normalized() {
        assert!(is_normalized("fn foo() {\n bar();\n}"));

        for dirty in ["fn foo() {\r\n bar();\r\n}", "fn foo() {} "].iter() {
            assert!(!is_normalized(dirty));
        }
    }

    /// Per-line normalization trims the right side only and composes to NFC.
    #[test]
    fn test_normalize_line() {
        let cases = [
            ("foo ", "foo"),
            (" foo ", " foo"),
            ("cafe\u{0301}", "caf\u{00E9}"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize_line(input), *expected);
        }
    }

    /// A file mixing CRLF, LF, and CR ends up with LF only.
    #[test]
    fn test_mixed_line_endings() {
        let output = normalize_for_hash("line1\r\nline2\nline3\rline4");

        assert!(!output.contains('\r'));
        assert!(output.contains("line1\nline2\nline3\nline4"));
    }

    /// Tab indentation is kept as-is.
    #[test]
    fn test_tabs_preserved() {
        let output = normalize_for_hash("fn foo() {\n\tbar();\n}");

        assert!(output.contains('\t'));
    }

    /// Non-ASCII identifiers pass through normalization.
    #[test]
    fn test_unicode_identifiers() {
        let output = normalize_for_hash("let αβγ = 42;");

        assert!(output.contains("αβγ"));
    }

    /// Emoji in comments pass through normalization.
    #[test]
    fn test_emoji_preserved() {
        let output = normalize_for_hash("// 🎉 Success!\nfn celebrate() {}");

        assert!(output.contains("🎉"));
    }

    /// Repeated calls on the same input always agree.
    #[test]
    fn test_deterministic_multiple_calls() {
        let content = "fn foo() {\r\n bar(); \r\n}";

        let runs: Vec<String> = (0..3).map(|_| normalize_for_hash(content)).collect();

        assert_eq!(runs[0], runs[1]);
        assert_eq!(runs[1], runs[2]);
    }

    /// Normalizing already-normalized output changes nothing.
    #[test]
    fn test_idempotent() {
        let first_pass = normalize_for_hash("fn foo() {\r\n bar(); \r\n}");
        let second_pass = normalize_for_hash(&first_pass);
        let third_pass = normalize_for_hash(&second_pass);

        assert_eq!(first_pass, second_pass);
        assert_eq!(second_pass, third_pass);
    }
}