1use std::hash::{Hash, Hasher};
2use std::path::PathBuf;
3
4use syn::{spanned::Spanned, visit::Visit, ItemFn};
5
/// One token in the normalized (identifier- and literal-abstracted)
/// representation of a function body, used for structural clone detection.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum NormalizedToken {
    /// A keyword kept verbatim (e.g. "fn", "let", "match", "macro!").
    Keyword(String),
    /// An identifier occurrence, replaced by a sequential counter value.
    /// NOTE(review): each occurrence gets a fresh id — repeated uses of the
    /// same name are NOT interned to one id (see `FunctionNormalizer`).
    Variable(usize),
    /// Any integer literal, concrete value erased.
    IntLiteral,
    /// Any float literal, concrete value erased.
    FloatLiteral,
    /// Any string literal (also emitted for `String`/`str` type mentions).
    StrLiteral,
    /// Any char literal, concrete value erased.
    CharLiteral,
    /// A boolean literal; the value is preserved.
    BoolLiteral(bool),
    /// A primitive or unrecognized type position, concrete type erased.
    TypePlaceholder,
    /// An operator kept verbatim (e.g. "+", "==", "->", "=>").
    Operator(String),
    /// A structural delimiter such as '(', '{', ',', ';'.
    Delimiter(char),
}
26
/// A structural fingerprint of one function: a hash over its normalized
/// token stream, plus every location where that stream was observed.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct FunctionFingerprint {
    /// Hash of `normalized_tokens` (see `compute_hash`).
    pub hash: u64,
    /// The normalized token stream the hash was computed from.
    pub normalized_tokens: Vec<NormalizedToken>,
    /// Every file/line span where a function with this token stream appears.
    pub locations: Vec<FileLocation>,
    /// Cached `normalized_tokens.len()`.
    pub token_count: usize,
    /// Name of the first function this fingerprint was extracted from.
    pub function_name: String,
}
36
37impl FunctionFingerprint {
38 pub fn new(
40 normalized_tokens: Vec<NormalizedToken>,
41 location: FileLocation,
42 function_name: String,
43 ) -> Self {
44 let token_count = normalized_tokens.len();
45 let hash = compute_hash(&normalized_tokens);
46
47 Self {
48 hash,
49 normalized_tokens,
50 locations: vec![location],
51 token_count,
52 function_name,
53 }
54 }
55
56 pub fn add_location(&mut self, location: FileLocation) {
58 self.locations.push(location);
59 }
60}
61
/// A file path plus the line span of one function occurrence,
/// as produced by `extract_fingerprint` from proc-macro2 span data.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct FileLocation {
    /// Path of the source file the function was found in.
    pub file_path: PathBuf,
    /// Line of the `fn` token (proc-macro2 spans are 1-based).
    pub line_start: usize,
    /// Last line of the function body.
    pub line_end: usize,
    /// Name of the function at this location.
    pub function_name: String,
}
70
71impl FileLocation {
72 pub fn new(
73 file_path: PathBuf,
74 line_start: usize,
75 line_end: usize,
76 function_name: String,
77 ) -> Self {
78 Self {
79 file_path,
80 line_start,
81 line_end,
82 function_name,
83 }
84 }
85}
86
87fn compute_hash(tokens: &[NormalizedToken]) -> u64 {
89 use std::collections::hash_map::DefaultHasher;
90
91 let mut hasher = DefaultHasher::new();
92 tokens.hash(&mut hasher);
93 hasher.finish()
94}
95
/// AST visitor that flattens a function item into a `NormalizedToken`
/// stream, handing out sequential ids for identifier occurrences.
struct FunctionNormalizer {
    // Tokens emitted so far, in source order.
    tokens: Vec<NormalizedToken>,
    // Next id to assign to an identifier occurrence.
    var_counter: usize,
}
101
102impl FunctionNormalizer {
103 fn new() -> Self {
104 Self {
105 tokens: Vec::new(),
106 var_counter: 0,
107 }
108 }
109
110 fn add_var_placeholder(&mut self) {
112 let id = self.var_counter;
113 self.var_counter += 1;
114 self.tokens.push(NormalizedToken::Variable(id));
115 }
116
117 fn next_var_placeholder(&mut self) -> NormalizedToken {
118 let id = self.var_counter;
119 self.var_counter += 1;
120 NormalizedToken::Variable(id)
121 }
122}
123
124impl<'ast> Visit<'ast> for FunctionNormalizer {
125 fn visit_item_fn(&mut self, func: &'ast ItemFn) {
126 self.tokens.push(NormalizedToken::Keyword("fn".to_string()));
128
129 let func_name_placeholder = self.next_var_placeholder();
131 self.tokens.push(func_name_placeholder);
132
133 syn::visit::visit_generics(self, &func.sig.generics);
135
136 self.tokens.push(NormalizedToken::Delimiter('('));
138
139 for input in func.sig.inputs.iter() {
141 match input {
142 syn::FnArg::Receiver(receiver) => {
143 if receiver.reference.is_some() {
144 self.tokens.push(NormalizedToken::Operator("&".to_string()));
145 if receiver.mutability.is_some() {
146 self.tokens
147 .push(NormalizedToken::Keyword("mut".to_string()));
148 }
149 }
150 self.add_var_placeholder();
151 }
152 syn::FnArg::Typed(pat_type) => {
153 self.visit_pat(&pat_type.pat);
155 self.tokens.push(NormalizedToken::Delimiter(':'));
157 self.visit_type(&pat_type.ty);
159 }
160 }
161
162 self.tokens.push(NormalizedToken::Delimiter(','));
164 }
165
166 if !func.sig.inputs.is_empty() {
168 self.tokens.pop();
169 }
170
171 self.tokens.push(NormalizedToken::Delimiter(')'));
173
174 match &func.sig.output {
176 syn::ReturnType::Default => {}
177 syn::ReturnType::Type(_, ty) => {
178 self.tokens
179 .push(NormalizedToken::Operator("->".to_string()));
180 self.visit_type(ty);
181 }
182 }
183
184 self.visit_block(&func.block);
186 }
187
188 fn visit_pat(&mut self, pat: &'ast syn::Pat) {
189 match pat {
190 syn::Pat::Ident(_) => {
191 self.add_var_placeholder();
192 }
193 syn::Pat::Wild(_) => {
194 self.tokens.push(NormalizedToken::Keyword("_".to_string()));
195 }
196 _ => {
197 syn::visit::visit_pat(self, pat);
198 }
199 }
200 }
201
202 fn visit_type(&mut self, ty: &'ast syn::Type) {
203 match ty {
204 syn::Type::Path(type_path) => {
205 if let Some(segment) = type_path.path.segments.last() {
206 let ident_str = segment.ident.to_string();
207
208 match ident_str.as_str() {
210 "i8" | "i16" | "i32" | "i64" | "i128" | "isize" | "u8" | "u16" | "u32"
211 | "u64" | "u128" | "usize" | "f32" | "f64" => {
212 self.tokens.push(NormalizedToken::TypePlaceholder);
213 }
214 "String" | "str" => {
215 self.tokens.push(NormalizedToken::StrLiteral);
216 }
217 "bool" => {
218 self.tokens.push(NormalizedToken::TypePlaceholder);
219 }
220 "Vec" | "Option" | "Result" | "Box" | "Rc" | "Arc" => {
221 self.tokens
222 .push(NormalizedToken::Keyword(ident_str.clone()));
223 match &segment.arguments {
225 syn::PathArguments::None => {}
226 syn::PathArguments::AngleBracketed(args) => {
227 for arg in args.args.iter() {
228 syn::visit::visit_generic_argument(self, arg);
229 }
230 }
231 syn::PathArguments::Parenthesized(_) => {}
232 }
233 }
234 _ => {
235 self.add_var_placeholder();
237 }
238 }
239 }
240 }
241 syn::Type::Reference(_) => {
242 self.tokens.push(NormalizedToken::Operator("&".to_string()));
243 if let syn::Type::Reference(ref_type) = ty {
244 self.visit_type(&ref_type.elem);
245 }
246 }
247 _ => {
248 self.tokens.push(NormalizedToken::TypePlaceholder);
249 }
250 }
251 }
252
253 fn visit_return_type(&mut self, return_type: &'ast syn::ReturnType) {
254 match return_type {
255 syn::ReturnType::Default => {}
256 syn::ReturnType::Type(_, ty) => {
257 self.visit_type(ty);
258 }
259 }
260 }
261
262 fn visit_block(&mut self, block: &'ast syn::Block) {
263 self.tokens.push(NormalizedToken::Delimiter('{'));
264
265 for stmt in block.stmts.iter() {
266 self.visit_stmt(stmt);
267 }
268
269 self.tokens.push(NormalizedToken::Delimiter('}'));
270 }
271
272 fn visit_stmt(&mut self, stmt: &'ast syn::Stmt) {
273 match stmt {
274 syn::Stmt::Local(local) => {
275 self.tokens
276 .push(NormalizedToken::Keyword("let".to_string()));
277
278 let is_mut = matches!(&local.pat, syn::Pat::Ident(pat_ident) if pat_ident.mutability.is_some());
280 if is_mut {
281 self.tokens
282 .push(NormalizedToken::Keyword("mut".to_string()));
283 }
284
285 self.visit_pat(&local.pat);
286
287 if let Some(init) = &local.init {
288 self.tokens.push(NormalizedToken::Operator("=".to_string()));
289 self.visit_expr(&init.expr);
290 }
291
292 self.tokens.push(NormalizedToken::Delimiter(';'));
293 }
294 syn::Stmt::Item(item) => {
295 syn::visit::visit_item(self, item);
296 }
297 syn::Stmt::Expr(expr, _) => {
298 self.visit_expr(expr);
299 }
300 syn::Stmt::Macro(_) => {
301 self.tokens
303 .push(NormalizedToken::Keyword("macro!".to_string()));
304 }
305 }
306 }
307
308 fn visit_expr(&mut self, expr: &'ast syn::Expr) {
309 match expr {
310 syn::Expr::Path(expr_path) => {
311 if let Some(ident) = expr_path.path.get_ident() {
312 let ident_str = ident.to_string();
313 match ident_str.as_str() {
314 "true" => self.tokens.push(NormalizedToken::BoolLiteral(true)),
315 "false" => self.tokens.push(NormalizedToken::BoolLiteral(false)),
316 _ => self.add_var_placeholder(),
317 }
318 } else {
319 self.add_var_placeholder();
320 }
321 }
322 syn::Expr::Lit(expr_lit) => match &expr_lit.lit {
323 syn::Lit::Int(_) => self.tokens.push(NormalizedToken::IntLiteral),
324 syn::Lit::Float(_) => self.tokens.push(NormalizedToken::FloatLiteral),
325 syn::Lit::Str(_) => self.tokens.push(NormalizedToken::StrLiteral),
326 syn::Lit::Char(_) => self.tokens.push(NormalizedToken::CharLiteral),
327 syn::Lit::Bool(lit) => {
328 self.tokens.push(NormalizedToken::BoolLiteral(lit.value));
329 }
330 _ => self.tokens.push(NormalizedToken::IntLiteral),
331 },
332 syn::Expr::Binary(bin_expr) => {
333 self.visit_expr(&bin_expr.left);
334 self.tokens.push(match bin_expr.op {
335 syn::BinOp::Add(_) => NormalizedToken::Operator("+".to_string()),
336 syn::BinOp::Sub(_) => NormalizedToken::Operator("-".to_string()),
337 syn::BinOp::Mul(_) => NormalizedToken::Operator("*".to_string()),
338 syn::BinOp::Div(_) => NormalizedToken::Operator("/".to_string()),
339 syn::BinOp::Rem(_) => NormalizedToken::Operator("%".to_string()),
340 syn::BinOp::And(_) => NormalizedToken::Operator("&&".to_string()),
341 syn::BinOp::Or(_) => NormalizedToken::Operator("||".to_string()),
342 syn::BinOp::BitAnd(_) => NormalizedToken::Operator("&".to_string()),
343 syn::BinOp::BitOr(_) => NormalizedToken::Operator("|".to_string()),
344 syn::BinOp::BitXor(_) => NormalizedToken::Operator("^".to_string()),
345 syn::BinOp::Shl(_) => NormalizedToken::Operator("<<".to_string()),
346 syn::BinOp::Shr(_) => NormalizedToken::Operator(">>".to_string()),
347 syn::BinOp::Eq(_) => NormalizedToken::Operator("==".to_string()),
348 syn::BinOp::Lt(_) => NormalizedToken::Operator("<".to_string()),
349 syn::BinOp::Le(_) => NormalizedToken::Operator("<=".to_string()),
350 syn::BinOp::Ne(_) => NormalizedToken::Operator("!=".to_string()),
351 syn::BinOp::Ge(_) => NormalizedToken::Operator(">=".to_string()),
352 syn::BinOp::Gt(_) => NormalizedToken::Operator(">".to_string()),
353 _ => {
354 let op_str = format!("{:?}", bin_expr.op);
356 NormalizedToken::Operator(op_str)
357 }
358 });
359 self.visit_expr(&bin_expr.right);
360 }
361 syn::Expr::Unary(unary_expr) => {
362 match unary_expr.op {
363 syn::UnOp::Deref(_) => {
364 self.tokens.push(NormalizedToken::Operator("*".to_string()))
365 }
366 syn::UnOp::Not(_) => {
367 self.tokens.push(NormalizedToken::Operator("!".to_string()))
368 }
369 syn::UnOp::Neg(_) => {
370 self.tokens.push(NormalizedToken::Operator("-".to_string()))
371 }
372 _ => {
373 let op_str = format!("{:?}", unary_expr.op);
375 self.tokens.push(NormalizedToken::Operator(op_str))
376 }
377 }
378 self.visit_expr(&unary_expr.expr);
379 }
380 syn::Expr::MethodCall(method_call) => {
381 self.visit_expr(&method_call.receiver);
382 self.tokens.push(NormalizedToken::Delimiter('.'));
383 self.add_var_placeholder();
384 self.tokens.push(NormalizedToken::Delimiter('('));
385 for arg in method_call.args.iter() {
386 self.visit_expr(arg);
387 self.tokens.push(NormalizedToken::Delimiter(','));
388 }
389 if !method_call.args.is_empty() {
390 self.tokens.pop();
391 }
392 self.tokens.push(NormalizedToken::Delimiter(')'));
393 }
394 syn::Expr::Call(call_expr) => {
395 self.visit_expr(&call_expr.func);
396 self.tokens.push(NormalizedToken::Delimiter('('));
397 for arg in call_expr.args.iter() {
398 self.visit_expr(arg);
399 self.tokens.push(NormalizedToken::Delimiter(','));
400 }
401 if !call_expr.args.is_empty() {
402 self.tokens.pop();
403 }
404 self.tokens.push(NormalizedToken::Delimiter(')'));
405 }
406 syn::Expr::If(if_expr) => {
407 self.tokens.push(NormalizedToken::Keyword("if".to_string()));
408 self.visit_expr(&if_expr.cond);
409 self.visit_block(&if_expr.then_branch);
410 if let Some((_, else_block)) = &if_expr.else_branch {
411 self.tokens
412 .push(NormalizedToken::Keyword("else".to_string()));
413 match else_block.as_ref() {
414 syn::Expr::If(inner_if) => {
415 self.visit_expr(&syn::Expr::If(inner_if.clone()));
416 }
417 syn::Expr::Block(expr_block) => {
418 self.visit_block(&expr_block.block);
419 }
420 _ => {}
421 }
422 }
423 }
424 syn::Expr::Match(match_expr) => {
425 self.tokens
426 .push(NormalizedToken::Keyword("match".to_string()));
427 self.visit_expr(&match_expr.expr);
428 self.tokens.push(NormalizedToken::Delimiter('{'));
429 for arm in match_expr.arms.iter() {
430 self.visit_pat(&arm.pat);
431 if let Some(guard) = &arm.guard {
432 self.tokens.push(NormalizedToken::Keyword("if".to_string()));
433 self.visit_expr(&guard.1); }
435 self.tokens
436 .push(NormalizedToken::Operator("=>".to_string()));
437 self.visit_expr(&arm.body);
438 self.tokens.push(NormalizedToken::Delimiter(','));
439 }
440 self.tokens.push(NormalizedToken::Delimiter('}'));
441 }
442 syn::Expr::ForLoop(for_loop) => {
443 self.tokens
444 .push(NormalizedToken::Keyword("for".to_string()));
445 self.visit_pat(&for_loop.pat);
446 self.tokens.push(NormalizedToken::Keyword("in".to_string()));
447 self.visit_expr(&for_loop.expr);
448 self.visit_block(&for_loop.body);
449 }
450 syn::Expr::While(while_loop) => {
451 self.tokens
452 .push(NormalizedToken::Keyword("while".to_string()));
453 self.visit_expr(&while_loop.cond);
454 self.visit_block(&while_loop.body);
455 }
456 syn::Expr::Loop(loop_expr) => {
457 self.tokens
458 .push(NormalizedToken::Keyword("loop".to_string()));
459 self.visit_block(&loop_expr.body);
460 }
461 syn::Expr::Return(ret_expr) => {
462 self.tokens
463 .push(NormalizedToken::Keyword("return".to_string()));
464 if let Some(expr) = &ret_expr.expr {
465 self.visit_expr(expr);
466 }
467 }
468 syn::Expr::Field(field_expr) => {
469 self.visit_expr(&field_expr.base);
470 self.tokens.push(NormalizedToken::Delimiter('.'));
471 self.add_var_placeholder();
472 }
473 syn::Expr::Index(index_expr) => {
474 self.visit_expr(&index_expr.expr);
475 self.tokens.push(NormalizedToken::Delimiter('['));
476 self.visit_expr(&index_expr.index);
477 self.tokens.push(NormalizedToken::Delimiter(']'));
478 }
479 syn::Expr::Block(block_expr) => {
480 self.visit_block(&block_expr.block);
481 }
482 syn::Expr::Struct(struct_expr) => {
483 self.add_var_placeholder();
484 self.tokens.push(NormalizedToken::Delimiter('{'));
485 for field in struct_expr.fields.iter() {
486 self.add_var_placeholder();
487 self.tokens.push(NormalizedToken::Operator(":".to_string()));
488 self.visit_expr(&field.expr);
489 self.tokens.push(NormalizedToken::Delimiter(','));
490 }
491 if !struct_expr.fields.is_empty() {
492 self.tokens.pop();
493 }
494 self.tokens.push(NormalizedToken::Delimiter('}'));
495 }
496 _ => {
497 syn::visit::visit_expr(self, expr);
498 }
499 }
500 }
501}
502
503pub fn extract_fingerprint(func: &ItemFn, file_path: PathBuf) -> Option<FunctionFingerprint> {
505 let mut normalizer = FunctionNormalizer::new();
506 normalizer.visit_item_fn(func);
507
508 if normalizer.tokens.is_empty() || normalizer.tokens.len() < 3 {
509 return None;
510 }
511
512 let line_start = func.sig.fn_token.span.start().line;
513 let line_end = func.block.span().end().line;
514
515 let location = FileLocation::new(file_path, line_start, line_end, func.sig.ident.to_string());
516
517 Some(FunctionFingerprint::new(
518 normalizer.tokens,
519 location,
520 func.sig.ident.to_string(),
521 ))
522}
523
524pub fn jaccard_similarity(a: &[NormalizedToken], b: &[NormalizedToken]) -> f64 {
526 use std::collections::HashSet;
527
528 if a.is_empty() && b.is_empty() {
529 return 1.0;
530 }
531
532 if a.is_empty() || b.is_empty() {
533 return 0.0;
534 }
535
536 let set_a: HashSet<&NormalizedToken> = a.iter().collect();
537 let set_b: HashSet<&NormalizedToken> = b.iter().collect();
538
539 let intersection = set_a.intersection(&set_b).count();
540 let union = set_a.union(&set_b).count();
541
542 if union == 0 {
543 0.0
544 } else {
545 intersection as f64 / union as f64
546 }
547}
548
#[cfg(test)]
mod tests {
    use super::*;

    // Even a body-less function has signature tokens, so it must still
    // produce a fingerprint.
    #[test]
    fn test_fingerprint_empty_function() {
        let code = r#"
            fn empty_func() {}
        "#;

        let func: ItemFn = syn::parse_str(code).expect("Failed to parse empty function");
        let fp = extract_fingerprint(&func, PathBuf::from("test.rs"));

        assert!(fp.is_some(), "Empty function should produce a fingerprint");

        let fp = fp.unwrap();
        assert!(
            fp.token_count >= 5,
            "Empty function should have at least signature tokens, got {}",
            fp.token_count
        );
        assert_ne!(fp.hash, 0, "Hash should never be zero (collision risk)");
        assert_eq!(fp.function_name, "empty_func");
        assert_eq!(fp.locations.len(), 1);
    }

    // Core normalization property: the two bodies differ only in
    // identifier names, so their placeholder sequences — and hence
    // hashes — must be identical.
    #[test]
    fn test_normalization_variable_renaming_produces_same_hash() {
        let code_a = r#"
            fn process_data(data: i32) -> i32 {
                let result = data * 2;
                result + 1
            }
        "#;

        let code_b = r#"
            fn calculate_value(input: i32) -> i32 {
                let output = input * 2;
                output + 1
            }
        "#;

        let func_a: ItemFn = syn::parse_str(code_a).expect("Failed to parse code A");
        let func_b: ItemFn = syn::parse_str(code_b).expect("Failed to parse code B");

        let fp_a = extract_fingerprint(&func_a, PathBuf::from("a.rs")).unwrap();
        let fp_b = extract_fingerprint(&func_b, PathBuf::from("b.rs")).unwrap();

        assert_eq!(
            fp_a.hash, fp_b.hash,
            "Variable renaming should not affect fingerprint hash"
        );
        assert_eq!(
            fp_a.normalized_tokens, fp_b.normalized_tokens,
            "Normalized tokens should be identical after renaming"
        );
    }

    // A small edit (one extra condition) should land in the
    // "near-duplicate" band: high similarity but not an exact match.
    #[test]
    fn test_similarity_near_duplicate_detection() {
        let base_code = r#"
            fn validate(item: i32) -> bool {
                if item > 0 { true } else { false }
            }
        "#;

        let modified_code = r#"
            fn check(value: i32) -> bool {
                if value > 0 && value < 100 { true } else { false }
            }
        "#;

        let base_func: ItemFn = syn::parse_str(base_code).expect("Failed to parse base");
        let modified_func: ItemFn =
            syn::parse_str(modified_code).expect("Failed to parse modified");

        let base_fp = extract_fingerprint(&base_func, PathBuf::from("base.rs")).unwrap();
        let modified_fp =
            extract_fingerprint(&modified_func, PathBuf::from("modified.rs")).unwrap();

        let similarity =
            jaccard_similarity(&base_fp.normalized_tokens, &modified_fp.normalized_tokens);

        assert!(
            similarity > 0.7,
            "Near-duplicate should have similarity > 0.7, got {:.3}",
            similarity
        );
        assert!(
            similarity < 1.0,
            "Modified function should NOT be exact match (similarity < 1.0), got {:.3}",
            similarity
        );
    }

    // Structurally unrelated functions (arithmetic vs. file I/O) must
    // score well below the duplicate threshold.
    #[test]
    fn test_similarity_different_functions_low_score() {
        let code_math = r#"
            fn calculate(x: i32, y: i32) -> i32 { x + y * 2 }
        "#;

        let code_io = r#"
            fn read_file(path: &str) -> Result<String, std::io::Error> {
                std::fs::read_to_string(path)
            }
        "#;

        let math_func: ItemFn = syn::parse_str(code_math).expect("Failed to parse math");
        let io_func: ItemFn = syn::parse_str(code_io).expect("Failed to parse io");

        let math_fp = extract_fingerprint(&math_func, PathBuf::from("math.rs")).unwrap();
        let io_fp = extract_fingerprint(&io_func, PathBuf::from("io.rs")).unwrap();

        let similarity = jaccard_similarity(&math_fp.normalized_tokens, &io_fp.normalized_tokens);

        assert!(
            similarity < 0.65,
            "Different functions should have low similarity (< 0.65), got {:.3}",
            similarity
        );
    }

    // The fingerprint hash is computed only from the token stream, so the
    // file path passed in must not influence it.
    #[test]
    fn test_hash_determinism_and_path_independence() {
        let code = r#"
            fn example(x: i32) -> i32 { x * x }
        "#;

        let func: ItemFn = syn::parse_str(code).unwrap();

        let fp1 = extract_fingerprint(&func, PathBuf::from("/path/a.rs")).unwrap();
        let fp2 = extract_fingerprint(&func, PathBuf::from("/different/path/b.rs")).unwrap();

        assert_eq!(
            fp1.hash, fp2.hash,
            "Hash must be identical regardless of file path"
        );
        assert_eq!(
            fp1.token_count, fp2.token_count,
            "Token count must be identical"
        );
    }

    // Loops, match arms, and guards must survive normalization as keyword
    // tokens — only identifiers and literals are abstracted away.
    #[test]
    fn test_complex_control_flow_normalization() {
        let code = r#"
            fn complex_logic(items: Vec<i32>) -> Vec<i32> {
                let mut result = Vec::new();
                for item in items {
                    match item {
                        x if x > 0 => result.push(x * 2),
                        x if x < 0 => result.push(x.abs()),
                        _ => continue,
                    }
                }
                result
            }
        "#;

        let func: ItemFn = syn::parse_str(code).expect("Failed to parse complex function");
        let fp = extract_fingerprint(&func, PathBuf::from("complex.rs")).unwrap();

        assert!(
            fp.token_count > 20,
            "Complex function should produce many tokens, got {}",
            fp.token_count
        );

        assert!(
            fp.normalized_tokens
                .contains(&NormalizedToken::Keyword("for".to_string()))
                && fp
                    .normalized_tokens
                    .contains(&NormalizedToken::Keyword("match".to_string()))
                && fp
                    .normalized_tokens
                    .contains(&NormalizedToken::Keyword("if".to_string())),
            "Control flow keywords must be preserved in normalized tokens"
        );
    }
}