Skip to main content

garbage_code_hunter/cross_file/
fingerprint.rs

1use std::hash::{Hash, Hasher};
2use std::path::PathBuf;
3
4use syn::{spanned::Spanned, visit::Visit, ItemFn};
5
6/// Normalized token representation for deduplication
7#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
8pub enum NormalizedToken {
9    /// Rust keyword (fn, let, if, match, etc.)
10    Keyword(String),
11    /// Placeholder for user-defined variable (e.g., __var_1__)
12    Variable(usize),
13    /// Placeholder for literal values
14    IntLiteral,
15    FloatLiteral,
16    StrLiteral,
17    CharLiteral,
18    BoolLiteral(bool),
19    /// Type placeholder
20    TypePlaceholder,
21    /// Operator or punctuation
22    Operator(String),
23    /// Delimiter (parentheses, braces, brackets)
24    Delimiter(char),
25}
26
27/// Unique identifier for a normalized function body
28#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
29pub struct FunctionFingerprint {
30    pub hash: u64,
31    pub normalized_tokens: Vec<NormalizedToken>,
32    pub locations: Vec<FileLocation>,
33    pub token_count: usize,
34    pub function_name: String,
35}
36
37impl FunctionFingerprint {
38    /// Create a new fingerprint from normalized tokens and compute hash
39    pub fn new(
40        normalized_tokens: Vec<NormalizedToken>,
41        location: FileLocation,
42        function_name: String,
43    ) -> Self {
44        let token_count = normalized_tokens.len();
45        let hash = compute_hash(&normalized_tokens);
46
47        Self {
48            hash,
49            normalized_tokens,
50            locations: vec![location],
51            token_count,
52            function_name,
53        }
54    }
55
56    /// Add another location where this pattern appears
57    pub fn add_location(&mut self, location: FileLocation) {
58        self.locations.push(location);
59    }
60}
61
62/// Location of a function occurrence in source code
63#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
64pub struct FileLocation {
65    pub file_path: PathBuf,
66    pub line_start: usize,
67    pub line_end: usize,
68    pub function_name: String,
69}
70
71impl FileLocation {
72    pub fn new(
73        file_path: PathBuf,
74        line_start: usize,
75        line_end: usize,
76        function_name: String,
77    ) -> Self {
78        Self {
79            file_path,
80            line_start,
81            line_end,
82            function_name,
83        }
84    }
85}
86
87/// Compute a 64-bit hash from normalized tokens using DefaultHasher
88fn compute_hash(tokens: &[NormalizedToken]) -> u64 {
89    use std::collections::hash_map::DefaultHasher;
90
91    let mut hasher = DefaultHasher::new();
92    tokens.hash(&mut hasher);
93    hasher.finish()
94}
95
96/// Visitor to extract and normalize function body tokens
97struct FunctionNormalizer {
98    tokens: Vec<NormalizedToken>,
99    var_counter: usize,
100}
101
102impl FunctionNormalizer {
103    fn new() -> Self {
104        Self {
105            tokens: Vec::new(),
106            var_counter: 0,
107        }
108    }
109
110    /// Create and add a variable placeholder to tokens (avoids borrow conflict)
111    fn add_var_placeholder(&mut self) {
112        let id = self.var_counter;
113        self.var_counter += 1;
114        self.tokens.push(NormalizedToken::Variable(id));
115    }
116
117    fn next_var_placeholder(&mut self) -> NormalizedToken {
118        let id = self.var_counter;
119        self.var_counter += 1;
120        NormalizedToken::Variable(id)
121    }
122}
123
124impl<'ast> Visit<'ast> for FunctionNormalizer {
125    fn visit_item_fn(&mut self, func: &'ast ItemFn) {
126        // Add 'fn' keyword
127        self.tokens.push(NormalizedToken::Keyword("fn".to_string()));
128
129        // Normalize function name as variable
130        let func_name_placeholder = self.next_var_placeholder();
131        self.tokens.push(func_name_placeholder);
132
133        // Visit generics if present
134        syn::visit::visit_generics(self, &func.sig.generics);
135
136        // Add opening paren
137        self.tokens.push(NormalizedToken::Delimiter('('));
138
139        // Normalize inputs
140        for input in func.sig.inputs.iter() {
141            match input {
142                syn::FnArg::Receiver(receiver) => {
143                    if receiver.reference.is_some() {
144                        self.tokens.push(NormalizedToken::Operator("&".to_string()));
145                        if receiver.mutability.is_some() {
146                            self.tokens
147                                .push(NormalizedToken::Keyword("mut".to_string()));
148                        }
149                    }
150                    self.add_var_placeholder();
151                }
152                syn::FnArg::Typed(pat_type) => {
153                    // Pattern (variable name)
154                    self.visit_pat(&pat_type.pat);
155                    // Colon separator
156                    self.tokens.push(NormalizedToken::Delimiter(':'));
157                    // Type
158                    self.visit_type(&pat_type.ty);
159                }
160            }
161
162            // Comma separator between arguments
163            self.tokens.push(NormalizedToken::Delimiter(','));
164        }
165
166        // Remove trailing comma if present
167        if !func.sig.inputs.is_empty() {
168            self.tokens.pop();
169        }
170
171        // Add closing paren
172        self.tokens.push(NormalizedToken::Delimiter(')'));
173
174        // Return type
175        match &func.sig.output {
176            syn::ReturnType::Default => {}
177            syn::ReturnType::Type(_, ty) => {
178                self.tokens
179                    .push(NormalizedToken::Operator("->".to_string()));
180                self.visit_type(ty);
181            }
182        }
183
184        // Visit function body
185        self.visit_block(&func.block);
186    }
187
188    fn visit_pat(&mut self, pat: &'ast syn::Pat) {
189        match pat {
190            syn::Pat::Ident(_) => {
191                self.add_var_placeholder();
192            }
193            syn::Pat::Wild(_) => {
194                self.tokens.push(NormalizedToken::Keyword("_".to_string()));
195            }
196            _ => {
197                syn::visit::visit_pat(self, pat);
198            }
199        }
200    }
201
202    fn visit_type(&mut self, ty: &'ast syn::Type) {
203        match ty {
204            syn::Type::Path(type_path) => {
205                if let Some(segment) = type_path.path.segments.last() {
206                    let ident_str = segment.ident.to_string();
207
208                    // Map common types to placeholders
209                    match ident_str.as_str() {
210                        "i8" | "i16" | "i32" | "i64" | "i128" | "isize" | "u8" | "u16" | "u32"
211                        | "u64" | "u128" | "usize" | "f32" | "f64" => {
212                            self.tokens.push(NormalizedToken::TypePlaceholder);
213                        }
214                        "String" | "str" => {
215                            self.tokens.push(NormalizedToken::StrLiteral);
216                        }
217                        "bool" => {
218                            self.tokens.push(NormalizedToken::TypePlaceholder);
219                        }
220                        "Vec" | "Option" | "Result" | "Box" | "Rc" | "Arc" => {
221                            self.tokens
222                                .push(NormalizedToken::Keyword(ident_str.clone()));
223                            // Visit generic arguments
224                            match &segment.arguments {
225                                syn::PathArguments::None => {}
226                                syn::PathArguments::AngleBracketed(args) => {
227                                    for arg in args.args.iter() {
228                                        syn::visit::visit_generic_argument(self, arg);
229                                    }
230                                }
231                                syn::PathArguments::Parenthesized(_) => {}
232                            }
233                        }
234                        _ => {
235                            // User-defined type - treat as variable
236                            self.add_var_placeholder();
237                        }
238                    }
239                }
240            }
241            syn::Type::Reference(_) => {
242                self.tokens.push(NormalizedToken::Operator("&".to_string()));
243                if let syn::Type::Reference(ref_type) = ty {
244                    self.visit_type(&ref_type.elem);
245                }
246            }
247            _ => {
248                self.tokens.push(NormalizedToken::TypePlaceholder);
249            }
250        }
251    }
252
253    fn visit_return_type(&mut self, return_type: &'ast syn::ReturnType) {
254        match return_type {
255            syn::ReturnType::Default => {}
256            syn::ReturnType::Type(_, ty) => {
257                self.visit_type(ty);
258            }
259        }
260    }
261
262    fn visit_block(&mut self, block: &'ast syn::Block) {
263        self.tokens.push(NormalizedToken::Delimiter('{'));
264
265        for stmt in block.stmts.iter() {
266            self.visit_stmt(stmt);
267        }
268
269        self.tokens.push(NormalizedToken::Delimiter('}'));
270    }
271
272    fn visit_stmt(&mut self, stmt: &'ast syn::Stmt) {
273        match stmt {
274            syn::Stmt::Local(local) => {
275                self.tokens
276                    .push(NormalizedToken::Keyword("let".to_string()));
277
278                // Check if the pattern is a mutable binding
279                let is_mut = matches!(&local.pat, syn::Pat::Ident(pat_ident) if pat_ident.mutability.is_some());
280                if is_mut {
281                    self.tokens
282                        .push(NormalizedToken::Keyword("mut".to_string()));
283                }
284
285                self.visit_pat(&local.pat);
286
287                if let Some(init) = &local.init {
288                    self.tokens.push(NormalizedToken::Operator("=".to_string()));
289                    self.visit_expr(&init.expr);
290                }
291
292                self.tokens.push(NormalizedToken::Delimiter(';'));
293            }
294            syn::Stmt::Item(item) => {
295                syn::visit::visit_item(self, item);
296            }
297            syn::Stmt::Expr(expr, _) => {
298                self.visit_expr(expr);
299            }
300            syn::Stmt::Macro(_) => {
301                // Skip macro statements for now
302                self.tokens
303                    .push(NormalizedToken::Keyword("macro!".to_string()));
304            }
305        }
306    }
307
308    fn visit_expr(&mut self, expr: &'ast syn::Expr) {
309        match expr {
310            syn::Expr::Path(expr_path) => {
311                if let Some(ident) = expr_path.path.get_ident() {
312                    let ident_str = ident.to_string();
313                    match ident_str.as_str() {
314                        "true" => self.tokens.push(NormalizedToken::BoolLiteral(true)),
315                        "false" => self.tokens.push(NormalizedToken::BoolLiteral(false)),
316                        _ => self.add_var_placeholder(),
317                    }
318                } else {
319                    self.add_var_placeholder();
320                }
321            }
322            syn::Expr::Lit(expr_lit) => match &expr_lit.lit {
323                syn::Lit::Int(_) => self.tokens.push(NormalizedToken::IntLiteral),
324                syn::Lit::Float(_) => self.tokens.push(NormalizedToken::FloatLiteral),
325                syn::Lit::Str(_) => self.tokens.push(NormalizedToken::StrLiteral),
326                syn::Lit::Char(_) => self.tokens.push(NormalizedToken::CharLiteral),
327                syn::Lit::Bool(lit) => {
328                    self.tokens.push(NormalizedToken::BoolLiteral(lit.value));
329                }
330                _ => self.tokens.push(NormalizedToken::IntLiteral),
331            },
332            syn::Expr::Binary(bin_expr) => {
333                self.visit_expr(&bin_expr.left);
334                self.tokens.push(match bin_expr.op {
335                    syn::BinOp::Add(_) => NormalizedToken::Operator("+".to_string()),
336                    syn::BinOp::Sub(_) => NormalizedToken::Operator("-".to_string()),
337                    syn::BinOp::Mul(_) => NormalizedToken::Operator("*".to_string()),
338                    syn::BinOp::Div(_) => NormalizedToken::Operator("/".to_string()),
339                    syn::BinOp::Rem(_) => NormalizedToken::Operator("%".to_string()),
340                    syn::BinOp::And(_) => NormalizedToken::Operator("&&".to_string()),
341                    syn::BinOp::Or(_) => NormalizedToken::Operator("||".to_string()),
342                    syn::BinOp::BitAnd(_) => NormalizedToken::Operator("&".to_string()),
343                    syn::BinOp::BitOr(_) => NormalizedToken::Operator("|".to_string()),
344                    syn::BinOp::BitXor(_) => NormalizedToken::Operator("^".to_string()),
345                    syn::BinOp::Shl(_) => NormalizedToken::Operator("<<".to_string()),
346                    syn::BinOp::Shr(_) => NormalizedToken::Operator(">>".to_string()),
347                    syn::BinOp::Eq(_) => NormalizedToken::Operator("==".to_string()),
348                    syn::BinOp::Lt(_) => NormalizedToken::Operator("<".to_string()),
349                    syn::BinOp::Le(_) => NormalizedToken::Operator("<=".to_string()),
350                    syn::BinOp::Ne(_) => NormalizedToken::Operator("!=".to_string()),
351                    syn::BinOp::Ge(_) => NormalizedToken::Operator(">=".to_string()),
352                    syn::BinOp::Gt(_) => NormalizedToken::Operator(">".to_string()),
353                    _ => {
354                        // For assignment operators, use a generic representation
355                        let op_str = format!("{:?}", bin_expr.op);
356                        NormalizedToken::Operator(op_str)
357                    }
358                });
359                self.visit_expr(&bin_expr.right);
360            }
361            syn::Expr::Unary(unary_expr) => {
362                match unary_expr.op {
363                    syn::UnOp::Deref(_) => {
364                        self.tokens.push(NormalizedToken::Operator("*".to_string()))
365                    }
366                    syn::UnOp::Not(_) => {
367                        self.tokens.push(NormalizedToken::Operator("!".to_string()))
368                    }
369                    syn::UnOp::Neg(_) => {
370                        self.tokens.push(NormalizedToken::Operator("-".to_string()))
371                    }
372                    _ => {
373                        // Handle any future UnOp variants
374                        let op_str = format!("{:?}", unary_expr.op);
375                        self.tokens.push(NormalizedToken::Operator(op_str))
376                    }
377                }
378                self.visit_expr(&unary_expr.expr);
379            }
380            syn::Expr::MethodCall(method_call) => {
381                self.visit_expr(&method_call.receiver);
382                self.tokens.push(NormalizedToken::Delimiter('.'));
383                self.add_var_placeholder();
384                self.tokens.push(NormalizedToken::Delimiter('('));
385                for arg in method_call.args.iter() {
386                    self.visit_expr(arg);
387                    self.tokens.push(NormalizedToken::Delimiter(','));
388                }
389                if !method_call.args.is_empty() {
390                    self.tokens.pop();
391                }
392                self.tokens.push(NormalizedToken::Delimiter(')'));
393            }
394            syn::Expr::Call(call_expr) => {
395                self.visit_expr(&call_expr.func);
396                self.tokens.push(NormalizedToken::Delimiter('('));
397                for arg in call_expr.args.iter() {
398                    self.visit_expr(arg);
399                    self.tokens.push(NormalizedToken::Delimiter(','));
400                }
401                if !call_expr.args.is_empty() {
402                    self.tokens.pop();
403                }
404                self.tokens.push(NormalizedToken::Delimiter(')'));
405            }
406            syn::Expr::If(if_expr) => {
407                self.tokens.push(NormalizedToken::Keyword("if".to_string()));
408                self.visit_expr(&if_expr.cond);
409                self.visit_block(&if_expr.then_branch);
410                if let Some((_, else_block)) = &if_expr.else_branch {
411                    self.tokens
412                        .push(NormalizedToken::Keyword("else".to_string()));
413                    match else_block.as_ref() {
414                        syn::Expr::If(inner_if) => {
415                            self.visit_expr(&syn::Expr::If(inner_if.clone()));
416                        }
417                        syn::Expr::Block(expr_block) => {
418                            self.visit_block(&expr_block.block);
419                        }
420                        _ => {}
421                    }
422                }
423            }
424            syn::Expr::Match(match_expr) => {
425                self.tokens
426                    .push(NormalizedToken::Keyword("match".to_string()));
427                self.visit_expr(&match_expr.expr);
428                self.tokens.push(NormalizedToken::Delimiter('{'));
429                for arm in match_expr.arms.iter() {
430                    self.visit_pat(&arm.pat);
431                    if let Some(guard) = &arm.guard {
432                        self.tokens.push(NormalizedToken::Keyword("if".to_string()));
433                        self.visit_expr(&guard.1); // Extract the expression from (If, Expr) tuple
434                    }
435                    self.tokens
436                        .push(NormalizedToken::Operator("=>".to_string()));
437                    self.visit_expr(&arm.body);
438                    self.tokens.push(NormalizedToken::Delimiter(','));
439                }
440                self.tokens.push(NormalizedToken::Delimiter('}'));
441            }
442            syn::Expr::ForLoop(for_loop) => {
443                self.tokens
444                    .push(NormalizedToken::Keyword("for".to_string()));
445                self.visit_pat(&for_loop.pat);
446                self.tokens.push(NormalizedToken::Keyword("in".to_string()));
447                self.visit_expr(&for_loop.expr);
448                self.visit_block(&for_loop.body);
449            }
450            syn::Expr::While(while_loop) => {
451                self.tokens
452                    .push(NormalizedToken::Keyword("while".to_string()));
453                self.visit_expr(&while_loop.cond);
454                self.visit_block(&while_loop.body);
455            }
456            syn::Expr::Loop(loop_expr) => {
457                self.tokens
458                    .push(NormalizedToken::Keyword("loop".to_string()));
459                self.visit_block(&loop_expr.body);
460            }
461            syn::Expr::Return(ret_expr) => {
462                self.tokens
463                    .push(NormalizedToken::Keyword("return".to_string()));
464                if let Some(expr) = &ret_expr.expr {
465                    self.visit_expr(expr);
466                }
467            }
468            syn::Expr::Field(field_expr) => {
469                self.visit_expr(&field_expr.base);
470                self.tokens.push(NormalizedToken::Delimiter('.'));
471                self.add_var_placeholder();
472            }
473            syn::Expr::Index(index_expr) => {
474                self.visit_expr(&index_expr.expr);
475                self.tokens.push(NormalizedToken::Delimiter('['));
476                self.visit_expr(&index_expr.index);
477                self.tokens.push(NormalizedToken::Delimiter(']'));
478            }
479            syn::Expr::Block(block_expr) => {
480                self.visit_block(&block_expr.block);
481            }
482            syn::Expr::Struct(struct_expr) => {
483                self.add_var_placeholder();
484                self.tokens.push(NormalizedToken::Delimiter('{'));
485                for field in struct_expr.fields.iter() {
486                    self.add_var_placeholder();
487                    self.tokens.push(NormalizedToken::Operator(":".to_string()));
488                    self.visit_expr(&field.expr);
489                    self.tokens.push(NormalizedToken::Delimiter(','));
490                }
491                if !struct_expr.fields.is_empty() {
492                    self.tokens.pop();
493                }
494                self.tokens.push(NormalizedToken::Delimiter('}'));
495            }
496            _ => {
497                syn::visit::visit_expr(self, expr);
498            }
499        }
500    }
501}
502
503/// Extract normalized fingerprint from a parsed function
504pub fn extract_fingerprint(func: &ItemFn, file_path: PathBuf) -> Option<FunctionFingerprint> {
505    let mut normalizer = FunctionNormalizer::new();
506    normalizer.visit_item_fn(func);
507
508    if normalizer.tokens.is_empty() || normalizer.tokens.len() < 3 {
509        return None;
510    }
511
512    let line_start = func.sig.fn_token.span.start().line;
513    let line_end = func.block.span().end().line;
514
515    let location = FileLocation::new(file_path, line_start, line_end, func.sig.ident.to_string());
516
517    Some(FunctionFingerprint::new(
518        normalizer.tokens,
519        location,
520        func.sig.ident.to_string(),
521    ))
522}
523
524/// Calculate Jaccard similarity between two token sequences
525pub fn jaccard_similarity(a: &[NormalizedToken], b: &[NormalizedToken]) -> f64 {
526    use std::collections::HashSet;
527
528    if a.is_empty() && b.is_empty() {
529        return 1.0;
530    }
531
532    if a.is_empty() || b.is_empty() {
533        return 0.0;
534    }
535
536    let set_a: HashSet<&NormalizedToken> = a.iter().collect();
537    let set_b: HashSet<&NormalizedToken> = b.iter().collect();
538
539    let intersection = set_a.intersection(&set_b).count();
540    let union = set_a.union(&set_b).count();
541
542    if union == 0 {
543        0.0
544    } else {
545        intersection as f64 / union as f64
546    }
547}
548
549#[cfg(test)]
550mod tests {
551    use super::*;
552
553    /// Objective: Verify that empty functions produce valid but minimal fingerprints
554    /// Invariants: Hash should be non-zero, token count should include at least signature
555    #[test]
556    fn test_fingerprint_empty_function() {
557        let code = r#"
558        fn empty_func() {}
559        "#;
560
561        let func: ItemFn = syn::parse_str(code).expect("Failed to parse empty function");
562        let fp = extract_fingerprint(&func, PathBuf::from("test.rs"));
563
564        assert!(fp.is_some(), "Empty function should produce a fingerprint");
565
566        let fp = fp.unwrap();
567        assert!(
568            fp.token_count >= 5,
569            "Empty function should have at least signature tokens, got {}",
570            fp.token_count
571        );
572        assert_ne!(fp.hash, 0, "Hash should never be zero (collision risk)");
573        assert_eq!(fp.function_name, "empty_func");
574        assert_eq!(fp.locations.len(), 1);
575    }
576
577    /// Objective: Verify normalization produces identical output for semantically equivalent code with different variable names
578    /// Invariants: Different variable names must yield identical fingerprints
579    #[test]
580    fn test_normalization_variable_renaming_produces_same_hash() {
581        let code_a = r#"
582        fn process_data(data: i32) -> i32 {
583            let result = data * 2;
584            result + 1
585        }
586        "#;
587
588        let code_b = r#"
589        fn calculate_value(input: i32) -> i32 {
590            let output = input * 2;
591            output + 1
592        }
593        "#;
594
595        let func_a: ItemFn = syn::parse_str(code_a).expect("Failed to parse code A");
596        let func_b: ItemFn = syn::parse_str(code_b).expect("Failed to parse code B");
597
598        let fp_a = extract_fingerprint(&func_a, PathBuf::from("a.rs")).unwrap();
599        let fp_b = extract_fingerprint(&func_b, PathBuf::from("b.rs")).unwrap();
600
601        assert_eq!(
602            fp_a.hash, fp_b.hash,
603            "Variable renaming should not affect fingerprint hash"
604        );
605        assert_eq!(
606            fp_a.normalized_tokens, fp_b.normalized_tokens,
607            "Normalized tokens should be identical after renaming"
608        );
609    }
610
611    /// Objective: Verify Jaccard similarity correctly identifies near-duplicates
612    /// Invariants: Functions differing by one line should have high similarity (>0.85)
613    #[test]
614    fn test_similarity_near_duplicate_detection() {
615        let base_code = r#"
616        fn validate(item: i32) -> bool {
617            if item > 0 { true } else { false }
618        }
619        "#;
620
621        let modified_code = r#"
622        fn check(value: i32) -> bool {
623            if value > 0 && value < 100 { true } else { false }
624        }
625        "#;
626
627        let base_func: ItemFn = syn::parse_str(base_code).expect("Failed to parse base");
628        let modified_func: ItemFn =
629            syn::parse_str(modified_code).expect("Failed to parse modified");
630
631        let base_fp = extract_fingerprint(&base_func, PathBuf::from("base.rs")).unwrap();
632        let modified_fp =
633            extract_fingerprint(&modified_func, PathBuf::from("modified.rs")).unwrap();
634
635        let similarity =
636            jaccard_similarity(&base_fp.normalized_tokens, &modified_fp.normalized_tokens);
637
638        assert!(
639            similarity > 0.7,
640            "Near-duplicate should have similarity > 0.7, got {:.3}",
641            similarity
642        );
643        assert!(
644            similarity < 1.0,
645            "Modified function should NOT be exact match (similarity < 1.0), got {:.3}",
646            similarity
647        );
648    }
649
650    /// Objective: Verify completely different functions produce low similarity scores
651    /// Invariants: Unrelated functions should have similarity < 0.5
652    #[test]
653    fn test_similarity_different_functions_low_score() {
654        let code_math = r#"
655        fn calculate(x: i32, y: i32) -> i32 { x + y * 2 }
656        "#;
657
658        let code_io = r#"
659        fn read_file(path: &str) -> Result<String, std::io::Error> {
660            std::fs::read_to_string(path)
661        }
662        "#;
663
664        let math_func: ItemFn = syn::parse_str(code_math).expect("Failed to parse math");
665        let io_func: ItemFn = syn::parse_str(code_io).expect("Failed to parse io");
666
667        let math_fp = extract_fingerprint(&math_func, PathBuf::from("math.rs")).unwrap();
668        let io_fp = extract_fingerprint(&io_func, PathBuf::from("io.rs")).unwrap();
669
670        let similarity = jaccard_similarity(&math_fp.normalized_tokens, &io_fp.normalized_tokens);
671
672        assert!(
673            similarity < 0.65,
674            "Different functions should have low similarity (< 0.65), got {:.3}",
675            similarity
676        );
677    }
678
679    /// Objective: Verify identical functions always produce same hash regardless of file path
680    /// Invariants: Hash must be deterministic and path-independent
681    #[test]
682    fn test_hash_determinism_and_path_independence() {
683        let code = r#"
684        fn example(x: i32) -> i32 { x * x }
685        "#;
686
687        let func: ItemFn = syn::parse_str(code).unwrap();
688
689        let fp1 = extract_fingerprint(&func, PathBuf::from("/path/a.rs")).unwrap();
690        let fp2 = extract_fingerprint(&func, PathBuf::from("/different/path/b.rs")).unwrap();
691
692        assert_eq!(
693            fp1.hash, fp2.hash,
694            "Hash must be identical regardless of file path"
695        );
696        assert_eq!(
697            fp1.token_count, fp2.token_count,
698            "Token count must be identical"
699        );
700    }
701
702    /// Objective: Verify fingerprint extraction handles complex control flow structures
703    /// Invariants: Nested loops, matches, and ifs should all be captured in normalized form
704    #[test]
705    fn test_complex_control_flow_normalization() {
706        let code = r#"
707        fn complex_logic(items: Vec<i32>) -> Vec<i32> {
708            let mut result = Vec::new();
709            for item in items {
710                match item {
711                    x if x > 0 => result.push(x * 2),
712                    x if x < 0 => result.push(x.abs()),
713                    _ => continue,
714                }
715            }
716            result
717        }
718        "#;
719
720        let func: ItemFn = syn::parse_str(code).expect("Failed to parse complex function");
721        let fp = extract_fingerprint(&func, PathBuf::from("complex.rs")).unwrap();
722
723        assert!(
724            fp.token_count > 20,
725            "Complex function should produce many tokens, got {}",
726            fp.token_count
727        );
728
729        assert!(
730            fp.normalized_tokens
731                .contains(&NormalizedToken::Keyword("for".to_string()))
732                && fp
733                    .normalized_tokens
734                    .contains(&NormalizedToken::Keyword("match".to_string()))
735                && fp
736                    .normalized_tokens
737                    .contains(&NormalizedToken::Keyword("if".to_string())),
738            "Control flow keywords must be preserved in normalized tokens"
739        );
740    }
741}