// garbage_code_hunter/cross_file/analyzer.rs
use std::collections::HashMap;
use std::path::{Path, PathBuf};

use syn::{spanned::Spanned, ItemFn};

use crate::analyzer::Severity;
use crate::context::FileContext;

use super::fingerprint::{
    extract_fingerprint, jaccard_similarity, FileLocation, FunctionFingerprint,
};
12
/// Tunable thresholds controlling cross-file duplicate detection.
#[derive(Debug, Clone)]
pub struct CrossFileConfig {
    /// Functions spanning fewer source lines than this are not fingerprinted.
    pub min_function_lines: usize,
    /// Functions spanning more source lines than this are not fingerprinted.
    pub max_function_lines: usize,
    /// A pattern must occur in at least this many distinct files to be reported.
    pub min_file_occurrences: usize,
    /// Jaccard similarity (0.0..=1.0) at or above which two fingerprints
    /// count as near-duplicates.
    pub similarity_threshold: f64,
    /// Soft cap on the fingerprint index's estimated footprint, in megabytes.
    pub max_memory_mb: usize,
}
22
impl Default for CrossFileConfig {
    fn default() -> Self {
        Self {
            // Tiny helpers (< 5 lines) match each other too easily to be signal.
            min_function_lines: 5,
            // Very large functions are slow to fingerprint and rarely duplicated verbatim.
            max_function_lines: 150,
            // "Cross-file" by definition requires at least two files.
            min_file_occurrences: 2,
            // Token-set similarity required to flag a near-duplicate pair.
            similarity_threshold: 0.85,
            // Soft memory cap enforced by `evict_old_entries`.
            max_memory_mb: 512,
        }
    }
}
34
/// A duplicate (or near-duplicate) function pattern detected across files.
#[derive(Debug, Clone)]
pub struct CrossFileIssue {
    /// Representative fingerprint for the duplicated function.
    pub fingerprint: FunctionFingerprint,
    /// Number of distinct files the pattern appears in.
    pub file_count: usize,
    /// Total number of occurrences across all files.
    pub total_occurrences: usize,
    /// 1.0 for exact structural-hash matches; otherwise the Jaccard score
    /// of the two functions' normalized token sets.
    pub similarity_score: f64,
    /// Severity derived from how widespread the duplication is.
    pub severity: Severity,
}
44
45impl CrossFileIssue {
46 fn compute_severity(&self) -> Severity {
48 if self.total_occurrences > 10 || self.file_count > 5 {
49 Severity::Nuclear
50 } else if self.total_occurrences > 5 || self.file_count > 3 {
51 Severity::Spicy
52 } else {
53 Severity::Mild
54 }
55 }
56}
57
/// Indexes function fingerprints across many files and reports duplicates.
pub struct CrossFileAnalyzer {
    // Fingerprints keyed by structural hash; exact duplicates share one
    // entry whose `locations` list every occurrence.
    index: HashMap<u64, FunctionFingerprint>,
    // Thresholds controlling what gets indexed and what gets reported.
    config: CrossFileConfig,
    // Functions that passed the size filter and were fingerprinted.
    total_functions_processed: usize,
    // Files that parsed successfully.
    total_files_processed: usize,
}
66
impl Default for CrossFileAnalyzer {
    /// Equivalent to [`CrossFileAnalyzer::new`] (default configuration).
    fn default() -> Self {
        Self::new()
    }
}
72
73impl CrossFileAnalyzer {
74 pub fn new() -> Self {
76 Self::with_config(CrossFileConfig::default())
77 }
78
79 pub fn with_config(config: CrossFileConfig) -> Self {
81 Self {
82 index: HashMap::new(),
83 config,
84 total_functions_processed: 0,
85 total_files_processed: 0,
86 }
87 }
88
89 pub fn process_file(&mut self, file_path: &Path, content: &str) -> Result<(), String> {
91 let syntax: syn::File =
92 syn::parse_str(content).map_err(|e| format!("Parse error: {}", e))?;
93
94 self.total_files_processed += 1;
95
96 for item in syntax.items.iter() {
97 if let syn::Item::Fn(func) = item {
98 if let Some(fp) = self.process_function(func, file_path) {
99 self.add_fingerprint(fp);
100 }
101 }
102 }
103
104 Ok(())
105 }
106
107 fn process_function(&self, func: &ItemFn, file_path: &Path) -> Option<FunctionFingerprint> {
109 let line_start = func.sig.fn_token.span.start().line;
110 let line_end = func.block.span().end().line;
111 let line_count = line_end - line_start + 1;
112
113 if line_count < self.config.min_function_lines {
115 return None;
116 }
117
118 if line_count > self.config.max_function_lines {
120 return None;
121 }
122
123 extract_fingerprint(func, file_path.to_path_buf())
124 }
125
126 fn add_fingerprint(&mut self, mut fingerprint: FunctionFingerprint) {
128 self.total_functions_processed += 1;
129
130 match self.index.get_mut(&fingerprint.hash) {
131 Some(existing) => {
132 existing.locations.append(&mut fingerprint.locations);
133 }
134 None => {
135 self.index.insert(fingerprint.hash, fingerprint);
136 }
137 }
138 }
139
140 pub fn find_all_duplicates(&self) -> Vec<CrossFileIssue> {
142 let mut issues = Vec::new();
143
144 for (_hash, fingerprint) in self.index.iter() {
145 let unique_files: std::collections::HashSet<&PathBuf> = fingerprint
147 .locations
148 .iter()
149 .map(|loc| &loc.file_path)
150 .collect();
151
152 let file_count = unique_files.len();
153 let total_occurrences = fingerprint.locations.len();
154
155 if file_count < self.config.min_file_occurrences {
157 continue;
158 }
159
160 let similarity = 1.0;
162
163 let issue = CrossFileIssue {
164 fingerprint: fingerprint.clone(),
165 file_count,
166 total_occurrences,
167 similarity_score: similarity,
168 severity: Severity::Mild, };
170
171 let issue_with_severity = CrossFileIssue {
172 severity: issue.compute_severity(),
173 ..issue
174 };
175
176 issues.push(issue_with_severity);
177 }
178
179 issues.sort_by(|a, b| {
181 b.severity
182 .cmp(&a.severity)
183 .then(b.total_occurrences.cmp(&a.total_occurrences))
184 });
185
186 issues
187 }
188
189 pub fn find_near_duplicates(&self) -> Vec<CrossFileIssue> {
191 let fingerprints: Vec<&FunctionFingerprint> = self.index.values().collect();
192 let mut issues = Vec::new();
193 let mut compared_pairs: std::collections::HashSet<(u64, u64)> =
194 std::collections::HashSet::new();
195
196 for i in 0..fingerprints.len() {
197 for j in (i + 1)..fingerprints.len() {
198 let fp_a = fingerprints[i];
199 let fp_b = fingerprints[j];
200
201 let pair_key = if fp_a.hash < fp_b.hash {
203 (fp_a.hash, fp_b.hash)
204 } else {
205 (fp_b.hash, fp_a.hash)
206 };
207
208 if fp_a.hash == fp_b.hash || compared_pairs.contains(&pair_key) {
209 continue;
210 }
211
212 compared_pairs.insert(pair_key);
213
214 let similarity =
216 jaccard_similarity(&fp_a.normalized_tokens, &fp_b.normalized_tokens);
217
218 if similarity >= self.config.similarity_threshold {
219 let all_locations: Vec<&FileLocation> =
221 fp_a.locations.iter().chain(fp_b.locations.iter()).collect();
222 let unique_files: std::collections::HashSet<&PathBuf> =
223 all_locations.iter().map(|loc| &loc.file_path).collect();
224
225 let file_count = unique_files.len();
226 let total_occurrences = all_locations.len();
227
228 if file_count >= self.config.min_file_occurrences {
229 let issue = CrossFileIssue {
230 fingerprint: fp_a.clone(), file_count,
232 total_occurrences,
233 similarity_score: similarity,
234 severity: Severity::Mild,
235 };
236
237 let issue_with_severity = CrossFileIssue {
238 severity: issue.compute_severity(),
239 ..issue
240 };
241
242 issues.push(issue_with_severity);
243 }
244 }
245 }
246 }
247
248 issues.sort_by(|a, b| {
250 b.severity.cmp(&a.severity).then(
251 b.similarity_score
252 .partial_cmp(&a.similarity_score)
253 .unwrap_or(std::cmp::Ordering::Equal),
254 )
255 });
256
257 issues
258 }
259
260 pub fn estimated_memory_usage(&self) -> usize {
262 let base_size = std::mem::size_of::<Self>();
263 let index_size: usize = self
264 .index
265 .values()
266 .map(|fp| {
267 std::mem::size_of::<u64>()
268 + std::mem::size_of::<FunctionFingerprint>()
269 + fp.normalized_tokens.capacity()
270 * std::mem::size_of::<super::fingerprint::NormalizedToken>()
271 + fp.locations.capacity() * std::mem::size_of::<FileLocation>()
272 + fp.function_name.capacity()
273 })
274 .sum();
275
276 base_size + index_size
277 }
278
279 pub fn evict_old_entries(&mut self) {
281 let limit_bytes = self.config.max_memory_mb * 1024 * 1024;
282
283 while self.estimated_memory_usage() > limit_bytes && !self.index.is_empty() {
284 let keys_to_remove: Vec<u64> = self.index.keys().take(10).copied().collect();
287 for key in keys_to_remove {
288 self.index.remove(&key);
289 }
290 }
291 }
292
293 pub fn stats(&self) -> AnalysisStats {
295 AnalysisStats {
296 total_functions: self.total_functions_processed,
297 total_files: self.total_files_processed,
298 unique_fingerprints: self.index.len(),
299 memory_bytes: self.estimated_memory_usage(),
300 }
301 }
302}
303
/// Summary counters describing an analysis run.
#[derive(Debug, Clone)]
pub struct AnalysisStats {
    /// Functions fingerprinted (after size filtering).
    pub total_functions: usize,
    /// Files that parsed successfully.
    pub total_files: usize,
    /// Distinct structural hashes currently held in the index.
    pub unique_fingerprints: usize,
    /// Estimated analyzer footprint, in bytes.
    pub memory_bytes: usize,
}
312
313pub fn analyze_project<P: AsRef<Path>>(
326 root: P,
327 config: CrossFileConfig,
328) -> Result<CrossFileAnalyzer, String> {
329 let root = root.as_ref();
330 let mut analyzer = CrossFileAnalyzer::with_config(config);
331
332 walk_directory(root, |path, content| analyzer.process_file(path, content))?;
333
334 Ok(analyzer)
335}
336
337fn walk_directory<F>(root: &Path, mut processor: F) -> Result<(), String>
339where
340 F: FnMut(&Path, &str) -> Result<(), String>,
341{
342 use std::fs;
343
344 if !root.is_dir() {
345 return Err(format!("{} is not a directory", root.display()));
346 }
347
348 fn visit_dir<F>(dir: &Path, processor: &mut F) -> Result<(), String>
349 where
350 F: FnMut(&Path, &str) -> Result<(), String>,
351 {
352 let entries =
353 fs::read_dir(dir).map_err(|e| format!("Cannot read dir {}: {}", dir.display(), e))?;
354
355 for entry in entries.flatten() {
356 let path = entry.path();
357
358 if path.is_dir() {
359 let name = path
361 .file_name()
362 .unwrap_or_default()
363 .to_string_lossy()
364 .to_string();
365
366 if name == "target" || name == ".git" || name == "node_modules" {
367 continue;
368 }
369
370 visit_dir(&path, processor)?;
371 } else if path.extension().is_some_and(|ext| ext == "rs") {
372 let context = FileContext::from_path(&path);
373
374 if context.rule_weight_multiplier() < 0.5 {
376 continue;
377 }
378
379 let content = fs::read_to_string(&path)
380 .map_err(|e| format!("Cannot read {}: {}", path.display(), e))?;
381
382 processor(&path, &content)?;
383 }
384 }
385
386 Ok(())
387 }
388
389 visit_dir(root, &mut processor)
390}
391
// Unit tests covering size filtering, exact/near duplicate detection,
// reporting thresholds, severity scaling, eviction, and statistics.
#[cfg(test)]
mod tests {
    use super::*;

    // Functions shorter than `min_function_lines` must be skipped while
    // larger ones are fingerprinted and counted.
    #[test]
    fn test_process_single_file_multiple_functions() {
        let code = r#"
        fn short_func() { 1 } // Too short, should be skipped

        fn valid_function_one(x: i32) -> i32 {
            let result = x * 2;
            result + 1
        }

        fn valid_function_two(data: &Vec<i32>) -> i32 {
            let mut sum = 0;
            for item in data {
                sum += item;
            }
            sum
        }
        "#;

        let mut analyzer = CrossFileAnalyzer::with_config(CrossFileConfig {
            min_function_lines: 4, ..Default::default()
        });

        let result = analyzer.process_file(Path::new("test.rs"), code);
        assert!(result.is_ok(), "Processing should succeed");

        assert_eq!(
            analyzer.index.len(),
            2,
            "Should find exactly 2 valid functions (short one skipped)"
        );
        assert_eq!(
            analyzer.total_functions_processed, 2,
            "Should have processed 2 functions"
        );
    }

    // An identical function body in two different files collapses into one
    // duplicate group with similarity 1.0.
    #[test]
    fn test_detect_exact_duplicates_across_files() {
        let shared_code = r#"
        fn calculate_total(items: &Vec<i32>) -> i32 {
            let mut total = 0;
            for item in items {
                total += item;
            }
            total
        }
        "#;

        let mut analyzer = CrossFileAnalyzer::new();

        analyzer
            .process_file(Path::new("src/utils.rs"), shared_code)
            .expect("Failed to process utils.rs");
        analyzer
            .process_file(Path::new("src/helpers.rs"), shared_code)
            .expect("Failed to process helpers.rs");

        let duplicates = analyzer.find_all_duplicates();

        assert_eq!(
            duplicates.len(),
            1,
            "Should detect exactly 1 duplicate pattern"
        );

        let issue = &duplicates[0];
        assert_eq!(
            issue.file_count, 2,
            "Duplicate should appear in 2 different files"
        );
        assert_eq!(
            issue.total_occurrences, 2,
            "Total occurrences should be 2 (one per file)"
        );
        assert!(
            (issue.similarity_score - 1.0).abs() < f64::EPSILON,
            "Exact match should have similarity 1.0"
        );
    }

    // A pattern seen in only one file must not be reported when
    // min_file_occurrences is 2.
    #[test]
    fn test_min_file_occurrences_filtering() {
        let code_unique = r#"
        fn unique_function(x: i32) -> i32 { x + 42 }
        "#;

        let mut analyzer = CrossFileAnalyzer::with_config(CrossFileConfig {
            min_function_lines: 3,
            min_file_occurrences: 2, ..Default::default()
        });

        analyzer
            .process_file(Path::new("only_file.rs"), code_unique)
            .unwrap();

        let duplicates = analyzer.find_all_duplicates();

        assert!(
            duplicates.is_empty(),
            "Single-file pattern should not be reported when min_file_occurrences=2"
        );
    }

    // Severity must escalate with the number of files a pattern appears in
    // (6 files exceeds the Nuclear threshold of file_count > 5).
    #[test]
    fn test_severity_scaling_with_duplication_extent() {
        let shared_code = r#"
        fn duplicated(x: i32) -> i32 {
            let y = x * 2;
            y + 1
        }
        "#;

        let mut analyzer = CrossFileAnalyzer::with_config(CrossFileConfig {
            min_function_lines: 4,
            ..Default::default()
        });

        for i in 0..6 {
            analyzer
                .process_file(Path::new(&format!("file_{}.rs", i)), shared_code)
                .unwrap();
        }

        let duplicates = analyzer.find_all_duplicates();

        assert_eq!(duplicates.len(), 1, "Should find 1 duplicate group");
        assert_eq!(
            duplicates[0].severity,
            Severity::Nuclear,
            "6 files with same function should be Nuclear severity"
        );
    }

    // With a 1 MB cap and explicit eviction, the index's estimated footprint
    // should stay within a 2x tolerance of the limit.
    #[test]
    fn test_memory_limit_enforcement() {
        let config = CrossFileConfig {
            max_memory_mb: 1, min_function_lines: 3,
            ..Default::default()
        };

        let mut analyzer = CrossFileAnalyzer::with_config(config.clone());

        let simple_fn = r#"
        fn sample_func(a: i32, b: i32) -> i32 { a + b }
        "#;

        for i in 0..100 {
            let _ = analyzer.process_file(Path::new(&format!("test_{}.rs", i)), simple_fn);

            if analyzer.estimated_memory_usage() > config.max_memory_mb * 1024 * 1024 {
                analyzer.evict_old_entries();
            }
        }

        let max_allowed = config.max_memory_mb * 1024 * 1024 * 2;
        assert!(
            analyzer.estimated_memory_usage() <= max_allowed,
            "Memory usage ({}) should stay within 2x limit ({})",
            analyzer.estimated_memory_usage(),
            max_allowed
        );
    }

    // stats() must reflect the function/file/fingerprint counters and report
    // a non-zero memory estimate.
    #[test]
    fn test_statistics_accuracy() {
        let code = r#"
        fn first_func(x: i32) -> i32 { x + 42 }
        fn second_func(data: &Vec<i32>) -> i32 {
            let mut sum = 0;
            for item in data { sum += item; }
            sum
        }
        "#;

        let mut analyzer = CrossFileAnalyzer::with_config(CrossFileConfig {
            min_function_lines: 1, ..Default::default()
        });

        analyzer
            .process_file(Path::new("stats_test.rs"), code)
            .unwrap();

        let stats = analyzer.stats();

        assert_eq!(
            stats.total_functions, 2,
            "Should have processed 2 functions"
        );
        assert_eq!(stats.total_files, 1, "Should have processed 1 file");
        assert_eq!(
            stats.unique_fingerprints, 2,
            "Should have 2 unique fingerprints (different structures)"
        );
        assert!(stats.memory_bytes > 0, "Memory usage should be positive");
    }

    // Two structurally similar functions (renamed identifiers, one tweaked
    // comparison) exercise the fuzzy-matching path.
    #[test]
    fn test_near_duplicate_detection_fuzzy_matching() {
        let code_base = r#"
        fn process_data(data: &Vec<i32>) -> i32 {
            let mut sum = 0;
            for item in data {
                if *item > 0 {
                    sum += item;
                }
            }
            sum
        }
        "#;

        let code_modified = r#"
        fn handle_items(items: &Vec<i32>) -> i32 {
            let mut total = 0;
            for value in items {
                if *value >= 0 {
                    total += value;
                }
            }
            total
        }
        "#;

        let mut analyzer = CrossFileAnalyzer::with_config(CrossFileConfig {
            min_function_lines: 8,
            similarity_threshold: 0.8, ..Default::default()
        });

        analyzer
            .process_file(Path::new("base.rs"), code_base)
            .unwrap();
        analyzer
            .process_file(Path::new("modified.rs"), code_modified)
            .unwrap();

        let exact_dups = analyzer.find_all_duplicates();

        let near_dups = analyzer.find_near_duplicates();

        // NOTE(review): this assertion holds trivially whenever exact_dups is
        // empty — it is a smoke check, not a strict guarantee of detection.
        assert!(
            !near_dups.is_empty() || exact_dups.is_empty(),
            "Either exact or near-duplicates should be found (or neither if too different)"
        );
    }
}