1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
use rayon::prelude::*;
use similarity_core::{
cli_parallel::{FileData, SimilarityResult},
language_parser::{GenericFunctionDef, LanguageParser},
tsed::TSEDOptions,
};
use std::fs;
use std::path::PathBuf;
/// A Rust source file paired with its content and extracted functions.
///
/// Alias for the shared [`FileData`] container specialised to
/// [`GenericFunctionDef`], so callers in this module do not have to spell
/// out the generic parameter.
// NOTE(review): `dead_code` is allowed presumably because not every build
// target that includes this module uses the alias — confirm before removing.
#[allow(dead_code)]
pub type RustFileData = FileData<GenericFunctionDef>;
/// Read and parse a set of Rust files concurrently.
///
/// Every path in `files` is processed on the rayon thread pool: the file is
/// read into memory, a fresh `RustParser` is created for it, and its
/// functions are extracted.
///
/// Files that cannot be read, for which no parser can be created, or whose
/// functions cannot be extracted are reported on stderr and left out of the
/// result, so the returned vector may be shorter than `files`.
#[allow(dead_code)]
pub fn load_files_parallel(files: &[PathBuf]) -> Vec<RustFileData> {
    files
        .par_iter()
        .filter_map(|path| {
            // Step 1: pull the whole file into memory.
            let content = match fs::read_to_string(path) {
                Ok(text) => text,
                Err(e) => {
                    eprintln!("Error reading {}: {}", path.display(), e);
                    return None;
                }
            };
            let display_name = path.to_string_lossy();

            // Step 2: build a parser owned by this closure invocation, so no
            // parser state is shared between worker threads.
            let mut rust_parser = match similarity_rs::rust_parser::RustParser::new() {
                Ok(p) => p,
                Err(e) => {
                    eprintln!("Error creating parser for {}: {}", path.display(), e);
                    return None;
                }
            };

            // Step 3: extract the function definitions.
            let functions = match rust_parser.extract_functions(&content, &display_name) {
                Ok(found) => found,
                Err(e) => {
                    eprintln!("Error parsing {}: {}", path.display(), e);
                    return None;
                }
            };

            Some(FileData {
                path: path.clone(),
                content,
                functions,
            })
        })
        .collect()
}
/// Check for duplicated functions within each Rust file, in parallel.
///
/// For every path in `files` the file is read, its functions are extracted
/// (honouring `options.skip_test`), and every unordered pair of functions in
/// that file is compared with TSED.
///
/// A pair is skipped when either function spans fewer than
/// `options.min_lines` lines, or — if `options.min_tokens` is set — when
/// either parsed tree is smaller than that many nodes. If either function
/// fails to parse, the pair's similarity is taken to be `0.0`.
///
/// # Returns
///
/// One `(path, pairs)` entry per file that has at least one pair with
/// similarity `>= threshold`. Files without such pairs are omitted, as are
/// files that cannot be read or parsed; the latter are reported on stderr.
pub fn check_within_file_duplicates_parallel(
    files: &[PathBuf],
    threshold: f64,
    options: &TSEDOptions,
) -> Vec<(PathBuf, Vec<SimilarityResult<GenericFunctionDef>>)> {
    files
        .par_iter()
        .filter_map(|file| {
            let code = match fs::read_to_string(file) {
                Ok(code) => code,
                Err(e) => {
                    eprintln!("Error reading {}: {}", file.display(), e);
                    return None;
                }
            };
            let file_str = file.to_string_lossy();

            // One parser per file keeps parser state local to this task.
            let mut parser = match similarity_rs::rust_parser::RustParser::new() {
                Ok(parser) => parser,
                Err(e) => {
                    eprintln!("Error creating parser for {}: {}", file.display(), e);
                    return None;
                }
            };

            let functions = match parser.extract_functions_with_skip_test(
                &code,
                &file_str,
                options.skip_test,
            ) {
                Ok(functions) => functions,
                Err(e) => {
                    eprintln!("Error parsing {}: {}", file.display(), e);
                    return None;
                }
            };

            // Split the file into lines once instead of once per pair.
            let lines: Vec<&str> = code.lines().collect();

            // Source text of each function that meets the minimum line
            // count, `None` for those that do not. `checked_sub` guards
            // against an `end_line` that precedes `start_line`, which would
            // otherwise underflow; such a function is treated as ineligible.
            let bodies: Vec<Option<String>> = functions
                .iter()
                .map(|func| {
                    let long_enough = func
                        .end_line
                        .checked_sub(func.start_line)
                        .map_or(false, |span| span.saturating_add(1) >= options.min_lines);
                    if long_enough {
                        Some(extract_function_body(&lines, func))
                    } else {
                        None
                    }
                })
                .collect();

            // Names handed to the parser for the two sides of a comparison.
            let label1 = format!("{}:func1", file_str);
            let label2 = format!("{}:func2", file_str);

            let mut similar_pairs = Vec::new();

            // Compare all unordered pairs of eligible functions in the file.
            for (i, (func1, body1)) in functions.iter().zip(&bodies).enumerate() {
                let body1 = match body1 {
                    Some(body) => body,
                    None => continue,
                };

                for (func2, body2) in functions.iter().zip(&bodies).skip(i + 1) {
                    let body2 = match body2 {
                        Some(body) => body,
                        None => continue,
                    };

                    // Parse both functions to trees.
                    let parsed = (
                        parser.parse(body1, &label1),
                        parser.parse(body2, &label2),
                    );

                    let similarity = match parsed {
                        (Ok(tree1), Ok(tree2)) => {
                            // Check minimum tokens if specified.
                            if let Some(min_tokens) = options.min_tokens {
                                let tokens1 = tree1.get_subtree_size() as u32;
                                let tokens2 = tree2.get_subtree_size() as u32;
                                if tokens1 < min_tokens || tokens2 < min_tokens {
                                    continue;
                                }
                            }
                            // For Rust, use TSED instead of enhanced similarity
                            // to better handle short functions.
                            similarity_core::tsed::calculate_tsed(&tree1, &tree2, options)
                        }
                        // A function that fails to parse counts as dissimilar.
                        _ => 0.0,
                    };

                    if similarity >= threshold {
                        similar_pairs.push(SimilarityResult::new(
                            func1.clone(),
                            func2.clone(),
                            similarity,
                        ));
                    }
                }
            }

            if similar_pairs.is_empty() {
                None
            } else {
                Some((file.clone(), similar_pairs))
            }
        })
        .collect()
}
/// Extract the complete source text of a function (signature included).
///
/// `func.start_line` and `func.end_line` are treated as 1-based, inclusive
/// line numbers into `lines`. An `end_line` past the end of the file is
/// clamped to the last line.
///
/// Returns the selected lines joined with `"\n"`, or an empty string when
/// the range selects nothing: the start lies beyond the file, or `end_line`
/// precedes `start_line`.
fn extract_function_body(lines: &[&str], func: &GenericFunctionDef) -> String {
    // Use the complete function, not just the body.
    let start_idx = func.start_line.saturating_sub(1) as usize;
    let end_idx = std::cmp::min(func.end_line as usize, lines.len());

    // `get` yields `None` for an inverted or out-of-bounds range instead of
    // panicking the way direct slice indexing would.
    lines
        .get(start_idx..end_idx)
        .map_or_else(String::new, |selected| selected.join("\n"))
}