1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
use crate::apted::{compute_edit_distance, compute_edit_distance_with_cutoff, APTEDOptions};
use crate::tree::TreeNode;
use std::rc::Rc;
#[derive(Debug, Clone)]
pub struct TSEDOptions {
pub apted_options: APTEDOptions,
pub min_lines: u32, // Minimum number of lines for a function to be considered
pub min_tokens: Option<u32>, // Minimum number of tokens (AST nodes) for a function to be considered
pub size_penalty: bool, // Apply penalty for short functions
pub skip_test: bool, // Skip test functions (language-specific)
}
impl Default for TSEDOptions {
fn default() -> Self {
TSEDOptions {
apted_options: APTEDOptions {
rename_cost: 0.3, // Default from the TypeScript implementation
delete_cost: 1.0,
insert_cost: 1.0,
compare_values: false, // TypeScript default: structural comparison only
},
min_lines: 5, // Increased default to better filter trivial matches
min_tokens: None, // No token limit by default
size_penalty: true, // Enable size penalty by default
skip_test: false, // Don't skip test functions by default
}
}
}
/// Calculate TSED (Tree Structure Edit Distance) similarity between two trees
/// Returns a value between 0.0 and 1.0, where 1.0 means identical
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn calculate_tsed(tree1: &Rc<TreeNode>, tree2: &Rc<TreeNode>, options: &TSEDOptions) -> f64 {
let distance = compute_edit_distance(tree1, tree2, &options.apted_options);
let size1 = tree1.get_subtree_size() as f64;
let size2 = tree2.get_subtree_size() as f64;
// TSED normalization: Use the larger tree size
// This ensures that when comparing trees of different sizes,
// the similarity reflects how much of the larger tree matches
let max_size = size1.max(size2);
// Calculate base TSED similarity
let tsed_similarity = if max_size > 0.0 { (1.0 - distance / max_size).max(0.0) } else { 1.0 };
// If distance is 0 but trees have different sizes, check more carefully
// This can happen when compare_values is false and structure is similar
let tsed_similarity = if distance == 0.0 && size1 != size2 {
let size_ratio = size1.min(size2) / size1.max(size2);
let size_diff = (size1 - size2).abs();
// Apply penalty based on both ratio and absolute difference
if size_diff > 10.0 {
// Strong penalty for large absolute differences
tsed_similarity * 0.5
} else if size_ratio < 0.95 || size_diff > 3.0 {
// Moderate penalty for noticeable differences
tsed_similarity * size_ratio.powf(0.5)
} else {
tsed_similarity // Very minor differences are OK
}
} else {
tsed_similarity
};
// For very small trees, even small differences should matter more
let tsed_similarity = if options.size_penalty {
if max_size < 10.0 && distance > 0.0 {
tsed_similarity * 0.8 // Reduce similarity for small trees with any differences
} else if max_size < 30.0 && distance > 0.0 {
// For moderately small trees, apply a smaller penalty
tsed_similarity * 0.9
} else {
tsed_similarity
}
} else {
tsed_similarity
};
// Apply additional penalties for structural differences
let mut similarity = tsed_similarity;
// Size ratio penalty: penalize when trees have very different sizes
let size_ratio = size1.min(size2) / size1.max(size2);
if options.size_penalty {
// For short functions, make differences more pronounced
let min_size = size1.min(size2);
if min_size < 30.0 {
// Short function penalty: the shorter, the more sensitive to differences
let short_function_factor = (min_size / 30.0).powf(0.5);
similarity *= short_function_factor;
// Additional penalty for very short functions
if min_size < 10.0 {
similarity *= 0.5; // Strong penalty for very short functions
} else if min_size < 20.0 {
similarity *= 0.7; // Moderate penalty for short functions
}
}
// Size difference penalty
if size_ratio < 0.5 {
// If one tree is less than half the size of the other,
// they're likely fundamentally different
similarity *= size_ratio.powf(0.5);
}
}
similarity
}
/// Calculate TSED with early termination when the result cannot reach the threshold.
/// Returns 0.0 immediately if the distance exceeds the budget, avoiding full computation.
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn calculate_tsed_with_threshold(
tree1: &Rc<TreeNode>,
tree2: &Rc<TreeNode>,
options: &TSEDOptions,
threshold: f64,
) -> f64 {
let size1 = tree1.get_subtree_size() as f64;
let size2 = tree2.get_subtree_size() as f64;
let max_size = size1.max(size2);
if max_size == 0.0 {
return 1.0;
}
// Maximum distance that could still yield similarity >= threshold
// similarity = 1.0 - distance / max_size >= threshold
// => distance <= max_size * (1.0 - threshold)
let max_distance = max_size * (1.0 - threshold);
let distance =
compute_edit_distance_with_cutoff(tree1, tree2, &options.apted_options, max_distance);
// If distance exceeded cutoff, similarity is definitely below threshold
if distance >= f64::MAX / 2.0 {
return 0.0;
}
// Apply the same normalization and penalties as calculate_tsed
let tsed_similarity = (1.0 - distance / max_size).max(0.0);
let tsed_similarity = if distance == 0.0 && size1 != size2 {
let size_ratio = size1.min(size2) / size1.max(size2);
let size_diff = (size1 - size2).abs();
if size_diff > 10.0 {
tsed_similarity * 0.5
} else if size_ratio < 0.95 || size_diff > 3.0 {
tsed_similarity * size_ratio.powf(0.5)
} else {
tsed_similarity
}
} else {
tsed_similarity
};
let tsed_similarity = if options.size_penalty {
if max_size < 10.0 && distance > 0.0 {
tsed_similarity * 0.8
} else if max_size < 30.0 && distance > 0.0 {
tsed_similarity * 0.9
} else {
tsed_similarity
}
} else {
tsed_similarity
};
let mut similarity = tsed_similarity;
let size_ratio = size1.min(size2) / size1.max(size2);
if options.size_penalty {
let min_size = size1.min(size2);
if min_size < 30.0 {
let short_function_factor = (min_size / 30.0).powf(0.5);
similarity *= short_function_factor;
if min_size < 10.0 {
similarity *= 0.5;
} else if min_size < 20.0 {
similarity *= 0.7;
}
}
if size_ratio < 0.5 {
similarity *= size_ratio.powf(0.5);
}
}
similarity
}
/// Calculate TSED from TypeScript code strings
///
/// # Errors
///
/// Returns an error if parsing fails for either code string
pub fn calculate_tsed_from_code(
code1: &str,
code2: &str,
filename1: &str,
filename2: &str,
options: &TSEDOptions,
) -> Result<f64, String> {
use crate::parser::parse_and_convert_to_tree;
let tree1 = parse_and_convert_to_tree(filename1, code1)?;
let tree2 = parse_and_convert_to_tree(filename2, code2)?;
Ok(calculate_tsed(&tree1, &tree2, options))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_identical_code() {
let code = "function add(a: number, b: number) { return a + b; }";
let options = TSEDOptions {
size_penalty: false, // Disable for small test functions
..Default::default()
};
let similarity =
calculate_tsed_from_code(code, code, "test1.ts", "test2.ts", &options).unwrap();
assert!((similarity - 1.0).abs() < f64::EPSILON);
}
#[test]
fn test_renamed_function() {
let code1 = "function add(a: number, b: number) { return a + b; }";
let code2 = "function sum(x: number, y: number) { return x + y; }";
let options = TSEDOptions {
size_penalty: false, // Disable for small test functions
..Default::default()
};
let similarity =
calculate_tsed_from_code(code1, code2, "test1.ts", "test2.ts", &options).unwrap();
// Should have high similarity due to low rename cost
assert!(similarity > 0.8);
}
#[test]
fn test_different_structure() {
let code1 = "function test() { return 1; }";
let code2 = "class Test { method() { return 1; } }";
let options = TSEDOptions::default();
let similarity =
calculate_tsed_from_code(code1, code2, "test1.ts", "test2.ts", &options).unwrap();
// Should have lower similarity due to structural differences
assert!(similarity < 0.7);
}
}