1use std::collections::HashMap;
13
14use crate::analysis::relation_inference::{InferenceRecord, Relation, RelationEdge};
15use crate::analysis::unsafe_inference::TypeKind;
16
17#[derive(Debug, Clone)]
19pub struct CloneConfig {
20 pub max_time_diff_ns: u64,
27 pub compare_bytes: usize,
29 pub min_similarity: f64,
32 pub min_similarity_no_stack_hash: f64,
36 pub max_clone_edges_per_node: usize,
39}
40
41impl Default for CloneConfig {
42 fn default() -> Self {
43 Self {
44 max_time_diff_ns: 1_000_000, compare_bytes: 64,
46 min_similarity: 0.8,
47 min_similarity_no_stack_hash: 0.95,
48 max_clone_edges_per_node: 10,
49 }
50 }
51}
52
53pub fn detect_clones(records: &[InferenceRecord], config: &CloneConfig) -> Vec<RelationEdge> {
69 let mut groups: HashMap<(TypeKind, usize, u64), Vec<usize>> = HashMap::new();
70 for (i, record) in records.iter().enumerate() {
71 let stack_hash = record.call_stack_hash.unwrap_or(0);
72 let key = (record.type_kind, record.size, stack_hash);
73 groups.entry(key).or_default().push(i);
74 }
75
76 let mut relations = Vec::new();
77 let mut edge_count_per_node: HashMap<usize, usize> = HashMap::new();
78
79 for group_indices in groups.values() {
80 if group_indices.len() < 2 {
81 continue;
82 }
83
84 let mut group: Vec<&InferenceRecord> = group_indices.iter().map(|&i| &records[i]).collect();
85 group.sort_by_key(|r| r.alloc_time);
86
87 let has_stack_hash = group.iter().any(|r| r.call_stack_hash.is_some());
88 let min_similarity = if has_stack_hash {
89 config.min_similarity
90 } else {
91 config.min_similarity_no_stack_hash
92 };
93
94 let mut left = 0;
95 for right in 1..group.len() {
96 while left < right
97 && group[right]
98 .alloc_time
99 .saturating_sub(group[left].alloc_time)
100 > config.max_time_diff_ns
101 {
102 left += 1;
103 }
104
105 for i in left..right {
106 let a = group[i];
107 let b = group[right];
108
109 let a_count = edge_count_per_node.get(&a.id).copied().unwrap_or(0);
110 let b_count = edge_count_per_node.get(&b.id).copied().unwrap_or(0);
111
112 if a_count >= config.max_clone_edges_per_node
113 || b_count >= config.max_clone_edges_per_node
114 {
115 continue;
116 }
117
118 if content_similarity(a, b, config.compare_bytes) >= min_similarity {
119 relations.push(RelationEdge {
120 from: a.id,
121 to: b.id,
122 relation: Relation::Clone,
123 });
124 *edge_count_per_node.entry(a.id).or_insert(0) += 1;
125 *edge_count_per_node.entry(b.id).or_insert(0) += 1;
126 }
127 }
128 }
129 }
130
131 relations
132}
133
134fn content_similarity(a: &InferenceRecord, b: &InferenceRecord, max_bytes: usize) -> f64 {
139 let memory_a = match &a.memory {
140 Some(m) => m,
141 None => return 0.0,
142 };
143 let memory_b = match &b.memory {
144 Some(m) => m,
145 None => return 0.0,
146 };
147
148 let len = max_bytes.min(memory_a.len()).min(memory_b.len());
149 if len == 0 {
150 return 0.0;
151 }
152
153 let mut matching = 0usize;
154 for i in 0..len {
155 let byte_a: u8 = memory_a.read_u8(i).unwrap_or(0);
156 let byte_b: u8 = memory_b.read_u8(i).unwrap_or(0);
157 if byte_a == byte_b {
158 matching += 1;
159 }
160 }
161
162 matching as f64 / len as f64
163}
164
165#[cfg(test)]
166mod tests {
167 use super::*;
168 use crate::analysis::unsafe_inference::{OwnedMemoryView, TypeKind};
169
170 fn make_record(
171 id: usize,
172 ptr: usize,
173 size: usize,
174 memory: Vec<u8>,
175 stack_hash: Option<u64>,
176 alloc_time: u64,
177 type_kind: TypeKind,
178 ) -> InferenceRecord {
179 InferenceRecord {
180 id,
181 ptr,
182 size,
183 memory: Some(OwnedMemoryView::new(memory)),
184 type_kind,
185 confidence: 80,
186 call_stack_hash: stack_hash,
187 alloc_time,
188 }
189 }
190
191 #[test]
192 fn test_clone_detection_identical_content() {
193 let content = vec![0xAAu8; 64];
194 let records = vec![
195 make_record(
196 0,
197 0x1000,
198 24,
199 content.clone(),
200 Some(123),
201 1000,
202 TypeKind::Vec,
203 ),
204 make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::Vec),
205 ];
206
207 let edges = detect_clones(&records, &CloneConfig::default());
208 assert_eq!(edges.len(), 1);
209 assert_eq!(edges[0].from, 0);
210 assert_eq!(edges[0].to, 1);
211 assert_eq!(edges[0].relation, Relation::Clone);
212 }
213
214 #[test]
215 fn test_clone_detection_different_content() {
216 let content_a = vec![0xAAu8; 64];
217 let content_b = vec![0xBBu8; 64];
218 let records = vec![
219 make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
220 make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
221 ];
222
223 let edges = detect_clones(&records, &CloneConfig::default());
224 assert!(edges.is_empty());
225 }
226
227 #[test]
228 fn test_clone_detection_time_window_exceeded() {
229 let content = vec![0xAAu8; 64];
230 let records = vec![
231 make_record(
232 0,
233 0x1000,
234 24,
235 content.clone(),
236 Some(123),
237 1000,
238 TypeKind::Vec,
239 ),
240 make_record(1, 0x2000, 24, content, Some(123), 20_000_000, TypeKind::Vec),
241 ];
242
243 let edges = detect_clones(&records, &CloneConfig::default());
244 assert!(edges.is_empty());
245 }
246
247 #[test]
248 fn test_clone_detection_different_types() {
249 let content = vec![0xAAu8; 64];
250 let records = vec![
251 make_record(
252 0,
253 0x1000,
254 24,
255 content.clone(),
256 Some(123),
257 1000,
258 TypeKind::Vec,
259 ),
260 make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::String),
261 ];
262
263 let edges = detect_clones(&records, &CloneConfig::default());
264 assert!(edges.is_empty());
265 }
266
267 #[test]
268 fn test_clone_detection_different_stack_hashes() {
269 let content = vec![0xAAu8; 64];
270 let records = vec![
271 make_record(
272 0,
273 0x1000,
274 24,
275 content.clone(),
276 Some(111),
277 1000,
278 TypeKind::Vec,
279 ),
280 make_record(1, 0x2000, 24, content, Some(222), 5000, TypeKind::Vec),
281 ];
282
283 let edges = detect_clones(&records, &CloneConfig::default());
284 assert!(edges.is_empty());
285 }
286
287 #[test]
288 fn test_clone_detection_no_memory() {
289 let records = vec![
290 InferenceRecord {
291 id: 0,
292 ptr: 0x1000,
293 size: 24,
294 memory: None,
295 type_kind: TypeKind::Vec,
296 confidence: 80,
297 call_stack_hash: Some(123),
298 alloc_time: 1000,
299 },
300 InferenceRecord {
301 id: 1,
302 ptr: 0x2000,
303 size: 24,
304 memory: None,
305 type_kind: TypeKind::Vec,
306 confidence: 80,
307 call_stack_hash: Some(123),
308 alloc_time: 5000,
309 },
310 ];
311
312 let edges = detect_clones(&records, &CloneConfig::default());
313 assert!(edges.is_empty());
314 }
315
316 #[test]
317 fn test_clone_detection_partial_similarity() {
318 let content_a = vec![0xAAu8; 64];
319 let mut content_b = vec![0xAAu8; 64];
320 for item in content_b.iter_mut().skip(50).take(14) {
321 *item = 0xBB;
322 }
323
324 let records = vec![
325 make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
326 make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
327 ];
328
329 let edges = detect_clones(&records, &CloneConfig::default());
330 assert!(edges.is_empty());
331 }
332
333 #[test]
334 fn test_clone_detection_sliding_window_efficiency() {
335 let content = vec![0xCCu8; 64];
339 let records: Vec<InferenceRecord> = (0..5)
340 .map(|i| {
341 make_record(
342 i,
343 0x1000 + i * 0x1000,
344 24,
345 content.clone(),
346 Some(999),
347 1000_u64 + (i as u64) * 100_000, TypeKind::Vec,
349 )
350 })
351 .collect();
352
353 let edges = detect_clones(&records, &CloneConfig::default());
354 assert_eq!(edges.len(), 10); }
356
357 #[test]
358 fn test_clone_config_defaults() {
359 let config = CloneConfig::default();
360 assert_eq!(config.max_time_diff_ns, 1_000_000); assert_eq!(config.compare_bytes, 64);
362 assert!((config.min_similarity - 0.8).abs() < f64::EPSILON);
363 assert!((config.min_similarity_no_stack_hash - 0.95).abs() < f64::EPSILON);
364 assert_eq!(config.max_clone_edges_per_node, 10);
365 }
366
367 #[test]
368 fn test_clone_detection_different_sizes() {
369 let content = vec![0xAAu8; 64];
370 let records = vec![
371 make_record(
372 0,
373 0x1000,
374 24,
375 content.clone(),
376 Some(123),
377 1000,
378 TypeKind::Vec,
379 ),
380 make_record(1, 0x2000, 48, content, Some(123), 5000, TypeKind::Vec),
381 ];
382
383 let edges = detect_clones(&records, &CloneConfig::default());
384 assert!(edges.is_empty());
385 }
386
387 #[test]
388 fn test_clone_detection_no_stack_hash_stricter_threshold() {
389 let content = vec![0xAAu8; 64];
390 let records = vec![
391 make_record(0, 0x1000, 24, content.clone(), None, 1000, TypeKind::Vec),
392 make_record(1, 0x2000, 24, content, None, 5000, TypeKind::Vec),
393 ];
394
395 let config = CloneConfig::default();
396 let edges = detect_clones(&records, &config);
397
398 assert_eq!(edges.len(), 1, "Identical content should still be detected");
399 }
400
401 #[test]
402 fn test_clone_detection_no_stack_hash_partial_similarity_rejected() {
403 let content_a = vec![0xAAu8; 64];
404 let mut content_b = vec![0xAAu8; 64];
405 for item in content_b.iter_mut().skip(60).take(4) {
406 *item = 0xBB;
407 }
408
409 let records = vec![
410 make_record(0, 0x1000, 24, content_a, None, 1000, TypeKind::Vec),
411 make_record(1, 0x2000, 24, content_b, None, 5000, TypeKind::Vec),
412 ];
413
414 let config = CloneConfig::default();
415 let edges = detect_clones(&records, &config);
416
417 assert!(
418 edges.is_empty(),
419 "Partial similarity should be rejected with no stack hash"
420 );
421 }
422
423 #[test]
424 fn test_clone_detection_max_edges_per_node() {
425 let content = vec![0xCCu8; 64];
426 let records: Vec<InferenceRecord> = (0..20)
427 .map(|i| {
428 make_record(
429 i,
430 0x1000 + i * 0x1000,
431 24,
432 content.clone(),
433 Some(999),
434 1000,
435 TypeKind::Vec,
436 )
437 })
438 .collect();
439
440 let config = CloneConfig {
441 max_clone_edges_per_node: 3,
442 ..Default::default()
443 };
444 let edges = detect_clones(&records, &config);
445
446 let mut edge_count: HashMap<usize, usize> = HashMap::new();
447 for edge in &edges {
448 *edge_count.entry(edge.from).or_insert(0) += 1;
449 *edge_count.entry(edge.to).or_insert(0) += 1;
450 }
451
452 for (&_node, &count) in &edge_count {
453 assert!(
454 count <= config.max_clone_edges_per_node,
455 "Node has {} edges, max is {}",
456 count,
457 config.max_clone_edges_per_node
458 );
459 }
460 }
461}