1use std::collections::HashMap;
13
14use crate::analysis::relation_inference::{InferenceRecord, Relation, RelationEdge};
15use crate::analysis::unsafe_inference::TypeKind;
16
17#[derive(Debug, Clone)]
19pub struct CloneConfig {
20 pub max_time_diff_ns: u64,
27 pub compare_bytes: usize,
29 pub min_similarity: f64,
32 pub min_similarity_no_stack_hash: f64,
36 pub max_clone_edges_per_node: usize,
39 pub detect_smart_pointers: bool,
42 pub arc_threshold: f64,
45 pub rc_threshold: f64,
48}
49
50impl Default for CloneConfig {
51 fn default() -> Self {
52 Self {
53 max_time_diff_ns: 1_000_000, compare_bytes: 64,
55 min_similarity: 0.8,
56 min_similarity_no_stack_hash: 0.95,
57 max_clone_edges_per_node: 10,
58 detect_smart_pointers: true,
59 arc_threshold: 0.7,
60 rc_threshold: 0.85,
61 }
62 }
63}
64
65pub fn detect_clones(records: &[InferenceRecord], config: &CloneConfig) -> Vec<RelationEdge> {
81 let mut groups: HashMap<(TypeKind, usize, u64), Vec<usize>> = HashMap::new();
82 for (i, record) in records.iter().enumerate() {
83 let stack_hash = record.call_stack_hash.unwrap_or(0);
84 let key = (record.type_kind, record.size, stack_hash);
85 groups.entry(key).or_default().push(i);
86 }
87
88 let mut relations = Vec::new();
89 let mut edge_count_per_node: HashMap<usize, usize> = HashMap::new();
90
91 for group_indices in groups.values() {
92 if group_indices.len() < 2 {
93 continue;
94 }
95
96 let mut group: Vec<&InferenceRecord> = group_indices.iter().map(|&i| &records[i]).collect();
97 group.sort_by_key(|r| r.alloc_time);
98
99 let has_stack_hash = group.iter().any(|r| r.call_stack_hash.is_some());
100 let min_similarity = if has_stack_hash {
101 config.min_similarity
102 } else {
103 config.min_similarity_no_stack_hash
104 };
105
106 let mut left = 0;
107 for right in 1..group.len() {
108 while left < right
109 && group[right]
110 .alloc_time
111 .saturating_sub(group[left].alloc_time)
112 > config.max_time_diff_ns
113 {
114 left += 1;
115 }
116
117 for i in left..right {
118 let a = group[i];
119 let b = group[right];
120
121 let a_count = edge_count_per_node.get(&a.id).copied().unwrap_or(0);
122 let b_count = edge_count_per_node.get(&b.id).copied().unwrap_or(0);
123
124 if a_count >= config.max_clone_edges_per_node
125 || b_count >= config.max_clone_edges_per_node
126 {
127 continue;
128 }
129
130 let threshold = if config.detect_smart_pointers {
132 if is_arc_like(a, b) {
133 config.arc_threshold
134 } else if is_rc_like(a, b) {
135 config.rc_threshold
136 } else {
137 min_similarity
138 }
139 } else {
140 min_similarity
141 };
142
143 if content_similarity(a, b, config.compare_bytes) >= threshold {
144 relations.push(RelationEdge {
145 from: a.id,
146 to: b.id,
147 relation: Relation::Clone,
148 });
149 *edge_count_per_node.entry(a.id).or_insert(0) += 1;
150 *edge_count_per_node.entry(b.id).or_insert(0) += 1;
151 }
152 }
153 }
154 }
155
156 relations
157}
158
159fn content_similarity(a: &InferenceRecord, b: &InferenceRecord, max_bytes: usize) -> f64 {
164 let memory_a = match &a.memory {
165 Some(m) => m,
166 None => return 0.0,
167 };
168 let memory_b = match &b.memory {
169 Some(m) => m,
170 None => return 0.0,
171 };
172
173 let len = max_bytes.min(memory_a.len()).min(memory_b.len());
174 if len == 0 {
175 return 0.0;
176 }
177
178 let mut matching = 0usize;
179 for i in 0..len {
180 let byte_a: u8 = memory_a.read_u8(i).unwrap_or(0);
181 let byte_b: u8 = memory_b.read_u8(i).unwrap_or(0);
182 if byte_a == byte_b {
183 matching += 1;
184 }
185 }
186
187 matching as f64 / len as f64
188}
189
190fn is_arc_like(a: &InferenceRecord, b: &InferenceRecord) -> bool {
192 let has_arc_pattern = |record: &InferenceRecord| -> bool {
195 let memory = match &record.memory {
196 Some(m) => m,
197 None => return false,
198 };
199 if memory.len() < 16 {
200 return false;
201 }
202 let strong = memory.read_usize(0).unwrap_or(usize::MAX);
203 let weak = memory.read_usize(8).unwrap_or(usize::MAX);
204 (1..=10000).contains(&strong) && weak <= 1000
206 };
207
208 has_arc_pattern(a) && has_arc_pattern(b)
209}
210
211fn is_rc_like(a: &InferenceRecord, b: &InferenceRecord) -> bool {
213 let has_rc_pattern = |record: &InferenceRecord| -> bool {
216 let memory = match &record.memory {
217 Some(m) => m,
218 None => return false,
219 };
220 if memory.len() < 16 {
221 return false;
222 }
223 let strong = memory.read_usize(0).unwrap_or(usize::MAX);
224 let weak = memory.read_usize(8).unwrap_or(usize::MAX);
225 (1..=10000).contains(&strong) && weak <= 1000
227 };
228
229 has_rc_pattern(a) && has_rc_pattern(b)
230}
231
232#[cfg(test)]
233mod tests {
234 use super::*;
235 use crate::analysis::unsafe_inference::{OwnedMemoryView, TypeKind};
236
237 fn make_record(
238 id: usize,
239 ptr: usize,
240 size: usize,
241 memory: Vec<u8>,
242 stack_hash: Option<u64>,
243 alloc_time: u64,
244 type_kind: TypeKind,
245 ) -> InferenceRecord {
246 InferenceRecord {
247 id,
248 ptr,
249 size,
250 memory: Some(OwnedMemoryView::new(memory)),
251 type_kind,
252 confidence: 100,
253 call_stack_hash: stack_hash,
254 alloc_time,
255 stack_ptr: None,
256 }
257 }
258
259 #[test]
260 fn test_clone_detection_identical_content() {
261 let content = vec![0xAAu8; 64];
262 let records = vec![
263 make_record(
264 0,
265 0x1000,
266 24,
267 content.clone(),
268 Some(123),
269 1000,
270 TypeKind::Vec,
271 ),
272 make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::Vec),
273 ];
274
275 let edges = detect_clones(&records, &CloneConfig::default());
276 assert_eq!(edges.len(), 1);
277 assert_eq!(edges[0].from, 0);
278 assert_eq!(edges[0].to, 1);
279 assert_eq!(edges[0].relation, Relation::Clone);
280 }
281
282 #[test]
283 fn test_clone_detection_different_content() {
284 let content_a = vec![0xAAu8; 64];
285 let content_b = vec![0xBBu8; 64];
286 let records = vec![
287 make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
288 make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
289 ];
290
291 let edges = detect_clones(&records, &CloneConfig::default());
292 assert!(edges.is_empty());
293 }
294
295 #[test]
296 fn test_clone_detection_time_window_exceeded() {
297 let content = vec![0xAAu8; 64];
298 let records = vec![
299 make_record(
300 0,
301 0x1000,
302 24,
303 content.clone(),
304 Some(123),
305 1000,
306 TypeKind::Vec,
307 ),
308 make_record(1, 0x2000, 24, content, Some(123), 20_000_000, TypeKind::Vec),
309 ];
310
311 let edges = detect_clones(&records, &CloneConfig::default());
312 assert!(edges.is_empty());
313 }
314
315 #[test]
316 fn test_clone_detection_different_types() {
317 let content = vec![0xAAu8; 64];
318 let records = vec![
319 make_record(
320 0,
321 0x1000,
322 24,
323 content.clone(),
324 Some(123),
325 1000,
326 TypeKind::Vec,
327 ),
328 make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::String),
329 ];
330
331 let edges = detect_clones(&records, &CloneConfig::default());
332 assert!(edges.is_empty());
333 }
334
335 #[test]
336 fn test_clone_detection_different_stack_hashes() {
337 let content = vec![0xAAu8; 64];
338 let records = vec![
339 make_record(
340 0,
341 0x1000,
342 24,
343 content.clone(),
344 Some(111),
345 1000,
346 TypeKind::Vec,
347 ),
348 make_record(1, 0x2000, 24, content, Some(222), 5000, TypeKind::Vec),
349 ];
350
351 let edges = detect_clones(&records, &CloneConfig::default());
352 assert!(edges.is_empty());
353 }
354
355 #[test]
356 fn test_clone_detection_no_memory() {
357 let records = vec![
358 InferenceRecord {
359 id: 0,
360 ptr: 0x1000,
361 size: 24,
362 memory: None,
363 type_kind: TypeKind::Vec,
364 confidence: 80,
365 call_stack_hash: Some(123),
366 alloc_time: 1000,
367 stack_ptr: None,
368 },
369 InferenceRecord {
370 id: 1,
371 ptr: 0x2000,
372 size: 24,
373 memory: None,
374 type_kind: TypeKind::Vec,
375 confidence: 80,
376 call_stack_hash: Some(123),
377 alloc_time: 5000,
378 stack_ptr: None,
379 },
380 ];
381
382 let edges = detect_clones(&records, &CloneConfig::default());
383 assert!(edges.is_empty());
384 }
385
386 #[test]
387 fn test_clone_detection_partial_similarity() {
388 let content_a = vec![0xAAu8; 64];
389 let mut content_b = vec![0xAAu8; 64];
390 for item in content_b.iter_mut().skip(50).take(14) {
391 *item = 0xBB;
392 }
393
394 let records = vec![
395 make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
396 make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
397 ];
398
399 let edges = detect_clones(&records, &CloneConfig::default());
400 assert!(edges.is_empty());
401 }
402
403 #[test]
404 fn test_clone_detection_sliding_window_efficiency() {
405 let content = vec![0xCCu8; 64];
409 let records: Vec<InferenceRecord> = (0..5)
410 .map(|i| {
411 make_record(
412 i,
413 0x1000 + i * 0x1000,
414 24,
415 content.clone(),
416 Some(999),
417 1000_u64 + (i as u64) * 100_000, TypeKind::Vec,
419 )
420 })
421 .collect();
422
423 let edges = detect_clones(&records, &CloneConfig::default());
424 assert_eq!(edges.len(), 10); }
426
427 #[test]
428 fn test_clone_config_defaults() {
429 let config = CloneConfig::default();
430 assert_eq!(config.max_time_diff_ns, 1_000_000); assert_eq!(config.compare_bytes, 64);
432 assert!((config.min_similarity - 0.8).abs() < f64::EPSILON);
433 assert!((config.min_similarity_no_stack_hash - 0.95).abs() < f64::EPSILON);
434 assert_eq!(config.max_clone_edges_per_node, 10);
435 }
436
437 #[test]
438 fn test_clone_detection_different_sizes() {
439 let content = vec![0xAAu8; 64];
440 let records = vec![
441 make_record(
442 0,
443 0x1000,
444 24,
445 content.clone(),
446 Some(123),
447 1000,
448 TypeKind::Vec,
449 ),
450 make_record(1, 0x2000, 48, content, Some(123), 5000, TypeKind::Vec),
451 ];
452
453 let edges = detect_clones(&records, &CloneConfig::default());
454 assert!(edges.is_empty());
455 }
456
457 #[test]
458 fn test_clone_detection_no_stack_hash_stricter_threshold() {
459 let content = vec![0xAAu8; 64];
460 let records = vec![
461 make_record(0, 0x1000, 24, content.clone(), None, 1000, TypeKind::Vec),
462 make_record(1, 0x2000, 24, content, None, 5000, TypeKind::Vec),
463 ];
464
465 let config = CloneConfig::default();
466 let edges = detect_clones(&records, &config);
467
468 assert_eq!(edges.len(), 1, "Identical content should still be detected");
469 }
470
471 #[test]
472 fn test_clone_detection_no_stack_hash_partial_similarity_rejected() {
473 let content_a = vec![0xAAu8; 64];
474 let mut content_b = vec![0xAAu8; 64];
475 for item in content_b.iter_mut().skip(60).take(4) {
476 *item = 0xBB;
477 }
478
479 let records = vec![
480 make_record(0, 0x1000, 24, content_a, None, 1000, TypeKind::Vec),
481 make_record(1, 0x2000, 24, content_b, None, 5000, TypeKind::Vec),
482 ];
483
484 let config = CloneConfig::default();
485 let edges = detect_clones(&records, &config);
486
487 assert!(
488 edges.is_empty(),
489 "Partial similarity should be rejected with no stack hash"
490 );
491 }
492
493 #[test]
494 fn test_clone_detection_max_edges_per_node() {
495 let content = vec![0xCCu8; 64];
496 let records: Vec<InferenceRecord> = (0..20)
497 .map(|i| {
498 make_record(
499 i,
500 0x1000 + i * 0x1000,
501 24,
502 content.clone(),
503 Some(999),
504 1000,
505 TypeKind::Vec,
506 )
507 })
508 .collect();
509
510 let config = CloneConfig {
511 max_clone_edges_per_node: 3,
512 ..Default::default()
513 };
514 let edges = detect_clones(&records, &config);
515
516 let mut edge_count: HashMap<usize, usize> = HashMap::new();
517 for edge in &edges {
518 *edge_count.entry(edge.from).or_insert(0) += 1;
519 *edge_count.entry(edge.to).or_insert(0) += 1;
520 }
521
522 for (&_node, &count) in &edge_count {
523 assert!(
524 count <= config.max_clone_edges_per_node,
525 "Node has {} edges, max is {}",
526 count,
527 config.max_clone_edges_per_node
528 );
529 }
530 }
531}