Skip to main content

memscope_rs/analysis/relation_inference/
clone_detector.rs

1//! Clone detection via grouped comparison with sliding time window.
2//!
3//! Detects when one allocation is a clone of another by grouping on
4//! (type, size, stack_hash) and comparing content similarity within
5//! a time-bounded sliding window.
6//!
7//! # False Positive Mitigation
8//!
9//! When `call_stack_hash` is None (common in synthetic tests), the system
10//! applies stricter similarity thresholds to reduce false positives.
11
12use std::collections::HashMap;
13
14use crate::analysis::relation_inference::{InferenceRecord, Relation, RelationEdge};
15use crate::analysis::unsafe_inference::TypeKind;
16
17/// Configuration for clone detection.
18#[derive(Debug, Clone)]
19pub struct CloneConfig {
20    /// Maximum time difference (nanoseconds) between two allocations
21    /// to be considered potential clones.
22    ///
23    /// Default: 1ms (1_000_000 ns) - reduced from 10ms to minimize
24    /// false positives in hot paths. Real clones typically happen
25    /// within microseconds, not milliseconds.
26    pub max_time_diff_ns: u64,
27    /// Number of leading bytes to compare for content similarity.
28    pub compare_bytes: usize,
29    /// Minimum similarity ratio (0.0 - 1.0) to consider a clone.
30    /// Default: 0.8 (80%).
31    pub min_similarity: f64,
32    /// Minimum similarity when call_stack_hash is None.
33    /// Higher threshold reduces false positives in synthetic data.
34    /// Default: 0.95 (95%).
35    pub min_similarity_no_stack_hash: f64,
36    /// Maximum clone edges per node to prevent explosion.
37    /// Default: 10.
38    pub max_clone_edges_per_node: usize,
39    /// Enable smart pointer type detection (Arc/Rc).
40    /// Default: true.
41    pub detect_smart_pointers: bool,
42    /// Similarity threshold for Arc clones (0.0 - 1.0).
43    /// Default: 0.7 (70%) - Arc clones may have slight differences.
44    pub arc_threshold: f64,
45    /// Similarity threshold for Rc clones (0.0 - 1.0).
46    /// Default: 0.85 (85%) - Rc clones are more consistent.
47    pub rc_threshold: f64,
48}
49
50impl Default for CloneConfig {
51    fn default() -> Self {
52        Self {
53            max_time_diff_ns: 1_000_000, // 1ms - reduced for hot path accuracy
54            compare_bytes: 64,
55            min_similarity: 0.8,
56            min_similarity_no_stack_hash: 0.95,
57            max_clone_edges_per_node: 10,
58            detect_smart_pointers: true,
59            arc_threshold: 0.7,
60            rc_threshold: 0.85,
61        }
62    }
63}
64
65/// Detect Clone relationships among all inference records.
66///
67/// Groups allocations by `(TypeKind, size, call_stack_hash)` and then
68/// uses a sliding time window to compare content similarity within each group.
69/// This avoids O(n^2) worst-case by only comparing temporally close allocations.
70///
71/// # False Positive Mitigation
72///
73/// When `call_stack_hash` is None, applies stricter similarity threshold
74/// (`min_similarity_no_stack_hash`) to reduce false positives.
75///
76/// # Arguments
77///
78/// * `records` - All inference records.
79/// * `config` - Detection configuration.
80pub fn detect_clones(records: &[InferenceRecord], config: &CloneConfig) -> Vec<RelationEdge> {
81    let mut groups: HashMap<(TypeKind, usize, u64), Vec<usize>> = HashMap::new();
82    for (i, record) in records.iter().enumerate() {
83        let stack_hash = record.call_stack_hash.unwrap_or(0);
84        let key = (record.type_kind, record.size, stack_hash);
85        groups.entry(key).or_default().push(i);
86    }
87
88    let mut relations = Vec::new();
89    let mut edge_count_per_node: HashMap<usize, usize> = HashMap::new();
90
91    for group_indices in groups.values() {
92        if group_indices.len() < 2 {
93            continue;
94        }
95
96        let mut group: Vec<&InferenceRecord> = group_indices.iter().map(|&i| &records[i]).collect();
97        group.sort_by_key(|r| r.alloc_time);
98
99        let has_stack_hash = group.iter().any(|r| r.call_stack_hash.is_some());
100        let min_similarity = if has_stack_hash {
101            config.min_similarity
102        } else {
103            config.min_similarity_no_stack_hash
104        };
105
106        let mut left = 0;
107        for right in 1..group.len() {
108            while left < right
109                && group[right]
110                    .alloc_time
111                    .saturating_sub(group[left].alloc_time)
112                    > config.max_time_diff_ns
113            {
114                left += 1;
115            }
116
117            for i in left..right {
118                let a = group[i];
119                let b = group[right];
120
121                let a_count = edge_count_per_node.get(&a.id).copied().unwrap_or(0);
122                let b_count = edge_count_per_node.get(&b.id).copied().unwrap_or(0);
123
124                if a_count >= config.max_clone_edges_per_node
125                    || b_count >= config.max_clone_edges_per_node
126                {
127                    continue;
128                }
129
130                // Use smart pointer type detection if enabled
131                let threshold = if config.detect_smart_pointers {
132                    if is_arc_like(a, b) {
133                        config.arc_threshold
134                    } else if is_rc_like(a, b) {
135                        config.rc_threshold
136                    } else {
137                        min_similarity
138                    }
139                } else {
140                    min_similarity
141                };
142
143                if content_similarity(a, b, config.compare_bytes) >= threshold {
144                    relations.push(RelationEdge {
145                        from: a.id,
146                        to: b.id,
147                        relation: Relation::Clone,
148                    });
149                    *edge_count_per_node.entry(a.id).or_insert(0) += 1;
150                    *edge_count_per_node.entry(b.id).or_insert(0) += 1;
151                }
152            }
153        }
154    }
155
156    relations
157}
158
159/// Compute content similarity between two records.
160///
161/// Compares the first `max_bytes` of each record's memory content
162/// and returns the ratio of matching bytes.
163fn content_similarity(a: &InferenceRecord, b: &InferenceRecord, max_bytes: usize) -> f64 {
164    let memory_a = match &a.memory {
165        Some(m) => m,
166        None => return 0.0,
167    };
168    let memory_b = match &b.memory {
169        Some(m) => m,
170        None => return 0.0,
171    };
172
173    let len = max_bytes.min(memory_a.len()).min(memory_b.len());
174    if len == 0 {
175        return 0.0;
176    }
177
178    let mut matching = 0usize;
179    for i in 0..len {
180        let byte_a: u8 = memory_a.read_u8(i).unwrap_or(0);
181        let byte_b: u8 = memory_b.read_u8(i).unwrap_or(0);
182        if byte_a == byte_b {
183            matching += 1;
184        }
185    }
186
187    matching as f64 / len as f64
188}
189
190/// Check if records look like Arc clones (thread-safe reference counting).
191fn is_arc_like(a: &InferenceRecord, b: &InferenceRecord) -> bool {
192    // Arc has a specific memory layout with strong/weak counts
193    // Check if both records have memory and look like Arc control blocks
194    let has_arc_pattern = |record: &InferenceRecord| -> bool {
195        let memory = match &record.memory {
196            Some(m) => m,
197            None => return false,
198        };
199        if memory.len() < 16 {
200            return false;
201        }
202        let strong = memory.read_usize(0).unwrap_or(usize::MAX);
203        let weak = memory.read_usize(8).unwrap_or(usize::MAX);
204        // Arc typically has strong >= 1 and weak >= 0
205        (1..=10000).contains(&strong) && weak <= 1000
206    };
207
208    has_arc_pattern(a) && has_arc_pattern(b)
209}
210
211/// Check if records look like Rc clones (single-threaded reference counting).
212fn is_rc_like(a: &InferenceRecord, b: &InferenceRecord) -> bool {
213    // Rc has the same memory layout as Arc but is single-threaded
214    // Check if both records have memory and look like Rc control blocks
215    let has_rc_pattern = |record: &InferenceRecord| -> bool {
216        let memory = match &record.memory {
217            Some(m) => m,
218            None => return false,
219        };
220        if memory.len() < 16 {
221            return false;
222        }
223        let strong = memory.read_usize(0).unwrap_or(usize::MAX);
224        let weak = memory.read_usize(8).unwrap_or(usize::MAX);
225        // Rc typically has strong >= 1 and weak >= 0
226        (1..=10000).contains(&strong) && weak <= 1000
227    };
228
229    has_rc_pattern(a) && has_rc_pattern(b)
230}
231
232#[cfg(test)]
233mod tests {
234    use super::*;
235    use crate::analysis::unsafe_inference::{OwnedMemoryView, TypeKind};
236
237    fn make_record(
238        id: usize,
239        ptr: usize,
240        size: usize,
241        memory: Vec<u8>,
242        stack_hash: Option<u64>,
243        alloc_time: u64,
244        type_kind: TypeKind,
245    ) -> InferenceRecord {
246        InferenceRecord {
247            id,
248            ptr,
249            size,
250            memory: Some(OwnedMemoryView::new(memory)),
251            type_kind,
252            confidence: 100,
253            call_stack_hash: stack_hash,
254            alloc_time,
255            stack_ptr: None,
256        }
257    }
258
259    #[test]
260    fn test_clone_detection_identical_content() {
261        let content = vec![0xAAu8; 64];
262        let records = vec![
263            make_record(
264                0,
265                0x1000,
266                24,
267                content.clone(),
268                Some(123),
269                1000,
270                TypeKind::Vec,
271            ),
272            make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::Vec),
273        ];
274
275        let edges = detect_clones(&records, &CloneConfig::default());
276        assert_eq!(edges.len(), 1);
277        assert_eq!(edges[0].from, 0);
278        assert_eq!(edges[0].to, 1);
279        assert_eq!(edges[0].relation, Relation::Clone);
280    }
281
282    #[test]
283    fn test_clone_detection_different_content() {
284        let content_a = vec![0xAAu8; 64];
285        let content_b = vec![0xBBu8; 64];
286        let records = vec![
287            make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
288            make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
289        ];
290
291        let edges = detect_clones(&records, &CloneConfig::default());
292        assert!(edges.is_empty());
293    }
294
295    #[test]
296    fn test_clone_detection_time_window_exceeded() {
297        let content = vec![0xAAu8; 64];
298        let records = vec![
299            make_record(
300                0,
301                0x1000,
302                24,
303                content.clone(),
304                Some(123),
305                1000,
306                TypeKind::Vec,
307            ),
308            make_record(1, 0x2000, 24, content, Some(123), 20_000_000, TypeKind::Vec),
309        ];
310
311        let edges = detect_clones(&records, &CloneConfig::default());
312        assert!(edges.is_empty());
313    }
314
315    #[test]
316    fn test_clone_detection_different_types() {
317        let content = vec![0xAAu8; 64];
318        let records = vec![
319            make_record(
320                0,
321                0x1000,
322                24,
323                content.clone(),
324                Some(123),
325                1000,
326                TypeKind::Vec,
327            ),
328            make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::String),
329        ];
330
331        let edges = detect_clones(&records, &CloneConfig::default());
332        assert!(edges.is_empty());
333    }
334
335    #[test]
336    fn test_clone_detection_different_stack_hashes() {
337        let content = vec![0xAAu8; 64];
338        let records = vec![
339            make_record(
340                0,
341                0x1000,
342                24,
343                content.clone(),
344                Some(111),
345                1000,
346                TypeKind::Vec,
347            ),
348            make_record(1, 0x2000, 24, content, Some(222), 5000, TypeKind::Vec),
349        ];
350
351        let edges = detect_clones(&records, &CloneConfig::default());
352        assert!(edges.is_empty());
353    }
354
355    #[test]
356    fn test_clone_detection_no_memory() {
357        let records = vec![
358            InferenceRecord {
359                id: 0,
360                ptr: 0x1000,
361                size: 24,
362                memory: None,
363                type_kind: TypeKind::Vec,
364                confidence: 80,
365                call_stack_hash: Some(123),
366                alloc_time: 1000,
367                stack_ptr: None,
368            },
369            InferenceRecord {
370                id: 1,
371                ptr: 0x2000,
372                size: 24,
373                memory: None,
374                type_kind: TypeKind::Vec,
375                confidence: 80,
376                call_stack_hash: Some(123),
377                alloc_time: 5000,
378                stack_ptr: None,
379            },
380        ];
381
382        let edges = detect_clones(&records, &CloneConfig::default());
383        assert!(edges.is_empty());
384    }
385
386    #[test]
387    fn test_clone_detection_partial_similarity() {
388        let content_a = vec![0xAAu8; 64];
389        let mut content_b = vec![0xAAu8; 64];
390        for item in content_b.iter_mut().skip(50).take(14) {
391            *item = 0xBB;
392        }
393
394        let records = vec![
395            make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
396            make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
397        ];
398
399        let edges = detect_clones(&records, &CloneConfig::default());
400        assert!(edges.is_empty());
401    }
402
403    #[test]
404    fn test_clone_detection_sliding_window_efficiency() {
405        // Create 5 records with identical content and timestamps within 1ms window.
406        // All records have time diff < 1ms from each other, so the sliding window
407        // should compare all pairs and detect C(5,2) = 10 clone edges.
408        let content = vec![0xCCu8; 64];
409        let records: Vec<InferenceRecord> = (0..5)
410            .map(|i| {
411                make_record(
412                    i,
413                    0x1000 + i * 0x1000,
414                    24,
415                    content.clone(),
416                    Some(999),
417                    1000_u64 + (i as u64) * 100_000, // 0.1ms apart, total span 0.4ms < 1ms
418                    TypeKind::Vec,
419                )
420            })
421            .collect();
422
423        let edges = detect_clones(&records, &CloneConfig::default());
424        assert_eq!(edges.len(), 10); // C(5,2) = 10
425    }
426
427    #[test]
428    fn test_clone_config_defaults() {
429        let config = CloneConfig::default();
430        assert_eq!(config.max_time_diff_ns, 1_000_000); // 1ms
431        assert_eq!(config.compare_bytes, 64);
432        assert!((config.min_similarity - 0.8).abs() < f64::EPSILON);
433        assert!((config.min_similarity_no_stack_hash - 0.95).abs() < f64::EPSILON);
434        assert_eq!(config.max_clone_edges_per_node, 10);
435    }
436
437    #[test]
438    fn test_clone_detection_different_sizes() {
439        let content = vec![0xAAu8; 64];
440        let records = vec![
441            make_record(
442                0,
443                0x1000,
444                24,
445                content.clone(),
446                Some(123),
447                1000,
448                TypeKind::Vec,
449            ),
450            make_record(1, 0x2000, 48, content, Some(123), 5000, TypeKind::Vec),
451        ];
452
453        let edges = detect_clones(&records, &CloneConfig::default());
454        assert!(edges.is_empty());
455    }
456
457    #[test]
458    fn test_clone_detection_no_stack_hash_stricter_threshold() {
459        let content = vec![0xAAu8; 64];
460        let records = vec![
461            make_record(0, 0x1000, 24, content.clone(), None, 1000, TypeKind::Vec),
462            make_record(1, 0x2000, 24, content, None, 5000, TypeKind::Vec),
463        ];
464
465        let config = CloneConfig::default();
466        let edges = detect_clones(&records, &config);
467
468        assert_eq!(edges.len(), 1, "Identical content should still be detected");
469    }
470
471    #[test]
472    fn test_clone_detection_no_stack_hash_partial_similarity_rejected() {
473        let content_a = vec![0xAAu8; 64];
474        let mut content_b = vec![0xAAu8; 64];
475        for item in content_b.iter_mut().skip(60).take(4) {
476            *item = 0xBB;
477        }
478
479        let records = vec![
480            make_record(0, 0x1000, 24, content_a, None, 1000, TypeKind::Vec),
481            make_record(1, 0x2000, 24, content_b, None, 5000, TypeKind::Vec),
482        ];
483
484        let config = CloneConfig::default();
485        let edges = detect_clones(&records, &config);
486
487        assert!(
488            edges.is_empty(),
489            "Partial similarity should be rejected with no stack hash"
490        );
491    }
492
493    #[test]
494    fn test_clone_detection_max_edges_per_node() {
495        let content = vec![0xCCu8; 64];
496        let records: Vec<InferenceRecord> = (0..20)
497            .map(|i| {
498                make_record(
499                    i,
500                    0x1000 + i * 0x1000,
501                    24,
502                    content.clone(),
503                    Some(999),
504                    1000,
505                    TypeKind::Vec,
506                )
507            })
508            .collect();
509
510        let config = CloneConfig {
511            max_clone_edges_per_node: 3,
512            ..Default::default()
513        };
514        let edges = detect_clones(&records, &config);
515
516        let mut edge_count: HashMap<usize, usize> = HashMap::new();
517        for edge in &edges {
518            *edge_count.entry(edge.from).or_insert(0) += 1;
519            *edge_count.entry(edge.to).or_insert(0) += 1;
520        }
521
522        for (&_node, &count) in &edge_count {
523            assert!(
524                count <= config.max_clone_edges_per_node,
525                "Node has {} edges, max is {}",
526                count,
527                config.max_clone_edges_per_node
528            );
529        }
530    }
531}