Skip to main content

memscope_rs/analysis/relation_inference/
clone_detector.rs

1//! Clone detection via grouped comparison with sliding time window.
2//!
3//! Detects when one allocation is a clone of another by grouping on
4//! (type, size, stack_hash) and comparing content similarity within
5//! a time-bounded sliding window.
6//!
7//! # False Positive Mitigation
8//!
9//! When `call_stack_hash` is None (common in synthetic tests), the system
10//! applies stricter similarity thresholds to reduce false positives.
11
12use std::collections::HashMap;
13
14use crate::analysis::relation_inference::{InferenceRecord, Relation, RelationEdge};
15use crate::analysis::unsafe_inference::TypeKind;
16
17/// Configuration for clone detection.
18#[derive(Debug, Clone)]
19pub struct CloneConfig {
20    /// Maximum time difference (nanoseconds) between two allocations
21    /// to be considered potential clones.
22    ///
23    /// Default: 1ms (1_000_000 ns) - reduced from 10ms to minimize
24    /// false positives in hot paths. Real clones typically happen
25    /// within microseconds, not milliseconds.
26    pub max_time_diff_ns: u64,
27    /// Number of leading bytes to compare for content similarity.
28    pub compare_bytes: usize,
29    /// Minimum similarity ratio (0.0 - 1.0) to consider a clone.
30    /// Default: 0.8 (80%).
31    pub min_similarity: f64,
32    /// Minimum similarity when call_stack_hash is None.
33    /// Higher threshold reduces false positives in synthetic data.
34    /// Default: 0.95 (95%).
35    pub min_similarity_no_stack_hash: f64,
36    /// Maximum clone edges per node to prevent explosion.
37    /// Default: 10.
38    pub max_clone_edges_per_node: usize,
39}
40
41impl Default for CloneConfig {
42    fn default() -> Self {
43        Self {
44            max_time_diff_ns: 1_000_000, // 1ms - reduced for hot path accuracy
45            compare_bytes: 64,
46            min_similarity: 0.8,
47            min_similarity_no_stack_hash: 0.95,
48            max_clone_edges_per_node: 10,
49        }
50    }
51}
52
53/// Detect Clone relationships among all inference records.
54///
55/// Groups allocations by `(TypeKind, size, call_stack_hash)` and then
56/// uses a sliding time window to compare content similarity within each group.
57/// This avoids O(n^2) worst-case by only comparing temporally close allocations.
58///
59/// # False Positive Mitigation
60///
61/// When `call_stack_hash` is None, applies stricter similarity threshold
62/// (`min_similarity_no_stack_hash`) to reduce false positives.
63///
64/// # Arguments
65///
66/// * `records` - All inference records.
67/// * `config` - Detection configuration.
68pub fn detect_clones(records: &[InferenceRecord], config: &CloneConfig) -> Vec<RelationEdge> {
69    let mut groups: HashMap<(TypeKind, usize, u64), Vec<usize>> = HashMap::new();
70    for (i, record) in records.iter().enumerate() {
71        let stack_hash = record.call_stack_hash.unwrap_or(0);
72        let key = (record.type_kind, record.size, stack_hash);
73        groups.entry(key).or_default().push(i);
74    }
75
76    let mut relations = Vec::new();
77    let mut edge_count_per_node: HashMap<usize, usize> = HashMap::new();
78
79    for group_indices in groups.values() {
80        if group_indices.len() < 2 {
81            continue;
82        }
83
84        let mut group: Vec<&InferenceRecord> = group_indices.iter().map(|&i| &records[i]).collect();
85        group.sort_by_key(|r| r.alloc_time);
86
87        let has_stack_hash = group.iter().any(|r| r.call_stack_hash.is_some());
88        let min_similarity = if has_stack_hash {
89            config.min_similarity
90        } else {
91            config.min_similarity_no_stack_hash
92        };
93
94        let mut left = 0;
95        for right in 1..group.len() {
96            while left < right
97                && group[right]
98                    .alloc_time
99                    .saturating_sub(group[left].alloc_time)
100                    > config.max_time_diff_ns
101            {
102                left += 1;
103            }
104
105            for i in left..right {
106                let a = group[i];
107                let b = group[right];
108
109                let a_count = edge_count_per_node.get(&a.id).copied().unwrap_or(0);
110                let b_count = edge_count_per_node.get(&b.id).copied().unwrap_or(0);
111
112                if a_count >= config.max_clone_edges_per_node
113                    || b_count >= config.max_clone_edges_per_node
114                {
115                    continue;
116                }
117
118                if content_similarity(a, b, config.compare_bytes) >= min_similarity {
119                    relations.push(RelationEdge {
120                        from: a.id,
121                        to: b.id,
122                        relation: Relation::Clone,
123                    });
124                    *edge_count_per_node.entry(a.id).or_insert(0) += 1;
125                    *edge_count_per_node.entry(b.id).or_insert(0) += 1;
126                }
127            }
128        }
129    }
130
131    relations
132}
133
134/// Compute content similarity between two records.
135///
136/// Compares the first `max_bytes` of each record's memory content
137/// and returns the ratio of matching bytes.
138fn content_similarity(a: &InferenceRecord, b: &InferenceRecord, max_bytes: usize) -> f64 {
139    let memory_a = match &a.memory {
140        Some(m) => m,
141        None => return 0.0,
142    };
143    let memory_b = match &b.memory {
144        Some(m) => m,
145        None => return 0.0,
146    };
147
148    let len = max_bytes.min(memory_a.len()).min(memory_b.len());
149    if len == 0 {
150        return 0.0;
151    }
152
153    let mut matching = 0usize;
154    for i in 0..len {
155        let byte_a: u8 = memory_a.read_u8(i).unwrap_or(0);
156        let byte_b: u8 = memory_b.read_u8(i).unwrap_or(0);
157        if byte_a == byte_b {
158            matching += 1;
159        }
160    }
161
162    matching as f64 / len as f64
163}
164
165#[cfg(test)]
166mod tests {
167    use super::*;
168    use crate::analysis::unsafe_inference::{OwnedMemoryView, TypeKind};
169
170    fn make_record(
171        id: usize,
172        ptr: usize,
173        size: usize,
174        memory: Vec<u8>,
175        stack_hash: Option<u64>,
176        alloc_time: u64,
177        type_kind: TypeKind,
178    ) -> InferenceRecord {
179        InferenceRecord {
180            id,
181            ptr,
182            size,
183            memory: Some(OwnedMemoryView::new(memory)),
184            type_kind,
185            confidence: 80,
186            call_stack_hash: stack_hash,
187            alloc_time,
188        }
189    }
190
191    #[test]
192    fn test_clone_detection_identical_content() {
193        let content = vec![0xAAu8; 64];
194        let records = vec![
195            make_record(
196                0,
197                0x1000,
198                24,
199                content.clone(),
200                Some(123),
201                1000,
202                TypeKind::Vec,
203            ),
204            make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::Vec),
205        ];
206
207        let edges = detect_clones(&records, &CloneConfig::default());
208        assert_eq!(edges.len(), 1);
209        assert_eq!(edges[0].from, 0);
210        assert_eq!(edges[0].to, 1);
211        assert_eq!(edges[0].relation, Relation::Clone);
212    }
213
214    #[test]
215    fn test_clone_detection_different_content() {
216        let content_a = vec![0xAAu8; 64];
217        let content_b = vec![0xBBu8; 64];
218        let records = vec![
219            make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
220            make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
221        ];
222
223        let edges = detect_clones(&records, &CloneConfig::default());
224        assert!(edges.is_empty());
225    }
226
227    #[test]
228    fn test_clone_detection_time_window_exceeded() {
229        let content = vec![0xAAu8; 64];
230        let records = vec![
231            make_record(
232                0,
233                0x1000,
234                24,
235                content.clone(),
236                Some(123),
237                1000,
238                TypeKind::Vec,
239            ),
240            make_record(1, 0x2000, 24, content, Some(123), 20_000_000, TypeKind::Vec),
241        ];
242
243        let edges = detect_clones(&records, &CloneConfig::default());
244        assert!(edges.is_empty());
245    }
246
247    #[test]
248    fn test_clone_detection_different_types() {
249        let content = vec![0xAAu8; 64];
250        let records = vec![
251            make_record(
252                0,
253                0x1000,
254                24,
255                content.clone(),
256                Some(123),
257                1000,
258                TypeKind::Vec,
259            ),
260            make_record(1, 0x2000, 24, content, Some(123), 5000, TypeKind::String),
261        ];
262
263        let edges = detect_clones(&records, &CloneConfig::default());
264        assert!(edges.is_empty());
265    }
266
267    #[test]
268    fn test_clone_detection_different_stack_hashes() {
269        let content = vec![0xAAu8; 64];
270        let records = vec![
271            make_record(
272                0,
273                0x1000,
274                24,
275                content.clone(),
276                Some(111),
277                1000,
278                TypeKind::Vec,
279            ),
280            make_record(1, 0x2000, 24, content, Some(222), 5000, TypeKind::Vec),
281        ];
282
283        let edges = detect_clones(&records, &CloneConfig::default());
284        assert!(edges.is_empty());
285    }
286
287    #[test]
288    fn test_clone_detection_no_memory() {
289        let records = vec![
290            InferenceRecord {
291                id: 0,
292                ptr: 0x1000,
293                size: 24,
294                memory: None,
295                type_kind: TypeKind::Vec,
296                confidence: 80,
297                call_stack_hash: Some(123),
298                alloc_time: 1000,
299            },
300            InferenceRecord {
301                id: 1,
302                ptr: 0x2000,
303                size: 24,
304                memory: None,
305                type_kind: TypeKind::Vec,
306                confidence: 80,
307                call_stack_hash: Some(123),
308                alloc_time: 5000,
309            },
310        ];
311
312        let edges = detect_clones(&records, &CloneConfig::default());
313        assert!(edges.is_empty());
314    }
315
316    #[test]
317    fn test_clone_detection_partial_similarity() {
318        let content_a = vec![0xAAu8; 64];
319        let mut content_b = vec![0xAAu8; 64];
320        for item in content_b.iter_mut().skip(50).take(14) {
321            *item = 0xBB;
322        }
323
324        let records = vec![
325            make_record(0, 0x1000, 24, content_a, Some(123), 1000, TypeKind::Vec),
326            make_record(1, 0x2000, 24, content_b, Some(123), 5000, TypeKind::Vec),
327        ];
328
329        let edges = detect_clones(&records, &CloneConfig::default());
330        assert!(edges.is_empty());
331    }
332
333    #[test]
334    fn test_clone_detection_sliding_window_efficiency() {
335        // Create 5 records with identical content and timestamps within 1ms window.
336        // All records have time diff < 1ms from each other, so the sliding window
337        // should compare all pairs and detect C(5,2) = 10 clone edges.
338        let content = vec![0xCCu8; 64];
339        let records: Vec<InferenceRecord> = (0..5)
340            .map(|i| {
341                make_record(
342                    i,
343                    0x1000 + i * 0x1000,
344                    24,
345                    content.clone(),
346                    Some(999),
347                    1000_u64 + (i as u64) * 100_000, // 0.1ms apart, total span 0.4ms < 1ms
348                    TypeKind::Vec,
349                )
350            })
351            .collect();
352
353        let edges = detect_clones(&records, &CloneConfig::default());
354        assert_eq!(edges.len(), 10); // C(5,2) = 10
355    }
356
357    #[test]
358    fn test_clone_config_defaults() {
359        let config = CloneConfig::default();
360        assert_eq!(config.max_time_diff_ns, 1_000_000); // 1ms
361        assert_eq!(config.compare_bytes, 64);
362        assert!((config.min_similarity - 0.8).abs() < f64::EPSILON);
363        assert!((config.min_similarity_no_stack_hash - 0.95).abs() < f64::EPSILON);
364        assert_eq!(config.max_clone_edges_per_node, 10);
365    }
366
367    #[test]
368    fn test_clone_detection_different_sizes() {
369        let content = vec![0xAAu8; 64];
370        let records = vec![
371            make_record(
372                0,
373                0x1000,
374                24,
375                content.clone(),
376                Some(123),
377                1000,
378                TypeKind::Vec,
379            ),
380            make_record(1, 0x2000, 48, content, Some(123), 5000, TypeKind::Vec),
381        ];
382
383        let edges = detect_clones(&records, &CloneConfig::default());
384        assert!(edges.is_empty());
385    }
386
387    #[test]
388    fn test_clone_detection_no_stack_hash_stricter_threshold() {
389        let content = vec![0xAAu8; 64];
390        let records = vec![
391            make_record(0, 0x1000, 24, content.clone(), None, 1000, TypeKind::Vec),
392            make_record(1, 0x2000, 24, content, None, 5000, TypeKind::Vec),
393        ];
394
395        let config = CloneConfig::default();
396        let edges = detect_clones(&records, &config);
397
398        assert_eq!(edges.len(), 1, "Identical content should still be detected");
399    }
400
401    #[test]
402    fn test_clone_detection_no_stack_hash_partial_similarity_rejected() {
403        let content_a = vec![0xAAu8; 64];
404        let mut content_b = vec![0xAAu8; 64];
405        for item in content_b.iter_mut().skip(60).take(4) {
406            *item = 0xBB;
407        }
408
409        let records = vec![
410            make_record(0, 0x1000, 24, content_a, None, 1000, TypeKind::Vec),
411            make_record(1, 0x2000, 24, content_b, None, 5000, TypeKind::Vec),
412        ];
413
414        let config = CloneConfig::default();
415        let edges = detect_clones(&records, &config);
416
417        assert!(
418            edges.is_empty(),
419            "Partial similarity should be rejected with no stack hash"
420        );
421    }
422
423    #[test]
424    fn test_clone_detection_max_edges_per_node() {
425        let content = vec![0xCCu8; 64];
426        let records: Vec<InferenceRecord> = (0..20)
427            .map(|i| {
428                make_record(
429                    i,
430                    0x1000 + i * 0x1000,
431                    24,
432                    content.clone(),
433                    Some(999),
434                    1000,
435                    TypeKind::Vec,
436                )
437            })
438            .collect();
439
440        let config = CloneConfig {
441            max_clone_edges_per_node: 3,
442            ..Default::default()
443        };
444        let edges = detect_clones(&records, &config);
445
446        let mut edge_count: HashMap<usize, usize> = HashMap::new();
447        for edge in &edges {
448            *edge_count.entry(edge.from).or_insert(0) += 1;
449            *edge_count.entry(edge.to).or_insert(0) += 1;
450        }
451
452        for (&_node, &count) in &edge_count {
453            assert!(
454                count <= config.max_clone_edges_per_node,
455                "Node has {} edges, max is {}",
456                count,
457                config.max_clone_edges_per_node
458            );
459        }
460    }
461}