memscope_rs/analysis/unsafe_inference/
engine.rs

1//! Unsafe Type Inference Engine v2
2//!
3//! Heuristic-based type inference for FFI/unsafe memory allocations.
4//!
5//! # Design Principles
6//!
7//! - **Six-dimensional signal model**: Size, Layout, Content, Stack, Lifetime, ValidPtr
8//! - **Memory safe**: No raw pointer dereferencing, all access through MemoryView
9//! - **O(n) complexity**: ~5-50ms for 1M allocations depending on enabled features
10//!
11//! # Phase 1 Features
12//!
13//! - Enhanced size heuristic with power-of-two signal
14//! - UTF-8 validation for String detection
15//! - Enhanced CString detection with ASCII ratio
16//! - Shannon entropy analysis for binary data
17//! - Zero-fill detection
18#![allow(dead_code)]
19
20use super::memory_view::{count_valid_pointers, is_valid_ptr, MemoryView};
21
/// Category the engine assigns to an unsafe/FFI memory allocation.
///
/// Each variant names a shape that commonly shows up in FFI/unsafe code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TypeKind {
    /// Rust `Vec<T>`: growable heap array described by (ptr, len, cap).
    Vec,
    /// Rust `String`: growable UTF-8 text described by (ptr, len, cap).
    String,
    /// C-style string terminated by a NUL byte.
    CString,
    /// Thin pointer: `*mut T`, `*const T`, `Box<T>`.
    Pointer,
    /// Fat pointer: `&[T]`, `&str`, `dyn Trait` (data pointer + metadata).
    FatPtr,
    /// Raw bytes: `[u8]`, compressed or encrypted data.
    Buffer,
    /// C struct holding several pointer fields.
    CStruct,
    /// No category could be determined.
    Unknown,
}
44
45impl std::fmt::Display for TypeKind {
46    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
47        match self {
48            TypeKind::Vec => write!(f, "Vec<_>"),
49            TypeKind::String => write!(f, "String"),
50            TypeKind::CString => write!(f, "CString"),
51            TypeKind::Pointer => write!(f, "*mut c_void"),
52            TypeKind::FatPtr => write!(f, "&[T]"),
53            TypeKind::Buffer => write!(f, "[u8]"),
54            TypeKind::CStruct => write!(f, "CStruct"),
55            TypeKind::Unknown => write!(f, "unknown"),
56        }
57    }
58}
59
60/// Type inference result with confidence score.
61///
62/// Confidence ranges from 0-100, where higher values indicate stronger evidence.
63#[derive(Clone, Copy, Debug)]
64pub struct TypeGuess {
65    /// Inferred type category
66    pub kind: TypeKind,
67    /// Confidence score (0-100)
68    pub confidence: u8,
69    /// Method used for inference (for debugging/display)
70    pub method: InferenceMethod,
71}
72
/// Which heuristic a [`TypeGuess`] is attributed to.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum InferenceMethod {
    /// No method recorded; this is the `Default` value.
    #[default]
    Unknown,
    /// Allocation-size heuristic.
    SizeHeuristic,
    /// (ptr, len, cap) layout detection.
    LayoutDetection,
    /// UTF-8 validation of the content.
    Utf8Validation,
    /// Content inspection (e.g. NUL-terminated text).
    ContentAnalysis,
    /// Shannon-entropy analysis of the content.
    EntropyAnalysis,
    /// Allocation stack-trace inspection.
    StackTraceAnalysis,
    /// Allocation/deallocation lifetime inspection.
    LifetimeAnalysis,
    /// Several signals combined.
    Combined,
}
87
88impl TypeGuess {
89    /// Create a new type guess with the given kind and confidence.
90    pub fn new(kind: TypeKind, confidence: u8) -> Self {
91        Self {
92            kind,
93            confidence,
94            method: InferenceMethod::Combined,
95        }
96    }
97
98    /// Create a new type guess with method information.
99    pub fn with_method(kind: TypeKind, confidence: u8, method: InferenceMethod) -> Self {
100        Self {
101            kind,
102            confidence,
103            method,
104        }
105    }
106
107    /// Create an unknown type guess.
108    pub fn unknown() -> Self {
109        Self {
110            kind: TypeKind::Unknown,
111            confidence: 0,
112            method: InferenceMethod::Unknown,
113        }
114    }
115
116    /// Display the type guess with confidence percentage.
117    pub fn display_with_confidence(&self) -> String {
118        if self.confidence == 0 {
119            return "-".to_string();
120        }
121        format!("{} ({}%)", self.kind, self.confidence)
122    }
123}
124
/// Per-category evidence accumulator used while inferring a type.
///
/// Every heuristic dimension adds points to the fields it has evidence for;
/// all fields start at zero via `Default`.
#[derive(Default)]
struct Score {
    /// Evidence for `TypeKind::Vec`.
    vec: u8,
    /// Evidence for `TypeKind::String`.
    string: u8,
    /// Evidence for `TypeKind::CString`.
    cstring: u8,
    /// Evidence for `TypeKind::Pointer`.
    pointer: u8,
    /// Evidence for `TypeKind::FatPtr`.
    fat_ptr: u8,
    /// Evidence for `TypeKind::Buffer`.
    buffer: u8,
    /// Evidence for `TypeKind::CStruct`.
    cstruct: u8,
}
139
/// Entry point of the unsafe type inference: a stateless unit type whose
/// associated functions run the heuristics.
///
/// # Example
///
/// ```
/// use memscope_rs::analysis::unsafe_inference::{
///     UnsafeInferenceEngine, TypeGuess, TypeKind,
/// };
///
/// let memory = vec![0u8; 24];
/// let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);
/// println!("Inferred: {} ({}%)", guess.kind, guess.confidence);
/// ```
pub struct UnsafeInferenceEngine;
154
155impl UnsafeInferenceEngine {
156    /// Infer type from a single memory view.
157    pub fn infer_single(view: &MemoryView, size: usize) -> TypeGuess {
158        infer(view, size, None, None, None)
159    }
160
161    /// Infer type from raw bytes.
162    pub fn infer_from_bytes(data: &[u8], size: usize) -> TypeGuess {
163        let view = MemoryView::new(data);
164        infer(&view, size, None, None, None)
165    }
166
167    /// Infer type with full context.
168    pub fn infer_with_context(
169        data: &[u8],
170        size: usize,
171        stack_trace: Option<&[String]>,
172        alloc_time: Option<u64>,
173        dealloc_time: Option<u64>,
174    ) -> TypeGuess {
175        let view = MemoryView::new(data);
176        infer(&view, size, stack_trace, alloc_time, dealloc_time)
177    }
178
179    /// Run inference on multiple records.
180    pub fn run(records: &mut [InferenceRecord]) {
181        for record in records.iter_mut() {
182            if let Some(ref memory) = record.memory {
183                let view = MemoryView::new(memory);
184                record.inferred = Some(infer(
185                    &view,
186                    record.size,
187                    record.stack_trace.as_deref(),
188                    record.alloc_time,
189                    record.dealloc_time,
190                ));
191            }
192        }
193    }
194}
195
196/// Record for batch inference.
197pub struct InferenceRecord {
198    /// Memory address of the allocation
199    pub ptr: usize,
200    /// Size in bytes
201    pub size: usize,
202    /// Optional memory content snapshot
203    pub memory: Option<Vec<u8>>,
204    /// Optional stack trace at allocation time
205    pub stack_trace: Option<Vec<String>>,
206    /// Optional allocation timestamp (nanoseconds)
207    pub alloc_time: Option<u64>,
208    /// Optional deallocation timestamp (nanoseconds)
209    pub dealloc_time: Option<u64>,
210    /// Inference result
211    pub inferred: Option<TypeGuess>,
212}
213
214impl InferenceRecord {
215    /// Create a new inference record with minimal information.
216    pub fn new(ptr: usize, size: usize, memory: Option<Vec<u8>>) -> Self {
217        Self {
218            ptr,
219            size,
220            memory,
221            stack_trace: None,
222            alloc_time: None,
223            dealloc_time: None,
224            inferred: None,
225        }
226    }
227
228    /// Create a new inference record with full information.
229    pub fn with_context(
230        ptr: usize,
231        size: usize,
232        memory: Option<Vec<u8>>,
233        stack_trace: Option<Vec<String>>,
234        alloc_time: Option<u64>,
235        dealloc_time: Option<u64>,
236    ) -> Self {
237        Self {
238            ptr,
239            size,
240            memory,
241            stack_trace,
242            alloc_time,
243            dealloc_time,
244            inferred: None,
245        }
246    }
247}
248
249/// Main inference function combining all heuristic dimensions.
250fn infer(
251    view: &MemoryView,
252    size: usize,
253    stack_trace: Option<&[String]>,
254    alloc_time: Option<u64>,
255    dealloc_time: Option<u64>,
256) -> TypeGuess {
257    let mut score = Score::default();
258
259    // Dimension 1: Size heuristic with power-of-two signal
260    size_heuristic(size, &mut score);
261
262    // Dimension 2: Layout detection (ptr/len/cap structure)
263    vec_string_layout(view, &mut score);
264
265    // Dimension 3: Content analysis (CString, entropy, zero-fill)
266    content_analysis(view, &mut score);
267
268    // Dimension 4: Pointer-based heuristics (buffer vs cstruct)
269    pointer_heuristic(view, &mut score);
270
271    // Dimension 5: Stack trace analysis (optional, high signal)
272    stack_trace_analysis(stack_trace, &mut score);
273
274    // Dimension 6: Lifetime analysis (optional, auxiliary signal)
275    lifetime_analysis(alloc_time, dealloc_time, &mut score);
276
277    // Aggregate and finalize
278    finalize(score)
279}
280
281/// Dimension 1: Enhanced size heuristic.
282///
283/// Key improvements over v1:
284/// - Reduced pointer score for size=8 (was too aggressive)
285/// - Added fat_ptr detection for size=16
286/// - Added power-of-two signal for Vec/Buffer
287/// - Added common C struct sizes
288fn size_heuristic(size: usize, score: &mut Score) {
289    match size {
290        // Raw pointer: *mut T, *const T, &T, Box<T>
291        // Reduced from 60 to 30 to reduce false positives
292        8 => score.pointer += 30,
293
294        // Fat pointer: &[T], &str, dyn Trait (data_ptr + metadata)
295        // This is a strong signal for slice references
296        16 => score.fat_ptr += 25,
297
298        // Vec/String triplet: (ptr, len, cap)
299        24 => {
300            score.vec += 15;
301            score.string += 15;
302        }
303
304        // Common C struct sizes
305        32 | 48 | 64 => score.cstruct += 10,
306
307        _ => {}
308    }
309
310    // Power-of-two signal: Rust Vec capacity grows by powers of 2
311    // A random size being power-of-two has probability ~1/size
312    // For 64 bytes, this is only ~1.5% false positive rate
313    if size.is_power_of_two() && size >= 64 {
314        score.vec += 10;
315        score.buffer += 5;
316    }
317}
318
319/// Dimension 2: Vec/String layout detection.
320///
321/// Detects the (ptr, len, cap) triplet structure.
322/// Key improvement: UTF-8 validation for String detection.
323fn vec_string_layout(view: &MemoryView, score: &mut Score) {
324    let usize_size = std::mem::size_of::<usize>();
325    let min_len = usize_size * 3; // ptr + len + cap
326    if view.len() < min_len {
327        return;
328    }
329
330    let ptr_val = view.read_usize(0);
331    let len = view.read_usize(usize_size);
332    let cap = view.read_usize(usize_size * 2);
333
334    let (Some(p), Some(l), Some(c)) = (ptr_val, len, cap) else {
335        return;
336    };
337
338    // Basic structure validation
339    if !is_valid_ptr(p) || c < l || c == 0 || c > 10_000_000 {
340        return;
341    }
342
343    // Distinguish Vec from String based on capacity vs length
344    // String typically has cap close to len (small spare capacity)
345    // Vec often has cap >> len (pre-allocated growth space)
346    let spare = c.saturating_sub(l);
347
348    if spare < 16 && l > 0 {
349        // Small spare capacity → more likely String
350        score.string += 50;
351        score.vec += 20;
352    } else if spare > 0 {
353        // Large spare capacity → more likely Vec
354        score.vec += 60;
355        score.string += 15;
356    } else {
357        // cap == len → could be either
358        score.vec += 30;
359        score.string += 30;
360    }
361
362    // Additional Vec signal: capacity is power of two
363    // Rust's Vec growth strategy: new_cap = max(old_cap * 2, 1)
364    if c.is_power_of_two() {
365        score.vec += 15;
366    }
367}
368
369/// Dimension 3: Content analysis.
370///
371/// Enhanced detection for CString, entropy, and zero-fill patterns.
372fn content_analysis(view: &MemoryView, score: &mut Score) {
373    let data = view.as_slice();
374    if data.is_empty() {
375        return;
376    }
377
378    // UTF-8 validation for String detection (decisive signal)
379    utf8_validation(data, score);
380
381    // Enhanced CString detection
382    cstring_enhanced(data, score);
383
384    // Entropy analysis for binary data detection
385    // Skip for large data (>4KB) to avoid performance cost
386    if data.len() >= 32 && data.len() <= 4096 {
387        entropy_analysis(data, score);
388    } else if data.len() > 4096 {
389        // Large data is likely buffer, skip expensive analysis
390        score.buffer += 40;
391    }
392
393    // Zero-fill detection
394    zero_fill_detection(data, score);
395}
396
397/// UTF-8 validation for String detection.
398///
399/// This is the decisive signal for String vs Vec distinction.
400/// A random byte sequence passes UTF-8 validation with probability:
401/// - 16 bytes: ~0.3%
402/// - 64 bytes: ~0.00001%
403/// - 256 bytes: ~0%
404///
405/// Scoring strategy:
406/// - High printable ratio (>0.8): Strong String signal (+90)
407/// - Medium printable ratio (>0.5): Moderate String signal (+60)
408/// - Low printable ratio: Weak Vec signal (+20)
409/// - Invalid UTF-8: Strong Vec signal (+50)
410fn utf8_validation(data: &[u8], score: &mut Score) {
411    if data.is_empty() {
412        return;
413    }
414
415    // Try to validate as UTF-8
416    match std::str::from_utf8(data) {
417        Ok(s) => {
418            // Valid UTF-8 - count printable ratio
419            let printable = s
420                .chars()
421                .filter(|c| c.is_ascii_graphic() || c.is_ascii_whitespace())
422                .count();
423            let total = s.chars().count();
424            let ratio = if total > 0 {
425                printable as f32 / total as f32
426            } else {
427                0.0
428            };
429
430            if ratio > 0.8 {
431                // High printable ratio -> likely String content
432                score.string += 90;
433            } else if ratio > 0.5 {
434                // Mixed content -> could be String or Vec<u8>
435                score.string += 60;
436                score.vec += 30;
437            } else if ratio > 0.2 {
438                // Low printable but valid UTF-8
439                score.vec += 30;
440            } else {
441                // Very low printable ratio (mostly control chars or nulls)
442                // This is weak evidence for Vec, don't override other signals
443                score.vec += 20;
444            }
445        }
446        Err(_) => {
447            // Not valid UTF-8 -> definitely not a String
448            // This is a strong negative signal for String
449            score.vec += 50;
450        }
451    }
452}
453
454/// Enhanced CString detection with ASCII ratio analysis.
455///
456/// Key improvements:
457/// - Check printable ASCII ratio, not just trailing null
458/// - Detect multiple nulls (likely binary, not CString)
459/// - Require minimum content length
460/// - CString (null-terminated) should beat String when null terminator is present
461fn cstring_enhanced(data: &[u8], score: &mut Score) {
462    // Find first null byte
463    let null_pos = match data.iter().position(|&b| b == 0) {
464        Some(pos) => pos,
465        None => return, // No null terminator
466    };
467
468    // Empty or too short
469    if null_pos < 3 {
470        return;
471    }
472
473    let content = &data[..null_pos];
474
475    // Count printable ASCII characters (0x20-0x7E)
476    let printable_count = content
477        .iter()
478        .filter(|&&b| (0x20..=0x7E).contains(&b))
479        .count();
480
481    let printable_ratio = printable_count as f32 / content.len() as f32;
482
483    // High printable ratio with null terminator → likely CString
484    // Score higher than String (+90) because null terminator is definitive for CString
485    if printable_ratio > 0.9 {
486        score.cstring += 95;
487    } else if printable_ratio > 0.7 {
488        score.cstring += 60;
489    } else if printable_ratio > 0.5 {
490        score.cstring += 30;
491    }
492
493    // Multiple nulls → likely binary data, not CString
494    let null_count = data.iter().filter(|&&b| b == 0).count();
495    if null_count > 1 {
496        score.cstring = score.cstring.saturating_sub(20);
497        score.buffer += 15;
498    }
499}
500
501/// Shannon entropy analysis for binary data detection.
502///
503/// Entropy ranges from 0.0 (all same byte) to 8.0 (perfectly random).
504///
505/// Typical values:
506/// - English text: 4.0-4.5
507/// - Source code: 4.5-5.0
508/// - Compressed data: 7.8-8.0
509/// - Encrypted data: 7.9-8.0
510fn entropy_analysis(data: &[u8], score: &mut Score) {
511    let entropy = shannon_entropy(data);
512
513    // High entropy → compressed/encrypted/serialized data
514    if entropy > 7.5 {
515        score.buffer += 30;
516    } else if entropy > 6.5 {
517        score.buffer += 15;
518    }
519    // Low entropy → repetitive data or text
520    else if entropy < 3.0 {
521        score.cstruct += 5;
522    }
523}
524
/// Computes the Shannon entropy, in bits per byte, of `data`.
///
/// Returns 0.0 for empty input; otherwise the value is the sum of
/// `-p * log2(p)` over every byte value that occurs, where `p` is that
/// value's relative frequency.
fn shannon_entropy(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    data.iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let total = data.len() as f64;

    // Accumulate in byte-value order, skipping values that never occur
    // (their contribution is defined as zero).
    histogram
        .iter()
        .filter(|&&hits| hits > 0)
        .fold(0.0_f64, |bits, &hits| {
            let p = f64::from(hits) / total;
            bits - p * p.log2()
        })
}
548
549/// Detect zero-filled memory regions.
550///
551/// High zero ratio often indicates:
552/// - Uninitialized struct padding
553/// - Zeroed buffers
554/// - Sparse data structures
555fn zero_fill_detection(data: &[u8], score: &mut Score) {
556    if data.len() < 16 {
557        return;
558    }
559
560    let zero_count = data.iter().filter(|&&b| b == 0).count();
561    let zero_ratio = zero_count as f32 / data.len() as f32;
562
563    if zero_ratio > 0.9 {
564        // Mostly zeros → likely zeroed buffer or struct with padding
565        score.buffer += 15;
566        score.cstruct += 10;
567    }
568}
569
570/// Dimension 4: Pointer-based heuristics.
571///
572/// Distinguish buffer from C struct based on pointer count.
573fn pointer_heuristic(view: &MemoryView, score: &mut Score) {
574    let ptr_count = count_valid_pointers(view);
575
576    if ptr_count == 0 && view.len() > 8 {
577        // No valid pointers → likely buffer
578        score.buffer += 40;
579    } else if ptr_count == 1 {
580        // Single pointer → could be Box or simple struct
581        score.pointer += 10;
582        score.cstruct += 5;
583    } else if ptr_count >= 2 {
584        // Multiple pointers → likely C struct
585        score.cstruct += 30;
586    }
587}
588
589/// Dimension 5: Stack trace analysis.
590///
591/// This is the highest-discrimination single signal.
592/// If the call stack contains `alloc::vec::Vec::push`, it's almost 100% Vec.
593///
594/// Graceful degradation: if stack trace is unavailable, this dimension
595/// contributes nothing and doesn't affect other dimensions.
596fn stack_trace_analysis(stack: Option<&[String]>, score: &mut Score) {
597    let Some(frames) = stack else {
598        return;
599    };
600
601    if frames.is_empty() {
602        return;
603    }
604
605    for frame in frames {
606        let f = frame.to_lowercase();
607
608        // Rust standard library signals
609        if f.contains("alloc::vec::vec") || f.contains("vec::from_elem") {
610            score.vec += 50;
611        }
612        if f.contains("alloc::string::string") || f.contains("from_utf8") {
613            score.string += 50;
614        }
615        if f.contains("alloc::boxed") {
616            score.pointer += 40;
617        }
618        if f.contains("ffi::c_str::cstring") || f.contains("from_bytes_with_nul") {
619            score.cstring += 60;
620        }
621
622        // FFI signals
623        if f.contains("malloc") || f.contains("calloc") || f.contains("realloc") {
624            score.cstruct += 20;
625            score.buffer += 15;
626        }
627        if f.contains("libc::") || f.contains("std::ffi") {
628            score.cstring += 15;
629            score.cstruct += 10;
630        }
631    }
632}
633
634/// Dimension 6: Lifetime analysis.
635///
636/// Uses allocation-deallocation time difference to infer type.
637/// This is an auxiliary signal with lower weight.
638fn lifetime_analysis(alloc_time: Option<u64>, dealloc_time: Option<u64>, score: &mut Score) {
639    let Some(alloc) = alloc_time else {
640        return;
641    };
642
643    let Some(dealloc) = dealloc_time else {
644        // Not deallocated → possibly leaked or long-lived
645        return;
646    };
647
648    let lifetime_ns = dealloc.saturating_sub(alloc);
649    let lifetime_ms = lifetime_ns / 1_000_000;
650
651    match lifetime_ms {
652        // Transient allocation (0ms) → possibly temporary String or small Vec
653        0 => {
654            score.string += 10;
655            score.vec += 5;
656        }
657
658        // Short-lived (1-100ms) → possibly function-local variable
659        1..=100 => {
660            score.cstruct += 5;
661        }
662
663        // Long-lived (> 10s) → possibly global cache or leaked
664        10000.. => {
665            score.buffer += 10;
666        }
667
668        _ => {}
669    }
670}
671
672/// Finalize inference by selecting the highest-scoring type.
673///
674/// Also determines the primary inference method based on which dimension
675/// contributed most to the final score.
676fn finalize(score: Score) -> TypeGuess {
677    let table = [
678        (TypeKind::Vec, score.vec, InferenceMethod::LayoutDetection),
679        (
680            TypeKind::String,
681            score.string,
682            InferenceMethod::Utf8Validation,
683        ),
684        (
685            TypeKind::CString,
686            score.cstring,
687            InferenceMethod::ContentAnalysis,
688        ),
689        (
690            TypeKind::Pointer,
691            score.pointer,
692            InferenceMethod::SizeHeuristic,
693        ),
694        (
695            TypeKind::FatPtr,
696            score.fat_ptr,
697            InferenceMethod::SizeHeuristic,
698        ),
699        (
700            TypeKind::Buffer,
701            score.buffer,
702            InferenceMethod::EntropyAnalysis,
703        ),
704        (TypeKind::CStruct, score.cstruct, InferenceMethod::Combined),
705    ];
706
707    let mut best = (TypeKind::Unknown, 0u8, InferenceMethod::Unknown);
708    for (kind, val, method) in table {
709        if val > best.1 {
710            best = (kind, val, method);
711        }
712    }
713
714    TypeGuess::with_method(best.0, best.1, best.2)
715}
716
717#[cfg(test)]
718mod tests {
719    use super::*;
720
721    fn create_vec_memory(ptr: usize, len: usize, cap: usize) -> Vec<u8> {
722        let mut data = vec![0u8; 24];
723        data[..8].copy_from_slice(&ptr.to_le_bytes());
724        data[8..16].copy_from_slice(&len.to_le_bytes());
725        data[16..24].copy_from_slice(&cap.to_le_bytes());
726        data
727    }
728
729    fn create_string_memory(ptr: usize, len: usize, cap: usize) -> Vec<u8> {
730        create_vec_memory(ptr, len, cap)
731    }
732
733    fn create_cstring_memory(content: &[u8]) -> Vec<u8> {
734        let mut data = content.to_vec();
735        data.push(0);
736        data
737    }
738
739    #[test]
740    #[cfg(target_os = "macos")]
741    fn test_infer_vec_with_large_capacity() {
742        let memory = create_vec_memory(0x10000, 10, 100);
743        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);
744
745        assert_eq!(guess.kind, TypeKind::Vec);
746        assert!(guess.confidence >= 60);
747    }
748
749    #[test]
750    #[cfg(target_os = "macos")]
751    fn test_infer_vec_with_power_of_two_capacity() {
752        let memory = create_vec_memory(0x10000, 50, 64);
753        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);
754
755        // Power-of-two capacity should boost Vec score
756        assert!(guess.kind == TypeKind::Vec || guess.kind == TypeKind::String);
757    }
758
759    #[test]
760    #[cfg(target_os = "macos")]
761    fn test_infer_string_with_small_spare() {
762        let memory = create_string_memory(0x10000, 10, 12);
763        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);
764
765        // Small spare capacity should favor String
766        assert!(guess.kind == TypeKind::String || guess.kind == TypeKind::Vec);
767    }
768
769    #[test]
770    fn test_infer_cstring_printable() {
771        let memory = create_cstring_memory(b"hello world");
772        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 12);
773
774        assert_eq!(guess.kind, TypeKind::CString);
775        assert!(guess.confidence >= 70);
776    }
777
778    #[test]
779    fn test_infer_cstring_mixed_content() {
780        let memory = create_cstring_memory(&[0x30, 0x31, 0x80, 0x90, 0x20]);
781        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 6);
782
783        // Mixed content should have lower CString score
784        assert!(guess.confidence < 70);
785    }
786
787    #[test]
788    fn test_infer_pointer_size_8() {
789        let memory = [0u8; 8];
790        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 8);
791
792        assert_eq!(guess.kind, TypeKind::Pointer);
793        assert!(guess.confidence >= 30);
794    }
795
796    #[test]
797    #[cfg(target_os = "macos")]
798    fn test_infer_fat_ptr_size_16() {
799        // Fat pointer: data_ptr + metadata (e.g., length for &[T])
800        // Create a pattern that looks like a fat pointer: valid ptr + small length
801        let mut memory = [0u8; 16];
802        let ptr: usize = 0x10000;
803        let len: usize = 100;
804        memory[0..8].copy_from_slice(&ptr.to_le_bytes());
805        memory[8..16].copy_from_slice(&len.to_le_bytes());
806
807        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 16);
808
809        // FatPtr should have the highest score due to size=16 + valid pointer
810        assert!(guess.kind == TypeKind::FatPtr || guess.kind == TypeKind::Pointer);
811    }
812
813    #[test]
814    #[cfg(target_os = "macos")]
815    fn test_infer_cstruct_multiple_pointers() {
816        let mut memory = vec![1u8; 40];
817        let ptr1: usize = 0x10000;
818        let ptr2: usize = 0x20000;
819        memory[0..8].copy_from_slice(&ptr1.to_le_bytes());
820        memory[24..32].copy_from_slice(&ptr2.to_le_bytes());
821
822        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 40);
823
824        assert_eq!(guess.kind, TypeKind::CStruct);
825        assert!(guess.confidence >= 30);
826    }
827
828    #[test]
829    fn test_entropy_calculation() {
830        // All same byte → entropy = 0
831        let data = [0u8; 100];
832        assert!((shannon_entropy(&data) - 0.0).abs() < 0.01);
833
834        // Alternating bytes → lower entropy
835        let data: Vec<u8> = (0..100).map(|i| if i % 2 == 0 { 0 } else { 255 }).collect();
836        assert!(shannon_entropy(&data) < 2.0);
837
838        // Random-ish data → higher entropy
839        let data: Vec<u8> = (0..100).map(|i| i as u8).collect();
840        assert!(shannon_entropy(&data) > 5.0);
841    }
842
843    #[test]
844    fn test_power_of_two_signal() {
845        // Size 64 (power of two) should get Vec/Buffer boost
846        let memory = vec![0u8; 64];
847        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 64);
848
849        // Should have some confidence from power-of-two signal
850        assert!(guess.confidence > 0 || guess.kind == TypeKind::Unknown);
851    }
852
853    #[test]
854    fn test_zero_filled_buffer() {
855        let memory = [0u8; 64];
856        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 64);
857
858        // Zero-filled should boost buffer or cstruct
859        assert!(
860            guess.kind == TypeKind::Buffer
861                || guess.kind == TypeKind::CStruct
862                || guess.kind == TypeKind::Unknown
863        );
864    }
865
866    #[test]
867    fn test_type_guess_display() {
868        let guess = TypeGuess::new(TypeKind::Vec, 85);
869        assert_eq!(guess.display_with_confidence(), "Vec<_> (85%)");
870
871        let unknown = TypeGuess::unknown();
872        assert_eq!(unknown.display_with_confidence(), "-");
873    }
874
875    #[test]
876    fn test_run_on_records() {
877        let vec_memory = create_vec_memory(0x10000, 10, 100);
878        let mut records = vec![
879            InferenceRecord::new(0x1000, 24, Some(vec_memory)),
880            InferenceRecord::new(0x2000, 8, None),
881        ];
882
883        UnsafeInferenceEngine::run(&mut records);
884
885        assert!(records[0].inferred.is_some());
886        assert!(records[1].inferred.is_none());
887    }
888
889    #[test]
890    fn test_multiple_nulls_not_cstring() {
891        // Multiple null bytes should reduce CString score
892        let memory = [
893            b'h', b'e', b'l', b'l', b'o', 0, b'w', b'o', b'r', b'l', b'd', 0,
894        ];
895        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 12);
896
897        // Should not be strongly identified as CString due to multiple nulls
898        if guess.kind == TypeKind::CString {
899            assert!(guess.confidence < 70);
900        }
901    }
902
903    #[test]
904    fn test_utf8_validation_printable_string() {
905        // Valid UTF-8 with high printable ratio
906        let memory = b"Hello, World! This is a test string.".to_vec();
907        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, memory.len());
908
909        // Should be strongly identified as String due to UTF-8 validation
910        assert_eq!(guess.kind, TypeKind::String);
911        assert!(guess.confidence >= 90);
912        assert_eq!(guess.method, InferenceMethod::Utf8Validation);
913    }
914
915    #[test]
916    fn test_utf8_validation_non_printable() {
917        // Valid UTF-8 but with non-printable characters
918        let memory = vec![0xC2, 0x80, 0xC2, 0x81, 0xC2, 0x82]; // Valid UTF-8 control chars
919        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, memory.len());
920
921        // Should be Vec due to valid UTF-8 but low printable ratio
922        assert!(guess.kind == TypeKind::Vec || guess.kind == TypeKind::Buffer);
923    }
924
925    #[test]
926    fn test_utf8_validation_invalid() {
927        // Invalid UTF-8 sequence
928        let memory = vec![0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA];
929        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, memory.len());
930
931        // Should not be String due to invalid UTF-8
932        assert_ne!(guess.kind, TypeKind::String);
933    }
934
935    #[test]
936    fn test_large_data_buffer_boost() {
937        // Data > 4KB should get buffer boost without entropy calculation
938        let memory = vec![0u8; 5000];
939        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 5000);
940
941        // Should be buffer due to size
942        assert!(guess.kind == TypeKind::Buffer || guess.kind == TypeKind::CStruct);
943    }
944
945    #[test]
946    fn test_inference_method_tracking() {
947        // Test that inference method is properly tracked
948        let string_memory = b"test string content".to_vec();
949        let guess = UnsafeInferenceEngine::infer_from_bytes(&string_memory, string_memory.len());
950
951        // String should use Utf8Validation method
952        assert_eq!(guess.kind, TypeKind::String);
953        assert_eq!(guess.method, InferenceMethod::Utf8Validation);
954    }
955
956    #[test]
957    fn test_stack_trace_vec_detection() {
958        let memory = vec![0u8; 24];
959        let stack = vec![
960            "alloc::vec::Vec::push".to_string(),
961            "my_app::process".to_string(),
962        ];
963        let guess =
964            UnsafeInferenceEngine::infer_with_context(&memory, 24, Some(&stack), None, None);
965
966        // Stack trace with Vec::push should boost Vec score
967        assert!(guess.kind == TypeKind::Vec || guess.kind == TypeKind::Unknown);
968    }
969
970    #[test]
971    fn test_stack_trace_string_detection() {
972        let memory = b"test".to_vec();
973        let stack = vec!["alloc::string::String::push".to_string()];
974        let guess = UnsafeInferenceEngine::infer_with_context(&memory, 4, Some(&stack), None, None);
975
976        // Stack trace with String::push should boost String score
977        assert!(guess.kind == TypeKind::String || guess.kind == TypeKind::Unknown);
978    }
979
980    #[test]
981    fn test_stack_trace_cstring_detection() {
982        let memory = b"hello\0".to_vec();
983        let stack = vec!["std::ffi::c_str::CString::new".to_string()];
984        let guess = UnsafeInferenceEngine::infer_with_context(&memory, 6, Some(&stack), None, None);
985
986        // Stack trace with CString::new should boost CString score
987        assert!(guess.kind == TypeKind::CString || guess.kind == TypeKind::String);
988    }
989
990    #[test]
991    fn test_stack_trace_ffi_detection() {
992        let memory = vec![0u8; 64];
993        let stack = vec!["libc::malloc".to_string()];
994        let guess =
995            UnsafeInferenceEngine::infer_with_context(&memory, 64, Some(&stack), None, None);
996
997        // Stack trace with malloc should boost CStruct/Buffer score
998        assert!(
999            guess.kind == TypeKind::CStruct
1000                || guess.kind == TypeKind::Buffer
1001                || guess.kind == TypeKind::Unknown
1002        );
1003    }
1004
1005    #[test]
1006    fn test_lifetime_transient_allocation() {
1007        let memory = b"test".to_vec();
1008        let alloc_time = Some(1000);
1009        let dealloc_time = Some(1_000_500); // 0.5ms lifetime
1010        let guess =
1011            UnsafeInferenceEngine::infer_with_context(&memory, 4, None, alloc_time, dealloc_time);
1012
1013        // Transient allocation should boost String/Vec score
1014        assert!(guess.confidence > 0 || guess.kind == TypeKind::Unknown);
1015    }
1016
1017    #[test]
1018    fn test_lifetime_long_lived() {
1019        let memory = vec![0u8; 64];
1020        let alloc_time = Some(1000);
1021        let dealloc_time = Some(15_000_000_000); // 15s lifetime
1022        let guess =
1023            UnsafeInferenceEngine::infer_with_context(&memory, 64, None, alloc_time, dealloc_time);
1024
1025        // Long-lived allocation should boost Buffer score
1026        assert!(guess.confidence > 0 || guess.kind == TypeKind::Unknown);
1027    }
1028
1029    #[test]
1030    fn test_combined_stack_and_lifetime() {
1031        let memory = vec![0u8; 24];
1032        let stack = vec!["alloc::vec::Vec::new".to_string()];
1033        let alloc_time = Some(1000);
1034        let dealloc_time = Some(1_000_500); // 0.5ms
1035        let guess = UnsafeInferenceEngine::infer_with_context(
1036            &memory,
1037            24,
1038            Some(&stack),
1039            alloc_time,
1040            dealloc_time,
1041        );
1042
1043        // Combined signals should give higher confidence
1044        assert!(guess.confidence > 0 || guess.kind == TypeKind::Unknown);
1045    }
1046
1047    #[test]
1048    fn test_inference_record_with_context() {
1049        let memory = Some(vec![0u8; 24]);
1050        let stack = Some(vec!["alloc::vec::Vec::new".to_string()]);
1051        let record =
1052            InferenceRecord::with_context(0x1000, 24, memory, stack, Some(1000), Some(2000));
1053
1054        assert_eq!(record.ptr, 0x1000);
1055        assert_eq!(record.size, 24);
1056        assert!(record.stack_trace.is_some());
1057        assert!(record.alloc_time.is_some());
1058        assert!(record.dealloc_time.is_some());
1059    }
1060}
1061
/// Real data tests using actual type memory layouts.
///
/// The helpers serialise the header words of live standard-library values
/// (`ptr`, `len`, `cap`) into little-endian byte images, which are then passed
/// to the inference engine in place of a captured allocation.
#[cfg(test)]
mod real_data_tests {
    use super::*;

    /// Width in bytes of one serialised machine word in the synthetic images.
    const WORD: usize = 8;

    /// Serialise `words` as consecutive 8-byte little-endian values.
    ///
    /// Each word is widened to `u64` before encoding so the image always has
    /// `words.len() * 8` bytes. Copying `usize::to_le_bytes()` straight into a
    /// fixed 8-byte window would panic in `copy_from_slice` on any target
    /// where `usize` is not 64 bits wide.
    fn words_to_memory(words: &[usize]) -> Vec<u8> {
        let mut memory = Vec::with_capacity(words.len() * WORD);
        for &word in words {
            // Lossless: `usize` is never wider than 64 bits on supported targets.
            memory.extend_from_slice(&(word as u64).to_le_bytes());
        }
        memory
    }

    /// Get memory representation of a real Vec.
    ///
    /// Returns a 24-byte image holding the data pointer, length and capacity
    /// of `v`, in that order.
    fn vec_to_memory<T>(v: &Vec<T>) -> Vec<u8> {
        words_to_memory(&[v.as_ptr() as usize, v.len(), v.capacity()])
    }

    /// Get memory representation of a real String.
    ///
    /// Returns a 24-byte image holding the data pointer, length and capacity
    /// of `s`, in that order.
    fn string_to_memory(s: &String) -> Vec<u8> {
        words_to_memory(&[s.as_ptr() as usize, s.len(), s.capacity()])
    }

    /// Get memory representation of a Box.
    ///
    /// Returns an 8-byte image holding the address of the referenced value.
    fn box_to_memory<T>(b: &T) -> Vec<u8> {
        words_to_memory(&[b as *const T as usize])
    }

    /// The header of a `Vec<i32>` must be reported as `Vec` or `String`.
    #[test]
    fn test_real_vec_i32() {
        let v = vec![1i32, 2, 3, 4, 5];
        let memory = vec_to_memory(&v);
        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);

        // The image is a (ptr, len, cap) triple, exactly what `string_to_memory`
        // produces for a String, so both answers are accepted.
        assert!(
            matches!(guess.kind, TypeKind::Vec | TypeKind::String),
            "Got {:?}",
            guess.kind
        );
    }

    /// The header of a `Vec<u8>` must be reported as `Vec` or `String`.
    #[test]
    fn test_real_vec_u8() {
        let v = vec![1u8, 2, 3, 4, 5, 6, 7, 8];
        let memory = vec_to_memory(&v);
        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);

        // Same (ptr, len, cap) image shape as a String header.
        assert!(
            matches!(guess.kind, TypeKind::Vec | TypeKind::String),
            "Got {:?}",
            guess.kind
        );
    }

    /// The header of a `String` built with `String::from` must be reported as
    /// `String` or `Vec`.
    #[test]
    fn test_real_string() {
        let s = String::from("Hello, World!");
        let memory = string_to_memory(&s);
        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);

        assert!(
            matches!(guess.kind, TypeKind::String | TypeKind::Vec),
            "Got {:?}",
            guess.kind
        );
    }

    /// The header of a `String` with much more capacity than length must be
    /// reported as `Vec` or `String`.
    #[test]
    fn test_real_string_with_capacity() {
        let mut s = String::with_capacity(100);
        s.push_str("Hello");
        let memory = string_to_memory(&s);
        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 24);

        // len = 5 with cap >= 100; either classification is accepted.
        assert!(
            matches!(guess.kind, TypeKind::Vec | TypeKind::String),
            "Got {:?}",
            guess.kind
        );
    }

    /// A single 8-byte heap address must be reported as one of `Pointer`,
    /// `Vec`, `String` or `CString`.
    #[test]
    fn test_real_box_i32() {
        let b = Box::new(42i32);
        let memory = box_to_memory(&*b);
        let guess = UnsafeInferenceEngine::infer_from_bytes(&memory, 8);

        // Box and Pointer have same size=8 layout
        // Note: CString is also possible if the bytes happen to look like a valid C string
        assert!(
            matches!(
                guess.kind,
                TypeKind::Pointer | TypeKind::Vec | TypeKind::String | TypeKind::CString
            ),
            "Got {:?}",
            guess.kind
        );
    }

    /// The raw bytes of an ASCII sentence must be reported as `String` with a
    /// confidence of at least 90.
    #[test]
    fn test_real_string_content() {
        let s = "Hello, World! This is a test string for type inference.";
        let guess = UnsafeInferenceEngine::infer_from_bytes(s.as_bytes(), s.len());

        assert_eq!(guess.kind, TypeKind::String);
        assert!(guess.confidence >= 90, "Got {:?}", guess.confidence);
    }

    /// The bytes of a real `CString`, including its NUL terminator, must be
    /// reported as `CString` with a confidence of at least 70.
    #[test]
    fn test_real_cstring_content() {
        let cstr = std::ffi::CString::new("Hello, C World!").unwrap();
        let bytes = cstr.as_bytes_with_nul();
        let guess = UnsafeInferenceEngine::infer_from_bytes(bytes, bytes.len());

        assert_eq!(guess.kind, TypeKind::CString);
        assert!(guess.confidence >= 70, "Got {:?}", guess.confidence);
    }

    /// 256 bytes containing every byte value exactly once must be reported as
    /// `Buffer` or `Vec`.
    #[test]
    fn test_real_binary_data() {
        let binary: Vec<u8> = (0..=255).collect();
        let guess = UnsafeInferenceEngine::infer_from_bytes(&binary, binary.len());

        // A uniform byte distribution, i.e. maximal byte-level entropy.
        assert!(
            matches!(guess.kind, TypeKind::Buffer | TypeKind::Vec),
            "Got {:?}",
            guess.kind
        );
    }

    /// 1024 zero bytes must be reported as `Buffer` or `CStruct`.
    #[test]
    fn test_real_zero_filled() {
        let zeros = vec![0u8; 1024];
        let guess = UnsafeInferenceEngine::infer_from_bytes(&zeros, 1024);

        assert!(
            matches!(guess.kind, TypeKind::Buffer | TypeKind::CStruct),
            "Got {:?}",
            guess.kind
        );
    }

    /// A `Vec<i32>` header combined with a `Vec::push` stack frame must be
    /// reported as `Vec` with a confidence of at least 80.
    #[test]
    fn test_real_vec_with_stack_trace() {
        let v = vec![1i32, 2, 3, 4, 5];
        let memory = vec_to_memory(&v);
        let stack = vec!["alloc::vec::Vec::push".to_string()];
        let guess =
            UnsafeInferenceEngine::infer_with_context(&memory, 24, Some(&stack), None, None);

        assert_eq!(guess.kind, TypeKind::Vec);
        assert!(guess.confidence >= 80, "Got {:?}", guess.confidence);
    }

    /// A `String` header combined with a `String::push_str` stack frame must
    /// be reported as `String` or `Vec`.
    #[test]
    fn test_real_string_with_stack_trace() {
        let s = String::from("Hello");
        let memory = string_to_memory(&s);
        let stack = vec!["alloc::string::String::push_str".to_string()];
        let guess =
            UnsafeInferenceEngine::infer_with_context(&memory, 24, Some(&stack), None, None);

        assert!(
            matches!(guess.kind, TypeKind::String | TypeKind::Vec),
            "Got {:?}",
            guess.kind
        );
    }

    /// The raw bytes of a struct holding two pointers and a `u64` must be
    /// reported as anything except `FatPtr` or `CString`.
    #[test]
    #[cfg(target_os = "macos")]
    fn test_real_struct_with_pointers() {
        // `repr(C)` pins the field order so the byte image read below is
        // well defined; the default Rust representation may reorder fields.
        #[repr(C)]
        struct TestStruct {
            _ptr1: *const u8,
            _ptr2: *const u8,
            _value: u64,
        }

        let s = TestStruct {
            _ptr1: &0u8,
            _ptr2: &1u8,
            _value: 42,
        };

        // SAFETY: `s` is a live local that outlives `memory`, the range covers
        // exactly `size_of::<TestStruct>()` bytes of that single object, and
        // `u8` has alignment 1. With `repr(C)` and three 8-byte fields on a
        // 64-bit target there is no padding, so every byte is initialised.
        let memory = unsafe {
            std::slice::from_raw_parts(
                &s as *const TestStruct as *const u8,
                std::mem::size_of::<TestStruct>(),
            )
        };

        let guess = UnsafeInferenceEngine::infer_from_bytes(memory, memory.len());

        // Struct is 24 bytes (2 pointers + 1 u64), same as Vec/String layout
        // So it could be detected as Vec, String, or CStruct depending on pointer values
        assert!(
            matches!(
                guess.kind,
                TypeKind::CStruct
                    | TypeKind::Buffer
                    | TypeKind::Pointer
                    | TypeKind::Vec
                    | TypeKind::String
                    | TypeKind::Unknown
            ),
            "Got {:?}",
            guess.kind
        );
    }
}
memscope_rs/analysis/unsafe_inference/engine.rs

memscope_rs/analysis/unsafe_inference/
engine.rs