smmu 1.8.0

ARM SMMU v3 (System Memory Management Unit) implementation - Production-grade translation engine
Documentation
//! Validation tests for lock elimination and `PageEntry` packing optimizations
//!
//! This test suite verifies:
//! 1. `PagePermissions` is packed to 1 byte
//! 2. `PageEntry` maintains 16-byte size
//! 3. Lock elimination provides performance improvements
//! 4. Thread safety is maintained

#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_lossless)]
#![allow(clippy::uninlined_format_args)]

use smmu::prelude::*;
use std::mem::size_of;
use std::sync::Arc;
use std::thread;
use std::time::Instant;

#[test]
fn test_page_permissions_size() {
    // The packing optimization is only in effect if the whole permission set
    // occupies exactly one byte.
    let permissions_bytes = size_of::<PagePermissions>();
    println!("PagePermissions size: {} bytes", permissions_bytes);
    assert_eq!(
        permissions_bytes, 1,
        "PagePermissions should be packed to 1 byte"
    );
}

#[test]
fn test_page_entry_size() {
    // Largest size, in bytes, that a packed `PageEntry` is allowed to have.
    const MAX_ENTRY_BYTES: usize = 16;

    let entry_bytes = size_of::<PageEntry>();
    println!("PageEntry size: {} bytes", entry_bytes);

    // Smaller entries are accepted too; only growth past the limit is a regression.
    assert!(
        entry_bytes <= MAX_ENTRY_BYTES,
        "PageEntry should be 16 bytes or less, got {}",
        entry_bytes
    );
}

#[test]
fn test_page_permissions_bitfield_functionality() {
    // Checks the three flag accessors of `perms` against the expected values,
    // in read / write / execute order.
    let expect_flags = |perms: &PagePermissions, read: bool, write: bool, execute: bool| {
        assert_eq!(perms.read(), read);
        assert_eq!(perms.write(), write);
        assert_eq!(perms.execute(), execute);
    };

    // Read-only: only the read flag is set.
    expect_flags(&PagePermissions::read_only(), true, false, false);

    // Read-write: read and write are set, execute is not.
    let read_write = PagePermissions::read_write();
    expect_flags(&read_write, true, true, false);

    // Execute-only: only the execute flag is set.
    expect_flags(&PagePermissions::execute_only(), false, false, true);

    // Explicit constructor with every flag requested.
    expect_flags(&PagePermissions::new(true, true, true), true, true, true);

    // `allows()` must give the same answers as the individual flags.
    assert!(read_write.allows(AccessType::Read));
    assert!(read_write.allows(AccessType::Write));
    assert!(!read_write.allows(AccessType::Execute));
}

/// Benchmarks cached translations issued concurrently from 8 threads and
/// asserts that the average wall-clock cost per translation stays below 500ns.
///
/// The timed window runs from the earliest worker start to the latest worker
/// finish. Workers are released together by a barrier, so thread creation and
/// start-up cost is not counted as translation time.
#[test]
fn test_concurrent_translation_performance() {
    // Test that lock elimination improves concurrent performance
    let smmu = Arc::new(SMMU::new());

    // Setup: Create 10 streams with 10 PASIDs each
    for stream_num in 0..10 {
        let stream_id = StreamID::new(stream_num + 1).unwrap();
        smmu.configure_stream(stream_id, StreamConfig::stage1_only()).unwrap();

        for pasid_num in 0..10 {
            let pasid = PASID::new(pasid_num).unwrap();
            smmu.create_pasid(stream_id, pasid).unwrap();

            // Map 10 pages per PASID
            for page_num in 0..10 {
                let iova = IOVA::new((page_num as u64) * 0x1000).unwrap();
                let pa = PA::new(0x0010_0000 + (page_num as u64) * 0x1000).unwrap();
                smmu.map_page(
                    stream_id,
                    pasid,
                    iova,
                    pa,
                    PagePermissions::read_write(),
                    SecurityState::NonSecure,
                )
                .unwrap();
            }
        }
    }

    // Warmup: Populate TLB cache by translating each page once per stream/PASID
    // This ensures consistent results in both debug and release mode by eliminating
    // cache-miss variance from the benchmark timing.
    for stream_num in 0..10 {
        let stream_id = StreamID::new(stream_num + 1).unwrap();
        for pasid_num in 0..10 {
            let pasid = PASID::new(pasid_num).unwrap();
            for page_num in 0..10 {
                let iova = IOVA::new((page_num as u64) * 0x1000).unwrap();
                smmu.translate(stream_id, pasid, iova, AccessType::Read, SecurityState::NonSecure)
                    .expect("Warmup translation should succeed");
            }
        }
    }

    // Benchmark: Concurrent translations from 8 threads
    let num_threads = 8;
    let iterations_per_thread = 1000;

    // Every worker waits here before starting its clock, so the measured window
    // begins only once all threads exist and are ready to translate.
    let start_barrier = Arc::new(std::sync::Barrier::new(num_threads as usize));
    let mut handles = vec![];

    for thread_id in 0..num_threads {
        let smmu_clone = Arc::clone(&smmu);
        let barrier = Arc::clone(&start_barrier);
        let handle = thread::spawn(move || {
            // Nothing fallible runs before the barrier: a worker that panicked
            // earlier would leave the remaining workers blocked forever.
            barrier.wait();

            let stream_id = StreamID::new((thread_id % 10) + 1).unwrap();
            let pasid = PASID::new(thread_id % 10).unwrap();

            let started = Instant::now();
            for i in 0..iterations_per_thread {
                let page_num = i % 10;
                let iova = IOVA::new((page_num as u64) * 0x1000).unwrap();

                smmu_clone
                    .translate(stream_id, pasid, iova, AccessType::Read, SecurityState::NonSecure)
                    .expect("Translation should succeed");
            }
            (started, Instant::now())
        });
        handles.push(handle);
    }

    // Collect each worker's (start, finish) pair; a worker panic surfaces here.
    let spans: Vec<(Instant, Instant)> = handles
        .into_iter()
        .map(|handle| handle.join().unwrap())
        .collect();

    // Wall-clock window covering all workers: earliest start to latest finish.
    let earliest_start = spans
        .iter()
        .map(|(started, _)| *started)
        .min()
        .expect("at least one worker thread was spawned");
    let latest_finish = spans
        .iter()
        .map(|(_, finished)| *finished)
        .max()
        .expect("at least one worker thread was spawned");

    let duration = latest_finish.duration_since(earliest_start);
    let total_translations = num_threads * iterations_per_thread;
    let avg_latency_ns = duration.as_nanos() / total_translations as u128;

    println!("\n=== Concurrent Translation Performance ===");
    println!("Threads: {num_threads}");
    println!("Total translations: {total_translations}");
    println!("Total time: {duration:?}");
    println!("Average latency: {avg_latency_ns}ns");

    // The original target with lock elimination + TLB cache was <200ns average;
    // the enforced bound is 500ns (presumably headroom for slower hosts and
    // debug builds - confirm before tightening).
    assert!(
        avg_latency_ns < 500,
        "Average latency should be <500ns with optimizations, got {}ns",
        avg_latency_ns
    );
}

#[test]
fn test_single_thread_translation_latency() {
    // Byte distance between consecutive pages used by this test.
    const PAGE_STRIDE: u64 = 0x1000;
    // Number of distinct pages that are mapped and then cycled through.
    const MAPPED_PAGES: u64 = 100;
    // Physical address backing the first mapped page.
    const PA_BASE: u64 = 0x0020_0000;

    // One stream with a single PASID is enough for a single-threaded measurement.
    let smmu = SMMU::new();
    let stream_id = StreamID::new(1).unwrap();
    smmu.configure_stream(stream_id, StreamConfig::stage1_only())
        .unwrap();

    let pasid = PASID::new(0).unwrap();
    smmu.create_pasid(stream_id, pasid).unwrap();

    // Map every page read-write at a fixed offset from PA_BASE.
    for page in 0..MAPPED_PAGES {
        let offset = page * PAGE_STRIDE;
        smmu.map_page(
            stream_id,
            pasid,
            IOVA::new(offset).unwrap(),
            PA::new(PA_BASE + offset).unwrap(),
            PagePermissions::read_write(),
            SecurityState::NonSecure,
        )
        .unwrap();
    }

    // Warmup: touch each page once so the timed loop below runs against
    // translations that have already been performed.
    for page in 0..MAPPED_PAGES {
        let iova = IOVA::new(page * PAGE_STRIDE).unwrap();
        smmu.translate(stream_id, pasid, iova, AccessType::Read, SecurityState::NonSecure)
            .unwrap();
    }

    // Timed section: cycle through the mapped pages repeatedly.
    let iterations: u64 = 10000;
    let start = Instant::now();

    for i in 0..iterations {
        let page = i % MAPPED_PAGES;
        let iova = IOVA::new(page * PAGE_STRIDE).unwrap();
        smmu.translate(stream_id, pasid, iova, AccessType::Read, SecurityState::NonSecure)
            .unwrap();
    }

    let duration = start.elapsed();
    let avg_latency_ns = duration.as_nanos() / u128::from(iterations);

    println!("\n=== Single-Threaded Cached Translation Performance ===");
    println!("Iterations: {iterations}");
    println!("Total time: {duration:?}");
    println!("Average latency: {avg_latency_ns}ns");
    println!(
        "TLB hit rate: {:.2}%",
        smmu.get_cache_statistics().tlb_hit_rate()
    );

    // Budget: under 1µs (1000ns) per cached translation with the TLB cache,
    // lock elimination and packed PageEntry in place.
    assert!(
        avg_latency_ns < 1000,
        "Average latency should be <1000ns with all optimizations, got {avg_latency_ns}ns"
    );
}

#[test]
fn test_memory_efficiency() {
    // Maps a large number of pages, then reports the footprint implied by
    // the size of `PageEntry` and checks the per-cache-line density.
    let smmu = SMMU::new();
    let stream_id = StreamID::new(1).unwrap();
    smmu.configure_stream(stream_id, StreamConfig::stage1_only())
        .unwrap();

    let pasid = PASID::new(0).unwrap();
    smmu.create_pasid(stream_id, pasid).unwrap();

    // 10,000 consecutive 4 KiB-spaced pages, each mapped read-write.
    let num_pages: usize = 10000;
    let page_offsets = (0..num_pages).map(|page| (page * 0x1000) as u64);
    for offset in page_offsets {
        smmu.map_page(
            stream_id,
            pasid,
            IOVA::new(offset).unwrap(),
            PA::new(0x0100_0000 + offset).unwrap(),
            PagePermissions::read_write(),
            SecurityState::NonSecure,
        )
        .unwrap();
    }

    // The estimate is entry size times page count; it is not read back from the SMMU.
    let page_entry_size = size_of::<PageEntry>();
    let estimated_memory = num_pages * page_entry_size;

    println!("\n=== Memory Efficiency ===");
    println!("Pages mapped: {num_pages}");
    println!("PageEntry size: {page_entry_size} bytes");
    println!(
        "Estimated page table memory: {} bytes ({} KB)",
        estimated_memory,
        estimated_memory / 1024
    );

    let entries_per_line = 64 / page_entry_size;
    println!("Entries per 64-byte cache line: {}", entries_per_line);

    // Four or more entries per 64-byte line means each entry is at most 16 bytes.
    assert!(
        entries_per_line >= 4,
        "Should fit at least 4 PageEntry per cache line"
    );
}

#[test]
fn test_permissions_backward_compatibility() {
    // Checks that a read-only mapping is enforced at translation time:
    // reads translate, writes and instruction fetches are rejected.
    let smmu = SMMU::new();
    smmu.enable().unwrap();

    let stream_id = StreamID::new(1).unwrap();
    smmu.configure_stream(stream_id, StreamConfig::stage1_only())
        .unwrap();

    let pasid = PASID::new(0).unwrap();
    smmu.create_pasid(stream_id, pasid).unwrap();

    // Single page under test, mapped with read-only permissions.
    let iova = IOVA::new(0x1000).unwrap();
    let pa = PA::new(0x2000).unwrap();
    let read_only = PagePermissions::read_only();
    smmu.map_page(stream_id, pasid, iova, pa, read_only, SecurityState::NonSecure)
        .unwrap();

    // A read is covered by the mapping's permissions.
    let read_ok = smmu
        .translate(stream_id, pasid, iova, AccessType::Read, SecurityState::NonSecure)
        .is_ok();
    assert!(read_ok);

    // A write is not covered and must be rejected.
    let write_rejected = smmu
        .translate(stream_id, pasid, iova, AccessType::Write, SecurityState::NonSecure)
        .is_err();
    assert!(write_rejected);

    // An instruction fetch is not covered either.
    let execute_rejected = smmu
        .translate(stream_id, pasid, iova, AccessType::Execute, SecurityState::NonSecure)
        .is_err();
    assert!(execute_rejected);
}

#[test]
fn test_cache_line_efficiency() {
    // Density check: how many `PageEntry` values fit into one cache line.
    // 64 bytes is the line size this test assumes (standard on x86-64).
    let cache_line_size: usize = 64;
    let page_entry_size: usize = size_of::<PageEntry>();
    let entries_per_line: usize = cache_line_size / page_entry_size;

    println!("\n=== Cache Line Efficiency ===");
    println!("PageEntry size: {page_entry_size} bytes");
    println!("Cache line size: {cache_line_size} bytes");
    println!("Entries per cache line: {entries_per_line}");

    // A 16-byte entry gives 4 per line. The earlier 32-byte layout only fit 2,
    // so meeting this bound corresponds to a 2x density improvement.
    assert!(
        entries_per_line >= 4,
        "Expected at least 4 entries per cache line, got {entries_per_line}"
    );
}