Skip to main content

tibet_store_mmu/
lib.rs

1use libc::{mmap, munmap, MAP_ANONYMOUS, MAP_PRIVATE, PROT_READ, PROT_WRITE, sysconf, _SC_PAGESIZE};
2use std::ptr;
3use std::sync::atomic::{AtomicU64, AtomicBool, Ordering};
4use std::sync::Arc;
5use std::thread::{self, JoinHandle};
6use std::time::Instant;
7use userfaultfd::{Uffd, UffdBuilder, Event};
8
9// Bifurcation — encrypt-by-default
10use tibet_trust_kernel::bifurcation::{
11    AirlockBifurcation, BifurcationResult, ClearanceLevel, EncryptedBlock, JisClaim,
12};
13
// TIBET-Store MMU — transparent memory virtualization
//
// Gemini's proven concept, refined into a measurable library:
//   - mmap: virtual RAM without physical backing
//   - userfaultfd: page-fault trap without SIGSEGV
//   - Archivist ("Archivaris") thread: fault → fetch .tza → decompress → inject → resume
//
// Three basic modes (the encrypted variants are described on `FillMode`):
//   - ZeroFill: inject a zero page (fastest, for virgin memory)
//   - StaticData: inject a fixed payload (Redis simulation)
//   - CompressedRestore: simulate .tza decompress + inject (production path)
25
26// ═══════════════════════════════════════════════════════════════
27// Public Types
28// ═══════════════════════════════════════════════════════════════
29
30/// Configuration for the MMU illusion.
31#[derive(Debug, Clone)]
32pub struct MmuConfig {
33    /// Total virtual arena size in bytes (must be page-aligned)
34    pub arena_size: usize,
35    /// What to inject on page fault
36    pub fill_mode: FillMode,
37    /// Use HugePages (2MB) instead of normal pages (4KB).
38    /// Reduces TLB pressure by ~512x. Requires:
39    ///   sudo sysctl vm.nr_hugepages=N
40    /// where N >= arena_size / 2MB.
41    pub use_hugepages: bool,
42}
43
44impl MmuConfig {
45    /// Create config with normal 4KB pages.
46    pub fn normal(arena_size: usize, fill_mode: FillMode) -> Self {
47        Self { arena_size, fill_mode, use_hugepages: false }
48    }
49
50    /// Create config with 2MB HugePages (requires kernel allocation).
51    pub fn hugepages(arena_size: usize, fill_mode: FillMode) -> Self {
52        Self { arena_size, fill_mode, use_hugepages: true }
53    }
54}
55
56impl Default for MmuConfig {
57    fn default() -> Self {
58        Self {
59            arena_size: 0,
60            fill_mode: FillMode::ZeroFill,
61            use_hugepages: false,
62        }
63    }
64}
65
66/// What to inject when a page fault occurs.
67#[derive(Debug, Clone)]
68pub enum FillMode {
69    /// Zero page (fastest — no data copy needed)
70    ZeroFill,
71    /// Static payload (copy same data into every page)
72    StaticData { payload: Vec<u8> },
73    /// Simulated .tza restore (zstd decompress simulation)
74    CompressedRestore,
75    /// Encrypted restore via Airlock Bifurcation
76    /// Page data is AES-256-GCM sealed. On fault: open(block, claim) → plaintext → inject.
77    /// Geen JIS claim = dood materiaal (zero page).
78    EncryptedRestore {
79        /// Pre-sealed blocks, indexed by page number
80        sealed_pages: Vec<EncryptedBlock>,
81        /// JIS claim for decryption — identity IS the key
82        claim: JisClaim,
83        /// Clearance level used for sealing
84        clearance: ClearanceLevel,
85    },
86    /// Compressed + Encrypted restore — de productie-modus.
87    ///
88    /// plaintext → zstd compress → AES-256-GCM seal → stored
89    /// On fault: open → zstd decompress → inject
90    ///
91    /// Kleiner EN veiliger. Compressie reduceert I/O, encryptie beschermt data.
92    /// Netto effect: sneller dan raw plaintext voor compressible data.
93    CompressedEncryptedRestore {
94        /// Pre-compressed + sealed blocks, indexed by page number
95        sealed_pages: Vec<EncryptedBlock>,
96        /// Original (uncompressed) page sizes for verification
97        original_sizes: Vec<usize>,
98        /// JIS claim for decryption
99        claim: JisClaim,
100        /// Clearance level
101        clearance: ClearanceLevel,
102    },
103}
104
/// Snapshot of the fault handler's counters plus the arena geometry.
#[derive(Debug, Clone)]
pub struct MmuStats {
    /// Page-fault events the handler has received.
    pub pages_faulted: u64,
    /// Faults for which the page injection succeeded.
    pub pages_injected: u64,
    /// Faults for which the page injection failed.
    pub inject_errors: u64,
    /// Bytes injected so far (one full page per successful injection).
    pub total_bytes_injected: u64,
    /// Page size used by the arena, in bytes.
    pub page_size: usize,
    /// Arena size in bytes (already rounded up to a page multiple).
    pub arena_size: usize,
    /// Number of pages in the arena (`arena_size / page_size`).
    pub arena_pages: usize,
}
116
117/// Result of running the MMU illusion.
118#[derive(Debug)]
119pub struct MmuResult {
120    pub stats: MmuStats,
121    pub elapsed: std::time::Duration,
122    pub fault_latencies_ns: Vec<u64>,
123}
124
125// ═══════════════════════════════════════════════════════════════
126// MMU Arena — The core abstraction
127// ═══════════════════════════════════════════════════════════════
128
129/// An MMU-trapped virtual memory arena.
130///
131/// The arena is a region of virtual memory with no physical backing.
132/// When any thread touches an address in the arena, userfaultfd catches
133/// the page fault and the handler thread injects the requested data.
134pub struct MmuArena {
135    /// Base address of the mmap'd region
136    addr: *mut libc::c_void,
137    /// Arena size in bytes
138    size: usize,
139    /// System page size
140    page_size: usize,
141    /// Stats counters
142    pages_faulted: Arc<AtomicU64>,
143    pages_injected: Arc<AtomicU64>,
144    inject_errors: Arc<AtomicU64>,
145    bytes_injected: Arc<AtomicU64>,
146    /// Handler thread alive flag
147    handler_active: Arc<AtomicBool>,
148    /// Handler thread join handle
149    handler_thread: Option<JoinHandle<Vec<u64>>>,
150}
151
152impl MmuArena {
153    /// Create a new MMU arena.
154    ///
155    /// This:
156    ///   1. Gets the system page size
157    ///   2. mmap's a virtual region (MAP_ANONYMOUS, no physical backing)
158    ///   3. Creates a userfaultfd and registers the region
159    ///   4. Spawns a handler thread that listens for page faults
160    ///
161    /// Returns None if userfaultfd is not available (needs root or CAP_SYS_PTRACE).
162    pub fn new(config: MmuConfig) -> Option<Self> {
163        let use_hugepages = config.use_hugepages;
164        let base_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize };
165
166        // HugePages: 2MB alignment, normal: 4KB alignment
167        let page_size = if use_hugepages { 2 * 1024 * 1024 } else { base_page_size };
168
169        // Align arena size to page boundary
170        let size = (config.arena_size + page_size - 1) & !(page_size - 1);
171
172        // Step 1: Allocate virtual memory
173        // HugePages: MAP_HUGETLB eliminates TLB thrashing for large arenas
174        // 18.5GB GGUF: 4.8M normal pages vs 9375 huge pages (512x less TLB pressure)
175        let mmap_flags = if use_hugepages {
176            MAP_PRIVATE | MAP_ANONYMOUS | libc::MAP_HUGETLB
177        } else {
178            MAP_PRIVATE | MAP_ANONYMOUS
179        };
180
181        let addr = unsafe {
182            mmap(
183                ptr::null_mut(),
184                size,
185                PROT_READ | PROT_WRITE,
186                mmap_flags,
187                -1,
188                0,
189            )
190        };
191        if addr == libc::MAP_FAILED {
192            return None;
193        }
194
195        // Step 2: Create userfaultfd (non-blocking so handler can check active flag)
196        let uffd = match UffdBuilder::new()
197            .close_on_exec(true)
198            .non_blocking(true)
199            .user_mode_only(true)
200            .create()
201        {
202            Ok(u) => u,
203            Err(_) => {
204                // Clean up mmap
205                unsafe { munmap(addr, size); }
206                return None;
207            }
208        };
209
210        // Step 3: Register the arena
211        if uffd.register(addr, size).is_err() {
212            unsafe { munmap(addr, size); }
213            return None;
214        }
215
216        // Step 4: Spawn handler thread
217        let pages_faulted = Arc::new(AtomicU64::new(0));
218        let pages_injected = Arc::new(AtomicU64::new(0));
219        let inject_errors = Arc::new(AtomicU64::new(0));
220        let bytes_injected = Arc::new(AtomicU64::new(0));
221        let handler_active = Arc::new(AtomicBool::new(true));
222
223        let pf = pages_faulted.clone();
224        let pi = pages_injected.clone();
225        let ie = inject_errors.clone();
226        let bi = bytes_injected.clone();
227        let ha = handler_active.clone();
228        let handler_ready = Arc::new(AtomicBool::new(false));
229        let hr = handler_ready.clone();
230        let fill_mode = config.fill_mode;
231
232        // Clone arena base address for page index calculation in handler
233        let arena_base = addr as usize;
234
235        let handler_thread = thread::spawn(move || {
236            let mut latencies: Vec<u64> = Vec::new();
237
238            // Archivaris engine — lives in handler thread, owns the decryption keys
239            let mut engine = AirlockBifurcation::new();
240
241            // Signal that handler is ready to receive faults
242            hr.store(true, Ordering::Release);
243
244            loop {
245                if !ha.load(Ordering::Relaxed) {
246                    break;
247                }
248
249                match uffd.read_event() {
250                    Ok(None) => {
251                        // Non-blocking: no event yet, brief yield and retry
252                        thread::yield_now();
253                        continue;
254                    }
255                    Err(_) => {
256                        // Non-blocking: nothing pending or uffd error, brief yield
257                        thread::yield_now();
258                        continue;
259                    }
260                    Ok(Some(Event::Pagefault { addr: fault_addr, .. })) => {
261                        let t0 = Instant::now();
262                        let aligned = (fault_addr as usize / page_size) * page_size;
263                        let page_index = (aligned - arena_base) / page_size;
264
265                        pf.fetch_add(1, Ordering::Relaxed);
266
267                        // Build injection data based on fill mode
268                        let data = match &fill_mode {
269                            FillMode::ZeroFill => {
270                                vec![0u8; page_size]
271                            }
272                            FillMode::StaticData { payload } => {
273                                let mut page = vec![0u8; page_size];
274                                let copy_len = payload.len().min(page_size);
275                                page[..copy_len].copy_from_slice(&payload[..copy_len]);
276                                page
277                            }
278                            FillMode::CompressedRestore => {
279                                let mut page = vec![0u8; page_size];
280                                let marker = format!("TZA_RESTORED:page@{:#x}", aligned);
281                                let marker_bytes = marker.as_bytes();
282                                page[..marker_bytes.len()].copy_from_slice(marker_bytes);
283                                page
284                            }
285                            FillMode::EncryptedRestore { sealed_pages, claim, .. } => {
286                                // ═══════════════════════════════════════════
287                                // SPACESHUTTLE: Encrypted Page Fault Handler
288                                //
289                                // Page fault → lookup sealed block → bifurcation.open()
290                                //   → JIS clearance check → AES-256-GCM decrypt
291                                //   → plaintext → inject in page → app resumes
292                                //
293                                // Geen JIS claim = dood materiaal (zero page)
294                                // Identity IS the memory.
295                                // ═══════════════════════════════════════════
296                                if page_index < sealed_pages.len() {
297                                    match engine.open(&sealed_pages[page_index], claim) {
298                                        BifurcationResult::Opened { plaintext, .. } => {
299                                            let mut page = vec![0u8; page_size];
300                                            let copy_len = plaintext.len().min(page_size);
301                                            page[..copy_len].copy_from_slice(&plaintext[..copy_len]);
302                                            page
303                                        }
304                                        BifurcationResult::AccessDenied { .. } => {
305                                            vec![0u8; page_size]
306                                        }
307                                        _ => {
308                                            vec![0u8; page_size]
309                                        }
310                                    }
311                                } else {
312                                    vec![0u8; page_size]
313                                }
314                            }
315                            FillMode::CompressedEncryptedRestore { sealed_pages, claim, .. } => {
316                                // ═══════════════════════════════════════════
317                                // SPACESHUTTLE v2: Compressed + Encrypted
318                                //
319                                // Page fault → open(block) → AES-256-GCM decrypt
320                                //   → zstd decompress → full page → inject
321                                //
322                                // Stored: ~1-2KB per 4KB page (compressible data)
323                                // Decrypted: on-demand, per page fault
324                                // Net effect: less I/O, less bandwidth, same security
325                                // ═══════════════════════════════════════════
326                                if page_index < sealed_pages.len() {
327                                    match engine.open(&sealed_pages[page_index], claim) {
328                                        BifurcationResult::Opened { plaintext, .. } => {
329                                            // plaintext = zstd compressed data → decompress
330                                            match zstd::decode_all(plaintext.as_slice()) {
331                                                Ok(decompressed) => {
332                                                    let mut page = vec![0u8; page_size];
333                                                    let copy_len = decompressed.len().min(page_size);
334                                                    page[..copy_len].copy_from_slice(&decompressed[..copy_len]);
335                                                    page
336                                                }
337                                                Err(_) => {
338                                                    // Decompressie mislukt — dood materiaal
339                                                    vec![0u8; page_size]
340                                                }
341                                            }
342                                        }
343                                        BifurcationResult::AccessDenied { .. } => {
344                                            vec![0u8; page_size]
345                                        }
346                                        _ => {
347                                            vec![0u8; page_size]
348                                        }
349                                    }
350                                } else {
351                                    vec![0u8; page_size]
352                                }
353                            }
354                        };
355
356                        // THE MAGIC: inject data into the faulting page
357                        let result = unsafe {
358                            uffd.copy(
359                                data.as_ptr() as *const _,
360                                aligned as *mut _,
361                                page_size,
362                                true, // wake the blocked thread
363                            )
364                        };
365
366                        match result {
367                            Ok(_) => {
368                                pi.fetch_add(1, Ordering::Relaxed);
369                                bi.fetch_add(page_size as u64, Ordering::Relaxed);
370                            }
371                            Err(_) => {
372                                ie.fetch_add(1, Ordering::Relaxed);
373                            }
374                        }
375
376                        let latency_ns = t0.elapsed().as_nanos() as u64;
377                        latencies.push(latency_ns);
378                    }
379                    Ok(None) => {
380                        // No event (shouldn't happen in blocking mode)
381                        break;
382                    }
383                    Ok(Some(_)) => {
384                        // Other event (fork, remap, etc.) — ignore
385                    }
386                    Err(_) => {
387                        // UFFD closed or error
388                        break;
389                    }
390                }
391            }
392
393            latencies
394        });
395
396        // Wait for handler thread to be ready before returning
397        while !handler_ready.load(Ordering::Acquire) {
398            thread::yield_now();
399        }
400        // Give handler time to enter read_event() blocking call
401        // Without this, the first page fault can arrive before the handler is listening
402        thread::sleep(std::time::Duration::from_millis(5));
403
404        Some(Self {
405            addr,
406            size,
407            page_size,
408            pages_faulted,
409            pages_injected,
410            inject_errors,
411            bytes_injected,
412            handler_active,
413            handler_thread: Some(handler_thread),
414        })
415    }
416
417    /// Get the base address of the arena (for reading/writing).
418    pub fn addr(&self) -> *mut libc::c_void {
419        self.addr
420    }
421
422    /// Get the arena size.
423    pub fn size(&self) -> usize {
424        self.size
425    }
426
427    /// Get the system page size.
428    pub fn page_size(&self) -> usize {
429        self.page_size
430    }
431
432    /// Number of pages in the arena.
433    pub fn page_count(&self) -> usize {
434        self.size / self.page_size
435    }
436
437    /// Read a byte from offset (will trigger page fault if page not yet loaded).
438    ///
439    /// # Safety
440    /// Offset must be within arena bounds.
441    pub unsafe fn read_byte(&self, offset: usize) -> u8 {
442        let ptr = (self.addr as *const u8).add(offset);
443        ptr::read_volatile(ptr)
444    }
445
446    /// Read a slice from offset (may trigger multiple page faults).
447    ///
448    /// # Safety
449    /// Range must be within arena bounds.
450    pub unsafe fn read_slice(&self, offset: usize, len: usize) -> Vec<u8> {
451        let ptr = (self.addr as *const u8).add(offset);
452        let slice = std::slice::from_raw_parts(ptr, len);
453        slice.to_vec()
454    }
455
456    /// Get current stats (non-blocking).
457    pub fn stats(&self) -> MmuStats {
458        MmuStats {
459            pages_faulted: self.pages_faulted.load(Ordering::Relaxed),
460            pages_injected: self.pages_injected.load(Ordering::Relaxed),
461            inject_errors: self.inject_errors.load(Ordering::Relaxed),
462            total_bytes_injected: self.bytes_injected.load(Ordering::Relaxed),
463            page_size: self.page_size,
464            arena_size: self.size,
465            arena_pages: self.size / self.page_size,
466        }
467    }
468
469    /// Shut down the handler and collect latency data.
470    pub fn shutdown(mut self) -> MmuResult {
471        let t0 = Instant::now();
472        // Signal handler to stop (non-blocking handler checks this flag)
473        self.handler_active.store(false, Ordering::Release);
474
475        let latencies = if let Some(handle) = self.handler_thread.take() {
476            handle.join().unwrap_or_default()
477        } else {
478            Vec::new()
479        };
480
481        // Clean up mmap
482        if !self.addr.is_null() {
483            unsafe { munmap(self.addr, self.size); }
484            self.addr = ptr::null_mut();
485        }
486
487        let stats = MmuStats {
488            pages_faulted: self.pages_faulted.load(Ordering::Relaxed),
489            pages_injected: self.pages_injected.load(Ordering::Relaxed),
490            inject_errors: self.inject_errors.load(Ordering::Relaxed),
491            total_bytes_injected: self.bytes_injected.load(Ordering::Relaxed),
492            page_size: self.page_size,
493            arena_size: self.size,
494            arena_pages: self.size / self.page_size,
495        };
496
497        MmuResult {
498            stats,
499            elapsed: t0.elapsed(),
500            fault_latencies_ns: latencies,
501        }
502    }
503}
504
505impl Drop for MmuArena {
506    fn drop(&mut self) {
507        self.handler_active.store(false, Ordering::Release);
508        if !self.addr.is_null() {
509            unsafe { munmap(self.addr, self.size); }
510        }
511        // Note: handler thread will exit when uffd read fails after munmap
512    }
513}
514
515// ═══════════════════════════════════════════════════════════════
516// Helper: compute percentiles from sorted latency data
517// ═══════════════════════════════════════════════════════════════
518
/// Pick the `pct`-th percentile from latency data sorted in ascending order.
///
/// The rank is `len * pct / 100`, truncated and clamped to the last element.
/// An empty slice yields 0.
pub fn percentile(sorted: &[u64], pct: f64) -> u64 {
    match sorted.last() {
        None => 0,
        Some(&largest) => {
            let rank = (sorted.len() as f64 * pct / 100.0) as usize;
            // A rank past the end falls back to the largest sample.
            sorted.get(rank).copied().unwrap_or(largest)
        }
    }
}
524
525/// Quick check: is userfaultfd available on this system?
526pub fn userfaultfd_available() -> bool {
527    match UffdBuilder::new()
528        .close_on_exec(true)
529        .non_blocking(true)
530        .user_mode_only(true)
531        .create()
532    {
533        Ok(_) => true,
534        Err(_) => false,
535    }
536}
537
538/// Pre-seal page data into encrypted blocks for EncryptedRestore mode.
539///
540/// Takes a slice of page-sized plaintext buffers and seals each one
541/// using session keys (fast path: HKDF+AES only after first DH).
542///
543/// Returns the sealed blocks ready for the page fault handler.
544pub fn seal_pages(
545    pages: &[Vec<u8>],
546    clearance: ClearanceLevel,
547    source: &str,
548) -> Vec<EncryptedBlock> {
549    let mut engine = AirlockBifurcation::new();
550    let mut blocks = Vec::with_capacity(pages.len());
551    for (i, page_data) in pages.iter().enumerate() {
552        if let BifurcationResult::Sealed { block, .. } =
553            engine.seal_session(page_data, i, clearance.clone(), source)
554        {
555            blocks.push(block);
556        }
557    }
558    blocks
559}
560
561/// Compressed seal result with storage statistics.
562pub struct CompressedSealResult {
563    pub blocks: Vec<EncryptedBlock>,
564    pub original_sizes: Vec<usize>,
565    pub total_original: usize,
566    pub total_compressed: usize,
567    pub total_encrypted: usize,
568    pub compression_ratio: f64,
569}
570
571/// Pre-compress + seal page data for CompressedEncryptedRestore mode.
572///
573/// Pipeline per page: plaintext → zstd (level 3) → AES-256-GCM seal
574///
575/// Returns sealed blocks + compression statistics.
576pub fn seal_pages_compressed(
577    pages: &[Vec<u8>],
578    clearance: ClearanceLevel,
579    source: &str,
580    zstd_level: i32,
581) -> CompressedSealResult {
582    let mut engine = AirlockBifurcation::new();
583    let mut blocks = Vec::with_capacity(pages.len());
584    let mut original_sizes = Vec::with_capacity(pages.len());
585    let mut total_original = 0usize;
586    let mut total_compressed = 0usize;
587    let mut total_encrypted = 0usize;
588
589    for (i, page_data) in pages.iter().enumerate() {
590        let original_size = page_data.len();
591        total_original += original_size;
592
593        // Step 1: zstd compress
594        let compressed = zstd::encode_all(page_data.as_slice(), zstd_level)
595            .unwrap_or_else(|_| page_data.clone()); // fallback to raw on compress failure
596        total_compressed += compressed.len();
597
598        // Step 2: AES-256-GCM seal the compressed data
599        if let BifurcationResult::Sealed { block, .. } =
600            engine.seal_session(&compressed, i, clearance.clone(), source)
601        {
602            total_encrypted += block.ciphertext.len();
603            blocks.push(block);
604        }
605
606        original_sizes.push(original_size);
607    }
608
609    let compression_ratio = if total_compressed > 0 {
610        total_original as f64 / total_compressed as f64
611    } else {
612        1.0
613    };
614
615    CompressedSealResult {
616        blocks,
617        original_sizes,
618        total_original,
619        total_compressed,
620        total_encrypted,
621        compression_ratio,
622    }
623}
624
625/// Create a JIS claim for MMU access.
626pub fn mmu_claim(identity: &str, clearance: ClearanceLevel) -> JisClaim {
627    JisClaim {
628        identity: identity.to_string(),
629        ed25519_pub: "a".repeat(64), // Placeholder — real impl uses actual key
630        clearance,
631        role: "operator".to_string(),
632        dept: "kernel".to_string(),
633        claimed_at: "2026-04-15T00:00:00Z".to_string(),
634        signature: "mmu_sig".to_string(),
635    }
636}
637
/// Render a nanosecond count with a human-friendly unit.
///
/// Below 1 µs the value is printed as whole nanoseconds, below 1 ms as
/// microseconds with one decimal, otherwise as milliseconds with two decimals.
pub fn format_ns(ns: u64) -> String {
    match ns {
        0..=999 => format!("{}ns", ns),
        1_000..=999_999 => format!("{:.1}µs", ns as f64 / 1_000.0),
        _ => format!("{:.2}ms", ns as f64 / 1_000_000.0),
    }
}