tibet_store_mmu/lib.rs
1use libc::{mmap, munmap, MAP_ANONYMOUS, MAP_PRIVATE, PROT_READ, PROT_WRITE, sysconf, _SC_PAGESIZE};
2use std::ptr;
3use std::sync::atomic::{AtomicU64, AtomicBool, Ordering};
4use std::sync::Arc;
5use std::thread::{self, JoinHandle};
6use std::time::Instant;
7use userfaultfd::{Uffd, UffdBuilder, Event};
8
9// Bifurcation — encrypt-by-default
10use tibet_trust_kernel::bifurcation::{
11 AirlockBifurcation, BifurcationResult, ClearanceLevel, EncryptedBlock, JisClaim,
12};
13
/// TIBET-Store MMU — transparent memory virtualisation.
///
/// Gemini's proven concept, refined into a measurable library:
/// - mmap: virtual RAM without physical backing
/// - userfaultfd: page-fault trap without SIGSEGV
/// - Archivist ("Archivaris") thread: fault → fetch .tza → decompress → inject → resume
///
/// Three basic modes (see `FillMode` for the encrypted variants):
/// - ZeroFill: inject a zero page (fastest, for virgin memory)
/// - StaticData: inject a fixed payload (Redis simulation)
/// - CompressedRestore: simulate .tza decompress + inject (production path)
25
26// ═══════════════════════════════════════════════════════════════
27// Public Types
28// ═══════════════════════════════════════════════════════════════
29
30/// Configuration for the MMU illusion.
31#[derive(Debug, Clone)]
32pub struct MmuConfig {
33 /// Total virtual arena size in bytes (must be page-aligned)
34 pub arena_size: usize,
35 /// What to inject on page fault
36 pub fill_mode: FillMode,
37 /// Use HugePages (2MB) instead of normal pages (4KB).
38 /// Reduces TLB pressure by ~512x. Requires:
39 /// sudo sysctl vm.nr_hugepages=N
40 /// where N >= arena_size / 2MB.
41 pub use_hugepages: bool,
42}
43
44impl MmuConfig {
45 /// Create config with normal 4KB pages.
46 pub fn normal(arena_size: usize, fill_mode: FillMode) -> Self {
47 Self { arena_size, fill_mode, use_hugepages: false }
48 }
49
50 /// Create config with 2MB HugePages (requires kernel allocation).
51 pub fn hugepages(arena_size: usize, fill_mode: FillMode) -> Self {
52 Self { arena_size, fill_mode, use_hugepages: true }
53 }
54}
55
56impl Default for MmuConfig {
57 fn default() -> Self {
58 Self {
59 arena_size: 0,
60 fill_mode: FillMode::ZeroFill,
61 use_hugepages: false,
62 }
63 }
64}
65
66/// What to inject when a page fault occurs.
67#[derive(Debug, Clone)]
68pub enum FillMode {
69 /// Zero page (fastest — no data copy needed)
70 ZeroFill,
71 /// Static payload (copy same data into every page)
72 StaticData { payload: Vec<u8> },
73 /// Simulated .tza restore (zstd decompress simulation)
74 CompressedRestore,
75 /// Encrypted restore via Airlock Bifurcation
76 /// Page data is AES-256-GCM sealed. On fault: open(block, claim) → plaintext → inject.
77 /// Geen JIS claim = dood materiaal (zero page).
78 EncryptedRestore {
79 /// Pre-sealed blocks, indexed by page number
80 sealed_pages: Vec<EncryptedBlock>,
81 /// JIS claim for decryption — identity IS the key
82 claim: JisClaim,
83 /// Clearance level used for sealing
84 clearance: ClearanceLevel,
85 },
86 /// Compressed + Encrypted restore — de productie-modus.
87 ///
88 /// plaintext → zstd compress → AES-256-GCM seal → stored
89 /// On fault: open → zstd decompress → inject
90 ///
91 /// Kleiner EN veiliger. Compressie reduceert I/O, encryptie beschermt data.
92 /// Netto effect: sneller dan raw plaintext voor compressible data.
93 CompressedEncryptedRestore {
94 /// Pre-compressed + sealed blocks, indexed by page number
95 sealed_pages: Vec<EncryptedBlock>,
96 /// Original (uncompressed) page sizes for verification
97 original_sizes: Vec<usize>,
98 /// JIS claim for decryption
99 claim: JisClaim,
100 /// Clearance level
101 clearance: ClearanceLevel,
102 },
103}
104
/// Snapshot of the fault handler's counters plus the arena geometry.
#[derive(Clone, Debug)]
pub struct MmuStats {
    /// Page-fault events the handler has received.
    pub pages_faulted: u64,
    /// Faults for which the page injection succeeded.
    pub pages_injected: u64,
    /// Faults for which the page injection returned an error.
    pub inject_errors: u64,
    /// Sum of the sizes of all successfully injected pages, in bytes.
    pub total_bytes_injected: u64,
    /// Page size used by the arena, in bytes.
    pub page_size: usize,
    /// Arena size in bytes.
    pub arena_size: usize,
    /// Arena size expressed in pages (`arena_size / page_size`).
    pub arena_pages: usize,
}
116
117/// Result of running the MMU illusion.
118#[derive(Debug)]
119pub struct MmuResult {
120 pub stats: MmuStats,
121 pub elapsed: std::time::Duration,
122 pub fault_latencies_ns: Vec<u64>,
123}
124
125// ═══════════════════════════════════════════════════════════════
126// MMU Arena — The core abstraction
127// ═══════════════════════════════════════════════════════════════
128
129/// An MMU-trapped virtual memory arena.
130///
131/// The arena is a region of virtual memory with no physical backing.
132/// When any thread touches an address in the arena, userfaultfd catches
133/// the page fault and the handler thread injects the requested data.
134pub struct MmuArena {
135 /// Base address of the mmap'd region
136 addr: *mut libc::c_void,
137 /// Arena size in bytes
138 size: usize,
139 /// System page size
140 page_size: usize,
141 /// Stats counters
142 pages_faulted: Arc<AtomicU64>,
143 pages_injected: Arc<AtomicU64>,
144 inject_errors: Arc<AtomicU64>,
145 bytes_injected: Arc<AtomicU64>,
146 /// Handler thread alive flag
147 handler_active: Arc<AtomicBool>,
148 /// Handler thread join handle
149 handler_thread: Option<JoinHandle<Vec<u64>>>,
150}
151
152impl MmuArena {
153 /// Create a new MMU arena.
154 ///
155 /// This:
156 /// 1. Gets the system page size
157 /// 2. mmap's a virtual region (MAP_ANONYMOUS, no physical backing)
158 /// 3. Creates a userfaultfd and registers the region
159 /// 4. Spawns a handler thread that listens for page faults
160 ///
161 /// Returns None if userfaultfd is not available (needs root or CAP_SYS_PTRACE).
162 pub fn new(config: MmuConfig) -> Option<Self> {
163 let use_hugepages = config.use_hugepages;
164 let base_page_size = unsafe { sysconf(_SC_PAGESIZE) as usize };
165
166 // HugePages: 2MB alignment, normal: 4KB alignment
167 let page_size = if use_hugepages { 2 * 1024 * 1024 } else { base_page_size };
168
169 // Align arena size to page boundary
170 let size = (config.arena_size + page_size - 1) & !(page_size - 1);
171
172 // Step 1: Allocate virtual memory
173 // HugePages: MAP_HUGETLB eliminates TLB thrashing for large arenas
174 // 18.5GB GGUF: 4.8M normal pages vs 9375 huge pages (512x less TLB pressure)
175 let mmap_flags = if use_hugepages {
176 MAP_PRIVATE | MAP_ANONYMOUS | libc::MAP_HUGETLB
177 } else {
178 MAP_PRIVATE | MAP_ANONYMOUS
179 };
180
181 let addr = unsafe {
182 mmap(
183 ptr::null_mut(),
184 size,
185 PROT_READ | PROT_WRITE,
186 mmap_flags,
187 -1,
188 0,
189 )
190 };
191 if addr == libc::MAP_FAILED {
192 return None;
193 }
194
195 // Step 2: Create userfaultfd (non-blocking so handler can check active flag)
196 let uffd = match UffdBuilder::new()
197 .close_on_exec(true)
198 .non_blocking(true)
199 .user_mode_only(true)
200 .create()
201 {
202 Ok(u) => u,
203 Err(_) => {
204 // Clean up mmap
205 unsafe { munmap(addr, size); }
206 return None;
207 }
208 };
209
210 // Step 3: Register the arena
211 if uffd.register(addr, size).is_err() {
212 unsafe { munmap(addr, size); }
213 return None;
214 }
215
216 // Step 4: Spawn handler thread
217 let pages_faulted = Arc::new(AtomicU64::new(0));
218 let pages_injected = Arc::new(AtomicU64::new(0));
219 let inject_errors = Arc::new(AtomicU64::new(0));
220 let bytes_injected = Arc::new(AtomicU64::new(0));
221 let handler_active = Arc::new(AtomicBool::new(true));
222
223 let pf = pages_faulted.clone();
224 let pi = pages_injected.clone();
225 let ie = inject_errors.clone();
226 let bi = bytes_injected.clone();
227 let ha = handler_active.clone();
228 let handler_ready = Arc::new(AtomicBool::new(false));
229 let hr = handler_ready.clone();
230 let fill_mode = config.fill_mode;
231
232 // Clone arena base address for page index calculation in handler
233 let arena_base = addr as usize;
234
235 let handler_thread = thread::spawn(move || {
236 let mut latencies: Vec<u64> = Vec::new();
237
238 // Archivaris engine — lives in handler thread, owns the decryption keys
239 let mut engine = AirlockBifurcation::new();
240
241 // Signal that handler is ready to receive faults
242 hr.store(true, Ordering::Release);
243
244 loop {
245 if !ha.load(Ordering::Relaxed) {
246 break;
247 }
248
249 match uffd.read_event() {
250 Ok(None) => {
251 // Non-blocking: no event yet, brief yield and retry
252 thread::yield_now();
253 continue;
254 }
255 Err(_) => {
256 // Non-blocking: nothing pending or uffd error, brief yield
257 thread::yield_now();
258 continue;
259 }
260 Ok(Some(Event::Pagefault { addr: fault_addr, .. })) => {
261 let t0 = Instant::now();
262 let aligned = (fault_addr as usize / page_size) * page_size;
263 let page_index = (aligned - arena_base) / page_size;
264
265 pf.fetch_add(1, Ordering::Relaxed);
266
267 // Build injection data based on fill mode
268 let data = match &fill_mode {
269 FillMode::ZeroFill => {
270 vec![0u8; page_size]
271 }
272 FillMode::StaticData { payload } => {
273 let mut page = vec![0u8; page_size];
274 let copy_len = payload.len().min(page_size);
275 page[..copy_len].copy_from_slice(&payload[..copy_len]);
276 page
277 }
278 FillMode::CompressedRestore => {
279 let mut page = vec![0u8; page_size];
280 let marker = format!("TZA_RESTORED:page@{:#x}", aligned);
281 let marker_bytes = marker.as_bytes();
282 page[..marker_bytes.len()].copy_from_slice(marker_bytes);
283 page
284 }
285 FillMode::EncryptedRestore { sealed_pages, claim, .. } => {
286 // ═══════════════════════════════════════════
287 // SPACESHUTTLE: Encrypted Page Fault Handler
288 //
289 // Page fault → lookup sealed block → bifurcation.open()
290 // → JIS clearance check → AES-256-GCM decrypt
291 // → plaintext → inject in page → app resumes
292 //
293 // Geen JIS claim = dood materiaal (zero page)
294 // Identity IS the memory.
295 // ═══════════════════════════════════════════
296 if page_index < sealed_pages.len() {
297 match engine.open(&sealed_pages[page_index], claim) {
298 BifurcationResult::Opened { plaintext, .. } => {
299 let mut page = vec![0u8; page_size];
300 let copy_len = plaintext.len().min(page_size);
301 page[..copy_len].copy_from_slice(&plaintext[..copy_len]);
302 page
303 }
304 BifurcationResult::AccessDenied { .. } => {
305 vec![0u8; page_size]
306 }
307 _ => {
308 vec![0u8; page_size]
309 }
310 }
311 } else {
312 vec![0u8; page_size]
313 }
314 }
315 FillMode::CompressedEncryptedRestore { sealed_pages, claim, .. } => {
316 // ═══════════════════════════════════════════
317 // SPACESHUTTLE v2: Compressed + Encrypted
318 //
319 // Page fault → open(block) → AES-256-GCM decrypt
320 // → zstd decompress → full page → inject
321 //
322 // Stored: ~1-2KB per 4KB page (compressible data)
323 // Decrypted: on-demand, per page fault
324 // Net effect: less I/O, less bandwidth, same security
325 // ═══════════════════════════════════════════
326 if page_index < sealed_pages.len() {
327 match engine.open(&sealed_pages[page_index], claim) {
328 BifurcationResult::Opened { plaintext, .. } => {
329 // plaintext = zstd compressed data → decompress
330 match zstd::decode_all(plaintext.as_slice()) {
331 Ok(decompressed) => {
332 let mut page = vec![0u8; page_size];
333 let copy_len = decompressed.len().min(page_size);
334 page[..copy_len].copy_from_slice(&decompressed[..copy_len]);
335 page
336 }
337 Err(_) => {
338 // Decompressie mislukt — dood materiaal
339 vec![0u8; page_size]
340 }
341 }
342 }
343 BifurcationResult::AccessDenied { .. } => {
344 vec![0u8; page_size]
345 }
346 _ => {
347 vec![0u8; page_size]
348 }
349 }
350 } else {
351 vec![0u8; page_size]
352 }
353 }
354 };
355
356 // THE MAGIC: inject data into the faulting page
357 let result = unsafe {
358 uffd.copy(
359 data.as_ptr() as *const _,
360 aligned as *mut _,
361 page_size,
362 true, // wake the blocked thread
363 )
364 };
365
366 match result {
367 Ok(_) => {
368 pi.fetch_add(1, Ordering::Relaxed);
369 bi.fetch_add(page_size as u64, Ordering::Relaxed);
370 }
371 Err(_) => {
372 ie.fetch_add(1, Ordering::Relaxed);
373 }
374 }
375
376 let latency_ns = t0.elapsed().as_nanos() as u64;
377 latencies.push(latency_ns);
378 }
379 Ok(None) => {
380 // No event (shouldn't happen in blocking mode)
381 break;
382 }
383 Ok(Some(_)) => {
384 // Other event (fork, remap, etc.) — ignore
385 }
386 Err(_) => {
387 // UFFD closed or error
388 break;
389 }
390 }
391 }
392
393 latencies
394 });
395
396 // Wait for handler thread to be ready before returning
397 while !handler_ready.load(Ordering::Acquire) {
398 thread::yield_now();
399 }
400 // Give handler time to enter read_event() blocking call
401 // Without this, the first page fault can arrive before the handler is listening
402 thread::sleep(std::time::Duration::from_millis(5));
403
404 Some(Self {
405 addr,
406 size,
407 page_size,
408 pages_faulted,
409 pages_injected,
410 inject_errors,
411 bytes_injected,
412 handler_active,
413 handler_thread: Some(handler_thread),
414 })
415 }
416
417 /// Get the base address of the arena (for reading/writing).
418 pub fn addr(&self) -> *mut libc::c_void {
419 self.addr
420 }
421
422 /// Get the arena size.
423 pub fn size(&self) -> usize {
424 self.size
425 }
426
427 /// Get the system page size.
428 pub fn page_size(&self) -> usize {
429 self.page_size
430 }
431
432 /// Number of pages in the arena.
433 pub fn page_count(&self) -> usize {
434 self.size / self.page_size
435 }
436
437 /// Read a byte from offset (will trigger page fault if page not yet loaded).
438 ///
439 /// # Safety
440 /// Offset must be within arena bounds.
441 pub unsafe fn read_byte(&self, offset: usize) -> u8 {
442 let ptr = (self.addr as *const u8).add(offset);
443 ptr::read_volatile(ptr)
444 }
445
446 /// Read a slice from offset (may trigger multiple page faults).
447 ///
448 /// # Safety
449 /// Range must be within arena bounds.
450 pub unsafe fn read_slice(&self, offset: usize, len: usize) -> Vec<u8> {
451 let ptr = (self.addr as *const u8).add(offset);
452 let slice = std::slice::from_raw_parts(ptr, len);
453 slice.to_vec()
454 }
455
456 /// Get current stats (non-blocking).
457 pub fn stats(&self) -> MmuStats {
458 MmuStats {
459 pages_faulted: self.pages_faulted.load(Ordering::Relaxed),
460 pages_injected: self.pages_injected.load(Ordering::Relaxed),
461 inject_errors: self.inject_errors.load(Ordering::Relaxed),
462 total_bytes_injected: self.bytes_injected.load(Ordering::Relaxed),
463 page_size: self.page_size,
464 arena_size: self.size,
465 arena_pages: self.size / self.page_size,
466 }
467 }
468
469 /// Shut down the handler and collect latency data.
470 pub fn shutdown(mut self) -> MmuResult {
471 let t0 = Instant::now();
472 // Signal handler to stop (non-blocking handler checks this flag)
473 self.handler_active.store(false, Ordering::Release);
474
475 let latencies = if let Some(handle) = self.handler_thread.take() {
476 handle.join().unwrap_or_default()
477 } else {
478 Vec::new()
479 };
480
481 // Clean up mmap
482 if !self.addr.is_null() {
483 unsafe { munmap(self.addr, self.size); }
484 self.addr = ptr::null_mut();
485 }
486
487 let stats = MmuStats {
488 pages_faulted: self.pages_faulted.load(Ordering::Relaxed),
489 pages_injected: self.pages_injected.load(Ordering::Relaxed),
490 inject_errors: self.inject_errors.load(Ordering::Relaxed),
491 total_bytes_injected: self.bytes_injected.load(Ordering::Relaxed),
492 page_size: self.page_size,
493 arena_size: self.size,
494 arena_pages: self.size / self.page_size,
495 };
496
497 MmuResult {
498 stats,
499 elapsed: t0.elapsed(),
500 fault_latencies_ns: latencies,
501 }
502 }
503}
504
505impl Drop for MmuArena {
506 fn drop(&mut self) {
507 self.handler_active.store(false, Ordering::Release);
508 if !self.addr.is_null() {
509 unsafe { munmap(self.addr, self.size); }
510 }
511 // Note: handler thread will exit when uffd read fails after munmap
512 }
513}
514
515// ═══════════════════════════════════════════════════════════════
516// Helper: compute percentiles from sorted latency data
517// ═══════════════════════════════════════════════════════════════
518
/// Pick the `pct`-th percentile from latency data that is already sorted
/// in ascending order.
///
/// The index is `floor(len * pct / 100)`, clamped to the last element.
/// An empty slice yields `0`.
pub fn percentile(sorted: &[u64], pct: f64) -> u64 {
    match sorted.len() {
        0 => 0,
        count => {
            let rank = (count as f64 * pct / 100.0) as usize;
            sorted[rank.min(count - 1)]
        }
    }
}
524
525/// Quick check: is userfaultfd available on this system?
526pub fn userfaultfd_available() -> bool {
527 match UffdBuilder::new()
528 .close_on_exec(true)
529 .non_blocking(true)
530 .user_mode_only(true)
531 .create()
532 {
533 Ok(_) => true,
534 Err(_) => false,
535 }
536}
537
538/// Pre-seal page data into encrypted blocks for EncryptedRestore mode.
539///
540/// Takes a slice of page-sized plaintext buffers and seals each one
541/// using session keys (fast path: HKDF+AES only after first DH).
542///
543/// Returns the sealed blocks ready for the page fault handler.
544pub fn seal_pages(
545 pages: &[Vec<u8>],
546 clearance: ClearanceLevel,
547 source: &str,
548) -> Vec<EncryptedBlock> {
549 let mut engine = AirlockBifurcation::new();
550 let mut blocks = Vec::with_capacity(pages.len());
551 for (i, page_data) in pages.iter().enumerate() {
552 if let BifurcationResult::Sealed { block, .. } =
553 engine.seal_session(page_data, i, clearance.clone(), source)
554 {
555 blocks.push(block);
556 }
557 }
558 blocks
559}
560
561/// Compressed seal result with storage statistics.
562pub struct CompressedSealResult {
563 pub blocks: Vec<EncryptedBlock>,
564 pub original_sizes: Vec<usize>,
565 pub total_original: usize,
566 pub total_compressed: usize,
567 pub total_encrypted: usize,
568 pub compression_ratio: f64,
569}
570
571/// Pre-compress + seal page data for CompressedEncryptedRestore mode.
572///
573/// Pipeline per page: plaintext → zstd (level 3) → AES-256-GCM seal
574///
575/// Returns sealed blocks + compression statistics.
576pub fn seal_pages_compressed(
577 pages: &[Vec<u8>],
578 clearance: ClearanceLevel,
579 source: &str,
580 zstd_level: i32,
581) -> CompressedSealResult {
582 let mut engine = AirlockBifurcation::new();
583 let mut blocks = Vec::with_capacity(pages.len());
584 let mut original_sizes = Vec::with_capacity(pages.len());
585 let mut total_original = 0usize;
586 let mut total_compressed = 0usize;
587 let mut total_encrypted = 0usize;
588
589 for (i, page_data) in pages.iter().enumerate() {
590 let original_size = page_data.len();
591 total_original += original_size;
592
593 // Step 1: zstd compress
594 let compressed = zstd::encode_all(page_data.as_slice(), zstd_level)
595 .unwrap_or_else(|_| page_data.clone()); // fallback to raw on compress failure
596 total_compressed += compressed.len();
597
598 // Step 2: AES-256-GCM seal the compressed data
599 if let BifurcationResult::Sealed { block, .. } =
600 engine.seal_session(&compressed, i, clearance.clone(), source)
601 {
602 total_encrypted += block.ciphertext.len();
603 blocks.push(block);
604 }
605
606 original_sizes.push(original_size);
607 }
608
609 let compression_ratio = if total_compressed > 0 {
610 total_original as f64 / total_compressed as f64
611 } else {
612 1.0
613 };
614
615 CompressedSealResult {
616 blocks,
617 original_sizes,
618 total_original,
619 total_compressed,
620 total_encrypted,
621 compression_ratio,
622 }
623}
624
625/// Create a JIS claim for MMU access.
626pub fn mmu_claim(identity: &str, clearance: ClearanceLevel) -> JisClaim {
627 JisClaim {
628 identity: identity.to_string(),
629 ed25519_pub: "a".repeat(64), // Placeholder — real impl uses actual key
630 clearance,
631 role: "operator".to_string(),
632 dept: "kernel".to_string(),
633 claimed_at: "2026-04-15T00:00:00Z".to_string(),
634 signature: "mmu_sig".to_string(),
635 }
636}
637
/// Render a nanosecond count with a unit that fits its magnitude:
/// plain `ns` below 1 µs, `µs` (one decimal) below 1 ms, otherwise `ms`
/// (two decimals).
pub fn format_ns(ns: u64) -> String {
    match ns {
        0..=999 => format!("{}ns", ns),
        1_000..=999_999 => format!("{:.1}µs", ns as f64 / 1_000.0),
        _ => format!("{:.2}ms", ns as f64 / 1_000_000.0),
    }
}