sandlock-core 0.8.3

Lightweight process sandbox using Landlock, seccomp-bpf, and seccomp user notification
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
use std::collections::HashMap;
use std::fs::File;
use std::fs::OpenOptions;
use std::io::{self, BufRead, BufReader, Read, Seek, SeekFrom, Write};

use crate::error::SandlockError;

/// Locate the `[vdso]` mapping of process `pid` by scanning `/proc/{pid}/maps`.
///
/// Returns `(start, length)` for the first line that ends in `[vdso]` and whose
/// leading address field has the form `start-end` (both hexadecimal).
///
/// # Errors
///
/// Propagates any I/O error from opening or reading the maps file, returns
/// `InvalidData` when either address fails to parse as hex, and `NotFound`
/// when no `[vdso]` line is present.
pub(crate) fn find_vdso_range(pid: i32) -> io::Result<(u64, u64)> {
    // Shared hex parser; `what` is "start" or "end" and only affects the message.
    let parse_hex = |text: &str, what: &str| {
        u64::from_str_radix(text, 16).map_err(|e| {
            io::Error::new(
                io::ErrorKind::InvalidData,
                format!("bad vDSO {}: {}", what, e),
            )
        })
    };

    let maps = BufReader::new(File::open(format!("/proc/{}/maps", pid))?);
    for entry in maps.lines() {
        let entry = entry?;
        if !entry.ends_with("[vdso]") {
            continue;
        }

        // Entry format: "7ffd1234000-7ffd1235000 r-xp ... [vdso]"; the address
        // range is everything before the first space.
        let range = entry.split(' ').next().unwrap_or("");
        let (lo, hi) = match range.split_once('-') {
            Some(bounds) => bounds,
            None => continue,
        };

        let start = parse_hex(lo, "start")?;
        let end = parse_hex(hi, "end")?;
        return Ok((start, end - start));
    }

    Err(io::Error::new(
        io::ErrorKind::NotFound,
        "vDSO mapping not found",
    ))
}

/// Return only the start address of the `[vdso]` mapping of process `pid`.
///
/// Thin wrapper over [`find_vdso_range`] that discards the mapping length and
/// forwards its errors unchanged.
pub(crate) fn find_vdso_base(pid: i32) -> io::Result<u64> {
    let (start, _len) = find_vdso_range(pid)?;
    Ok(start)
}

/// Read exactly `len` bytes of process `pid`'s memory starting at `addr`, via
/// `/proc/{pid}/mem`.
///
/// # Errors
///
/// Fails if the file cannot be opened, the seek fails, or fewer than `len`
/// bytes can be read from that position.
fn read_proc_mem(pid: i32, addr: u64, len: usize) -> io::Result<Vec<u8>> {
    let path = format!("/proc/{}/mem", pid);
    let mut mem = File::open(path)?;
    mem.seek(SeekFrom::Start(addr))?;

    let mut bytes = vec![0u8; len];
    mem.read_exact(&mut bytes).map(|()| bytes)
}

/// Build a `name -> st_value` table from the dynamic symbols of the ELF image
/// in `vdso_bytes`.
///
/// Symbols with a zero value, an unresolvable name, or an empty name are
/// skipped. If the bytes do not parse as ELF the result is an empty map; no
/// error is reported.
fn parse_vdso_symbols(vdso_bytes: &[u8]) -> HashMap<String, u64> {
    let elf = match goblin::elf::Elf::parse(vdso_bytes) {
        Ok(elf) => elf,
        Err(_) => return HashMap::new(),
    };

    elf.dynsyms
        .iter()
        .filter(|sym| sym.st_value != 0)
        .filter_map(|sym| {
            let name = elf.dynstrtab.get_at(sym.st_name)?;
            (!name.is_empty()).then(|| (name.to_string(), sym.st_value))
        })
        .collect()
}

/// Append one 32-bit instruction word to `stub` in little-endian byte order.
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
fn push_insn(stub: &mut Vec<u8>, insn: u32) {
    stub.extend(insn.to_le_bytes());
}

/// Encode the arm64 unconditional branch `B to` for an instruction placed at
/// address `from`.
///
/// The instruction carries a signed 26-bit word offset (`imm26`, scaled by 4),
/// giving a reach of ±128 MiB.
///
/// # Errors
///
/// Returns `SandlockError::MemoryProtect` when the byte distance is not a
/// multiple of 4, or when the word offset does not fit in 26 signed bits.
#[cfg(target_arch = "aarch64")]
fn arm64_b_insn(from: u64, to: u64) -> Result<u32, SandlockError> {
    const B_OPCODE: u32 = 0x14000000;
    const IMM26_MASK: u32 = 0x03FF_FFFF;
    const WORD_LIMIT: i64 = 1i64 << 25;

    let byte_delta = to as i64 - from as i64;
    if byte_delta % 4 != 0 {
        return Err(SandlockError::MemoryProtect(format!(
            "arm64 B target {:#x} not 4-byte aligned from {:#x}",
            to, from
        )));
    }

    let words = byte_delta / 4;
    let reachable = words >= -WORD_LIMIT && words < WORD_LIMIT;
    if !reachable {
        return Err(SandlockError::MemoryProtect(format!(
            "arm64 B {:#x}->{:#x} out of ±128 MiB range",
            from, to
        )));
    }

    Ok(B_OPCODE | ((words as u32) & IMM26_MASK))
}

/// Compute the offset, relative to the start of the vDSO image, at which the
/// trampoline area begins: the highest `st_value + st_size` over all dynamic
/// symbols with a non-zero value, rounded up to a 16-byte boundary.
///
/// Returns `None` if the bytes do not parse as ELF or no such symbol exists.
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
fn vdso_tramp_start(vdso_bytes: &[u8]) -> Option<u64> {
    let elf = goblin::elf::Elf::parse(vdso_bytes).ok()?;

    let mut last_end: Option<u64> = None;
    for sym in elf.dynsyms.iter() {
        if sym.st_value == 0 {
            continue;
        }
        let end = sym.st_value + sym.st_size;
        last_end = Some(last_end.map_or(end, |seen| seen.max(end)));
    }

    // Round up to the next multiple of 16.
    let last_end = last_end?;
    Some((last_end + 15) & !15)
}

/// Encode arm64 `MOVZ Xreg, #imm16, LSL #shift` (64-bit form).
///
/// `shift` is a bit count; it is divided by 16 and masked to two bits to form
/// the `hw` field.
#[cfg(target_arch = "aarch64")]
fn movz_x(reg: u32, imm16: u16, shift: u32) -> u32 {
    let hw = (shift / 16) & 0x3;
    let imm = u32::from(imm16);
    0xD280_0000 | (hw << 21) | (imm << 5) | reg
}

/// Encode arm64 `MOVK Xreg, #imm16, LSL #shift` (64-bit form).
///
/// `shift` is a bit count; it is divided by 16 and masked to two bits to form
/// the `hw` field.
#[cfg(target_arch = "aarch64")]
fn movk_x(reg: u32, imm16: u16, shift: u32) -> u32 {
    let hw = (shift / 16) & 0x3;
    let imm = u32::from(imm16);
    0xF280_0000 | (hw << 21) | (imm << 5) | reg
}

/// Emit the four-instruction sequence that loads the 64-bit constant `value`
/// into register `reg`: one `MOVZ` for bits 0..16 followed by three `MOVK`s
/// for the higher 16-bit chunks. Always emits all four instructions.
#[cfg(target_arch = "aarch64")]
fn load_imm64(stub: &mut Vec<u8>, reg: u32, value: u64) {
    // `as u16` keeps exactly the low 16 bits of each shifted chunk.
    push_insn(stub, movz_x(reg, value as u16, 0));
    for shift in [16u32, 32, 48] {
        push_insn(stub, movk_x(reg, (value >> shift) as u16, shift));
    }
}

/// Build the minimal x86-64 stub that performs `syscall_nr` as a real syscall
/// in place of the vDSO fast path.
///
/// Layout: `mov eax, imm32` / `syscall` / `ret` — 8 bytes total.
#[cfg(target_arch = "x86_64")]
fn simple_stub(syscall_nr: u32) -> Vec<u8> {
    let [n0, n1, n2, n3] = syscall_nr.to_le_bytes();
    vec![
        0xB8, n0, n1, n2, n3, // mov eax, syscall_nr
        0x0F, 0x05, // syscall
        0xC3, // ret
    ]
}

/// Build the minimal arm64 stub that performs `syscall_nr` as a real syscall:
/// `movz x8, #nr` / `svc #0` / `ret` — three 4-byte instructions.
///
/// Only the low 16 bits of `syscall_nr` are encoded.
#[cfg(target_arch = "aarch64")]
fn simple_stub(syscall_nr: u32) -> Vec<u8> {
    let insns = [
        movz_x(8, syscall_nr as u16, 0), // mov x8, syscall_nr
        0xD400_0001,                     // svc #0
        0xD65F_03C0,                     // ret
    ];
    let mut code = Vec::new();
    for word in insns {
        push_insn(&mut code, word);
    }
    code
}

/// Build the x86-64 `clock_gettime` stub that performs the real syscall and
/// then adds `offset_secs` to the returned `tv_sec` for CLOCK_REALTIME (0) and
/// CLOCK_REALTIME_COARSE (5). Other clock ids are returned untouched.
///
/// Layout (x86-64):
///   push rdi / push rsi
///   mov eax, 228 / syscall          ; do the real syscall
///   pop rsi / pop rdi               ; restore args (rsi = timespec*)
///   cmp edi, 0                      ; CLOCK_REALTIME?
///   je  +5                          ; yes → skip second check, apply offset
///   cmp edi, 5                      ; CLOCK_REALTIME_COARSE?
///   jne +13                         ; neither → skip to ret
///   movabs rcx, offset_secs         ; load 8-byte offset
///   add  [rsi], rcx                 ; adjust tv_sec
///   ret
#[cfg(target_arch = "x86_64")]
fn offset_stub_clock_gettime(offset_secs: i64) -> Vec<u8> {
    // Short-jump displacements, spelled as the sizes of the skipped instructions.
    const OVER_COARSE_CHECK: u8 = 3 + 2; // cmp edi,5 (3) + jne rel8 (2)
    const OVER_ADJUSTMENT: u8 = 10 + 3; // movabs rcx (10) + add [rsi],rcx (3)

    let imm64 = offset_secs.to_le_bytes();
    let pieces: [&[u8]; 14] = [
        &[0x57],                         // push rdi
        &[0x56],                         // push rsi
        &[0xB8, 0xE4, 0x00, 0x00, 0x00], // mov eax, 228
        &[0x0F, 0x05],                   // syscall
        &[0x5E],                         // pop rsi
        &[0x5F],                         // pop rdi
        &[0x83, 0xFF, 0x00],             // cmp edi, 0 (CLOCK_REALTIME)
        &[0x74, OVER_COARSE_CHECK],      // je -> movabs
        &[0x83, 0xFF, 0x05],             // cmp edi, 5 (CLOCK_REALTIME_COARSE)
        &[0x75, OVER_ADJUSTMENT],        // jne -> ret
        &[0x48, 0xB9],                   // movabs rcx, imm64
        &imm64,                          // 8-byte offset
        &[0x48, 0x01, 0x0E],             // add [rsi], rcx
        &[0xC3],                         // ret
    ];
    pieces.concat()
}

/// Build the arm64 `clock_gettime` stub: save the arguments, perform the real
/// syscall, then add `offset_secs` to `tv_sec` when the clock id is
/// CLOCK_REALTIME (0) or CLOCK_REALTIME_COARSE (5).
///
/// Sixteen instructions: eight ahead of the immediate load, four for the
/// 64-bit immediate, then load/add/store/ret.
#[cfg(target_arch = "aarch64")]
fn offset_stub_clock_gettime(offset_secs: i64) -> Vec<u8> {
    let head: [u32; 8] = [
        0xAA00_03E9,                                    // mov x9, x0 (clock id)
        0xAA01_03EA,                                    // mov x10, x1 (timespec*)
        movz_x(8, libc::SYS_clock_gettime as u16, 0),   // mov x8, SYS_clock_gettime
        0xD400_0001,                                    // svc #0
        0x7100_013F,                                    // cmp w9, #0 (CLOCK_REALTIME)
        0x5400_0060,                                    // b.eq +3 instructions
        0x7100_153F,                                    // cmp w9, #5 (CLOCK_REALTIME_COARSE)
        0x5400_0101,                                    // b.ne +8 instructions, to ret
    ];
    let tail: [u32; 4] = [
        0xF940_014C, // ldr x12, [x10]
        0x8B0B_018C, // add x12, x12, x11
        0xF900_014C, // str x12, [x10]
        0xD65F_03C0, // ret
    ];

    let mut code = Vec::new();
    for word in head {
        push_insn(&mut code, word);
    }
    load_imm64(&mut code, 11, offset_secs as u64); // x11 = offset
    for word in tail {
        push_insn(&mut code, word);
    }
    code
}

/// Build the x86-64 `gettimeofday` stub that performs the real syscall and
/// then adds `offset_secs` to `tv_sec`, unless the `timeval*` is NULL.
///
/// For `gettimeofday(tv, tz)` the output `timeval*` is the FIRST argument
/// (rdi); rsi is `timezone*`. This differs from clock_gettime, whose output
/// `timespec*` is the second argument (rsi).
#[cfg(target_arch = "x86_64")]
fn offset_stub_gettimeofday(offset_secs: i64) -> Vec<u8> {
    let imm64 = offset_secs.to_le_bytes();
    let pieces: [&[u8]; 10] = [
        &[0x57, 0x56],                   // push rdi, push rsi
        &[0xB8, 0x60, 0x00, 0x00, 0x00], // mov eax, 96
        &[0x0F, 0x05],                   // syscall
        &[0x5E, 0x5F],                   // pop rsi, pop rdi
        &[0x48, 0x85, 0xFF],             // test rdi, rdi (timeval* == NULL?)
        &[0x74, 0x0D],                   // je +13 -> ret (skip movabs(10)+add(3))
        &[0x48, 0xB9],                   // movabs rcx, imm64
        &imm64,                          // 8-byte offset
        &[0x48, 0x01, 0x0F],             // add [rdi], rcx (tv_sec)
        &[0xC3],                         // ret
    ];
    pieces.concat()
}

/// Build the arm64 `gettimeofday` stub: save the `timeval*`, perform the real
/// syscall, then add `offset_secs` to `tv_sec` unless the pointer is NULL.
///
/// Twelve instructions: four ahead of the immediate load, four for the 64-bit
/// immediate, then load/add/store/ret.
#[cfg(target_arch = "aarch64")]
fn offset_stub_gettimeofday(offset_secs: i64) -> Vec<u8> {
    let head: [u32; 4] = [
        0xAA00_03EA,                                   // mov x10, x0 (timeval*)
        movz_x(8, libc::SYS_gettimeofday as u16, 0),   // mov x8, SYS_gettimeofday
        0xD400_0001,                                   // svc #0
        0xB400_010A,                                   // cbz x10, +8 instructions, to ret
    ];
    let tail: [u32; 4] = [
        0xF940_014C, // ldr x12, [x10]
        0x8B0B_018C, // add x12, x12, x11
        0xF900_014C, // str x12, [x10]
        0xD65F_03C0, // ret
    ];

    let mut code = Vec::new();
    for word in head {
        push_insn(&mut code, word);
    }
    load_imm64(&mut code, 11, offset_secs as u64); // x11 = offset
    for word in tail {
        push_insn(&mut code, word);
    }
    code
}

/// vDSO entry points patched on x86-64, as
/// `(symbol name, alternate symbol name, syscall number)`.
#[cfg(target_arch = "x86_64")]
fn vdso_targets() -> Vec<(&'static str, &'static str, u32)> {
    let clock_gettime = (
        "clock_gettime",
        "__vdso_clock_gettime",
        libc::SYS_clock_gettime as u32,
    );
    let gettimeofday = (
        "gettimeofday",
        "__vdso_gettimeofday",
        libc::SYS_gettimeofday as u32,
    );
    let time = ("time", "__vdso_time", libc::SYS_time as u32);

    Vec::from([clock_gettime, gettimeofday, time])
}

/// vDSO entry points patched on arm64, as
/// `(symbol name, alternate symbol name, syscall number)`.
#[cfg(target_arch = "aarch64")]
fn vdso_targets() -> Vec<(&'static str, &'static str, u32)> {
    let clock_gettime = (
        "clock_gettime",
        "__kernel_clock_gettime",
        libc::SYS_clock_gettime as u32,
    );
    let gettimeofday = (
        "gettimeofday",
        "__kernel_gettimeofday",
        libc::SYS_gettimeofday as u32,
    );

    Vec::from([clock_gettime, gettimeofday])
}

// ============================================================
// riscv64 vDSO codegen
//
// Like aarch64, riscv64 places a full stub in the slack space at the tail of
// the vDSO mapping and patches each function entry with a single 4-byte `j`
// (jal x0) that jumps to its stub. The offset stubs run the real syscall, then
// add the time offset to the returned tv_sec (mirroring x86_64/aarch64). The
// 64-bit offset is stored as data at the tail of each stub and loaded
// PC-relative via `auipc`, so the stub is position-independent. It is read with
// two naturally-aligned 4-byte loads (lwu/lw) to avoid a misaligned 8-byte load.
// ============================================================

/// Emit `li a7, value`, loading a syscall number into a7 (x17).
///
/// Values below 2048 take a single `addi a7, x0, value`; anything larger is
/// split into `lui a7, hi20` followed by `addiw a7, a7, lo12`.
#[cfg(target_arch = "riscv64")]
fn riscv_li_a7(stub: &mut Vec<u8>, value: u32) {
    const A7: u32 = 17;
    const OP_IMM: u32 = 0x13;
    const OP_LUI: u32 = 0x37;
    const OP_IMM_32: u32 = 0x1b;

    if value < 2048 {
        // addi a7, x0, value
        push_insn(stub, (value << 20) | (A7 << 7) | OP_IMM);
        return;
    }

    let lo12 = value & 0xfff;
    // addiw sign-extends lo12: when bit 11 is set it subtracts, so the upper
    // part is bumped by one to compensate.
    let carry = u32::from((lo12 & 0x800) != 0);
    let hi20 = (value >> 12).wrapping_add(carry) & 0xf_ffff;

    push_insn(stub, (hi20 << 12) | (A7 << 7) | OP_LUI); // lui a7, hi20
    push_insn(stub, (lo12 << 20) | (A7 << 15) | (A7 << 7) | OP_IMM_32); // addiw a7, a7, lo12
}

/// Encode the riscv64 unconditional jump `j to` (`jal x0, offset`) for an
/// instruction placed at address `from`.
///
/// The JAL immediate is signed and scaled by 2, giving a reach of ±1 MiB.
///
/// # Errors
///
/// Returns `SandlockError::MemoryProtect` when the byte distance is odd or
/// falls outside the ±1 MiB window.
#[cfg(target_arch = "riscv64")]
fn riscv_j_insn(from: u64, to: u64) -> Result<u32, SandlockError> {
    const LIMIT: i64 = 1i64 << 20;
    const JAL_X0: u32 = 0x6f; // opcode JAL with rd = x0

    let delta = to as i64 - from as i64;
    if delta % 2 != 0 {
        return Err(SandlockError::MemoryProtect(format!(
            "riscv64 J target {:#x} not 2-byte aligned from {:#x}",
            to, from
        )));
    }
    if delta < -LIMIT || delta >= LIMIT {
        return Err(SandlockError::MemoryProtect(format!(
            "riscv64 J {:#x}->{:#x} out of ±1 MiB range",
            from, to
        )));
    }

    // J-type scatters the immediate as imm[20|10:1|11|19:12] over bits 31..12.
    let imm = delta as u32;
    let field = |from_bit: u32, mask: u32, to_bit: u32| ((imm >> from_bit) & mask) << to_bit;
    Ok(field(20, 0x1, 31) | field(1, 0x3ff, 21) | field(11, 0x1, 20) | field(12, 0xff, 12) | JAL_X0)
}

/// Minimal RV64I instruction encoders for the time-offset stubs. Registers are
/// ABI numbers; the stubs touch only caller-saved temporaries (t0-t6) and the
/// syscall registers (a0/a1/a7), so they need no prologue/epilogue.
///
/// Register arguments are not range-checked and immediates are masked to the
/// width of their field.
#[cfg(target_arch = "riscv64")]
mod rv {
    pub const X0: u32 = 0;
    pub const T0: u32 = 5;
    pub const T1: u32 = 6;
    pub const T2: u32 = 7;
    pub const A0: u32 = 10;
    pub const A1: u32 = 11;
    pub const A7: u32 = 17;
    pub const T3: u32 = 28;
    pub const T4: u32 = 29;
    pub const T5: u32 = 30;
    pub const T6: u32 = 31;
    pub const ECALL: u32 = 0x0000_0073;
    pub const RET: u32 = 0x0000_8067; // jalr x0, 0(ra)

    const OP_LOAD: u32 = 0x03;
    const OP_IMM: u32 = 0x13;
    const OP_AUIPC: u32 = 0x17;
    const OP_STORE: u32 = 0x23;
    const OP_REG: u32 = 0x33;
    const OP_BRANCH: u32 = 0x63;

    /// I-type layout: imm[11:0] | rs1 | funct3 | rd | opcode.
    fn i_type(opcode: u32, funct3: u32, rd: u32, rs1: u32, imm12: u32) -> u32 {
        ((imm12 & 0xfff) << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | opcode
    }

    /// R-type layout with funct7 = 0: rs2 | rs1 | funct3 | rd | opcode.
    fn r_type(funct3: u32, rd: u32, rs1: u32, rs2: u32) -> u32 {
        (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | OP_REG
    }

    /// B-type layout: imm[12|10:5] | rs2 | rs1 | funct3 | imm[4:1|11] | opcode.
    fn branch(funct3: u32, rs1: u32, rs2: u32, imm: i32) -> u32 {
        let off = imm as u32;
        let bit12 = (off >> 12) & 1;
        let bits10_5 = (off >> 5) & 0x3f;
        let bits4_1 = (off >> 1) & 0xf;
        let bit11 = (off >> 11) & 1;
        (bit12 << 31)
            | (bits10_5 << 25)
            | (rs2 << 20)
            | (rs1 << 15)
            | (funct3 << 12)
            | (bits4_1 << 8)
            | (bit11 << 7)
            | OP_BRANCH
    }

    pub fn addi(rd: u32, rs1: u32, imm: i32) -> u32 {
        i_type(OP_IMM, 0, rd, rs1, imm as u32)
    }
    /// `mv rd, rs` == `addi rd, rs, 0`
    pub fn mv(rd: u32, rs: u32) -> u32 {
        addi(rd, rs, 0)
    }
    pub fn auipc(rd: u32, imm20: u32) -> u32 {
        (imm20 << 12) | (rd << 7) | OP_AUIPC
    }
    pub fn lwu(rd: u32, rs1: u32, imm: i32) -> u32 {
        i_type(OP_LOAD, 6, rd, rs1, imm as u32)
    }
    pub fn lw(rd: u32, rs1: u32, imm: i32) -> u32 {
        i_type(OP_LOAD, 2, rd, rs1, imm as u32)
    }
    pub fn ld(rd: u32, rs1: u32, imm: i32) -> u32 {
        i_type(OP_LOAD, 3, rd, rs1, imm as u32)
    }
    /// S-type layout: imm[11:5] | rs2 | rs1 | funct3 | imm[4:0] | opcode.
    pub fn sd(rs2: u32, rs1: u32, imm: i32) -> u32 {
        let off = imm as u32;
        let upper = (off >> 5) & 0x7f;
        let lower = off & 0x1f;
        (upper << 25) | (rs2 << 20) | (rs1 << 15) | (3 << 12) | (lower << 7) | OP_STORE
    }
    pub fn slli(rd: u32, rs1: u32, shamt: u32) -> u32 {
        i_type(OP_IMM, 1, rd, rs1, shamt & 0x3f)
    }
    pub fn or(rd: u32, rs1: u32, rs2: u32) -> u32 {
        r_type(6, rd, rs1, rs2)
    }
    pub fn add(rd: u32, rs1: u32, rs2: u32) -> u32 {
        r_type(0, rd, rs1, rs2)
    }
    pub fn beq(rs1: u32, rs2: u32, imm: i32) -> u32 {
        branch(0, rs1, rs2, imm)
    }
    pub fn bne(rs1: u32, rs2: u32, imm: i32) -> u32 {
        branch(1, rs1, rs2, imm)
    }
}

/// Build the minimal riscv64 stub that performs `syscall_nr` as a real
/// syscall: `li a7, nr` / `ecall` / `ret`.
#[cfg(target_arch = "riscv64")]
fn simple_stub(syscall_nr: u32) -> Vec<u8> {
    let mut code = Vec::new();
    riscv_li_a7(&mut code, syscall_nr);
    for word in [rv::ECALL, rv::RET] {
        // ecall, then ret (jalr x0, 0(ra))
        push_insn(&mut code, word);
    }
    code
}

/// Build the riscv64 stub for `clock_gettime(clockid=a0, timespec*=a1)`: run
/// the real syscall, then add the time offset to `tv_sec` for CLOCK_REALTIME
/// (0) and CLOCK_REALTIME_COARSE (5).
///
/// The result is 16 instructions followed by the 8-byte little-endian offset,
/// which the code loads PC-relative through `auipc`.
#[cfg(target_arch = "riscv64")]
fn offset_stub_clock_gettime(offset_secs: i64) -> Vec<u8> {
    use rv::*;
    // Bytes from the `auipc` (instruction 7, byte 28) to the embedded offset
    // data (byte 64, right after the 16 instructions).
    const DATA_FROM_AUIPC: i32 = 36;
    let syscall_nr = libc::SYS_clock_gettime as i32;

    let code: [u32; 16] = [
        mv(T0, A0),                       // save clockid (a0 is overwritten by the return)
        mv(T4, A1),                       // save timespec*
        addi(A7, X0, syscall_nr),         // li a7, SYS_clock_gettime
        ECALL,                            // a0 = kernel return value (preserved to the caller)
        beq(T0, X0, 12),                  // clockid == CLOCK_REALTIME -> apply
        addi(T1, X0, 5),                  // li t1, CLOCK_REALTIME_COARSE
        bne(T0, T1, 36),                  // clockid != COARSE -> end (skip offset)
        auipc(T2, 0),                     // apply: t2 = &this instruction
        lwu(T5, T2, DATA_FROM_AUIPC),     // t5 = low 32 bits of offset (zero-extended)
        lw(T6, T2, DATA_FROM_AUIPC + 4),  // t6 = high 32 bits (sign-extended)
        slli(T6, T6, 32),
        or(T2, T5, T6),                   // t2 = full 64-bit offset
        ld(T3, T4, 0),                    // t3 = tv_sec
        add(T3, T3, T2),                  // t3 += offset
        sd(T3, T4, 0),                    // tv_sec = t3
        RET,                              // end
    ];

    let mut stub: Vec<u8> = code.iter().flat_map(|word| word.to_le_bytes()).collect();
    stub.extend_from_slice(&offset_secs.to_le_bytes());
    stub
}

/// Build the riscv64 stub for `gettimeofday(timeval*=a0)`: run the real
/// syscall, then add the time offset to `tv_sec`, unless the timeval pointer
/// is NULL.
///
/// The result is 13 instructions followed by the 8-byte little-endian offset,
/// which the code loads PC-relative through `auipc`.
#[cfg(target_arch = "riscv64")]
fn offset_stub_gettimeofday(offset_secs: i64) -> Vec<u8> {
    use rv::*;
    // Bytes from the `auipc` (instruction 4, byte 16) to the embedded offset
    // data (byte 52, right after the 13 instructions).
    const DATA_FROM_AUIPC: i32 = 36;
    let syscall_nr = libc::SYS_gettimeofday as i32;

    let code: [u32; 13] = [
        mv(T4, A0),                       // save timeval*
        addi(A7, X0, syscall_nr),         // li a7, SYS_gettimeofday
        ECALL,
        beq(T4, X0, 36),                  // timeval == NULL -> end
        auipc(T2, 0),                     // t2 = &this instruction
        lwu(T5, T2, DATA_FROM_AUIPC),     // t5 = low 32 bits of offset
        lw(T6, T2, DATA_FROM_AUIPC + 4),  // t6 = high 32 bits of offset
        slli(T6, T6, 32),
        or(T2, T5, T6),                   // t2 = full 64-bit offset
        ld(T3, T4, 0),                    // t3 = tv_sec
        add(T3, T3, T2),
        sd(T3, T4, 0),
        RET,                              // end
    ];

    let mut stub: Vec<u8> = code.iter().flat_map(|word| word.to_le_bytes()).collect();
    stub.extend_from_slice(&offset_secs.to_le_bytes());
    stub
}

/// vDSO entry points patched on riscv64, as
/// `(symbol name, alternate symbol name, syscall number)`.
#[cfg(target_arch = "riscv64")]
fn vdso_targets() -> Vec<(&'static str, &'static str, u32)> {
    let clock_gettime = (
        "clock_gettime",
        "__vdso_clock_gettime",
        libc::SYS_clock_gettime as u32,
    );
    let gettimeofday = (
        "gettimeofday",
        "__vdso_gettimeofday",
        libc::SYS_gettimeofday as u32,
    );

    Vec::from([clock_gettime, gettimeofday])
}

/// Rewrite the vDSO of process `pid` so that the time functions listed by
/// `vdso_targets()` issue real syscalls (interceptable by seccomp) instead of
/// taking the vDSO fast path.
///
/// When `time_offset_secs` is `Some`, the `clock_gettime` and `gettimeofday`
/// stubs additionally add that offset to the returned seconds; every other
/// target gets the plain syscall stub. `_patch_for_random` is not used here.
///
/// Targets whose symbol is absent from the parsed vDSO are skipped silently.
///
/// # Errors
///
/// Returns `SandlockError::MemoryProtect` if the vDSO mapping cannot be found
/// or read, `/proc/{pid}/mem` cannot be opened for writing, a seek or write
/// fails, or (arm64/riscv64) the trampoline area is exhausted or the entry
/// branch cannot reach its trampoline.
pub(crate) fn patch(
    pid: i32,
    time_offset_secs: Option<i64>,
    _patch_for_random: bool,
) -> Result<(), SandlockError> {
    let (base, mapping_size) = match find_vdso_range(pid) {
        Ok(range) => range,
        Err(e) => {
            return Err(SandlockError::MemoryProtect(format!(
                "failed to find vDSO range: {}",
                e
            )));
        }
    };

    // Read at most 16 KiB of the image for symbol parsing.
    let read_size = (mapping_size as usize).min(0x4000);
    let vdso_bytes = match read_proc_mem(pid, base, read_size) {
        Ok(bytes) => bytes,
        Err(e) => {
            return Err(SandlockError::MemoryProtect(format!(
                "failed to read vDSO memory: {}",
                e
            )));
        }
    };

    let symbols = parse_vdso_symbols(&vdso_bytes);

    let mem_path = format!("/proc/{}/mem", pid);
    let mut mem = match OpenOptions::new().write(true).open(mem_path) {
        Ok(file) => file,
        Err(e) => {
            return Err(SandlockError::MemoryProtect(format!(
                "failed to open /proc/{}/mem: {}",
                pid, e
            )));
        }
    };

    // arm64/riscv64: place full stubs in slack space at the tail of the vDSO
    // mapping and patch each function entry with a single 4-byte jump to its stub.
    // x86_64: stubs are short and inter-symbol gaps are wide; patch inline.
    #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
    let mut tramp_offset = vdso_tramp_start(&vdso_bytes).unwrap_or(0);

    for (name, alt_name, syscall_nr) in vdso_targets() {
        let offset = match symbols.get(name).or_else(|| symbols.get(alt_name)) {
            Some(&offset) => offset,
            None => continue,
        };
        let entry_addr = base + offset;

        let stub = match time_offset_secs {
            Some(off) if name == "clock_gettime" => offset_stub_clock_gettime(off),
            Some(off) if name == "gettimeofday" => offset_stub_gettimeofday(off),
            _ => simple_stub(syscall_nr),
        };

        #[cfg(target_arch = "x86_64")]
        {
            // Overwrite the function body in place, starting at its entry.
            if let Err(e) = mem.seek(SeekFrom::Start(entry_addr)) {
                return Err(SandlockError::MemoryProtect(format!(
                    "failed to seek to {} at {:#x}: {}",
                    name, entry_addr, e
                )));
            }
            if let Err(e) = mem.write_all(&stub) {
                return Err(SandlockError::MemoryProtect(format!(
                    "failed to write {} stub at {:#x}: {}",
                    name, entry_addr, e
                )));
            }
        }

        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        {
            let stub_len = stub.len() as u64;
            if tramp_offset + stub_len > mapping_size {
                return Err(SandlockError::MemoryProtect(format!(
                    "vDSO trampoline area exhausted: need {} bytes at offset {:#x}, mapping ends at {:#x}",
                    stub.len(), tramp_offset, mapping_size
                )));
            }
            let tramp_addr = base + tramp_offset;

            // The trampoline is written first; the entry is redirected only
            // once the stub it points to is fully in place.
            if let Err(e) = mem.seek(SeekFrom::Start(tramp_addr)) {
                return Err(SandlockError::MemoryProtect(format!(
                    "failed to seek to {} trampoline at {:#x}: {}",
                    name, tramp_addr, e
                )));
            }
            if let Err(e) = mem.write_all(&stub) {
                return Err(SandlockError::MemoryProtect(format!(
                    "failed to write {} trampoline at {:#x}: {}",
                    name, tramp_addr, e
                )));
            }

            #[cfg(target_arch = "aarch64")]
            let b_insn = arm64_b_insn(entry_addr, tramp_addr)?;
            #[cfg(target_arch = "riscv64")]
            let b_insn = riscv_j_insn(entry_addr, tramp_addr)?;

            if let Err(e) = mem.seek(SeekFrom::Start(entry_addr)) {
                return Err(SandlockError::MemoryProtect(format!(
                    "failed to seek to {} entry at {:#x}: {}",
                    name, entry_addr, e
                )));
            }
            if let Err(e) = mem.write_all(&b_insn.to_le_bytes()) {
                return Err(SandlockError::MemoryProtect(format!(
                    "failed to write {} branch at {:#x}: {}",
                    name, entry_addr, e
                )));
            }

            // Next trampoline starts at the following 4-byte boundary.
            tramp_offset = (tramp_offset + stub_len + 3) & !3;
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The current process must have a `[vdso]` mapping with a non-zero base.
    #[test]
    fn test_find_vdso_self() {
        let base = find_vdso_base(std::process::id() as i32).unwrap();
        assert!(base > 0);
    }

    /// The current process's vDSO must parse and export `clock_gettime` under
    /// one of its per-architecture names.
    #[test]
    fn test_parse_vdso_symbols_self() {
        let pid = std::process::id() as i32;
        let (base, size) = find_vdso_range(pid).unwrap();
        // Size the read from the actual mapping (capped like `patch` does). A
        // fixed length would run past the end of a single-page vDSO and make
        // `read_exact` fail or pull in bytes from an unrelated mapping.
        let read_size = std::cmp::min(size as usize, 0x4000);
        let bytes = read_proc_mem(pid, base, read_size).unwrap();
        let symbols = parse_vdso_symbols(&bytes);
        // Should find at least clock_gettime
        assert!(
            symbols.contains_key("clock_gettime")
                || symbols.contains_key("__vdso_clock_gettime")
                || symbols.contains_key("__kernel_clock_gettime"),
            "Expected clock_gettime in vDSO symbols, found: {:?}",
            symbols.keys().collect::<Vec<_>>()
        );
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_simple_stub_size() {
        let stub = simple_stub(228);
        assert_eq!(stub.len(), 8);
        assert_eq!(stub[0], 0xB8); // mov eax
    }

    #[test]
    #[cfg(target_arch = "aarch64")]
    fn test_simple_stub_size() {
        let stub = simple_stub(228);
        // movz x8, #228 / svc #0 / ret — three 4-byte instructions.
        assert_eq!(stub.len(), 12);
    }

    #[test]
    #[cfg(target_arch = "riscv64")]
    fn test_simple_stub_size() {
        let stub = simple_stub(228);
        // addi a7, x0, 228 / ecall / ret — three 4-byte instructions.
        assert_eq!(stub.len(), 12);
        // ecall and ret are fixed encodings.
        assert_eq!(&stub[4..8], &0x0000_0073u32.to_le_bytes());
        assert_eq!(&stub[8..12], &0x0000_8067u32.to_le_bytes());
    }

    #[test]
    #[cfg(target_arch = "riscv64")]
    fn test_offset_stub_riscv_layout() {
        let off: i64 = -86400; // one day back
        let cg = offset_stub_clock_gettime(off);
        // 16 instructions (64 bytes) + 8-byte embedded offset.
        assert_eq!(cg.len(), 72);
        assert_eq!(&cg[64..72], &off.to_le_bytes(), "offset stored at tail");
        assert_eq!(&cg[12..16], &0x0000_0073u32.to_le_bytes(), "ecall");
        assert_eq!(&cg[60..64], &0x0000_8067u32.to_le_bytes(), "ret");

        let gtod = offset_stub_gettimeofday(off);
        // 13 instructions (52 bytes) + 8-byte embedded offset.
        assert_eq!(gtod.len(), 60);
        assert_eq!(&gtod[52..60], &off.to_le_bytes(), "offset stored at tail");
        assert_eq!(&gtod[48..52], &0x0000_8067u32.to_le_bytes(), "ret");
    }

    /// Size of the anonymous mapping used to execute generated stubs.
    #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))]
    const EXEC_PAGE: usize = 4096;

    /// Map `code` into a fresh page, written then flipped to `PROT_READ|PROT_EXEC`
    /// (W^X-friendly), and return the page pointer. On riscv64 a `fence.i` is issued
    /// so the just-written instructions are fetchable on this hart (x86_64 caches
    /// are coherent). Caller transmutes the pointer to the right fn type and frees
    /// it with `libc::munmap(page, EXEC_PAGE)`.
    #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))]
    fn map_executable(code: &[u8]) -> *mut libc::c_void {
        use std::ptr;
        assert!(code.len() <= EXEC_PAGE);
        // SAFETY: anonymous private mapping with no address hint; the result is
        // checked against MAP_FAILED before it is used.
        let page = unsafe {
            libc::mmap(
                ptr::null_mut(),
                EXEC_PAGE,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
                -1,
                0,
            )
        };
        assert_ne!(page, libc::MAP_FAILED, "mmap exec page");
        // SAFETY: `page` is a live, writable EXEC_PAGE-byte mapping, `code` is
        // at most EXEC_PAGE bytes (asserted above), and the freshly mapped page
        // cannot overlap `code`.
        unsafe {
            ptr::copy_nonoverlapping(code.as_ptr(), page as *mut u8, code.len());
            assert_eq!(
                libc::mprotect(page, EXEC_PAGE, libc::PROT_READ | libc::PROT_EXEC),
                0,
                "mprotect r-x"
            );
        }
        // riscv64 instruction fetch is not coherent with the stores above until a
        // FENCE.I retires on this hart.
        #[cfg(target_arch = "riscv64")]
        // SAFETY: `fence.i` only synchronizes the instruction stream; it reads
        // and writes no program-visible state.
        unsafe {
            std::arch::asm!("fence.i");
        }
        page
    }

    /// Execute the generated `clock_gettime` offset stub as real machine code and
    /// confirm CLOCK_REALTIME comes back shifted by exactly the embedded offset
    /// while CLOCK_MONOTONIC is left untouched. Unlike the layout tests above, this
    /// proves the hand-assembled encoding (syscall, clockid branches, PC-relative
    /// offset load, tv_sec add) actually runs correctly on hardware. Needs no
    /// sandbox/Landlock, so it runs on any kernel.
    #[test]
    #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))]
    fn offset_stub_clock_gettime_executes_and_shifts_realtime() {
        const OFFSET: i64 = -86_400; // one day back
        let page = map_executable(&offset_stub_clock_gettime(OFFSET));
        // SAFETY: `page` holds the complete stub, mapped r-x, and the stub
        // follows the C calling convention for (clockid, timespec*) -> int.
        let stub_fn: extern "C" fn(libc::clockid_t, *mut libc::timespec) -> libc::c_int =
            unsafe { std::mem::transmute(page) };

        // CLOCK_REALTIME (0): stub time must equal real time + OFFSET.
        let mut real = libc::timespec { tv_sec: 0, tv_nsec: 0 };
        let mut stubbed = libc::timespec { tv_sec: 0, tv_nsec: 0 };
        assert_eq!(unsafe { libc::clock_gettime(libc::CLOCK_REALTIME, &mut real) }, 0);
        assert_eq!(stub_fn(libc::CLOCK_REALTIME, &mut stubbed), 0, "stub returns 0");
        let shift = real.tv_sec - stubbed.tv_sec; // real - (real + OFFSET) = -OFFSET
        assert!(
            (shift - (-OFFSET)).abs() <= 2,
            "CLOCK_REALTIME should be shifted by {OFFSET}s, observed real-stub={shift}s"
        );

        // CLOCK_MONOTONIC (1): not in {0,5}, so the stub must leave it unshifted.
        let mut mono_stub = libc::timespec { tv_sec: 0, tv_nsec: 0 };
        let mut mono_real = libc::timespec { tv_sec: 0, tv_nsec: 0 };
        assert_eq!(stub_fn(libc::CLOCK_MONOTONIC, &mut mono_stub), 0);
        assert_eq!(unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut mono_real) }, 0);
        assert!(
            (mono_real.tv_sec - mono_stub.tv_sec).abs() <= 2,
            "CLOCK_MONOTONIC must be unshifted, stub={} real={}",
            mono_stub.tv_sec,
            mono_real.tv_sec
        );

        // SAFETY: `page` came from `map_executable` with length EXEC_PAGE and
        // `stub_fn` is not called again.
        unsafe {
            libc::munmap(page, EXEC_PAGE);
        }
    }

    /// Execute the generated `gettimeofday` offset stub as real machine code:
    /// confirm a non-NULL `timeval` comes back shifted by the embedded offset, and
    /// that a NULL `timeval` takes the stub's NULL branch (returns 0, no store, no
    /// fault). Runs on any kernel (no sandbox/Landlock).
    #[test]
    #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))]
    fn offset_stub_gettimeofday_executes_and_shifts_tv_sec() {
        const OFFSET: i64 = -86_400; // one day back
        let page = map_executable(&offset_stub_gettimeofday(OFFSET));
        // SAFETY: `page` holds the complete stub, mapped r-x, and the stub
        // follows the C calling convention for (timeval*, timezone*) -> int.
        let stub_fn: extern "C" fn(*mut libc::timeval, *mut libc::c_void) -> libc::c_int =
            unsafe { std::mem::transmute(page) };

        // Non-NULL timeval: tv_sec must be shifted by OFFSET.
        let mut real = libc::timeval { tv_sec: 0, tv_usec: 0 };
        let mut stubbed = libc::timeval { tv_sec: 0, tv_usec: 0 };
        assert_eq!(unsafe { libc::gettimeofday(&mut real, std::ptr::null_mut()) }, 0);
        assert_eq!(stub_fn(&mut stubbed, std::ptr::null_mut()), 0, "stub returns 0");
        let shift = real.tv_sec - stubbed.tv_sec; // real - (real + OFFSET) = -OFFSET
        assert!(
            (shift - (-OFFSET)).abs() <= 2,
            "gettimeofday tv_sec should be shifted by {OFFSET}s, observed real-stub={shift}s"
        );

        // NULL timeval: the stub must take its NULL branch — return 0 without
        // dereferencing the pointer. A mis-encoded branch would SIGSEGV here.
        assert_eq!(
            stub_fn(std::ptr::null_mut(), std::ptr::null_mut()),
            0,
            "NULL timeval handled without fault"
        );

        // SAFETY: `page` came from `map_executable` with length EXEC_PAGE and
        // `stub_fn` is not called again.
        unsafe {
            libc::munmap(page, EXEC_PAGE);
        }
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_offset_stub_contains_offset() {
        let offset: i64 = -86400; // one day back
        let stub = offset_stub_clock_gettime(offset);
        // x86_64 encodes the offset as a single movabs imm64, so the 8 bytes
        // appear contiguously in the stub.
        let offset_bytes = offset.to_le_bytes();
        assert!(stub.windows(8).any(|w| w == offset_bytes));
    }

    #[test]
    #[cfg(target_arch = "aarch64")]
    fn test_offset_stub_contains_offset() {
        let offset: i64 = -86400;
        let stub = offset_stub_clock_gettime(offset);
        // arm64 splits a 64-bit immediate across movz/movk instructions, so the
        // bytes are not contiguous. Verify each 16-bit chunk is encoded as a
        // movz/movk imm16 field (bits 5..21 of the 32-bit instruction).
        let raw = offset as u64;
        for shift in 0..4 {
            let chunk = ((raw >> (shift * 16)) & 0xFFFF) as u32;
            if chunk == 0 {
                continue; // a zero imm16 collides with too many other instructions to assert on
            }
            let found = stub.chunks_exact(4).any(|insn| {
                let word = u32::from_le_bytes(insn.try_into().unwrap());
                ((word >> 5) & 0xFFFF) == chunk
            });
            assert!(found, "chunk {:#06x} for shift {} not encoded in stub", chunk, shift);
        }
    }
}