go-lib 0.6.1

rust native goroutines
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
// SPDX-License-Identifier: Apache-2.0
//! AArch64 context switch primitives — ported from `runtime/asm_arm64.s` and
//! `runtime/preempt_arm64.s`.
//!
//! Public entry points:
//! - [`gogo`]                     — restore a saved `Gobuf` and resume a goroutine.
//! - [`mcall`]                    — save current G, switch to g0's stack, call a fn.
//! - [`async_preempt_trampoline`] — save all GPRs + d0–d31, call `async_preempt2`,
//!   restore, ret to interrupted PC.  *(v0.2.0 — Step 4)*
//! - [`systemstack`]              — run a closure on g0's stack.
//!
//! ## Design vs Go's approach
//!
//! Go stores the current G in a dedicated hardware register (`R28` / `x28` on
//! AArch64) via the OS's TLS mechanism and accesses it from assembly without
//! a function call.  We use a Rust `thread_local!` (`CURRENT_G` in `g.rs`)
//! instead, updating it in the Rust wrapper before entering the naked asm.
//! This avoids replicating Go's platform-specific TLS segment tricks while
//! keeping the hot-path assembly minimal.
//!
//! ## Calling convention (AArch64 AAPCS64)
//! Arguments: `x0`–`x7`.  Caller-saved: `x0`–`x18`.  Callee-saved: `x19`–`x28`,
//! `x29` (frame pointer), `x30` (link register / return address).
//!
//! ## Gobuf field offsets (verified by compile-time assertions in `g.rs`)
//! ```text
//!  0  sp
//!  8  pc
//! 16  g
//! 24  ctxt
//! 32  ret
//! 40  lr
//! 48  bp
//! ```

use std::ptr::addr_of_mut;

use super::g::{
    set_current_g, Gobuf, G,
    G0_SCHED,
    GOBUF_BP_OFFSET, GOBUF_G_OFFSET, GOBUF_LR_OFFSET,
    GOBUF_PC_OFFSET, GOBUF_REGS_OFFSET, GOBUF_SP_OFFSET,
};

// ---------------------------------------------------------------------------
// gogo — restore saved state and jump
// ---------------------------------------------------------------------------

/// Restore register state from `buf` and resume execution at `buf.pc`.
///
/// Ported from `runtime·gogo` in `runtime/asm_arm64.s`.
///
/// Register usage:
/// - `x0`  = buf (*mut Gobuf, argument)
/// - `x9`  = scratch (target pc)
/// - `x10` = scratch (sp value, cannot load sp from memory directly)
/// - `x29` = frame pointer (bp), `x30` = link register (lr) — restored
///
/// Fields of `*buf` read by this routine: the register save area at
/// `GOBUF_REGS_OFFSET` (144 bytes: ten 8-byte GPR slots followed by eight
/// 8-byte FP slots), then `pc`, `bp`, `lr` and `sp`.  No other field is
/// touched and nothing is written back to `*buf`.
///
/// ## Callee-saved register restoration
///
/// `gogo_asm` resumes execution at `buf.pc`, which (for an `mcall`-yielded
/// goroutine) is the instruction immediately after the call to `mcall_asm`.
/// The Rust caller obeys AAPCS64 and may hold live values in callee-saved
/// registers (x19–x28, d8–d15) across that call; the scheduler clobbered
/// them while the G was parked.  Restore the slots saved by `mcall_asm` so
/// the resumed frame sees the exact register state it left behind.  For a
/// fresh G (never yielded) the slots are zero-initialised, which is harmless
/// — `goroutine_entry` writes its callee-saves before reading them.
///
/// # Safety
///
/// `buf` must point to a readable `Gobuf`.  The routine installs `buf.sp`
/// as the stack pointer and branches to `buf.pc` unconditionally, so both
/// must describe a context that is valid to resume.  Control never comes
/// back to the caller; the caller's frame is abandoned without running any
/// destructors.
#[unsafe(naked)]
pub(crate) unsafe extern "C" fn gogo_asm(buf: *mut Gobuf) -> ! {
    core::arch::naked_asm!(
        // ── restore callee-saved registers: x19–x28, d8–d15 ──────────────
        // Slot order and offsets must mirror the `stp` sequence in
        // `mcall_asm` exactly: GPR pairs at +0..+64, FP pairs at +80..+128.
        "ldp  x19, x20, [x0, #{regs} + 0]",
        "ldp  x21, x22, [x0, #{regs} + 16]",
        "ldp  x23, x24, [x0, #{regs} + 32]",
        "ldp  x25, x26, [x0, #{regs} + 48]",
        "ldp  x27, x28, [x0, #{regs} + 64]",
        "ldp  d8,  d9,  [x0, #{regs} + 80]",
        "ldp  d10, d11, [x0, #{regs} + 96]",
        "ldp  d12, d13, [x0, #{regs} + 112]",
        "ldp  d14, d15, [x0, #{regs} + 128]",

        // ── restore control state and jump ───────────────────────────────
        // Only caller-saved scratch registers (x9, x10) are used as
        // temporaries, so none of the values restored above is disturbed.
        "ldr  x9,  [x0, #{pc}]",   // x9  = gobuf.pc  (target instruction)
        "ldr  x29, [x0, #{bp}]",   // x29 = gobuf.bp  (frame pointer)
        "ldr  x30, [x0, #{lr}]",   // x30 = gobuf.lr  (link register)
        "ldr  x10, [x0, #{sp}]",   // x10 = gobuf.sp
        "mov  sp,  x10",            // SP  = gobuf.sp  (cannot be a load target)
        "br   x9",                  // jump to pc — never returns
        pc   = const GOBUF_PC_OFFSET,
        bp   = const GOBUF_BP_OFFSET,
        lr   = const GOBUF_LR_OFFSET,
        sp   = const GOBUF_SP_OFFSET,
        regs = const GOBUF_REGS_OFFSET,
    )
}

// ---------------------------------------------------------------------------
// mcall — save current G's state and switch to g0
// ---------------------------------------------------------------------------

/// Save the current goroutine's registers into `g_sched`, switch to g0's
/// stack, and call `fn_ptr(g)`.  Never returns via the normal path.
///
/// The return type is `()` (not `!`) deliberately: the Rust compiler must
/// generate a proper epilogue (`ldp x29,x30,[sp],#16; ret`) for `mcall()`
/// after the call to `mcall_asm`.  `gogo_asm` resumes a goroutine by
/// jumping to `g_sched.pc`, which is the LR value saved here — i.e., the
/// address of that epilogue.  Executing it unwinds the `mcall` and caller
/// (`gosched`/`gopark`) frames normally, returning control to the goroutine's
/// user code — exactly the same sequence Go uses.
///
/// Ported from `runtime·mcall` in `runtime/asm_arm64.s`.
///
/// AArch64 argument registers on entry:
/// - `x0` = g          (*mut G  — current goroutine)
/// - `x1` = g_sched    (*mut Gobuf — &(*g).sched, pre-computed by wrapper)
/// - `x2` = g0_gobuf   (*mut Gobuf — &(*g0).sched, from G0_SCHED TLS)
/// - `x3` = fn_ptr     (unsafe extern "C" fn(*mut G))
/// - `x30`= LR / return address — the return address of the call to
///          `mcall_asm` inside `mcall()`, i.e., the address of `mcall`'s
///          epilogue.  Saved as `g_sched.pc` so `gogo` can resume there.
/// - `sp` = caller's stack pointer (saved as g_sched.sp)
///
/// Fields of `*g_sched` written: `pc`, `sp`, `bp`, `g`, and the 144-byte
/// register save area at `GOBUF_REGS_OFFSET`.  `g_sched.lr` is NOT written
/// here, so on resume `gogo_asm` loads whatever that field already held
/// into `x30`; the resumed code is at a post-call point where `x30` is not
/// expected to hold a live value.
///
/// After the stack switch the call `blr x3` runs on g0's stack.  `fn_ptr`
/// is expected not to return — it tails into `gogo` or loops in
/// `schedule()`.  If it does return (the shutdown path), control falls to
/// `bl m_thread_exit`, which terminates this OS thread; there is no trap
/// instruction after it.
///
/// # Safety
///
/// `_g_sched` must be writable and `_g0_gobuf` readable as `Gobuf`s, and
/// `g0_gobuf.sp` must be a usable, 16-byte-aligned stack pointer: it is
/// installed as SP without any check.
#[unsafe(naked)]
pub(crate) unsafe extern "C" fn mcall_asm(
    _g:        *mut G,
    _g_sched:  *mut Gobuf,
    _g0_gobuf: *mut Gobuf,
    _fn_ptr:   unsafe extern "C" fn(*mut G),
) {
    core::arch::naked_asm!(
        // ── save current goroutine's context into g_sched (x1) ───────────
        // On AArch64 the return address is always in x30 (LR) on function
        // entry before any prologue — naked fns have no prologue, so x30
        // holds the true return address here.
        "str  x30, [x1, #{pc}]",   // g_sched.pc = return address
        "mov  x9,  sp",            // SP cannot be a store source; go via x9
        "str  x9,  [x1, #{sp}]",   // g_sched.sp = caller SP
        "str  x29, [x1, #{bp}]",   // g_sched.bp = frame pointer (x29)
        "str  x0,  [x1, #{g}]",    // g_sched.g  = g (keep field in sync)

        // ── save callee-saved registers (AAPCS64): x19–x28, d8–d15 ───────
        // The scheduler clobbers every register while the G is parked;
        // gogo_asm restores these slots on resume.  Mirrors the x86-64
        // rbx/r12–r15 save in asm_amd64.rs.
        "stp  x19, x20, [x1, #{regs} + 0]",
        "stp  x21, x22, [x1, #{regs} + 16]",
        "stp  x23, x24, [x1, #{regs} + 32]",
        "stp  x25, x26, [x1, #{regs} + 48]",
        "stp  x27, x28, [x1, #{regs} + 64]",
        "stp  d8,  d9,  [x1, #{regs} + 80]",
        "stp  d10, d11, [x1, #{regs} + 96]",
        "stp  d12, d13, [x1, #{regs} + 112]",
        "stp  d14, d15, [x1, #{regs} + 128]",

        // ── switch to g0's stack (x2 = g0_gobuf) ────────────────────────
        // g0's stack must be 16-byte aligned at this point (ABI requirement
        // for bl/blr).  The invariant is maintained by M::new (step 6).
        "ldr  x9,  [x2, #{sp}]",   // x9 = g0.sp
        "mov  sp,  x9",
        "ldr  x29, [x2, #{bp}]",   // x29 = g0.bp

        // ── call fn_ptr(g) on g0's stack ─────────────────────────────────
        // x0 = g (first argument, untouched since function entry)
        // x3 = fn_ptr
        "blr  x3",

        // If fn_ptr returns (shutdown path: schedule() returned), exit this
        // OS thread cleanly rather than hitting a breakpoint trap.
        // NOTE(review): nothing follows this `bl`; this relies on
        // m_thread_exit never returning — confirm its signature is `-> !`.
        "bl   {m_exit}",

        pc     = const GOBUF_PC_OFFSET,
        sp     = const GOBUF_SP_OFFSET,
        bp     = const GOBUF_BP_OFFSET,
        g      = const GOBUF_G_OFFSET,
        regs   = const GOBUF_REGS_OFFSET,
        m_exit = sym crate::runtime::sched::m_thread_exit,
    )
}

// ---------------------------------------------------------------------------
// Public wrappers
// ---------------------------------------------------------------------------

/// Switch this thread to goroutine `g`: publish it as the current G, then
/// reload its saved registers and jump to its saved PC.
///
/// `CURRENT_G` is updated *before* the jump so that whatever runs after the
/// switch already observes `g` as the current goroutine.
///
/// Ported from the `execute` → `gogo` path in `runtime/proc.go` +
/// `runtime/asm_arm64.s`.
///
/// # Safety
///
/// `g` must point to a live `G` whose `sched.sp` and `sched.pc` have been
/// initialised by the caller.  This function never returns.
pub(crate) unsafe fn gogo(g: *mut G) -> ! {
    unsafe {
        // Pure address computation — nothing is read from `*g` yet.
        let resume_buf = addr_of_mut!((*g).sched);
        // The thread-local must be in place before control leaves this frame.
        set_current_g(g);
        gogo_asm(resume_buf)
    }
}

/// Park the current goroutine: record its state in `g.sched`, hop onto this
/// thread's g0 stack, and invoke `fn_ptr(g)` there.
///
/// `fn_ptr` must end by calling `schedule()` or handing off through
/// `gogo()`; it must not return to its caller.
///
/// As with `mcall_asm`, the return type is `()` rather than `!`.  The
/// compiler therefore emits an epilogue after the call to `mcall_asm`, and
/// that epilogue is exactly where `gogo` lands when it later resumes this
/// goroutine — from there the frames unwind back into user code as usual.
///
/// `G0_SCHED` must have been set up by `M::new` (step 6).  Debug builds
/// panic if it is still null; release builds do not check.
///
/// Ported from `runtime·mcall` in `runtime/proc.go` + `runtime/asm_arm64.s`.
///
/// ## Why `#[inline(never)]` is load-bearing
///
/// Identical to the x86-64 `mcall` (see `asm_amd64.rs`).  Were this inlined
/// into a caller that yields more than once, LLVM would CSE the `G0_SCHED`
/// TLS accessor and keep the slot address in a callee-saved register across
/// the suspension.  Following a cross-thread resume, the next yield would
/// then switch onto the *old* thread's g0 stack and trash that M's live
/// scheduler frames.  Keeping the function out of line forces the TLS read
/// to be redone on whichever thread is current at each suspension point.
#[inline(never)]
pub(crate) unsafe fn mcall(g: *mut G, fn_ptr: unsafe extern "C" fn(*mut G)) {
    unsafe {
        // Fetch *this* thread's g0 save area afresh on every call.
        let m_g0_buf = G0_SCHED.with(|slot| slot.get());
        debug_assert!(
            !m_g0_buf.is_null(),
            "mcall: G0_SCHED is null — M::new must be called before spawning goroutines (step 6)",
        );

        // Where the outgoing goroutine's context is written.
        let own_buf = addr_of_mut!((*g).sched);

        // Forward flow ends inside this call: it moves to g0 and runs
        // `fn_ptr`, which enters the scheduler loop.  The code after the
        // call executes only when a later `gogo()` jumps to the return
        // address saved by `mcall_asm`, at which point the function epilogue
        // unwinds the frame chain back to the goroutine's user code.
        mcall_asm(g, own_buf, m_g0_buf, fn_ptr);
    }
}

// ---------------------------------------------------------------------------
// systemstack — run a closure on g0's stack
// ---------------------------------------------------------------------------

/// Low-level stack switch for AArch64 (AAPCS64).
///
/// ## Register layout on entry
/// - `x0` = g0_sp   (target stack pointer)
/// - `x1` = arg     (opaque closure pointer)
/// - `x2` = thunk   (function to call: `fn(*mut ())`)
///
/// Saves `x29` (frame pointer) and `x30` (link register / return address) on
/// the goroutine's stack, switches `SP` to `g0_sp` (16-byte aligned), calls
/// `thunk(arg)`, then restores the goroutine's SP and returns.
///
/// The goroutine's SP is carried across the call in `x29`.  This relies on
/// `thunk` following AAPCS64 and handing `x29` back unchanged; `x3` is used
/// as a caller-saved scratch register.
///
/// Ported from `runtime·systemstack` in `runtime/asm_arm64.s`.
///
/// # Safety
///
/// `_g0_sp` is installed as the stack pointer after rounding down to a
/// 16-byte boundary, with no validation; it must refer to a stack with
/// enough free space below it for `thunk`.  `_thunk` must return normally.
#[allow(dead_code)] // called by systemstack; no callers until systemstack is used
#[unsafe(naked)]
unsafe extern "C" fn systemstack_call(
    _g0_sp: usize,
    _arg:   *mut (),
    _thunk: unsafe extern "C" fn(*mut ()),
) {
    core::arch::naked_asm!(
        // Save FP (x29) and LR (x30) on the goroutine's stack.
        // Pre-indexed store: SP drops by 16 first, keeping it 16-byte aligned.
        "stp x29, x30, [sp, #-16]!",
        // Record current SP (= goroutine SP after STP) in x29.
        // x29 now points at the {x29, x30} pair just pushed.
        "mov x29, sp",
        // Align g0_sp (x0) down to 16 bytes and switch SP.
        // SP cannot be the destination of `bic`, hence the x3 temporary.
        "bic x3,  x0, #15",
        "mov sp,  x3",
        // Call thunk(arg): x0 = arg (x1), thunk = x2.
        "mov x0,  x1",
        "blr x2",
        // Restore goroutine SP from FP (x29).
        "mov sp,  x29",
        // Pop the saved pair (post-indexed: SP returns to its entry value).
        "ldp x29, x30, [sp], #16",
        "ret",
    )
}

/// Execute `f` on this M's g0 (system) stack and then come back to the
/// calling goroutine.
///
/// When `current_g()` is null the caller is already in scheduler context on
/// g0, so `f` simply runs in place with no stack switch.  Otherwise the
/// closure is handed by pointer to `systemstack_call`, which runs it on the
/// stack recorded in `g0_sched().sp`.
///
/// See the x86-64 version in `asm_amd64.rs` for full documentation.
///
/// Ported from `systemstack` in `runtime/asm_arm64.s`.
#[allow(dead_code)] // future callers: stack growth, signal handlers, GC hooks
pub(crate) unsafe fn systemstack<F: FnOnce()>(f: F) {
    /// C-ABI shim, monomorphised per closure type: moves the closure out of
    /// the slot `raw` points at and invokes it exactly once.
    unsafe extern "C" fn invoke<C: FnOnce()>(raw: *mut ()) {
        let closure = unsafe { raw.cast::<C>().read() };
        closure();
    }

    let already_on_g0 = super::g::current_g().is_null();
    if already_on_g0 {
        f();
    } else {
        let g0_sp = unsafe { (*super::g::g0_sched()).sp };
        debug_assert!(g0_sp != 0, "systemstack: g0_sched.sp is 0 — M not initialised");

        // `ManuallyDrop` keeps this frame from dropping the closure a second
        // time after the shim has moved it out through the raw pointer.
        let mut parked = std::mem::ManuallyDrop::new(f);
        let raw = std::ptr::addr_of_mut!(parked).cast::<()>();

        unsafe { systemstack_call(g0_sp, raw, invoke::<F>) };
    }
}

// ---------------------------------------------------------------------------
// async_preempt_trampoline — Step 4: async signal-based preemption
// ---------------------------------------------------------------------------

/// AArch64 async-preemption trampoline.
///
/// The SIGURG handler (`redirect_to_async_preempt`) pushes the goroutine's
/// ORIGINAL `x30` onto its stack (16-byte slot), sets `x30` = original `PC`
/// (the resume target), and redirects `PC` here.  Execution resumes with all
/// registers intact except x30, and `[sp]` = the original x30.
///
/// ## What must be preserved
///
/// This code runs at an *arbitrary* interrupted instruction, not at a call
/// site, so nothing may be assumed dead:
///
/// - every general register x0–x30;
/// - the NZCV condition flags — the interruption can land between a
///   flag-setting instruction (`cmp`, `adds`, …) and the branch or `csel`
///   that consumes it, and `async_preempt2` is free to clobber the flags;
/// - FPSR (cumulative FP exception / saturation status);
/// - the full 128-bit q0–q31.  AAPCS64 only obliges a callee to preserve
///   the low 64 bits of v8–v15; everything else in the vector file,
///   including the upper halves of v8–v15, may be overwritten by
///   `async_preempt2` or by whatever runs while this goroutine is parked.
///
/// ## Frame layout (784 B, 16-byte aligned; redirect slot above it)
/// ```text
/// sp+784 .. sp+799  : redirect slot — [sp+784] = ORIGINAL x30 (pushed by handler)
/// sp+0   .. sp+231  : x0–x28 (29 GPRs × 8 B)
/// sp+232 .. sp+239  : x29 (frame pointer)
/// sp+240 .. sp+247  : x30 (= resume PC, set by the SIGURG handler)
/// sp+248 .. sp+255  : NZCV
/// sp+256 .. sp+263  : FPSR
/// sp+264 .. sp+271  : padding (keeps the vector area 16-byte aligned)
/// sp+272 .. sp+783  : q0–q31 (32 × 16 B)
/// ```
///
/// NOTE(review): the frame grew from 512 B to 784 B.  If the SIGURG handler
/// checks remaining goroutine stack before redirecting, its bound must cover
/// 784 B + the 16-byte redirect slot + `async_preempt2`'s own needs — confirm
/// against `redirect_to_async_preempt`.
///
/// After `bl async_preempt2` (which calls `mcall → schedule` and returns when
/// the goroutine is rescheduled), all state is restored, the ORIGINAL x30 is
/// reloaded from the redirect slot, and execution branches to the resume PC
/// via `x18`.
///
/// ## Why `x18` for the final branch
///
/// The resume PC must be in a register to branch (AArch64 has no
/// branch-to-memory), but every general register may hold live interrupted
/// state.  `x18` is the AAPCS64 platform register: reserved (never allocated
/// by LLVM) on Darwin, Windows, Android, and Fuchsia, so the interrupted
/// Rust code cannot be holding a value in it on macOS.  Go avoids this with
/// its dedicated REGTMP (x27), which its own compiler never keeps live at
/// preemption points — a luxury Rust code does not have.  On
/// `aarch64-unknown-linux-gnu` x18 IS allocatable, so this trampoline would
/// corrupt a live x18 there; for that reason `preemptone` (sysmon.rs) never
/// sends SIGURG on Linux/aarch64 — preemption is cooperative-only on that
/// target, and this trampoline is unreachable.  Re-enable once builds can
/// require `-Ctarget-feature=+reserve-x18`.
///
/// NOTE(review): Apple's ABI guidance tells user code not to touch x18 at
/// all; confirm the OS never rewrites it between the `ldr x18` and the
/// `br x18` below.
///
/// Ported from the auto-generated `asyncPreempt` in `runtime/preempt_arm64.s`
/// (which likewise saves NZCV and FPSR) plus `sigctxt.pushCall` in
/// `runtime/signal_arm64.go`.
///
/// # Safety
///
/// Must only be entered via the SIGURG redirect described above: on entry
/// `[sp]` must hold the interrupted code's x30, `x30` must hold the resume
/// PC, and `sp` must be 16-byte aligned.
#[unsafe(naked)]
pub(crate) unsafe extern "C" fn async_preempt_trampoline() {
    core::arch::naked_asm!(
        // ── allocate frame (784 B) ────────────────────────────────────────────
        // `sub` (not `subs`) leaves the interrupted NZCV untouched.
        "sub sp, sp, #784",

        // ── save GPRs x0–x28, x29, x30 ───────────────────────────────────────
        "stp x0,  x1,  [sp, #0]",
        "stp x2,  x3,  [sp, #16]",
        "stp x4,  x5,  [sp, #32]",
        "stp x6,  x7,  [sp, #48]",
        "stp x8,  x9,  [sp, #64]",
        "stp x10, x11, [sp, #80]",
        "stp x12, x13, [sp, #96]",
        "stp x14, x15, [sp, #112]",
        "stp x16, x17, [sp, #128]",
        "stp x18, x19, [sp, #144]",
        "stp x20, x21, [sp, #160]",
        "stp x22, x23, [sp, #176]",
        "stp x24, x25, [sp, #192]",
        "stp x26, x27, [sp, #208]",
        "stp x28, x29, [sp, #224]",
        "str x30,      [sp, #240]",   // x30 = resume PC (set by SIGURG handler)

        // ── save NZCV and FPSR ────────────────────────────────────────────────
        // x0/x1 are already on the stack, so they are free as temporaries.
        // Nothing executed so far (sub/stp/str) modifies the flags.
        "mrs x0, NZCV",
        "mrs x1, FPSR",
        "stp x0,  x1,  [sp, #248]",

        // ── save full 128-bit vector regs q0–q31 ──────────────────────────────
        // `stp qN` needs a 16-byte-multiple offset; the area starts at 272.
        "stp q0,  q1,  [sp, #272]",
        "stp q2,  q3,  [sp, #304]",
        "stp q4,  q5,  [sp, #336]",
        "stp q6,  q7,  [sp, #368]",
        "stp q8,  q9,  [sp, #400]",
        "stp q10, q11, [sp, #432]",
        "stp q12, q13, [sp, #464]",
        "stp q14, q15, [sp, #496]",
        "stp q16, q17, [sp, #528]",
        "stp q18, q19, [sp, #560]",
        "stp q20, q21, [sp, #592]",
        "stp q22, q23, [sp, #624]",
        "stp q24, q25, [sp, #656]",
        "stp q26, q27, [sp, #688]",
        "stp q28, q29, [sp, #720]",
        "stp q30, q31, [sp, #752]",

        // ── call async_preempt2 ────────────────────────────────────────────────
        // bl sets x30 = return address (inside this trampoline).  On return from
        // async_preempt2 (after the goroutine is rescheduled), sp is restored to
        // this frame by mcall/gogo.
        "bl {ap2}",

        // ── restore vector regs ────────────────────────────────────────────────
        "ldp q0,  q1,  [sp, #272]",
        "ldp q2,  q3,  [sp, #304]",
        "ldp q4,  q5,  [sp, #336]",
        "ldp q6,  q7,  [sp, #368]",
        "ldp q8,  q9,  [sp, #400]",
        "ldp q10, q11, [sp, #432]",
        "ldp q12, q13, [sp, #464]",
        "ldp q14, q15, [sp, #496]",
        "ldp q16, q17, [sp, #528]",
        "ldp q18, q19, [sp, #560]",
        "ldp q20, q21, [sp, #592]",
        "ldp q22, q23, [sp, #624]",
        "ldp q24, q25, [sp, #656]",
        "ldp q26, q27, [sp, #688]",
        "ldp q28, q29, [sp, #720]",
        "ldp q30, q31, [sp, #752]",

        // ── restore NZCV and FPSR ─────────────────────────────────────────────
        // Done before the GPR reload because it needs two scratch registers.
        // Every instruction after this point (ldp/ldr/add/br) is flag-neutral.
        "ldp x0,  x1,  [sp, #248]",
        "msr NZCV, x0",
        "msr FPSR, x1",

        // ── restore GPRs ──────────────────────────────────────────────────────
        "ldp x28, x29, [sp, #224]",
        "ldp x26, x27, [sp, #208]",
        "ldp x24, x25, [sp, #192]",
        "ldp x22, x23, [sp, #176]",
        "ldp x20, x21, [sp, #160]",
        "ldp x18, x19, [sp, #144]",
        "ldp x16, x17, [sp, #128]",
        "ldp x14, x15, [sp, #112]",
        "ldp x12, x13, [sp, #96]",
        "ldp x10, x11, [sp, #80]",
        "ldp x8,  x9,  [sp, #64]",
        "ldp x6,  x7,  [sp, #48]",
        "ldp x4,  x5,  [sp, #32]",
        "ldp x2,  x3,  [sp, #16]",
        "ldp x0,  x1,  [sp, #0]",

        // ── return to the resume PC, restoring the ORIGINAL x30 ───────────────
        // x18 = resume PC (the platform register — reserved on Darwin, see
        // the doc-comment).  x30 is reloaded from the redirect slot the
        // SIGURG handler pushed above this frame, restoring the interrupted
        // code's true return address (critical for leaf functions).
        "ldr x18, [sp, #240]",          // x18 = resume PC
        "add sp, sp, #784",             // release trampoline frame
        "ldr x30, [sp]",                // x30 = ORIGINAL x30 (redirect slot)
        "add sp, sp, #16",              // pop redirect slot → original SP
        "br  x18",                      // resume interrupted code

        ap2 = sym crate::runtime::sched::async_preempt2,
    )
}