1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
//! Three-port virtio-console with inline MMIO transport.
//!
//! Eight virtqueues per virtio-v1.2 §5.3.5 with `VIRTIO_CONSOLE_F_MULTIPORT`:
//! q0 in0 — host→guest, port 0 (console / hvc0 stdin)
//! q1 out0 — guest→host, port 0 (console / hvc0 stdout)
//! q2 c_ivq — host→guest control (PORT_ADD, PORT_OPEN, etc.)
//! q3 c_ovq — guest→host control (DEVICE_READY, PORT_READY, PORT_OPEN ack)
//! q4 in1 — host→guest, port 1 (snapshot reply payloads from the
//! freeze coordinator; see `queue_input_port1`)
//! q5 out1 — guest→host, port 1 (bulk TLV stream)
//! q6 in2 — host→guest, port 2 (scheduler-stats requests from the
//! host's [`super::sched_stats::SchedStatsClient`]; see
//! `queue_input_port2`)
//! q7 out2 — guest→host, port 2 (scheduler-stats responses; raw
//! byte passthrough, no TLV)
//!
//! Port 0 carries the interactive console (stdout/stdin via `/dev/hvc0`).
//! Port 1 carries the TLV stream written by
//! `guest_comms::send_*` — exit code, test result, per-payload
//! metrics, raw payload outputs, profraw, scheduler exit
//! notifications, stimulus events, scenario start/end markers.
//! Port 2 is a transparent byte pipe: the host pushes scx_stats
//! request bytes; the guest's relay thread forwards them to
//! `/var/run/scx/root/stats` and pumps the socket response back via
//! port 2 TX. scx_stats is already newline-delimited JSON so no
//! framing layer is added.
//! Crash payloads travel over COM2. Backpressure is asymmetric:
//! * Guest→host TX (port 1, port 2): the host's `add_used` rate
//! gates the guest's writes; when the host lags, the guest
//! blocks in `wait_port_writable` instead of dropping. Per-call
//! drain is also capped (`TX_PER_CALL_MAX`) so a hostile guest
//! cannot grow the host accumulator without bound on a single
//! notify.
//! * Host→guest RX (port 0 + port 1 + port 2): the per-port
//! `pending_rx` accumulators are unbounded by design — the host
//! alone produces these bytes (kernel scheduler signals, terminal
//! paste, snapshot replies, stats requests), so a hostile guest
//! cannot grow them; losing a host→guest byte would silently
//! strand a wake signal or truncate a reply, which is worse than
//! a host-side OOM. The per-call CHAIN drain is capped
//! (`RX_CHAINS_PER_CALL_MAX`) so a hostile guest publishing many
//! zero-progress descriptor chains cannot hold the vCPU MMIO
//! handler in `drain_pending_rx` for an unbounded number of
//! iterations on a single notify.
//!
//! Features: `VIRTIO_F_VERSION_1 | VIRTIO_CONSOLE_F_MULTIPORT`.
//! Config space: `cols=0, rows=0, max_nr_ports=3, emerg_wr=0` (cols/rows
//! valid only with F_SIZE which we do not advertise; the kernel reads
//! `max_nr_ports` via `virtio_cread_feature(F_MULTIPORT, max_nr_ports)`,
//! offset 4 in `struct virtio_console_config`).
//!
//! MMIO register layout per virtio-v1.2 §4.2.2. Interrupt delivery via
//! irqfd (eventfd → KVM GSI). TX data on port 0 or port 1 signals
//! `tx_evt`; TX data on port 2 signals a separate `stats_tx_evt` so
//! the host's [`super::sched_stats::SchedStatsClient`] wakes only on
//! its own port without contending with the freeze coordinator's
//! port-1 drain path.
use VecDeque;
use ;
use VIRTIO_ID_CONSOLE;
/// Multiport feature bit per `include/uapi/linux/virtio_console.h`.
/// virtio-bindings 0.2.7 does not expose virtio_console.h constants
/// (the crate's per-arch `bindings/` tree only carries blk/config/
/// gpu/ids/input/mmio/net/ring/scsi), so the spec-defined value lives
/// here as a single source of truth.
///
/// NOTE(review): the uapi header defines this as a bit *index*
/// (F_SIZE = 0, F_MULTIPORT = 1, F_EMERG_WRITE = 2), not a mask —
/// confirm every use shifts (`1 << VIRTIO_CONSOLE_F_MULTIPORT`)
/// rather than OR-ing the raw value into the feature word.
const VIRTIO_CONSOLE_F_MULTIPORT: u32 = 1;
use ;
use ;
use ;
use EventFd;
use ;
/// Magic value of a virtio-mmio device: the ASCII bytes "virt" packed
/// as a little-endian `u32` (low byte 0x76 = 'v', high byte 0x74 = 't').
/// NOTE(review): presumably what a read of the MagicValue register
/// returns (virtio-v1.2 §4.2.2) — the read path is outside this section.
const MMIO_MAGIC: u32 = 0x7472_6976; // "virt" in LE
/// virtio-mmio transport version; 2 is the virtio 1.x (non-legacy)
/// register layout.
const MMIO_VERSION: u32 = 2; // virtio 1.x MMIO
/// Vendor id constant, fixed at zero.
/// NOTE(review): presumably surfaced through the VendorID register —
/// confirm against the MMIO read handler.
const VENDOR_ID: u32 = 0;
/// MMIO region size: 4 KB (one page).
pub const VIRTIO_MMIO_SIZE: u64 = 0x1000;
/// RX wake byte: host requested a SysRq-D dump. The guest's
/// `hvc0_poll_loop` blocks on `/dev/hvc0`, scans every drained byte
/// for this value, and triggers SysRq-D directly via
/// `/proc/sysrq-trigger` when it is observed. Distinct from
/// `SIGNAL_VC_SHUTDOWN`, `SIGNAL_BPF_WRITE_DONE` and
/// `SIGNAL_ACCESSOR_READY` so stack traces and tcpdump-style captures
/// can distinguish the trigger source.
pub const SIGNAL_VC_DUMP: u8 = 0xD1;
/// RX wake byte: host pushed a graceful-shutdown request through
/// the virtio-console RX queue.
pub const SIGNAL_VC_SHUTDOWN: u8 = 0xD3;
/// RX wake byte: host's `bpf-map-write` thread finished applying
/// every queued `bpf_map_write` to the BPF maps inside the guest's
/// kernel. The guest's `hvc0_poll_loop` recognises the byte and
/// sets the `bpf_map_write_done` latch so a scenario blocked on
/// [`crate::scenario::Ctx::wait_for_map_write`] resumes. Replaces
/// the legacy SHM signal-slot rendezvous (host writes slot 0, guest
/// blocks on slot 0) with a virtio-console wake byte. Host side:
/// `host_comms::request_bpf_map_write_done`.
pub const SIGNAL_BPF_WRITE_DONE: u8 = 0xBF;
/// RX wake byte: the host freeze coordinator has ADOPTED its
/// kernel-symbol accessor (`owned_accessor` is now `Some`), so a
/// failure dump captured from this point renders real BPF map values
/// instead of placeholders. The guest's `hvc0_poll_loop` recognises
/// the byte and sets the `accessor_ready` latch so a scenario blocked
/// in [`crate::scenario::ops::await_accessor_ready`] resumes and
/// triggers its stall only once the dump path is fully armed. Host side:
/// `host_comms::request_accessor_ready`, pushed at the coordinator's
/// accessor-adoption point.
pub const SIGNAL_ACCESSOR_READY: u8 = 0xAC;
// `NUM_PORTS` lives in [`super::wire`]; re-exported here so existing
// call sites keep working. Port 0 = console (hvc0); port 1 = bulk
// TLV stream (`/dev/vport0p1`); port 2 = scheduler-stats relay
// (`/dev/vport0p2`). Three ports → eight queues per virtio-v1.2 §5.3.5
// (`2 + 2 * num_ports`).
pub use NUM_PORTS;
/// Total number of virtqueues: the two control queues plus one
/// rx/tx pair per port (`2 + 2 * NUM_PORTS`), i.e. eight for the
/// three ports described above.
const NUM_QUEUES: usize = 2 + 2 * NUM_PORTS as usize;
/// Largest queue size (in descriptors) this device supports.
/// NOTE(review): presumably the per-queue maximum reported to the
/// driver — confirm against the MMIO read handler.
const QUEUE_MAX_SIZE: u16 = 256;
// Per port_id_to_queue_idx in libkrun (mirrored here):
// port 0: rx=0, tx=1
// control: c_ivq=2 (host→guest), c_ovq=3 (guest→host)
// port N>=1: rx = 2+2N, tx = 2+2N+1
// So port 1: rx=4, tx=5; port 2: rx=6, tx=7.
const PORT0_RXQ: usize = 0; // host→guest, port 0 (console)
const PORT0_TXQ: usize = 1; // guest→host, port 0 (console)
const C_IVQ: usize = 2; // host pushes control msgs to guest
const C_OVQ: usize = 3; // guest sends control msgs to host
const PORT1_RXQ: usize = 4; // host→guest, port 1 (bulk TLV port)
const PORT1_TXQ: usize = 5; // guest→host, port 1 (bulk TLV stream)
const PORT2_RXQ: usize = 6; // host→guest, port 2 (scheduler-stats requests)
const PORT2_TXQ: usize = 7; // guest→host, port 2 (scheduler-stats responses)
/// Maximum bytes accepted from a single TX descriptor. The kernel's
/// virtio-console driver sends PAGE_SIZE chunks; this cap prevents a
/// malformed descriptor (len=0xFFFFFFFF) from triggering a ~4GB alloc.
const TX_DESC_MAX: usize = 32 * 1024;
/// Maximum cumulative bytes accepted by a single `process_tx`
/// call. The per-descriptor `TX_DESC_MAX` cap bounds individual
/// descriptors, but a hostile guest can publish thousands of valid
/// descriptors back-to-back and grow the per-port `tx_buf` without
/// bound. Capping the per-call drain at 256 KiB keeps the per-vCPU
/// MMIO-handler latency budget bounded — once the cap is hit we
/// stop popping chains and let the next QUEUE_NOTIFY drain the
/// rest. Backpressure on the guest's TX queue is the natural
/// consequence: a chain that has not been add_used yet stays in
/// the avail ring for the next call.
const TX_PER_CALL_MAX: usize = 256 * 1024;
/// Maximum control-queue chains drained per `process_control_tx`
/// call. The c_ovq's payload is a fixed 8-byte
/// `VirtioConsoleControl` frame — a hostile guest publishing
/// thousands of small chains would otherwise let one notify hold the
/// vCPU thread in `process_control_tx` for an unbounded duration and
/// grow the `events` Vec without bound. Mirrors the TX byte-cap
/// pattern: chains beyond the cap stay in the avail ring for the
/// next QUEUE_NOTIFY. 32 is enough headroom for the legitimate
/// handshake (DEVICE_READY + per-port PORT_READY + per-port
/// PORT_OPEN = 1 + 3 + 3 = ~7 events with three ports) with margin
/// while still bounding the adversarial case.
const CONTROL_CHAINS_PER_CALL_MAX: usize = 32;
/// Maximum host→guest RX chains drained per `drain_pending_rx`
/// call. Unlike TX (byte-driven via `TX_PER_CALL_MAX`), RX progress
/// is chain-shaped: each chain absorbs
/// `min(pending_rx_len, sum_of_write_only_desc_lens)` bytes. A
/// hostile guest publishing many zero-length write-only descriptors
/// (or chains lacking any write-only desc — unusual but legal)
/// makes `consumed_offset` stay 0; the `drain(..0)` at the bottom
/// of the loop is then a no-op and the outer
/// `while !pending_rx.is_empty()` reissues `pop_descriptor_chain`
/// without progress until the avail ring is exhausted. With
/// `QUEUE_MAX_SIZE = 256` chains × 256 descriptors per chain that's
/// ~65k iterations per notify, parked on a vCPU thread that is
/// expected to bound MMIO-handler latency. Cap drains at 64 chains
/// per call: legitimate traffic posts a small number of multi-KB
/// chains (kernel virtio-console driver allocates PAGE_SIZE buffers
/// per chain for hvc0, larger for `/dev/vport0p1`); 64 is well
/// above any single-notify legitimate fan-out while still bounding
/// the adversarial latency. Remaining chains stay in the avail
/// ring for the next QUEUE_NOTIFY (or the next host-side push).
const RX_CHAINS_PER_CALL_MAX: usize = 64;
/// Status bits required before each phase. The constants are
/// cumulative: each one ORs a further `VIRTIO_CONFIG_S_*` bit onto
/// the previous constant (ACKNOWLEDGE → DRIVER → FEATURES_OK →
/// DRIVER_OK).
///
/// ACKNOWLEDGE only.
const S_ACK: u32 = VIRTIO_CONFIG_S_ACKNOWLEDGE;
/// ACKNOWLEDGE | DRIVER.
const S_DRV: u32 = S_ACK | VIRTIO_CONFIG_S_DRIVER;
/// ACKNOWLEDGE | DRIVER | FEATURES_OK.
const S_FEAT: u32 = S_DRV | VIRTIO_CONFIG_S_FEATURES_OK;
/// Test helper — terminal state bits with DRIVER_OK set
/// (ACKNOWLEDGE | DRIVER | FEATURES_OK | DRIVER_OK).
const S_OK: u32 = S_FEAT | VIRTIO_CONFIG_S_DRIVER_OK;
// ----- virtio-console control protocol -----------------------------
//
// `VirtioConsoleControl` and its u16 event discriminants live in
// [`super::wire`]. The constants here are convenience aliases for
// `ControlEvent::*.wire_value()` so the existing call sites read
// the same as the kernel uapi names; new code should prefer the
// typed `ControlEvent` enum directly. The wire format is 8 bytes
// little-endian: id (u32), event (u16), value (u16). LE on the
// wire is x86_64 / aarch64 native.
pub use VirtioConsoleControl;
/// Wire value of the DEVICE_READY control event (sent guest→host on
/// the c_ovq).
pub const VIRTIO_CONSOLE_DEVICE_READY: u16 = DeviceReady.wire_value;
/// Wire value of the PORT_ADD control event (sent host→guest on the
/// c_ivq).
pub const VIRTIO_CONSOLE_PORT_ADD: u16 = PortAdd.wire_value;
// PORT_REMOVE and RESIZE are kernel uapi event ids the lib does not
// yet generate or consume — kept as named constants so the public
// surface mirrors `enum virtio_console_event` 1:1. `#[allow(dead_code)]`
// matches the `KVM_INTERESTING_STATS` pattern in `result.rs`.
/// Wire value of the PORT_REMOVE control event (currently unused).
pub const VIRTIO_CONSOLE_PORT_REMOVE: u16 = PortRemove.wire_value;
/// Wire value of the PORT_READY control event (sent guest→host on the
/// c_ovq).
pub const VIRTIO_CONSOLE_PORT_READY: u16 = PortReady.wire_value;
/// Wire value of the CONSOLE_PORT control event.
pub const VIRTIO_CONSOLE_CONSOLE_PORT: u16 = ConsolePort.wire_value;
/// Wire value of the RESIZE control event (currently unused).
pub const VIRTIO_CONSOLE_RESIZE: u16 = Resize.wire_value;
/// Wire value of the PORT_OPEN control event (host→guest on the
/// c_ivq; acknowledged guest→host on the c_ovq).
pub const VIRTIO_CONSOLE_PORT_OPEN: u16 = PortOpen.wire_value;
/// Wire value of the PORT_NAME control event.
pub const VIRTIO_CONSOLE_PORT_NAME: u16 = PortName.wire_value;
/// Byte length of one `VirtioConsoleControl` wire frame.
/// NOTE(review): presumably 8, matching the id (u32) + event (u16) +
/// value (u16) layout described above — confirm.
const VC_CONTROL_SIZE: usize = ;
// Compile-time assertion.
// NOTE(review): presumably pins the control-frame size — confirm.
const _: = assert!;
// `PORT1_NAME` and `PORT2_NAME` live in [`super::wire`]; re-exported
// here for the existing call sites in this module.
pub use PORT1_NAME;
pub use PORT2_NAME;
/// Port-0 device name advertised to the guest.
///
/// The kernel's `handle_control_message` PORT_NAME case
/// (drivers/char/virtio_console.c) creates the sysfs
/// `/sys/class/virtio-ports/vport0p0/name` attribute when the host
/// sends PORT_NAME; without that emission the attribute does not
/// exist, and tooling that scans `/sys/class/virtio-ports/*/name`
/// cannot tell port 0 (console) apart from port 1 (bulk).
/// QEMU's `add_port` (hw/char/virtio-console.c)
/// sets a name on the chardev (`chardev-id` derived) and the
/// virtio-serial PORT_NAME emission in
/// `virtio_serial_post_load_timer_cb` / `send_control_event`
/// emits it for every port that has one — including the console
/// port. Mirror that here.
pub const PORT0_NAME: &str = "ktstr-console";
/// Outbound (host→guest) control payload kinds. The host serialises
/// these into 8-byte wire frames (plus optional name bytes) for the
/// c_ivq.
/// Per-port state for the three virtio-console ports. Indexed by
/// `port_id` (0 = console / hvc0, 1 = bulk TLV stream / vport0p1,
/// 2 = scheduler-stats relay / vport0p2).
///
/// `tx_buf` accumulates guest→host TX bytes pending host drain;
/// `pending_rx` accumulates host→guest RX bytes pending delivery
/// into the guest's RX ring. Both are unbounded by design — the host
/// alone produces RX bytes (so a hostile guest cannot grow
/// `pending_rx`), and TX bytes are bounded per-call by
/// `TX_PER_CALL_MAX`.
/// Map a queue index to (port_id, is_tx). Returns None for the
/// control queues (C_IVQ / C_OVQ) and any out-of-range index.
const
/// Map a port id to its (rxq, txq) queue indices. Inverse of
/// `queue_to_port` for the data direction.
const
/// Static log-friendly label for a port id. Used in tracing fields
/// across the TX / RX / reset-drain paths so structured logs carry a
/// stable port identifier without per-call allocation.
const
/// Three-port virtio-console MMIO device.
///
/// Single-struct state — no separate transport layer. Caller holds
/// this in a `PiMutex` and dispatches MMIO reads/writes from the vCPU
/// run loop.