ktstr 0.14.0

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
//! MP table setup for boot (single-CPU and SMP topologies).
//! The kernel reads this to discover CPUs and their APIC IDs.
//! Uses topology-aware APIC IDs for multi-LLC support.
use anyhow::{Context, Result};
use vm_memory::{Address, Bytes, GuestAddress, GuestMemoryMmap};

use super::kvm::MAX_XAPIC_ID;
use super::topology::{apic_id, max_apic_id};
use crate::vmm::topology::Topology;

// Guest-physical address where the MP floating pointer structure is written.
// The MPC table header and its entries follow it back to back.
const MPTABLE_START: u64 = 0x9fc00;

// MP table signatures
// `SMP_MAGIC` opens the MP floating pointer structure; `MPC_MAGIC` opens the
// MPC table header.
const SMP_MAGIC: [u8; 4] = *b"_MP_";
const MPC_MAGIC: [u8; 4] = *b"PCMP";

// MP table entry types
// Stored in byte 0 of every MPC table entry to identify its kind.
const MP_PROCESSOR: u8 = 0;
const MP_BUS: u8 = 1;
const MP_IOAPIC: u8 = 2;
const MP_INTSRC: u8 = 3;
const MP_LINTSRC: u8 = 4;

// CPU flags
// OR-ed into byte 3 of a processor entry: every CPU is marked enabled and
// CPU 0 is additionally marked as the bootstrap processor.
const CPU_ENABLED: u8 = 0x01;
const CPU_BSP: u8 = 0x02;

// Versions/constants
// `APIC_VERSION` is written into both the processor entries and the IOAPIC
// entry. `IO_APIC_ID` identifies the IOAPIC in its own entry and is the
// destination APIC of every I/O interrupt source entry. `IO_APIC_ADDR` is
// the IOAPIC address stored (little-endian) in the IOAPIC entry.
const APIC_VERSION: u8 = 0x14;
const IO_APIC_ID: u8 = 0xfe;
const IO_APIC_ADDR: u32 = 0xfec0_0000;

/// Reports whether a legacy MP table can describe `topo` faithfully.
///
/// A processor entry in the MP table has room for only one byte of APIC
/// ID, so the table is limited to APIC IDs up to 254 (`MAX_XAPIC_ID`).
/// Larger IDs would be truncated and collide with smaller ones (256 would
/// become 0 and alias the BSP), which would make the table lie about the
/// machine. On an ACPI guest MADT is the SMP authority anyway, so wide
/// topologies get no MP table and boot SMP from MADT alone; for topologies
/// within the limit the table remains a valid `acpi=off` SMP fallback.
///
/// Returns `true` when the largest APIC ID in `topo` does not exceed
/// `MAX_XAPIC_ID`.
pub fn mptable_applies(topo: &Topology) -> bool {
    let highest_apic_id = max_apic_id(topo);
    highest_apic_id <= MAX_XAPIC_ID
}

/// Build the legacy MP table for `topo` and store it in guest memory.
///
/// Layout, starting at `MPTABLE_START`: the 16-byte MP floating pointer,
/// the 44-byte MPC table header, one 20-byte processor entry per CPU
/// (carrying the topology-computed APIC ID so the kernel sees the correct
/// LLC/core/thread structure), one ISA bus entry, one IOAPIC entry, 24 I/O
/// interrupt source entries and 2 local interrupt source entries. The
/// header is written last because its length field and checksum depend on
/// the entries that follow it.
///
/// Does nothing and returns `Ok(())` when `mptable_applies(topo)` is false
/// (max APIC ID > 254): the table's one-byte APIC ID cannot address those
/// CPUs, and MADT is the SMP authority on an ACPI guest.
///
/// # Errors
///
/// Returns an error if any guest-memory write, or the read-back used for
/// the table checksum, fails.
pub fn setup_mptable(mem: &GuestMemoryMmap, topo: &Topology) -> Result<()> {
    // Sizes in bytes of the structures laid out below.
    const MPF_LEN: u64 = 16;
    const MPC_HEADER_LEN: u64 = 44;
    const CPU_ENTRY_LEN: u64 = 20;
    // Bus, IOAPIC, interrupt and local-interrupt entries are all 8 bytes.
    const SHORT_ENTRY_LEN: u64 = 8;
    // Number of I/O interrupt source entries: legacy GSI IRQs 0..23.
    const NUM_IRQS: u32 = 24;
    // Local APIC address advertised in the MPC table header.
    const LOCAL_APIC_ADDR: u32 = 0xfee0_0000;

    /// Wrapping 8-bit sum of `bytes`; both MP checksums are derived from it.
    fn byte_sum(bytes: &[u8]) -> u8 {
        bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
    }

    if !mptable_applies(topo) {
        return Ok(());
    }

    // NOTE(review): the table occupies 284 + 20 * cpu_count bytes, so with
    // more than 37 CPUs it extends past 0xA0000 — confirm the guest memory
    // map accounts for that.
    let cpu_count = topo.total_cpus();
    let mpf_addr = GuestAddress(MPTABLE_START);
    let mpc_addr = mpf_addr.unchecked_add(MPF_LEN);

    // --- MP floating pointer structure ---
    let mut mpf = [0u8; MPF_LEN as usize];
    mpf[..4].copy_from_slice(&SMP_MAGIC);
    // Physical address of the MPC table, which directly follows the MPF.
    mpf[4..8].copy_from_slice(&(mpc_addr.raw_value() as u32).to_le_bytes());
    mpf[8] = 1; // length, in 16-byte units
    mpf[9] = 4; // spec revision
    // Feature byte 1 (index 11) stays 0: a custom MP configuration table is
    // present. Feature byte 2 bit 7: IMCR present -> use APIC mode (required
    // for SMP).
    mpf[12] = 0x80;
    // The checksum byte is still zero here; store the value that makes the
    // whole structure sum to zero.
    mpf[10] = byte_sum(&mpf).wrapping_neg();
    mem.write_slice(&mpf, mpf_addr).context("write mpf")?;

    // Leave room for the MPC header; it is filled in once the entries exist.
    let mut cursor = mpc_addr.unchecked_add(MPC_HEADER_LEN);

    // --- Processor entries ---
    for cpu in 0..cpu_count {
        let mut rec = [0u8; CPU_ENTRY_LEN as usize];
        rec[0] = MP_PROCESSOR;
        // The MP table has 8-bit APIC IDs. The early return above guarantees
        // max APIC ID <= 254, so this narrowing never drops bits.
        rec[1] = apic_id(topo, cpu) as u8;
        rec[2] = APIC_VERSION;
        // Every CPU is enabled; only CPU 0 is the bootstrap processor.
        rec[3] = if cpu == 0 {
            CPU_ENABLED | CPU_BSP
        } else {
            CPU_ENABLED
        };
        // CPU signature (stepping)
        rec[4..8].copy_from_slice(&0x0600u32.to_le_bytes());
        // Feature flags (FPU + APIC)
        rec[8..12].copy_from_slice(&0x0201u32.to_le_bytes());
        mem.write_slice(&rec, cursor).context("write mpc_cpu")?;
        cursor = cursor.unchecked_add(CPU_ENTRY_LEN);
    }

    // --- Bus entry: bus ID 0, six-byte space-padded type string "ISA" ---
    let bus = [MP_BUS, 0, b'I', b'S', b'A', b' ', b' ', b' '];
    mem.write_slice(&bus, cursor).context("write mpc_bus")?;
    cursor = cursor.unchecked_add(SHORT_ENTRY_LEN);

    // --- IOAPIC entry: id, version, enabled flag, little-endian address ---
    let [a0, a1, a2, a3] = IO_APIC_ADDR.to_le_bytes();
    let ioapic = [MP_IOAPIC, IO_APIC_ID, APIC_VERSION, 0x01, a0, a1, a2, a3];
    mem.write_slice(&ioapic, cursor)
        .context("write mpc_ioapic")?;
    cursor = cursor.unchecked_add(SHORT_ENTRY_LEN);

    // --- I/O interrupt source entries: bus IRQ n -> IOAPIC input n ---
    for pin in 0u8..NUM_IRQS as u8 {
        // [type, INT type 0, flags (2 bytes), bus ID, bus IRQ, dest APIC, INTIN]
        let intsrc = [MP_INTSRC, 0, 0, 0, 0, pin, IO_APIC_ID, pin];
        mem.write_slice(&intsrc, cursor)
            .context("write mpc_intsrc")?;
        cursor = cursor.unchecked_add(SHORT_ENTRY_LEN);
    }

    // --- Local interrupt source entries: LINT0 and LINT1 ---
    for lint_pin in 0u8..2 {
        // LINT0 is ExtINT (3), LINT1 is NMI (1).
        let int_type = match lint_pin {
            0 => 3,
            _ => 1,
        };
        // Destination APIC 0xff addresses all local APICs.
        let lintsrc = [MP_LINTSRC, int_type, 0, 0, 0, 0, 0xff, lint_pin];
        mem.write_slice(&lintsrc, cursor)
            .context("write mpc_lintsrc")?;
        cursor = cursor.unchecked_add(SHORT_ENTRY_LEN);
    }

    // --- MPC table header ---
    let table_end = cursor;
    let entries_addr = mpc_addr.unchecked_add(MPC_HEADER_LEN);
    let entries_bytes =
        (table_end.raw_value() - mpc_addr.raw_value() - MPC_HEADER_LEN) as u16;
    // Base table length covers the header plus all entries.
    let base_table_len = (MPC_HEADER_LEN as u16) + entries_bytes;
    // cpus + bus + ioapic + intsrcs + lintsrcs
    let entry_count = cpu_count + 1 + 1 + NUM_IRQS + 2;

    let mut header = [0u8; MPC_HEADER_LEN as usize];
    header[..4].copy_from_slice(&MPC_MAGIC);
    header[4..6].copy_from_slice(&base_table_len.to_le_bytes());
    header[6] = 4; // spec revision
    // header[7] is the checksum, filled in below.
    // MP spec OEM ID: 8 bytes, space-padded (vs ACPI's 6-byte null-terminated).
    header[8..16].copy_from_slice(b"KTSTR   ");
    // Product ID: 12-byte field filled with ASCII '0' characters (no product
    // identifier assigned).
    header[16..28].copy_from_slice(b"000000000000");
    // OEM table pointer [28..32] and OEM table size [32..34] stay 0.
    header[34..36].copy_from_slice(&(entry_count as u16).to_le_bytes());
    header[36..40].copy_from_slice(&LOCAL_APIC_ADDR.to_le_bytes());
    // Extended table length [40..42] and extended checksum [42] stay 0.

    // The checksum spans the header and every entry, so read the entries
    // back from guest memory and fold them in with the header bytes.
    let entries_len = (table_end.raw_value() - entries_addr.raw_value()) as usize;
    let mut written_entries = vec![0u8; entries_len];
    mem.read_slice(&mut written_entries, entries_addr)
        .context("read entries for checksum")?;
    let table_sum = byte_sum(&header).wrapping_add(byte_sum(&written_entries));
    header[7] = table_sum.wrapping_neg();

    mem.write_slice(&header, mpc_addr)
        .context("write mpc_table header")?;

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Guest address of the MPC table header: right after the 16-byte MPF.
    const MPC_HEADER_ADDR: u64 = MPTABLE_START + 16;
    /// Guest address of the first processor entry: after the 44-byte header.
    const FIRST_CPU_ENTRY_ADDR: u64 = MPC_HEADER_ADDR + 44;

    /// Expands to a `Topology` with the given LLC / cores-per-LLC /
    /// threads-per-core counts, one NUMA node and no explicit node or
    /// distance tables. A macro rather than a function so the field types
    /// do not have to be restated here.
    macro_rules! topology {
        ($llcs:expr, $cores_per_llc:expr, $threads_per_core:expr) => {
            Topology {
                llcs: $llcs,
                cores_per_llc: $cores_per_llc,
                threads_per_core: $threads_per_core,
                numa_nodes: 1,
                nodes: None,
                distances: None,
            }
        };
    }

    /// Guest memory of `mib` MiB, mapped at guest address 0.
    fn test_mem(mib: u32) -> GuestMemoryMmap {
        let size_bytes = (mib as usize) << 20;
        let ranges = [(GuestAddress(0), size_bytes)];
        GuestMemoryMmap::<()>::from_ranges(&ranges).unwrap()
    }

    /// Reads `N` bytes of guest memory starting at `addr`.
    fn read_bytes<const N: usize>(mem: &GuestMemoryMmap, addr: GuestAddress) -> [u8; N] {
        let mut buf = [0u8; N];
        mem.read_slice(&mut buf, addr).unwrap();
        buf
    }

    /// Wrapping 8-bit sum of `bytes`; a valid MP structure sums to zero.
    fn byte_sum(bytes: &[u8]) -> u8 {
        bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
    }

    #[test]
    fn mptable_single_cpu() {
        let mem = test_mem(16);
        setup_mptable(&mem, &topology!(1, 1, 1)).unwrap();

        // The MP floating pointer must start with its signature.
        let magic: [u8; 4] = read_bytes(&mem, GuestAddress(MPTABLE_START));
        assert_eq!(&magic, b"_MP_");
    }

    #[test]
    fn mptable_multi_llc() {
        let mem = test_mem(16);
        setup_mptable(&mem, &topology!(2, 2, 2)).unwrap();

        let magic: [u8; 4] = read_bytes(&mem, GuestAddress(MPTABLE_START));
        assert_eq!(&magic, b"_MP_");

        // The MPC table header must start with its own signature.
        let mpc_magic: [u8; 4] = read_bytes(&mem, GuestAddress(MPC_HEADER_ADDR));
        assert_eq!(&mpc_magic, b"PCMP");
    }

    #[test]
    fn mptable_mpf_checksum() {
        let mem = test_mem(16);
        setup_mptable(&mem, &topology!(1, 2, 1)).unwrap();

        let mpf: [u8; 16] = read_bytes(&mem, GuestAddress(MPTABLE_START));
        assert_eq!(byte_sum(&mpf), 0, "MPF checksum must be zero");
    }

    #[test]
    fn mptable_header_checksum() {
        let mem = test_mem(16);
        setup_mptable(&mem, &topology!(1, 4, 1)).unwrap();

        // The base table length lives at offset 4 of the header.
        let header_addr = GuestAddress(MPC_HEADER_ADDR);
        let len_bytes: [u8; 2] = read_bytes(&mem, header_addr.unchecked_add(4));
        let table_len = u16::from_le_bytes(len_bytes) as usize;

        // Header plus entries together must sum to zero.
        let mut table = vec![0u8; table_len];
        mem.read_slice(&mut table, header_addr).unwrap();
        assert_eq!(byte_sum(&table), 0, "MPC table checksum must be zero");
    }

    #[test]
    fn mptable_cpu_apic_ids_match_topology() {
        let mem = test_mem(16);
        let topo = topology!(2, 2, 1);
        setup_mptable(&mem, &topo).unwrap();

        // Processor entries are 20 bytes each and packed back to back.
        let first_entry = GuestAddress(FIRST_CPU_ENTRY_ADDR);
        for i in 0..topo.total_cpus() {
            let entry: [u8; 20] = read_bytes(&mem, first_entry.unchecked_add(i as u64 * 20));
            assert_eq!(entry[0], MP_PROCESSOR, "entry type should be processor");
            let id = entry[1];
            let expected = apic_id(&topo, i) as u8;
            assert_eq!(id, expected, "CPU {i}: APIC ID {id} != expected {expected}");
        }
    }

    #[test]
    fn mptable_bsp_flagged() {
        let mem = test_mem(16);
        setup_mptable(&mem, &topology!(2, 2, 1)).unwrap();

        let first_entry = GuestAddress(FIRST_CPU_ENTRY_ADDR);

        // CPU 0 is the bootstrap processor and is enabled.
        let cpu0: [u8; 20] = read_bytes(&mem, first_entry);
        assert_ne!(cpu0[3] & CPU_BSP, 0, "CPU 0 should be BSP");
        assert_ne!(cpu0[3] & CPU_ENABLED, 0, "CPU 0 should be enabled");

        // CPU 1 is enabled but must not carry the BSP flag.
        let cpu1: [u8; 20] = read_bytes(&mem, first_entry.unchecked_add(20));
        assert_eq!(cpu1[3] & CPU_BSP, 0, "CPU 1 should not be BSP");
        assert_ne!(cpu1[3] & CPU_ENABLED, 0, "CPU 1 should be enabled");
    }

    #[test]
    fn mptable_large_topology_240_cpus() {
        let mem = test_mem(2048);
        let topo = topology!(15, 8, 2);
        assert_eq!(topo.total_cpus(), 240);
        // max APIC ID = apic_id(239) = 14<<4 | 7<<1 | 1 = 239 <= 254, so the
        // MP table applies and is emitted with a valid signature.
        assert!(max_apic_id(&topo) <= MAX_XAPIC_ID);
        assert!(mptable_applies(&topo));

        setup_mptable(&mem, &topo).unwrap();

        let magic: [u8; 4] = read_bytes(&mem, GuestAddress(MPTABLE_START));
        assert_eq!(&magic, b"_MP_");
    }

    /// `mptable_applies` is exactly the negation of the split-irqchip
    /// condition (`max_apic_id > MAX_XAPIC_ID`), so it tracks `setup_mptable`
    /// suppression. Checked with one topology within the limit and one
    /// above it.
    #[test]
    fn mptable_applies_matches_xapic_limit() {
        // 15 LLCs x 8 cores x 2 threads: max APIC ID = 239 <= 254 -> applies.
        let within_limit = topology!(15, 8, 2);
        assert!(max_apic_id(&within_limit) <= MAX_XAPIC_ID);
        assert!(mptable_applies(&within_limit));

        // 14 LLCs x 9 cores x 2 threads: max APIC ID = 433 > 254 -> suppressed.
        let wide = topology!(14, 9, 2);
        assert!(max_apic_id(&wide) > MAX_XAPIC_ID);
        assert!(!mptable_applies(&wide));
    }

    #[test]
    fn mptable_suppressed_above_xapic_limit() {
        let mem = test_mem(4096);
        // 14 LLCs x 9 cores x 2 threads = 252 vCPUs, max APIC ID = 433 > 254.
        // The MP table's u8 APIC ID can't address those CPUs; setup_mptable
        // must be a no-op so no spec-lie table is left in guest memory (MADT
        // is the SMP authority on this path).
        let topo = topology!(14, 9, 2);
        assert_eq!(topo.total_cpus(), 252);
        assert!(max_apic_id(&topo) > MAX_XAPIC_ID);
        assert!(!mptable_applies(&topo));

        // Pre-fill the MP-table region with a sentinel so a no-op is provable
        // independent of the backing's initial contents: if setup_mptable
        // wrote anything (e.g. the _MP_ signature) the sentinel would change.
        let sentinel = [0xABu8; 16];
        mem.write_slice(&sentinel, GuestAddress(MPTABLE_START))
            .unwrap();

        setup_mptable(&mem, &topo).unwrap();

        let after: [u8; 16] = read_bytes(&mem, GuestAddress(MPTABLE_START));
        assert_eq!(&after, &sentinel, "no MP table should be written");
    }
}