ktstr 0.5.2

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
//! Shared fixtures for per-module `#[cfg(test)]` blocks.
//!
//! This file is compiled only under `#[cfg(test)]`. Each fixture
//! lives here because it is used by two or more submodule test
//! blocks; single-use helpers stay co-located with their sole
//! consumer.
//!
//! Nothing in this module is exposed outside the crate — all items
//! are `pub(crate)` and the module itself is `#[cfg(test)]` so they
//! disappear from non-test builds entirely.

#![cfg(test)]

use std::ffi::{OsStr, OsString};
use std::sync::{Mutex, MutexGuard};
use std::time::Duration;

use anyhow::Result;
use tempfile::TempDir;

use crate::assert::{AssertDetail, AssertResult, ScenarioStats};
use crate::scenario::Ctx;

use super::entry::{KtstrTestEntry, Scheduler, SchedulerSpec, TopologyConstraints};
use crate::vmm::topology::Topology;

/// Serializes tests that mutate env vars. Shared across every
/// `#[cfg(test)]` module in the crate: nextest runs tests in
/// parallel within a binary, and `std::env::set_var` is process-wide,
/// so any test that mutates an env var must hold this mutex for its
/// full save/mutate/restore window.
pub(crate) static ENV_LOCK: Mutex<()> = Mutex::new(());

/// Lock [`ENV_LOCK`] for the lifetime of a test, recovering from
/// poisoning. If a previous test panicked while holding the lock we
/// still want the current test to run: env-touching tests establish
/// no shared invariant beyond their own save/restore pair, so the
/// poisoned inner guard is safe to take.
pub(crate) fn lock_env() -> MutexGuard<'static, ()> {
    ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner())
}

/// Tempdir bound as the process-wide `KTSTR_CACHE_DIR`. While the
/// returned value is live:
///   * the temp directory exists on disk and is pointed at by
///     `KTSTR_CACHE_DIR`;
///   * on drop, the env var's previous value is restored and the
///     directory is removed.
///
/// Field order is load-bearing: Rust drops fields in declaration
/// order, so `_guard` drops first (restoring `KTSTR_CACHE_DIR` to
/// whatever the test process saw before construction) and `tmp`
/// drops second (removing the directory). Reversing the order
/// would leave `KTSTR_CACHE_DIR` pointing at a deleted tempdir
/// during the window between `tmp`'s and `_guard`'s drop calls —
/// a dangling-env-ref hazard in any code that races with the
/// tail of the drop sequence.
///
/// Callers that also mutate other env vars must hold [`lock_env`]
/// for the guard's full lifetime so the save/restore pair does not
/// race with another test in the same binary.
pub(crate) struct IsolatedCacheDir {
    _guard: EnvVarGuard,
    tmp: TempDir,
}

impl IsolatedCacheDir {
    /// The temp directory's root path.
    pub(crate) fn path(&self) -> &std::path::Path {
        self.tmp.path()
    }
}

/// Create a fresh tempdir and point `KTSTR_CACHE_DIR` at it. See
/// [`IsolatedCacheDir`] for drop semantics.
pub(crate) fn isolated_cache_dir() -> IsolatedCacheDir {
    let tmp = TempDir::new().expect("tempdir for isolated cache root");
    let guard = EnvVarGuard::set("KTSTR_CACHE_DIR", tmp.path());
    IsolatedCacheDir { _guard: guard, tmp }
}

/// True when `dir` resolves to a discoverable git repository
/// via `gix::discover`'s upward walk. Useful for tests that
/// need to distinguish "the operator pointed at a non-git
/// tree" (where `gix::discover` returns Err) from "the
/// operator's tempdir happens to be inside a host git checkout
/// (e.g. `/tmp` under a developer's `~/work`)" — `gix::discover`
/// walks parents unconditionally and finds the ancestor `.git`
/// in the second case.
///
/// Production code is fine with the upward-walk semantics —
/// an operator who points `--source` at a tempdir genuinely
/// inside their checkout DOES want the checkout's HEAD as the
/// source identity. The semantic only breaks for tests that
/// MUST exercise the no-repo-found branch; those tests call
/// this helper and skip when the host environment cannot
/// provide isolation. The non-fatal skip is the right call:
/// the production behavior is correct in both cases, the test
/// is the artifact that cares about the ancestor-`.git`
/// outcome.
pub(crate) fn tempdir_resolves_to_ancestor_git(dir: &std::path::Path) -> bool {
    gix::discover(dir).is_ok()
}

/// Shared `Topology` used by `evaluate_vm_result` tests: the
/// 1-numa-1-llc-2-core-1-thread topology is unremarkable on every
/// dimension monitor checks touch, so tests that exercise other
/// fields don't also have to reason about topology effects.
pub(crate) const EVAL_TOPO: Topology = Topology::new(1, 1, 2, 1);

/// Placeholder test function used by `eevdf_entry` / `sched_entry`
/// fixtures. Always returns pass — tests that care about the
/// function's behaviour construct their own `KtstrTestEntry.func`.
pub(crate) fn dummy_test_fn(_ctx: &Ctx) -> Result<AssertResult> {
    Ok(AssertResult::pass())
}

/// RAII guard that mutates an environment variable and restores the
/// original value on drop. Not thread-safe: hold [`ENV_LOCK`] for the
/// guard's full lifetime when another test in the same binary might
/// mutate the same key.
///
/// nextest runs each test in its own process, so simple single-variable
/// tests don't need the lock — the lock is only needed when multiple
/// tests in a single process mutate overlapping keys.
///
/// Keys and saved originals are stored as [`OsString`] so callers
/// can pass `Path`, `PathBuf`, `&OsStr`, or `&str` without a UTF-8
/// bridge — `&str: AsRef<OsStr>` keeps existing string-literal call
/// sites working unchanged. Storage is `OsString` because non-UTF-8
/// env values are valid on Unix and the guard must restore them
/// exactly.
///
/// Shared across cache / remote_cache / anywhere else that needs to
/// rewrite HOME / XDG_CACHE_HOME / KTSTR_CACHE_DIR / ACTIONS_CACHE_URL
/// and friends during a test run. Previously duplicated in
/// `cache::tests` and `remote_cache::tests`.
pub(crate) struct EnvVarGuard {
    key: OsString,
    original: Option<OsString>,
}

impl EnvVarGuard {
    pub(crate) fn set(key: impl AsRef<OsStr>, value: impl AsRef<OsStr>) -> Self {
        let key = key.as_ref().to_owned();
        let original = std::env::var_os(&key);
        // SAFETY: nextest runs each test in its own process; callers
        // that share a key across concurrent tests in the same process
        // must call `lock_env()` before constructing the guard.
        unsafe { std::env::set_var(&key, value) };
        EnvVarGuard { key, original }
    }

    pub(crate) fn remove(key: impl AsRef<OsStr>) -> Self {
        let key = key.as_ref().to_owned();
        let original = std::env::var_os(&key);
        // SAFETY: same rationale as `set`.
        unsafe { std::env::remove_var(&key) };
        EnvVarGuard { key, original }
    }
}

impl Drop for EnvVarGuard {
    fn drop(&mut self) {
        match &self.original {
            // SAFETY: nextest runs each test in its own process.
            Some(val) => unsafe { std::env::set_var(&self.key, val) },
            None => unsafe { std::env::remove_var(&self.key) },
        }
    }
}

/// Build a minimal `KtstrTestEntry` bound to the EEVDF scheduler.
/// Intended for `evaluate_vm_result` tests that exercise the no-scx
/// code path — see the `sched_entry` sibling for the scx variant.
pub(crate) fn eevdf_entry(name: &'static str) -> KtstrTestEntry {
    KtstrTestEntry {
        name,
        func: dummy_test_fn,
        auto_repro: false,
        ..KtstrTestEntry::DEFAULT
    }
}

/// Scheduler used by `sched_entry` to represent a generic scx-style
/// scheduler for evaluate_vm_result tests that need the
/// `has_active_scheduling == true` path.
pub(crate) static SCHED_TEST: Scheduler = Scheduler {
    name: "test_sched",
    binary: SchedulerSpec::Discover("test_sched_bin"),
    sysctls: &[],
    kargs: &[],
    assert: crate::assert::Assert::NO_OVERRIDES,
    cgroup_parent: None,
    sched_args: &[],
    topology: Topology {
        llcs: 1,
        cores_per_llc: 2,
        threads_per_core: 1,
        numa_nodes: 1,
        nodes: None,
        distances: None,
    },
    constraints: TopologyConstraints::DEFAULT,
    config_file: None,
    config_file_def: None,
    kernels: &[],
};

/// Build a minimal `KtstrTestEntry` bound to the scx-style
/// `SCHED_TEST` fixture. Pair to `eevdf_entry` for the scheduler
/// path in evaluate_vm_result tests.
pub(crate) fn sched_entry(name: &'static str) -> KtstrTestEntry {
    KtstrTestEntry {
        name,
        func: dummy_test_fn,
        scheduler: &SCHED_TEST,
        auto_repro: false,
        ..KtstrTestEntry::DEFAULT
    }
}

/// No-op repro probe: returns `None` so evaluate_vm_result tests
/// don't attempt to spawn a second VM while running outside a kernel
/// context.
pub(crate) fn no_repro(_output: &str) -> Option<String> {
    None
}

/// Construct a `VmResult` with explicit output/stderr/exit and
/// default monitor/stimulus/verifier/kvm/crash fields. Used by
/// evaluate_vm_result tests to drive specific error paths without
/// touching an actual VM.
pub(crate) fn make_vm_result(
    output: &str,
    stderr: &str,
    exit_code: i32,
    timed_out: bool,
) -> crate::vmm::VmResult {
    crate::vmm::VmResult {
        success: !timed_out && exit_code == 0,
        exit_code,
        duration: std::time::Duration::from_secs(1),
        timed_out,
        output: output.to_string(),
        stderr: stderr.to_string(),
        monitor: None,
        guest_messages: None,
        stimulus_events: Vec::new(),
        verifier_stats: Vec::new(),
        kvm_stats: None,
        crash_message: None,
        cleanup_duration: None,
        virtio_blk_counters: None,
        virtio_net_counters: None,
        snapshot_bridge: {
            let cb: crate::scenario::snapshot::CaptureCallback = std::sync::Arc::new(|_| None);
            crate::scenario::snapshot::SnapshotBridge::new(cb)
        },
        stats_client: None,
        periodic_fired: 0,
        periodic_target: 0,
    }
}

/// Build an [`AssertResult`] with a default (all-zero)
/// [`ScenarioStats`] payload. Centralizes the 16-field zero-stats
/// skeleton that previously had to be hand-typed at every
/// `evaluate_vm_result` call site.
///
/// Pre-bincode migration: every call site used to wrap this
/// in `serde_json::to_string` and embed the JSON between
/// `RESULT_START` / `RESULT_END` delimiters in `output`. The
/// fallback decoder is gone — call sites now build the full
/// [`crate::vmm::host_comms::BulkDrainResult`] via
/// [`make_vm_result_with_assert`] (or hand-roll a `BulkDrainResult`
/// when they need extra entries beyond the test result).
pub(crate) fn build_assert_result(passed: bool, details: Vec<AssertDetail>) -> AssertResult {
    AssertResult {
        passed,
        skipped: false,
        details,
        stats: ScenarioStats::default(),
        measurements: std::collections::BTreeMap::new(),
    }
}

/// Frame an [`AssertResult`] as a bincode-encoded
/// `MSG_TYPE_TEST_RESULT` TLV entry, ready to drop into
/// [`crate::vmm::host_comms::BulkDrainResult::entries`].
///
/// Mirrors what the guest's
/// [`crate::vmm::guest_comms::send_test_result`] writes onto the
/// wire (modulo the framing header — `BulkDrainResult` carries
/// already-parsed entries, not raw bytes), so a test fixture's
/// entry round-trips through `parse_assert_result_from_drain`'s
/// `bincode::serde::decode_from_slice` exactly as a real guest
/// emission would.
pub(crate) fn assert_result_tlv_entry(result: &AssertResult) -> crate::vmm::wire::ShmEntry {
    let payload = bincode::serde::encode_to_vec(result, bincode::config::standard())
        .expect("AssertResult bincode encode must not fail");
    crate::vmm::wire::ShmEntry {
        msg_type: crate::vmm::wire::MSG_TYPE_TEST_RESULT,
        payload,
        crc_ok: true,
    }
}

/// Construct a [`crate::vmm::VmResult`] whose `guest_messages`
/// carries a single bincode-encoded `MSG_TYPE_TEST_RESULT` TLV
/// entry produced from `assert`. Otherwise identical to
/// [`make_vm_result`]; the explicit `output` / `stderr` / `exit_code`
/// / `timed_out` arguments still drive the COM2-style fields the
/// host-side panic / sched-log scrapers consume.
pub(crate) fn make_vm_result_with_assert(
    output: &str,
    stderr: &str,
    exit_code: i32,
    timed_out: bool,
    assert: &AssertResult,
) -> crate::vmm::VmResult {
    let mut r = make_vm_result(output, stderr, exit_code, timed_out);
    r.guest_messages = Some(crate::vmm::host_comms::BulkDrainResult {
        entries: vec![assert_result_tlv_entry(assert)],
    });
    r
}

/// Build a [`crate::vmm::host_comms::BulkDrainResult`] that carries
/// only `MSG_TYPE_LIFECYCLE` entries — one per supplied
/// [`crate::vmm::wire::LifecyclePhase`], in arrival order. Each entry
/// has a single-byte payload (the phase wire value) and `crc_ok: true`.
///
/// Mirrors the guest-side
/// [`crate::vmm::guest_comms::send_lifecycle`] frame layout for
/// every phase whose payload is the bare 1-byte discriminant
/// (`InitStarted`, `PayloadStarting`, `SchedulerDied`); the
/// `SchedulerNotAttached` reason suffix is not modelled here because
/// none of the current `evaluate_vm_result` / `classify_init_stage`
/// call sites need it.
pub(crate) fn lifecycle_drain(
    phases: &[crate::vmm::wire::LifecyclePhase],
) -> crate::vmm::host_comms::BulkDrainResult {
    let entries = phases
        .iter()
        .map(|p| crate::vmm::wire::ShmEntry {
            msg_type: crate::vmm::wire::MSG_TYPE_LIFECYCLE,
            payload: vec![p.wire_value()],
            crc_ok: true,
        })
        .collect();
    crate::vmm::host_comms::BulkDrainResult { entries }
}

/// Build a `KtstrTestEntry` with overridden memory/duration for
/// `KtstrTestEntry::validate` tests. Everything else inherits from
/// `DEFAULT`.
pub(crate) fn validate_entry(
    name: &'static str,
    memory_mb: u32,
    duration: Duration,
) -> KtstrTestEntry {
    KtstrTestEntry {
        name,
        memory_mb,
        duration,
        ..KtstrTestEntry::DEFAULT
    }
}

// ---------------------------------------------------------------------------
// Panic payload extraction
// ---------------------------------------------------------------------------

/// Convert a `catch_unwind` / `JoinHandle::join` panic payload into a
/// human-readable string, preserving whatever message the panic
/// actually carried.
///
/// The panic payload from `std::panic::catch_unwind` is
/// `Box<dyn Any + Send>`. The two common cases — `&'static str`
/// (from `panic!("literal")`) and `String` (from `panic!("{x}")` /
/// `panic!("{}", msg)`) — require explicit downcast; without them
/// the test-side diagnostic collapses to the useless "Box<Any>"
/// rendering that `panic_any` fall-through produces. A payload
/// carrying some other type drops to a `{:?}` rendering via the
/// `Box<dyn Any>` itself, which at least lets a reader see the
/// payload's type name.
///
/// Ladder (matches the de facto std-library convention used by
/// `Mutex::unwrap_or_else`, `rayon::scope`'s panic diagnostics,
/// etc.):
///
/// 1. `payload.downcast_ref::<&'static str>()` — covers
///    `panic!("literal")`.
/// 2. `payload.downcast_ref::<String>()` — covers
///    `panic!("{x}")` / `panic!("formatted: {}", y)`.
/// 3. Fallback: `format!("{payload:?}")` — any other payload type
///    renders via its `Debug` impl (or the `Box<dyn Any>` Debug
///    impl, which prints `Any { .. }`).
pub(crate) fn panic_payload_to_string(payload: Box<dyn std::any::Any + Send>) -> String {
    if let Some(s) = payload.downcast_ref::<&'static str>() {
        (*s).to_string()
    } else if let Some(s) = payload.downcast_ref::<String>() {
        s.clone()
    } else {
        format!("{payload:?}")
    }
}

#[cfg(test)]
mod panic_payload_tests {
    use super::panic_payload_to_string;

    /// `panic!("str-literal")` panics with a `&'static str` payload.
    /// The helper must recover the literal exactly.
    ///
    /// `#[cfg(panic = "unwind")]`: the test deliberately panics
    /// inside `std::panic::catch_unwind` to exercise the payload
    /// recovery helper. Under `panic = "abort"` (release profile —
    /// see `Cargo.toml [profile.release]`) panics cannot be caught;
    /// the panic aborts the whole test binary instead of returning
    /// an `Err` from `catch_unwind`. Gating the test on the panic
    /// strategy lets `cargo ktstr test --release` skip it cleanly.
    #[test]
    #[cfg(panic = "unwind")]
    fn panic_payload_str_literal_recovered() {
        let result = std::panic::catch_unwind(|| {
            panic!("a-string-literal");
        });
        let payload = result.expect_err("closure must panic");
        assert_eq!(panic_payload_to_string(payload), "a-string-literal");
    }

    /// `panic!("{x}")` with a formatted arg panics with a `String`
    /// payload. The helper must preserve the FORMATTED text, not
    /// drop back to a `Box<dyn Any>` stringification.
    ///
    /// `#[cfg(panic = "unwind")]`: same rationale as the sibling
    /// `panic_payload_str_literal_recovered` test — `catch_unwind`
    /// is unusable under `panic = "abort"`.
    #[test]
    #[cfg(panic = "unwind")]
    fn panic_payload_formatted_string_recovered() {
        let n: u32 = 42;
        let result = std::panic::catch_unwind(|| {
            panic!("formatted-{n}-panic");
        });
        let payload = result.expect_err("closure must panic");
        assert_eq!(panic_payload_to_string(payload), "formatted-42-panic");
    }

    /// A non-string payload (e.g. a numeric value via `panic_any`)
    /// drops to the `Debug` fallback rather than panicking the
    /// helper itself. The exact rendering is not pinned beyond a
    /// non-empty string — the contract is "never panic, always
    /// return SOMETHING useful".
    ///
    /// `#[cfg(panic = "unwind")]`: same rationale as the sibling
    /// `panic_payload_str_literal_recovered` test — `catch_unwind`
    /// is unusable under `panic = "abort"`.
    #[test]
    #[cfg(panic = "unwind")]
    fn panic_payload_non_string_falls_back_to_debug() {
        let result = std::panic::catch_unwind(|| {
            std::panic::panic_any(127i32);
        });
        let payload = result.expect_err("closure must panic");
        let rendered = panic_payload_to_string(payload);
        assert!(
            !rendered.is_empty(),
            "non-string payload must render to SOMETHING (non-empty)",
        );
    }
}

// ---------------------------------------------------------------------------
// EnvVarGuard drop-time restoration tests
// ---------------------------------------------------------------------------
//
// Each test uses a distinct env var name so even when `lock_env()` is
// not held across the full set, the tests don't clobber each other's
// pre-existing values. `lock_env()` still serializes against any other
// env-touching test in the crate so the save/mutate/restore window
// does not race with `HOME` / `XDG_CACHE_HOME` / `KTSTR_CACHE_DIR`
// rewrites in cache / model / sidecar tests.

/// `EnvVarGuard::set` restores the PRE-EXISTING value of the key
/// when the guard drops. Pre-seed the key to a known sentinel,
/// construct a guard that overwrites it, confirm the overwrite is
/// visible, drop the guard, and re-read: the sentinel must be back.
#[test]
fn env_var_guard_set_restores_original_value_on_drop() {
    const KEY: &str = "KTSTR_TEST_ENV_VAR_GUARD_SET_RESTORES_ORIGINAL";
    let _lock = lock_env();
    // SAFETY: process-wide env mutation; we hold `lock_env()` so no
    // other test in this binary is racing a key mutation.
    unsafe { std::env::set_var(KEY, "pre-existing") };
    {
        let _g = EnvVarGuard::set(KEY, "overwritten");
        assert_eq!(std::env::var(KEY).ok().as_deref(), Some("overwritten"));
    }
    assert_eq!(
        std::env::var(KEY).ok().as_deref(),
        Some("pre-existing"),
        "EnvVarGuard::set must restore the pre-existing value on drop"
    );
    // Cleanup so a subsequent test run doesn't inherit our sentinel.
    unsafe { std::env::remove_var(KEY) };
}

/// `EnvVarGuard::set` on a key that is currently absent restores it
/// to absent on drop — the guard remembered `None`, so dropping must
/// `remove_var` rather than re-`set_var` the string `"None"` or a
/// stale previous value.
#[test]
fn env_var_guard_set_restores_absent_key_on_drop() {
    const KEY: &str = "KTSTR_TEST_ENV_VAR_GUARD_SET_RESTORES_ABSENT";
    let _lock = lock_env();
    // Guarantee the key is absent before the guard fires.
    unsafe { std::env::remove_var(KEY) };
    assert!(std::env::var(KEY).is_err(), "setup: key must start absent");
    {
        let _g = EnvVarGuard::set(KEY, "transient");
        assert_eq!(std::env::var(KEY).ok().as_deref(), Some("transient"));
    }
    assert!(
        std::env::var(KEY).is_err(),
        "EnvVarGuard::set on an absent key must restore absence, not leave the value behind"
    );
}

/// `EnvVarGuard::remove` preserves the PRE-EXISTING value across its
/// lifetime: the guard takes the key out of the environment while
/// alive, then puts the original back on drop.
#[test]
fn env_var_guard_remove_restores_original_value_on_drop() {
    const KEY: &str = "KTSTR_TEST_ENV_VAR_GUARD_REMOVE_RESTORES_ORIGINAL";
    let _lock = lock_env();
    unsafe { std::env::set_var(KEY, "pre-existing") };
    {
        let _g = EnvVarGuard::remove(KEY);
        assert!(
            std::env::var(KEY).is_err(),
            "EnvVarGuard::remove must remove the key for the guard's lifetime"
        );
    }
    assert_eq!(
        std::env::var(KEY).ok().as_deref(),
        Some("pre-existing"),
        "EnvVarGuard::remove must restore the pre-existing value on drop"
    );
    unsafe { std::env::remove_var(KEY) };
}

/// Non-UTF-8 env values are valid on Unix, and the guard's
/// [`OsString`] storage must round-trip them exactly. If storage
/// had been kept as `String` or the read had used `std::env::var`
/// (which rejects non-UTF-8), a pre-existing non-UTF-8 original
/// would be silently converted into absence or lossily decoded —
/// and the drop-time restore would put back a *different* byte
/// sequence than what the test environment originally held.
#[cfg(unix)]
#[test]
fn env_var_guard_restores_non_utf8_original_exactly() {
    use std::os::unix::ffi::OsStrExt;

    const KEY: &str = "KTSTR_TEST_ENV_VAR_GUARD_RESTORES_NON_UTF8";
    // A byte sequence with a lone 0xFF — not valid UTF-8.
    let original_bytes: &[u8] = b"foo\xFFbar";
    let original = OsStr::from_bytes(original_bytes);

    let _lock = lock_env();
    // SAFETY: process-wide env mutation; we hold `lock_env()`.
    unsafe { std::env::set_var(KEY, original) };
    {
        let _g = EnvVarGuard::set(KEY, "replacement");
        assert_eq!(
            std::env::var_os(KEY).as_deref(),
            Some(OsStr::new("replacement")),
            "EnvVarGuard::set must overwrite the key while the guard is live",
        );
    }
    let restored = std::env::var_os(KEY).expect("restored value must be present after drop");
    assert_eq!(
        restored.as_bytes(),
        original_bytes,
        "EnvVarGuard must restore non-UTF-8 originals byte-for-byte",
    );
    unsafe { std::env::remove_var(KEY) };
}

/// Process-wide mutex serializing every stderr-capture call across
/// the crate. `fd 2 → tempfile` redirection is a non-reentrant
/// process-global mutation — two concurrent callers in DIFFERENT
/// modules would see each other's output land in their sink (or
/// worse, the save/restore pair could interleave and permanently
/// break fd 2). Before this lock existed, `src/cli.rs` and
/// `src/report.rs` each had their own module-local mutex, which
/// serialized within-module calls but did NOT coordinate across
/// modules. The ONE mutex here guards every capture site in the
/// crate.
pub(crate) static STDERR_CAPTURE_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// RAII guard that restores the saved stderr fd on Drop, even if
/// the captured closure panics under the `panic = "unwind"` test
/// profile. Without this guard a panicking closure would leak the
/// fd-2 swap and every subsequent stderr write in the test process
/// would land in the orphaned tempfile. `saved` is `Option` so Drop
/// can `take()` and consume it without an `&mut` borrow fight.
pub(crate) struct StderrRestoreGuard {
    saved: Option<std::os::fd::OwnedFd>,
}
impl Drop for StderrRestoreGuard {
    fn drop(&mut self) {
        if let Some(saved) = self.saved.take() {
            let _ = nix::unistd::dup2_stderr(&saved);
        }
    }
}

/// Run `f` with stderr redirected to an in-memory tempfile; return
/// both `f`'s value and the captured bytes. Uses the process-wide
/// [`STDERR_CAPTURE_LOCK`] to serialize against every other capture
/// call in the crate. The RAII [`StderrRestoreGuard`] restores fd 2
/// even if `f` panics under `panic = "unwind"`.
pub(crate) fn capture_stderr<R>(f: impl FnOnce() -> R) -> (R, Vec<u8>) {
    use std::io::{Read, Seek, SeekFrom, Write};
    let _lock = STDERR_CAPTURE_LOCK
        .lock()
        .unwrap_or_else(|e| e.into_inner());
    let mut sink = tempfile::tempfile().expect("create stderr-capture tempfile");
    // Flush before redirect: eprintln! is line-buffered behind the
    // Stderr lock; pre-call bytes need to reach the ORIGINAL fd 2
    // or they leak into the captured tempfile.
    std::io::stderr().flush().ok();
    let saved = nix::unistd::dup(std::io::stderr()).expect("dup(stderr)");
    nix::unistd::dup2_stderr(&sink).expect("dup2_stderr(sink)");
    let guard = StderrRestoreGuard { saved: Some(saved) };
    let result = f();
    std::io::stderr().flush().ok();
    drop(guard);
    sink.seek(SeekFrom::Start(0)).expect("rewind sink");
    let mut bytes = Vec::new();
    sink.read_to_end(&mut bytes).expect("read sink");
    (result, bytes)
}

/// Concurrent capture-stderr calls MUST observe only their own
/// emitted bytes. Without [`STDERR_CAPTURE_LOCK`] the two threads'
/// `fd 2 → tempfile` swaps would race: thread A's
/// `dup2_stderr(sink_a)` followed by thread B's `dup2_stderr(sink_b)`
/// would leave A's `eprintln!` output in B's sink and vice versa,
/// and the subsequent `dup2_stderr(saved)` restores could leave fd 2
/// permanently pointing at a stale tempfile. Either outcome would
/// permanently break every subsequent stderr write in the test
/// process.
///
/// This test spawns N threads that each emit a unique marker,
/// capture, and assert their marker is present AND no OTHER
/// thread's marker leaked in. Under a broken or absent lock this
/// fails almost immediately on a multi-core host; under the
/// present lock the N captures serialize and every thread sees
/// only its own bytes.
#[test]
fn capture_stderr_serializes_concurrent_callers() {
    const N: usize = 8;
    let markers: Vec<String> = (0..N)
        .map(|i| format!("KTSTR_CAPTURE_LOCK_MARKER_{i:03}"))
        .collect();
    let handles: Vec<std::thread::JoinHandle<(usize, Vec<u8>)>> = (0..N)
        .map(|i| {
            let mine = markers[i].clone();
            let others: Vec<String> = markers
                .iter()
                .enumerate()
                .filter(|&(j, _m)| j != i)
                .map(|(_j, m)| m.clone())
                .collect();
            std::thread::spawn(move || {
                let (_, bytes) = capture_stderr(|| {
                    eprintln!("{mine}");
                });
                let captured = String::from_utf8_lossy(&bytes);
                assert!(
                    captured.contains(&mine),
                    "thread {i}: own marker missing from captured output: {captured:?}",
                );
                for other in &others {
                    assert!(
                        !captured.contains(other.as_str()),
                        "thread {i}: foreign marker '{other}' leaked into captured \
                         output — STDERR_CAPTURE_LOCK failed to serialize \
                         concurrent callers: {captured:?}",
                    );
                }
                (i, bytes)
            })
        })
        .collect();
    for h in handles {
        h.join().expect("capture thread panicked");
    }
}