ktstr 0.4.21

Test harness for Linux process schedulers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
//! Host-side end-to-end test for the jemalloc-counter wiring in
//! [`ktstr::ctprof::capture_pid`].
//!
//! Spawns `ktstr-jemalloc-alloc-worker` as a child process on the
//! host with a known allocation size, waits for the worker's
//! pid-scoped ready marker, then runs `capture_pid` against the
//! host's real `/proc` and confirms the worker's tid carries a
//! populated `allocated_bytes` field. Distinct from the VM-backed
//! `tests/ctprof_capture.rs`, which only proves the procfs
//! walk reaches non-jemalloc counters: this test specifically
//! exercises the ELF/DWARF + ptrace + process_vm_readv path that
//! the wiring lifted out of the standalone probe binary into the
//! capture pipeline.
//!
//! # Privilege
//!
//! `ptrace(PTRACE_SEIZE)` must succeed against the worker. Under
//! `kernel.yama.ptrace_scope=0` (typical CI runner default; also
//! the layout the ktstr VMM-backed integration tests use) any
//! same-uid process attaches and the test runs unconditionally.
//! Under `=1` (Debian/Ubuntu host default outside CI) the test
//! process is the worker's parent, and Yama's restricted mode
//! still lets a process attach to its own descendants — same-uid
//! + same-process-tree, no capability needed.
//!
//! Under `=2` or `=3`, or when running as a non-uid-matching user,
//! the `attach_jemalloc` step inside `capture_pid` returns an
//! `AttachError` that the capture pipeline absorbs into the
//! "absent counter = 0" contract, so the worker's tid would land
//! with `allocated_bytes=0`. To distinguish "wiring failed" from
//! "test ran on a host where ptrace is locked down", the test
//! short-circuits with an informative skip message when a
//! one-shot probe (`PTRACE_SEIZE` against a freshly spawned
//! `sleep` child) is denied — operators see why the test passed
//! without exercising the path.

use std::io::Read;
use std::path::PathBuf;
use std::process::{Child, Command, Stdio};
use std::time::{Duration, Instant};

use ktstr::metric_types::Bytes;

/// Compile-time path to the alloc-worker binary. Cargo sets
/// `CARGO_BIN_EXE_<name>` for the package's `[[bin]]` targets when
/// it builds an integration test, so the test does not need to
/// spell the build directory layout manually.
const ALLOC_WORKER_BINARY: &str = env!("CARGO_BIN_EXE_ktstr-jemalloc-alloc-worker");

/// Allocation size (16 MiB) the worker should hold after it writes
/// the ready marker. Picked well above jemalloc's tcache threshold
/// (16 KiB) so the allocation lands on the slow / huge path and
/// `tsd_s.thread_allocated` is updated synchronously rather than
/// deferred through a per-thread cache.
///
/// NOTE(review): jemalloc 5.x documents a 32 KiB default for
/// `opt.lg_tcache_max` — confirm the 16 KiB figure. 16 MiB clears
/// either bound, so the choice of size is unaffected.
const KNOWN_BYTES: u64 = 16 * 1024 * 1024;

/// Upper bound (4 MiB) on jemalloc/runtime overhead added on top of
/// [`KNOWN_BYTES`]. Mirrors the slop the in-VM probe tests use
/// (see `tests/jemalloc_probe_tests.rs`).
const MAX_SLOP: u64 = 4 * 1024 * 1024;

/// Wait deadline (10 seconds) for the worker's ready-marker file.
const READY_TIMEOUT: Duration = Duration::from_secs(10);

/// Override env var the worker honours to redirect the
/// ready-marker path — pulled from
/// [`ktstr::worker_ready::WORKER_READY_MARKER_OVERRIDE_ENV`] so a
/// rename of the const propagates without manual sync.
const READY_MARKER_OVERRIDE_ENV: &str = ktstr::worker_ready::WORKER_READY_MARKER_OVERRIDE_ENV;

/// Owns the spawned worker and tears it down when the guard goes
/// out of scope: the child is killed and then reaped, so a failing
/// test does not leave the worker process behind.
struct WorkerGuard {
    /// Handle to the worker; `None` when there is nothing to tear
    /// down (never set, or already taken by `drop`).
    child: Option<Child>,
}

impl Drop for WorkerGuard {
    fn drop(&mut self) {
        // `take()` leaves `None` behind, so teardown runs at most once.
        let mut worker = match self.child.take() {
            Some(worker) => worker,
            None => return,
        };
        // Best effort: failures from kill / wait are deliberately
        // discarded — there is nothing useful to do with them here.
        let _ = worker.kill();
        let _ = worker.wait();
    }
}

/// Probe whether the current process can `ptrace(PTRACE_SEIZE)` a
/// child it just spawned.
///
/// Returns `true` iff the seize call succeeds. Returns `false` when
/// the seize is refused (e.g. EPERM under `ptrace_scope=2` / `=3`,
/// or when the test runs as a uid that cannot trace) and also when
/// the `sleep` helper cannot be spawned at all — in both cases the
/// caller should skip the wiring assertion.
///
/// Implementation: spawn a `sleep` child and try the SEIZE+DETACH
/// dance directly. Cleaner than reading the sysctl and inferring,
/// because the kernel's effective policy depends on uid + parent
/// relationship, not just the scope value.
fn ptrace_attach_allowed() -> bool {
    let mut child = match Command::new("sleep")
        .arg("1")
        .stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .spawn()
    {
        Ok(c) => c,
        Err(_) => return false,
    };
    let nix_pid = nix::unistd::Pid::from_raw(child.id() as i32);
    let allowed =
        nix::sys::ptrace::seize(nix_pid, nix::sys::ptrace::Options::empty()).is_ok();
    if allowed {
        // Release the tracee again; a detach failure is harmless
        // because the child is killed right below.
        let _ = nix::sys::ptrace::detach(nix_pid, None);
    }
    // Tear the helper down through the `Child` handle only. Reaping
    // it with a raw `waitpid` first would go behind the handle's
    // back: `kill()` would then signal a pid this process no longer
    // owns and `wait()` would fail with ECHILD. `wait()` reaps the
    // child regardless of the probe outcome.
    let _ = child.kill();
    let _ = child.wait();
    allowed
}

/// Launch the alloc-worker binary, passing `bytes` as its only
/// argument and pointing its ready marker at `marker` through the
/// override env var. Redirecting the marker keeps each test
/// invocation isolated from any concurrent tests using the
/// worker's default pid-scoped /tmp marker.
///
/// stdin and stdout are detached; stderr is piped so the caller can
/// read the worker's diagnostics. Returns the spawn result as-is.
fn spawn_worker(bytes: u64, marker: &PathBuf) -> std::io::Result<Child> {
    let mut cmd = Command::new(ALLOC_WORKER_BINARY);
    cmd.arg(bytes.to_string());
    cmd.env(READY_MARKER_OVERRIDE_ENV, marker);
    // Force jemalloc's background thread OFF under both the plain
    // and the `_RJEM_`-prefixed variable so the worker's
    // single-thread self-check passes (extra threads from a leaky
    // shell env var trip exit 3).
    for conf_var in ["MALLOC_CONF", "_RJEM_MALLOC_CONF"] {
        cmd.env(conf_var, "background_thread:false");
    }
    cmd.stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::piped());
    cmd.spawn()
}

/// Poll every 20 ms until `marker` exists, the worker exits, or
/// [`READY_TIMEOUT`] elapses.
///
/// Returns `Ok(())` as soon as the marker file is observed. Returns
/// `Err` with a human-readable message in two cases:
///
/// - the worker exited before the marker appeared — the message
///   carries the exit status plus whatever the worker printed on
///   its piped stderr, so a setup failure (exit 2/3/4/5/6 in the
///   worker) is visible at test level instead of a bare timeout;
/// - the deadline passed with the worker still running — stderr is
///   NOT drained in that case, because reading the pipe of a live
///   process would block.
fn wait_for_ready(child: &mut Child, marker: &PathBuf) -> Result<(), String> {
    let deadline = Instant::now() + READY_TIMEOUT;
    loop {
        if marker.exists() {
            return Ok(());
        }
        if let Ok(Some(status)) = child.try_wait() {
            // Worker exited before the marker arrived — drain stderr
            // for the error message it printed. Read raw bytes and
            // convert lossily so non-UTF-8 output is still reported
            // instead of being dropped wholesale.
            let mut raw = Vec::new();
            if let Some(mut pipe) = child.stderr.take() {
                let _ = pipe.read_to_end(&mut raw);
            }
            let stderr = String::from_utf8_lossy(&raw);
            return Err(format!(
                "worker exited early with status {:?}; stderr: {}",
                status, stderr
            ));
        }
        // The deadline is checked only after the marker / exit
        // probes, so a marker that lands during the final sleep
        // interval is still observed rather than reported as a
        // timeout.
        if Instant::now() >= deadline {
            return Err(format!(
                "ready marker {:?} did not appear within {:?}",
                marker, READY_TIMEOUT,
            ));
        }
        std::thread::sleep(Duration::from_millis(20));
    }
}

/// Positive wiring path: spawn the alloc-worker holding
/// [`KNOWN_BYTES`], wait for its ready marker, run `capture_pid`
/// against it, and require the worker's threads to report an
/// `allocated_bytes` within `[KNOWN_BYTES, KNOWN_BYTES + MAX_SLOP]`
/// and a `deallocated_bytes` below `KNOWN_BYTES`. Skips (passes
/// without asserting) when the ptrace probe is denied.
#[test]
fn capture_populates_jemalloc_counters_for_alloc_worker() {
    // Skip the wiring assertion when ptrace is locked down. A
    // skipped test still passes the suite — the goal is to avoid
    // a false-negative on hosts where `ptrace_scope` blocks
    // attach. The test's primary value is the positive
    // assertion path; running it on a locked-down host would
    // produce zero observable signal anyway because the probe
    // would silently absorb every `AttachError::PtraceSeize`
    // through capture's "absent = 0" contract.
    //
    // The probe attaches to a child of this process, so a denial
    // cannot be explained by a missing parent relationship; the
    // message names the causes that can actually apply.
    if !ptrace_attach_allowed() {
        eprintln!(
            "ctprof_capture_jemalloc_wiring: skipping — \
             the ptrace probe against a freshly spawned `sleep` \
             child failed (likely kernel.yama.ptrace_scope >= 2 \
             without CAP_SYS_PTRACE, another security policy \
             denying PTRACE_SEIZE, or `sleep` missing from PATH). \
             Lower kernel.yama.ptrace_scope to 0 or 1 to exercise \
             the wiring."
        );
        return;
    }

    // `tmp` is declared before `guard`, so the worker is killed and
    // reaped before the marker directory is removed.
    let tmp = tempfile::TempDir::new().expect("tempdir for ready marker");
    let marker = tmp.path().join("ready");

    let child = spawn_worker(KNOWN_BYTES, &marker)
        .expect("alloc-worker should spawn (CARGO_BIN_EXE_ resolved at compile time)");
    let mut guard = WorkerGuard { child: Some(child) };
    let child_ref = guard.child.as_mut().expect("child handle present");
    let worker_pid = child_ref.id() as i32;

    if let Err(msg) = wait_for_ready(child_ref, &marker) {
        panic!("worker did not signal ready: {}", msg);
    }

    // Capture the ctprof snapshot scoped to the worker's
    // tgid. `capture_pid` walks `/proc/<pid>/task` only and runs
    // the jemalloc probe attach against the single tgid — much
    // tighter than the global `capture()` walk, which would also
    // try to ptrace every other jemalloc-linked process on the
    // host (potentially hundreds of unrelated tids; slow + risky
    // on a busy dev box). The wiring being exercised is the SAME
    // probe-attach + per-tid probe_thread call that `capture` runs
    // for every jemalloc tgid, so the scoped capture proves
    // exactly the path under test.
    let snap = ktstr::ctprof::capture_pid(worker_pid);

    // Find the worker's threads in the snapshot. The worker
    // is single-threaded (its own self-check enforces it via
    // exit code 3), so `tid == pid` for the only entry — we look
    // up by tgid instead of tid to keep this robust against any
    // future jemalloc helper thread (they would carry different
    // tids but share the tgid).
    let worker_tgid = worker_pid as u32;
    let worker_threads: Vec<_> = snap
        .threads
        .iter()
        .filter(|t| t.tgid == worker_tgid)
        .collect();
    assert!(
        !worker_threads.is_empty(),
        "capture_pid() did not see worker tgid={worker_pid} in \
         its /proc walk; total threads in snapshot: {}",
        snap.threads.len(),
    );
    // The main thread carries the allocation. Its tid equals the
    // pid for the single-threaded worker; if jemalloc spawned a
    // helper thread despite background_thread:false in the env,
    // we still take the maximum across every worker thread
    // (only the main thread allocated, helpers stay near zero).
    let allocated: u64 = worker_threads
        .iter()
        .map(|t| t.allocated_bytes.0)
        .max()
        .expect("worker_threads non-empty per assert above");
    let deallocated: u64 = worker_threads
        .iter()
        .map(|t| t.deallocated_bytes.0)
        .max()
        .expect("worker_threads non-empty per assert above");

    assert!(
        allocated >= KNOWN_BYTES,
        "expected worker allocated_bytes >= {KNOWN_BYTES}, \
         got {allocated}; worker_pid={worker_pid}, threads in \
         worker tgid: {}. The capture pipeline's attach_jemalloc \
         either failed against the worker's ELF (DWARF missing, \
         arch mismatch, jemalloc-not-found) or the per-thread \
         ptrace step failed (check ptrace_scope / EPERM).",
        worker_threads.len(),
    );
    assert!(
        allocated <= KNOWN_BYTES + MAX_SLOP,
        "worker allocated_bytes={allocated} exceeds known + slop \
         ({}); probe may be reading the wrong address or the \
         worker leaked extra allocations beyond the planted Vec",
        KNOWN_BYTES + MAX_SLOP,
    );
    // The worker holds its Vec until kill, so deallocations are
    // bounded to jemalloc startup churn — well below the planted
    // size.
    assert!(
        deallocated < KNOWN_BYTES,
        "worker deallocated_bytes={deallocated} >= KNOWN_BYTES \
         ({KNOWN_BYTES}); worker should not free its planted Vec \
         before kill",
    );
}

#[test]
fn capture_pid_skips_self_attach_and_keeps_counters_zero() {
    // Run capture_pid against the test binary itself. The capture
    // process's own tgid must not be probed — ptrace(PTRACE_SEIZE)
    // rejects self-attach with EPERM — and the wiring inside
    // capture_pid_with guards against it with a `pid != self_pid`
    // gate placed before the attach_jemalloc call. The observable
    // contract checked here: every ThreadState entry for our own
    // tgid lands with allocated_bytes==0 AND deallocated_bytes==0.
    //
    // That gate is the load-bearing skip. (The ktstr library does
    // not declare a `#[global_allocator]`, so the test binary is
    // not automatically jemalloc-linked through `cargo test`;
    // whether attach_jemalloc would have detected jemalloc inside
    // the current binary depends on the binary's static link
    // graph. The self-skip gate makes that detail moot — the probe
    // is never even attempted against self.)
    let self_pid = std::process::id() as i32;
    let own_tgid = self_pid as u32;
    let snap = ktstr::ctprof::capture_pid(self_pid);

    // Lazily walk only the entries that belong to this process;
    // peeking first lets the presence check run before any of the
    // per-thread assertions.
    let mut own_threads = snap
        .threads
        .iter()
        .filter(|t| t.tgid == own_tgid)
        .peekable();
    assert!(
        own_threads.peek().is_some(),
        "capture_pid() did not see self tgid={self_pid}; \
         expected the test process's own tids in the /proc walk",
    );
    for entry in own_threads {
        assert_eq!(
            entry.allocated_bytes,
            Bytes(0),
            "self-pid threads must carry allocated_bytes=0 — the \
             pid==self_pid gate must keep attach_jemalloc from \
             running against the calling process; got {} on tid {}",
            entry.allocated_bytes,
            entry.tid,
        );
        assert_eq!(
            entry.deallocated_bytes,
            Bytes(0),
            "self-pid threads must carry deallocated_bytes=0; \
             got {} on tid {}",
            entry.deallocated_bytes,
            entry.tid,
        );
    }
}

#[test]
fn capture_pid_against_non_jemalloc_target_keeps_counters_zero_but_populates_procfs() {
    // Negative wiring path, the complement of
    // `capture_populates_jemalloc_counters_for_alloc_worker`: that
    // test proves attach success populates the counters, this one
    // proves attach failure leaves them at zero without dropping
    // the rest of the snapshot.
    //
    // The target is `sleep` (resolved through PATH; coreutils, not
    // jemalloc-linked). capture_pid against it is expected to:
    //   - run attach_jemalloc, which returns JemallocNotFound
    //   - leave allocated_bytes / deallocated_bytes at 0 per the
    //     absent-counter contract
    //   - still populate identity (tid/tgid/pcomm/comm) and the
    //     procfs-derived counters so the snapshot is otherwise
    //     complete.
    let spawned = Command::new("sleep")
        .arg("3")
        .stdin(Stdio::null())
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .spawn();
    let mut sleeper = match spawned {
        Ok(proc) => proc,
        Err(_) => {
            eprintln!("skipping — /bin/sleep unavailable");
            return;
        }
    };
    // Short pause so /proc/<pid>/{maps,exe,task} populate.
    std::thread::sleep(Duration::from_millis(50));
    let pid = sleeper.id() as i32;
    let target_tgid = pid as u32;

    let snap = ktstr::ctprof::capture_pid(pid);

    // The snapshot is taken; the target is no longer needed.
    let _ = sleeper.kill();
    let _ = sleeper.wait();

    let matching: Vec<_> = snap
        .threads
        .iter()
        .filter(|t| t.tgid == target_tgid)
        .collect();
    assert!(
        !matching.is_empty(),
        "capture_pid did not see /bin/sleep tgid={pid} in its /proc walk; \
         total threads in snapshot: {}",
        snap.threads.len(),
    );
    for entry in matching.iter() {
        assert_eq!(
            entry.allocated_bytes,
            Bytes(0),
            "non-jemalloc target must carry allocated_bytes=0 (attach \
             returned JemallocNotFound, capture absorbed into absent-\
             counter contract); got {} on tid {}",
            entry.allocated_bytes,
            entry.tid,
        );
        assert_eq!(
            entry.deallocated_bytes,
            Bytes(0),
            "non-jemalloc target must carry deallocated_bytes=0; \
             got {} on tid {}",
            entry.deallocated_bytes,
            entry.tid,
        );
        // The attach failure is absorbed into the jemalloc fields
        // only — procfs identity and counters must still be filled
        // in for the rest of the ThreadState.
        assert_eq!(entry.tgid, target_tgid);
        assert!(
            entry.start_time_clock_ticks > 0,
            "/proc/<pid>/stat field 22 must populate for a live target",
        );
        assert!(
            !entry.policy.0.is_empty(),
            "scheduling policy must populate from procfs even when the \
             jemalloc probe fails — proves the per-thread procfs path \
             does not depend on probe success",
        );
    }
}