mod support;
use support::paths::detector_dir;
use keyhog_scanner::CompiledScanner;
use std::time::{Duration, Instant};
/// Largest allowed ratio of full-corpus to half-corpus cold-compile time; the
/// test fails when the measured ratio exceeds this value.
const MAX_FULL_OVER_HALF_RATIO: f64 = 1.4;
/// Minimum reported core count for the ratio assertion to run; with fewer
/// cores the test prints a SKIP line and returns without asserting.
const MIN_CORES_FOR_RATIO: usize = 4;
/// Creates a fresh, empty cache directory for this perf test and returns its path.
///
/// The directory is `$HOME/.cache/keyhog-perf/perf-compile-cache-{tag}`. Whatever
/// an earlier run left at that path is removed first, then the directory is
/// created again.
///
/// # Panics
///
/// Panics if `HOME` is not set, or if the base directory or the tagged
/// directory cannot be created.
fn isolated_cache_dir(tag: &str) -> std::path::PathBuf {
    let home_dir = std::env::var_os("HOME").expect("HOME must be set to run this perf test");

    let mut perf_root = std::path::PathBuf::from(home_dir);
    perf_root.push(".cache");
    perf_root.push("keyhog-perf");
    std::fs::create_dir_all(&perf_root).expect("create perf cache base");

    let mut cache_dir = perf_root;
    cache_dir.push(format!("perf-compile-cache-{tag}"));

    // Best-effort wipe: the error is ignored, which covers the common case
    // where the directory does not exist yet.
    std::fs::remove_dir_all(&cache_dir).ok();
    std::fs::create_dir_all(&cache_dir).expect("create isolated cache dir");

    cache_dir
}
/// Calls `f` exactly `k` times and returns the shortest duration it reported.
///
/// # Panics
///
/// Panics with `"k >= 1"` when `k` is zero, since there is no sample to return.
fn best_of<F: FnMut() -> Duration>(k: usize, mut f: F) -> Duration {
    let mut fastest: Option<Duration> = None;
    for _ in 0..k {
        let sample = f();
        fastest = match fastest {
            // Keep the earlier sample unless the new one is strictly shorter.
            Some(best) if best <= sample => Some(best),
            _ => Some(sample),
        };
    }
    fastest.expect("k >= 1")
}
/// Perf tripwire: compiling the full detector corpus from a cold cache must not
/// take much longer than compiling the first half of it.
///
/// Steps:
/// 1. Set `KEYHOG_NO_GPU=1` and point `KEYHOG_CACHE_DIR` at a freshly created
///    directory (presumably so the scanner takes the CPU path and uses only
///    this test's cache; confirm against the scanner's env handling).
/// 2. Load the detectors and require at least 600 of them.
/// 3. Measure the best of three cold compiles for the full set and for the
///    first `n / 2` detectors, emptying the cache directory before every run.
/// 4. Print the timings and the full/half ratio to stderr.
/// 5. On machines reporting fewer than `MIN_CORES_FOR_RATIO` cores, print a
///    SKIP line and return; otherwise assert that the ratio is at most
///    `MAX_FULL_OVER_HALF_RATIO`.
///
/// Only the `CompiledScanner::compile` call (and its `expect`) is timed. The
/// input is cloned before the clock starts, so copying the detectors is not
/// counted as compile time.
#[test]
fn cold_compile_must_parallelize_across_pattern_shards() {
    /// Runs `work` once and returns the wall-clock time spent inside it.
    ///
    /// The value produced by `work` is dropped only after the clock has been
    /// read, and is passed through `black_box` so the work cannot be elided.
    fn time_once<T, W: FnOnce() -> T>(work: W) -> Duration {
        let started = Instant::now();
        let built = work();
        let elapsed = started.elapsed();
        std::hint::black_box(&built);
        elapsed
    }

    std::env::set_var("KEYHOG_NO_GPU", "1");
    let dir = isolated_cache_dir("shard-ratio");
    std::env::set_var("KEYHOG_CACHE_DIR", &dir);

    let detectors = keyhog_core::load_detectors(&detector_dir()).expect("load detectors");
    let n = detectors.len();
    assert!(
        n >= 600,
        "expected the full ~899-detector corpus, got {n}; this tripwire \
         measures the serial Hyperscan compile of the real corpus"
    );
    let half: Vec<_> = detectors.iter().take(n / 2).cloned().collect();
    let half_len = half.len();

    // Empties the cache directory so every timed compile starts cold.
    let clear = |d: &std::path::Path| {
        let _ = std::fs::remove_dir_all(d);
        std::fs::create_dir_all(d).expect("recreate cache dir");
    };

    let cold_full = best_of(3, || {
        clear(&dir);
        // Clone outside the timed region: copying the input is not compile work.
        let input = detectors.clone();
        time_once(|| CompiledScanner::compile(input).expect("cold full compile"))
    });
    let cold_half = best_of(3, || {
        clear(&dir);
        // Clone outside the timed region: copying the input is not compile work.
        let input = half.clone();
        time_once(|| CompiledScanner::compile(input).expect("cold half compile"))
    });

    let full_ms = cold_full.as_secs_f64() * 1000.0;
    let half_ms = cold_half.as_secs_f64() * 1000.0;
    let ratio = cold_full.as_secs_f64() / cold_half.as_secs_f64();
    // Fall back to a single core when the parallelism query fails, which
    // routes the test to the SKIP branch below.
    let cores = std::thread::available_parallelism()
        .map(|c| c.get())
        .unwrap_or(1);

    eprintln!(
        "perf_compile_cache: cores={cores} cold_full({n})={:.1}ms cold_half({})={:.1}ms \
         full/half={ratio:.2}x (target <= {MAX_FULL_OVER_HALF_RATIO:.2}x)",
        full_ms, half_len, half_ms,
    );

    if cores < MIN_CORES_FOR_RATIO {
        eprintln!(
            "perf_compile_cache: SKIP ratio assertion - {cores} cores (< {MIN_CORES_FOR_RATIO}); \
             parallel shard compile cannot show a speedup on this machine."
        );
        return;
    }

    assert!(
        ratio <= MAX_FULL_OVER_HALF_RATIO,
        "SERIAL Hyperscan compile: doubling the pattern set ({} -> {} detectors) \
         multiplied cold-compile wall-clock by {ratio:.2}x (target <= {MAX_FULL_OVER_HALF_RATIO:.2}x) \
         on a {cores}-core machine. ~99.7% of the cold compile is a single serial \
         `Builder::build::<BlockMode>` call at crates/scanner/src/simd.rs:283 \
         (driven from engine/backend_prepared.rs:81); the rayon-parallel \
         build_compile_state phase is already ~5ms. The full corpus compiled in \
         {:.0}ms while the half compiled in {:.0}ms - linear scaling means every \
         core but one sat idle during the build. \
         FIX: split the pattern set into K shards, compile K BlockDatabases on a \
         rayon pool, scan all K and union the matches; doubling patterns is then \
         absorbed by parallelism (ratio -> ~1.0-1.2x). MUST keep \
         all_detectors_self_validate green so no shard is dropped.",
        half_len,
        n,
        full_ms,
        half_ms,
    );
}