keyhog_scanner/gpu_env.rs
1//! GPU environment detection + require-GPU preflight policy.
2//!
3//! Split out of `gpu.rs` (Law 5 / 500-LOC modularity cap): these are the
4//! environment-variable readers (`KEYHOG_NO_GPU`, `KEYHOG_REQUIRE_GPU`, CI
5//! auto-detect) plus the `require-GPU` preflight that fails closed when a
6//! GPU is demanded but absent. Re-exported from `gpu` via `pub use env::*`
7//! so the public surface (`crate::gpu::env_no_gpu`, `::gpu_probe`,
8//! `::require_gpu_preflight`, …) is unchanged.
9
10#[cfg(feature = "gpu")]
11use super::backend;
12
13/// Probe GPU availability and adapter metadata without panicking.
14///
15/// Honours `KEYHOG_NO_GPU=1` (and the usual on/off/true/false/0
16/// negatives) by reporting "no GPU available" without ever calling
17/// `backend::get_gpu()`. The MoE compute-shader init happens lazily
18/// inside `get_gpu()`, so this short-circuit is the difference
19/// between "Metal adapter request blocks for minutes on certain Mac
20/// configurations" (the v0.5.27 reproduction on Apple M4 Pro that
21/// the env var was added to escape) and "scanner starts in ~10ms
22/// like every other CPU-only tool".
23#[must_use]
24pub fn gpu_probe() -> (bool, Option<String>, Option<u64>) {
25 if env_no_gpu() {
26 return (false, None, None);
27 }
28 #[cfg(feature = "gpu")]
29 if let Some(gpu) = backend::get_gpu() {
30 return (true, Some(gpu.gpu_name().to_string()), gpu.vram_mb());
31 }
32 (false, None, None)
33}
34
/// Reports whether the operator demanded a usable GPU with
/// `KEYHOG_REQUIRE_GPU=1`, which forbids a silent CPU fallback.
///
/// Only the exact value `1` counts; an unset variable, a value that is not
/// valid Unicode, or any other string yields `false`.
///
/// The variable is read on every call rather than cached, so embedders and
/// tests that flip it between scans observe the new value. The environment
/// is process-global at runtime, so the cost is a handful of extra lookups
/// on the cold path.
#[must_use]
pub fn env_require_gpu() -> bool {
    match std::env::var("KEYHOG_REQUIRE_GPU") {
        Ok(value) => value == "1",
        Err(_) => false,
    }
}
43
44/// Require-GPU preflight, independent of backend routing.
45///
46/// When `KEYHOG_REQUIRE_GPU=1` is NOT set this is a no-op and returns
47/// `Ok(())`. When it IS set, the contract (docs/src/reference/env.md,
48/// install.md, the `require-gpu-fails-closed` docker scenario) is to
49/// "refuse to run when no usable GPU adapter is detected". This check
50/// fires on the *no-GPU* path the flag exists for - it does not depend on
51/// `select_backend` having chosen GPU first (finding C0): the hard-fail
52/// that used to live only inside the GPU-selected dispatch paths was
53/// unreachable when there was no GPU, so a CPU scan completed and exited 0.
54///
55/// Returns `Err(diagnostic)` when a GPU is required but the host has no
56/// non-software adapter, or the GPU self-test (adapter init + one real MoE
57/// compute dispatch) fails. The caller (CLI run loop) maps that to the
58/// documented exit code 2. Returning an `Err` here - rather than calling
59/// `std::process::exit` from the library - keeps embedders alive (finding
60/// M12).
61pub fn require_gpu_preflight() -> Result<(), String> {
62 if !env_require_gpu() {
63 return Ok(());
64 }
65
66 let caps = crate::hw_probe::probe_hardware();
67 if !caps.gpu_available || caps.gpu_is_software {
68 let detail = match (&caps.gpu_name, caps.gpu_is_software) {
69 (Some(name), true) => {
70 format!("only a software GPU adapter is present ({name})")
71 }
72 (Some(name), false) => format!("adapter present but unusable ({name})"),
73 (None, _) => "no GPU adapter detected".to_string(),
74 };
75 return Err(format!(
76 "KEYHOG_REQUIRE_GPU=1 but {detail}; refusing to run on CPU. \
77 Install or enable a non-software GPU adapter + driver, or unset \
78 KEYHOG_REQUIRE_GPU to allow the CPU/SIMD path."
79 ));
80 }
81
82 // A non-software adapter is reported. Prove it can actually run a
83 // production-sized MoE dispatch before declaring the requirement met -
84 // a present-but-broken GPU (driver mismatch, dispatch reject) is exactly
85 // the regression the flag is meant to catch on self-hosted runners.
86 if let Err(reason) = super::gpu_self_test() {
87 return Err(format!(
88 "KEYHOG_REQUIRE_GPU=1 but the GPU self-test failed ({reason}); \
89 refusing to run on CPU. Fix the GPU stack or unset \
90 KEYHOG_REQUIRE_GPU."
91 ));
92 }
93
94 Ok(())
95}
96
97pub fn env_no_gpu() -> bool {
98 if let Ok(v) = std::env::var("KEYHOG_NO_GPU") {
99 // Explicit user choice wins both directions. "0"/"false"/"off"
100 // is the override that says "yes I want the GPU even though
101 // CI is detected" (self-hosted GPU runners exist).
102 return !matches!(v.as_str(), "" | "0" | "false" | "FALSE" | "off" | "OFF");
103 }
104 // `KEYHOG_REQUIRE_GPU=1` implies "do not skip the GPU": the operator
105 // wants a regression on a self-hosted GPU runner to fail loudly, not be
106 // masked by the CI auto-skip below. GitHub Actions always sets
107 // CI=true/GITHUB_ACTIONS=true even on self-hosted runners that have real
108 // GPUs, so without this override the auto-skip would route to SimdCpu
109 // before any GPU probe and the require gate would never fire (finding
110 // C1). This mirrors the explicit `KEYHOG_NO_GPU=0` override above; an
111 // explicit `KEYHOG_NO_GPU=1` still wins as the more specific signal (and
112 // the require-GPU preflight then hard-fails because the GPU is absent).
113 if env_require_gpu() {
114 return false;
115 }
116 // No explicit setting. Auto-skip GPU init on CI runners: they
117 // have no discrete GPU, the wgpu adapter probe enumerates the
118 // llvmpipe/swiftshader software fallback, gpu.rs:83 rightly
119 // rejects it as a software adapter, and the operator gets a
120 // confusing "GPU MoE init failed" warning that costs ~250ms of
121 // cold-start time for nothing. Detecting CI here turns that
122 // failure into a silent no-op (the user is on CPU + SIMD which
123 // is the right path on a CI runner anyway). Set
124 // KEYHOG_NO_GPU=0 to opt back in on self-hosted GPU runners.
125 is_ci_environment()
126}
127
/// Reports whether the process appears to be running inside a CI system.
///
/// The GPU init paths use this to auto-skip the wgpu adapter probe, which
/// always fails on hosted CI runners, costs ~250ms of pointless cold-start
/// time and emits a confusing warning.
///
/// Detection has two stages:
///
/// 1. The generic `CI` variable (the de-facto standard, set by GitHub
///    Actions, GitLab CI, CircleCI, Travis, Buildkite, Drone, AppVeyor,
///    Codeship, Wercker, and most others). Any value other than the empty
///    string, `0`, `false` or `off` counts as "in CI"; the comparison
///    ignores ASCII case, so `CI=False` is treated like `CI=false`.
/// 2. A list of platform-specific marker variables that some runners set
///    without also setting the generic `CI` (Jenkins, TeamCity, Azure
///    Pipelines, Bitbucket Pipelines, ...). For these, mere presence is
///    enough, whatever the value.
///
/// A negative `CI` value does not short-circuit the second stage: a
/// platform marker still identifies the environment as CI.
#[must_use]
pub fn is_ci_environment() -> bool {
    // Platform-specific markers. Some legacy CI systems set their own
    // variable but not the generic CI=. Hit the common ones.
    const CI_MARKERS: &[&str] = &[
        "GITHUB_ACTIONS",         // GitHub Actions
        "GITLAB_CI",              // GitLab CI
        "JENKINS_URL",            // Jenkins
        "TF_BUILD",               // Azure Pipelines
        "TEAMCITY_VERSION",       // TeamCity
        "BITBUCKET_BUILD_NUMBER", // Bitbucket Pipelines
        "BUILDKITE",              // Buildkite
        "CIRCLECI",               // CircleCI
        "DRONE",                  // Drone CI
        "TRAVIS",                 // Travis CI
        "APPVEYOR",               // AppVeyor
        "CODEBUILD_BUILD_ID",     // AWS CodeBuild
        "WERCKER",                // Wercker
        "SEMAPHORE",              // Semaphore CI
    ];

    // The generic CI marker. Runners disagree on the exact value (CI=true,
    // CI=1, ...), so treat any non-empty, non-negative value as truthy.
    // Negatives are matched without regard to ASCII case so that "False" or
    // "Off" are not mistaken for a positive signal.
    if let Ok(ci) = std::env::var("CI") {
        let is_negative = ci.is_empty()
            || ["0", "false", "off"]
                .iter()
                .any(|negative| ci.eq_ignore_ascii_case(negative));
        if !is_negative {
            return true;
        }
    }

    // Presence check only: `var_os` also sees markers whose value is not
    // valid Unicode, which `var(..).is_ok()` would silently miss.
    CI_MARKERS
        .iter()
        .any(|marker| std::env::var_os(marker).is_some())
}