Skip to main content

keyhog_scanner/
gpu_env.rs

1//! GPU environment detection + require-GPU preflight policy.
2//!
3//! Split out of `gpu.rs` (Law 5 / 500-LOC modularity cap): these are the
4//! environment-variable readers (`KEYHOG_NO_GPU`, `KEYHOG_REQUIRE_GPU`, CI
5//! auto-detect) plus the `require-GPU` preflight that fails closed when a
6//! GPU is demanded but absent. Re-exported from `gpu` via `pub use env::*`
7//! so the public surface (`crate::gpu::env_no_gpu`, `::gpu_probe`,
8//! `::require_gpu_preflight`, …) is unchanged.
9
10#[cfg(feature = "gpu")]
11use super::backend;
12
13/// Probe GPU availability and adapter metadata without panicking.
14///
15/// Honours `KEYHOG_NO_GPU=1` (and the usual on/off/true/false/0
16/// negatives) by reporting "no GPU available" without ever calling
17/// `backend::get_gpu()`. The MoE compute-shader init happens lazily
18/// inside `get_gpu()`, so this short-circuit is the difference
19/// between "Metal adapter request blocks for minutes on certain Mac
20/// configurations" (the v0.5.27 reproduction on Apple M4 Pro that
21/// the env var was added to escape) and "scanner starts in ~10ms
22/// like every other CPU-only tool".
23#[must_use]
24pub fn gpu_probe() -> (bool, Option<String>, Option<u64>) {
25    if env_no_gpu() {
26        return (false, None, None);
27    }
28    #[cfg(feature = "gpu")]
29    if let Some(gpu) = backend::get_gpu() {
30        return (true, Some(gpu.gpu_name().to_string()), gpu.vram_mb());
31    }
32    (false, None, None)
33}
34
/// Reports whether the operator set `KEYHOG_REQUIRE_GPU=1`, i.e. demands a
/// usable GPU and forbids a silent CPU fallback.
///
/// Only the exact value `1` counts; an unset variable, any other value, or
/// a value that is not valid Unicode all yield `false`.
///
/// The variable is read on every call rather than cached, so embedders and
/// tests that toggle it between scans observe the change. The cost is a
/// few extra syscalls on the cold path.
#[must_use]
pub fn env_require_gpu() -> bool {
    match std::env::var("KEYHOG_REQUIRE_GPU") {
        Ok(value) => value == "1",
        // Unset or non-Unicode: the requirement is not in force.
        Err(_) => false,
    }
}
43
44/// Require-GPU preflight, independent of backend routing.
45///
46/// When `KEYHOG_REQUIRE_GPU=1` is NOT set this is a no-op and returns
47/// `Ok(())`. When it IS set, the contract (docs/src/reference/env.md,
48/// install.md, the `require-gpu-fails-closed` docker scenario) is to
49/// "refuse to run when no usable GPU adapter is detected". This check
50/// fires on the *no-GPU* path the flag exists for - it does not depend on
51/// `select_backend` having chosen GPU first (finding C0): the hard-fail
52/// that used to live only inside the GPU-selected dispatch paths was
53/// unreachable when there was no GPU, so a CPU scan completed and exited 0.
54///
55/// Returns `Err(diagnostic)` when a GPU is required but the host has no
56/// non-software adapter, or the GPU self-test (adapter init + one real MoE
57/// compute dispatch) fails. The caller (CLI run loop) maps that to the
58/// documented exit code 2. Returning an `Err` here - rather than calling
59/// `std::process::exit` from the library - keeps embedders alive (finding
60/// M12).
61pub fn require_gpu_preflight() -> Result<(), String> {
62    if !env_require_gpu() {
63        return Ok(());
64    }
65
66    let caps = crate::hw_probe::probe_hardware();
67    if !caps.gpu_available || caps.gpu_is_software {
68        let detail = match (&caps.gpu_name, caps.gpu_is_software) {
69            (Some(name), true) => {
70                format!("only a software GPU adapter is present ({name})")
71            }
72            (Some(name), false) => format!("adapter present but unusable ({name})"),
73            (None, _) => "no GPU adapter detected".to_string(),
74        };
75        return Err(format!(
76            "KEYHOG_REQUIRE_GPU=1 but {detail}; refusing to run on CPU. \
77             Install or enable a non-software GPU adapter + driver, or unset \
78             KEYHOG_REQUIRE_GPU to allow the CPU/SIMD path."
79        ));
80    }
81
82    // A non-software adapter is reported. Prove it can actually run a
83    // production-sized MoE dispatch before declaring the requirement met -
84    // a present-but-broken GPU (driver mismatch, dispatch reject) is exactly
85    // the regression the flag is meant to catch on self-hosted runners.
86    if let Err(reason) = super::gpu_self_test() {
87        return Err(format!(
88            "KEYHOG_REQUIRE_GPU=1 but the GPU self-test failed ({reason}); \
89             refusing to run on CPU. Fix the GPU stack or unset \
90             KEYHOG_REQUIRE_GPU."
91        ));
92    }
93
94    Ok(())
95}
96
97pub fn env_no_gpu() -> bool {
98    if let Ok(v) = std::env::var("KEYHOG_NO_GPU") {
99        // Explicit user choice wins both directions. "0"/"false"/"off"
100        // is the override that says "yes I want the GPU even though
101        // CI is detected" (self-hosted GPU runners exist).
102        return !matches!(v.as_str(), "" | "0" | "false" | "FALSE" | "off" | "OFF");
103    }
104    // `KEYHOG_REQUIRE_GPU=1` implies "do not skip the GPU": the operator
105    // wants a regression on a self-hosted GPU runner to fail loudly, not be
106    // masked by the CI auto-skip below. GitHub Actions always sets
107    // CI=true/GITHUB_ACTIONS=true even on self-hosted runners that have real
108    // GPUs, so without this override the auto-skip would route to SimdCpu
109    // before any GPU probe and the require gate would never fire (finding
110    // C1). This mirrors the explicit `KEYHOG_NO_GPU=0` override above; an
111    // explicit `KEYHOG_NO_GPU=1` still wins as the more specific signal (and
112    // the require-GPU preflight then hard-fails because the GPU is absent).
113    if env_require_gpu() {
114        return false;
115    }
116    // No explicit setting. Auto-skip GPU init on CI runners: they
117    // have no discrete GPU, the wgpu adapter probe enumerates the
118    // llvmpipe/swiftshader software fallback, gpu.rs:83 rightly
119    // rejects it as a software adapter, and the operator gets a
120    // confusing "GPU MoE init failed" warning that costs ~250ms of
121    // cold-start time for nothing. Detecting CI here turns that
122    // failure into a silent no-op (the user is on CPU + SIMD which
123    // is the right path on a CI runner anyway). Set
124    // KEYHOG_NO_GPU=0 to opt back in on self-hosted GPU runners.
125    is_ci_environment()
126}
127
/// True when the process appears to be running inside a CI system.
///
/// Used by the GPU init paths to auto-skip the wgpu adapter probe, which
/// fails on hosted CI runners, costs ~250ms of pointless cold-start time
/// and emits a confusing warning.
///
/// Two signals are consulted, in order:
///
/// 1. The generic `CI` variable (the de-facto standard, set by GitHub
///    Actions, GitLab CI, CircleCI, Travis, Buildkite, Drone, AppVeyor,
///    Codeship, Wercker, and most others). Any value other than empty,
///    `0`, `false` or `off` (compared ASCII-case-insensitively) counts as
///    truthy. A negative `CI` value does not end the check; the markers
///    below are still consulted.
/// 2. A handful of platform-specific markers that some runners set
///    without also setting the generic `CI` (Jenkins, TeamCity, Azure
///    Pipelines, Bitbucket Pipelines, ...). A marker counts as soon as it
///    is present in the environment, whatever its value.
#[must_use]
pub fn is_ci_environment() -> bool {
    // The generic CI marker. Some runners set CI=true, others CI=1, so
    // treat any non-empty, non-false value as truthy. Case is folded so
    // `CI=False` / `CI=Off` are recognised as negatives too.
    if let Ok(value) = std::env::var("CI") {
        let lowered = value.to_ascii_lowercase();
        if !matches!(lowered.as_str(), "" | "0" | "false" | "off") {
            return true;
        }
    }
    // Platform-specific markers. Some legacy CI systems set their own
    // variable but not the generic CI=. Hit the common ones.
    const CI_MARKERS: &[&str] = &[
        "GITHUB_ACTIONS",         // GitHub Actions
        "GITLAB_CI",              // GitLab CI
        "JENKINS_URL",            // Jenkins
        "TF_BUILD",               // Azure Pipelines
        "TEAMCITY_VERSION",       // TeamCity
        "BITBUCKET_BUILD_NUMBER", // Bitbucket Pipelines
        "BUILDKITE",              // Buildkite
        "CIRCLECI",               // CircleCI
        "DRONE",                  // Drone CI
        "TRAVIS",                 // Travis CI
        "APPVEYOR",               // AppVeyor
        "CODEBUILD_BUILD_ID",     // AWS CodeBuild
        "WERCKER",                // Wercker
        "SEMAPHORE",              // Semaphore CI
    ];
    // Presence is all that matters here, so use `var_os`: unlike `var`,
    // it does not report a set-but-non-Unicode value as missing.
    CI_MARKERS
        .iter()
        .any(|name| std::env::var_os(name).is_some())
}