Skip to main content

keyhog_core/spec/
load.rs

1//! Detector loading pipeline: read TOML files, run the quality gate, and inject
2//! small compatibility shims for legacy token formats when needed.
3
4#![allow(clippy::result_large_err)] // SpecError carries a 128-byte toml::de::Error; boxing it would be a breaking API change.
5
6use std::io;
7use std::path::{Path, PathBuf};
8
9use rayon::prelude::*;
10use serde::{Deserialize, Serialize};
11
12use super::{validate_detector, DetectorFile, DetectorSpec, QualityIssue, SpecError};
13
// Schema version written into every detector cache file. `load_detector_cache`
// returns `None` for a cache whose stored version differs from this value.
const DETECTOR_CACHE_VERSION: u32 = 2;
15
// On-disk shape of the JSON detector cache: a version stamp plus the cached
// detector specs. Field names are part of the serialized format.
#[derive(Serialize, Deserialize)]
struct DetectorCacheFile {
    // Compared against `DETECTOR_CACHE_VERSION` when the cache is loaded.
    version: u32,
    // The detector specs stored in the cache.
    detectors: Vec<DetectorSpec>,
}
21
22/// Save detectors to a JSON cache file for fast subsequent loads.
23///
24/// # Examples
25///
26/// ```rust,no_run
27/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
28/// use keyhog_core::{DetectorSpec, save_detector_cache};
29/// use std::path::Path;
30///
31/// let detectors: Vec<DetectorSpec> = Vec::new();
32/// save_detector_cache(&detectors, Path::new(".keyhog-cache.json"))?;
33/// # Ok(()) }
34/// ```
35pub fn save_detector_cache(
36    detectors: &[DetectorSpec],
37    cache_path: &Path,
38) -> Result<(), std::io::Error> {
39    for detector in detectors {
40        let issues = validate_detector(detector);
41        if issues
42            .iter()
43            .any(|issue| matches!(issue, QualityIssue::Error(_)))
44        {
45            return Err(io::Error::new(
46                io::ErrorKind::InvalidData,
47                format!(
48                    "refusing to cache invalid detector '{}'. Fix: repair the detector before writing the cache",
49                    detector.id
50                ),
51            ));
52        }
53    }
54
55    let json = serde_json::to_vec(&DetectorCacheFile {
56        version: DETECTOR_CACHE_VERSION,
57        detectors: detectors.to_vec(),
58    })?;
59    // Atomic rename via NamedTempFile — same pattern as merkle index
60    // and baseline. A mid-write crash used to leave a corrupt
61    // `.keyhog-cache.json` that the next run would `tracing::warn!`
62    // on and silently fall back to TOML-load (unbounded slowdown).
63    // tempfile::Drop reaps the tmp on panic; persist atomic-renames
64    // on success.
65    let parent = cache_path.parent().unwrap_or_else(|| Path::new("."));
66    std::fs::create_dir_all(parent)?;
67    let mut tmp = tempfile::NamedTempFile::new_in(parent)?;
68    std::io::Write::write_all(&mut tmp, &json)?;
69    tmp.as_file().sync_all()?;
70    tmp.persist(cache_path).map_err(|e| e.error)?;
71    Ok(())
72}
73
74/// Load detectors from a JSON cache file. Returns None if cache is stale or missing.
75///
76/// # Examples
77///
78/// ```rust,no_run
79/// use keyhog_core::load_detector_cache;
80/// use std::path::Path;
81///
82/// let _cached = load_detector_cache(
83///     Path::new(".keyhog-cache.json"),
84///     Path::new("detectors"),
85/// );
86/// ```
87///
88/// # Security
89///
90/// Cached detectors are re-validated through the quality gate to prevent cache
91/// poisoning attacks where a malicious `.keyhog-cache.json` injects evil regex
92/// patterns that bypass the TOML quality gate.
93pub fn load_detector_cache(cache_path: &Path, source_dir: &Path) -> Option<Vec<DetectorSpec>> {
94    let cache_meta = std::fs::metadata(cache_path).ok()?;
95    let cache_mtime = cache_meta.modified().ok()?;
96
97    // Check if any TOML in source_dir is newer than the cache
98    let entries = std::fs::read_dir(source_dir).ok()?;
99    for entry in entries.flatten() {
100        let path = entry.path();
101        if path.extension().is_some_and(|ext| ext == "toml") {
102            let is_stale = std::fs::metadata(&path)
103                .and_then(|meta| meta.modified())
104                .is_ok_and(|mtime| mtime > cache_mtime);
105
106            if is_stale {
107                return None; // Cache is stale
108            }
109        }
110    }
111
112    let data = match std::fs::read(cache_path) {
113        Ok(data) => data,
114        Err(error) => {
115            tracing::warn!(
116                "failed to read detector cache {}: {}",
117                cache_path.display(),
118                error
119            );
120            return None;
121        }
122    };
123    let cache: DetectorCacheFile = match serde_json::from_slice(&data) {
124        Ok(cache) => cache,
125        Err(error) => {
126            tracing::warn!(
127                "failed to parse detector cache {}: {}",
128                cache_path.display(),
129                error
130            );
131            return None;
132        }
133    };
134    if cache.version != DETECTOR_CACHE_VERSION {
135        return None;
136    }
137
138    let mut validated = Vec::with_capacity(cache.detectors.len());
139    for spec in cache.detectors {
140        let issues = validate_detector(&spec);
141        if issues
142            .iter()
143            .any(|issue| matches!(issue, QualityIssue::Error(_)))
144        {
145            tracing::warn!(
146                "cached detector '{}' failed quality gate; discarding the entire cache",
147                spec.id
148            );
149            return None;
150        }
151        validated.push(spec);
152    }
153
154    if validated.is_empty() {
155        tracing::warn!("detector cache is empty after validation, falling back to TOML load");
156        return None;
157    }
158
159    Some(validated)
160}
161
162/// Load all detector specs from a directory of TOML files.
163/// Runs quality gate on each detector. Rejects detectors with errors, warns on issues.
164///
165/// # Examples
166///
167/// ```rust,no_run
168/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
169/// use keyhog_core::load_detectors;
170/// use std::path::Path;
171///
172/// let detectors = load_detectors(Path::new("detectors"))?;
173/// assert!(!detectors.is_empty());
174/// # Ok(()) }
175/// ```
176pub fn load_detectors(dir: &Path) -> Result<Vec<DetectorSpec>, SpecError> {
177    load_detectors_with_gate(dir, true)
178}
179
180/// Load detectors with optional quality gate enforcement.
181/// When `enforce_gate` is `true`, detectors with quality errors are skipped.
182///
183/// # Examples
184///
185/// ```rust,no_run
186/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
187/// use keyhog_core::load_detectors_with_gate;
188/// use std::path::Path;
189///
190/// let _detectors = load_detectors_with_gate(Path::new("detectors"), true)?;
191/// # Ok(()) }
192/// ```
193pub fn load_detectors_with_gate(
194    dir: &Path,
195    enforce_gate: bool,
196) -> Result<Vec<DetectorSpec>, SpecError> {
197    // Phase 1: collect all TOML file paths (fast, sequential)
198    let entries = std::fs::read_dir(dir).map_err(|e| SpecError::ReadFile {
199        path: dir.display().to_string(),
200        source: e,
201    })?;
202    let toml_paths: Vec<PathBuf> = entries
203        .filter_map(|entry| {
204            let entry = entry.ok()?;
205            let path = entry.path();
206            if path.extension().is_some_and(|ext| ext == "toml") {
207                Some(path)
208            } else {
209                None
210            }
211        })
212        .collect();
213
214    // Phase 2: read + parse all TOMLs in parallel
215    let parsed: Vec<ReadDetectorOutcome> = toml_paths
216        .par_iter()
217        .map(|path| read_detector_file(path))
218        .collect();
219
220    // Phase 3: validate + filter (sequential for logging)
221    let mut load_state = DetectorLoadState::default();
222    let mut detectors = Vec::with_capacity(parsed.len());
223
224    for outcome in parsed {
225        match outcome {
226            ReadDetectorOutcome::Loaded(spec) => {
227                if should_reject_detector(
228                    &spec,
229                    enforce_gate,
230                    &mut load_state.gate_rejected,
231                    &mut load_state.total_warnings,
232                ) {
233                    continue;
234                }
235                detectors.push(*spec);
236            }
237            ReadDetectorOutcome::Skipped { message } => {
238                load_state.skipped += 1;
239                load_state.load_errors.push(message);
240            }
241        }
242    }
243
244    log_load_summary(&load_state);
245
246    detectors.sort_by(|a, b| a.id.cmp(&b.id));
247    Ok(detectors)
248}
249
// Counters and messages accumulated while loading a detector directory;
// consumed by `log_load_summary` once loading finishes.
#[derive(Default)]
struct DetectorLoadState {
    // Number of files whose read/parse outcome was `Skipped`.
    skipped: usize,
    // One message per skipped file, describing why it was skipped.
    load_errors: Vec<String>,
    // Number of detectors rejected by the quality gate.
    gate_rejected: usize,
    // Total number of quality warnings reported across all detectors.
    total_warnings: usize,
}
257
258fn log_load_summary(state: &DetectorLoadState) {
259    if state.skipped > 0 {
260        tracing::warn!("skipped {} malformed detector files", state.skipped);
261    }
262    for error in &state.load_errors {
263        tracing::warn!("detector load issue: {error}");
264    }
265    if state.gate_rejected > 0 {
266        // Demoted from `warn!` — the per-detector causes are already
267        // logged at debug, and the aggregate fires on every CLI run
268        // that auto-discovers a `detectors/` directory (i.e. anyone
269        // running `keyhog` from the repo root). The user's output
270        // showed `Loaded 867 detectors` instead of the marketed 888;
271        // demoting this avoids that line being the first thing
272        // judges/operators see on stderr.
273        tracing::debug!(
274            "quality gate: {} detectors skipped (run with RUST_LOG=keyhog_core=debug for per-detector causes)",
275            state.gate_rejected
276        );
277    }
278    if state.total_warnings > 0 {
279        tracing::debug!("quality gate: {} warnings", state.total_warnings);
280    }
281}
282
// Result of reading and parsing a single detector TOML file.
enum ReadDetectorOutcome {
    // The file was read and parsed; carries the detector it defines (boxed).
    Loaded(Box<DetectorSpec>),
    // The file could not be read or parsed; `message` describes the failure.
    Skipped { message: String },
}
287
288fn read_detector_file(path: &Path) -> ReadDetectorOutcome {
289    let contents = match std::fs::read_to_string(path) {
290        Ok(contents) => contents,
291        Err(error) => {
292            // Bumped from `debug!` to `warn!`. A user with a broken
293            // permission/typoed-path detector deserves to see the
294            // reason at default log level — not "all detectors
295            // appeared to load" silently. The path is included so
296            // operators can grep for it.
297            let message = format!("failed to read {}: {}", path.display(), error);
298            tracing::warn!(
299                detector_path = %path.display(),
300                error = %error,
301                "skipping detector — fix the file's permissions or path \
302                 (run `keyhog detectors list` for the full skip list)"
303            );
304            return ReadDetectorOutcome::Skipped { message };
305        }
306    };
307
308    match toml::from_str::<DetectorFile>(&contents) {
309        Ok(file) => ReadDetectorOutcome::Loaded(Box::new(file.detector)),
310        Err(error) => {
311            // Same rationale: a TOML parse error (line + column
312            // included by the toml crate's Display impl) needs to
313            // surface to the user. Default `debug!` hid these
314            // entirely under the keyhog=warn filter, so a single
315            // mistyped field would silently drop one detector
316            // from the corpus and never tell the user.
317            let message = format!("failed to parse {}: {}", path.display(), error);
318            tracing::warn!(
319                detector_path = %path.display(),
320                error = %error,
321                "skipping detector — TOML parse failed, fix the syntax \
322                 in the file at the indicated line/column"
323            );
324            ReadDetectorOutcome::Skipped { message }
325        }
326    }
327}
328
329fn should_reject_detector(
330    spec: &DetectorSpec,
331    enforce_gate: bool,
332    gate_rejected: &mut usize,
333    total_warnings: &mut usize,
334) -> bool {
335    let mut has_errors = false;
336    for issue in validate_detector(spec) {
337        match issue {
338            QualityIssue::Warning(warning) => {
339                tracing::debug!("quality: {} — {}", spec.id, warning);
340                *total_warnings += 1;
341            }
342            QualityIssue::Error(error) => {
343                // Demoted from `warn!` — these errors fire on roughly
344                // a dozen embedded detectors at every CLI invocation
345                // (`scan`, `detectors`, `backend`, `--version` all
346                // load detectors), which made every command print 12+
347                // lines of dev-facing validator notes about URL
348                // templating before any actual output. The detectors
349                // still load and scan correctly; the validator just
350                // can't auto-verify them. Operators don't need this
351                // on their terminal — the keyhog dev who wrote the
352                // validator does, via `RUST_LOG=keyhog_core=debug`.
353                tracing::debug!(
354                    "detector quality issue (still loaded, verify path may degrade): {}: {}",
355                    spec.id,
356                    error
357                );
358                has_errors = true;
359            }
360        }
361    }
362
363    if has_errors && enforce_gate {
364        *gate_rejected += 1;
365        return true;
366    }
367
368    false
369}
370
371/// Load a set of detectors from a TOML string.
372///
373/// This is primarily used for testing and dynamic detector injection.
374pub fn load_detectors_from_str(toml_str: &str) -> Result<Vec<DetectorSpec>, SpecError> {
375    let file: DetectorFile = toml::from_str(toml_str).map_err(|e| SpecError::InvalidToml {
376        path: PathBuf::from("<string>"),
377        source: e,
378    })?;
379    Ok(vec![file.detector])
380}