Skip to main content

keyhog_core/spec/
load.rs

//! Detector loading pipeline: read TOML files, run the quality gate, and inject
//! small compatibility shims for legacy token formats when needed.
3
4use std::io;
5use std::path::{Path, PathBuf};
6
7use rayon::prelude::*;
8use serde::{Deserialize, Serialize};
9
10use super::{validate_detector, DetectorFile, DetectorSpec, QualityIssue, SpecError};
11
/// Version stamp written into every detector cache file.
///
/// `load_detector_cache` returns `None` for a cache whose stored version
/// differs from this value, so the caller falls back to the TOML sources.
const DETECTOR_CACHE_VERSION: u32 = 2;
13
/// On-disk JSON layout of the detector cache.
#[derive(Serialize, Deserialize)]
struct DetectorCacheFile {
    /// Cache format version; compared against `DETECTOR_CACHE_VERSION` on load.
    version: u32,
    /// The cached detector specs.
    detectors: Vec<DetectorSpec>,
}
19
20/// Save detectors to a JSON cache file for fast subsequent loads.
21///
22/// # Examples
23///
24/// ```rust,no_run
25/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
26/// use keyhog_core::{DetectorSpec, save_detector_cache};
27/// use std::path::Path;
28///
29/// let detectors: Vec<DetectorSpec> = Vec::new();
30/// save_detector_cache(&detectors, Path::new(".keyhog-cache.json"))?;
31/// # Ok(()) }
32/// ```
33pub fn save_detector_cache(
34    detectors: &[DetectorSpec],
35    cache_path: &Path,
36) -> Result<(), std::io::Error> {
37    for detector in detectors {
38        let issues = validate_detector(detector);
39        if issues
40            .iter()
41            .any(|issue| matches!(issue, QualityIssue::Error(_)))
42        {
43            return Err(io::Error::new(
44                io::ErrorKind::InvalidData,
45                format!(
46                    "refusing to cache invalid detector '{}'. Fix: repair the detector before writing the cache",
47                    detector.id
48                ),
49            ));
50        }
51    }
52
53    let json = serde_json::to_vec(&DetectorCacheFile {
54        version: DETECTOR_CACHE_VERSION,
55        detectors: detectors.to_vec(),
56    })?;
57    std::fs::write(cache_path, json)
58}
59
60/// Load detectors from a JSON cache file. Returns None if cache is stale or missing.
61///
62/// # Examples
63///
64/// ```rust,no_run
65/// use keyhog_core::load_detector_cache;
66/// use std::path::Path;
67///
68/// let _cached = load_detector_cache(
69///     Path::new(".keyhog-cache.json"),
70///     Path::new("detectors"),
71/// );
72/// ```
73///
74/// # Security
75///
76/// Cached detectors are re-validated through the quality gate to prevent cache
77/// poisoning attacks where a malicious `.keyhog-cache.json` injects evil regex
78/// patterns that bypass the TOML quality gate.
79pub fn load_detector_cache(cache_path: &Path, source_dir: &Path) -> Option<Vec<DetectorSpec>> {
80    let cache_meta = std::fs::metadata(cache_path).ok()?;
81    let cache_mtime = cache_meta.modified().ok()?;
82
83    // Check if any TOML in source_dir is newer than the cache
84    let entries = std::fs::read_dir(source_dir).ok()?;
85    for entry in entries.flatten() {
86        let path = entry.path();
87        if path.extension().is_some_and(|ext| ext == "toml") {
88            let is_stale = std::fs::metadata(&path)
89                .and_then(|meta| meta.modified())
90                .is_ok_and(|mtime| mtime > cache_mtime);
91
92            if is_stale {
93                return None; // Cache is stale
94            }
95        }
96    }
97
98    let data = match std::fs::read(cache_path) {
99        Ok(data) => data,
100        Err(error) => {
101            tracing::warn!(
102                "failed to read detector cache {}: {}",
103                cache_path.display(),
104                error
105            );
106            return None;
107        }
108    };
109    let cache: DetectorCacheFile = match serde_json::from_slice(&data) {
110        Ok(cache) => cache,
111        Err(error) => {
112            tracing::warn!(
113                "failed to parse detector cache {}: {}",
114                cache_path.display(),
115                error
116            );
117            return None;
118        }
119    };
120    if cache.version != DETECTOR_CACHE_VERSION {
121        return None;
122    }
123
124    let mut validated = Vec::with_capacity(cache.detectors.len());
125    for spec in cache.detectors {
126        let issues = validate_detector(&spec);
127        if issues
128            .iter()
129            .any(|issue| matches!(issue, QualityIssue::Error(_)))
130        {
131            tracing::warn!(
132                "cached detector '{}' failed quality gate; discarding the entire cache",
133                spec.id
134            );
135            return None;
136        }
137        validated.push(spec);
138    }
139
140    if validated.is_empty() {
141        tracing::warn!("detector cache is empty after validation, falling back to TOML load");
142        return None;
143    }
144
145    Some(validated)
146}
147
148/// Load all detector specs from a directory of TOML files.
149/// Runs quality gate on each detector. Rejects detectors with errors, warns on issues.
150///
151/// # Examples
152///
153/// ```rust,no_run
154/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
155/// use keyhog_core::load_detectors;
156/// use std::path::Path;
157///
158/// let detectors = load_detectors(Path::new("detectors"))?;
159/// assert!(!detectors.is_empty());
160/// # Ok(()) }
161/// ```
162pub fn load_detectors(dir: &Path) -> Result<Vec<DetectorSpec>, SpecError> {
163    load_detectors_with_gate(dir, true)
164}
165
166/// Load detectors with optional quality gate enforcement.
167/// When `enforce_gate` is `true`, detectors with quality errors are skipped.
168///
169/// # Examples
170///
171/// ```rust,no_run
172/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
173/// use keyhog_core::load_detectors_with_gate;
174/// use std::path::Path;
175///
176/// let _detectors = load_detectors_with_gate(Path::new("detectors"), true)?;
177/// # Ok(()) }
178/// ```
179pub fn load_detectors_with_gate(
180    dir: &Path,
181    enforce_gate: bool,
182) -> Result<Vec<DetectorSpec>, SpecError> {
183    // Phase 1: collect all TOML file paths (fast, sequential)
184    let entries = std::fs::read_dir(dir).map_err(|e| SpecError::ReadFile {
185        path: dir.display().to_string(),
186        source: e,
187    })?;
188    let toml_paths: Vec<PathBuf> = entries
189        .filter_map(|entry| {
190            let entry = entry.ok()?;
191            let path = entry.path();
192            if path.extension().is_some_and(|ext| ext == "toml") {
193                Some(path)
194            } else {
195                None
196            }
197        })
198        .collect();
199
200    // Phase 2: read + parse all TOMLs in parallel
201    let parsed: Vec<ReadDetectorOutcome> = toml_paths
202        .par_iter()
203        .map(|path| read_detector_file(path))
204        .collect();
205
206    // Phase 3: validate + filter (sequential for logging)
207    let mut load_state = DetectorLoadState::default();
208    let mut detectors = Vec::with_capacity(parsed.len());
209
210    for outcome in parsed {
211        match outcome {
212            ReadDetectorOutcome::Loaded(spec) => {
213                if should_reject_detector(
214                    &spec,
215                    enforce_gate,
216                    &mut load_state.gate_rejected,
217                    &mut load_state.total_warnings,
218                ) {
219                    continue;
220                }
221                detectors.push(*spec);
222            }
223            ReadDetectorOutcome::Skipped { message } => {
224                load_state.skipped += 1;
225                load_state.load_errors.push(message);
226            }
227        }
228    }
229
230    log_load_summary(&load_state);
231
232    detectors.sort_by(|a, b| a.id.cmp(&b.id));
233    Ok(detectors)
234}
235
/// Counters and messages accumulated while loading a detector directory,
/// reported once at the end by `log_load_summary`.
#[derive(Default)]
struct DetectorLoadState {
    /// Number of detector files that could not be read or parsed.
    skipped: usize,
    /// One human-readable message per skipped file.
    load_errors: Vec<String>,
    /// Number of detectors dropped because the quality gate was enforced and
    /// they reported at least one error.
    gate_rejected: usize,
    /// Total number of quality warnings seen across all detectors.
    total_warnings: usize,
}
243
244fn log_load_summary(state: &DetectorLoadState) {
245    if state.skipped > 0 {
246        tracing::warn!("skipped {} malformed detector files", state.skipped);
247    }
248    for error in &state.load_errors {
249        tracing::warn!("detector load issue: {error}");
250    }
251    if state.gate_rejected > 0 {
252        // Demoted from `warn!` — the per-detector causes are already
253        // logged at debug, and the aggregate fires on every CLI run
254        // that auto-discovers a `detectors/` directory (i.e. anyone
255        // running `keyhog` from the repo root). The user's output
256        // showed `Loaded 867 detectors` instead of the marketed 888;
257        // demoting this avoids that line being the first thing
258        // judges/operators see on stderr.
259        tracing::debug!(
260            "quality gate: {} detectors skipped (run with RUST_LOG=keyhog_core=debug for per-detector causes)",
261            state.gate_rejected
262        );
263    }
264    if state.total_warnings > 0 {
265        tracing::debug!("quality gate: {} warnings", state.total_warnings);
266    }
267}
268
/// Result of trying to read and parse a single detector TOML file.
enum ReadDetectorOutcome {
    /// The file parsed successfully; holds the detector it defines.
    // NOTE(review): presumably boxed to keep this enum small — confirm.
    Loaded(Box<DetectorSpec>),
    /// The file could not be read or parsed; `message` describes why.
    Skipped { message: String },
}
273
274fn read_detector_file(path: &Path) -> ReadDetectorOutcome {
275    let contents = match std::fs::read_to_string(path) {
276        Ok(contents) => contents,
277        Err(error) => {
278            let message = format!("failed to read {}: {}", path.display(), error);
279            tracing::debug!("{message}");
280            return ReadDetectorOutcome::Skipped { message };
281        }
282    };
283
284    match toml::from_str::<DetectorFile>(&contents) {
285        Ok(file) => ReadDetectorOutcome::Loaded(Box::new(file.detector)),
286        Err(error) => {
287            let message = format!("failed to parse {}: {}", path.display(), error);
288            tracing::debug!("{message}");
289            ReadDetectorOutcome::Skipped { message }
290        }
291    }
292}
293
294fn should_reject_detector(
295    spec: &DetectorSpec,
296    enforce_gate: bool,
297    gate_rejected: &mut usize,
298    total_warnings: &mut usize,
299) -> bool {
300    let mut has_errors = false;
301    for issue in validate_detector(spec) {
302        match issue {
303            QualityIssue::Warning(warning) => {
304                tracing::debug!("quality: {} — {}", spec.id, warning);
305                *total_warnings += 1;
306            }
307            QualityIssue::Error(error) => {
308                // Demoted from `warn!` — these errors fire on roughly
309                // a dozen embedded detectors at every CLI invocation
310                // (`scan`, `detectors`, `backend`, `--version` all
311                // load detectors), which made every command print 12+
312                // lines of dev-facing validator notes about URL
313                // templating before any actual output. The detectors
314                // still load and scan correctly; the validator just
315                // can't auto-verify them. Operators don't need this
316                // on their terminal — the keyhog dev who wrote the
317                // validator does, via `RUST_LOG=keyhog_core=debug`.
318                tracing::debug!(
319                    "detector quality issue (still loaded, verify path may degrade): {}: {}",
320                    spec.id,
321                    error
322                );
323                has_errors = true;
324            }
325        }
326    }
327
328    if has_errors && enforce_gate {
329        *gate_rejected += 1;
330        return true;
331    }
332
333    false
334}
335
336/// Load a set of detectors from a TOML string.
337///
338/// This is primarily used for testing and dynamic detector injection.
339pub fn load_detectors_from_str(toml_str: &str) -> Result<Vec<DetectorSpec>, SpecError> {
340    let file: DetectorFile = toml::from_str(toml_str).map_err(|e| SpecError::InvalidToml {
341        path: PathBuf::from("<string>"),
342        source: e,
343    })?;
344    Ok(vec![file.detector])
345}