Skip to main content

keyhog_core/spec/
load.rs

1//! Detector loading pipeline: read TOML files, run the quality gate, and inject
2//! small compatibility shims for legacy token formats when needed.
3
4use std::path::{Path, PathBuf};
5
6use rayon::prelude::*;
7use serde::{Deserialize, Serialize};
8
9use super::{DetectorFile, DetectorSpec, PatternSpec, QualityIssue, SpecError, validate_detector};
10
11const DETECTOR_CACHE_VERSION: u32 = 2;
12
13#[derive(Serialize, Deserialize)]
14struct DetectorCacheFile {
15    version: u32,
16    detectors: Vec<DetectorSpec>,
17}
18
19/// Save detectors to a JSON cache file for fast subsequent loads.
20///
21/// # Examples
22///
23/// ```rust,no_run
24/// use keyhog_core::{DetectorSpec, save_detector_cache};
25/// use std::path::Path;
26///
27/// let detectors: Vec<DetectorSpec> = Vec::new();
28/// save_detector_cache(&detectors, Path::new(".keyhog-cache.json")).unwrap();
29/// ```
30pub fn save_detector_cache(
31    detectors: &[DetectorSpec],
32    cache_path: &Path,
33) -> Result<(), std::io::Error> {
34    let json = serde_json::to_vec(&DetectorCacheFile {
35        version: DETECTOR_CACHE_VERSION,
36        detectors: detectors.to_vec(),
37    })?;
38    std::fs::write(cache_path, json)
39}
40
41/// Load detectors from a JSON cache file. Returns None if cache is stale or missing.
42///
43/// # Examples
44///
45/// ```rust,no_run
46/// use keyhog_core::load_detector_cache;
47/// use std::path::Path;
48///
49/// let _cached = load_detector_cache(
50///     Path::new(".keyhog-cache.json"),
51///     Path::new("detectors"),
52/// );
53/// ```
54///
55/// # Security
56///
57/// Cached detectors are re-validated through the quality gate to prevent cache
58/// poisoning attacks where a malicious `.keyhog-cache.json` injects evil regex
59/// patterns that bypass the TOML quality gate.
60pub fn load_detector_cache(cache_path: &Path, source_dir: &Path) -> Option<Vec<DetectorSpec>> {
61    let cache_meta = std::fs::metadata(cache_path).ok()?;
62    let cache_mtime = cache_meta.modified().ok()?;
63
64    // Check if any TOML in source_dir is newer than the cache
65    let entries = std::fs::read_dir(source_dir).ok()?;
66    for entry in entries.flatten() {
67        let path = entry.path();
68        if path.extension().is_some_and(|ext| ext == "toml") {
69            let is_stale = std::fs::metadata(&path)
70                .and_then(|meta| meta.modified())
71                .is_ok_and(|mtime| mtime > cache_mtime);
72
73            if is_stale {
74                return None; // Cache is stale
75            }
76        }
77    }
78
79    let data = std::fs::read(cache_path).ok()?;
80    let cache: DetectorCacheFile = serde_json::from_slice(&data).ok()?;
81    if cache.version != DETECTOR_CACHE_VERSION {
82        return None;
83    }
84
85    // Re-validate cached detectors to prevent cache poisoning.
86    let validated: Vec<DetectorSpec> = cache
87        .detectors
88        .into_iter()
89        .filter(|spec| {
90            let issues = validate_detector(spec);
91            let has_errors = issues
92                .iter()
93                .any(|issue| matches!(issue, QualityIssue::Error(_)));
94            if has_errors {
95                tracing::warn!(
96                    "cached detector '{}' failed quality gate, discarding",
97                    spec.id
98                );
99            }
100            !has_errors
101        })
102        .collect();
103
104    if validated.is_empty() {
105        tracing::warn!("all cached detectors failed validation, falling back to TOML load");
106        return None;
107    }
108
109    Some(validated)
110}
111
112/// Load all detector specs from a directory of TOML files.
113/// Runs quality gate on each detector. Rejects detectors with errors, warns on issues.
114///
115/// # Examples
116///
117/// ```rust,no_run
118/// use keyhog_core::load_detectors;
119/// use std::path::Path;
120///
121/// let detectors = load_detectors(Path::new("detectors")).unwrap();
122/// assert!(!detectors.is_empty());
123/// ```
124pub fn load_detectors(dir: &Path) -> Result<Vec<DetectorSpec>, SpecError> {
125    load_detectors_with_gate(dir, true)
126}
127
128/// Load detectors with optional quality gate enforcement.
129/// When `enforce_gate` is `true`, detectors with quality errors are skipped.
130///
131/// # Examples
132///
133/// ```rust,no_run
134/// use keyhog_core::load_detectors_with_gate;
135/// use std::path::Path;
136///
137/// let _detectors = load_detectors_with_gate(Path::new("detectors"), true).unwrap();
138/// ```
139pub fn load_detectors_with_gate(
140    dir: &Path,
141    enforce_gate: bool,
142) -> Result<Vec<DetectorSpec>, SpecError> {
143    // Phase 1: collect all TOML file paths (fast, sequential)
144    let entries = std::fs::read_dir(dir).map_err(|e| SpecError::ReadFile {
145        path: dir.display().to_string(),
146        source: e,
147    })?;
148    let toml_paths: Vec<PathBuf> = entries
149        .filter_map(|entry| {
150            let entry = entry.ok()?;
151            let path = entry.path();
152            if path.extension().is_some_and(|ext| ext == "toml") {
153                Some(path)
154            } else {
155                None
156            }
157        })
158        .collect();
159
160    // Phase 2: read + parse all TOMLs in parallel
161    let parsed: Vec<Option<DetectorSpec>> = toml_paths
162        .par_iter()
163        .map(|path| {
164            let mut skipped = 0;
165            let mut errors = Vec::new();
166            read_detector_file(path, &mut skipped, &mut errors)
167        })
168        .collect();
169
170    // Phase 3: validate + filter (sequential for logging)
171    let mut load_state = DetectorLoadState::default();
172    let mut detectors = Vec::with_capacity(parsed.len());
173
174    for spec in parsed.into_iter().flatten() {
175        if should_reject_detector(
176            &spec,
177            enforce_gate,
178            &mut load_state.gate_rejected,
179            &mut load_state.total_warnings,
180        ) {
181            continue;
182        }
183        detectors.push(spec);
184    }
185
186    if should_inject_github_classic_pat_detector(&detectors) {
187        inject_github_classic_pat_detector(&mut detectors);
188    }
189
190    log_load_summary(&load_state);
191
192    detectors.sort_by(|a, b| a.id.cmp(&b.id));
193    Ok(detectors)
194}
195
/// Counters and messages gathered during a detector load, consumed by
/// `log_load_summary`.
#[derive(Default)]
struct DetectorLoadState {
    /// Number of files that could not be read or parsed.
    skipped: usize,
    /// One human-readable message per file that failed to load.
    load_errors: Vec<String>,
    /// Number of detectors rejected by the quality gate.
    gate_rejected: usize,
    /// Total number of quality warnings seen across all detectors.
    total_warnings: usize,
}
203
204fn log_load_summary(state: &DetectorLoadState) {
205    if state.skipped > 0 {
206        tracing::info!("skipped {} unparseable files", state.skipped);
207    }
208    for error in &state.load_errors {
209        tracing::info!("detector load issue: {error}");
210    }
211    if state.gate_rejected > 0 {
212        tracing::info!("quality gate: rejected {} detectors", state.gate_rejected);
213    }
214    if state.total_warnings > 0 {
215        tracing::debug!("quality gate: {} warnings", state.total_warnings);
216    }
217}
218
219fn read_detector_file(
220    path: &Path,
221    skipped: &mut usize,
222    load_errors: &mut Vec<String>,
223) -> Option<DetectorSpec> {
224    let contents = match std::fs::read_to_string(path) {
225        Ok(contents) => contents,
226        Err(error) => {
227            let message = format!("failed to read {}: {}", path.display(), error);
228            tracing::debug!("{message}");
229            load_errors.push(message);
230            *skipped += 1;
231            return None;
232        }
233    };
234
235    match toml::from_str::<DetectorFile>(&contents) {
236        Ok(file) => Some(file.detector),
237        Err(error) => {
238            let message = format!("failed to parse {}: {}", path.display(), error);
239            tracing::debug!("{message}");
240            load_errors.push(message);
241            *skipped += 1;
242            None
243        }
244    }
245}
246
247fn should_reject_detector(
248    spec: &DetectorSpec,
249    enforce_gate: bool,
250    gate_rejected: &mut usize,
251    total_warnings: &mut usize,
252) -> bool {
253    let mut has_errors = false;
254    for issue in validate_detector(spec) {
255        match issue {
256            QualityIssue::Warning(warning) => {
257                tracing::debug!("quality: {} — {}", spec.id, warning);
258                *total_warnings += 1;
259            }
260            QualityIssue::Error(error) => {
261                tracing::warn!("failed to validate detector: {}: {}", spec.id, error);
262                has_errors = true;
263            }
264        }
265    }
266
267    if has_errors && enforce_gate {
268        *gate_rejected += 1;
269        return true;
270    }
271
272    false
273}
274
275pub(super) fn inject_github_classic_pat_detector(detectors: &mut Vec<DetectorSpec>) {
276    let Some(github_fine_grained) = detectors
277        .iter()
278        .find(|d| d.id == "github-pat-fine-grained")
279        .cloned()
280    else {
281        return;
282    };
283
284    let mut compat = github_fine_grained;
285    compat.id = "github-classic-pat".into();
286    compat.name = "GitHub Classic PAT".into();
287    compat.keywords = vec!["ghp_".into(), "github".into()];
288    compat.patterns = vec![PatternSpec {
289        regex: "ghp_[a-zA-Z0-9]{36,40}".into(),
290        description: Some("GitHub classic personal access token".into()),
291        group: None,
292    }];
293
294    detectors.push(compat);
295}
296
297fn should_inject_github_classic_pat_detector(detectors: &[DetectorSpec]) -> bool {
298    !detectors.iter().any(|d| d.id == "github-classic-pat")
299        && detectors.iter().any(|d| d.id == "github-pat-fine-grained")
300}