1mod backend;
4mod backend_dispatch;
5mod backend_pattern_hits;
6mod backend_prepared;
7mod backend_triggered;
8pub mod boundary;
9mod compile;
10mod extract;
11mod fallback;
12mod fallback_entropy;
13mod fallback_entropy_helpers;
14mod fallback_generic;
15mod gpu_ac_phase1;
16mod gpu_cache;
17mod gpu_coalesce;
18#[allow(dead_code)]
19pub mod gpu_decode_scan;
20mod gpu_dispatch;
21mod gpu_forced;
22mod gpu_lazy;
23mod gpu_literal_phase1;
24mod gpu_megascan;
25mod gpu_phase2;
26pub(crate) mod gpu_postprocess;
27#[allow(dead_code)]
28pub mod gpu_program_fusion;
29#[allow(dead_code)]
30pub mod gpu_regex_dfa;
31mod gpu_scan_wrappers;
32mod hot_patterns;
33mod process;
34mod rule_pipeline;
35mod scan;
36mod scan_filters;
37mod scan_postprocess;
38pub mod segment_attribution;
39mod windowed;
40
41#[cfg(feature = "simd")]
46pub(crate) use backend_prepared::build_simd_scanner;
47pub(crate) use backend_prepared::PreparedChunk;
48pub use gpu_cache::{AcConstPacks, GpuConstPacks};
49pub use gpu_coalesce::coalesce_chunks;
50pub use gpu_regex_dfa::{build_regex_dfa, RegexDfaError};
51pub use gpu_scan_wrappers::GpuPhase1Output;
52pub use rule_pipeline::{
53 build_rule_pipeline, megascan_input_len, rule_pipeline_cached, AC_GPU_MAX_MATCHES_PER_DISPATCH,
54 MEGASCAN_INPUT_LEN, MEGASCAN_INPUT_LEN_DEFAULT,
55};
56pub use windowed::{
57 floor_char_boundary, line_number_for_offset, next_window_offset, record_window_match,
58 window_chunk, window_end_offset,
59};
60
61use crate::compiler::*;
62use crate::error::Result;
63use crate::pipeline::*;
64use crate::types::*;
65use aho_corasick::AhoCorasick;
66use keyhog_core::{Chunk, DetectorSpec, RawMatch};
67use std::sync::Arc;
68use std::sync::OnceLock;
69
70pub use vyre_libs::scan::LiteralMatch;
71
/// Computes the deadline for scanning one chunk from the
/// `KEYHOG_PER_CHUNK_TIMEOUT_MS` environment variable.
///
/// The variable is read and parsed only on the first call; the parsed value
/// is cached for the lifetime of the process, so later changes to the
/// environment are not observed.
///
/// Returns `None` when the variable is unset, is not a valid `u64`, or is
/// `0`. Otherwise returns the current instant plus the configured number of
/// milliseconds, recomputed on every call.
fn env_per_chunk_deadline() -> Option<std::time::Instant> {
    use std::time::{Duration, Instant};

    static TIMEOUT_MS: std::sync::OnceLock<Option<u64>> = std::sync::OnceLock::new();

    let timeout_ms = *TIMEOUT_MS.get_or_init(|| {
        let raw = std::env::var("KEYHOG_PER_CHUNK_TIMEOUT_MS").ok()?;
        match raw.parse::<u64>() {
            // Zero and unparsable values both mean "no timeout configured".
            Ok(0) | Err(_) => None,
            Ok(ms) => Some(ms),
        }
    });

    let budget = Duration::from_millis(timeout_ms?);
    Some(Instant::now() + budget)
}
99
/// Outcome of scoring a candidate: either a finished score or, when the `ml`
/// feature is enabled, the inputs for a scoring step that is still pending.
///
/// The lifetime `'a` is the borrow carried by the `Cow` payloads of
/// [`MlScoreResult::Pending`].
pub enum MlScoreResult<'a> {
    /// A score that needs no further processing.
    Final(f64),
    /// Scoring has not completed yet; carries the data captured so far.
    /// Only exists when the `ml` feature is enabled.
    #[cfg(feature = "ml")]
    Pending {
        /// Confidence value carried alongside the pending request
        /// (presumably the heuristic, pre-ML score — confirm at the producer).
        heuristic_conf: f64,
        /// Code context associated with the candidate.
        code_context: crate::context::CodeContext,
        /// Candidate credential text, borrowed or owned.
        credential: std::borrow::Cow<'a, str>,
        /// Context text paired with the credential, borrowed or owned.
        ml_context: std::borrow::Cow<'a, str>,
    },
    /// Placeholder that keeps the `'a` parameter in use when the `ml` feature
    /// is disabled and `Pending` (the only other user of `'a`) is compiled out.
    #[cfg(not(feature = "ml"))]
    #[doc(hidden)]
    _Lifetime(std::marker::PhantomData<&'a ()>),
}
119
/// A compiled scanner: the detector set, its compiled patterns, prefilters and
/// optional GPU state, bundled so chunks can be scanned through `&self`.
///
/// The type is asserted to be `Send + Sync` at compile time elsewhere in this
/// module.
pub struct CompiledScanner {
    /// Fragment cache; emptied by `clear_fragment_cache`.
    pub(crate) fragment_cache: crate::fragment_cache::FragmentCache,
    /// Aho-Corasick automaton, when one is present.
    /// NOTE(review): presumably paired index-for-index with `ac_map` — confirm in the compile step.
    pub(crate) ac: Option<AhoCorasick>,
    /// Type-erased GPU backend handle, if any. `gpu_backend_label` reports its `id()`.
    pub(crate) gpu_backend: Option<Arc<dyn vyre::VyreBackend>>,
    /// Concrete wgpu backend handle, if any.
    pub(crate) wgpu_backend: Option<Arc<vyre_driver_wgpu::WgpuBackend>>,
    /// Shared list of literal byte strings for the GPU path, if any.
    pub(crate) gpu_literals: Option<Arc<Vec<Vec<u8>>>>,
    // The `OnceLock` fields below are write-once slots: each can be initialised
    // at most once through `&self`. Where the payload is an `Option`, `None`
    // presumably records that the artefact could not be built — confirm at the
    // initialisation sites.
    /// Write-once slot for the GPU literal matcher.
    pub(crate) gpu_matcher: OnceLock<Option<vyre_libs::scan::GpuLiteralSet>>,
    /// Write-once slot for the GPU constant packs.
    pub(crate) gpu_const_packs: OnceLock<GpuConstPacks>,
    /// Write-once slot for the Aho-Corasick GPU constant packs.
    pub(crate) gpu_ac_const_packs: OnceLock<AcConstPacks>,
    /// Write-once slot for the Aho-Corasick GPU program.
    pub(crate) ac_gpu_program: OnceLock<Option<vyre::Program>>,
    /// Write-once slot for the rule pipeline.
    pub(crate) rule_pipeline: OnceLock<Option<vyre_libs::scan::RulePipeline>>,
    /// Write-once slot for the fused GPU program.
    pub(crate) fused_program: OnceLock<Option<vyre::Program>>,
    /// Write-once slot for the fused decode-and-scan GPU programs.
    pub(crate) fused_decode_programs: OnceLock<Option<gpu_decode_scan::FusedDecodeScanPrograms>>,
    /// Shared static string interner.
    pub(crate) static_intern: Arc<crate::static_intern::StaticInterner>,
    /// Compiled patterns of the Aho-Corasick path; counted by `pattern_count`
    /// and warmed by `warm`.
    pub(crate) ac_map: Vec<CompiledPattern>,
    /// Per-entry lists of indices used for prefix propagation
    /// (NOTE(review): index space not visible here — confirm).
    pub(crate) prefix_propagation: Vec<Vec<usize>>,
    /// Fallback patterns, each paired with a list of strings (presumably its
    /// keywords — confirm); counted by `pattern_count` and warmed by `warm`.
    pub(crate) fallback: Vec<(CompiledPattern, Vec<String>)>,
    /// Compiled companion patterns, grouped in an outer list
    /// (NOTE(review): presumably one group per detector — confirm).
    pub(crate) companions: Vec<Vec<CompiledCompanion>>,
    /// Detector specifications; `detector_count` returns this list's length.
    pub(crate) detectors: Vec<DetectorSpec>,
    /// Per-entry lists of indices of patterns sharing a prefix
    /// (NOTE(review): index space not visible here — confirm).
    pub(crate) same_prefix_patterns: Vec<Vec<usize>>,
    /// Aho-Corasick automaton over fallback keywords, when one is present.
    pub(crate) fallback_keyword_ac: Option<AhoCorasick>,
    /// Maps each fallback keyword to pattern indices
    /// (presumably indices into `fallback` — confirm).
    pub(crate) fallback_keyword_to_patterns: Vec<Vec<usize>>,
    /// Per-pattern flag marking fallback patterns that are always active
    /// (presumably indexed like `fallback` — confirm).
    pub(crate) fallback_always_active: Vec<bool>,
    /// SIMD prefilter scanner; only present with the `simd` feature.
    #[cfg(feature = "simd")]
    pub(crate) simd_prefilter: Option<crate::simd::backend::HsScanner>,
    /// Index mapping for the SIMD scanner's hits; only present with the `simd` feature.
    #[cfg(feature = "simd")]
    pub(crate) hs_index_map: Vec<Vec<usize>>,
    /// Optional validator regex per hot pattern; only present with the
    /// `simdsieve` feature.
    #[cfg(feature = "simdsieve")]
    pub(crate) hot_pattern_validators: Vec<Option<regex::Regex>>,
    /// Scanner configuration.
    pub config: ScannerConfig,
    /// Optional alphabet screen; when set, a chunk it rejects is skipped
    /// before any pattern matching.
    pub alphabet_screen: Option<crate::alphabet_filter::AlphabetScreen>,
    /// Bigram bloom filter; chunks of at least 64 bytes that it reports as
    /// non-overlapping are skipped before any pattern matching.
    pub(crate) bigram_bloom: crate::bigram_bloom::BigramBloom,
}
163
164const _: () = {
165 const fn assert_send_sync<T: Send + Sync>() {}
166 let _ = assert_send_sync::<CompiledScanner>;
167};
168
169impl CompiledScanner {
170 pub fn detector_count(&self) -> usize {
172 self.detectors.len()
173 }
174
175 pub fn pattern_count(&self) -> usize {
177 self.ac_map.len() + self.fallback.len()
178 }
179
180 pub fn warm(&self) {
195 use rayon::prelude::*;
196 self.ac_map.par_iter().for_each(|p| {
197 let _ = p.regex.get();
198 });
199 self.fallback.par_iter().for_each(|(p, _)| {
200 let _ = p.regex.get();
201 });
202 }
203
204 pub fn pattern_regex_strs(&self) -> Vec<&str> {
207 let mut out = Vec::with_capacity(self.ac_map.len() + self.fallback.len());
208 out.extend(self.ac_map.iter().map(|p| p.regex.as_str()));
209 out.extend(self.fallback.iter().map(|(p, _)| p.regex.as_str()));
210 out
211 }
212
213 #[must_use]
215 pub fn select_backend_for_file(&self, file_size: u64) -> crate::hw_probe::ScanBackend {
216 crate::hw_probe::select_backend(
217 crate::hw_probe::probe_hardware(),
218 file_size,
219 self.pattern_count(),
220 )
221 }
222
223 #[must_use]
231 pub fn gpu_backend_label(&self) -> Option<&'static str> {
232 self.gpu_backend.as_ref().map(|b| b.id())
233 }
234
235 #[must_use]
237 pub fn preferred_backend_label(&self) -> &'static str {
238 self.select_backend_for_file(0).label()
239 }
240
241 pub fn warm_backend(&self, backend: crate::hw_probe::ScanBackend) -> bool {
243 let ready = match backend {
244 crate::hw_probe::ScanBackend::Gpu => self.gpu_stack_usable(),
245 crate::hw_probe::ScanBackend::MegaScan => {
246 let pipeline_ready = self.rule_pipeline().is_some();
247 let stack_ready = self.gpu_stack_usable();
248 if !pipeline_ready && stack_ready {
249 gpu_forced::deny_silent_megascan_degrade(
250 "regex pipeline compile rejected the detector set",
251 );
252 }
253 pipeline_ready && stack_ready
254 }
255 crate::hw_probe::ScanBackend::SimdCpu | crate::hw_probe::ScanBackend::CpuFallback => {
256 true
257 }
258 };
259 if !ready {
260 gpu_forced::deny_silent_gpu_degrade(self, backend);
261 }
262 ready
263 }
264
265 pub fn scan(&self, chunk: &Chunk) -> Vec<RawMatch> {
267 self.scan_with_deadline(chunk, env_per_chunk_deadline())
268 }
269
270 pub fn scan_with_backend(
272 &self,
273 chunk: &Chunk,
274 backend: crate::hw_probe::ScanBackend,
275 ) -> Vec<RawMatch> {
276 self.scan_with_deadline_and_backend(chunk, env_per_chunk_deadline(), Some(backend))
277 }
278
279 pub fn scan_chunks_with_backend(
281 &self,
282 chunks: &[Chunk],
283 backend: crate::hw_probe::ScanBackend,
284 ) -> Vec<Vec<RawMatch>> {
285 gpu_forced::deny_silent_gpu_degrade(self, backend);
286 self.scan_chunks_with_backend_internal(chunks, backend)
287 }
288
289 pub fn clear_fragment_cache(&self) {
291 self.fragment_cache.clear();
292 }
293
294 pub fn scan_with_deadline(
296 &self,
297 chunk: &Chunk,
298 deadline: Option<std::time::Instant>,
299 ) -> Vec<RawMatch> {
300 self.scan_with_deadline_and_backend(chunk, deadline, None)
301 }
302
303 pub fn scan_with_deadline_and_backend(
304 &self,
305 chunk: &Chunk,
306 deadline: Option<std::time::Instant>,
307 backend: Option<crate::hw_probe::ScanBackend>,
308 ) -> Vec<RawMatch> {
309 if let Some(path) = chunk.metadata.path.as_deref() {
310 let filename = path.rsplit(['/', '\\']).next().unwrap_or(path);
311 if filename == ".keyhog"
312 || filename == ".keyhogignore"
313 || path.split(['/', '\\']).any(|c| c == "detectors")
314 {
315 crate::telemetry::record_file_skipped();
316 return Vec::new();
317 }
318 }
319
320 if let Some(screen) = &self.alphabet_screen {
321 if !screen.screen(chunk.data.as_bytes()) {
322 crate::telemetry::record_file_skipped();
323 return Vec::new();
324 }
325 }
326
327 if chunk.data.len() >= 64 && !self.bigram_bloom.maybe_overlaps(chunk.data.as_bytes()) {
328 crate::telemetry::record_file_skipped();
329 return Vec::new();
330 }
331
332 let selected_backend =
333 backend.unwrap_or_else(|| self.select_backend_for_file(chunk.data.len() as u64));
334 gpu_forced::deny_silent_gpu_degrade(self, selected_backend);
335 tracing::trace!(
336 target: "keyhog::routing",
337 backend = selected_backend.label(),
338 chunk_bytes = chunk.data.len(),
339 source_type = chunk.metadata.source_type.as_str(),
340 "scan dispatch"
341 );
342 let mut matches = if chunk.data.len() > MAX_SCAN_CHUNK_BYTES {
343 self.scan_windowed(chunk, deadline)
344 } else {
345 self.scan_inner(chunk, selected_backend, deadline)
346 };
347
348 self.post_process_matches(chunk, &mut matches, deadline);
349
350 matches
351 }
352}