use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Duration;
use vyre::VyreBackend;
use vyre_foundation::match_result::Match;
use vyre_primitives::hash::fnv1a::{fnv1a64_initial_state, fnv1a64_update_byte};
/// Process-wide counter; `cache_tmp_path` takes the next value for every temp
/// file name it builds, so names produced inside one process never repeat.
static CACHE_TMP_SEQUENCE: AtomicU64 = AtomicU64::new(0);
/// 64 MiB. Default value of `MatchEngineCache::MAX_CACHE_BYTES`, i.e. the
/// largest cache file the loader reads back unless an implementor overrides it.
const MAX_MATCH_ENGINE_CACHE_BYTES: u64 = 64 * 1024 * 1024;
/// Result record of a scan: the matches plus bookkeeping fields.
///
/// The derived `Default` gives an empty match list, `false` flags and a zero
/// duration.
#[derive(Debug, Clone, Default)]
pub struct ScanResult {
/// Matches carried by this result.
pub matches: Vec<Match>,
/// NOTE(review): presumably set when the match list was cut short by a
/// limit; nothing in the visible code writes it — confirm with producers.
pub truncated: bool,
/// NOTE(review): presumably the time the scan took — confirm with producers.
pub elapsed: Duration,
/// NOTE(review): presumably whether the engine came from the on-disk cache —
/// confirm with producers.
pub cache_hit: bool,
}
impl ScanResult {
    /// Builds a result that carries `matches`; every other field keeps the
    /// value supplied by `Default`.
    #[must_use]
    pub fn from_matches(matches: Vec<Match>) -> Self {
        let defaults = Self::default();
        Self { matches, ..defaults }
    }

    /// Number of entries in `matches`.
    #[must_use]
    pub fn len(&self) -> usize {
        let Self { matches, .. } = self;
        matches.len()
    }

    /// `true` when `matches` holds no entries.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        let Self { matches, .. } = self;
        matches.is_empty()
    }
}
/// Common scanning interface implemented by the match engines in this crate.
pub trait MatchScan {
/// Scans `haystack` using `backend` and returns the matches, or the backend's
/// error.
///
/// NOTE(review): `max_matches` is presumably an upper bound on the number of
/// returned matches — the exact truncation behavior is defined by each
/// implementor; confirm there.
fn scan(
&self,
backend: &dyn VyreBackend,
haystack: &[u8],
max_matches: u32,
) -> Result<Vec<Match>, vyre::BackendError>;
/// Scans `haystack` without a backend and cannot fail.
///
/// NOTE(review): presumably a host-side reference implementation used to
/// cross-check `scan` — confirm with implementors.
fn reference_scan(&self, haystack: &[u8]) -> Vec<Match>;
/// Returns a string key identifying this engine's compiled content. The
/// implementations visible in this file derive it from a hash of the engine's
/// tables plus a type-specific prefix.
fn cache_key(&self) -> String;
}
/// Serialization contract for engines that `cached_load_or_compile` can persist
/// to disk and load back.
pub trait MatchEngineCache: Sized {
/// Error type produced by `to_bytes` and `from_bytes`.
type WireError: std::fmt::Display + std::fmt::Debug;
/// Four-byte tag for the implementor's wire format. Not read by the cache
/// helpers visible in this file — presumably validated inside `from_bytes`;
/// confirm with implementors.
const WIRE_MAGIC: [u8; 4];
/// Version number of the wire format. Not read by the cache helpers visible in
/// this file — presumably validated inside `from_bytes`; confirm with
/// implementors.
const WIRE_VERSION: u32;
/// Largest cache file, in bytes, that `cached_load_or_compile` reads back for
/// this type; a bigger file is deleted and the engine is recompiled.
const MAX_CACHE_BYTES: u64 = MAX_MATCH_ENGINE_CACHE_BYTES;
/// Serializes the engine into the blob that is written to the cache file.
fn to_bytes(&self) -> Result<Vec<u8>, Self::WireError>;
/// Rebuilds an engine from a blob read from the cache file. Returning `Err`
/// makes `cached_load_or_compile` delete the file and recompile.
fn from_bytes(bytes: &[u8]) -> Result<Self, Self::WireError>;
}
/// Returns the cache file location `<cache_dir>/<cache_key>.bin`, creating
/// `cache_dir` (and any missing parents) first.
///
/// Returns `None` when `cache_dir` cannot be created or already exists as
/// something other than a directory, so callers skip caching instead of
/// attempting file I/O that cannot succeed.
///
/// `cache_key` is interpolated into the file name verbatim; callers are
/// expected to pass a single path component.
pub fn cache_path(cache_dir: &Path, cache_key: &str) -> Option<PathBuf> {
    // `create_dir_all` already succeeds for an existing directory, so a
    // separate `exists()` probe is unnecessary. The probe also accepted regular
    // files and left a window between the check and the creation.
    std::fs::create_dir_all(cache_dir).ok()?;
    Some(cache_dir.join(format!("{cache_key}.bin")))
}
/// Derives a temp-file path next to `path` by swapping its extension for
/// `tmp.<pid>.<sequence>`.
///
/// The process id separates concurrent processes; the process-wide sequence
/// counter separates successive calls inside one process.
fn cache_tmp_path(path: &Path) -> PathBuf {
    let sequence = CACHE_TMP_SEQUENCE.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let suffix = format!("tmp.{}.{}", pid, sequence);
    path.with_extension(suffix)
}
/// Reads the file at `path` into memory, refusing files larger than
/// `max_bytes`.
///
/// # Returns
/// * `Ok(Some(bytes))` – the whole file, whose length is at most `max_bytes`.
/// * `Ok(None)` – the file holds more than `max_bytes` bytes.
///
/// # Errors
/// Any I/O error from opening or reading the file. Interrupted reads are
/// retried by `read_to_end` rather than reported.
fn read_match_engine_cache_bounded(
    path: &Path,
    max_bytes: u64,
) -> std::io::Result<Option<Vec<u8>>> {
    use std::io::Read;

    let file = std::fs::File::open(path)?;
    let mut bytes = Vec::new();
    // Allow exactly one byte past the limit: that is enough to tell "too big"
    // apart from "exactly at the limit" without reading the rest of the file.
    file.take(max_bytes.saturating_add(1))
        .read_to_end(&mut bytes)?;
    let within_limit = u64::try_from(bytes.len()).map_or(false, |len| len <= max_bytes);
    Ok(within_limit.then_some(bytes))
}
/// Best-effort deletion of the cache file at `path`.
///
/// A failed removal is not propagated; it is reported as a `debug` event that
/// carries the path, the I/O error and the caller-supplied `message`.
fn remove_match_engine_cache(path: &Path, message: &'static str) {
    let Err(error) = std::fs::remove_file(path) else {
        return;
    };
    tracing::debug!(
        path = %path.display(),
        error = %error,
        "{}",
        message
    );
}
/// Loads an engine of type `E` from `<cache_dir>/<cache_key>.bin`, or builds it
/// with `compile` and stores the result for the next call.
///
/// Caching is strictly best-effort: every cache failure is logged at `debug`
/// level and the function still returns a usable engine.
///
/// * `cache_dir` – directory holding cache files; created when missing. When it
///   cannot be used, `compile` runs and nothing is written.
/// * `cache_key` – file stem of the cache entry.
/// * `compile` – builds the engine on a cache miss; invoked at most once.
///
/// A cache file larger than `E::MAX_CACHE_BYTES`, or one that `E::from_bytes`
/// rejects, is deleted and replaced by a freshly compiled engine.
pub fn cached_load_or_compile<E, F>(cache_dir: &Path, cache_key: &str, compile: F) -> E
where
    E: MatchEngineCache,
    F: FnOnce() -> E,
{
    let Some(path) = cache_path(cache_dir, cache_key) else {
        return compile();
    };
    match read_match_engine_cache_bounded(&path, E::MAX_CACHE_BYTES) {
        Ok(Some(bytes)) => match E::from_bytes(&bytes) {
            Ok(engine) => return engine,
            Err(error) => {
                // Record why the blob was rejected before discarding it.
                tracing::debug!(
                    path = %path.display(),
                    error = %error,
                    "failed to decode matching cache"
                );
                remove_match_engine_cache(&path, "failed to remove corrupt matching cache");
            }
        },
        Ok(None) => {
            remove_match_engine_cache(&path, "failed to remove oversized matching cache");
        }
        Err(error) => {
            tracing::debug!(
                path = %path.display(),
                error = %error,
                "failed to read matching cache"
            );
        }
    }
    let engine = compile();
    match engine.to_bytes() {
        Ok(bytes) => store_match_engine_cache(&path, &bytes, E::MAX_CACHE_BYTES),
        Err(error) => {
            tracing::debug!(
                path = %path.display(),
                error = %error,
                "failed to serialize matching cache"
            );
        }
    }
    engine
}

/// Publishes `bytes` as the cache entry at `path`: writes a uniquely named temp
/// file next to it, then renames that file onto `path`, so the final name never
/// refers to a partially written blob. Failures are logged, never propagated.
fn store_match_engine_cache(path: &Path, bytes: &[u8], max_bytes: u64) {
    // A blob above the read limit would be deleted by the next load, forcing a
    // recompile and rewrite on every call; do not write it at all.
    let fits = u64::try_from(bytes.len()).map_or(false, |len| len <= max_bytes);
    if !fits {
        tracing::debug!(
            path = %path.display(),
            size = bytes.len(),
            limit = max_bytes,
            "skipping oversized matching cache"
        );
        return;
    }

    let tmp = cache_tmp_path(path);
    let outcome = match std::fs::write(&tmp, bytes) {
        Ok(()) => std::fs::rename(&tmp, path)
            .map_err(|error| (error, "failed to publish matching cache")),
        Err(error) => Err((error, "failed to write matching cache temp file")),
    };
    let Err((error, message)) = outcome else {
        return;
    };
    tracing::debug!(
        path = %path.display(),
        tmp = %tmp.display(),
        error = %error,
        "{}",
        message
    );

    // Both failure modes can leave the temp file behind (a partial write, or a
    // complete file that could not be renamed); remove it so failures do not
    // accumulate stray files in the cache directory.
    if let Err(cleanup_error) = std::fs::remove_file(&tmp) {
        if cleanup_error.kind() != std::io::ErrorKind::NotFound {
            tracing::debug!(
                tmp = %tmp.display(),
                error = %cleanup_error,
                "failed to remove matching cache temp file"
            );
        }
    }
}
use crate::scan::literal_set::{GpuLiteralSet, LiteralSetWireError};
/// [`MatchScan`] for [`GpuLiteralSet`].
///
/// NOTE(review): the `GpuLiteralSet::…` path calls are presumably bound to the
/// type's inherent methods of the same name — confirm in `literal_set`.
impl MatchScan for GpuLiteralSet {
    fn scan(
        &self,
        backend: &dyn VyreBackend,
        haystack: &[u8],
        max_matches: u32,
    ) -> Result<Vec<Match>, vyre::BackendError> {
        GpuLiteralSet::scan(self, backend, haystack, max_matches)
    }

    fn reference_scan(&self, haystack: &[u8]) -> Vec<Match> {
        GpuLiteralSet::reference_scan(self, haystack)
    }

    /// `lit-` followed by the 16-digit lowercase hex FNV-1a hash of the pattern
    /// offsets, lengths and bytes, fed to the hasher in that order.
    fn cache_key(&self) -> String {
        let hashed_fields: [&[u32]; 3] = [
            self.pattern_offsets.as_slice(),
            self.pattern_lengths.as_slice(),
            self.pattern_bytes.as_slice(),
        ];
        format!("lit-{:016x}", fnv1a64_word_slices(hashed_fields))
    }
}
/// Makes `GpuLiteralSet` storable through `cached_load_or_compile`.
///
/// NOTE(review): both methods forward to `GpuLiteralSet::to_bytes` /
/// `GpuLiteralSet::from_bytes`, presumably the type's inherent methods —
/// confirm in `literal_set`.
impl MatchEngineCache for GpuLiteralSet {
type WireError = LiteralSetWireError;
// Wire-format tag and version advertised for this type; neither is read by
// the cache helpers visible in this file.
const WIRE_MAGIC: [u8; 4] = *b"VLIT";
const WIRE_VERSION: u32 = 3;
fn to_bytes(&self) -> Result<Vec<u8>, Self::WireError> {
GpuLiteralSet::to_bytes(self)
}
fn from_bytes(bytes: &[u8]) -> Result<Self, Self::WireError> {
GpuLiteralSet::from_bytes(bytes)
}
}
/// [`MatchScan`] glue for `DirectGpuScanner`; compiled only with the
/// `matching-dfa` feature.
#[cfg(feature = "matching-dfa")]
mod direct_gpu_impls {
    use super::*;
    use crate::scan::direct_gpu::DirectGpuScanner;

    // NOTE(review): the `DirectGpuScanner::…` path calls are presumably bound to
    // the scanner's inherent methods of the same name — confirm in `direct_gpu`.
    impl MatchScan for DirectGpuScanner {
        fn scan(
            &self,
            backend: &dyn VyreBackend,
            haystack: &[u8],
            max_matches: u32,
        ) -> Result<Vec<Match>, vyre::BackendError> {
            DirectGpuScanner::scan(self, backend, haystack, max_matches)
        }

        fn reference_scan(&self, haystack: &[u8]) -> Vec<Match> {
            DirectGpuScanner::reference_scan(self, haystack)
        }

        /// `direct-gpu-` followed by whatever `literal_set_cache_key` returns.
        fn cache_key(&self) -> String {
            let literal_key = self.literal_set_cache_key();
            format!("direct-gpu-{literal_key}")
        }
    }
}
/// [`MatchScan`] and [`MatchEngineCache`] glue for `RulePipeline`; compiled only
/// with the `matching-nfa` feature.
#[cfg(feature = "matching-nfa")]
mod rule_pipeline_impls {
    use super::*;
    use crate::scan::mega_scan::{PipelineWireError, RulePipeline};

    // NOTE(review): the `RulePipeline::…` path calls in both impls are presumably
    // bound to the pipeline's inherent methods — confirm in `mega_scan`.
    impl MatchScan for RulePipeline {
        fn scan(
            &self,
            backend: &dyn VyreBackend,
            haystack: &[u8],
            max_matches: u32,
        ) -> Result<Vec<Match>, vyre::BackendError> {
            RulePipeline::scan(self, backend, haystack, max_matches)
        }

        fn reference_scan(&self, haystack: &[u8]) -> Vec<Match> {
            RulePipeline::reference_scan(self, haystack)
        }

        /// `pipe-` followed by the 16-digit lowercase hex FNV-1a hash of the plan
        /// header (`num_states`, `input_len`), the transition table and the
        /// epsilon table, fed to the hasher in that order.
        fn cache_key(&self) -> String {
            let plan_header = [self.plan.num_states, self.plan.input_len];
            let hashed_fields: [&[u32]; 3] = [
                plan_header.as_slice(),
                self.transition_table.as_slice(),
                self.epsilon_table.as_slice(),
            ];
            format!("pipe-{:016x}", fnv1a64_word_slices(hashed_fields))
        }
    }

    impl MatchEngineCache for RulePipeline {
        type WireError = PipelineWireError;

        const WIRE_MAGIC: [u8; 4] = *b"VRPL";
        const WIRE_VERSION: u32 = 4;

        fn to_bytes(&self) -> Result<Vec<u8>, Self::WireError> {
            RulePipeline::to_bytes(self)
        }

        fn from_bytes(bytes: &[u8]) -> Result<Self, Self::WireError> {
            RulePipeline::from_bytes(bytes)
        }
    }
}
/// Hashes the given word slices as one continuous byte stream with FNV-1a.
///
/// Slices are consumed in array order and every `u32` contributes its four
/// little-endian bytes. No separator is mixed in between slices, so the result
/// equals hashing the concatenation of all words.
fn fnv1a64_word_slices<const N: usize>(slices: [&[u32]; N]) -> u64 {
    slices
        .iter()
        .flat_map(|words| words.iter())
        .flat_map(|word| word.to_le_bytes())
        .fold(fnv1a64_initial_state(), |state, byte| {
            fnv1a64_update_byte(state, byte)
        })
}
// Unit tests for cache-key derivation, the streaming word hash and the on-disk
// cache helper.
#[cfg(test)]
mod tests {
use super::*;
use crate::scan::literal_set::GpuLiteralSet;
// Changing one pattern (here: appending a byte) must change the cache key.
#[test]
fn cache_key_changes_when_patterns_change() {
let a = GpuLiteralSet::compile(&[b"AKIA".as_slice(), b"ghp_".as_slice()]);
let b = GpuLiteralSet::compile(&[b"AKIA".as_slice(), b"ghp__".as_slice()]);
assert_ne!(MatchScan::cache_key(&a), MatchScan::cache_key(&b));
}
// Two engines compiled from identical patterns must share a cache key.
#[test]
fn cache_key_stable_for_same_patterns() {
let a = GpuLiteralSet::compile(&[b"AKIA".as_slice(), b"ghp_".as_slice()]);
let b = GpuLiteralSet::compile(&[b"AKIA".as_slice(), b"ghp_".as_slice()]);
assert_eq!(MatchScan::cache_key(&a), MatchScan::cache_key(&b));
}
// The streaming hash over several word slices must equal the one-shot hash of
// the same words laid out as little-endian bytes in a single buffer.
#[test]
fn streaming_word_hash_matches_allocated_little_endian_bytes() {
let words_a = [0x0102_0304_u32, 0xAABB_CCDD];
let words_b = [0x1122_3344_u32];
let mut bytes = Vec::new();
for &word in words_a.iter().chain(words_b.iter()) {
bytes.extend_from_slice(&word.to_le_bytes());
}
assert_eq!(
fnv1a64_word_slices([words_a.as_slice(), words_b.as_slice()]),
vyre_primitives::hash::fnv1a::fnv1a64(&bytes)
);
}
// First call compiles and stores the engine; the second call with the same
// directory and key must be served from disk without invoking `compile`.
#[test]
fn cached_helper_round_trips_via_disk() {
let dir = tempfile::tempdir().unwrap();
let key = "test-engine";
let mut compiles = 0;
let _engine: GpuLiteralSet = cached_load_or_compile(dir.path(), key, || {
compiles += 1;
GpuLiteralSet::compile(&[b"AKIA".as_slice()])
});
assert_eq!(compiles, 1);
let mut second_compiles = 0;
let _engine2: GpuLiteralSet = cached_load_or_compile(dir.path(), key, || {
second_compiles += 1;
GpuLiteralSet::compile(&[b"AKIA".as_slice()])
});
assert_eq!(second_compiles, 0);
}
// Two temp paths derived from the same target must differ and must stay in
// the target's directory.
#[test]
fn cache_tmp_paths_do_not_collide_within_process() {
let dir = tempfile::tempdir().unwrap();
let path = cache_path(dir.path(), "same-key").unwrap();
let first = cache_tmp_path(&path);
let second = cache_tmp_path(&path);
assert_ne!(
first, second,
"Fix: concurrent cache writers in one process need distinct temp files."
);
assert_eq!(first.parent(), path.parent());
assert_eq!(second.parent(), path.parent());
}
// A cache file that does not decode must fall back to `compile`.
#[test]
fn cached_helper_recompiles_on_corrupt_blob() {
let dir = tempfile::tempdir().unwrap();
let key = "test-corrupt";
std::fs::write(dir.path().join(format!("{key}.bin")), b"not a real blob").unwrap();
let mut compiles = 0;
let _engine: GpuLiteralSet = cached_load_or_compile(dir.path(), key, || {
compiles += 1;
GpuLiteralSet::compile(&[b"AKIA".as_slice()])
});
assert_eq!(compiles, 1);
}
}