use crate::spec::match_ops::dfa_scan::{scan_dfa_cpu, DfaSpec};
use crate::spec::EngineInvariant;
use crate::spec::EngineSpec;
const BYTE_CLASSES: usize = 256;
#[inline]
pub fn spec() -> EngineSpec {
EngineSpec {
id: "engine.dfa",
description: "Deterministic finite automaton pattern scanner.",
invariants: vec![
EngineInvariant::Deterministic,
EngineInvariant::NoOutputLost,
EngineInvariant::OutputOrdered,
],
cpu_fn: Some(cpu_fn),
}
}
#[inline]
pub fn cpu_fn(input: &[u8]) -> Vec<u8> {
let (spec, file_bytes) = match deserialize(input) {
Ok(v) => v,
Err(_) => return vec![0xFF; 4],
};
match scan_dfa_cpu(&spec, file_bytes) {
Ok(matches) => matches
.into_iter()
.flat_map(|m| {
[
m.pattern_id.to_le_bytes(),
m.start.to_le_bytes(),
m.end.to_le_bytes(),
]
.concat()
})
.collect(),
Err(_) => vec![0xFF; 4],
}
}
fn deserialize(input: &[u8]) -> Result<(DfaSpec<'_>, &[u8]), String> {
if input.len() < 16 {
return Err("input too short for DFA header".to_string());
}
let state_count = read_u32(input, 0)? as usize;
let accept_count = read_u32(input, 4)? as usize;
let pattern_len_count = read_u32(input, 8)? as usize;
let input_len = read_u32(input, 12)? as usize;
let transitions_size = state_count
.checked_mul(BYTE_CLASSES)
.and_then(|n| n.checked_mul(4))
.ok_or("transition table size overflow")?;
let accept_size = accept_count
.checked_mul(8)
.ok_or("accept states size overflow")?;
let pattern_len_size = pattern_len_count
.checked_mul(4)
.ok_or("pattern lengths size overflow")?;
let header_size = 16usize;
let data_end = header_size
.checked_add(transitions_size)
.and_then(|n| n.checked_add(accept_size))
.and_then(|n| n.checked_add(pattern_len_size))
.and_then(|n| n.checked_add(input_len))
.ok_or("total size overflow")?;
if input.len() < data_end {
return Err("input truncated".to_string());
}
let transitions = bytemuck_u32_slice(&input[header_size..header_size + transitions_size])?;
let accept_start = header_size + transitions_size;
let accept_states = bytemuck_u32_pairs(&input[accept_start..accept_start + accept_size])?;
let plen_start = accept_start + accept_size;
let pattern_lengths = bytemuck_u32_slice(&input[plen_start..plen_start + pattern_len_size])?;
let file_bytes = &input[plen_start + pattern_len_size..data_end];
Ok((
DfaSpec {
transitions,
state_count,
accept_states,
pattern_lengths,
},
file_bytes,
))
}
fn read_u32(bytes: &[u8], offset: usize) -> Result<u32, String> {
let chunk = bytes
.get(offset..offset + 4)
.ok_or("read_u32 out of bounds")?;
Ok(u32::from_le_bytes(chunk.try_into().unwrap()))
}
fn bytemuck_u32_slice(bytes: &[u8]) -> Result<&[u32], String> {
if bytes.len() % 4 != 0 {
return Err("unaligned u32 slice".to_string());
}
let count = bytes.len() / 4;
let mut out = Vec::with_capacity(count);
for chunk in bytes.chunks_exact(4) {
out.push(u32::from_le_bytes(chunk.try_into().unwrap()));
}
Ok(vec_to_leaked_slice(out))
}
fn bytemuck_u32_pairs(bytes: &[u8]) -> Result<&[(u32, u32)], String> {
if bytes.len() % 8 != 0 {
return Err("unaligned pair slice".to_string());
}
let count = bytes.len() / 8;
let mut out = Vec::with_capacity(count);
for chunk in bytes.chunks_exact(8) {
out.push((
u32::from_le_bytes(chunk[0..4].try_into().unwrap()),
u32::from_le_bytes(chunk[4..8].try_into().unwrap()),
));
}
Ok(vec_to_leaked_slice(out))
}
fn vec_to_leaked_slice<T>(v: Vec<T>) -> &'static [T] {
let boxed = v.into_boxed_slice();
Box::leak(boxed)
}
#[cfg(test)]
pub(crate) mod tests {
use super::{cpu_fn, spec};
use crate::spec::EngineInvariant;
#[test]
fn spec_has_correct_invariants() {
let s = spec();
assert_eq!(s.id, "engine.dfa");
assert!(s.invariants.contains(&EngineInvariant::Deterministic));
assert!(s.invariants.contains(&EngineInvariant::NoOutputLost));
assert!(s.invariants.contains(&EngineInvariant::OutputOrdered));
}
#[test]
fn cpu_fn_is_deterministic_on_hand_crafted_input() {
let input = build_dfa_input(
2,
&[(1, 0)], &[1], b"aba",
);
let out1 = cpu_fn(&input);
let out2 = cpu_fn(&input);
assert_eq!(out1, out2);
}
#[test]
fn cpu_fn_output_is_ordered() {
let mut transitions = vec![0u32; 5 * 256];
transitions[b'a' as usize] = 1;
transitions[256 + b'b' as usize] = 2;
transitions[b'c' as usize] = 3;
transitions[3 * 256 + b'd' as usize] = 4;
let input =
build_dfa_input_from_table(&transitions, 5, &[(2, 0), (4, 1)], &[2, 2], b"xxabxxcdxx");
let out = cpu_fn(&input);
assert_eq!(out.len(), 24);
let starts: Vec<u32> = out
.chunks_exact(12)
.map(|c| u32::from_le_bytes([c[4], c[5], c[6], c[7]]))
.collect();
assert_eq!(starts, vec![2, 6]);
assert!(starts.windows(2).all(|w| w[0] <= w[1]));
}
#[test]
fn cpu_fn_no_output_lost() {
let mut transitions = vec![0u32; 2 * 256];
transitions[b'a' as usize] = 1;
transitions[256 + b'a' as usize] = 1; let input = build_dfa_input_from_table(&transitions, 2, &[(1, 0)], &[1], b"aaa");
let out = cpu_fn(&input);
assert_eq!(out.len() % 12, 0);
let count = out.len() / 12;
assert_eq!(count, 3);
}
fn build_dfa_input(
state_count: usize,
accept_states: &[(u32, u32)],
pattern_lengths: &[u32],
file_bytes: &[u8],
) -> Vec<u8> {
let transitions = vec![0u32; state_count * 256];
build_dfa_input_from_table(
&transitions,
state_count,
accept_states,
pattern_lengths,
file_bytes,
)
}
fn build_dfa_input_from_table(
transitions: &[u32],
state_count: usize,
accept_states: &[(u32, u32)],
pattern_lengths: &[u32],
file_bytes: &[u8],
) -> Vec<u8> {
let mut out = Vec::new();
out.extend_from_slice(&(state_count as u32).to_le_bytes());
out.extend_from_slice(&(accept_states.len() as u32).to_le_bytes());
out.extend_from_slice(&(pattern_lengths.len() as u32).to_le_bytes());
out.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
for &t in transitions {
out.extend_from_slice(&t.to_le_bytes());
}
for &(s, p) in accept_states {
out.extend_from_slice(&s.to_le_bytes());
out.extend_from_slice(&p.to_le_bytes());
}
for &l in pattern_lengths {
out.extend_from_slice(&l.to_le_bytes());
}
out.extend_from_slice(file_bytes);
out
}
#[inline]
pub(crate) fn build_dfa_input_for_invariants() -> Vec<u8> {
let mut transitions = vec![0u32; 5 * 256];
transitions[b'a' as usize] = 1;
transitions[256 + b'b' as usize] = 2;
transitions[b'c' as usize] = 3;
transitions[3 * 256 + b'd' as usize] = 4;
build_dfa_input_from_table(&transitions, 5, &[(2, 0), (4, 1)], &[2, 2], b"xxabxxcdxx")
}
}