vyre-conform 0.1.0

Conformance suite for vyre backends — proves byte-identical output to CPU reference
Documentation
//! DFA engine conformance specification.
//!
//! CPU reference: deterministic finite automaton scanner that finds
//! non-overlapping leftmost matches in byte-level input.

use crate::spec::match_ops::dfa_scan::{scan_dfa_cpu, DfaSpec};
use crate::spec::EngineInvariant;
use crate::spec::EngineSpec;

const BYTE_CLASSES: usize = 256;

/// Build the DFA engine conformance specification.
#[inline]
pub fn spec() -> EngineSpec {
    EngineSpec {
        id: "engine.dfa",
        description: "Deterministic finite automaton pattern scanner.",
        invariants: vec![
            EngineInvariant::Deterministic,
            EngineInvariant::NoOutputLost,
            EngineInvariant::OutputOrdered,
        ],
        cpu_fn: Some(cpu_fn),
    }
}

/// CPU reference for the DFA engine.
///
/// Input serialization:
/// - [state_count: u32le]
/// - [accept_state_count: u32le]
/// - [pattern_length_count: u32le]
/// - [input_len: u32le]
/// - [transitions: state_count * 256 * u32le]
/// - [accept_states: accept_state_count * (state_id: u32le, pattern_id: u32le)]
/// - [pattern_lengths: pattern_length_count * u32le]
/// - [input_bytes: input_len * u8]
///
/// Output: each match as 12 bytes (pattern_id, start, end as u32le).
#[inline]
pub fn cpu_fn(input: &[u8]) -> Vec<u8> {
    let (spec, file_bytes) = match deserialize(input) {
        Ok(v) => v,
        Err(_) => return vec![0xFF; 4],
    };
    match scan_dfa_cpu(&spec, file_bytes) {
        Ok(matches) => matches
            .into_iter()
            .flat_map(|m| {
                [
                    m.pattern_id.to_le_bytes(),
                    m.start.to_le_bytes(),
                    m.end.to_le_bytes(),
                ]
                .concat()
            })
            .collect(),
        Err(_) => vec![0xFF; 4],
    }
}

fn deserialize(input: &[u8]) -> Result<(DfaSpec<'_>, &[u8]), String> {
    if input.len() < 16 {
        return Err("input too short for DFA header".to_string());
    }
    let state_count = read_u32(input, 0)? as usize;
    let accept_count = read_u32(input, 4)? as usize;
    let pattern_len_count = read_u32(input, 8)? as usize;
    let input_len = read_u32(input, 12)? as usize;

    let transitions_size = state_count
        .checked_mul(BYTE_CLASSES)
        .and_then(|n| n.checked_mul(4))
        .ok_or("transition table size overflow")?;
    let accept_size = accept_count
        .checked_mul(8)
        .ok_or("accept states size overflow")?;
    let pattern_len_size = pattern_len_count
        .checked_mul(4)
        .ok_or("pattern lengths size overflow")?;

    let header_size = 16usize;
    let data_end = header_size
        .checked_add(transitions_size)
        .and_then(|n| n.checked_add(accept_size))
        .and_then(|n| n.checked_add(pattern_len_size))
        .and_then(|n| n.checked_add(input_len))
        .ok_or("total size overflow")?;

    if input.len() < data_end {
        return Err("input truncated".to_string());
    }

    let transitions = bytemuck_u32_slice(&input[header_size..header_size + transitions_size])?;
    let accept_start = header_size + transitions_size;
    let accept_states = bytemuck_u32_pairs(&input[accept_start..accept_start + accept_size])?;
    let plen_start = accept_start + accept_size;
    let pattern_lengths = bytemuck_u32_slice(&input[plen_start..plen_start + pattern_len_size])?;
    let file_bytes = &input[plen_start + pattern_len_size..data_end];

    Ok((
        DfaSpec {
            transitions,
            state_count,
            accept_states,
            pattern_lengths,
        },
        file_bytes,
    ))
}

fn read_u32(bytes: &[u8], offset: usize) -> Result<u32, String> {
    let chunk = bytes
        .get(offset..offset + 4)
        .ok_or("read_u32 out of bounds")?;
    Ok(u32::from_le_bytes(chunk.try_into().unwrap()))
}

fn bytemuck_u32_slice(bytes: &[u8]) -> Result<&[u32], String> {
    if bytes.len() % 4 != 0 {
        return Err("unaligned u32 slice".to_string());
    }
    let count = bytes.len() / 4;
    let mut out = Vec::with_capacity(count);
    for chunk in bytes.chunks_exact(4) {
        out.push(u32::from_le_bytes(chunk.try_into().unwrap()));
    }
    Ok(vec_to_leaked_slice(out))
}

fn bytemuck_u32_pairs(bytes: &[u8]) -> Result<&[(u32, u32)], String> {
    if bytes.len() % 8 != 0 {
        return Err("unaligned pair slice".to_string());
    }
    let count = bytes.len() / 8;
    let mut out = Vec::with_capacity(count);
    for chunk in bytes.chunks_exact(8) {
        out.push((
            u32::from_le_bytes(chunk[0..4].try_into().unwrap()),
            u32::from_le_bytes(chunk[4..8].try_into().unwrap()),
        ));
    }
    Ok(vec_to_leaked_slice(out))
}

fn vec_to_leaked_slice<T>(v: Vec<T>) -> &'static [T] {
    let boxed = v.into_boxed_slice();
    Box::leak(boxed)
}

#[cfg(test)]
pub(crate) mod tests {
    use super::{cpu_fn, spec};
    use crate::spec::EngineInvariant;

    #[test]
    fn spec_has_correct_invariants() {
        let s = spec();
        assert_eq!(s.id, "engine.dfa");
        assert!(s.invariants.contains(&EngineInvariant::Deterministic));
        assert!(s.invariants.contains(&EngineInvariant::NoOutputLost));
        assert!(s.invariants.contains(&EngineInvariant::OutputOrdered));
    }

    #[test]
    fn cpu_fn_is_deterministic_on_hand_crafted_input() {
        let input = build_dfa_input(
            2,
            &[(1, 0)], // state 1 accepts pattern 0
            &[1],      // pattern 0 has length 1
            b"aba",
        );
        let out1 = cpu_fn(&input);
        let out2 = cpu_fn(&input);
        assert_eq!(out1, out2);
    }

    #[test]
    fn cpu_fn_output_is_ordered() {
        // DFA for "ab" (pattern 0) and "cd" (pattern 1)
        let mut transitions = vec![0u32; 5 * 256];
        transitions[b'a' as usize] = 1;
        transitions[256 + b'b' as usize] = 2;
        transitions[b'c' as usize] = 3;
        transitions[3 * 256 + b'd' as usize] = 4;
        let input =
            build_dfa_input_from_table(&transitions, 5, &[(2, 0), (4, 1)], &[2, 2], b"xxabxxcdxx");
        let out = cpu_fn(&input);
        // Two matches -> 24 bytes
        assert_eq!(out.len(), 24);
        let starts: Vec<u32> = out
            .chunks_exact(12)
            .map(|c| u32::from_le_bytes([c[4], c[5], c[6], c[7]]))
            .collect();
        assert_eq!(starts, vec![2, 6]);
        assert!(starts.windows(2).all(|w| w[0] <= w[1]));
    }

    #[test]
    fn cpu_fn_no_output_lost() {
        // DFA that accepts every 'a' as pattern 0
        let mut transitions = vec![0u32; 2 * 256];
        transitions[b'a' as usize] = 1;
        transitions[256 + b'a' as usize] = 1; // loop back on 'a'
        let input = build_dfa_input_from_table(&transitions, 2, &[(1, 0)], &[1], b"aaa");
        let out = cpu_fn(&input);
        // Non-overlapping: first 'a' at 0..1, then remaining "aa" -> next 'a' at 1..2, then 2..3
        assert_eq!(out.len() % 12, 0);
        let count = out.len() / 12;
        assert_eq!(count, 3);
    }

    fn build_dfa_input(
        state_count: usize,
        accept_states: &[(u32, u32)],
        pattern_lengths: &[u32],
        file_bytes: &[u8],
    ) -> Vec<u8> {
        let transitions = vec![0u32; state_count * 256];
        build_dfa_input_from_table(
            &transitions,
            state_count,
            accept_states,
            pattern_lengths,
            file_bytes,
        )
    }

    fn build_dfa_input_from_table(
        transitions: &[u32],
        state_count: usize,
        accept_states: &[(u32, u32)],
        pattern_lengths: &[u32],
        file_bytes: &[u8],
    ) -> Vec<u8> {
        let mut out = Vec::new();
        out.extend_from_slice(&(state_count as u32).to_le_bytes());
        out.extend_from_slice(&(accept_states.len() as u32).to_le_bytes());
        out.extend_from_slice(&(pattern_lengths.len() as u32).to_le_bytes());
        out.extend_from_slice(&(file_bytes.len() as u32).to_le_bytes());
        for &t in transitions {
            out.extend_from_slice(&t.to_le_bytes());
        }
        for &(s, p) in accept_states {
            out.extend_from_slice(&s.to_le_bytes());
            out.extend_from_slice(&p.to_le_bytes());
        }
        for &l in pattern_lengths {
            out.extend_from_slice(&l.to_le_bytes());
        }
        out.extend_from_slice(file_bytes);
        out
    }

    #[inline]
    pub(crate) fn build_dfa_input_for_invariants() -> Vec<u8> {
        // DFA for "ab" (pattern 0) and "cd" (pattern 1)
        let mut transitions = vec![0u32; 5 * 256];
        transitions[b'a' as usize] = 1;
        transitions[256 + b'b' as usize] = 2;
        transitions[b'c' as usize] = 3;
        transitions[3 * 256 + b'd' as usize] = 4;
        build_dfa_input_from_table(&transitions, 5, &[(2, 0), (4, 1)], &[2, 2], b"xxabxxcdxx")
    }
}