vyre-conform 0.1.0

Conformance suite for vyre backends — proves byte-identical output to CPU reference
Documentation
//! Scatter engine conformance specification.
//!
//! CPU reference: for each match, set the corresponding rule bitmap bit.
//! Input: matches + pattern-to-rule mapping. Output: per-rule bitmaps.

use crate::spec::{EngineInvariant, EngineSpec};

/// Describe the scatter engine for the conformance harness.
///
/// The returned spec carries the id `"engine.scatter"`, lists the
/// `NoOutputLost` and `NoDuplicateOutput` invariants, and registers
/// [`cpu_fn`] as the CPU reference implementation.
#[inline]
pub fn spec() -> EngineSpec {
    let invariants = vec![
        EngineInvariant::NoOutputLost,
        EngineInvariant::NoDuplicateOutput,
    ];

    EngineSpec {
        id: "engine.scatter",
        description: "Match-to-rule bitmap scatter engine.",
        invariants,
        cpu_fn: Some(cpu_fn),
    }
}

/// CPU reference implementation of the scatter engine.
///
/// # Input layout
///
/// Every field is a little-endian `u32`; sections appear in this order:
///
/// 1. `rule_count`
/// 2. `max_strings`
/// 3. `match_count`, then `match_count` triples `(pattern_id, start, end)`
/// 4. `pattern_to_rules_count`, then that many pairs `(start, len)`
/// 5. `rule_list_count`, then that many rule ids
/// 6. `string_local_ids_count`, then that many string ids
///
/// # Output
///
/// `rule_count * words_per_rule` bitmap words, each serialized as a
/// little-endian `u32`, where `words_per_rule = (max_strings + 31) / 32`.
///
/// If the input cannot be decoded, the four bytes `FF FF FF FF` are returned
/// instead. That sentinel is byte-identical to a valid one-word bitmap with
/// all 32 bits set, so the two cannot be told apart from the output alone.
#[inline]
pub fn cpu_fn(input: &[u8]) -> Vec<u8> {
    let Ok(words) = scatter_from_bytes(input) else {
        return vec![0xFF; 4];
    };

    let mut bytes = Vec::with_capacity(words.len() * 4);
    for word in words {
        bytes.extend_from_slice(&word.to_le_bytes());
    }
    bytes
}

/// One decoded match record.
///
/// Only `pattern_id` drives the scatter. The two span fields are decoded so
/// the cursor advances past them and are otherwise unused (hence the `_`
/// prefix).
#[derive(Debug, Clone, Copy)]
struct MatchRow {
    pattern_id: u32,
    _start: u32,
    _end: u32,
}

/// Decode a serialized scatter request and compute the per-rule bitmaps.
///
/// The layout of `input` is the one documented on `cpu_fn`: a sequence of
/// little-endian `u32` values holding `rule_count`, `max_strings`, and then
/// four length-prefixed tables (matches, pattern-to-rules, rule list, string
/// local ids).
///
/// For every match, the `(start, len)` entry of its pattern selects a window
/// of slots; each slot indexes both `rule_list` and `string_local_ids`, and
/// the bit `string_id` is set in the bitmap of `rule_id`.
///
/// Lookup rules (all inherited from the original reference behaviour):
/// - a `pattern_id` outside `pattern_to_rules` contributes nothing;
/// - a slot outside `rule_list` / `string_local_ids` reads as `0`;
/// - pairs with `rule_id >= rule_count` or `string_id >= max_strings` are
///   skipped.
///
/// # Errors
///
/// Returns `Err` when the input ends before a declared field, when the bitmap
/// size does not fit in `usize`, or when the bitmap cannot be allocated.
fn scatter_from_bytes(input: &[u8]) -> Result<Vec<u32>, String> {
    let mut offset = 0usize;
    let rule_count = read_u32(input, &mut offset)? as usize;
    let max_strings = read_u32(input, &mut offset)? as usize;
    let words_per_rule = max_strings.div_ceil(32);

    // Table lengths come straight from the (untrusted) input, so every
    // pre-allocation is clamped to what the remaining bytes could hold.
    let match_count = read_u32(input, &mut offset)? as usize;
    let mut matches = Vec::with_capacity(bounded_capacity(input, offset, match_count, 12));
    for _ in 0..match_count {
        matches.push(MatchRow {
            pattern_id: read_u32(input, &mut offset)?,
            _start: read_u32(input, &mut offset)?,
            _end: read_u32(input, &mut offset)?,
        });
    }

    let pattern_to_rules_count = read_u32(input, &mut offset)? as usize;
    let mut pattern_to_rules =
        Vec::with_capacity(bounded_capacity(input, offset, pattern_to_rules_count, 8));
    for _ in 0..pattern_to_rules_count {
        pattern_to_rules.push((read_u32(input, &mut offset)?, read_u32(input, &mut offset)?));
    }

    let rule_list_count = read_u32(input, &mut offset)? as usize;
    let mut rule_list = Vec::with_capacity(bounded_capacity(input, offset, rule_list_count, 4));
    for _ in 0..rule_list_count {
        rule_list.push(read_u32(input, &mut offset)?);
    }

    let string_local_ids_count = read_u32(input, &mut offset)? as usize;
    let mut string_local_ids =
        Vec::with_capacity(bounded_capacity(input, offset, string_local_ids_count, 4));
    for _ in 0..string_local_ids_count {
        string_local_ids.push(read_u32(input, &mut offset)?);
    }

    // Report an impossible bitmap size as an error instead of overflowing the
    // multiplication or aborting the process on allocation failure.
    let total_words = rule_count
        .checked_mul(words_per_rule)
        .ok_or("bitmap size overflows usize")?;
    let mut bitmaps: Vec<u32> = Vec::new();
    bitmaps
        .try_reserve_exact(total_words)
        .map_err(|_| "bitmap allocation failed")?;
    bitmaps.resize(total_words, 0);

    for row in &matches {
        let (list_start, list_len) = pattern_to_rules
            .get(row.pattern_id as usize)
            .copied()
            .unwrap_or((0, 0));
        for i in 0..list_len {
            // Explicit 32-bit wrap-around: a plain `+` panics when overflow
            // checks are enabled and wraps otherwise, which would make the
            // reference output depend on the build profile.
            let slot = list_start.wrapping_add(i) as usize;
            let rule_id = rule_list.get(slot).copied().unwrap_or(0) as usize;
            let string_id = string_local_ids.get(slot).copied().unwrap_or(0) as usize;
            if rule_id >= rule_count || string_id >= max_strings {
                continue;
            }
            // In bounds: rule_id < rule_count and string_id / 32 < words_per_rule.
            bitmaps[rule_id * words_per_rule + string_id / 32] |= 1u32 << (string_id % 32);
        }
    }

    Ok(bitmaps)
}

/// Largest pre-allocation worth making for `count` elements of `elem_bytes`
/// bytes each, given that only `input[offset..]` is left to decode.
fn bounded_capacity(input: &[u8], offset: usize, count: usize, elem_bytes: usize) -> usize {
    count.min(input.len().saturating_sub(offset) / elem_bytes)
}

/// Read one little-endian `u32` at `*offset` and advance the cursor by four.
///
/// On failure the cursor is left unchanged and an "unexpected EOF" error is
/// returned.
fn read_u32(input: &[u8], offset: &mut usize) -> Result<u32, String> {
    let end = offset
        .checked_add(4)
        .ok_or("unexpected EOF reading u32")?;
    let bytes = input
        .get(*offset..end)
        .ok_or("unexpected EOF reading u32")?;
    *offset = end;
    // `bytes` is exactly four bytes long because the range above is `offset..offset + 4`.
    Ok(u32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}

// ── Test Helpers ─────────────────────────────────────────────────

/// Serialize a scatter request into the byte layout consumed by `cpu_fn`.
///
/// The values are first gathered as a flat list of `u32` words — the two
/// header fields, then each table prefixed by its element count — and then
/// emitted in little-endian byte order.
#[cfg(test)]
#[inline]
pub(crate) fn build_scatter_input(
    rule_count: u32,
    max_strings: u32,
    matches: &[(u32, u32, u32)],
    pattern_to_rules: &[(u32, u32)],
    rule_list: &[u32],
    string_local_ids: &[u32],
) -> Vec<u8> {
    let mut words: Vec<u32> = vec![rule_count, max_strings];

    // Matches: count, then (pattern_id, start, end) triples.
    words.push(matches.len() as u32);
    for &(pattern_id, span_start, span_end) in matches {
        words.extend([pattern_id, span_start, span_end]);
    }

    // Pattern-to-rules: count, then (start, len) pairs.
    words.push(pattern_to_rules.len() as u32);
    for &(list_start, list_len) in pattern_to_rules {
        words.extend([list_start, list_len]);
    }

    // Rule list and string-local ids: count, then the raw values.
    words.push(rule_list.len() as u32);
    words.extend(rule_list.iter().copied());
    words.push(string_local_ids.len() as u32);
    words.extend(string_local_ids.iter().copied());

    words.iter().flat_map(|word| word.to_le_bytes()).collect()
}

#[cfg(test)]
pub(crate) mod tests {
    use super::{build_scatter_input, cpu_fn, spec};
    use crate::spec::EngineInvariant;

    /// Decode the little-endian byte stream produced by `cpu_fn` into words.
    fn decode_words(bytes: &[u8]) -> Vec<u32> {
        bytes
            .chunks_exact(4)
            .map(|chunk| u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
            .collect()
    }

    #[test]
    fn spec_has_correct_invariants() {
        let engine = spec();
        assert_eq!(engine.id, "engine.scatter");
        assert!(engine.invariants.contains(&EngineInvariant::NoOutputLost));
        assert!(engine
            .invariants
            .contains(&EngineInvariant::NoDuplicateOutput));
    }

    #[test]
    fn cpu_fn_is_deterministic() {
        let input = build_scatter_input(1, 8, &[(0, 0, 1)], &[(0, 1)], &[0], &[0]);
        let first = cpu_fn(&input);
        let second = cpu_fn(&input);
        assert_eq!(first, second);
    }

    #[test]
    fn cpu_fn_no_output_lost() {
        // A single match on pattern 0, which maps to (rule 0, string 3).
        let input = build_scatter_input(1, 8, &[(0, 10, 12)], &[(0, 1)], &[0], &[3]);
        let out = cpu_fn(&input);
        // 1 rule * 1 word per rule = 4 bytes.
        assert_eq!(out.len(), 4);
        let words = decode_words(&out);
        assert!(words[0] & (1 << 3) != 0);
    }

    #[test]
    fn cpu_fn_no_duplicate_output_idempotent() {
        // Two runs over the same input must yield the same bitmap.
        let input = build_scatter_input(
            2,
            64,
            &[(0, 0, 1), (1, 1, 2)],
            &[(0, 1), (1, 1)],
            &[0, 1],
            &[5, 10],
        );
        let first = cpu_fn(&input);
        let second = cpu_fn(&input);
        assert_eq!(first, second);

        // With max_strings = 64 each rule owns 2 words: rule 0 -> [0..2],
        // rule 1 -> [2..4]. String 5 and string 10 both land in word 0 of
        // their rule.
        let words = decode_words(&first);
        assert!(words[0] & (1 << 5) != 0, "rule 0 string 5 missing");
        assert!(words[2] & (1 << 10) != 0, "rule 1 string 10 missing");
    }

    #[test]
    fn cpu_fn_multiple_strings_across_words() {
        // max_strings = 40 -> 2 words per rule.
        let input = build_scatter_input(1, 40, &[(0, 0, 1)], &[(0, 2)], &[0, 0], &[0, 35]);
        let out = cpu_fn(&input);
        // 1 rule * 2 words = 8 bytes.
        assert_eq!(out.len(), 8);
        let words = decode_words(&out);
        // String 0 is bit 0 of word 0; string 35 is bit 3 of word 1.
        assert!(words[0] & 1 != 0);
        assert!(words[1] & (1 << 3) != 0);
    }

    /// Canonical small input shared with other test code in the crate:
    /// one match on pattern 0, which maps to (rule 0, string 3).
    #[inline]
    pub(crate) fn build_scatter_input_for_invariants() -> Vec<u8> {
        build_scatter_input(1, 8, &[(0, 10, 12)], &[(0, 1)], &[0], &[3])
    }
}