// vyre-conform 0.1.0
//
// Conformance suite for vyre backends — proves byte-identical output to CPU reference
// Documentation
//! Specification for the `decode.unicode` operation.
use crate::{Convention, DataType, OpSignature, OpSpec};

/// Location-agnostic operation metadata for `decode.unicode`.
///
/// The `id`, `version` and `strictness` values duplicate what [`vyre_op`]
/// hands to the `OpSpec` builder, so the two must be edited together.
pub const VYRE_OP_METADATA: vyre_spec::OpMetadata = vyre_spec::OpMetadata {
    // Same identifier string used by `GOLDEN` and by `vyre_op`.
    id: "decode.unicode",
    layer: vyre_spec::Layer::L2,
    category: vyre_spec::MetadataCategory::A,
    version: 1,
    description: "decode unicode",
    // One byte buffer in, one byte buffer out — matches the `OpSignature`
    // (`inputs: [Bytes]`, `output: Bytes`) declared in `vyre_op`.
    signature: "(Bytes) -> Bytes",
    strictness: "strict",
    archetype_signature: "(Bytes) -> Bytes",
};

/// Golden samples for this op.
///
/// `decode.unicode` scans the input bytes for the backslash escapes `\xHH`
/// (exactly two hex digits) and `\uHHHH` (exactly four hex digits, no braces —
/// Rust's own `\u{…}` form is not recognised) and emits the decoded byte or the
/// UTF-8 encoding of the code point respectively. Non-escape bytes are ignored
/// (not copied through). See `cpu_fn` below for the exact state machine.
pub const GOLDEN: &[vyre_spec::GoldenSample] = &[vyre_spec::GoldenSample {
    op_id: "decode.unicode",
    input: b"",
    expected: b"",
    reason: "empty input decodes to empty output",
}];

/// Known-answer tests derived directly against `cpu_fn`. Input is the literal
/// byte stream; expected is what `cpu_fn` produces after scanning for escapes.
///
/// Note on reading the literals: inside a Rust byte string `\\` is a single
/// backslash byte, so `b"\\x41"` is the four input bytes `\`, `x`, `4`, `1`,
/// whereas `b"\x00"` / `b"\xff"` in the `expected` column are single raw bytes.
pub const KAT: &[vyre_spec::KatVector] = &[
    // Degenerate case: nothing to scan.
    vyre_spec::KatVector {
        input: b"",
        expected: b"",
        source: "hand-verified vs cpu_fn (empty input)",
    },
    // `\xHH` escapes: one output byte per escape, covering 0x00 and 0xff.
    vyre_spec::KatVector {
        input: b"\\x41",
        expected: b"A",
        source: "hand-verified vs cpu_fn (\\xHH escape → ASCII 'A')",
    },
    vyre_spec::KatVector {
        input: b"\\x00",
        expected: b"\x00",
        source: "hand-verified vs cpu_fn (\\xHH escape → NUL byte)",
    },
    vyre_spec::KatVector {
        input: b"\\xff",
        expected: b"\xff",
        source: "hand-verified vs cpu_fn (\\xHH upper boundary)",
    },
    // `\uHHHH` escapes: one vector per UTF-8 encoded length (1, 2 and 3 bytes).
    vyre_spec::KatVector {
        input: b"\\u0041",
        expected: b"A",
        source: "hand-verified vs cpu_fn (\\uHHHH → UTF-8 'A')",
    },
    vyre_spec::KatVector {
        input: b"\\u00A9",
        expected: b"\xc2\xa9",
        source: "hand-verified vs cpu_fn (\\u00A9 → 2-byte UTF-8 ©)",
    },
    vyre_spec::KatVector {
        input: b"\\u2603",
        expected: b"\xe2\x98\x83",
        source: "hand-verified vs cpu_fn (\\u2603 → 3-byte UTF-8 ☃)",
    },
];

/// Adversarial inputs for this op.
///
/// These entries carry no expected output. For reference, `cpu_fn` returns an
/// empty buffer for every input listed here: truncated escapes never enter (or
/// fall out of) its scan loop, and malformed or surrogate escapes are skipped
/// without emitting anything. "Rejected" in the reasons below therefore means
/// "contributes no output", not "raises an error".
pub const ADVERSARIAL: &[vyre_spec::AdversarialInput] = &[
    vyre_spec::AdversarialInput {
        input: b"",
        reason: "empty input exercises zero-length branch",
    },
    // A single backslash byte (the literal `\\` is one byte).
    vyre_spec::AdversarialInput {
        input: b"\\",
        reason: "trailing backslash with no escape payload — state machine must not read past end",
    },
    vyre_spec::AdversarialInput {
        input: b"\\xGG",
        reason: "\\x followed by non-hex must be rejected, never silently decoded to zero",
    },
    // Both ends of the surrogate range that `push_unicode` filters out.
    vyre_spec::AdversarialInput {
        input: b"\\uD800",
        reason: "UTF-16 high surrogate U+D800 is forbidden in UTF-8 per RFC 3629 §3; cpu_fn skips it",
    },
    vyre_spec::AdversarialInput {
        input: b"\\uDFFF",
        reason: "UTF-16 low surrogate U+DFFF is forbidden in UTF-8 per RFC 3629 §3; cpu_fn skips it",
    },
    // Two bytes only: shorter than the four bytes `cpu_fn` needs to inspect.
    vyre_spec::AdversarialInput {
        input: b"\\x",
        reason: "truncated \\x escape — only two-hex-digit form is defined, shorter forms must be rejected",
    },
    vyre_spec::AdversarialInput {
        input: b"no escapes here",
        reason: "bytes with no backslash escape must produce empty output (op decodes escapes, not passthrough)",
    },
];

/// Assemble the conformance [`OpSpec`] for `decode.unicode`.
///
/// The spec wires together the byte-in/byte-out signature, the CPU reference
/// ([`cpu_fn`]), the WGSL source ([`wgsl_fn`], also registered as the
/// `"category_a_handwritten"` alternative), and the boundary values and
/// equivalence classes listed below.
///
/// # Panics
///
/// Panics with the `"Fix: ..."` message if the builder's final step does not
/// yield a spec.
#[inline]
pub fn vyre_op() -> OpSpec {
    const OP_ID: &str = "decode.unicode";

    // Arguments are prepared up front; the builder calls below stay in their
    // original order because the builder is typestate-driven.
    let signature = OpSignature {
        inputs: vec![DataType::Bytes],
        output: DataType::Bytes,
    };
    let category = crate::Category::A {
        composition_of: vec![OP_ID],
    };
    let laws = vec![crate::spec::law::AlgebraicLaw::Bounded {
        lo: 0,
        hi: u32::MAX,
    }];
    let boundary_values = vec![
        crate::spec::types::BoundaryValue {
            label: "empty",
            inputs: vec![0],
        },
        crate::spec::types::BoundaryValue {
            label: "single_element",
            inputs: vec![1],
        },
        crate::spec::types::BoundaryValue {
            label: "boundary",
            inputs: vec![255],
        },
        crate::spec::types::BoundaryValue {
            label: "max",
            inputs: vec![u32::MAX],
        },
    ];
    let equivalence_classes = vec![
        crate::spec::types::EquivalenceClass::specific("empty input", vec![0]),
        crate::spec::types::EquivalenceClass::specific("typical input", vec![42]),
        crate::spec::types::EquivalenceClass::specific("boundary input", vec![255]),
    ];

    OpSpec::builder(OP_ID)
        .signature(signature)
        .cpu_fn(cpu_fn)
        .wgsl_fn(wgsl_fn)
        .category(category)
        .laws(laws)
        .strictness(crate::spec::types::Strictness::Strict)
        .version(1)
        // Kept inline so the function item is coerced by the parameter type.
        .alt_wgsl_fns(vec![("category_a_handwritten", wgsl_fn)])
        .convention(Convention::V1)
        .boundary_values(boundary_values)
        .equivalence_classes(equivalence_classes)
        .expect("Fix: checked-in conform spec must satisfy the typestate builder")
}

/// CPU reference implementation of `decode.unicode`.
///
/// Slides over `input` one byte at a time and, at every offset that still has
/// at least four bytes available, looks for one of two escape forms:
///
/// * `\xHH` — two hex digits; emits the single byte `0xHH`.
/// * `\uHHHH` — four hex digits (so six bytes must be available); emits the
///   UTF-8 encoding of that code point unless it lies in the surrogate range
///   `U+D800..=U+DFFF`.
///
/// Everything else — ordinary bytes, truncated escapes, escapes containing a
/// non-hex digit — contributes nothing to the output. Hex digits are accepted
/// in upper or lower case.
#[inline]
pub fn cpu_fn(input: &[u8]) -> Vec<u8> {
    let mut decoded = Vec::new();
    // `windows(4)` visits exactly the start offsets with `start + 3 < input.len()`.
    for (start, window) in input.windows(4).enumerate() {
        match window {
            &[b'\\', b'x', hi, lo] => {
                let byte = hex_value(hi)
                    .zip(hex_value(lo))
                    .map(|(high, low)| (high << 4) | low);
                if let Some(byte) = byte {
                    decoded.push(byte);
                }
            }
            // `\u` needs two more bytes than the four-byte window guarantees.
            &[b'\\', b'u', ..] if start + 5 < input.len() => {
                push_unicode(input, start, &mut decoded);
            }
            _ => {}
        }
    }
    decoded
}

/// Decode the `\uHHHH` escape that starts at `input[cursor]` and append its
/// UTF-8 bytes to `out`.
///
/// Appends nothing when any of the four digits is not hexadecimal or when the
/// code point is a surrogate. The caller must guarantee that
/// `cursor + 6 <= input.len()`; otherwise the slice below panics.
fn push_unicode(input: &[u8], cursor: usize, out: &mut Vec<u8>) {
    let digits = &input[cursor + 2..cursor + 6];
    // Fold the four nibbles most-significant first; `None` on the first bad digit.
    let code_point = digits.iter().try_fold(0u32, |acc, &digit| {
        hex_value(digit).map(|nibble| (acc << 4) | u32::from(nibble))
    });
    let scalar = code_point
        .filter(|cp| !(0xD800..=0xDFFF).contains(cp))
        .and_then(char::from_u32);
    if let Some(ch) = scalar {
        let mut scratch = [0u8; 4];
        out.extend_from_slice(ch.encode_utf8(&mut scratch).as_bytes());
    }
}

/// Numeric value (0–15) of an ASCII hex digit, or `None` for any other byte.
fn hex_value(value: u8) -> Option<u8> {
    if value.is_ascii_digit() {
        return Some(value - b'0');
    }
    // Folding case lets one range check cover both `a..=f` and `A..=F`.
    let lowered = value.to_ascii_lowercase();
    if (b'a'..=b'f').contains(&lowered) {
        Some(lowered - b'a' + 10)
    } else {
        None
    }
}

/// WGSL shader source for the GPU side of `decode.unicode`.
///
/// The returned module defines `vyre_op(index, input_len)`, which rescans the
/// input from offset zero using the same rules as [`cpu_fn`] and returns the
/// decoded byte at output position `index`, or `0` when the decoded stream has
/// no byte at that position. Helper functions:
///
/// * `hex_value` — hex digit value, `-1` for a non-hex byte.
/// * `utf8_len` / `utf8_byte` — length and individual bytes of the UTF-8
///   encoding for code points up to `U+FFFF`.
///
/// Surrogate code points (`U+D800..=U+DFFF`) emit nothing and therefore must
/// not advance the `written` counter; otherwise every later output index would
/// be shifted relative to the CPU reference.
///
/// NOTE(review): `input.data` is not declared in this string — presumably the
/// harness prepends the buffer binding, one input byte per element; confirm.
#[inline]
pub fn wgsl_fn() -> String {
    r#"
fn hex_value(value: u32) -> i32 {
    if (value >= 48u && value <= 57u) { return i32(value - 48u); }
    if (value >= 65u && value <= 70u) { return i32(value - 55u); }
    if (value >= 97u && value <= 102u) { return i32(value - 87u); }
    return -1;
}

fn utf8_byte(codepoint: u32, slot: u32) -> u32 {
    if (codepoint <= 0x7fu) { return select(0u, codepoint, slot == 0u); }
    if (codepoint <= 0x7ffu) {
        if (slot == 0u) { return 0xc0u | ((codepoint >> 6u) & 0x1fu); }
        if (slot == 1u) { return 0x80u | (codepoint & 0x3fu); }
    }
    if (slot == 0u) { return 0xe0u | ((codepoint >> 12u) & 0x0fu); }
    if (slot == 1u) { return 0x80u | ((codepoint >> 6u) & 0x3fu); }
    if (slot == 2u) { return 0x80u | (codepoint & 0x3fu); }
    return 0u;
}

fn utf8_len(codepoint: u32) -> u32 {
    if (codepoint <= 0x7fu) { return 1u; }
    if (codepoint <= 0x7ffu) { return 2u; }
    return 3u;
}

fn vyre_op(index: u32, input_len: u32) -> u32 {
    var written = 0u;
    var cursor = 0u;
    loop {
        if (cursor + 3u >= input_len) { break; }
        if (input.data[cursor] == 92u && input.data[cursor + 1u] == 120u) {
            let hi = hex_value(input.data[cursor + 2u]);
            let lo = hex_value(input.data[cursor + 3u]);
            if (hi >= 0 && lo >= 0) {
                if (written == index) { return (u32(hi) << 4u) | u32(lo); }
                written = written + 1u;
            }
        }
        if (cursor + 5u < input_len && input.data[cursor] == 92u && input.data[cursor + 1u] == 117u) {
            let d0 = hex_value(input.data[cursor + 2u]);
            let d1 = hex_value(input.data[cursor + 3u]);
            let d2 = hex_value(input.data[cursor + 4u]);
            let d3 = hex_value(input.data[cursor + 5u]);
            if (d0 >= 0 && d1 >= 0 && d2 >= 0 && d3 >= 0) {
                let cp = (u32(d0) << 12u) | (u32(d1) << 8u) | (u32(d2) << 4u) | u32(d3);
                // Surrogates emit nothing, so they must not advance `written`.
                if (!(cp >= 0xD800u && cp <= 0xDFFFu)) {
                    let len = utf8_len(cp);
                    if (index < written + len) { return utf8_byte(cp, index - written); }
                    written = written + len;
                }
            }
        }
        cursor = cursor + 1u;
    }
    return 0u;
}
"#
    .to_string()
}