vyre-conform 0.1.0

Conformance suite for vyre backends — proves byte-identical output to CPU reference
Documentation
use crate::{Convention, DataType, OpSignature, OpSpec};

/// CPU-side reference tokenizer, written to mirror the WGSL tokenizer exactly.
///
/// Every input byte is classified on its own and contributes exactly one class
/// code to the output, serialized as a little-endian `u32` (four bytes). The
/// returned buffer is therefore `4 * bytes.len()` bytes long.
///
/// Class codes: `0` string, `1` identifier, `2` number, `3` comment,
/// `4` regex, `5` operator, `6` whitespace, `7` unknown.
///
/// The order of the checks inside the loop is significant: pending escapes
/// win over everything, then backslashes inside strings/regexes, then open
/// comments, open strings and open regexes, and only then is the byte looked
/// at as the potential start of a new token.
#[inline]
pub fn tokenize_cpu(bytes: &[u8]) -> Vec<u8> {
    #[repr(u32)]
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum TokenClass {
        String = 0,
        Identifier = 1,
        Number = 2,
        Comment = 3,
        Regex = 4,
        Operator = 5,
        Whitespace = 6,
        Unknown = 7,
    }

    let mut encoded = Vec::with_capacity(bytes.len().checked_mul(4).unwrap_or(0));

    // Class emitted for the previous byte; decides whether a digit continues
    // an identifier and whether a `.` continues a number.
    let mut previous = TokenClass::Whitespace;
    // Last class recorded as significant; used to tell a regex-opening `/`
    // apart from a division operator.
    let mut last_significant = TokenClass::Unknown;
    // Opening quote byte of the string currently being scanned, 0 when none.
    let mut open_quote: u8 = 0;
    let mut escape_pending = false;
    let mut inside_line_comment = false;
    let mut inside_block_comment = false;
    // True when the previous byte inside a block comment was `*`.
    let mut star_seen = false;
    let mut inside_regex = false;

    for (pos, &byte) in bytes.iter().enumerate() {
        let class = if escape_pending {
            // The byte following a backslash is swallowed by the enclosing
            // string or regex without further inspection.
            escape_pending = false;
            if open_quote != 0 {
                TokenClass::String
            } else if inside_regex {
                TokenClass::Regex
            } else {
                TokenClass::Unknown
            }
        } else if byte == b'\\' && (open_quote != 0 || inside_regex) {
            // A backslash only starts an escape inside a string or a regex;
            // anywhere else it falls through to the generic classification.
            escape_pending = true;
            if open_quote != 0 {
                TokenClass::String
            } else {
                TokenClass::Regex
            }
        } else if inside_line_comment {
            if byte == b'\n' {
                // The terminating newline itself counts as whitespace.
                inside_line_comment = false;
                TokenClass::Whitespace
            } else {
                TokenClass::Comment
            }
        } else if inside_block_comment {
            if star_seen && byte == b'/' {
                inside_block_comment = false;
                star_seen = false;
            } else {
                star_seen = byte == b'*';
            }
            TokenClass::Comment
        } else if open_quote != 0 {
            if byte == open_quote {
                // Closing quote: still part of the string, and the string
                // becomes the last significant token.
                open_quote = 0;
                last_significant = TokenClass::String;
            }
            TokenClass::String
        } else if inside_regex {
            if byte == b'/' {
                inside_regex = false;
                last_significant = TokenClass::Regex;
            }
            TokenClass::Regex
        } else if byte == b'/' {
            // One byte of lookahead; 0 stands in for "end of input".
            let lookahead = bytes.get(pos + 1).copied().unwrap_or(0);
            match lookahead {
                b'/' => {
                    inside_line_comment = true;
                    TokenClass::Comment
                }
                b'*' => {
                    inside_block_comment = true;
                    star_seen = false;
                    TokenClass::Comment
                }
                _ if matches!(
                    last_significant,
                    TokenClass::Operator | TokenClass::Unknown | TokenClass::Whitespace
                ) =>
                {
                    // No operand precedes the slash, so it opens a regex.
                    inside_regex = true;
                    TokenClass::Regex
                }
                _ => {
                    last_significant = TokenClass::Operator;
                    TokenClass::Operator
                }
            }
        } else if matches!(byte, b'"' | b'\'' | b'`') {
            open_quote = byte;
            TokenClass::String
        } else if byte.is_ascii_alphabetic()
            || byte == b'_'
            || byte == b'$'
            || (byte.is_ascii_digit() && previous == TokenClass::Identifier)
        {
            last_significant = TokenClass::Identifier;
            TokenClass::Identifier
        } else if byte.is_ascii_digit() || (byte == b'.' && previous == TokenClass::Number) {
            last_significant = TokenClass::Number;
            TokenClass::Number
        } else if matches!(byte, b' ' | b'\t' | b'\n' | b'\r') {
            // Whitespace never updates `last_significant`.
            TokenClass::Whitespace
        } else if matches!(
            byte,
            b'(' | b')'
                | b'{'
                | b'}'
                | b'['
                | b']'
                | b'='
                | b'+'
                | b'-'
                | b'*'
                | b'%'
                | b'!'
                | b'<'
                | b'>'
                | b'&'
                | b'|'
                | b'^'
                | b'~'
                | b'?'
                | b':'
                | b','
                | b'.'
                | b';'
        ) {
            last_significant = TokenClass::Operator;
            TokenClass::Operator
        } else {
            last_significant = TokenClass::Unknown;
            TokenClass::Unknown
        };

        encoded.extend_from_slice(&(class as u32).to_le_bytes());
        previous = class;
    }

    encoded
}