// vyre 0.4.0
//
// GPU compute intermediate representation with a standard operation library.
// Documentation
use crate::ir::{BinOp, BufferDecl, DataType, Expr, Node, Program};
use crate::ops::AlgebraicLaw;
use crate::ops::{OpSpec, BYTES_TO_U32_OUTPUTS};

// WGSL lowering marker for `string.tokenize_gpu`.
//
// No special per-op lowering is needed. The normal IR lowerer handles the
// byte-classification composition.

/// Builds a `BinOp::And` expression node combining `left` and `right`.
///
/// Both operands are moved into freshly boxed children of the new node.
pub fn and(left: Expr, right: Expr) -> Expr {
    let (left, right) = (Box::new(left), Box::new(right));
    Expr::BinOp {
        op: BinOp::And,
        left,
        right,
    }
}

/// Builds an expression that is the `or` of `byte == v` for every `v` in `values`.
///
/// The chain is seeded with the constant `0` and grows left-to-right, one
/// equality comparison per candidate. An empty `values` slice therefore yields
/// just the `0` seed.
pub fn any_eq(byte: &Expr, values: &[u8]) -> Expr {
    let mut matched = Expr::u32(0);
    for &candidate in values {
        let hit = Expr::eq(byte.clone(), Expr::u32(u32::from(candidate)));
        matched = or(matched, hit);
    }
    matched
}

/// Builds the expression that maps a single source byte to its token class code.
///
/// The result is a chain of nested `Expr::select` nodes checked in this order
/// (assuming `select(cond, a, b)` yields `a` when `cond` holds, the first
/// matching rule wins):
///
/// | rule                                   | code |
/// |----------------------------------------|------|
/// | quote characters `"`, `'`, `` ` ``     | 0    |
/// | `a-z`, `A-Z`, `_`                      | 1    |
/// | `0-9`                                  | 2    |
/// | space, `\n`, `\r`, `\t`                | 6    |
/// | operator / punctuation bytes           | 5    |
/// | anything else                          | 7    |
///
/// Codes 3 and 4 are never produced here.
pub fn classify(byte: &Expr) -> Expr {
    let is_quote = any_eq(byte, b"\"'`");
    let is_ident = is_identifier(byte.clone());
    let is_digit = in_range(byte.clone(), b'0', b'9');
    let is_space = any_eq(byte, b" \n\r\t");
    let is_operator = any_eq(byte, b"+-*/%=!<>|&^~?:;,.(){}[]");

    // Assemble the select chain from the innermost fallback outward.
    let class = Expr::select(is_operator, Expr::u32(5), Expr::u32(7));
    let class = Expr::select(is_space, Expr::u32(6), class);
    let class = Expr::select(is_digit, Expr::u32(2), class);
    let class = Expr::select(is_ident, Expr::u32(1), class);
    Expr::select(is_quote, Expr::u32(0), class)
}

impl Tokenize {
    /// Operation specification for `string.tokenize_gpu`.
    ///
    /// Registers the op as a composition taking one `DataType::Bytes` input,
    /// using `BYTES_TO_U32_OUTPUTS` for its outputs, carrying [`LAWS`], and
    /// built by [`Tokenize::program`].
    pub const SPEC: OpSpec = OpSpec::composition(
        "string.tokenize_gpu",
        &[DataType::Bytes],
        BYTES_TO_U32_OUTPUTS,
        LAWS,
        Self::program,
    );

    /// Constructs the IR program for the tokenizer.
    ///
    /// The program declares a read buffer `source` (binding 0, bytes) and an
    /// output buffer `tokens` (binding 1, `u32`). Its body binds `idx` to
    /// `Expr::gid_x()` and, when `idx` is below the length of `source`, stores
    /// `classify(source[idx])` into `tokens[idx]`.
    ///
    /// The `[64, 1, 1]` triple is passed straight to `Program::new`
    /// (presumably the workgroup size; confirm against `Program::new`).
    #[must_use]
    pub fn program() -> Program {
        let buffers = vec![
            BufferDecl::read("source", 0, DataType::Bytes),
            BufferDecl::output("tokens", 1, DataType::U32),
        ];

        let position = Expr::var("idx");
        let class = classify(&Expr::load("source", position.clone()));
        // Guard so invocations past the end of `source` do not store.
        let in_bounds = Expr::lt(position.clone(), Expr::buf_len("source"));

        let body = vec![
            Node::let_bind("idx", Expr::gid_x()),
            Node::if_then(in_bounds, vec![Node::store("tokens", position, class)]),
        ];

        Program::new(buffers, [64, 1, 1], body)
    }
}

impl From<u32> for TokenType {
    fn from(val: u32) -> Self {
        match val {
            0 => Self::String,
            1 => Self::Identifier,
            2 => Self::Number,
            3 => Self::Comment,
            4 => Self::Regex,
            5 => Self::Operator,
            6 => Self::Whitespace,
            _ => Self::Unknown,
        }
    }
}

/// Builds the expression `low <= value && value <= high` (both bounds inclusive).
///
/// `low` and `high` are widened to `u32` constants before comparison.
pub fn in_range(value: Expr, low: u8, high: u8) -> Expr {
    let lower = Expr::u32(u32::from(low));
    let upper = Expr::u32(u32::from(high));

    let at_least_low = le(lower, value.clone());
    let at_most_high = le(value, upper);
    and(at_least_low, at_most_high)
}

/// Builds the expression testing whether `byte` is an identifier byte.
///
/// A byte qualifies when it is in `a..=z`, in `A..=Z`, or equal to `_`.
/// Digits are not included here.
pub fn is_identifier(byte: Expr) -> Expr {
    let lowercase = in_range(byte.clone(), b'a', b'z');
    let uppercase = in_range(byte.clone(), b'A', b'Z');
    let underscore = Expr::eq(byte, Expr::u32(u32::from(b'_')));

    or(or(lowercase, uppercase), underscore)
}

/// Algebraic laws attached to the tokenizer's `OpSpec`.
///
/// The single `Bounded { lo: 0, hi: 7 }` entry matches the range of class codes
/// that `classify` can produce (0 through 7).
// NOTE(review): presumably `Bounded` asserts every output lies in `lo..=hi`;
// confirm against the `AlgebraicLaw` definition.
pub const LAWS: &[AlgebraicLaw] = &[AlgebraicLaw::Bounded { lo: 0, hi: 7 }];

/// Builds a `BinOp::Le` expression node comparing `left` against `right`.
///
/// Both operands are moved into freshly boxed children of the new node.
pub fn le(left: Expr, right: Expr) -> Expr {
    let (left, right) = (Box::new(left), Box::new(right));
    Expr::BinOp {
        op: BinOp::Le,
        left,
        right,
    }
}

/// Builds a `BinOp::Or` expression node combining `left` and `right`.
///
/// Both operands are moved into freshly boxed children of the new node.
pub fn or(left: Expr, right: Expr) -> Expr {
    let (left, right) = (Box::new(left), Box::new(right));
    Expr::BinOp {
        op: BinOp::Or,
        left,
        right,
    }
}

/// Per-byte token class produced by the tokenizer composition.
///
/// Each variant's discriminant is the `u32` class code written to the output
/// buffer, so a raw code can be converted with `TokenType::from(code)`.
#[non_exhaustive]
#[repr(u32)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenType {
    /// Byte that delimits a string literal (code 0).
    String = 0,
    /// Byte belonging to an identifier (code 1).
    Identifier = 1,
    /// Byte belonging to a numeric literal (code 2).
    Number = 2,
    /// Byte marking a comment (code 3).
    Comment = 3,
    /// Byte delimiting a regular expression (code 4).
    Regex = 4,
    /// Operator or punctuation byte (code 5).
    Operator = 5,
    /// Whitespace byte (code 6).
    Whitespace = 6,
    /// Byte that fits no other class (code 7).
    Unknown = 7,
}

/// Marker type for the `string.tokenize_gpu` operation.
///
/// Carries no data; the operation's spec and IR program are exposed through
/// its associated items.
#[derive(Clone, Copy, Debug, Default)]
pub struct Tokenize;