use crate::ir::{BinOp, BufferDecl, DataType, Expr, Node, Program};
use crate::ops::AlgebraicLaw;
use crate::ops::{OpSpec, BYTES_TO_U32_OUTPUTS};
/// Builds a binary expression node that combines `left` and `right` with
/// [`BinOp::And`].
///
/// Both operands are moved into heap boxes owned by the returned node.
pub fn and(left: Expr, right: Expr) -> Expr {
    let (left, right) = (Box::new(left), Box::new(right));
    Expr::BinOp {
        op: BinOp::And,
        left,
        right,
    }
}
/// Builds an expression that tests `byte` for equality against every entry of
/// `values` and combines the individual tests with `or`.
///
/// The chain is seeded with the constant `0`, so an empty `values` slice
/// yields just that constant. The tree is left-nested: each new equality test
/// becomes the right operand of the accumulated chain.
pub fn any_eq(byte: &Expr, values: &[u8]) -> Expr {
    let mut chain = Expr::u32(0);
    for &candidate in values {
        let hit = Expr::eq(byte.clone(), Expr::u32(u32::from(candidate)));
        chain = or(chain, hit);
    }
    chain
}
/// Builds the expression that maps a single byte to a numeric class code.
///
/// The tests are tried in priority order; the first one that matches decides
/// the code:
///
/// 1. one of the quote bytes `"`, `'` or `` ` `` gives `0`
/// 2. `is_identifier` (ASCII letter or `_`) gives `1`
/// 3. ASCII digit `0`-`9` gives `2`
/// 4. space, `\n`, `\r` or `\t` gives `6`
/// 5. one of the listed punctuation bytes gives `5`
/// 6. anything else gives `7`
///
/// The codes line up with the discriminants of `TokenType`.
pub fn classify(byte: &Expr) -> Expr {
    // Assemble the chain from the lowest-priority fallback outward so that
    // each step reads as "this test, otherwise everything below it".
    let unknown = Expr::u32(7);
    let operator_or_below = Expr::select(
        any_eq(byte, b"+-*/%=!<>|&^~?:;,.(){}[]"),
        Expr::u32(5),
        unknown,
    );
    let whitespace_or_below = Expr::select(
        any_eq(byte, b" \n\r\t"),
        Expr::u32(6),
        operator_or_below,
    );
    let number_or_below = Expr::select(
        in_range(byte.clone(), b'0', b'9'),
        Expr::u32(2),
        whitespace_or_below,
    );
    let identifier_or_below = Expr::select(
        is_identifier(byte.clone()),
        Expr::u32(1),
        number_or_below,
    );
    Expr::select(any_eq(byte, b"\"'`"), Expr::u32(0), identifier_or_below)
}
impl Tokenize {
    /// Operation spec registered under the name `"string.tokenize_gpu"`.
    ///
    /// Built with `OpSpec::composition` from a single `DataType::Bytes` input,
    /// the shared `BYTES_TO_U32_OUTPUTS` output list, the module's `LAWS`, and
    /// [`Self::program`] as the program builder.
    pub const SPEC: OpSpec = OpSpec::composition(
        "string.tokenize_gpu",
        &[DataType::Bytes],
        BYTES_TO_U32_OUTPUTS,
        LAWS,
        Self::program,
    );

    /// Builds the IR program for per-byte classification.
    ///
    /// The program declares a `source` byte buffer (binding 0) and a `tokens`
    /// `u32` output buffer (binding 1). Its body binds `idx` to
    /// `Expr::gid_x()` and, when `idx` is below the length of `source`, stores
    /// `classify(source[idx])` into `tokens[idx]`.
    // NOTE(review): `[64, 1, 1]` is presumably the workgroup size — confirm
    // against `Program::new`.
    #[must_use]
    pub fn program() -> Program {
        let idx = Expr::var("idx");

        // Guard so that invocations past the end of `source` write nothing.
        let in_bounds = Expr::lt(idx.clone(), Expr::buf_len("source"));
        let token = classify(&Expr::load("source", idx.clone()));

        let buffers = vec![
            BufferDecl::read("source", 0, DataType::Bytes),
            BufferDecl::output("tokens", 1, DataType::U32),
        ];
        let body = vec![
            Node::let_bind("idx", Expr::gid_x()),
            Node::if_then(in_bounds, vec![Node::store("tokens", idx, token)]),
        ];

        Program::new(buffers, [64, 1, 1], body)
    }
}
impl From<u32> for TokenType {
fn from(val: u32) -> Self {
match val {
0 => Self::String,
1 => Self::Identifier,
2 => Self::Number,
3 => Self::Comment,
4 => Self::Regex,
5 => Self::Operator,
6 => Self::Whitespace,
_ => Self::Unknown,
}
}
}
/// Builds an expression testing that `value` lies between `low` and `high`,
/// both ends included.
///
/// The result is `and(low <= value, value <= high)`, with both bounds widened
/// to `u32` constants.
pub fn in_range(value: Expr, low: u8, high: u8) -> Expr {
    let lower_bound = Expr::u32(u32::from(low));
    let upper_bound = Expr::u32(u32::from(high));

    let at_least_low = le(lower_bound, value.clone());
    let at_most_high = le(value, upper_bound);
    and(at_least_low, at_most_high)
}
/// Builds an expression testing whether `byte` is an identifier character:
/// an ASCII lowercase letter, an ASCII uppercase letter, or `_`.
///
/// Digits are not covered by this test.
pub fn is_identifier(byte: Expr) -> Expr {
    let lowercase = in_range(byte.clone(), b'a', b'z');
    let uppercase = in_range(byte.clone(), b'A', b'Z');
    let underscore = Expr::eq(byte, Expr::u32(u32::from(b'_')));

    or(or(lowercase, uppercase), underscore)
}
/// Algebraic laws declared for the tokenize operation: a single `Bounded` law
/// with `lo: 0` and `hi: 7`.
///
/// `classify` only emits the constants 0, 1, 2, 5, 6 and 7, all of which fall
/// within this range.
// NOTE(review): presumably both bounds are inclusive — confirm against the
// definition of `AlgebraicLaw::Bounded`.
pub const LAWS: &[AlgebraicLaw] = &[AlgebraicLaw::Bounded { lo: 0, hi: 7 }];
/// Builds a binary expression node that compares `left` and `right` with
/// [`BinOp::Le`].
///
/// Both operands are moved into heap boxes owned by the returned node.
pub fn le(left: Expr, right: Expr) -> Expr {
    let (left, right) = (Box::new(left), Box::new(right));
    Expr::BinOp {
        op: BinOp::Le,
        left,
        right,
    }
}
/// Builds a binary expression node that combines `left` and `right` with
/// [`BinOp::Or`].
///
/// Both operands are moved into heap boxes owned by the returned node.
pub fn or(left: Expr, right: Expr) -> Expr {
    let (left, right) = (Box::new(left), Box::new(right));
    Expr::BinOp {
        op: BinOp::Or,
        left,
        right,
    }
}
/// Class assigned to a byte by the tokenizer, stored as a `u32` code.
///
/// The explicit discriminants are the numeric codes emitted by `classify` and
/// accepted by the `From<u32>` conversion. `classify` never emits the codes
/// for `Comment` (3) or `Regex` (4).
#[non_exhaustive]
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
/// Code 0: `classify` emits it for the quote bytes `"`, `'` and `` ` ``.
String = 0,
/// Code 1: `classify` emits it for ASCII letters and `_`.
Identifier = 1,
/// Code 2: `classify` emits it for ASCII digits.
Number = 2,
/// Code 3: not produced by `classify`.
Comment = 3,
/// Code 4: not produced by `classify`.
Regex = 4,
/// Code 5: `classify` emits it for the punctuation bytes it lists.
Operator = 5,
/// Code 6: `classify` emits it for space, `\n`, `\r` and `\t`.
Whitespace = 6,
/// Code 7: `classify` emits it for every byte that matches no other class;
/// it is also the result of converting any `u32` outside `0..=6`.
Unknown = 7,
}
/// Marker type for the `string.tokenize_gpu` operation.
///
/// Carries no data; the operation's `SPEC` and its `program()` builder are
/// exposed as associated items on this type.
#[derive(Debug, Clone, Copy, Default)]
pub struct Tokenize;