use crate::{Convention, DataType, OpSignature, OpSpec};
/// Static metadata record describing the `string.tokenize` op.
///
/// The `id` matches the identifier used by the samples and by `vyre_op` in
/// this file; both signature strings declare a single `Bytes` input and a
/// `Bytes` output.
pub const VYRE_OP_METADATA: vyre_spec::OpMetadata = vyre_spec::OpMetadata {
id: "string.tokenize",
layer: vyre_spec::Layer::L2,
category: vyre_spec::MetadataCategory::A,
version: 1,
description: "string tokenize",
signature: "(Bytes) -> Bytes",
strictness: "strict",
archetype_signature: "(Bytes) -> Bytes",
};
/// Golden samples for `string.tokenize`.
///
/// Currently a single sample: empty input must yield empty output.
pub const GOLDEN: &[vyre_spec::GoldenSample] = &[vyre_spec::GoldenSample {
op_id: "string.tokenize",
input: b"",
expected: b"",
reason: "empty input → empty output; tokenizer produces exactly one u32 per input byte",
}];
/// Known-answer vectors for `string.tokenize`.
///
/// Each `expected` value holds one little-endian `u32` class code per input
/// byte, using the codes emitted by `tokenize_cpu`: 0 String, 1 Identifier,
/// 2 Number, 3 Comment, 4 Regex, 5 Operator, 6 Whitespace, 7 Unknown.
/// The `source` text explains which state-machine transition each vector pins.
pub const KAT: &[vyre_spec::KatVector] = &[
vyre_spec::KatVector {
input: b"",
expected: b"",
source: "empty input → empty output (zero-length boundary)",
},
vyre_spec::KatVector {
input: b" ",
expected: b"\x06\x00\x00\x00",
source: "single space → Whitespace(6)",
},
vyre_spec::KatVector {
input: b"\n",
expected: b"\x06\x00\x00\x00",
source: "single newline → Whitespace(6)",
},
vyre_spec::KatVector {
input: b"\t",
expected: b"\x06\x00\x00\x00",
source: "tab → Whitespace(6)",
},
vyre_spec::KatVector {
input: b"\r",
expected: b"\x06\x00\x00\x00",
source: "carriage return → Whitespace(6)",
},
vyre_spec::KatVector {
input: b"abc",
expected: b"\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00",
source: "lowercase alpha → Identifier×3",
},
vyre_spec::KatVector {
input: b"ABC",
expected: b"\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00",
source: "uppercase alpha → Identifier×3",
},
vyre_spec::KatVector {
input: b"$_x",
expected: b"\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00",
source: "dollar, underscore, alpha all alpha-class → Identifier×3",
},
vyre_spec::KatVector {
input: b"x1",
expected: b"\x01\x00\x00\x00\x01\x00\x00\x00",
source: "digit after alpha → stays Identifier (not Number)",
},
vyre_spec::KatVector {
input: b"123",
expected: b"\x02\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00",
source: "digits → Number×3",
},
vyre_spec::KatVector {
input: b"1.5",
expected: b"\x02\x00\x00\x00\x02\x00\x00\x00\x02\x00\x00\x00",
source: "dot inside Number stays Number (decimal)",
},
vyre_spec::KatVector {
input: b".5",
expected: b"\x05\x00\x00\x00\x02\x00\x00\x00",
source: "leading dot: state was Whitespace so `.` is Operator; `5` then transitions to Number",
},
vyre_spec::KatVector {
input: b"0x1F",
expected: b"\x02\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00",
source: "hex prefix: `0`=Number, `x` is alpha → Identifier, `1F` stay Identifier (digit-in-id, alpha)",
},
vyre_spec::KatVector {
input: b"+=",
expected: b"\x05\x00\x00\x00\x05\x00\x00\x00",
source: "two operators → Operator×2",
},
vyre_spec::KatVector {
input: b"({[]})",
expected: b"\x05\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00\x05\x00\x00\x00",
source: "brackets/braces/parens → Operator×6",
},
vyre_spec::KatVector {
input: b"a=b",
expected: b"\x01\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00",
source: "assignment: Identifier, Operator, Identifier",
},
vyre_spec::KatVector {
input: b"\"a\"",
expected: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
source: "double-quoted single char → String×3 (open, body, close)",
},
vyre_spec::KatVector {
input: b"'a'",
expected: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
source: "single-quoted → String×3",
},
vyre_spec::KatVector {
input: b"`a`",
expected: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
source: "backtick-quoted → String×3",
},
vyre_spec::KatVector {
input: b"\"abc",
expected: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
source: "unclosed string → String×4; state machine stays in String until end-of-input",
},
vyre_spec::KatVector {
input: b"\"a\\nb\"",
expected: b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
source: "string with escape sequence `\\n`: String×6 (open, a, backslash(enters escape), n(exits escape, still String), b, close)",
},
vyre_spec::KatVector {
input: b"//",
expected: b"\x03\x00\x00\x00\x03\x00\x00\x00",
source: "line-comment marker alone → Comment×2",
},
vyre_spec::KatVector {
input: b"//x",
expected: b"\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00",
source: "line comment body → Comment×3",
},
vyre_spec::KatVector {
input: b"//a\n",
expected: b"\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x06\x00\x00\x00",
source: "line comment terminated by newline: Comment×3, then Whitespace on \\n",
},
vyre_spec::KatVector {
input: b"/*x*/",
expected: b"\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00",
source: "closed block comment → Comment×5 (open slash, star, body, star, close slash)",
},
vyre_spec::KatVector {
input: b"/*abc",
expected: b"\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00",
source: "unterminated block comment → Comment×5 (stays in Comment until EOI)",
},
vyre_spec::KatVector {
input: b"/**/",
expected: b"\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00\x03\x00\x00\x00",
source: "empty block comment → Comment×4",
},
vyre_spec::KatVector {
input: b"/",
expected: b"\x04\x00\x00\x00",
source: "lone `/` at start: last_sig=Unknown triggers regex entry → Regex(4)",
},
vyre_spec::KatVector {
input: b"/x/",
expected: b"\x04\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00",
source: "regex literal → Regex×3 (open, body, close)",
},
vyre_spec::KatVector {
input: b"/\\x/",
expected: b"\x04\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00",
source: "regex with escape: Regex×4 (open /, backslash enters escape, x exits escape, close /)",
},
vyre_spec::KatVector {
input: b"+/x/",
expected: b"\x05\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00",
source: "operator then regex: Operator, Regex×3 (last_sig==Operator triggers regex entry)",
},
vyre_spec::KatVector {
input: b"x/y",
expected: b"\x01\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00",
source: "identifier / identifier: `/` is division (last_sig==Identifier), not regex",
},
vyre_spec::KatVector {
input: b"1/2",
expected: b"\x02\x00\x00\x00\x05\x00\x00\x00\x02\x00\x00\x00",
source: "number / number: `/` is division (last_sig==Number), not regex",
},
vyre_spec::KatVector {
input: b"\xff",
expected: b"\x07\x00\x00\x00",
source: "byte 0xFF fits no other class → Unknown(7)",
},
vyre_spec::KatVector {
input: b"\x00",
expected: b"\x07\x00\x00\x00",
source: "NUL byte is not alpha/digit/whitespace/op → Unknown(7)",
},
vyre_spec::KatVector {
input: b" x",
expected: b"\x06\x00\x00\x00\x01\x00\x00\x00",
source: "Whitespace → Identifier transition",
},
vyre_spec::KatVector {
input: b"x y",
expected: b"\x01\x00\x00\x00\x06\x00\x00\x00\x01\x00\x00\x00",
source: "Identifier → Whitespace → Identifier",
},
vyre_spec::KatVector {
input: b"if(x)",
expected: b"\x01\x00\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00\x01\x00\x00\x00\x05\x00\x00\x00",
source: "realistic fragment: Identifier, Identifier, Operator, Identifier, Operator",
},
];
/// Adversarial inputs for `string.tokenize`.
///
/// Each entry pairs a hostile or degenerate byte string with the reason it is
/// interesting. No expected output is stored here; the reasons describe what
/// the tokenizer must do, and they are kept in agreement with the class codes
/// emitted by `tokenize_cpu` (in particular, a byte that fits no class is
/// reported as `Unknown(7)`, never as an operator).
pub const ADVERSARIAL: &[vyre_spec::AdversarialInput] = &[
    vyre_spec::AdversarialInput {
        input: b"",
        reason: "empty input exercises zero-length branch — tokenizer must produce zero output bytes, not a single dummy token",
    },
    vyre_spec::AdversarialInput {
        input: b"\xff",
        // The reference implementation classifies 0xFF as Unknown(7); the
        // reason text states that class explicitly.
        reason: "0xFF is not valid ASCII for any starting-state transition — tokenizer must route to the Unknown(7) state rather than mis-classify",
    },
    vyre_spec::AdversarialInput {
        input: b"/*\xf0\x9f\x92\xa9*/",
        reason: "multi-byte UTF-8 codepoint (pile of poo) inside a comment — the state machine treats each byte as its own state, so every byte emits a `Comment=3` token even when it's a continuation byte",
    },
    vyre_spec::AdversarialInput {
        input: b"\"unclosed string...",
        reason: "string literal without closing quote — state machine must stay in String until end-of-input; no silent recovery",
    },
];
/// CPU reference implementation of `string.tokenize`.
///
/// Classifies every input byte into one lexical class and returns the class
/// codes as consecutive little-endian `u32` values, i.e. four output bytes per
/// input byte. Empty input yields an empty vector.
///
/// Class codes: 0 String, 1 Identifier, 2 Number, 3 Comment, 4 Regex,
/// 5 Operator, 6 Whitespace, 7 Unknown.
///
/// The classifier is a single forward pass with one byte of lookahead (used
/// only to tell `//` and `/*` apart from a lone `/`). A lone `/` opens a regex
/// literal when the last significant token was an operator or unknown byte
/// (or when nothing significant has been seen yet); otherwise it is an
/// operator.
#[inline]
pub fn tokenize_cpu(bytes: &[u8]) -> Vec<u8> {
    /// Lexical class emitted for a byte; the discriminant is the wire value.
    #[repr(u32)]
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum Class {
        String = 0,
        Identifier = 1,
        Number = 2,
        Comment = 3,
        Regex = 4,
        Operator = 5,
        Whitespace = 6,
        Unknown = 7,
    }

    /// Which construct the scanner is currently inside.
    #[derive(Clone, Copy)]
    enum Mode {
        /// Ordinary code: bytes are classified on their own merits.
        Code,
        /// Inside a quoted string opened by `delim`; `escaped` is set right
        /// after a backslash.
        Quoted { delim: u8, escaped: bool },
        /// Inside a regex literal; `escaped` is set right after a backslash.
        Pattern { escaped: bool },
        /// Inside a `//` comment, which runs until a newline.
        LineComment,
        /// Inside a `/* */` comment; `after_star` is set when the previous
        /// byte was `*`.
        BlockComment { after_star: bool },
    }

    let mut out = Vec::with_capacity(bytes.len().checked_mul(4).unwrap_or(0));
    let mut mode = Mode::Code;
    // Class of the previously emitted byte; drives the "digit continues an
    // identifier" and "dot continues a number" rules.
    let mut prev = Class::Whitespace;
    // Last class that counts for regex-vs-division disambiguation.
    let mut last_sig = Class::Unknown;

    for (pos, &byte) in bytes.iter().enumerate() {
        let class = match mode {
            Mode::Quoted { delim, escaped } => {
                if escaped {
                    // The escaped byte is consumed verbatim.
                    mode = Mode::Quoted { delim, escaped: false };
                } else if byte == b'\\' {
                    mode = Mode::Quoted { delim, escaped: true };
                } else if byte == delim {
                    mode = Mode::Code;
                    last_sig = Class::String;
                }
                Class::String
            }
            Mode::Pattern { escaped } => {
                if escaped {
                    mode = Mode::Pattern { escaped: false };
                } else if byte == b'\\' {
                    mode = Mode::Pattern { escaped: true };
                } else if byte == b'/' {
                    mode = Mode::Code;
                    last_sig = Class::Regex;
                }
                Class::Regex
            }
            Mode::LineComment => {
                if byte == b'\n' {
                    // The terminating newline itself is whitespace.
                    mode = Mode::Code;
                    Class::Whitespace
                } else {
                    Class::Comment
                }
            }
            Mode::BlockComment { after_star } => {
                mode = if after_star && byte == b'/' {
                    Mode::Code
                } else {
                    Mode::BlockComment {
                        after_star: byte == b'*',
                    }
                };
                Class::Comment
            }
            Mode::Code => match byte {
                b'/' => match bytes.get(pos + 1).copied().unwrap_or(0) {
                    b'/' => {
                        mode = Mode::LineComment;
                        Class::Comment
                    }
                    b'*' => {
                        mode = Mode::BlockComment { after_star: false };
                        Class::Comment
                    }
                    _ if matches!(
                        last_sig,
                        Class::Operator | Class::Unknown | Class::Whitespace
                    ) =>
                    {
                        mode = Mode::Pattern { escaped: false };
                        Class::Regex
                    }
                    _ => {
                        // Division (or similar) after an operand.
                        last_sig = Class::Operator;
                        Class::Operator
                    }
                },
                b'"' | b'\'' | b'`' => {
                    mode = Mode::Quoted {
                        delim: byte,
                        escaped: false,
                    };
                    Class::String
                }
                b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'$' => {
                    last_sig = Class::Identifier;
                    Class::Identifier
                }
                b'0'..=b'9' => {
                    // A digit directly after an identifier byte extends it.
                    let digit_class = if prev == Class::Identifier {
                        Class::Identifier
                    } else {
                        Class::Number
                    };
                    last_sig = digit_class;
                    digit_class
                }
                // A dot directly after a number byte is a decimal point.
                b'.' if prev == Class::Number => {
                    last_sig = Class::Number;
                    Class::Number
                }
                // Whitespace is not "significant": last_sig is left alone.
                b' ' | b'\t' | b'\n' | b'\r' => Class::Whitespace,
                b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'=' | b'+' | b'-' | b'*' | b'%'
                | b'!' | b'<' | b'>' | b'&' | b'|' | b'^' | b'~' | b'?' | b':' | b','
                | b'.' | b';' => {
                    last_sig = Class::Operator;
                    Class::Operator
                }
                _ => {
                    last_sig = Class::Unknown;
                    Class::Unknown
                }
            },
        };
        prev = class;
        out.extend_from_slice(&(class as u32).to_le_bytes());
    }

    out
}
/// Returns the WGSL compute shader implementing `string.tokenize`.
///
/// Bindings (group 0): binding 0 is the source text packed four bytes per
/// `u32` (byte 0 in the low 8 bits), binding 1 is the output array receiving
/// one class code per input byte, binding 2 is a uniform holding the input
/// length in bytes.
///
/// Each invocation replays the state machine from byte 0 up to its own byte
/// index and writes the resulting class for that byte. The replay mirrors
/// `tokenize_cpu` step for step. In particular the running `state` is updated
/// on every replayed byte, including bytes inside strings, regexes, comments
/// and escapes, because the identifier-digit and number-dot rules read the
/// class of the immediately preceding byte. Updating it only on the final
/// byte would make inputs such as `x"a"1` classify the trailing digit
/// differently from the CPU reference.
fn tokenize_wgsl() -> String {
    r#"
const STATE_STRING: u32 = 0u;
const STATE_IDENTIFIER: u32 = 1u;
const STATE_NUMBER: u32 = 2u;
const STATE_COMMENT: u32 = 3u;
const STATE_REGEX: u32 = 4u;
const STATE_OPERATOR: u32 = 5u;
const STATE_WHITESPACE: u32 = 6u;
const STATE_UNKNOWN: u32 = 7u;

struct Config {
    length: u32,
};

@group(0) @binding(0) var<storage, read> source: array<u32>;
@group(0) @binding(1) var<storage, read_write> tokens: array<u32>;
@group(0) @binding(2) var<uniform> config: Config;

// Extracts byte `idx` from the packed source words (byte 0 in the low bits).
fn get_char(idx: u32) -> u32 {
    let word = source[idx / 4u];
    let shift = (idx % 4u) * 8u;
    return (word >> shift) & 0xFFu;
}

@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    let idx = global_id.x;
    if (idx >= config.length) {
        return;
    }

    var state = STATE_WHITESPACE;
    var string_quote = 0u;
    var in_escape = false;
    var in_block_comment = false;
    var block_comment_star = false;
    var in_line_comment = false;
    var in_regex = false;
    var last_significant_state = STATE_UNKNOWN;

    // Replay the state machine over bytes 0..=idx. `state` always holds the
    // class of the byte just processed, exactly as in the CPU reference.
    for (var i = 0u; i <= idx; i = i + 1u) {
        let c = get_char(i);

        if (in_escape) {
            in_escape = false;
            if (string_quote != 0u) { state = STATE_STRING; }
            else if (in_regex) { state = STATE_REGEX; }
            else { state = STATE_UNKNOWN; }
            continue;
        }

        if (c == 92u) { // '\\'
            if (string_quote != 0u) {
                in_escape = true;
                state = STATE_STRING;
                continue;
            } else if (in_regex) {
                in_escape = true;
                state = STATE_REGEX;
                continue;
            }
        }

        if (in_line_comment) {
            if (c == 10u) { // '\n'
                in_line_comment = false;
                state = STATE_WHITESPACE;
            } else {
                state = STATE_COMMENT;
            }
            continue;
        }

        if (in_block_comment) {
            if (block_comment_star && c == 47u) { // '/'
                in_block_comment = false;
                block_comment_star = false;
            } else {
                block_comment_star = (c == 42u); // '*'
            }
            state = STATE_COMMENT;
            continue;
        }

        if (string_quote != 0u) {
            if (c == string_quote) {
                string_quote = 0u;
                last_significant_state = STATE_STRING;
            }
            state = STATE_STRING;
            continue;
        }

        if (in_regex) {
            if (c == 47u) { // '/'
                in_regex = false;
                last_significant_state = STATE_REGEX;
            }
            state = STATE_REGEX;
            continue;
        }

        if (c == 47u) { // '/'
            var next_c = 0u;
            if (i + 1u < config.length) {
                next_c = get_char(i + 1u);
            }
            if (next_c == 47u) { // "//"
                in_line_comment = true;
                state = STATE_COMMENT;
                continue;
            } else if (next_c == 42u) { // "/*"
                in_block_comment = true;
                block_comment_star = false;
                state = STATE_COMMENT;
                continue;
            } else {
                if (last_significant_state == STATE_OPERATOR || last_significant_state == STATE_UNKNOWN || last_significant_state == STATE_WHITESPACE) {
                    in_regex = true;
                    state = STATE_REGEX;
                    continue;
                } else {
                    state = STATE_OPERATOR;
                    last_significant_state = STATE_OPERATOR;
                    continue;
                }
            }
        }

        if (c == 34u || c == 39u || c == 96u) { // '"', '\'', '`'
            string_quote = c;
            state = STATE_STRING;
            continue;
        }

        let is_alpha = (c >= 65u && c <= 90u) || (c >= 97u && c <= 122u) || c == 95u || c == 36u;
        let is_digit = (c >= 48u && c <= 57u);

        if (is_alpha || (is_digit && state == STATE_IDENTIFIER)) {
            state = STATE_IDENTIFIER;
            last_significant_state = STATE_IDENTIFIER;
            continue;
        }

        if (is_digit || (c == 46u && state == STATE_NUMBER)) { // '.'
            state = STATE_NUMBER;
            last_significant_state = STATE_NUMBER;
            continue;
        }

        if (c == 32u || c == 9u || c == 10u || c == 13u) {
            state = STATE_WHITESPACE;
            continue;
        }

        let is_op = c == 40u || c == 41u || c == 123u || c == 125u || c == 91u || c == 93u ||
                    c == 61u || c == 43u || c == 45u || c == 42u || c == 37u || c == 33u ||
                    c == 60u || c == 62u || c == 38u || c == 124u || c == 94u || c == 126u ||
                    c == 63u || c == 58u || c == 44u || c == 46u || c == 59u;
        if (is_op) {
            state = STATE_OPERATOR;
            last_significant_state = STATE_OPERATOR;
            continue;
        }

        state = STATE_UNKNOWN;
        last_significant_state = STATE_UNKNOWN;
    }

    tokens[idx] = state;
}
"#
    .to_string()
}
/// Builds the conformance [`OpSpec`] for the `string.tokenize` op.
///
/// The spec declares a `Bytes -> Bytes` signature, wires in the CPU reference
/// (`tokenize_cpu`) and the WGSL source (`tokenize_wgsl`, also registered as
/// the `"category_a_handwritten"` alternative), and attaches the boundary
/// values, equivalence classes and spec table used by the conformance suite.
///
/// # Panics
///
/// Panics with the `"Fix: ..."` message below if the builder rejects the
/// checked-in spec.
#[inline]
pub fn vyre_op() -> OpSpec {
    use crate::spec::types::{BoundaryValue, EquivalenceClass, Strictness};

    const OP_ID: &str = "string.tokenize";

    // Every boundary value here carries exactly one input.
    let boundary = |label, value| BoundaryValue {
        label,
        inputs: vec![value],
    };

    let signature = OpSignature {
        inputs: vec![DataType::Bytes],
        output: DataType::Bytes,
    };
    let category = crate::Category::A {
        composition_of: vec![OP_ID],
    };
    let laws = vec![crate::spec::law::AlgebraicLaw::Bounded {
        lo: 0,
        hi: u32::MAX,
    }];
    let boundary_values = vec![
        boundary("empty", 0),
        boundary("single_element", 1),
        boundary("boundary", 255),
        boundary("max", u32::MAX),
    ];
    let equivalence_classes = vec![
        EquivalenceClass::specific("empty input", vec![0]),
        EquivalenceClass::specific("typical input", vec![42]),
        EquivalenceClass::specific("boundary input", vec![255]),
    ];

    // Function items are passed inline so they coerce at the call site.
    OpSpec::builder(OP_ID)
        .signature(signature)
        .cpu_fn(tokenize_cpu)
        .wgsl_fn(tokenize_wgsl)
        .category(category)
        .laws(laws)
        .strictness(Strictness::Strict)
        .version(1)
        .alt_wgsl_fns(vec![("category_a_handwritten", tokenize_wgsl)])
        .convention(Convention::V1)
        .workgroup_size(None)
        .boundary_values(boundary_values)
        .equivalence_classes(equivalence_classes)
        .spec_table(crate::spec::tables::tokenize::ROWS)
        .expect("Fix: checked-in conform spec must satisfy the typestate builder")
}