use crate::{Convention, DataType, OpSignature, OpSpec};
/// Classifies every byte of `bytes` into a coarse lexical category.
///
/// The output holds exactly one little-endian `u32` per input byte (so it is
/// `4 * bytes.len()` bytes long), with these values:
///
/// | value | class      |
/// |-------|------------|
/// | 0     | string     |
/// | 1     | identifier |
/// | 2     | number     |
/// | 3     | comment    |
/// | 4     | regex      |
/// | 5     | operator   |
/// | 6     | whitespace |
/// | 7     | unknown    |
///
/// The rules resemble JavaScript lexing: strings are delimited by `"`, `'`
/// or a backtick, `//` and `/* ... */` are comments, identifiers may contain
/// `_` and `$`, and a `/` that follows an operator (or nothing significant)
/// opens a regex literal that runs to the next unescaped `/`.
///
/// Known limitations of the heuristics (unchanged from the previous
/// implementation):
/// * every operator byte, including `)` and `]`, makes a following `/` start
///   a regex, so `(a) / b` is not seen as a division;
/// * strings and regex literals are not terminated by a newline;
/// * a `/` inside a regex character class (`[/]`) ends the regex;
/// * non-ASCII bytes are classified as unknown.
///
/// The `*` of the opening `/*` is never treated as the first half of a
/// closing `*/`, so `/*/` opens a comment instead of opening and closing it.
///
/// NOTE(review): the `_cpu` suffix suggests a counterpart implementation may
/// exist elsewhere. If the two must agree byte for byte, confirm that it
/// handles `/*/` the same way.
#[inline]
pub fn tokenize_cpu(bytes: &[u8]) -> Vec<u8> {
    // Per-byte token class; the discriminant is the value written to the output.
    #[repr(u32)]
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum State {
        String = 0,
        Identifier = 1,
        Number = 2,
        Comment = 3,
        Regex = 4,
        Operator = 5,
        Whitespace = 6,
        Unknown = 7,
    }

    // Lexical context the scanner is currently inside. Exactly one context is
    // active at any time, which is why a single enum replaces separate flags.
    #[derive(Clone, Copy)]
    enum Mode {
        // Ordinary code, outside any literal or comment.
        Code,
        // Inside a string opened by `quote`; `escaped` means the previous byte
        // was an unescaped backslash, so the current byte is taken verbatim.
        Quoted { quote: u8, escaped: bool },
        // Inside a `/.../` regex literal; `escaped` as for strings.
        Regex { escaped: bool },
        // After `//`, up to and including the next `\n`.
        LineComment,
        // After `/*`. `opener_pending` is true only while the `*` of the
        // opening delimiter has not been consumed yet; `after_star` is true
        // when the previous byte was a `*` that may begin the closing `*/`.
        BlockComment { opener_pending: bool, after_star: bool },
    }

    // Four output bytes per input byte; fall back to no pre-allocation if the
    // multiplication would overflow.
    let mut tokens = Vec::with_capacity(bytes.len().checked_mul(4).unwrap_or(0));
    let mut mode = Mode::Code;
    // Class emitted for the previous byte; lets digits extend an identifier
    // and `.` extend a number.
    let mut prev_class = State::Whitespace;
    // Class of the last completed non-whitespace, non-comment token; decides
    // whether a `/` is a division operator or the start of a regex.
    let mut last_significant = State::Unknown;

    for (i, &c) in bytes.iter().enumerate() {
        let class = match mode {
            Mode::Quoted { quote, escaped } => {
                if escaped {
                    mode = Mode::Quoted { quote, escaped: false };
                } else if c == b'\\' {
                    mode = Mode::Quoted { quote, escaped: true };
                } else if c == quote {
                    mode = Mode::Code;
                    last_significant = State::String;
                }
                State::String
            }
            Mode::Regex { escaped } => {
                if escaped {
                    mode = Mode::Regex { escaped: false };
                } else if c == b'\\' {
                    mode = Mode::Regex { escaped: true };
                } else if c == b'/' {
                    mode = Mode::Code;
                    last_significant = State::Regex;
                }
                State::Regex
            }
            Mode::LineComment => {
                if c == b'\n' {
                    // The terminating newline is reported as whitespace.
                    mode = Mode::Code;
                    State::Whitespace
                } else {
                    State::Comment
                }
            }
            Mode::BlockComment {
                opener_pending,
                after_star,
            } => {
                if after_star && c == b'/' {
                    mode = Mode::Code;
                } else {
                    // The opener's own `*` must not arm the closing check,
                    // otherwise `/*/` would be read as a complete comment.
                    mode = Mode::BlockComment {
                        opener_pending: false,
                        after_star: c == b'*' && !opener_pending,
                    };
                }
                State::Comment
            }
            Mode::Code => {
                let is_alpha = c.is_ascii_alphabetic() || c == b'_' || c == b'$';
                let is_digit = c.is_ascii_digit();

                if c == b'/' {
                    // One byte of lookahead separates comments from regex
                    // literals and division.
                    match bytes.get(i + 1).copied() {
                        Some(b'/') => {
                            mode = Mode::LineComment;
                            State::Comment
                        }
                        Some(b'*') => {
                            mode = Mode::BlockComment {
                                opener_pending: true,
                                after_star: false,
                            };
                            State::Comment
                        }
                        // Nothing that could be a left operand precedes the
                        // slash, so treat it as the start of a regex literal.
                        _ if matches!(last_significant, State::Operator | State::Unknown) => {
                            mode = Mode::Regex { escaped: false };
                            State::Regex
                        }
                        _ => {
                            last_significant = State::Operator;
                            State::Operator
                        }
                    }
                } else if matches!(c, b'"' | b'\'' | b'`') {
                    mode = Mode::Quoted {
                        quote: c,
                        escaped: false,
                    };
                    State::String
                } else if is_alpha || (is_digit && prev_class == State::Identifier) {
                    last_significant = State::Identifier;
                    State::Identifier
                } else if is_digit || (c == b'.' && prev_class == State::Number) {
                    last_significant = State::Number;
                    State::Number
                } else if matches!(c, b' ' | b'\t' | b'\n' | b'\r') {
                    // Whitespace deliberately leaves `last_significant` alone.
                    State::Whitespace
                } else if matches!(
                    c,
                    b'(' | b')'
                        | b'{'
                        | b'}'
                        | b'['
                        | b']'
                        | b'='
                        | b'+'
                        | b'-'
                        | b'*'
                        | b'%'
                        | b'!'
                        | b'<'
                        | b'>'
                        | b'&'
                        | b'|'
                        | b'^'
                        | b'~'
                        | b'?'
                        | b':'
                        | b','
                        | b'.'
                        | b';'
                ) {
                    last_significant = State::Operator;
                    State::Operator
                } else {
                    last_significant = State::Unknown;
                    State::Unknown
                }
            }
        };

        tokens.extend_from_slice(&(class as u32).to_le_bytes());
        prev_class = class;
    }

    tokens
}