use crate::parsing::rust::lex::keyword::promote;
use crate::parsing::rust::lex::tokens::*;
/// A single lexed token, stored as a compact span into the source bytes.
///
/// The token does not own its text; call [`Token::text`] with the same
/// source buffer that was lexed to recover it.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
    /// Numeric token-kind code (the lexer fills this with constants such as
    /// `IDENT` or `EOF`).
    pub kind: u16,
    /// Byte offset of the first byte of the token within the source.
    pub start: u32,
    /// Length of the token in bytes.
    pub len: u16,
}

impl Token {
    /// Returns the slice of `source` covered by this token as a `&str`.
    ///
    /// # Panics
    ///
    /// Panics if the span `start..start + len` lies outside `source`, or if
    /// the bytes in that span are not valid UTF-8.
    pub fn text<'a>(&self, source: &'a [u8]) -> &'a str {
        let begin = self.start as usize;
        let end = (self.start + u32::from(self.len)) as usize;
        let span = &source[begin..end];
        std::str::from_utf8(span)
            .expect("Fix: lexer must reject invalid UTF-8 spans; return Lex error instead of panicking - lexer only produces valid UTF-8 spans")
    }
}
/// Computes the byte length of the span `start..end` as a `u16`.
///
/// Returns `Err(start)` when the span is longer than `u16::MAX` bytes, so the
/// caller can report the offset of the oversized token instead of truncating.
/// Callers must pass `end >= start`.
fn token_len(start: usize, end: usize) -> Result<u16, usize> {
    let span = end - start;
    match u16::try_from(span) {
        Ok(len) => Ok(len),
        Err(_) => Err(start),
    }
}
/// Splits `source` into tokens, always ending the list with an `EOF` token.
///
/// ASCII whitespace, `//` line comments and `/* ... */` block comments are
/// skipped. Block comments do not nest: the first `*/` closes the comment.
/// Identifiers match `[A-Za-z_][A-Za-z0-9_]*` and are offered to `promote`;
/// when it returns `None` the token kind is `IDENT`. Integer literals are
/// runs of ASCII digits. `&mut` is a single `AMP_MUT` token only when `mut`
/// is a whole word, so `&mutable` lexes as `AMP` followed by an identifier.
///
/// # Errors
///
/// Returns the byte offset at which lexing failed:
/// - the length of the valid UTF-8 prefix if `source` is not valid UTF-8;
/// - `u32::MAX` if `source` is too long to address with `u32` offsets;
/// - the start of an identifier or integer literal longer than `u16::MAX` bytes;
/// - the start of a block comment that is never closed;
/// - the offset of any byte that does not begin a known token.
pub fn lex(source: &[u8]) -> Result<Vec<Token>, usize> {
    /// Bytes that may continue an ASCII identifier or keyword.
    fn is_ident_continue(byte: u8) -> bool {
        byte == b'_' || byte.is_ascii_alphanumeric()
    }

    // Validate once up front so every span handed out later is valid UTF-8.
    std::str::from_utf8(source).map_err(|e| e.valid_up_to())?;
    // Token offsets are stored as `u32`; refuse inputs they cannot address.
    // This guard is what makes the `as u32` casts below lossless.
    if source.len() > u32::MAX as usize {
        return Err(u32::MAX as usize);
    }

    let mut tokens = Vec::new();
    let mut i = 0usize;
    while i < source.len() {
        let b = source[i];
        if b.is_ascii_whitespace() {
            i += 1;
            continue;
        }

        if b == b'/' {
            match source.get(i + 1) {
                Some(b'/') => {
                    // Line comment: stop at the newline (or end of input); the
                    // newline itself is consumed as whitespace next iteration.
                    i = source[i..]
                        .iter()
                        .position(|&c| c == b'\n')
                        .map_or(source.len(), |offset| i + offset);
                    continue;
                }
                Some(b'*') => {
                    // Block comment: search for the closing `*/` strictly after
                    // the opening `/*`, so `/*/` is not treated as closed.
                    let body = i + 2;
                    match source[body..].windows(2).position(|w| w == b"*/") {
                        Some(offset) => i = body + offset + 2,
                        // Fail closed instead of lexing the comment's tail as code.
                        None => return Err(i),
                    }
                    continue;
                }
                _ => {}
            }
        }

        let start = i;

        if b == b'_' || b.is_ascii_alphabetic() {
            while i < source.len() && is_ident_continue(source[i]) {
                i += 1;
            }
            let text = std::str::from_utf8(&source[start..i])
                .expect("identifier bytes are ASCII and therefore valid UTF-8");
            let kind = promote(text).unwrap_or(IDENT);
            tokens.push(Token {
                kind,
                start: start as u32,
                len: token_len(start, i)?,
            });
            continue;
        }

        if b.is_ascii_digit() {
            while i < source.len() && source[i].is_ascii_digit() {
                i += 1;
            }
            tokens.push(Token {
                kind: LITERAL_INT,
                start: start as u32,
                len: token_len(start, i)?,
            });
            continue;
        }

        // Multi-byte operators take priority over their single-byte prefixes.
        if i + 1 < source.len() {
            let pair = [b, source[i + 1]];
            let multi = match &pair {
                b"==" => Some((EQ, 2u16)),
                b"+=" => Some((PLUS_EQ, 2)),
                b"-=" => Some((MINUS_EQ, 2)),
                b"<=" => Some((LE, 2)),
                b">=" => Some((GE, 2)),
                b"!=" => Some((NE, 2)),
                b"&&" => Some((ANDAND, 2)),
                b"||" => Some((OROR, 2)),
                b"->" => Some((ARROW, 2)),
                b".." => Some((DOTDOT, 2)),
                // `&mut` only counts when `mut` ends at a word boundary;
                // otherwise `&mutable` would be split into `&mut` + `able`.
                b"&m"
                    if source[i..].starts_with(b"&mut")
                        && source.get(i + 4).map_or(true, |&c| !is_ident_continue(c)) =>
                {
                    Some((AMP_MUT, 4))
                }
                _ => None,
            };
            if let Some((kind, advance)) = multi {
                i += usize::from(advance);
                tokens.push(Token {
                    kind,
                    start: start as u32,
                    len: advance,
                });
                continue;
            }
        }

        let kind = match b {
            b'+' => PLUS,
            b'-' => MINUS,
            b'*' => STAR,
            b'/' => SLASH,
            b'%' => PERCENT,
            b'=' => ASSIGN,
            b'<' => LT,
            b'>' => GT,
            b';' => SEMI,
            b':' => COLON,
            b',' => COMMA,
            b'&' => AMP,
            b'!' => BANG,
            b'(' => LPAREN,
            b')' => RPAREN,
            b'{' => LBRACE,
            b'}' => RBRACE,
            // Any other byte (including non-ASCII) starts no known token.
            _ => return Err(start),
        };
        i += 1;
        tokens.push(Token {
            kind,
            start: start as u32,
            len: 1,
        });
    }

    // Zero-length sentinel positioned just past the last byte.
    tokens.push(Token {
        kind: EOF,
        start: source.len() as u32,
        len: 0,
    });
    Ok(tokens)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A run length comfortably above `u16::MAX` (65_535), so a single token
    /// of this size cannot be represented in `Token::len`.
    const OVERSIZED: usize = 70_000;

    /// An identifier too long for a `u16` length must be rejected at its
    /// start offset rather than silently truncated.
    #[test]
    fn oversized_identifier_fails_closed() {
        let source = "a".repeat(OVERSIZED);
        let outcome = lex(source.as_bytes());
        assert_eq!(
            outcome,
            Err(0),
            "70k-byte identifier must fail closed, not truncate"
        );
    }

    /// The same length limit applies to integer literals.
    #[test]
    fn oversized_integer_literal_fails_closed() {
        let source = "1".repeat(OVERSIZED);
        let outcome = lex(source.as_bytes());
        assert_eq!(outcome, Err(0), "70k-byte int literal must fail closed");
    }

    /// The largest representable identifier still lexes, and both the stored
    /// length and the recovered text cover the whole span.
    #[test]
    fn max_length_identifier_still_lexes_with_exact_len() {
        let expected = usize::from(u16::MAX);
        let source = "a".repeat(expected);
        let bytes = source.as_bytes();

        let lexed = lex(bytes).expect("u16::MAX-byte identifier must lex");
        let first = lexed[0];

        assert_eq!(
            usize::from(first.len),
            expected,
            "length must be exact, not truncated"
        );
        assert_eq!(
            first.text(bytes).len(),
            expected,
            "text() must read the full span"
        );
    }
}