use crate::Error;
/// A single lexical token.
///
/// Variants fall into four groups: literals and names (which carry a
/// payload), reserved keywords, delimiters/punctuation, and operators.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Literals and names -------------------------------------------
    /// A name made of ASCII letters, digits and `_` that is not a keyword.
    Ident(String),
    /// An integer literal without a suffix.
    Int(i64),
    /// A numeric literal with a fractional part and/or an exponent.
    Float(f64),
    /// The raw text between the quotes of a string literal; the lexer does
    /// not decode escape sequences.
    String(String),
    /// An integer literal with an `ms`, `s`, `m` or `h` suffix, normalized
    /// to milliseconds.
    Duration(i64),

    // ---- Keywords -----------------------------------------------------
    /// The `molecule` keyword.
    Molecule,
    /// The `reaction` keyword.
    Reaction,
    /// The `when` keyword.
    When,
    /// The `where` keyword.
    Where,
    /// The `rollup` keyword.
    Rollup,
    /// The `by` keyword.
    By,
    /// The `emit` keyword.
    Emit,
    /// The `primary_key` keyword.
    PrimaryKey,
    /// The `merge` keyword.
    Merge,
    /// The `default` keyword.
    Default,
    /// The `type` keyword.
    Type,
    /// The `enum` keyword.
    Enum,
    /// The `mixin` keyword.
    Mixin,
    /// The `with` keyword.
    With,
    /// The `for` keyword.
    For,
    /// The `in` keyword.
    In,
    /// The `and` keyword.
    And,
    /// The `or` keyword.
    Or,
    /// The `not` keyword.
    Not,
    /// The `true` keyword.
    True,
    /// The `false` keyword.
    False,
    /// The `null` keyword.
    Null,

    // ---- Delimiters and punctuation -----------------------------------
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `|`
    Pipe,
    /// `.`
    Dot,

    // ---- Operators ----------------------------------------------------
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `!=`
    NotEq,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>`
    Gt,
    /// `>=`
    GtEq,
    /// `?`
    Question,
}
/// Location of a token inside the source text.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the token's first byte.
    pub start: usize,
    /// Byte offset one past the token's last byte.
    pub end: usize,
    /// Line the token starts on; the lexer counts lines from 1.
    pub line: u32,
    /// Column of the token's first byte on that line; the lexer counts
    /// columns from 1 and measures them in bytes.
    pub col: u32,
}
/// A [`Token`] paired with the [`Span`] of source text it was read from.
#[derive(Debug, Clone)]
pub struct Tok {
    /// What kind of token this is, including any literal payload.
    pub token: Token,
    /// Where the token sits in the source.
    pub span: Span,
}
pub fn lex(source: &str) -> Result<Vec<Tok>, Error> {
let bytes = source.as_bytes();
let mut out = Vec::new();
let mut i = 0;
let mut line: u32 = 1;
let mut line_start: usize = 0;
while i < bytes.len() {
let b = bytes[i];
if b == b' ' || b == b'\t' || b == b'\r' {
i += 1;
continue;
}
if b == b'\n' {
i += 1;
line += 1;
line_start = i;
continue;
}
if b == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
while i < bytes.len() && bytes[i] != b'\n' {
i += 1;
}
continue;
}
let start = i;
let col = (start - line_start) as u32 + 1;
let mk_span = |end: usize| Span {
start,
end,
line,
col,
};
if i + 1 < bytes.len() {
let two = (bytes[i], bytes[i + 1]);
let multi = match two {
(b'=', b'=') => Some(Token::EqEq),
(b'!', b'=') => Some(Token::NotEq),
(b'<', b'=') => Some(Token::LtEq),
(b'>', b'=') => Some(Token::GtEq),
_ => None,
};
if let Some(tok) = multi {
i += 2;
out.push(Tok {
token: tok,
span: mk_span(i),
});
continue;
}
}
let single = match b {
b'{' => Some(Token::LBrace),
b'}' => Some(Token::RBrace),
b'(' => Some(Token::LParen),
b')' => Some(Token::RParen),
b'[' => Some(Token::LBracket),
b']' => Some(Token::RBracket),
b':' => Some(Token::Colon),
b',' => Some(Token::Comma),
b'|' => Some(Token::Pipe),
b'.' => Some(Token::Dot),
b'+' => Some(Token::Plus),
b'-' => Some(Token::Minus),
b'=' => Some(Token::Eq),
b'<' => Some(Token::Lt),
b'>' => Some(Token::Gt),
b'?' => Some(Token::Question),
_ => None,
};
if let Some(tok) = single {
i += 1;
out.push(Tok {
token: tok,
span: mk_span(i),
});
continue;
}
if b == b'"' {
i += 1;
let s_start = i;
while i < bytes.len() && bytes[i] != b'"' {
if bytes[i] == b'\\' && i + 1 < bytes.len() {
i += 2;
} else {
i += 1;
}
}
if i >= bytes.len() {
return Err(Error::Parse(format!(
"unterminated string at line {line}:{col}"
)));
}
let s = std::str::from_utf8(&bytes[s_start..i])
.map_err(|_| Error::Parse("non-utf8 string".into()))?
.to_string();
i += 1;
out.push(Tok {
token: Token::String(s),
span: mk_span(i),
});
continue;
}
if b.is_ascii_digit() {
let n_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
let mut is_float = false;
if i + 1 < bytes.len() && bytes[i] == b'.' && bytes[i + 1].is_ascii_digit() {
is_float = true;
i += 1;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
}
if i < bytes.len() && (bytes[i] == b'e' || bytes[i] == b'E') {
is_float = true;
i += 1;
if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
i += 1;
}
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
}
if is_float {
let f: f64 = std::str::from_utf8(&bytes[n_start..i])
.map_err(|_| Error::Parse("bad float".into()))?
.parse()
.map_err(|_| Error::Parse("float out of range".into()))?;
out.push(Tok {
token: Token::Float(f),
span: mk_span(i),
});
continue;
}
let n: i64 = std::str::from_utf8(&bytes[n_start..i])
.map_err(|_| Error::Parse("bad number".into()))?
.parse()
.map_err(|_| Error::Parse("number out of range".into()))?;
let suffix_start = i;
while i < bytes.len()
&& (bytes[i].is_ascii_alphabetic() || bytes[i] == b'_')
{
i += 1;
}
let suffix = &bytes[suffix_start..i];
if suffix.is_empty() {
out.push(Tok {
token: Token::Int(n),
span: mk_span(i),
});
} else {
let ms = match suffix {
b"ms" => n,
b"s" => n * 1_000,
b"m" => n * 60_000,
b"h" => n * 3_600_000,
_ => {
return Err(Error::Parse(format!(
"unknown duration suffix at line {line}:{col}"
)))
}
};
out.push(Tok {
token: Token::Duration(ms),
span: mk_span(i),
});
}
continue;
}
if b.is_ascii_alphabetic() || b == b'_' {
let id_start = i;
while i < bytes.len()
&& (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_')
{
i += 1;
}
let s = std::str::from_utf8(&bytes[id_start..i])
.map_err(|_| Error::Parse("non-utf8 ident".into()))?;
let token = match s {
"molecule" => Token::Molecule,
"reaction" => Token::Reaction,
"when" => Token::When,
"where" => Token::Where,
"rollup" => Token::Rollup,
"by" => Token::By,
"emit" => Token::Emit,
"primary_key" => Token::PrimaryKey,
"merge" => Token::Merge,
"default" => Token::Default,
"type" => Token::Type,
"enum" => Token::Enum,
"mixin" => Token::Mixin,
"with" => Token::With,
"for" => Token::For,
"in" => Token::In,
"and" => Token::And,
"or" => Token::Or,
"not" => Token::Not,
"true" => Token::True,
"false" => Token::False,
"null" => Token::Null,
other => Token::Ident(other.to_string()),
};
out.push(Tok {
token,
span: mk_span(i),
});
continue;
}
return Err(Error::Parse(format!(
"unexpected character {:?} at line {line}:{col}",
b as char
)));
}
Ok(out)
}