#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Literals.
    Number(f64),
    String(String),
    Duration(String),
    Ident(String),
    // Operators.
    Add, Sub, Mul, Div, Mod, Pow,
    Eq, Neq, Lt, Gt, Lte, Gte, Assign,
    MatchRegex, NotMatchRegex,
    // Delimiters.
    LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon,
    Eof,
}
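
// For orientation: a query such as `rate(errors[5m]) > 0.5` lexes to
//   Ident("rate") LParen Ident("errors") LBracket Duration("5m") RBracket
//   RParen Gt Number(0.5) Eof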

/// Tokenize a query string into a flat `Vec<Token>` terminated by `Token::Eof`.
/// The first unrecognized byte aborts with an error naming its position.
pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
    let mut tokens = Vec::new();
    let bytes = input.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        // Skip whitespace.
        if bytes[i].is_ascii_whitespace() {
            i += 1;
            continue;
        }
        // '#' starts a comment that runs to the end of the line.
        if bytes[i] == b'#' {
            while i < bytes.len() && bytes[i] != b'\n' {
                i += 1;
            }
            continue;
        }
        // String literals: double-quoted, single-quoted, or raw backtick.
        if bytes[i] == b'"' || bytes[i] == b'\'' || bytes[i] == b'`' {
            let (tok, end) = lex_string(bytes, i)?;
            tokens.push(tok);
            i = end;
            continue;
        }
        // Numbers and durations, including a leading dot as in ".5".
        if bytes[i].is_ascii_digit()
            || (bytes[i] == b'.' && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit())
        {
            let (tok, end) = lex_number_or_duration(bytes, i)?;
            tokens.push(tok);
            i = end;
            continue;
        }
        // Identifiers start with a letter or '_'. A ':' is allowed *inside*
        // an identifier (recording-rule names such as `job:rate5m`) but not
        // as the first byte, so a bare ':' falls through to `Token::Colon`
        // below and subquery syntax like `[5m:1m]` still tokenizes.
        if bytes[i].is_ascii_alphabetic() || bytes[i] == b'_' {
            let (tok, end) = lex_ident(bytes, i);
            tokens.push(tok);
            i = end;
            continue;
        }
        // Two-byte operators must be tried before their one-byte prefixes
        // ('==' before '=', '<=' before '<', and so on).
        if i + 1 < bytes.len() {
            let two = match (bytes[i], bytes[i + 1]) {
                (b'=', b'=') => Some(Token::Eq),
                (b'!', b'=') => Some(Token::Neq),
                (b'<', b'=') => Some(Token::Lte),
                (b'>', b'=') => Some(Token::Gte),
                (b'=', b'~') => Some(Token::MatchRegex),
                (b'!', b'~') => Some(Token::NotMatchRegex),
                _ => None,
            };
            if let Some(tok) = two {
                tokens.push(tok);
                i += 2;
                continue;
            }
        }
        // Single-byte tokens.
        let tok = match bytes[i] {
            b'+' => Token::Add,
            b'-' => Token::Sub,
            b'*' => Token::Mul,
            b'/' => Token::Div,
            b'%' => Token::Mod,
            b'^' => Token::Pow,
            b'=' => Token::Assign,
            b'<' => Token::Lt,
            b'>' => Token::Gt,
            b'(' => Token::LParen,
            b')' => Token::RParen,
            b'{' => Token::LBrace,
            b'}' => Token::RBrace,
            b'[' => Token::LBracket,
            b']' => Token::RBracket,
            b',' => Token::Comma,
            b':' => Token::Colon,
            _ => {
                return Err(format!(
                    "unexpected character '{}' at position {i}",
                    bytes[i] as char
                ));
            }
        };
        tokens.push(tok);
        i += 1;
    }
    tokens.push(Token::Eof);
    Ok(tokens)
}

/// Lex a quoted string starting at `start`. Backslash escapes are processed
/// in single- and double-quoted strings; backtick strings are taken verbatim.
fn lex_string(bytes: &[u8], start: usize) -> Result<(Token, usize), String> {
    let quote = bytes[start];
    let mut i = start + 1;
    // Accumulate raw bytes and validate UTF-8 once at the end, so multi-byte
    // characters inside the literal are not corrupted byte-by-byte.
    let mut buf = Vec::new();
    while i < bytes.len() {
        if bytes[i] == quote {
            let s = String::from_utf8(buf)
                .map_err(|_| format!("invalid UTF-8 in string at position {start}"))?;
            return Ok((Token::String(s), i + 1));
        }
        if bytes[i] == b'\\' && quote != b'`' && i + 1 < bytes.len() {
            i += 1;
            match bytes[i] {
                b'n' => buf.push(b'\n'),
                b't' => buf.push(b'\t'),
                b'\\' => buf.push(b'\\'),
                b'\'' => buf.push(b'\''),
                b'"' => buf.push(b'"'),
                // Unknown escape: keep the backslash and the byte as written.
                c => {
                    buf.push(b'\\');
                    buf.push(c);
                }
            }
        } else {
            buf.push(bytes[i]);
        }
        i += 1;
    }
    Err(format!("unterminated string starting at position {start}"))
}

/// Lex either a number (integer, float, or scientific notation) or a duration
/// such as "5m" or "1h30m". The decision is made after scanning the numeric
/// part: a trailing unit letter turns the whole run into a duration.
fn lex_number_or_duration(bytes: &[u8], start: usize) -> Result<(Token, usize), String> {
    let mut i = start;
    let mut has_dot = false;
    let mut has_exp = false;
    while i < bytes.len() {
        if bytes[i].is_ascii_digit() {
            i += 1;
        } else if bytes[i] == b'.' && !has_dot && !has_exp {
            has_dot = true;
            i += 1;
        } else if (bytes[i] == b'e' || bytes[i] == b'E') && !has_exp {
            has_exp = true;
            i += 1;
            // An optional sign may follow the exponent marker.
            if i < bytes.len() && (bytes[i] == b'+' || bytes[i] == b'-') {
                i += 1;
            }
        } else {
            break;
        }
    }
    // A unit letter right after the digits makes this a duration. Compound
    // forms like "1h30m" are consumed as one token; the raw text is kept
    // as-is, and validating the unit sequence is left to the parser.
    if i < bytes.len() && matches!(bytes[i], b's' | b'm' | b'h' | b'd' | b'w' | b'y') {
        while i < bytes.len()
            && (bytes[i].is_ascii_digit()
                || matches!(bytes[i], b's' | b'm' | b'h' | b'd' | b'w' | b'y' | b'.'))
        {
            i += 1;
        }
        // The scanned bytes are all ASCII, so this conversion cannot fail.
        let raw = std::str::from_utf8(&bytes[start..i]).unwrap_or("0s");
        return Ok((Token::Duration(raw.to_string()), i));
    }
    let raw = std::str::from_utf8(&bytes[start..i]).unwrap_or("0");
    let num: f64 = raw
        .parse()
        .map_err(|_| format!("invalid number '{raw}' at {start}"))?;
    Ok((Token::Number(num), i))
}

/// Lex an identifier. ':' is permitted inside identifiers so recording-rule
/// names like `job:http_requests:rate5m` lex as a single token. "Inf"/"inf"
/// and "NaN"/"nan" become number literals.
fn lex_ident(bytes: &[u8], start: usize) -> (Token, usize) {
    let mut i = start;
    while i < bytes.len()
        && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_' || bytes[i] == b':')
    {
        i += 1;
    }
    // Identifier characters are all ASCII, so this conversion cannot fail.
    let word = std::str::from_utf8(&bytes[start..i]).unwrap_or("");
    match word {
        "Inf" | "inf" => (Token::Number(f64::INFINITY), i),
        "NaN" | "nan" => (Token::Number(f64::NAN), i),
        _ => (Token::Ident(word.to_string()), i),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn simple_metric() {
        let tokens = tokenize("up").unwrap();
        assert_eq!(tokens, vec![Token::Ident("up".into()), Token::Eof]);
    }
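
    // Comments should vanish entirely: the '#' branch eats to the newline and
    // the whitespace branch then eats the newline itself.
    #[test]
    fn comments_skipped() {
        let tokens = tokenize("up # trailing comment\n1").unwrap();
        assert_eq!(
            tokens,
            vec![Token::Ident("up".into()), Token::Number(1.0), Token::Eof]
        );
    }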

    #[test]
    fn vector_selector() {
        let tokens = tokenize(r#"http_requests_total{method="GET"}"#).unwrap();
        assert_eq!(tokens[0], Token::Ident("http_requests_total".into()));
        assert_eq!(tokens[1], Token::LBrace);
        assert_eq!(tokens[2], Token::Ident("method".into()));
        assert_eq!(tokens[3], Token::Assign);
        assert_eq!(tokens[4], Token::String("GET".into()));
        assert_eq!(tokens[5], Token::RBrace);
    }

    #[test]
    fn range_selector() {
        let tokens = tokenize("rate(requests[5m])").unwrap();
        assert_eq!(tokens[0], Token::Ident("rate".into()));
        assert_eq!(tokens[1], Token::LParen);
        assert_eq!(tokens[2], Token::Ident("requests".into()));
        assert_eq!(tokens[3], Token::LBracket);
        assert_eq!(tokens[4], Token::Duration("5m".into()));
        assert_eq!(tokens[5], Token::RBracket);
        assert_eq!(tokens[6], Token::RParen);
    }
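
    // Exercises the rule that ':' cannot start an identifier: inside a range
    // bracket it should come out as a bare Colon between two durations.
    #[test]
    fn subquery_colon() {
        let tokens = tokenize("requests[5m:1m]").unwrap();
        assert_eq!(tokens[1], Token::LBracket);
        assert_eq!(tokens[2], Token::Duration("5m".into()));
        assert_eq!(tokens[3], Token::Colon);
        assert_eq!(tokens[4], Token::Duration("1m".into()));
    }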

    #[test]
    fn binary_expr() {
        let tokens = tokenize("a + b * 2").unwrap();
        assert_eq!(tokens[0], Token::Ident("a".into()));
        assert_eq!(tokens[1], Token::Add);
        assert_eq!(tokens[2], Token::Ident("b".into()));
        assert_eq!(tokens[3], Token::Mul);
        assert_eq!(tokens[4], Token::Number(2.0));
    }
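
    // Scientific notation and a leading dot both take the number path; the
    // exponent sign is consumed right after 'e'.
    #[test]
    fn float_forms() {
        let tokens = tokenize("2.5e-3 .5").unwrap();
        assert_eq!(tokens[0], Token::Number(0.0025));
        assert_eq!(tokens[1], Token::Number(0.5));
    }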

    #[test]
    fn comparison_ops() {
        let tokens = tokenize("a == b != c >= d").unwrap();
        assert_eq!(tokens[1], Token::Eq);
        assert_eq!(tokens[3], Token::Neq);
        assert_eq!(tokens[5], Token::Gte);
    }

    #[test]
    fn regex_matchers() {
        let tokens = tokenize(r#"{job=~"api.*", env!~"test"}"#).unwrap();
        assert_eq!(tokens[2], Token::MatchRegex);
        assert_eq!(tokens[6], Token::NotMatchRegex);
    }

    #[test]
    fn aggregation() {
        let tokens = tokenize("sum by (job) (rate(requests[5m]))").unwrap();
        assert_eq!(tokens[0], Token::Ident("sum".into()));
        assert_eq!(tokens[1], Token::Ident("by".into()));
    }
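
    // Interior colons are identifier characters, so a recording-rule style
    // name lexes as a single Ident.
    #[test]
    fn recording_rule_name() {
        let tokens = tokenize("job:http_requests:rate5m").unwrap();
        assert_eq!(tokens[0], Token::Ident("job:http_requests:rate5m".into()));
        assert_eq!(tokens[1], Token::Eof);
    }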

    #[test]
    fn inf_nan() {
        let tokens = tokenize("Inf NaN").unwrap();
        assert!(matches!(tokens[0], Token::Number(v) if v.is_infinite()));
        assert!(matches!(tokens[1], Token::Number(v) if v.is_nan()));
    }

    #[test]
    fn string_escapes() {
        let tokens = tokenize(r#""hello\nworld""#).unwrap();
        assert_eq!(tokens[0], Token::String("hello\nworld".into()));
    }
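
    // Backtick strings are raw: the escape branch is skipped, so the
    // backslash and the 'n' come through verbatim.
    #[test]
    fn raw_backtick_string() {
        let tokens = tokenize(r"`hello\nworld`").unwrap();
        assert_eq!(tokens[0], Token::String(r"hello\nworld".into()));
    }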

    #[test]
    fn duration_compound() {
        let tokens = tokenize("1h30m").unwrap();
        assert_eq!(tokens[0], Token::Duration("1h30m".into()));
    }
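
    // A dot is also accepted inside the duration tail, so fractional units
    // like "1.5h" stay one token.
    #[test]
    fn fractional_duration() {
        let tokens = tokenize("1.5h").unwrap();
        assert_eq!(tokens[0], Token::Duration("1.5h".into()));
    }

    // Both failure paths report an Err instead of panicking: an unterminated
    // string, and a byte no branch recognizes.
    #[test]
    fn errors() {
        assert!(tokenize("\"unterminated").is_err());
        assert!(tokenize("@").is_err());
    }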
}