use super::CypherError;
/// Every token category the Cypher lexer can emit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
// --- Keywords (matched case-insensitively in read_identifier_or_keyword) ---
Match,
// Produced for the keyword "OPTIONAL" (the following MATCH is a separate token).
OptionalMatch,
Where,
Return,
With,
Unwind,
Order,
By,
Limit,
Skip,
As,
Distinct,
And,
Or,
Not,
In,
Is,
Asc,
Desc,
True,
False,
Null,
Contains,
// Produced for the keyword "STARTS" (the following WITH is a separate token).
StartsWith,
// Produced for the keyword "ENDS" (the following WITH is a separate token).
EndsWith,
Count,
Collect,
Avg,
Sum,
Min,
Max,
Of,
Timestamp,
Between,
For,
// Temporal keywords: spelled SYSTEM_TIME / VALID_TIME in the input.
SystemTime,
ValidTime,
// --- Comparison operators: = <> (or !=) < <= > >= ---
Eq,
Ne,
Lt,
Le,
Gt,
Ge,
// --- Punctuation and arithmetic/pattern operators ---
LParen,
RParen,
LBracket,
RBracket,
LBrace,
RBrace,
Colon,
Dot,
DotDot,
Comma,
Pipe,
Dash,
Arrow,
LeftArrow,
Star,
Plus,
Slash,
Percent,
// --- Literals, parameters ($name), and identifiers ---
IntegerLiteral,
FloatLiteral,
StringLiteral,
Parameter,
Identifier,
// End of input; always the final token produced by tokenize().
Eof,
}
/// A single lexed token: its category, its text, and where it starts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
// Token category (keyword, operator, literal, ...).
pub kind: TokenKind,
// Token text. For string literals this is the unescaped contents (no quotes);
// for parameters it is the name without the leading '$'.
pub text: String,
// Byte offset of the token's first character in the source input
// (for Eof, the input length).
pub position: usize,
}
impl Token {
fn new(kind: TokenKind, text: impl Into<String>, position: usize) -> Self {
Self {
kind,
text: text.into(),
position,
}
}
}
/// Hand-written lexer over a Cypher query string.
pub struct CypherLexer<'a> {
// Full source text; sliced to produce number/identifier token text.
input: &'a str,
// Peekable (byte offset, char) iterator over `input`.
chars: std::iter::Peekable<std::str::CharIndices<'a>>,
// Byte offset of the token currently being lexed (updated in next_token).
position: usize,
}
impl<'a> CypherLexer<'a> {
/// Lexes `input` into a full token stream, always terminated by an `Eof` token.
///
/// # Errors
/// Returns the first `CypherError::LexError` encountered (unexpected character,
/// unterminated string, bare '!', or '$' without a name).
pub fn tokenize(input: &str) -> Result<Vec<Token>, CypherError> {
    let mut lexer = CypherLexer {
        input,
        chars: input.char_indices().peekable(),
        position: 0,
    };
    // Rough pre-allocation: assume an average token width of about four bytes.
    let mut tokens = Vec::with_capacity(input.len() / 4 + 1);
    loop {
        let token = lexer.next_token()?;
        if token.kind == TokenKind::Eof {
            tokens.push(token);
            return Ok(tokens);
        }
        tokens.push(token);
    }
}
/// Produces the next token, skipping any leading whitespace and `//` comments.
/// Returns an `Eof` token (positioned at the input length) once exhausted.
fn next_token(&mut self) -> Result<Token, CypherError> {
    self.skip_whitespace_and_comments();
    let Some(&(pos, ch)) = self.chars.peek() else {
        return Ok(Token::new(TokenKind::Eof, "", self.input.len()));
    };
    self.position = pos;

    // Unambiguous single-character tokens all share one consume-and-emit path.
    let simple = match ch {
        '(' => Some((TokenKind::LParen, "(")),
        ')' => Some((TokenKind::RParen, ")")),
        '[' => Some((TokenKind::LBracket, "[")),
        ']' => Some((TokenKind::RBracket, "]")),
        '{' => Some((TokenKind::LBrace, "{")),
        '}' => Some((TokenKind::RBrace, "}")),
        ':' => Some((TokenKind::Colon, ":")),
        ',' => Some((TokenKind::Comma, ",")),
        '*' => Some((TokenKind::Star, "*")),
        '+' => Some((TokenKind::Plus, "+")),
        '%' => Some((TokenKind::Percent, "%")),
        '|' => Some((TokenKind::Pipe, "|")),
        '=' => Some((TokenKind::Eq, "=")),
        _ => None,
    };
    if let Some((kind, text)) = simple {
        self.advance();
        return Ok(Token::new(kind, text, pos));
    }

    // Everything else needs lookahead or multi-character scanning.
    match ch {
        '.' => self.read_dot(pos),
        '-' => self.read_dash(pos),
        '<' => self.read_less_than(pos),
        '>' => self.read_greater_than(pos),
        '!' => self.read_bang(pos),
        '/' => self.read_slash(pos),
        '\'' | '"' => self.read_string(pos),
        '$' => self.read_parameter(pos),
        '0'..='9' => self.read_number(pos),
        'a'..='z' | 'A'..='Z' | '_' => self.read_identifier_or_keyword(pos),
        _ => Err(self.lex_error(pos, format!("Unexpected character: '{ch}'"))),
    }
}
/// Consumes and returns the next (byte offset, char) pair, if any remain.
fn advance(&mut self) -> Option<(usize, char)> {
    self.chars.next()
}
/// Returns the next character without consuming it.
fn peek_char(&mut self) -> Option<char> {
    self.chars.peek().map(|(_, c)| *c)
}
/// Builds a lex error located at `position` carrying `message`.
fn lex_error(&self, position: usize, message: String) -> CypherError {
    CypherError::LexError { position, message }
}
/// Skips runs of whitespace and `//` line comments (a comment runs to the end
/// of the line; its terminating newline is consumed too). Block comments are
/// not supported.
fn skip_whitespace_and_comments(&mut self) {
    loop {
        // Drop any whitespace run.
        while matches!(self.chars.peek(), Some(&(_, c)) if c.is_whitespace()) {
            self.advance();
        }
        // A lone '/' is the division token, so peek one char further before
        // committing to a comment.
        if self.peek_char() == Some('/') {
            let mut ahead = self.chars.clone();
            ahead.next();
            if matches!(ahead.peek(), Some(&(_, '/'))) {
                self.advance(); // first '/'
                self.advance(); // second '/'
                while let Some((_, c)) = self.advance() {
                    if c == '\n' {
                        break;
                    }
                }
                // More whitespace/comments may follow; go around again.
                continue;
            }
        }
        return;
    }
}
/// Lexes '.' — either a range operator ".." or a plain property-access dot.
fn read_dot(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance(); // first '.'
    match self.peek_char() {
        Some('.') => {
            self.advance();
            Ok(Token::new(TokenKind::DotDot, "..", start))
        }
        _ => Ok(Token::new(TokenKind::Dot, ".", start)),
    }
}
/// Lexes '-' — either the relationship arrow "->" or a bare dash
/// (minus / undirected relationship).
fn read_dash(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance(); // '-'
    match self.peek_char() {
        Some('>') => {
            self.advance();
            Ok(Token::new(TokenKind::Arrow, "->", start))
        }
        _ => Ok(Token::new(TokenKind::Dash, "-", start)),
    }
}
/// Lexes '<' and its two-character forms: "<=", "<>" (not-equals) and "<-"
/// (incoming relationship arrow).
fn read_less_than(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance(); // '<'
    let (kind, text) = match self.peek_char() {
        Some('=') => {
            self.advance();
            (TokenKind::Le, "<=")
        }
        Some('>') => {
            self.advance();
            (TokenKind::Ne, "<>")
        }
        Some('-') => {
            self.advance();
            (TokenKind::LeftArrow, "<-")
        }
        _ => (TokenKind::Lt, "<"),
    };
    Ok(Token::new(kind, text, start))
}
/// Lexes '>' — either ">=" or a plain greater-than.
fn read_greater_than(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance(); // '>'
    match self.peek_char() {
        Some('=') => {
            self.advance();
            Ok(Token::new(TokenKind::Ge, ">=", start))
        }
        _ => Ok(Token::new(TokenKind::Gt, ">", start)),
    }
}
/// Lexes '!' — only valid as part of "!=". A bare '!' is a lex error.
fn read_bang(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance(); // '!'
    if self.peek_char() != Some('=') {
        return Err(self.lex_error(start, "Expected '=' after '!'".to_string()));
    }
    self.advance();
    Ok(Token::new(TokenKind::Ne, "!=", start))
}
/// Lexes a division '/'. Comment starts ("//") were already consumed by
/// skip_whitespace_and_comments, so a '/' reaching here is always division.
fn read_slash(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance();
    Ok(Token::new(TokenKind::Slash, "/", start))
}
/// Lexes a single- or double-quoted string literal starting at `start`; the
/// token text is the unescaped contents without the delimiters.
///
/// Recognized escapes: `\\`, `\n`, `\t`, `\r`, and an escaped quote. Any other
/// backslash sequence is kept verbatim (backslash plus the following char).
///
/// # Errors
/// Returns a lex error when the input ends before the closing delimiter.
fn read_string(&mut self, start: usize) -> Result<Token, CypherError> {
    // The opening quote is guaranteed present: next_token dispatched on it.
    let (_, quote) = self.advance().unwrap();
    let mut value = String::new();
    loop {
        match self.advance() {
            Some((_, ch)) if ch == quote => {
                return Ok(Token::new(TokenKind::StringLiteral, value, start));
            }
            Some((_, '\\')) => {
                match self.advance() {
                    // Fix: accept an escaped quote of either kind, not only the
                    // active delimiter — Cypher allows \' and \" in any string.
                    // Previously \" inside '...' was kept as a literal backslash
                    // followed by the quote.
                    Some((_, c @ ('\'' | '"'))) => value.push(c),
                    Some((_, '\\')) => value.push('\\'),
                    Some((_, 'n')) => value.push('\n'),
                    Some((_, 't')) => value.push('\t'),
                    Some((_, 'r')) => value.push('\r'),
                    Some((_, other)) => {
                        // Unknown escape: preserve it verbatim rather than error.
                        value.push('\\');
                        value.push(other);
                    }
                    None => {
                        return Err(
                            self.lex_error(start, "Unterminated string literal".to_string())
                        );
                    }
                }
            }
            Some((_, ch)) => value.push(ch),
            None => {
                return Err(self.lex_error(start, "Unterminated string literal".to_string()));
            }
        }
    }
}
/// Lexes a query parameter `$name`; the token text is the name only, without
/// the '$'. Names are alphanumerics and underscores.
///
/// # Errors
/// Returns a lex error when no name character follows the '$'.
fn read_parameter(&mut self, start: usize) -> Result<Token, CypherError> {
    self.advance(); // '$'
    let mut name = String::new();
    while let Some(&(_, c)) = self.chars.peek() {
        if !(c.is_alphanumeric() || c == '_') {
            break;
        }
        name.push(c);
        self.advance();
    }
    if name.is_empty() {
        Err(self.lex_error(start, "Expected parameter name after '$'".to_string()))
    } else {
        Ok(Token::new(TokenKind::Parameter, name, start))
    }
}
/// Lexes an integer or float literal. A '.' only begins a fractional part when
/// a digit follows it, so "1..3" lexes as integer, DotDot, integer rather than
/// as a malformed float. Exponent notation is not supported.
fn read_number(&mut self, start: usize) -> Result<Token, CypherError> {
    // Integer part (the first digit is still unconsumed when we get here).
    while matches!(self.chars.peek(), Some(&(_, c)) if c.is_ascii_digit()) {
        self.advance();
    }
    // Optional fractional part: commit to the '.' only after confirming a
    // digit follows, using a cloned iterator for the two-char lookahead.
    let mut is_float = false;
    if let Some(&(_, '.')) = self.chars.peek() {
        let mut ahead = self.chars.clone();
        ahead.next();
        if matches!(ahead.peek(), Some(&(_, c)) if c.is_ascii_digit()) {
            is_float = true;
            self.advance(); // '.'
            while matches!(self.chars.peek(), Some(&(_, c)) if c.is_ascii_digit()) {
                self.advance();
            }
        }
    }
    // Slice the literal text straight out of the input.
    let end = self.chars.peek().map_or(self.input.len(), |&(p, _)| p);
    let kind = if is_float {
        TokenKind::FloatLiteral
    } else {
        TokenKind::IntegerLiteral
    };
    Ok(Token::new(kind, &self.input[start..end], start))
}
/// Lexes an identifier (the first char was validated by next_token; the rest
/// may be alphanumerics or underscores) and classifies it: reserved words map
/// to keyword tokens, case-insensitively; anything else is an Identifier.
/// The token text always keeps the original casing.
fn read_identifier_or_keyword(&mut self, start: usize) -> Result<Token, CypherError> {
    while matches!(self.chars.peek(), Some(&(_, c)) if c.is_alphanumeric() || c == '_') {
        self.advance();
    }
    let end = self.chars.peek().map_or(self.input.len(), |&(p, _)| p);
    let text = &self.input[start..end];
    // Keyword lookup on the uppercased spelling.
    let kind = match text.to_uppercase().as_str() {
        "MATCH" => TokenKind::Match,
        "OPTIONAL" => TokenKind::OptionalMatch,
        "WHERE" => TokenKind::Where,
        "RETURN" => TokenKind::Return,
        "WITH" => TokenKind::With,
        "UNWIND" => TokenKind::Unwind,
        "ORDER" => TokenKind::Order,
        "BY" => TokenKind::By,
        "LIMIT" => TokenKind::Limit,
        "SKIP" => TokenKind::Skip,
        "AS" => TokenKind::As,
        "DISTINCT" => TokenKind::Distinct,
        "AND" => TokenKind::And,
        "OR" => TokenKind::Or,
        "NOT" => TokenKind::Not,
        "IN" => TokenKind::In,
        "IS" => TokenKind::Is,
        "ASC" => TokenKind::Asc,
        "DESC" => TokenKind::Desc,
        "TRUE" => TokenKind::True,
        "FALSE" => TokenKind::False,
        "NULL" => TokenKind::Null,
        "CONTAINS" => TokenKind::Contains,
        "STARTS" => TokenKind::StartsWith,
        "ENDS" => TokenKind::EndsWith,
        "COUNT" => TokenKind::Count,
        "COLLECT" => TokenKind::Collect,
        "AVG" => TokenKind::Avg,
        "SUM" => TokenKind::Sum,
        "MIN" => TokenKind::Min,
        "MAX" => TokenKind::Max,
        "OF" => TokenKind::Of,
        "TIMESTAMP" => TokenKind::Timestamp,
        "BETWEEN" => TokenKind::Between,
        "FOR" => TokenKind::For,
        "SYSTEM_TIME" => TokenKind::SystemTime,
        "VALID_TIME" => TokenKind::ValidTime,
        _ => TokenKind::Identifier,
    };
    Ok(Token::new(kind, text, start))
}
}
#[cfg(test)]
mod unit_tests {
    use super::*;

    /// Every reserved word must lex to a dedicated keyword token, never to
    /// a plain Identifier.
    #[test]
    fn keyword_lookup_exhaustive() {
        const KEYWORDS: &[&str] = &[
            "MATCH",
            "OPTIONAL",
            "WHERE",
            "RETURN",
            "WITH",
            "UNWIND",
            "ORDER",
            "BY",
            "LIMIT",
            "SKIP",
            "AS",
            "DISTINCT",
            "AND",
            "OR",
            "NOT",
            "IN",
            "IS",
            "ASC",
            "DESC",
            "TRUE",
            "FALSE",
            "NULL",
            "CONTAINS",
            "STARTS",
            "ENDS",
            "COUNT",
            "COLLECT",
            "AVG",
            "SUM",
            "MIN",
            "MAX",
            "OF",
            "TIMESTAMP",
            "BETWEEN",
            "FOR",
            "SYSTEM_TIME",
            "VALID_TIME",
        ];
        for &kw in KEYWORDS {
            let tokens = CypherLexer::tokenize(kw).unwrap();
            assert_ne!(
                tokens[0].kind,
                TokenKind::Identifier,
                "{kw} should be a keyword, not an identifier"
            );
        }
    }
}