pub(crate) mod locy_parser;
mod locy_walker;
mod walker;
use crate::ast::{Expr, Query};
use crate::locy_ast::LocyProgram;
use pest::Parser;
use pest_derive::Parser;
#[derive(Debug, thiserror::Error)]
#[error("{message}")]
pub struct ParseError {
message: String,
}
impl ParseError {
pub fn new(message: String) -> Self {
Self { message }
}
}
#[derive(Parser)]
#[grammar = "grammar/cypher.pest"]
pub struct CypherParser;
const MAX_NESTING_DEPTH: u32 = 200;
fn check_nesting_depth(input: &str) -> Result<(), ParseError> {
let bytes = input.as_bytes();
let mut i = 0usize;
let mut depth: i32 = 0;
let mut max_depth: i32 = 0;
while i < bytes.len() {
match bytes[i] {
quote @ (b'\'' | b'"') => {
i += 1;
while i < bytes.len() {
match bytes[i] {
b'\\' => i += 2,
c if c == quote => {
i += 1;
break;
}
_ => i += 1,
}
}
}
b'`' => {
i += 1;
while i < bytes.len() && bytes[i] != b'`' {
i += 1;
}
i += 1;
}
b'/' if bytes.get(i + 1) == Some(&b'/') => {
i += 2;
while i < bytes.len() && bytes[i] != b'\n' {
i += 1;
}
}
b'/' if bytes.get(i + 1) == Some(&b'*') => {
i += 2;
while i < bytes.len() && !(bytes[i] == b'*' && bytes.get(i + 1) == Some(&b'/')) {
i += 1;
}
i += 2;
}
b'(' | b'[' | b'{' => {
depth += 1;
max_depth = max_depth.max(depth);
i += 1;
}
b')' | b']' | b'}' => {
depth = (depth - 1).max(0);
i += 1;
}
b if b.is_ascii_alphabetic() || b == b'_' => {
let start = i;
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
i += 1;
}
let word = &input[start..i];
if word.eq_ignore_ascii_case("case") {
depth += 1;
max_depth = max_depth.max(depth);
} else if word.eq_ignore_ascii_case("end") {
depth = (depth - 1).max(0);
}
}
_ => i += 1,
}
if max_depth as u32 > MAX_NESTING_DEPTH {
return Err(ParseError::new(format!(
"SyntaxError: NestingTooDeep - query nesting exceeds the maximum \
supported depth ({MAX_NESTING_DEPTH})"
)));
}
}
Ok(())
}
pub fn parse(input: &str) -> Result<Query, ParseError> {
check_nesting_depth(input)?;
let pairs = CypherParser::parse(Rule::query, input).map_err(|e| map_pest_error(input, e))?;
walker::build_query(pairs)
}
pub fn parse_expression(input: &str) -> Result<Expr, ParseError> {
check_nesting_depth(input)?;
let pairs =
CypherParser::parse(Rule::expression, input).map_err(|e| map_pest_error(input, e))?;
walker::build_expression(pairs.into_iter().next().unwrap())
}
pub fn parse_locy(input: &str) -> Result<LocyProgram, ParseError> {
use locy_parser::LocyParser;
use locy_parser::Rule as LocyRule;
check_nesting_depth(input)?;
let pairs = LocyParser::parse(LocyRule::locy_query, input)
.map_err(|e| map_locy_pest_error(input, e))?;
locy_walker::build_program(pairs.into_iter().next().unwrap())
}
fn expects_identifier(e: &pest::error::Error<Rule>) -> bool {
use pest::error::ErrorVariant;
match &e.variant {
ErrorVariant::ParsingError { positives, .. } => positives
.iter()
.any(|r| matches!(r, Rule::identifier | Rule::identifier_or_keyword)),
_ => false,
}
}
fn error_position<R: pest::RuleType>(e: &pest::error::Error<R>) -> usize {
match e.location {
pest::error::InputLocation::Pos(p) => p,
pest::error::InputLocation::Span((s, _)) => s,
}
}
fn extract_token_span_at(input: &str, pos: usize) -> Option<(usize, usize)> {
let bytes = input.as_bytes();
if bytes.is_empty() {
return None;
}
let mut p = pos.min(bytes.len() - 1);
let is_token_char =
|b: u8| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.' | b'#' | b'$');
if !is_token_char(bytes[p]) {
if p == 0 || !is_token_char(bytes[p - 1]) {
return None;
}
p -= 1;
}
let mut start = p;
while start > 0 && is_token_char(bytes[start - 1]) {
start -= 1;
}
let mut end = p;
while end < bytes.len() && is_token_char(bytes[end]) {
end += 1;
}
Some((start, end))
}
fn is_map_key_like_context(input: &str, start: usize, end: usize) -> bool {
let bytes = input.as_bytes();
if bytes.is_empty() || start >= bytes.len() || end > bytes.len() {
return false;
}
let mut colon_pos = end;
while colon_pos < bytes.len() && bytes[colon_pos].is_ascii_whitespace() {
colon_pos += 1;
}
if colon_pos >= bytes.len() || bytes[colon_pos] != b':' {
return false;
}
let mut prev_pos = start;
while prev_pos > 0 && bytes[prev_pos - 1].is_ascii_whitespace() {
prev_pos -= 1;
}
if prev_pos == 0 {
return false;
}
matches!(bytes[prev_pos - 1], b'{' | b',')
}
fn relationship_bracket_segment(input: &str, pos: usize) -> Option<&str> {
let pos = pos.min(input.len());
let before = &input[..pos];
let start = before.rfind('[')?;
let prefix = &input[..start];
if !prefix.trim_end().ends_with('-') {
return None;
}
let after = &input[start..];
let end = after.find(']').map(|i| start + i + 1).unwrap_or(pos);
Some(&input[start..end])
}
fn is_invalid_relationship_pattern(input: &str, pos: usize) -> bool {
let Some(segment) = relationship_bracket_segment(input, pos) else {
return false;
};
(segment.contains("..") && !segment.contains('*')) || segment.contains("*-")
}
fn is_invalid_number_literal(input: &str, pos: usize) -> bool {
let Some((start, end)) = extract_token_span_at(input, pos) else {
return false;
};
if is_map_key_like_context(input, start, end) {
return false;
}
let token = &input[start..end];
let t = token.strip_prefix('-').unwrap_or(token);
if !t.as_bytes().first().is_some_and(|b| b.is_ascii_digit()) {
return false;
}
let has_only = |digits: &str, valid: fn(&char) -> bool| {
digits.is_empty() || !digits.chars().all(|c| valid(&c) || c == '_')
};
if let Some(digits) = t.strip_prefix("0x").or_else(|| t.strip_prefix("0X")) {
return has_only(digits, char::is_ascii_hexdigit);
}
if let Some(digits) = t.strip_prefix("0o").or_else(|| t.strip_prefix("0O")) {
return has_only(digits, |c| matches!(c, '0'..='7'));
}
t.chars().any(|c| c.is_ascii_alphabetic())
}
fn invalid_unicode_character(input: &str, pos: usize) -> Option<char> {
let ch = input.get(pos..)?.chars().next()?;
matches!(ch, '—' | '–' | '−').then_some(ch)
}
const CYPHER_RESERVED_KEYWORDS: &[&str] = &[
"match",
"optional",
"where",
"create",
"merge",
"set",
"remove",
"delete",
"detach",
"return",
"with",
"unwind",
"union",
"call",
"yield",
"distinct",
"order",
"by",
"asc",
"desc",
"skip",
"limit",
"as",
"and",
"or",
"xor",
"not",
"in",
"contains",
"starts",
"ends",
"is",
"null",
"true",
"false",
"case",
"when",
"then",
"else",
"if",
"from",
"to",
"on",
"drop",
"alter",
"show",
"over",
"partition",
"explain",
"recursive",
"valid_at",
"each",
];
const LOCY_RESERVED_KEYWORDS: &[&str] = &[
"rule", "along", "prev", "fold", "best", "derive", "assume", "abduce", "query",
];
fn reserved_keyword_at(input: &str, pos: usize, extra_keywords: &[&str]) -> Option<String> {
let (start, end) = extract_token_span_at(input, pos)?;
let token = &input[start..end];
let lower = token.to_lowercase();
if CYPHER_RESERVED_KEYWORDS.contains(&lower.as_str())
|| extra_keywords.contains(&lower.as_str())
{
Some(token.to_string())
} else {
None
}
}
fn locy_context_category(input: &str, pos: usize) -> Option<&'static str> {
let before = input[..pos].trim_end();
let before_upper = before.to_uppercase();
if before_upper.ends_with("BEST BY") {
return Some("InvalidBestByClause");
}
if before_upper.ends_with("ALONG") {
return Some("InvalidAlongClause");
}
if before_upper.ends_with("FOLD") {
return Some("InvalidFoldClause");
}
if before_upper.ends_with("ASSUME") {
return Some("InvalidAssumeBlock");
}
if before_upper.ends_with("DERIVE") {
return Some("InvalidDeriveCommand");
}
if before_upper.contains("CREATE RULE") {
return Some("InvalidRuleDefinition");
}
if before_upper.ends_with("QUERY") && !before_upper.contains("CREATE RULE") {
return Some("InvalidGoalQuery");
}
None
}
fn map_locy_pest_error(input: &str, e: pest::error::Error<locy_parser::Rule>) -> ParseError {
let pos = error_position(&e);
if is_invalid_relationship_pattern(input, pos) {
return ParseError::new(format!("LocySyntaxError: InvalidRelationshipPattern - {e}"));
}
if is_invalid_number_literal(input, pos) {
return ParseError::new(format!("LocySyntaxError: InvalidNumberLiteral - {e}"));
}
if let Some(ch) = invalid_unicode_character(input, pos) {
return ParseError::new(format!(
"LocySyntaxError: InvalidUnicodeCharacter - Invalid character '{ch}'"
));
}
if let Some(kw) = reserved_keyword_at(input, pos, LOCY_RESERVED_KEYWORDS) {
return ParseError::new(format!(
"LocySyntaxError: ReservedKeyword - \"{kw}\" is a reserved keyword \
and cannot be used as a variable name. Use backtick-quoting: `{kw}`\n{e}"
));
}
if let Some(category) = locy_context_category(input, pos) {
return ParseError::new(format!("LocySyntaxError: {category} - {e}"));
}
ParseError::new(format!("LocySyntaxError: {e}"))
}
fn map_pest_error(input: &str, e: pest::error::Error<Rule>) -> ParseError {
let pos = error_position(&e);
if is_invalid_relationship_pattern(input, pos) {
return ParseError::new(format!("SyntaxError: InvalidRelationshipPattern - {e}"));
}
if is_invalid_number_literal(input, pos) {
return ParseError::new(format!("SyntaxError: InvalidNumberLiteral - {e}"));
}
if let Some(ch) = invalid_unicode_character(input, pos) {
return ParseError::new(format!(
"SyntaxError: InvalidUnicodeCharacter - Invalid character '{ch}'"
));
}
if let Some(kw) = expects_identifier(&e)
.then(|| reserved_keyword_at(input, pos, &[]))
.flatten()
{
return ParseError::new(format!(
"SyntaxError: ReservedKeyword - \"{kw}\" is a reserved keyword \
and cannot be used as a variable name. Use backtick-quoting: `{kw}`\n{e}"
));
}
ParseError::new(format!("UnexpectedSyntax: {e}"))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_expression_parsing() {
let cases = [
("1", Rule::integer),
("3.14", Rule::float),
("'hello'", Rule::string),
("n.name", Rule::expression),
("1 + 2", Rule::expression),
("a AND b OR c", Rule::expression),
];
for (input, rule) in cases {
let result = CypherParser::parse(rule, input);
assert!(
result.is_ok(),
"Failed to parse '{}' as {:?}: {:?}",
input,
rule,
result.err()
);
}
}
#[test]
fn test_list_expressions() {
assert!(parse_expression("[]").is_ok());
assert!(parse_expression("[1, 2, 3]").is_ok());
assert!(parse_expression("[x IN range(1,10) | x * 2]").is_ok());
assert!(parse_expression("[x IN list WHERE x > 5 | x]").is_ok());
assert!(parse_expression("[(n)-[:KNOWS]->(m) | m.name]").is_ok());
assert!(parse_expression("[p = (n)-->(m) WHERE m.age > 30 | p]").is_ok());
}
#[test]
fn test_ambiguous_cases() {
assert!(parse_expression("[n]").is_ok()); assert!(parse_expression("[n.name]").is_ok()); assert!(parse_expression("[n IN list]").is_ok());
assert!(parse_expression("[(n)]").is_ok()); }
fn parse_err_msg(input: &str) -> String {
parse(input).unwrap_err().to_string()
}
#[test]
fn test_invalid_relationship_pattern_missing_star_error_code() {
let msg = parse_err_msg("MATCH (a:A)\nMATCH (a)-[:LIKES..]->(c)\nRETURN c.name");
assert!(
msg.contains("InvalidRelationshipPattern"),
"expected InvalidRelationshipPattern, got: {msg}"
);
}
#[test]
fn test_invalid_number_literal_error_code_decimal_alpha() {
let msg = parse_err_msg("RETURN 9223372h54775808 AS literal");
assert!(
msg.contains("InvalidNumberLiteral"),
"expected InvalidNumberLiteral, got: {msg}"
);
}
#[test]
fn test_invalid_number_literal_error_code_hex_prefix_only() {
let msg = parse_err_msg("RETURN 0x AS literal");
assert!(
msg.contains("InvalidNumberLiteral"),
"expected InvalidNumberLiteral, got: {msg}"
);
}
#[test]
fn test_invalid_unicode_character_error_code() {
let msg = parse_err_msg("RETURN 42 — 41");
assert!(
msg.contains("InvalidUnicodeCharacter"),
"expected InvalidUnicodeCharacter, got: {msg}"
);
}
#[test]
fn test_symbol_in_number_stays_unexpected_syntax() {
let msg = parse_err_msg("RETURN 9223372#54775808 AS literal");
assert!(
msg.contains("UnexpectedSyntax"),
"expected UnexpectedSyntax, got: {msg}"
);
}
#[test]
fn test_map_key_starting_with_number_stays_unexpected_syntax() {
let msg = parse_err_msg("RETURN {1B2c3e67:1} AS literal");
assert!(
msg.contains("UnexpectedSyntax"),
"expected UnexpectedSyntax, got: {msg}"
);
}
#[test]
fn test_unary_minus_double() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("--5").expect("--5 should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(5)));
}
#[test]
fn test_unary_minus_single() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("-5").expect("-5 should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(-5)));
}
#[test]
fn test_unary_minus_triple() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("---5").expect("---5 should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(-5)));
}
#[test]
fn test_unary_plus_identity() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("+5").expect("+5 should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(5)));
}
#[test]
fn test_unary_plus_minus() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("+-5").expect("+-5 should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(-5)));
}
#[test]
fn test_unary_minus_plus() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("-+5").expect("-+5 should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(-5)));
}
#[test]
fn test_unary_double_minus_overflow() {
let result = parse_expression("--9223372036854775808");
assert!(
result.is_err(),
"expected overflow error, got: {:?}",
result
);
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("IntegerOverflow"),
"expected IntegerOverflow, got: {msg}"
);
}
#[test]
fn test_unary_minus_i64_min() {
use crate::ast::{CypherLiteral, Expr};
let expr = parse_expression("-9223372036854775808").expect("-i64::MIN should parse");
assert_eq!(expr, Expr::Literal(CypherLiteral::Integer(i64::MIN)));
}
#[test]
fn test_stacked_predicates_is_null_is_not_null() {
let result = parse("RETURN x IS NULL IS NOT NULL");
assert!(
result.is_err(),
"expected parse error for stacked IS NULL IS NOT NULL"
);
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("InvalidPredicateChain"),
"expected InvalidPredicateChain, got: {msg}"
);
}
#[test]
fn test_stacked_predicates_starts_with() {
let result = parse("RETURN x STARTS WITH 'a' STARTS WITH 'b'");
assert!(
result.is_err(),
"expected parse error for stacked STARTS WITH"
);
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("InvalidPredicateChain"),
"expected InvalidPredicateChain, got: {msg}"
);
}
#[test]
fn test_stacked_predicates_in() {
let result = parse("RETURN x IN [1] IN [true]");
assert!(result.is_err(), "expected parse error for stacked IN");
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("InvalidPredicateChain"),
"expected InvalidPredicateChain, got: {msg}"
);
}
#[test]
fn test_stacked_predicates_contains_ends_with() {
let result = parse("RETURN x CONTAINS 'a' ENDS WITH 'b'");
assert!(
result.is_err(),
"expected parse error for stacked CONTAINS/ENDS WITH"
);
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("InvalidPredicateChain"),
"expected InvalidPredicateChain, got: {msg}"
);
}
#[test]
fn test_label_stacking_allowed() {
assert!(
parse("MATCH (x) WHERE x:Person:Employee RETURN x").is_ok(),
"label stacking should be allowed"
);
}
#[test]
fn test_range_chaining_allowed() {
assert!(
parse("MATCH (n) WHERE 1 < n.num < 3 RETURN n").is_ok(),
"range chaining 1 < n.num < 3 should be allowed"
);
}
#[test]
fn test_reserved_keyword_as_variable_name() {
let msg = parse_err_msg("MATCH (match:N) RETURN match");
assert!(
msg.contains("ReservedKeyword"),
"expected ReservedKeyword, got: {msg}"
);
assert!(
msg.contains("backtick-quoting"),
"expected backtick hint, got: {msg}"
);
}
#[test]
fn test_reserved_keyword_return_as_variable() {
let msg = parse_err_msg("MATCH (return:N) RETURN return");
assert!(
msg.contains("ReservedKeyword"),
"expected ReservedKeyword, got: {msg}"
);
}
#[test]
fn test_non_reserved_keyword_allowed() {
assert!(
parse("MATCH (end:N) RETURN end").is_ok(),
"non-reserved keyword 'end' should be allowed as variable name"
);
}
#[test]
fn test_backtick_escaped_reserved_keyword() {
assert!(
parse("MATCH (`match`:N) RETURN `match`").is_ok(),
"backtick-escaped reserved keyword should be allowed"
);
}
}