Skip to main content

reddb_rql/parser/
mod.rs

1//! RQL Parser
2//!
3//! Parses RQL (RedDB Query Language) tokens into a unified AST.
4//! Supports SQL-like table queries, Cypher-like graph patterns, and joins.
5//!
6//! # Supported Syntax
7//!
8//! ## Table Queries
9//! ```text
10//! SELECT [columns] FROM table [WHERE condition] [ORDER BY ...] [LIMIT n]
11//! ```
12//!
13//! ## Graph Queries
14//! ```text
15//! MATCH pattern [WHERE condition] RETURN projection
16//! ```
17//!
18//! ## Join Queries
19//! ```text
20//! FROM table [alias] JOIN GRAPH pattern ON condition [RETURN projection]
21//! ```
22//!
23//! ## Path Queries
24//! ```text
25//! PATH FROM selector TO selector [VIA edge_types] [RETURN projection]
26//! ```
27//!
28//! ## Vector Queries
29//! ```text
30//! VECTOR SEARCH collection SIMILAR TO [...] [WHERE ...] [METRIC ...] LIMIT k
31//! ```
32//!
33//! ## Hybrid Queries
34//! ```text
35//! HYBRID (structured query) VECTOR SEARCH ... FUSION strategy LIMIT n
36//! ```
37
38mod auth_ddl;
39mod config;
40mod cte;
41mod ddl;
42mod dml;
43mod error;
44mod expr;
45mod filter;
46mod graph;
47mod graph_commands;
48mod hybrid;
49mod index_ddl;
50mod join;
51mod kv;
52pub mod limits;
53mod migration;
54mod path;
55mod probabilistic_commands;
56mod queue;
57mod search_commands;
58mod table;
59mod timeseries;
60mod tree;
61mod vector;
62
63#[cfg(test)]
64mod json_literal_table;
65#[cfg(test)]
66mod property_tests;
67#[cfg(test)]
68mod tests;
69
70pub use error::{ParseError, ParseErrorKind, SafeTokenDisplay};
71pub use limits::ParserLimits;
72
73use crate::ast::{QueryExpr, QueryWithCte, Span};
74use crate::lexer::{Lexer, Position, Spanned, Token};
75use limits::DepthCounter;
76use reddb_types::types::Value;
77
78/// RQL Parser
79pub struct Parser<'a> {
80    lexer: Lexer<'a>,
81    /// Current token
82    pub(crate) current: Spanned,
83    /// Recursion-depth tracker. Each enter/exit of a recursive
84    /// descent point should bracket itself with [`enter_depth`] /
85    /// [`exit_depth`] (see `parse_expr_prec`).
86    pub(crate) depth: DepthCounter,
87    /// Tracks placeholder style used so far in this statement.
88    /// Mixing `$N` and `?` in one statement is a parse error
89    /// (PRD #351 / issue #354).
90    pub(crate) placeholder_mode: PlaceholderMode,
91    /// Counter for `?` positional placeholders, numbered 1-based
92    /// in source. Unused when mode is `Dollar` or `None`.
93    pub(crate) question_count: usize,
94    /// Number of tokens consumed through `advance`.
95    pub(crate) tokens_consumed: usize,
96    /// Maximum token budget for this parse.
97    pub(crate) max_tokens: usize,
98}
99
100/// Placeholder style locked in by the first placeholder seen.
101#[derive(Debug, Clone, Copy, PartialEq, Eq)]
102pub(crate) enum PlaceholderMode {
103    None,
104    Dollar,
105    Question,
106}
107
108impl<'a> Parser<'a> {
109    /// Create a new parser with default DoS limits.
110    pub fn new(input: &'a str) -> Result<Self, ParseError> {
111        Self::with_limits(input, ParserLimits::default())
112    }
113
114    /// Create a new parser with custom DoS limits.
115    pub fn with_limits(input: &'a str, limits: ParserLimits) -> Result<Self, ParseError> {
116        // Input-byte cap is enforced before the lexer is even
117        // constructed: the lexer holds a `Chars` iterator over the
118        // input slice, so refusing oversized input here keeps the
119        // pathological case (10 GiB blob) cheap.
120        if input.len() > limits.max_input_bytes {
121            return Err(ParseError::input_too_large(
122                "max_input_bytes",
123                limits.max_input_bytes,
124                Position::new(1, 1, 0),
125            ));
126        }
127        let mut lexer = Lexer::with_limits(input, limits);
128        let current = lexer.next_token()?;
129        Ok(Self {
130            lexer,
131            current,
132            depth: DepthCounter::new(limits.max_depth),
133            placeholder_mode: PlaceholderMode::None,
134            question_count: 0,
135            tokens_consumed: 0,
136            max_tokens: limits.max_tokens,
137        })
138    }
139
140    /// Increment the recursion-depth counter or bail with
141    /// `ParseError::DepthLimit`. Pair with [`exit_depth`] (use the
142    /// `with_depth` helper for RAII-style bracketing).
143    pub(crate) fn enter_depth(&mut self) -> Result<(), ParseError> {
144        self.depth.depth += 1;
145        if self.depth.depth > self.depth.max_depth {
146            return Err(ParseError::depth_limit(
147                "max_depth",
148                self.depth.max_depth,
149                self.position(),
150            ));
151        }
152        Ok(())
153    }
154
155    /// Decrement the recursion-depth counter. Always called on the
156    /// success path; on the error path, the counter is rebalanced
157    /// when `Parser` itself is dropped (it's only consulted while
158    /// parsing is in progress).
159    pub(crate) fn exit_depth(&mut self) {
160        if self.depth.depth > 0 {
161            self.depth.depth -= 1;
162        }
163    }
164
165    /// Get current position
166    pub fn position(&self) -> Position {
167        self.current.start
168    }
169
170    /// Advance to next token
171    pub fn advance(&mut self) -> Result<Token, ParseError> {
172        self.tokens_consumed += 1;
173        if self.tokens_consumed > self.max_tokens {
174            return Err(ParseError::token_limit(
175                "max_tokens",
176                self.max_tokens,
177                self.position(),
178            ));
179        }
180        let old = std::mem::replace(&mut self.current, self.lexer.next_token()?);
181        Ok(old.token)
182    }
183
184    /// Peek at current token
185    pub fn peek(&self) -> &Token {
186        &self.current.token
187    }
188
189    /// Peek one token past the current parser position without consuming it.
190    pub fn peek_next(&mut self) -> Result<&Token, ParseError> {
191        Ok(&self.lexer.peek_token()?.token)
192    }
193
194    /// Check if current token matches
195    pub fn check(&self, expected: &Token) -> bool {
196        std::mem::discriminant(&self.current.token) == std::mem::discriminant(expected)
197    }
198
199    /// Check if current token is a specific keyword
200    pub fn check_keyword(&self, keyword: &Token) -> bool {
201        self.check(keyword)
202    }
203
204    /// Consume a specific token or error
205    pub fn expect(&mut self, expected: Token) -> Result<Token, ParseError> {
206        if self.check(&expected) {
207            self.advance()
208        } else {
209            Err(ParseError::expected(
210                vec![&expected.to_string()],
211                &self.current.token,
212                self.position(),
213            ))
214        }
215    }
216
217    /// Consume an identifier and return its value
218    pub fn expect_ident(&mut self) -> Result<String, ParseError> {
219        match &self.current.token {
220            Token::Ident(name) => {
221                let name = name.clone();
222                self.advance()?;
223                Ok(name)
224            }
225            other => Err(ParseError::expected(
226                vec!["identifier"],
227                other,
228                self.position(),
229            )),
230        }
231    }
232
233    /// Consume an identifier or aggregate keyword when the grammar expects
234    /// a user-defined column name.
235    pub fn expect_column_ident(&mut self) -> Result<String, ParseError> {
236        let name = match &self.current.token {
237            Token::Ident(name) => name.clone(),
238            Token::Count => "count".to_string(),
239            Token::Sum => "sum".to_string(),
240            Token::Avg => "avg".to_string(),
241            Token::Min => "min".to_string(),
242            Token::Max => "max".to_string(),
243            Token::Weight => "weight".to_string(),
244            other => {
245                return Err(ParseError::expected(
246                    vec!["identifier"],
247                    other,
248                    self.position(),
249                ));
250            }
251        };
252        self.advance()?;
253        Ok(name)
254    }
255
256    /// Consume an identifier or keyword (for type names where keywords are valid)
257    pub fn expect_ident_or_keyword(&mut self) -> Result<String, ParseError> {
258        // Get the string representation of the current token
259        let name = match &self.current.token {
260            Token::Ident(name) => name.clone(),
261            Token::Count => "count".to_string(),
262            Token::Sum => "sum".to_string(),
263            Token::Avg => "avg".to_string(),
264            Token::Min => "min".to_string(),
265            Token::Max => "max".to_string(),
266            // Keywords that can be type names (convert to uppercase for type matching)
267            Token::Contains => "CONTAINS".to_string(),
268            Token::Left => "LEFT".to_string(),
269            Token::Right => "RIGHT".to_string(),
270            Token::First => "FIRST".to_string(),
271            Token::Last => "LAST".to_string(),
272            Token::In => "IN".to_string(),
273            Token::By => "BY".to_string(),
274            // Any other keyword - use its display string
275            other => other.to_string(),
276        };
277
278        // Only advance for valid type-name-like tokens
279        match &self.current.token {
280            // Identifiers are always valid
281            Token::Ident(_) => {
282                self.advance()?;
283                Ok(name)
284            }
285            // These keywords can be type names
286            Token::Contains
287            | Token::Left
288            | Token::Right
289            | Token::First
290            | Token::Last
291            | Token::In
292            | Token::By => {
293                self.advance()?;
294                Ok(name)
295            }
296            // Reject structural tokens that can't be type names
297            Token::Eof
298            | Token::LParen
299            | Token::RParen
300            | Token::LBracket
301            | Token::RBracket
302            | Token::Comma
303            | Token::Dot
304            | Token::Eq
305            | Token::Lt
306            | Token::Gt
307            | Token::Le
308            | Token::Ge
309            | Token::Arrow
310            | Token::ArrowLeft
311            | Token::Dash
312            | Token::Colon
313            | Token::Semi
314            | Token::Star
315            | Token::Plus
316            | Token::Slash => Err(ParseError::expected(
317                vec!["identifier or type name"],
318                &self.current.token,
319                self.position(),
320            )),
321            // All other keywords can potentially be type names
322            _ => {
323                self.advance()?;
324                Ok(name)
325            }
326        }
327    }
328
329    /// Try to consume a token, returning true if consumed
330    pub fn consume(&mut self, expected: &Token) -> Result<bool, ParseError> {
331        if self.check(expected) {
332            self.advance()?;
333            Ok(true)
334        } else {
335            Ok(false)
336        }
337    }
338
339    /// Try to consume an identifier case-insensitively (for keywords not in Token enum)
340    pub fn consume_ident_ci(&mut self, expected: &str) -> Result<bool, ParseError> {
341        match self.peek().clone() {
342            Token::Ident(name) if name.eq_ignore_ascii_case(expected) => {
343                self.advance()?;
344                Ok(true)
345            }
346            _ => Ok(false),
347        }
348    }
349
350    /// Parse a complete query
351    pub fn parse(&mut self) -> Result<QueryExpr, ParseError> {
352        let query = self.parse_frontend_statement()?.into_query_expr();
353
354        // Expect end of input
355        if !self.check(&Token::Eof) {
356            return Err(ParseError::new(
357                // F-05: route the offending token through `SafeTokenDisplay`
358                // so the user-controlled `Ident` / `String` / `JsonLiteral`
359                // payloads are escaped before the message lands in the
360                // downstream JSON / audit / log / gRPC sinks.
361                format!(
362                    "Unexpected token after query: {}",
363                    error::SafeTokenDisplay(&self.current.token)
364                ),
365                self.position(),
366            ));
367        }
368
369        Ok(query)
370    }
371
372    /// Parse the main query expression (without CTEs)
373    pub fn parse_query_expr(&mut self) -> Result<QueryExpr, ParseError> {
374        self.parse_frontend_statement()
375            .map(|statement| statement.into_query_expr())
376    }
377
378    /// Parse an integer literal
379    pub fn parse_integer(&mut self) -> Result<i64, ParseError> {
380        match &self.current.token {
381            Token::Integer(n) => {
382                let n = *n;
383                self.advance()?;
384                Ok(n)
385            }
386            other => Err(ParseError::expected(
387                vec!["integer"],
388                other,
389                self.position(),
390            )),
391        }
392    }
393
394    /// Parse a `$N` or `?` placeholder in a non-Expr slot (e.g. the
395    /// `LIMIT` / `MIN_SCORE` slots of SEARCH SIMILAR; issue #361). The
396    /// `field` name is used only to enrich placeholder-mixing errors.
397    /// Returns the 0-based parameter index.
398    pub fn parse_param_slot(&mut self, field: &'static str) -> Result<usize, ParseError> {
399        match self.peek().clone() {
400            Token::Dollar => {
401                self.advance()?;
402                let n = match *self.peek() {
403                    Token::Integer(n) if n >= 1 => {
404                        self.advance()?;
405                        n as usize
406                    }
407                    _ => {
408                        return Err(ParseError::new(
409                            format!("expected `$N` (N >= 1) for {field} parameter"),
410                            self.position(),
411                        ));
412                    }
413                };
414                if self.placeholder_mode == PlaceholderMode::Question {
415                    return Err(ParseError::new(
416                        "cannot mix `?` and `$N` placeholders in one statement".to_string(),
417                        self.position(),
418                    ));
419                }
420                self.placeholder_mode = PlaceholderMode::Dollar;
421                Ok(n - 1)
422            }
423            Token::Question => {
424                let (index, _) = self.parse_question_param_index()?;
425                Ok(index)
426            }
427            other => Err(ParseError::expected(
428                vec!["$N", "?"],
429                &other,
430                self.position(),
431            )),
432        }
433    }
434
435    /// Parse a question-style positional placeholder. Bare `?` slots are
436    /// assigned left-to-right. Immediate `?N` slots use the explicit
437    /// 1-based index, matching `$N` without changing the placeholder
438    /// family.
439    pub(crate) fn parse_question_param_index(&mut self) -> Result<(usize, Span), ParseError> {
440        let start = self.position();
441        let question_end = self.current.end;
442        self.expect(Token::Question)?;
443        if self.placeholder_mode == PlaceholderMode::Dollar {
444            return Err(ParseError::new(
445                "cannot mix `?` and `$N` placeholders in one statement".to_string(),
446                self.position(),
447            ));
448        }
449        self.placeholder_mode = PlaceholderMode::Question;
450
451        if let Token::Integer(n) = *self.peek() {
452            if self.current.start.offset == question_end.offset {
453                if n < 1 {
454                    return Err(ParseError::new(
455                        "placeholder index must be >= 1".to_string(),
456                        self.position(),
457                    ));
458                }
459                let end = self.current.end;
460                self.advance()?;
461                let index = n as usize - 1;
462                self.question_count = self.question_count.max(index + 1);
463                return Ok((index, Span::new(start, end)));
464            }
465        }
466
467        self.question_count += 1;
468        Ok((self.question_count - 1, Span::new(start, question_end)))
469    }
470
471    /// Parse a strictly-positive integer literal (`> 0`).
472    ///
473    /// Surfaces a `ValueOutOfRange` error for `field` when the literal
474    /// is `0`, negative, or starts with a unary `-` (which the bare
475    /// `parse_integer` rejects with a confusing "expected: integer"
476    /// message). Used by modifier slots like `MAX_SIZE`, `CAPACITY`,
477    /// `WIDTH`, `DEPTH`, `K` where zero or negative values are
478    /// semantically meaningless.
479    pub fn parse_positive_integer(&mut self, field: &'static str) -> Result<i64, ParseError> {
480        let pos = self.position();
481        // Detect the unary-minus path up-front so we surface a
482        // clearer "must be positive" message instead of the generic
483        // "expected: integer".
484        if matches!(self.current.token, Token::Minus | Token::Dash) {
485            return Err(ParseError::value_out_of_range(
486                field,
487                "must be a positive integer",
488                pos,
489            ));
490        }
491        let raw = self.parse_integer()?;
492        if raw <= 0 {
493            return Err(ParseError::value_out_of_range(
494                field,
495                "must be a positive integer",
496                pos,
497            ));
498        }
499        Ok(raw)
500    }
501
502    /// Parse float literal. Accepts an optional leading unary `-`
503    /// followed by a `Float` or `Integer` token so positions like
504    /// vector literals (`[-0.1, …]`), `THRESHOLD -0.5`, `MIN_SCORE
505    /// -0.1`, `RERANK(-0.3)`, `UNION(0.7, -0.3)`, and geo coordinates
506    /// (`LATITUDE -33.86`) parse correctly. See bug #107.
507    pub fn parse_float(&mut self) -> Result<f64, ParseError> {
508        let negate = if matches!(self.current.token, Token::Minus | Token::Dash) {
509            self.advance()?;
510            true
511        } else {
512            false
513        };
514        let value = match &self.current.token {
515            Token::Float(n) => {
516                let n = *n;
517                self.advance()?;
518                n
519            }
520            Token::Integer(n) => {
521                let n = *n as f64;
522                self.advance()?;
523                n
524            }
525            other => {
526                return Err(ParseError::expected(vec!["number"], other, self.position()));
527            }
528        };
529        Ok(if negate { -value } else { value })
530    }
531
532    /// Parse a string literal
533    pub fn parse_string(&mut self) -> Result<String, ParseError> {
534        match &self.current.token {
535            Token::String(s) => {
536                let s = s.clone();
537                self.advance()?;
538                Ok(s)
539            }
540            other => Err(ParseError::expected(vec!["string"], other, self.position())),
541        }
542    }
543
544    /// Parse a value (delegates to parse_literal_value for full JSON support)
545    pub fn parse_value(&mut self) -> Result<Value, ParseError> {
546        self.parse_literal_value()
547    }
548
549    /// Parse value list for IN clause
550    pub fn parse_value_list(&mut self) -> Result<Vec<Value>, ParseError> {
551        let mut values = Vec::new();
552        loop {
553            values.push(self.parse_value()?);
554            if !self.consume(&Token::Comma)? {
555                break;
556            }
557        }
558        Ok(values)
559    }
560
561    /// Phase 1 cutover bridge: parse an expression via the new Pratt
562    /// parser (`parser/expr.rs`) and try to fold it back into a
563    /// literal `Value`. Used by INSERT VALUES / UPDATE SET / DEFAULT
564    /// slots that still store `Value` in their AST nodes — the bridge
565    /// lets them benefit from the full Expr grammar (parenthesised
566    /// literals, unary minus, CAST literals) without an AST cascade.
567    ///
568    /// Folds these Expr shapes:
569    /// - `Expr::Literal { value, .. }` → `value`
570    /// - `Expr::UnaryOp { Neg, operand: Literal(Integer/Float), .. }`
571    ///   → negated value
572    /// - `Expr::Cast { inner: Literal(text), target, .. }` →
573    ///   coerce(text, target) via schema::coerce
574    ///
575    /// Anything else returns an error so callers can decide whether
576    /// to fall back to the legacy `parse_literal_value` path or
577    /// surface a "non-literal not supported in this position" error.
578    pub fn parse_expr_value(&mut self) -> Result<Value, ParseError> {
579        let expr = self.parse_expr()?;
580        crate::sql_lowering::fold_expr_to_value(expr)
581            .map_err(|msg| ParseError::new(msg, self.position()))
582    }
583}
584
585/// Parse an RQL query string into a `QueryWithCte`. A leading `WITH`
586/// is consumed as a CTE prelude; any other statement is returned in
587/// the `QueryWithCte::simple` shape (`with_clause: None`). Callers
588/// that don't care about CTEs read `.query` to recover the legacy
589/// `QueryExpr` shape.
590pub fn parse(input: &str) -> Result<QueryWithCte, ParseError> {
591    let mut parser = Parser::new(input)?;
592    parser.parse_with_cte()
593}