// reddb_server/storage/query/parser/mod.rs

//! RQL Parser
//!
//! Parses RQL (RedDB Query Language) tokens into a unified AST.
//! Supports SQL-like table queries, Cypher-like graph patterns, and joins.
//!
//! # Supported Syntax
//!
//! ## Table Queries
//! ```text
//! SELECT [columns] FROM table [WHERE condition] [ORDER BY ...] [LIMIT n]
//! ```
//!
//! ## Graph Queries
//! ```text
//! MATCH pattern [WHERE condition] RETURN projection
//! ```
//!
//! ## Join Queries
//! ```text
//! FROM table [alias] JOIN GRAPH pattern ON condition [RETURN projection]
//! ```
//!
//! ## Path Queries
//! ```text
//! PATH FROM selector TO selector [VIA edge_types] [RETURN projection]
//! ```
//!
//! ## Vector Queries
//! ```text
//! VECTOR SEARCH collection SIMILAR TO [...] [WHERE ...] [METRIC ...] LIMIT k
//! ```
//!
//! ## Hybrid Queries
//! ```text
//! HYBRID (structured query) VECTOR SEARCH ... FUSION strategy LIMIT n
//! ```

38mod auth_ddl;
39mod config;
40mod cte;
41mod ddl;
42mod dml;
43mod error;
44mod expr;
45mod filter;
46mod graph;
47mod graph_commands;
48mod hybrid;
49mod index_ddl;
50mod join;
51mod kv;
52pub mod limits;
53mod migration;
54mod path;
55mod probabilistic_commands;
56mod queue;
57mod search_commands;
58mod table;
59mod timeseries;
60mod tree;
61mod vector;
62
63#[cfg(test)]
64mod json_literal_table;
65#[cfg(test)]
66mod property_tests;
67#[cfg(test)]
68mod tests;
69
70pub use error::{ParseError, ParseErrorKind, SafeTokenDisplay};
71pub use limits::ParserLimits;
72
73use super::ast::{QueryExpr, QueryWithCte, Span};
74use super::lexer::{Lexer, Position, Spanned, Token};
75use crate::storage::schema::Value;
76use limits::DepthCounter;
77
78/// RQL Parser
79pub struct Parser<'a> {
80    lexer: Lexer<'a>,
81    /// Current token
82    pub(crate) current: Spanned,
83    /// Recursion-depth tracker. Each enter/exit of a recursive
84    /// descent point should bracket itself with [`enter_depth`] /
85    /// [`exit_depth`] (see `parse_expr_prec`).
86    pub(crate) depth: DepthCounter,
87    /// Tracks placeholder style used so far in this statement.
88    /// Mixing `$N` and `?` in one statement is a parse error
89    /// (PRD #351 / issue #354).
90    pub(crate) placeholder_mode: PlaceholderMode,
91    /// Counter for `?` positional placeholders, numbered 1-based
92    /// in source. Unused when mode is `Dollar` or `None`.
93    pub(crate) question_count: usize,
94}
95
/// Placeholder style locked in by the first placeholder seen.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PlaceholderMode {
    /// No placeholder seen yet in this statement; either style may
    /// still be chosen.
    None,
    /// `$N` numbered placeholders are in use.
    Dollar,
    /// `?` / `?N` positional placeholders are in use.
    Question,
}

104impl<'a> Parser<'a> {
105    /// Create a new parser with default DoS limits.
106    pub fn new(input: &'a str) -> Result<Self, ParseError> {
107        Self::with_limits(input, ParserLimits::default())
108    }
109
110    /// Create a new parser with custom DoS limits.
111    pub fn with_limits(input: &'a str, limits: ParserLimits) -> Result<Self, ParseError> {
112        // Input-byte cap is enforced before the lexer is even
113        // constructed: the lexer holds a `Chars` iterator over the
114        // input slice, so refusing oversized input here keeps the
115        // pathological case (10 GiB blob) cheap.
116        if input.len() > limits.max_input_bytes {
117            return Err(ParseError::input_too_large(
118                "max_input_bytes",
119                limits.max_input_bytes,
120                Position::new(1, 1, 0),
121            ));
122        }
123        let mut lexer = Lexer::with_limits(input, limits);
124        let current = lexer.next_token()?;
125        Ok(Self {
126            lexer,
127            current,
128            depth: DepthCounter::new(limits.max_depth),
129            placeholder_mode: PlaceholderMode::None,
130            question_count: 0,
131        })
132    }
133
134    /// Increment the recursion-depth counter or bail with
135    /// `ParseError::DepthLimit`. Pair with [`exit_depth`] (use the
136    /// `with_depth` helper for RAII-style bracketing).
137    pub(crate) fn enter_depth(&mut self) -> Result<(), ParseError> {
138        self.depth.depth += 1;
139        if self.depth.depth > self.depth.max_depth {
140            return Err(ParseError::depth_limit(
141                "max_depth",
142                self.depth.max_depth,
143                self.position(),
144            ));
145        }
146        Ok(())
147    }
148
149    /// Decrement the recursion-depth counter. Always called on the
150    /// success path; on the error path, the counter is rebalanced
151    /// when `Parser` itself is dropped (it's only consulted while
152    /// parsing is in progress).
153    pub(crate) fn exit_depth(&mut self) {
154        if self.depth.depth > 0 {
155            self.depth.depth -= 1;
156        }
157    }
158
159    /// Get current position
160    pub fn position(&self) -> Position {
161        self.current.start
162    }
163
164    /// Advance to next token
165    pub fn advance(&mut self) -> Result<Token, ParseError> {
166        let old = std::mem::replace(&mut self.current, self.lexer.next_token()?);
167        Ok(old.token)
168    }
169
170    /// Peek at current token
171    pub fn peek(&self) -> &Token {
172        &self.current.token
173    }
174
175    /// Peek one token past the current parser position without consuming it.
176    pub fn peek_next(&mut self) -> Result<&Token, ParseError> {
177        Ok(&self.lexer.peek_token()?.token)
178    }
179
180    /// Check if current token matches
181    pub fn check(&self, expected: &Token) -> bool {
182        std::mem::discriminant(&self.current.token) == std::mem::discriminant(expected)
183    }
184
185    /// Check if current token is a specific keyword
186    pub fn check_keyword(&self, keyword: &Token) -> bool {
187        self.check(keyword)
188    }
189
190    /// Consume a specific token or error
191    pub fn expect(&mut self, expected: Token) -> Result<Token, ParseError> {
192        if self.check(&expected) {
193            self.advance()
194        } else {
195            Err(ParseError::expected(
196                vec![&expected.to_string()],
197                &self.current.token,
198                self.position(),
199            ))
200        }
201    }
202
203    /// Consume an identifier and return its value
204    pub fn expect_ident(&mut self) -> Result<String, ParseError> {
205        match &self.current.token {
206            Token::Ident(name) => {
207                let name = name.clone();
208                self.advance()?;
209                Ok(name)
210            }
211            other => Err(ParseError::expected(
212                vec!["identifier"],
213                other,
214                self.position(),
215            )),
216        }
217    }
218
219    /// Consume an identifier or aggregate keyword when the grammar expects
220    /// a user-defined column name.
221    pub fn expect_column_ident(&mut self) -> Result<String, ParseError> {
222        let name = match &self.current.token {
223            Token::Ident(name) => name.clone(),
224            Token::Count => "count".to_string(),
225            Token::Sum => "sum".to_string(),
226            Token::Avg => "avg".to_string(),
227            Token::Min => "min".to_string(),
228            Token::Max => "max".to_string(),
229            Token::Weight => "weight".to_string(),
230            other => {
231                return Err(ParseError::expected(
232                    vec!["identifier"],
233                    other,
234                    self.position(),
235                ));
236            }
237        };
238        self.advance()?;
239        Ok(name)
240    }
241
242    /// Consume an identifier or keyword (for type names where keywords are valid)
243    pub fn expect_ident_or_keyword(&mut self) -> Result<String, ParseError> {
244        // Get the string representation of the current token
245        let name = match &self.current.token {
246            Token::Ident(name) => name.clone(),
247            Token::Count => "count".to_string(),
248            Token::Sum => "sum".to_string(),
249            Token::Avg => "avg".to_string(),
250            Token::Min => "min".to_string(),
251            Token::Max => "max".to_string(),
252            // Keywords that can be type names (convert to uppercase for type matching)
253            Token::Contains => "CONTAINS".to_string(),
254            Token::Left => "LEFT".to_string(),
255            Token::Right => "RIGHT".to_string(),
256            Token::First => "FIRST".to_string(),
257            Token::Last => "LAST".to_string(),
258            Token::In => "IN".to_string(),
259            Token::By => "BY".to_string(),
260            // Any other keyword - use its display string
261            other => other.to_string(),
262        };
263
264        // Only advance for valid type-name-like tokens
265        match &self.current.token {
266            // Identifiers are always valid
267            Token::Ident(_) => {
268                self.advance()?;
269                Ok(name)
270            }
271            // These keywords can be type names
272            Token::Contains
273            | Token::Left
274            | Token::Right
275            | Token::First
276            | Token::Last
277            | Token::In
278            | Token::By => {
279                self.advance()?;
280                Ok(name)
281            }
282            // Reject structural tokens that can't be type names
283            Token::Eof
284            | Token::LParen
285            | Token::RParen
286            | Token::LBracket
287            | Token::RBracket
288            | Token::Comma
289            | Token::Dot
290            | Token::Eq
291            | Token::Lt
292            | Token::Gt
293            | Token::Le
294            | Token::Ge
295            | Token::Arrow
296            | Token::ArrowLeft
297            | Token::Dash
298            | Token::Colon
299            | Token::Semi
300            | Token::Star
301            | Token::Plus
302            | Token::Slash => Err(ParseError::expected(
303                vec!["identifier or type name"],
304                &self.current.token,
305                self.position(),
306            )),
307            // All other keywords can potentially be type names
308            _ => {
309                self.advance()?;
310                Ok(name)
311            }
312        }
313    }
314
315    /// Try to consume a token, returning true if consumed
316    pub fn consume(&mut self, expected: &Token) -> Result<bool, ParseError> {
317        if self.check(expected) {
318            self.advance()?;
319            Ok(true)
320        } else {
321            Ok(false)
322        }
323    }
324
325    /// Try to consume an identifier case-insensitively (for keywords not in Token enum)
326    pub fn consume_ident_ci(&mut self, expected: &str) -> Result<bool, ParseError> {
327        match self.peek().clone() {
328            Token::Ident(name) if name.eq_ignore_ascii_case(expected) => {
329                self.advance()?;
330                Ok(true)
331            }
332            _ => Ok(false),
333        }
334    }
335
336    /// Parse a complete query
337    pub fn parse(&mut self) -> Result<QueryExpr, ParseError> {
338        let query = self.parse_frontend_statement()?.into_query_expr();
339
340        // Expect end of input
341        if !self.check(&Token::Eof) {
342            return Err(ParseError::new(
343                // F-05: route the offending token through `SafeTokenDisplay`
344                // so the user-controlled `Ident` / `String` / `JsonLiteral`
345                // payloads are escaped before the message lands in the
346                // downstream JSON / audit / log / gRPC sinks.
347                format!(
348                    "Unexpected token after query: {}",
349                    error::SafeTokenDisplay(&self.current.token)
350                ),
351                self.position(),
352            ));
353        }
354
355        Ok(query)
356    }
357
358    /// Parse the main query expression (without CTEs)
359    pub fn parse_query_expr(&mut self) -> Result<QueryExpr, ParseError> {
360        self.parse_frontend_statement()
361            .map(|statement| statement.into_query_expr())
362    }
363
364    /// Parse an integer literal
365    pub fn parse_integer(&mut self) -> Result<i64, ParseError> {
366        match &self.current.token {
367            Token::Integer(n) => {
368                let n = *n;
369                self.advance()?;
370                Ok(n)
371            }
372            other => Err(ParseError::expected(
373                vec!["integer"],
374                other,
375                self.position(),
376            )),
377        }
378    }
379
380    /// Parse a `$N` or `?` placeholder in a non-Expr slot (e.g. the
381    /// `LIMIT` / `MIN_SCORE` slots of SEARCH SIMILAR; issue #361). The
382    /// `field` name is used only to enrich placeholder-mixing errors.
383    /// Returns the 0-based parameter index.
384    pub fn parse_param_slot(&mut self, field: &'static str) -> Result<usize, ParseError> {
385        match self.peek().clone() {
386            Token::Dollar => {
387                self.advance()?;
388                let n = match *self.peek() {
389                    Token::Integer(n) if n >= 1 => {
390                        self.advance()?;
391                        n as usize
392                    }
393                    _ => {
394                        return Err(ParseError::new(
395                            format!("expected `$N` (N >= 1) for {field} parameter"),
396                            self.position(),
397                        ));
398                    }
399                };
400                if self.placeholder_mode == PlaceholderMode::Question {
401                    return Err(ParseError::new(
402                        "cannot mix `?` and `$N` placeholders in one statement".to_string(),
403                        self.position(),
404                    ));
405                }
406                self.placeholder_mode = PlaceholderMode::Dollar;
407                Ok(n - 1)
408            }
409            Token::Question => {
410                let (index, _) = self.parse_question_param_index()?;
411                Ok(index)
412            }
413            other => Err(ParseError::expected(
414                vec!["$N", "?"],
415                &other,
416                self.position(),
417            )),
418        }
419    }
420
421    /// Parse a question-style positional placeholder. Bare `?` slots are
422    /// assigned left-to-right. Immediate `?N` slots use the explicit
423    /// 1-based index, matching `$N` without changing the placeholder
424    /// family.
425    pub(crate) fn parse_question_param_index(&mut self) -> Result<(usize, Span), ParseError> {
426        let start = self.position();
427        let question_end = self.current.end;
428        self.expect(Token::Question)?;
429        if self.placeholder_mode == PlaceholderMode::Dollar {
430            return Err(ParseError::new(
431                "cannot mix `?` and `$N` placeholders in one statement".to_string(),
432                self.position(),
433            ));
434        }
435        self.placeholder_mode = PlaceholderMode::Question;
436
437        if let Token::Integer(n) = *self.peek() {
438            if self.current.start.offset == question_end.offset {
439                if n < 1 {
440                    return Err(ParseError::new(
441                        "placeholder index must be >= 1".to_string(),
442                        self.position(),
443                    ));
444                }
445                let end = self.current.end;
446                self.advance()?;
447                let index = n as usize - 1;
448                self.question_count = self.question_count.max(index + 1);
449                return Ok((index, Span::new(start, end)));
450            }
451        }
452
453        self.question_count += 1;
454        Ok((self.question_count - 1, Span::new(start, question_end)))
455    }
456
457    /// Parse a strictly-positive integer literal (`> 0`).
458    ///
459    /// Surfaces a `ValueOutOfRange` error for `field` when the literal
460    /// is `0`, negative, or starts with a unary `-` (which the bare
461    /// `parse_integer` rejects with a confusing "expected: integer"
462    /// message). Used by modifier slots like `MAX_SIZE`, `CAPACITY`,
463    /// `WIDTH`, `DEPTH`, `K` where zero or negative values are
464    /// semantically meaningless.
465    pub fn parse_positive_integer(&mut self, field: &'static str) -> Result<i64, ParseError> {
466        let pos = self.position();
467        // Detect the unary-minus path up-front so we surface a
468        // clearer "must be positive" message instead of the generic
469        // "expected: integer".
470        if matches!(self.current.token, Token::Minus | Token::Dash) {
471            return Err(ParseError::value_out_of_range(
472                field,
473                "must be a positive integer",
474                pos,
475            ));
476        }
477        let raw = self.parse_integer()?;
478        if raw <= 0 {
479            return Err(ParseError::value_out_of_range(
480                field,
481                "must be a positive integer",
482                pos,
483            ));
484        }
485        Ok(raw)
486    }
487
488    /// Parse float literal. Accepts an optional leading unary `-`
489    /// followed by a `Float` or `Integer` token so positions like
490    /// vector literals (`[-0.1, …]`), `THRESHOLD -0.5`, `MIN_SCORE
491    /// -0.1`, `RERANK(-0.3)`, `UNION(0.7, -0.3)`, and geo coordinates
492    /// (`LATITUDE -33.86`) parse correctly. See bug #107.
493    pub fn parse_float(&mut self) -> Result<f64, ParseError> {
494        let negate = if matches!(self.current.token, Token::Minus | Token::Dash) {
495            self.advance()?;
496            true
497        } else {
498            false
499        };
500        let value = match &self.current.token {
501            Token::Float(n) => {
502                let n = *n;
503                self.advance()?;
504                n
505            }
506            Token::Integer(n) => {
507                let n = *n as f64;
508                self.advance()?;
509                n
510            }
511            other => {
512                return Err(ParseError::expected(vec!["number"], other, self.position()));
513            }
514        };
515        Ok(if negate { -value } else { value })
516    }
517
518    /// Parse a string literal
519    pub fn parse_string(&mut self) -> Result<String, ParseError> {
520        match &self.current.token {
521            Token::String(s) => {
522                let s = s.clone();
523                self.advance()?;
524                Ok(s)
525            }
526            other => Err(ParseError::expected(vec!["string"], other, self.position())),
527        }
528    }
529
530    /// Parse a value (delegates to parse_literal_value for full JSON support)
531    pub fn parse_value(&mut self) -> Result<Value, ParseError> {
532        self.parse_literal_value()
533    }
534
535    /// Parse value list for IN clause
536    pub fn parse_value_list(&mut self) -> Result<Vec<Value>, ParseError> {
537        let mut values = Vec::new();
538        loop {
539            values.push(self.parse_value()?);
540            if !self.consume(&Token::Comma)? {
541                break;
542            }
543        }
544        Ok(values)
545    }
546
547    /// Phase 1 cutover bridge: parse an expression via the new Pratt
548    /// parser (`parser/expr.rs`) and try to fold it back into a
549    /// literal `Value`. Used by INSERT VALUES / UPDATE SET / DEFAULT
550    /// slots that still store `Value` in their AST nodes — the bridge
551    /// lets them benefit from the full Expr grammar (parenthesised
552    /// literals, unary minus, CAST literals) without an AST cascade.
553    ///
554    /// Folds these Expr shapes:
555    /// - `Expr::Literal { value, .. }` → `value`
556    /// - `Expr::UnaryOp { Neg, operand: Literal(Integer/Float), .. }`
557    ///   → negated value
558    /// - `Expr::Cast { inner: Literal(text), target, .. }` →
559    ///   coerce(text, target) via schema::coerce
560    ///
561    /// Anything else returns an error so callers can decide whether
562    /// to fall back to the legacy `parse_literal_value` path or
563    /// surface a "non-literal not supported in this position" error.
564    pub fn parse_expr_value(&mut self) -> Result<Value, ParseError> {
565        let expr = self.parse_expr()?;
566        super::sql_lowering::fold_expr_to_value(expr)
567            .map_err(|msg| ParseError::new(msg, self.position()))
568    }
569}
570
571/// Parse an RQL query string into a `QueryWithCte`. A leading `WITH`
572/// is consumed as a CTE prelude; any other statement is returned in
573/// the `QueryWithCte::simple` shape (`with_clause: None`). Callers
574/// that don't care about CTEs read `.query` to recover the legacy
575/// `QueryExpr` shape.
576pub fn parse(input: &str) -> Result<QueryWithCte, ParseError> {
577    let mut parser = Parser::new(input)?;
578    parser.parse_with_cte()
579}