Skip to main content

nodedb_sql/functions/fts_ops/
tsquery_parser.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Recursive-descent parser for PostgreSQL `tsquery` boolean syntax.
4//!
5//! Supported grammar (left-recursive precedence, lowest-to-highest):
6//! ```text
7//! query   ::= or_expr
8//! or_expr ::= and_expr ( '|' and_expr )*
9//! and_expr::= not_expr ( '&' not_expr )*
10//! not_expr::= '!' not_expr | atom
11//! atom    ::= '(' query ')'
12//!           | term ':*'    (prefix)
13//!           | term         (plain)
14//! ```
15//!
16//! A `term` is any contiguous non-operator, non-paren token.
17//!
18//! `phraseto_tsquery` is handled via `parse_phrase_terms` — the result is
19//! always `FtsQuery::Phrase`, which the executor rejects with Unsupported.
20
21use crate::error::{Result, SqlError};
22use crate::fts_types::FtsQuery;
23
24/// Parse a PG `tsquery` string into an `FtsQuery` tree.
25///
26/// Empty input is rejected with `SqlError::Parse`.
27pub fn parse_tsquery(input: &str) -> Result<FtsQuery> {
28    let input = input.trim();
29    if input.is_empty() {
30        return Err(SqlError::Parse {
31            detail: "tsquery: empty query string".into(),
32        });
33    }
34    let tokens = tokenize(input);
35    let mut pos = 0;
36    let query = parse_or(&tokens, &mut pos)?;
37    if pos < tokens.len() {
38        return Err(SqlError::Parse {
39            detail: format!(
40                "tsquery: unexpected token '{}' at position {pos}",
41                tokens[pos]
42            ),
43        });
44    }
45    Ok(query)
46}
47
48/// Build an `FtsQuery::Phrase` from whitespace-separated terms.
49///
50/// Used by `phraseto_tsquery`.  The resulting variant is always rejected by
51/// the executor with `Unsupported` — it is only produced so that the error
52/// path is reached correctly.
53pub fn parse_phrase_terms(input: &str) -> FtsQuery {
54    let terms: Vec<String> = input
55        .split_whitespace()
56        .map(|s| s.to_ascii_lowercase())
57        .filter(|s| !s.is_empty())
58        .collect();
59    FtsQuery::Phrase(terms)
60}
61
62// ── Tokenizer ────────────────────────────────────────────────────────────────
63
64fn tokenize(input: &str) -> Vec<String> {
65    let mut tokens = Vec::new();
66    let mut chars = input.char_indices().peekable();
67    while let Some((i, c)) = chars.next() {
68        match c {
69            ' ' | '\t' | '\n' | '\r' => continue,
70            '(' | ')' | '&' | '|' | '!' => tokens.push(c.to_string()),
71            ':' => {
72                // ':*' suffix on the preceding term — already handled in term
73                // parsing; if we see a bare ':' here it's part of the next
74                // term text.  Push ':' as a token so we can detect ':*'.
75                if chars.peek().map(|(_, nc)| *nc) == Some('*') {
76                    chars.next(); // consume '*'
77                    tokens.push(":*".into());
78                } else {
79                    tokens.push(":".into());
80                }
81            }
82            _ => {
83                // Collect a term: alphanumeric + hyphen + underscore.
84                let start = i;
85                let mut end = i + c.len_utf8();
86                while let Some(&(j, nc)) = chars.peek() {
87                    if nc.is_alphanumeric() || nc == '-' || nc == '_' {
88                        end = j + nc.len_utf8();
89                        chars.next();
90                    } else {
91                        break;
92                    }
93                }
94                tokens.push(input[start..end].to_ascii_lowercase());
95            }
96        }
97    }
98    tokens
99}
100
101// ── Recursive-descent parser ──────────────────────────────────────────────────
102
103fn parse_or(tokens: &[String], pos: &mut usize) -> Result<FtsQuery> {
104    let mut left = parse_and(tokens, pos)?;
105    while *pos < tokens.len() && tokens[*pos] == "|" {
106        *pos += 1;
107        let right = parse_and(tokens, pos)?;
108        left = match left {
109            FtsQuery::Or(mut parts) => {
110                parts.push(right);
111                FtsQuery::Or(parts)
112            }
113            other => FtsQuery::Or(vec![other, right]),
114        };
115    }
116    Ok(left)
117}
118
119fn parse_and(tokens: &[String], pos: &mut usize) -> Result<FtsQuery> {
120    let mut left = parse_not(tokens, pos)?;
121    while *pos < tokens.len() && tokens[*pos] == "&" {
122        *pos += 1;
123        let right = parse_not(tokens, pos)?;
124        left = match left {
125            FtsQuery::And(mut parts) => {
126                parts.push(right);
127                FtsQuery::And(parts)
128            }
129            other => FtsQuery::And(vec![other, right]),
130        };
131    }
132    Ok(left)
133}
134
135fn parse_not(tokens: &[String], pos: &mut usize) -> Result<FtsQuery> {
136    if *pos < tokens.len() && tokens[*pos] == "!" {
137        *pos += 1;
138        let inner = parse_not(tokens, pos)?;
139        return Ok(FtsQuery::Not(Box::new(inner)));
140    }
141    parse_atom(tokens, pos)
142}
143
144fn parse_atom(tokens: &[String], pos: &mut usize) -> Result<FtsQuery> {
145    if *pos >= tokens.len() {
146        return Err(SqlError::Parse {
147            detail: "tsquery: unexpected end of input".into(),
148        });
149    }
150    if tokens[*pos] == "(" {
151        *pos += 1;
152        let inner = parse_or(tokens, pos)?;
153        if *pos >= tokens.len() || tokens[*pos] != ")" {
154            return Err(SqlError::Parse {
155                detail: "tsquery: unmatched '('".into(),
156            });
157        }
158        *pos += 1;
159        return Ok(inner);
160    }
161    // Must be a term token.
162    let term = tokens[*pos].clone();
163    if term == ")" || term == "&" || term == "|" || term == "!" {
164        return Err(SqlError::Parse {
165            detail: format!("tsquery: unexpected operator '{term}'"),
166        });
167    }
168    *pos += 1;
169    // Check for ':*' suffix (prefix search).
170    if *pos < tokens.len() && tokens[*pos] == ":*" {
171        *pos += 1;
172        return Ok(FtsQuery::Prefix(term));
173    }
174    Ok(FtsQuery::Plain {
175        text: term,
176        fuzzy: false,
177    })
178}
179
180// ── Tests ────────────────────────────────────────────────────────────────────
181
182#[cfg(test)]
183mod tests {
184    use super::*;
185
186    #[test]
187    fn parse_single_term() {
188        let q = parse_tsquery("rust").unwrap();
189        assert_eq!(
190            q,
191            FtsQuery::Plain {
192                text: "rust".into(),
193                fuzzy: false
194            }
195        );
196    }
197
198    #[test]
199    fn parse_and_two_terms() {
200        let q = parse_tsquery("rust & lang").unwrap();
201        assert_eq!(
202            q,
203            FtsQuery::And(vec![
204                FtsQuery::Plain {
205                    text: "rust".into(),
206                    fuzzy: false
207                },
208                FtsQuery::Plain {
209                    text: "lang".into(),
210                    fuzzy: false
211                },
212            ])
213        );
214    }
215
216    #[test]
217    fn parse_or_two_terms() {
218        let q = parse_tsquery("rust | lang").unwrap();
219        assert_eq!(
220            q,
221            FtsQuery::Or(vec![
222                FtsQuery::Plain {
223                    text: "rust".into(),
224                    fuzzy: false
225                },
226                FtsQuery::Plain {
227                    text: "lang".into(),
228                    fuzzy: false
229                },
230            ])
231        );
232    }
233
234    #[test]
235    fn parse_prefix() {
236        let q = parse_tsquery("rust:*").unwrap();
237        assert_eq!(q, FtsQuery::Prefix("rust".into()));
238    }
239
240    #[test]
241    fn parse_nested() {
242        // (a & b) | c
243        let q = parse_tsquery("(a & b) | c").unwrap();
244        assert_eq!(
245            q,
246            FtsQuery::Or(vec![
247                FtsQuery::And(vec![
248                    FtsQuery::Plain {
249                        text: "a".into(),
250                        fuzzy: false
251                    },
252                    FtsQuery::Plain {
253                        text: "b".into(),
254                        fuzzy: false
255                    },
256                ]),
257                FtsQuery::Plain {
258                    text: "c".into(),
259                    fuzzy: false
260                },
261            ])
262        );
263    }
264
265    #[test]
266    fn parse_not_term() {
267        let q = parse_tsquery("!foo").unwrap();
268        assert_eq!(
269            q,
270            FtsQuery::Not(Box::new(FtsQuery::Plain {
271                text: "foo".into(),
272                fuzzy: false
273            }))
274        );
275    }
276
277    #[test]
278    fn parse_empty_is_error() {
279        assert!(parse_tsquery("").is_err());
280        assert!(parse_tsquery("   ").is_err());
281    }
282
283    #[test]
284    fn parse_phrase_terms_basic() {
285        let q = parse_phrase_terms("hello world");
286        assert_eq!(q, FtsQuery::Phrase(vec!["hello".into(), "world".into()]));
287    }
288
289    #[test]
290    fn and_is_left_associative() {
291        let q = parse_tsquery("a & b & c").unwrap();
292        // a & b produces And([a,b]); then & c extends to And([a,b,c])
293        assert_eq!(
294            q,
295            FtsQuery::And(vec![
296                FtsQuery::Plain {
297                    text: "a".into(),
298                    fuzzy: false
299                },
300                FtsQuery::Plain {
301                    text: "b".into(),
302                    fuzzy: false
303                },
304                FtsQuery::Plain {
305                    text: "c".into(),
306                    fuzzy: false
307                },
308            ])
309        );
310    }
311
312    #[test]
313    fn or_is_left_associative() {
314        let q = parse_tsquery("a | b | c").unwrap();
315        assert_eq!(
316            q,
317            FtsQuery::Or(vec![
318                FtsQuery::Plain {
319                    text: "a".into(),
320                    fuzzy: false
321                },
322                FtsQuery::Plain {
323                    text: "b".into(),
324                    fuzzy: false
325                },
326                FtsQuery::Plain {
327                    text: "c".into(),
328                    fuzzy: false
329                },
330            ])
331        );
332    }
333}