Skip to main content

nodedb_fts/search/
query_parser.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! FTS query parser: recognises `NOT <term>` and `-<term>` negation operators.
4//!
5//! The parser is intentionally flat — parentheses are not supported. Attempting
6//! `NOT (x OR y)` returns `Err(InvalidQuery::ParenthesesNotSupported)` so
7//! callers get a clear message explaining the workaround.
8//!
9//! Grammar (informally):
10//!
11//! ```text
12//! query  ::= token*
13//! token  ::= NOT_KW term          -- negated
14//!          | '-' term             -- negated (Lucene-style, no space)
15//!          | term                 -- positive
16//! NOT_KW ::= 'NOT' (uppercase, standalone word)
17//! term   ::= any non-whitespace string that is not NOT_KW
18//! ```
19//!
20//! Multiple negations are independent: `A NOT B NOT C` excludes both B and C.
21//! A query with no positive terms (`NOT python`) returns `Err(InvalidQuery::NegativeOnly)`.
22
23/// A parsed FTS query split into positive (required) and negative (excluded) raw terms.
24///
25/// Both lists are **raw** strings before analysis/stemming. The caller is
26/// responsible for running each list through the collection analyzer and synonym
27/// expansion before BM25 scoring and negative-bitmap construction.
28#[derive(Debug, Clone, PartialEq, Eq)]
29pub struct ParsedQuery {
30    /// Terms that must match (positive clause).
31    pub positive: Vec<String>,
32    /// Terms whose matching documents are excluded from results (NOT clause).
33    pub negative: Vec<String>,
34}
35
36/// Errors produced by [`parse_query`].
37#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
38pub enum InvalidQuery {
39    /// The query contained only negative terms with no positive terms.
40    ///
41    /// This is ill-defined — it would return "all documents except those
42    /// matching X", which requires a full collection scan. Match PostgreSQL
43    /// `tsquery` behaviour and reject the query explicitly.
44    #[error(
45        "FTS query must contain at least one positive term; \
46         a NOT-only query (e.g. 'NOT python') is not supported"
47    )]
48    NegativeOnly,
49
50    /// The query contained `NOT (...)` parenthesised groups, which are not
51    /// supported in this flat-only parser.
52    ///
53    /// Workaround: use separate negations, e.g. `rust NOT python NOT ruby`
54    /// instead of `rust NOT (python OR ruby)`.
55    #[error(
56        "FTS query contains parenthesised NOT groups which are not supported; \
57         use flat negations instead, e.g. 'rust NOT python NOT ruby' \
58         rather than 'rust NOT (python OR ruby)'"
59    )]
60    ParenthesesNotSupported,
61}
62
63/// Parse an FTS query string into positive and negative term lists.
64///
65/// Recognises:
66/// - `NOT <term>` — the word `NOT` (case-sensitive) followed by a whitespace-
67///   separated term marks that term as negative.
68/// - `-<term>` — a term prefixed with `-` (no space) marks it as negative.
69/// - Everything else is a positive term.
70///
71/// Returns `Err(InvalidQuery::ParenthesesNotSupported)` if a `(` is found after
72/// `NOT`. Returns `Err(InvalidQuery::NegativeOnly)` if there are no positive
73/// terms after parsing.
74pub fn parse_query(query: &str) -> Result<ParsedQuery, InvalidQuery> {
75    let mut positive = Vec::new();
76    let mut negative = Vec::new();
77
78    let tokens: Vec<&str> = query.split_whitespace().collect();
79    let mut i = 0;
80    while i < tokens.len() {
81        let tok = tokens[i];
82
83        if tok == "NOT" {
84            // Consume the next token as a negated term.
85            i += 1;
86            if i >= tokens.len() {
87                // Trailing NOT with no term — ignore it (treat as empty positive input).
88                break;
89            }
90            let next = tokens[i];
91            if next.starts_with('(') {
92                return Err(InvalidQuery::ParenthesesNotSupported);
93            }
94            negative.push(next.to_string());
95        } else if let Some(stripped) = tok.strip_prefix('-') {
96            if stripped.is_empty() {
97                // A bare `-` is not a negation prefix — treat as positive.
98                positive.push(tok.to_string());
99            } else {
100                negative.push(stripped.to_string());
101            }
102        } else {
103            positive.push(tok.to_string());
104        }
105        i += 1;
106    }
107
108    if positive.is_empty() && !negative.is_empty() {
109        return Err(InvalidQuery::NegativeOnly);
110    }
111
112    Ok(ParsedQuery { positive, negative })
113}
114
115#[cfg(test)]
116mod tests {
117    use super::*;
118
119    fn pos(terms: &[&str]) -> Vec<String> {
120        terms.iter().map(|s| s.to_string()).collect()
121    }
122
123    #[test]
124    fn simple_positive_terms() {
125        let pq = parse_query("rust python").unwrap();
126        assert_eq!(pq.positive, pos(&["rust", "python"]));
127        assert!(pq.negative.is_empty());
128    }
129
130    #[test]
131    fn not_keyword() {
132        let pq = parse_query("rust NOT python").unwrap();
133        assert_eq!(pq.positive, pos(&["rust"]));
134        assert_eq!(pq.negative, pos(&["python"]));
135    }
136
137    #[test]
138    fn dash_prefix() {
139        let pq = parse_query("rust -python").unwrap();
140        assert_eq!(pq.positive, pos(&["rust"]));
141        assert_eq!(pq.negative, pos(&["python"]));
142    }
143
144    #[test]
145    fn multiple_negations() {
146        let pq = parse_query("rust NOT python NOT ruby").unwrap();
147        assert_eq!(pq.positive, pos(&["rust"]));
148        assert_eq!(pq.negative, pos(&["python", "ruby"]));
149    }
150
151    #[test]
152    fn multiple_dash_negations() {
153        let pq = parse_query("rust -python -ruby").unwrap();
154        assert_eq!(pq.positive, pos(&["rust"]));
155        assert_eq!(pq.negative, pos(&["python", "ruby"]));
156    }
157
158    #[test]
159    fn negative_only_returns_error() {
160        let err = parse_query("NOT python").unwrap_err();
161        assert_eq!(err, InvalidQuery::NegativeOnly);
162    }
163
164    #[test]
165    fn dash_only_negative_returns_error() {
166        let err = parse_query("-python").unwrap_err();
167        assert_eq!(err, InvalidQuery::NegativeOnly);
168    }
169
170    #[test]
171    fn parentheses_after_not_returns_error() {
172        let err = parse_query("rust NOT (python OR ruby)").unwrap_err();
173        assert_eq!(err, InvalidQuery::ParenthesesNotSupported);
174    }
175
176    #[test]
177    fn bare_dash_treated_as_positive() {
178        let pq = parse_query("hello - world").unwrap();
179        assert_eq!(pq.positive, pos(&["hello", "-", "world"]));
180        assert!(pq.negative.is_empty());
181    }
182
183    #[test]
184    fn trailing_not_ignored() {
185        // Trailing NOT with no following term: positive terms are still parsed.
186        let pq = parse_query("rust NOT").unwrap();
187        assert_eq!(pq.positive, pos(&["rust"]));
188        assert!(pq.negative.is_empty());
189    }
190
191    #[test]
192    fn no_positive_only_negatives_multiple() {
193        let err = parse_query("NOT python NOT ruby").unwrap_err();
194        assert_eq!(err, InvalidQuery::NegativeOnly);
195    }
196}