Skip to main content

fathomdb_query/
text_query.rs

1/// A constrained full-text query representation for `FathomDB`'s safe search API.
2///
3/// `TextQuery` models the subset of boolean search supported by
4/// [`QueryBuilder::text_search`](crate::QueryBuilder::text_search):
5/// literal terms, quoted phrases, uppercase `OR`, uppercase `NOT`, and
6/// implicit `AND` by adjacency.
7#[derive(Clone, Debug, PartialEq, Eq)]
8pub enum TextQuery {
9    /// An empty query.
10    Empty,
11    /// A literal search term.
12    Term(String),
13    /// A literal quoted phrase.
14    Phrase(String),
15    /// A negated child query.
16    Not(Box<TextQuery>),
17    /// A conjunction of child queries.
18    And(Vec<TextQuery>),
19    /// A disjunction of child queries.
20    Or(Vec<TextQuery>),
21}
22
23#[derive(Clone, Debug, PartialEq, Eq)]
24enum Token {
25    Word(String),
26    Phrase(String),
27}
28
29impl TextQuery {
30    /// Parse raw user or agent input into `FathomDB`'s supported text-query subset.
31    ///
32    /// Parsing is intentionally forgiving. Only exact uppercase `OR` and `NOT`
33    /// tokens are treated as operators; unsupported or malformed syntax is
34    /// downgraded to literal terms instead of being passed through as raw FTS5.
35    #[must_use]
36    pub fn parse(raw: &str) -> Self {
37        let tokens = tokenize(raw);
38        if tokens.is_empty() {
39            return Self::Empty;
40        }
41
42        let mut groups = Vec::new();
43        let mut current = Vec::new();
44        let mut index = 0;
45
46        while index < tokens.len() {
47            if is_or_token(&tokens[index]) {
48                let can_split = !current.is_empty() && can_start_or_clause(&tokens, index + 1);
49                if can_split {
50                    groups.push(normalize_and(current));
51                    current = Vec::new();
52                } else {
53                    current.push(Self::Term("OR".to_owned()));
54                }
55                index += 1;
56                continue;
57            }
58
59            let (node, next) =
60                parse_atom_or_literal(&tokens, index, can_negate_from_current(&current));
61            current.push(node);
62            index = next;
63        }
64
65        if !current.is_empty() {
66            groups.push(normalize_and(current));
67        }
68
69        match groups.len() {
70            0 => Self::Empty,
71            1 => groups.into_iter().next().unwrap_or(Self::Empty),
72            _ => Self::Or(groups),
73        }
74    }
75}
76
77/// Render a [`TextQuery`] as an FTS5-safe `MATCH` expression.
78///
79/// The renderer is the only place that emits FTS5 control syntax. All literal
80/// terms and phrases are double-quoted and escaped, while only supported
81/// operators (`OR`, `NOT`, and implicit `AND`) are emitted as control syntax.
82#[must_use]
83pub fn render_text_query_fts5(query: &TextQuery) -> String {
84    render_with_grouping(query, false)
85}
86
87fn render_with_grouping(query: &TextQuery, parenthesize: bool) -> String {
88    match query {
89        TextQuery::Empty => String::new(),
90        TextQuery::Term(term) | TextQuery::Phrase(term) => quote_fts5_literal(term),
91        TextQuery::Not(child) => {
92            let rendered = render_with_grouping(child, true);
93            format!("NOT {rendered}")
94        }
95        TextQuery::And(children) => {
96            let rendered = children
97                .iter()
98                .map(|child| render_with_grouping(child, matches!(child, TextQuery::Or(_))))
99                .collect::<Vec<_>>()
100                .join(" ");
101            if parenthesize && children.len() > 1 {
102                format!("({rendered})")
103            } else {
104                rendered
105            }
106        }
107        TextQuery::Or(children) => {
108            let rendered = children
109                .iter()
110                .map(|child| render_with_grouping(child, matches!(child, TextQuery::And(_))))
111                .collect::<Vec<_>>()
112                .join(" OR ");
113            if parenthesize && children.len() > 1 {
114                format!("({rendered})")
115            } else {
116                rendered
117            }
118        }
119    }
120}
121
122fn quote_fts5_literal(raw: &str) -> String {
123    let escaped = raw.replace('"', "\"\"");
124    format!("\"{escaped}\"")
125}
126
127fn tokenize(raw: &str) -> Vec<Token> {
128    let mut tokens = Vec::new();
129    let chars: Vec<char> = raw.chars().collect();
130    let mut index = 0;
131
132    while index < chars.len() {
133        while index < chars.len() && chars[index].is_whitespace() {
134            index += 1;
135        }
136        if index >= chars.len() {
137            break;
138        }
139
140        if chars[index] == '"' {
141            let start = index + 1;
142            let mut end = start;
143            while end < chars.len() && chars[end] != '"' {
144                end += 1;
145            }
146            if end < chars.len() {
147                let phrase: String = chars[start..end].iter().collect();
148                tokens.push(Token::Phrase(phrase));
149                index = end + 1;
150                continue;
151            }
152        }
153
154        let start = index;
155        while index < chars.len() && !chars[index].is_whitespace() {
156            index += 1;
157        }
158        let word: String = chars[start..index].iter().collect();
159        tokens.push(Token::Word(word));
160    }
161
162    tokens
163}
164
165fn is_or_token(token: &Token) -> bool {
166    matches!(token, Token::Word(word) if word == "OR")
167}
168
169fn can_start_or_clause(tokens: &[Token], index: usize) -> bool {
170    match tokens.get(index) {
171        Some(Token::Phrase(_)) => true,
172        Some(Token::Word(word)) => word != "OR" && word != "NOT",
173        None => false,
174    }
175}
176
177fn can_negate_from_current(current: &[TextQuery]) -> bool {
178    match current.last() {
179        Some(TextQuery::Phrase(_)) => true,
180        Some(TextQuery::Term(term)) => term != "OR" && term != "AND" && term != "NOT",
181        _ => false,
182    }
183}
184
185fn parse_atom_or_literal(tokens: &[Token], index: usize, can_negate: bool) -> (TextQuery, usize) {
186    match tokens.get(index) {
187        Some(Token::Phrase(phrase)) => (TextQuery::Phrase(phrase.clone()), index + 1),
188        Some(Token::Word(word)) if word == "NOT" => {
189            if can_negate {
190                match tokens.get(index + 1) {
191                    Some(Token::Phrase(phrase)) => (
192                        TextQuery::Not(Box::new(TextQuery::Phrase(phrase.clone()))),
193                        index + 2,
194                    ),
195                    Some(Token::Word(next)) if next != "OR" && next != "NOT" => (
196                        TextQuery::Not(Box::new(TextQuery::Term(next.clone()))),
197                        index + 2,
198                    ),
199                    _ => (TextQuery::Term("NOT".to_owned()), index + 1),
200                }
201            } else {
202                (TextQuery::Term("NOT".to_owned()), index + 1)
203            }
204        }
205        Some(Token::Word(word)) => (TextQuery::Term(word.clone()), index + 1),
206        None => (TextQuery::Empty, index),
207    }
208}
209
210fn normalize_and(mut nodes: Vec<TextQuery>) -> TextQuery {
211    match nodes.len() {
212        0 => TextQuery::Empty,
213        1 => nodes.pop().unwrap_or(TextQuery::Empty),
214        _ => TextQuery::And(nodes),
215    }
216}
217
218#[cfg(test)]
219// CONTRACT: "Unsupported syntax stays literal"
220//
221// The tests tagged `CONTRACT:` below protect a load-bearing safety
222// property of `TextQuery::parse` and `render_text_query_fts5`: any
223// token, operator-like keyword, or punctuation that the supported
224// grammar does not recognize as control syntax is passed through as
225// a literal search term and rendered as a double-quoted FTS5 literal.
226//
227// This is what lets any agent or application pipe raw user messages
228// (chat input, email bodies, form fields) straight into `search()`
229// without a sanitization layer. There is no parse failure mode that
230// returns an error to the caller, and no failure mode that injects
231// unintended boolean structure into the FTS5 query. The three specific
232// guarantees are:
233//
234//   1. Lowercase `or` / `not` are literal terms (operators require
235//      uppercase; the parser does not match case-insensitively).
236//   2. Clause-leading `NOT` is a literal term (`NOT` is only a real
237//      operator when it binds to a right-hand clause inside an
238//      existing conjunction; at the start of a clause, after an
239//      `OR`, or with nothing to its left it degrades to a literal).
240//   3. Unsupported syntax (`col:value`, `prefix*`, `NEAR`, explicit
241//      `AND`) is parsed as literal terms.
242//
243// A future refactor of the parser or renderer that touches any of the
244// `CONTRACT:`-tagged tests MUST read this block, understand that the
245// test is protecting a public safety property, and preserve the
246// behavior. Do not "fix" a CONTRACT test by updating the expected
247// output to match a new parser behavior; update the parser instead.
248// Any new grammar surface must expand this block with a matching
249// contract clause and its own CONTRACT-tagged tests.
250mod tests {
251    use super::{TextQuery, render_text_query_fts5};
252
253    // CONTRACT: empty / whitespace-only input must never parse-fail.
254    // The chat-to-search path depends on being able to hand any string
255    // to `parse` and get back a well-formed `TextQuery`.
256    #[test]
257    fn parse_empty_query() {
258        assert_eq!(TextQuery::parse(""), TextQuery::Empty);
259        assert_eq!(TextQuery::parse("   "), TextQuery::Empty);
260        assert_eq!(TextQuery::parse("\t\n  \t"), TextQuery::Empty);
261    }
262
263    #[test]
264    fn parse_plain_terms_as_implicit_and() {
265        assert_eq!(
266            TextQuery::parse("budget meeting"),
267            TextQuery::And(vec![
268                TextQuery::Term("budget".into()),
269                TextQuery::Term("meeting".into()),
270            ])
271        );
272    }
273
274    #[test]
275    fn parse_phrase() {
276        assert_eq!(
277            TextQuery::parse("\"release notes\""),
278            TextQuery::Phrase("release notes".into())
279        );
280    }
281
282    #[test]
283    fn parse_or_operator() {
284        assert_eq!(
285            TextQuery::parse("ship OR docs"),
286            TextQuery::Or(vec![
287                TextQuery::Term("ship".into()),
288                TextQuery::Term("docs".into()),
289            ])
290        );
291    }
292
293    #[test]
294    fn parse_not_operator() {
295        assert_eq!(
296            TextQuery::parse("ship NOT blocked"),
297            TextQuery::And(vec![
298                TextQuery::Term("ship".into()),
299                TextQuery::Not(Box::new(TextQuery::Term("blocked".into()))),
300            ])
301        );
302    }
303
304    // CONTRACT: clause-leading `NOT` degrades to a literal term.
305    // `NOT` is only an operator when it binds to a right-hand clause
306    // inside an existing conjunction.
307    #[test]
308    fn parse_leading_not_as_literal() {
309        assert_eq!(
310            TextQuery::parse("NOT blocked"),
311            TextQuery::And(vec![
312                TextQuery::Term("NOT".into()),
313                TextQuery::Term("blocked".into()),
314            ])
315        );
316    }
317
318    // CONTRACT: `NOT` immediately after `OR` degrades to a literal,
319    // as does the surrounding `OR` (it has no right-hand clause to
320    // disjoin). Both fall through to literal terms under an implicit
321    // AND, preserving the "raw-chat-is-safe" property.
322    #[test]
323    fn parse_not_after_or_as_literal() {
324        assert_eq!(
325            TextQuery::parse("ship OR NOT blocked"),
326            TextQuery::And(vec![
327                TextQuery::Term("ship".into()),
328                TextQuery::Term("OR".into()),
329                TextQuery::Term("NOT".into()),
330                TextQuery::Term("blocked".into()),
331            ])
332        );
333    }
334
335    // CONTRACT: lowercase `or` is a literal term. Operators require
336    // uppercase; the parser does not match case-insensitively.
337    #[test]
338    fn parse_lowercase_or_as_literal() {
339        assert_eq!(
340            TextQuery::parse("ship or docs"),
341            TextQuery::And(vec![
342                TextQuery::Term("ship".into()),
343                TextQuery::Term("or".into()),
344                TextQuery::Term("docs".into()),
345            ])
346        );
347    }
348
349    // CONTRACT: lowercase `not` is a literal term. Operators require
350    // uppercase; the parser does not match case-insensitively.
351    #[test]
352    fn parse_lowercase_not_as_literal() {
353        assert_eq!(
354            TextQuery::parse("not a ship"),
355            TextQuery::And(vec![
356                TextQuery::Term("not".into()),
357                TextQuery::Term("a".into()),
358                TextQuery::Term("ship".into()),
359            ])
360        );
361    }
362
363    #[test]
364    fn parse_trailing_or_as_literal() {
365        assert_eq!(
366            TextQuery::parse("ship OR"),
367            TextQuery::And(vec![
368                TextQuery::Term("ship".into()),
369                TextQuery::Term("OR".into()),
370            ])
371        );
372    }
373
374    #[test]
375    fn parse_apostrophe_as_literal_term() {
376        assert_eq!(
377            TextQuery::parse("User's name"),
378            TextQuery::And(vec![
379                TextQuery::Term("User's".into()),
380                TextQuery::Term("name".into()),
381            ])
382        );
383    }
384
385    // CONTRACT: unsupported `col:value` syntax stays literal.
386    // FathomDB does not expose FTS5 column filters via this surface.
387    #[test]
388    fn parse_unsupported_column_filter_as_literal() {
389        assert_eq!(
390            TextQuery::parse("col:value"),
391            TextQuery::Term("col:value".into())
392        );
393    }
394
395    // CONTRACT: unsupported prefix-match syntax (`term*`) stays
396    // literal. Prefix queries are not part of the public grammar.
397    #[test]
398    fn parse_unsupported_prefix_as_literal() {
399        assert_eq!(
400            TextQuery::parse("prefix*"),
401            TextQuery::Term("prefix*".into())
402        );
403    }
404
405    // CONTRACT: unsupported `NEAR` operator stays literal.
406    // Proximity queries are not part of the public grammar.
407    #[test]
408    fn parse_near_as_literal() {
409        assert_eq!(
410            TextQuery::parse("a NEAR b"),
411            TextQuery::And(vec![
412                TextQuery::Term("a".into()),
413                TextQuery::Term("NEAR".into()),
414                TextQuery::Term("b".into()),
415            ])
416        );
417    }
418
419    // CONTRACT: explicit `AND` stays literal. The grammar only
420    // supports implicit AND by adjacency, so `cats AND dogs` parses
421    // as three literal terms joined by implicit conjunction.
422    #[test]
423    fn parse_explicit_and_as_literal() {
424        assert_eq!(
425            TextQuery::parse("cats AND dogs OR fish"),
426            TextQuery::Or(vec![
427                TextQuery::And(vec![
428                    TextQuery::Term("cats".into()),
429                    TextQuery::Term("AND".into()),
430                    TextQuery::Term("dogs".into()),
431                ]),
432                TextQuery::Term("fish".into()),
433            ])
434        );
435    }
436
437    // CONTRACT: phrase + OR operator composes correctly. A quoted
438    // phrase on either side of `OR` must become a `Phrase` node in a
439    // disjunction, not a literal term with the quote characters
440    // embedded. Combinations of phrase and operator are a likely
441    // shape for agent-generated queries and must stay well-defined.
442    #[test]
443    fn parse_phrase_with_or_operator() {
444        assert_eq!(
445            TextQuery::parse("\"release notes\" OR changelog"),
446            TextQuery::Or(vec![
447                TextQuery::Phrase("release notes".into()),
448                TextQuery::Term("changelog".into()),
449            ])
450        );
451        assert_eq!(
452            TextQuery::parse("ship OR \"release notes\""),
453            TextQuery::Or(vec![
454                TextQuery::Term("ship".into()),
455                TextQuery::Phrase("release notes".into()),
456            ])
457        );
458    }
459
460    // CONTRACT: phrase + NOT operator composes correctly. A `NOT`
461    // that binds to a right-hand quoted phrase must produce a
462    // `Not(Phrase)` node under the enclosing conjunction, and render
463    // round-trip with the phrase preserved as a quoted FTS5 literal.
464    #[test]
465    fn parse_phrase_with_not_operator() {
466        assert_eq!(
467            TextQuery::parse("ship NOT \"release notes\""),
468            TextQuery::And(vec![
469                TextQuery::Term("ship".into()),
470                TextQuery::Not(Box::new(TextQuery::Phrase("release notes".into()))),
471            ])
472        );
473        assert_eq!(
474            render_text_query_fts5(&TextQuery::parse("ship NOT \"release notes\"")),
475            "\"ship\" NOT \"release notes\""
476        );
477    }
478
479    #[test]
480    fn render_term_query() {
481        assert_eq!(
482            render_text_query_fts5(&TextQuery::Term("budget".into())),
483            "\"budget\""
484        );
485    }
486
487    #[test]
488    fn render_phrase_query() {
489        assert_eq!(
490            render_text_query_fts5(&TextQuery::Phrase("release notes".into())),
491            "\"release notes\""
492        );
493    }
494
495    #[test]
496    fn render_or_query() {
497        assert_eq!(
498            render_text_query_fts5(&TextQuery::Or(vec![
499                TextQuery::Term("ship".into()),
500                TextQuery::Term("docs".into()),
501            ])),
502            "\"ship\" OR \"docs\""
503        );
504    }
505
506    #[test]
507    fn render_not_query() {
508        assert_eq!(
509            render_text_query_fts5(&TextQuery::And(vec![
510                TextQuery::Term("ship".into()),
511                TextQuery::Not(Box::new(TextQuery::Term("blocked".into()))),
512            ])),
513            "\"ship\" NOT \"blocked\""
514        );
515    }
516
517    #[test]
518    fn render_escapes_embedded_quotes() {
519        assert_eq!(
520            render_text_query_fts5(&TextQuery::Term("say \"hello\"".into())),
521            "\"say \"\"hello\"\"\""
522        );
523    }
524
525    // CONTRACT: the render side of "leading NOT stays literal" —
526    // renders as two quoted FTS5 literals, never as a bare `NOT`
527    // operator that would corrupt the FTS5 query.
528    #[test]
529    fn render_leading_not_literalized_parse_safely() {
530        assert_eq!(
531            render_text_query_fts5(&TextQuery::parse("NOT blocked")),
532            "\"NOT\" \"blocked\""
533        );
534    }
535
536    // CONTRACT: the render side of "lowercase not stays literal" —
537    // all three tokens emit as quoted FTS5 literals.
538    #[test]
539    fn render_lowercase_not_as_literal_terms() {
540        assert_eq!(
541            render_text_query_fts5(&TextQuery::parse("not a ship")),
542            "\"not\" \"a\" \"ship\""
543        );
544    }
545}