Skip to main content

fathomdb_query/
text_query.rs

/// Structured form of a full-text query accepted by `FathomDB`'s safe search API.
///
/// This is the restricted boolean grammar behind
/// [`QueryBuilder::text_search`](crate::QueryBuilder::text_search): bare
/// terms, double-quoted phrases, uppercase `OR`, uppercase `NOT`, and `AND`
/// expressed implicitly by placing operands next to each other.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TextQuery {
    /// A query with no content.
    Empty,
    /// A single search term, matched literally.
    Term(String),
    /// A quoted phrase, matched literally.
    Phrase(String),
    /// The negation of the boxed child query.
    Not(Box<Self>),
    /// Every child query must match.
    And(Vec<Self>),
    /// At least one child query must match.
    Or(Vec<Self>),
}
22
/// A lexical unit produced when raw query text is split apart.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token {
    /// A whitespace-delimited run of characters.
    Word(String),
    /// The text found between a pair of double quotes.
    Phrase(String),
}
28
29impl TextQuery {
30    /// Parse raw user or agent input into `FathomDB`'s supported text-query subset.
31    ///
32    /// Parsing is intentionally forgiving. Only exact uppercase `OR` and `NOT`
33    /// tokens are treated as operators; unsupported or malformed syntax is
34    /// downgraded to literal terms instead of being passed through as raw FTS5.
35    #[must_use]
36    pub fn parse(raw: &str) -> Self {
37        let tokens = tokenize(raw);
38        if tokens.is_empty() {
39            return Self::Empty;
40        }
41
42        let mut groups = Vec::new();
43        let mut current = Vec::new();
44        let mut index = 0;
45
46        while index < tokens.len() {
47            if is_or_token(&tokens[index]) {
48                let can_split = !current.is_empty() && can_start_or_clause(&tokens, index + 1);
49                if can_split {
50                    groups.push(normalize_and(current));
51                    current = Vec::new();
52                } else {
53                    current.push(Self::Term("OR".to_owned()));
54                }
55                index += 1;
56                continue;
57            }
58
59            let (node, next) =
60                parse_atom_or_literal(&tokens, index, can_negate_from_current(&current));
61            current.push(node);
62            index = next;
63        }
64
65        if !current.is_empty() {
66            groups.push(normalize_and(current));
67        }
68
69        match groups.len() {
70            0 => Self::Empty,
71            1 => groups.into_iter().next().unwrap_or(Self::Empty),
72            _ => Self::Or(groups),
73        }
74    }
75}
76
77/// Render a [`TextQuery`] as an FTS5-safe `MATCH` expression.
78///
79/// The renderer is the only place that emits FTS5 control syntax. All literal
80/// terms and phrases are double-quoted and escaped, while only supported
81/// operators (`OR`, `NOT`, and implicit `AND`) are emitted as control syntax.
82#[must_use]
83pub fn render_text_query_fts5(query: &TextQuery) -> String {
84    render_with_grouping(query, false)
85}
86
87fn render_with_grouping(query: &TextQuery, parenthesize: bool) -> String {
88    match query {
89        TextQuery::Empty => String::new(),
90        TextQuery::Term(term) | TextQuery::Phrase(term) => quote_fts5_literal(term),
91        TextQuery::Not(child) => {
92            let rendered = render_with_grouping(child, true);
93            format!("NOT {rendered}")
94        }
95        TextQuery::And(children) => {
96            let rendered = children
97                .iter()
98                .map(|child| render_with_grouping(child, matches!(child, TextQuery::Or(_))))
99                .collect::<Vec<_>>()
100                .join(" ");
101            if parenthesize && children.len() > 1 {
102                format!("({rendered})")
103            } else {
104                rendered
105            }
106        }
107        TextQuery::Or(children) => {
108            let rendered = children
109                .iter()
110                .map(|child| render_with_grouping(child, matches!(child, TextQuery::And(_))))
111                .collect::<Vec<_>>()
112                .join(" OR ");
113            if parenthesize && children.len() > 1 {
114                format!("({rendered})")
115            } else {
116                rendered
117            }
118        }
119    }
120}
121
/// Wrap `raw` in double quotes, doubling every embedded `"` so the whole
/// text stays inside a single quoted string.
fn quote_fts5_literal(raw: &str) -> String {
    // Two extra bytes for the surrounding quotes; escapes may grow it further.
    let mut quoted = String::with_capacity(raw.len() + 2);
    quoted.push('"');
    for ch in raw.chars() {
        if ch == '"' {
            quoted.push('"');
        }
        quoted.push(ch);
    }
    quoted.push('"');
    quoted
}
126
127fn tokenize(raw: &str) -> Vec<Token> {
128    let mut tokens = Vec::new();
129    let chars: Vec<char> = raw.chars().collect();
130    let mut index = 0;
131
132    while index < chars.len() {
133        while index < chars.len() && chars[index].is_whitespace() {
134            index += 1;
135        }
136        if index >= chars.len() {
137            break;
138        }
139
140        if chars[index] == '"' {
141            let start = index + 1;
142            let mut end = start;
143            while end < chars.len() && chars[end] != '"' {
144                end += 1;
145            }
146            if end < chars.len() {
147                let phrase: String = chars[start..end].iter().collect();
148                tokens.push(Token::Phrase(phrase));
149                index = end + 1;
150                continue;
151            }
152        }
153
154        let start = index;
155        while index < chars.len() && !chars[index].is_whitespace() {
156            index += 1;
157        }
158        let word: String = chars[start..index].iter().collect();
159        tokens.push(Token::Word(word));
160    }
161
162    tokens
163}
164
165fn is_or_token(token: &Token) -> bool {
166    matches!(token, Token::Word(word) if word == "OR")
167}
168
169fn can_start_or_clause(tokens: &[Token], index: usize) -> bool {
170    match tokens.get(index) {
171        Some(Token::Phrase(_)) => true,
172        Some(Token::Word(word)) => word != "OR" && word != "NOT",
173        None => false,
174    }
175}
176
177fn can_negate_from_current(current: &[TextQuery]) -> bool {
178    match current.last() {
179        Some(TextQuery::Phrase(_)) => true,
180        Some(TextQuery::Term(term)) => term != "OR" && term != "AND" && term != "NOT",
181        _ => false,
182    }
183}
184
185fn parse_atom_or_literal(tokens: &[Token], index: usize, can_negate: bool) -> (TextQuery, usize) {
186    match tokens.get(index) {
187        Some(Token::Phrase(phrase)) => (TextQuery::Phrase(phrase.clone()), index + 1),
188        Some(Token::Word(word)) if word == "NOT" => {
189            if can_negate {
190                match tokens.get(index + 1) {
191                    Some(Token::Phrase(phrase)) => (
192                        TextQuery::Not(Box::new(TextQuery::Phrase(phrase.clone()))),
193                        index + 2,
194                    ),
195                    Some(Token::Word(next)) if next != "OR" && next != "NOT" => (
196                        TextQuery::Not(Box::new(TextQuery::Term(next.clone()))),
197                        index + 2,
198                    ),
199                    _ => (TextQuery::Term("NOT".to_owned()), index + 1),
200                }
201            } else {
202                (TextQuery::Term("NOT".to_owned()), index + 1)
203            }
204        }
205        Some(Token::Word(word)) => (TextQuery::Term(word.clone()), index + 1),
206        None => (TextQuery::Empty, index),
207    }
208}
209
210fn normalize_and(mut nodes: Vec<TextQuery>) -> TextQuery {
211    match nodes.len() {
212        0 => TextQuery::Empty,
213        1 => nodes.pop().unwrap_or(TextQuery::Empty),
214        _ => TextQuery::And(nodes),
215    }
216}
217
#[cfg(test)]
mod tests {
    use super::{TextQuery, render_text_query_fts5};

    /// Build a literal term node.
    fn term(text: &str) -> TextQuery {
        TextQuery::Term(text.to_owned())
    }

    /// Build one literal term node per word, in order.
    fn terms(words: &[&str]) -> Vec<TextQuery> {
        words.iter().copied().map(term).collect()
    }

    /// Build a literal phrase node.
    fn phrase(text: &str) -> TextQuery {
        TextQuery::Phrase(text.to_owned())
    }

    /// Build the negation of `child`.
    fn not(child: TextQuery) -> TextQuery {
        TextQuery::Not(Box::new(child))
    }

    #[test]
    fn parse_empty_query() {
        for raw in ["", "   "] {
            assert_eq!(TextQuery::parse(raw), TextQuery::Empty);
        }
    }

    #[test]
    fn parse_plain_terms_as_implicit_and() {
        let expected = TextQuery::And(terms(&["budget", "meeting"]));
        assert_eq!(TextQuery::parse("budget meeting"), expected);
    }

    #[test]
    fn parse_phrase() {
        let expected = phrase("release notes");
        assert_eq!(TextQuery::parse(r#""release notes""#), expected);
    }

    #[test]
    fn parse_or_operator() {
        let expected = TextQuery::Or(terms(&["ship", "docs"]));
        assert_eq!(TextQuery::parse("ship OR docs"), expected);
    }

    #[test]
    fn parse_not_operator() {
        let expected = TextQuery::And(vec![term("ship"), not(term("blocked"))]);
        assert_eq!(TextQuery::parse("ship NOT blocked"), expected);
    }

    #[test]
    fn parse_leading_not_as_literal() {
        let expected = TextQuery::And(terms(&["NOT", "blocked"]));
        assert_eq!(TextQuery::parse("NOT blocked"), expected);
    }

    #[test]
    fn parse_not_after_or_as_literal() {
        let expected = TextQuery::And(terms(&["ship", "OR", "NOT", "blocked"]));
        assert_eq!(TextQuery::parse("ship OR NOT blocked"), expected);
    }

    #[test]
    fn parse_lowercase_or_as_literal() {
        let expected = TextQuery::And(terms(&["ship", "or", "docs"]));
        assert_eq!(TextQuery::parse("ship or docs"), expected);
    }

    #[test]
    fn parse_lowercase_not_as_literal() {
        let expected = TextQuery::And(terms(&["not", "a", "ship"]));
        assert_eq!(TextQuery::parse("not a ship"), expected);
    }

    #[test]
    fn parse_trailing_or_as_literal() {
        let expected = TextQuery::And(terms(&["ship", "OR"]));
        assert_eq!(TextQuery::parse("ship OR"), expected);
    }

    #[test]
    fn parse_apostrophe_as_literal_term() {
        let expected = TextQuery::And(terms(&["User's", "name"]));
        assert_eq!(TextQuery::parse("User's name"), expected);
    }

    #[test]
    fn parse_unsupported_column_filter_as_literal() {
        assert_eq!(TextQuery::parse("col:value"), term("col:value"));
    }

    #[test]
    fn parse_unsupported_prefix_as_literal() {
        assert_eq!(TextQuery::parse("prefix*"), term("prefix*"));
    }

    #[test]
    fn parse_near_as_literal() {
        let expected = TextQuery::And(terms(&["a", "NEAR", "b"]));
        assert_eq!(TextQuery::parse("a NEAR b"), expected);
    }

    #[test]
    fn parse_explicit_and_as_literal() {
        let expected = TextQuery::Or(vec![
            TextQuery::And(terms(&["cats", "AND", "dogs"])),
            term("fish"),
        ]);
        assert_eq!(TextQuery::parse("cats AND dogs OR fish"), expected);
    }

    #[test]
    fn render_term_query() {
        let rendered = render_text_query_fts5(&term("budget"));
        assert_eq!(rendered, r#""budget""#);
    }

    #[test]
    fn render_phrase_query() {
        let rendered = render_text_query_fts5(&phrase("release notes"));
        assert_eq!(rendered, r#""release notes""#);
    }

    #[test]
    fn render_or_query() {
        let query = TextQuery::Or(terms(&["ship", "docs"]));
        assert_eq!(render_text_query_fts5(&query), r#""ship" OR "docs""#);
    }

    #[test]
    fn render_not_query() {
        let query = TextQuery::And(vec![term("ship"), not(term("blocked"))]);
        assert_eq!(render_text_query_fts5(&query), r#""ship" NOT "blocked""#);
    }

    #[test]
    fn render_escapes_embedded_quotes() {
        let rendered = render_text_query_fts5(&term(r#"say "hello""#));
        assert_eq!(rendered, r#""say ""hello""""#);
    }

    #[test]
    fn render_leading_not_literalized_parse_safely() {
        let rendered = render_text_query_fts5(&TextQuery::parse("NOT blocked"));
        assert_eq!(rendered, r#""NOT" "blocked""#);
    }

    #[test]
    fn render_lowercase_not_as_literal_terms() {
        let rendered = render_text_query_fts5(&TextQuery::parse("not a ship"));
        assert_eq!(rendered, r#""not" "a" "ship""#);
    }
}