hermes_core/dsl/ql/mod.rs

//! Query language parser using pest
//!
//! Supports:
//! - Term queries: `rust` or `title:rust`
//! - Phrase queries: `"hello world"` or `title:"hello world"`
//! - Boolean operators: `AND`, `OR`, `NOT` (or `&&`, `||`, `-`)
//! - Grouping: `(rust OR python) AND programming`
//! - Default fields for unqualified terms
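//!
//! A minimal usage sketch (the `title`/`body` fields and the query string are
//! illustrative, not fixed by this module):
//!
//! ```ignore
//! let parser = QueryLanguageParser::new(schema, vec![title, body], tokenizers);
//! // AND/OR/grouping follow the grammar above; unqualified terms hit the default fields.
//! let query = parser.parse("title:rust AND (async OR \"zero cost\")")?;
//! ```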

use pest::Parser;
use pest_derive::Parser;
use std::sync::Arc;

use super::query_field_router::{QueryFieldRouter, RoutingMode};
use super::schema::{Field, Schema};
use crate::query::{BooleanQuery, Query, TermQuery};
use crate::tokenizer::{BoxedTokenizer, TokenizerRegistry};

#[derive(Parser)]
#[grammar = "dsl/ql/ql.pest"]
struct QueryParser;

/// Parsed query that can be converted to a Query trait object
#[derive(Debug, Clone)]
pub enum ParsedQuery {
    Term {
        field: Option<String>,
        term: String,
    },
    Phrase {
        field: Option<String>,
        phrase: String,
    },
    And(Vec<ParsedQuery>),
    Or(Vec<ParsedQuery>),
    Not(Box<ParsedQuery>),
}
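
// Shape sketch: `title:rust AND (async OR "zero cost")` parses to roughly
// And([Term { field: Some("title"), term: "rust" },
//      Or([Term { field: None, term: "async" },
//          Phrase { field: None, phrase: "zero cost" }])])
// (illustrative; exact nesting depends on the grammar in ql.pest).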

/// Query language parser with schema awareness
pub struct QueryLanguageParser {
    schema: Arc<Schema>,
    default_fields: Vec<Field>,
    tokenizers: Arc<TokenizerRegistry>,
    /// Optional query field router for routing queries based on regex patterns
    field_router: Option<QueryFieldRouter>,
}

impl QueryLanguageParser {
    pub fn new(
        schema: Arc<Schema>,
        default_fields: Vec<Field>,
        tokenizers: Arc<TokenizerRegistry>,
    ) -> Self {
        Self {
            schema,
            default_fields,
            tokenizers,
            field_router: None,
        }
    }

    /// Create a parser with a query field router
    pub fn with_router(
        schema: Arc<Schema>,
        default_fields: Vec<Field>,
        tokenizers: Arc<TokenizerRegistry>,
        router: QueryFieldRouter,
    ) -> Self {
        Self {
            schema,
            default_fields,
            tokenizers,
            field_router: Some(router),
        }
    }

    /// Set the query field router
    pub fn set_router(&mut self, router: QueryFieldRouter) {
        self.field_router = Some(router);
    }

    /// Get the query field router
    pub fn router(&self) -> Option<&QueryFieldRouter> {
        self.field_router.as_ref()
    }

    /// Parse a query string into a Query
    ///
    /// Supports query language syntax (field:term, AND, OR, NOT, grouping)
    /// and plain text (tokenized and searched across default fields).
    ///
    /// If a query field router is configured, the query is first checked against
    /// routing rules. If a rule matches:
    /// - In exclusive mode: only the target field is queried with the substituted value
    /// - In additional mode: both the target field and default fields are queried
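    ///
    /// A minimal sketch of the routed case (the DOI rule and `uri` field mirror the
    /// tests below and are illustrative):
    ///
    /// ```ignore
    /// // With an exclusive rule mapping `doi:...` onto the `uri` field,
    /// // only `uri` is searched; without a matching rule, parsing proceeds normally.
    /// let query = parser.parse("doi:10.1234/example")?;
    /// ```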
    pub fn parse(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
        let query_str = query_str.trim();
        if query_str.is_empty() {
            return Err("Empty query".to_string());
        }

        // Check if query matches any routing rules
        if let Some(router) = &self.field_router
            && let Some(routed) = router.route(query_str)
        {
            return self.build_routed_query(
                &routed.query,
                &routed.target_field,
                routed.mode,
                query_str,
            );
        }

        // No routing match - parse normally
        self.parse_normal(query_str)
    }

    /// Build a query from a routed match
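    ///
    /// In `Exclusive` mode this returns just the target-field query; in `Additional`
    /// mode it returns a SHOULD combination of the target-field query and the original
    /// query parsed against the default fields.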
    fn build_routed_query(
        &self,
        routed_query: &str,
        target_field: &str,
        mode: RoutingMode,
        original_query: &str,
    ) -> Result<Box<dyn Query>, String> {
        // Validate target field exists
        let _field_id = self
            .schema
            .get_field(target_field)
            .ok_or_else(|| format!("Unknown target field: {}", target_field))?;

        // Build query for the target field with the substituted value
        let target_query = self.build_term_query(Some(target_field), routed_query)?;

        match mode {
            RoutingMode::Exclusive => {
                // Only query the target field
                Ok(target_query)
            }
            RoutingMode::Additional => {
                // Query both target field and default fields
                let mut bool_query = BooleanQuery::new();
                bool_query = bool_query.should(target_query);

                // Also parse the original query against default fields
                if let Ok(default_query) = self.parse_normal(original_query) {
                    bool_query = bool_query.should(default_query);
                }

                Ok(Box::new(bool_query))
            }
        }
    }

    /// Parse query without routing (normal parsing path)
    fn parse_normal(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
        // Try parsing as query language first
        match self.parse_query_string(query_str) {
            Ok(parsed) => self.build_query(&parsed),
            Err(_) => {
                // If grammar parsing fails, treat the input as plain text:
                // tokenize it and OR the terms across the default fields
                self.parse_plain_text(query_str)
            }
        }
    }

    /// Parse plain text as implicit OR of tokenized terms
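    ///
    /// For example, with default fields `title` and `body` (illustrative), "hello world"
    /// becomes SHOULD(title:hello, body:hello, title:world, body:world).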
    fn parse_plain_text(&self, text: &str) -> Result<Box<dyn Query>, String> {
        if self.default_fields.is_empty() {
            return Err("No default fields configured".to_string());
        }

        let tokenizer = self.get_tokenizer(self.default_fields[0]);
        let tokens: Vec<String> = tokenizer
            .tokenize(text)
            .into_iter()
            .map(|t| t.text.to_lowercase())
            .collect();

        if tokens.is_empty() {
            return Err("No tokens in query".to_string());
        }

        let mut bool_query = BooleanQuery::new();
        for token in &tokens {
            for &field_id in &self.default_fields {
                bool_query = bool_query.should(TermQuery::text(field_id, token));
            }
        }
        Ok(Box::new(bool_query))
    }

    fn parse_query_string(&self, query_str: &str) -> Result<ParsedQuery, String> {
        let pairs = QueryParser::parse(Rule::query, query_str)
            .map_err(|e| format!("Parse error: {}", e))?;

        let query_pair = pairs.into_iter().next().ok_or("No query found")?;

        // query = { SOI ~ or_expr ~ EOI }
        self.parse_or_expr(query_pair.into_inner().next().unwrap())
    }

    fn parse_or_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
        let mut inner = pair.into_inner();
        let first = self.parse_and_expr(inner.next().unwrap())?;

        let rest: Vec<ParsedQuery> = inner
            .filter(|p| p.as_rule() == Rule::and_expr)
            .map(|p| self.parse_and_expr(p))
            .collect::<Result<Vec<_>, _>>()?;

        if rest.is_empty() {
            Ok(first)
        } else {
            let mut all = vec![first];
            all.extend(rest);
            Ok(ParsedQuery::Or(all))
        }
    }

    fn parse_and_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
        let mut inner = pair.into_inner();
        let first = self.parse_primary(inner.next().unwrap())?;

        let rest: Vec<ParsedQuery> = inner
            .filter(|p| p.as_rule() == Rule::primary)
            .map(|p| self.parse_primary(p))
            .collect::<Result<Vec<_>, _>>()?;

        if rest.is_empty() {
            Ok(first)
        } else {
            let mut all = vec![first];
            all.extend(rest);
            Ok(ParsedQuery::And(all))
        }
    }

    fn parse_primary(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
        let mut negated = false;
        let mut inner_query = None;

        for inner in pair.into_inner() {
            match inner.as_rule() {
                Rule::not_op => negated = true,
                Rule::group => {
                    let or_expr = inner.into_inner().next().unwrap();
                    inner_query = Some(self.parse_or_expr(or_expr)?);
                }
                Rule::phrase_query => {
                    inner_query = Some(self.parse_phrase_query(inner)?);
                }
                Rule::term_query => {
                    inner_query = Some(self.parse_term_query(inner)?);
                }
                _ => {}
            }
        }

        let query = inner_query.ok_or("No query in primary")?;

        if negated {
            Ok(ParsedQuery::Not(Box::new(query)))
        } else {
            Ok(query)
        }
    }

    fn parse_term_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
        let mut field = None;
        let mut term = String::new();

        for inner in pair.into_inner() {
            match inner.as_rule() {
                Rule::field_spec => {
                    field = Some(inner.into_inner().next().unwrap().as_str().to_string());
                }
                Rule::term => {
                    term = inner.as_str().to_string();
                }
                _ => {}
            }
        }

        Ok(ParsedQuery::Term { field, term })
    }

    fn parse_phrase_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
        let mut field = None;
        let mut phrase = String::new();

        for inner in pair.into_inner() {
            match inner.as_rule() {
                Rule::field_spec => {
                    field = Some(inner.into_inner().next().unwrap().as_str().to_string());
                }
                Rule::quoted_string => {
                    let s = inner.as_str();
                    phrase = s[1..s.len() - 1].to_string();
                }
                _ => {}
            }
        }

        Ok(ParsedQuery::Phrase { field, phrase })
    }

    fn build_query(&self, parsed: &ParsedQuery) -> Result<Box<dyn Query>, String> {
        match parsed {
            ParsedQuery::Term { field, term } => self.build_term_query(field.as_deref(), term),
            ParsedQuery::Phrase { field, phrase } => {
                self.build_phrase_query(field.as_deref(), phrase)
            }
            ParsedQuery::And(queries) => {
                let mut bool_query = BooleanQuery::new();
                for q in queries {
                    bool_query = bool_query.must(self.build_query(q)?);
                }
                Ok(Box::new(bool_query))
            }
            ParsedQuery::Or(queries) => {
                let mut bool_query = BooleanQuery::new();
                for q in queries {
                    bool_query = bool_query.should(self.build_query(q)?);
                }
                Ok(Box::new(bool_query))
            }
            ParsedQuery::Not(inner) => {
                // A bare NOT has no positive clauses of its own: build a BooleanQuery
                // with only a must_not clause and let the caller supply the context
                let mut bool_query = BooleanQuery::new();
                bool_query = bool_query.must_not(self.build_query(inner)?);
                Ok(Box::new(bool_query))
            }
        }
    }

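    /// Build a term query, using the field's tokenizer when a field is given and
    /// the default fields otherwise.
    ///
    /// For instance (illustrative field names), `title:rust-lang` may tokenize into
    /// two tokens and become MUST(title:rust, title:lang), while an unqualified term
    /// fans out as SHOULD clauses across every default field.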
    fn build_term_query(&self, field: Option<&str>, term: &str) -> Result<Box<dyn Query>, String> {
        if let Some(field_name) = field {
            // Field-qualified term: tokenize using field's tokenizer
            let field_id = self
                .schema
                .get_field(field_name)
                .ok_or_else(|| format!("Unknown field: {}", field_name))?;
            let tokenizer = self.get_tokenizer(field_id);
            let tokens: Vec<String> = tokenizer
                .tokenize(term)
                .into_iter()
                .map(|t| t.text.to_lowercase())
                .collect();

            if tokens.is_empty() {
                return Err("No tokens in term".to_string());
            }

            if tokens.len() == 1 {
                Ok(Box::new(TermQuery::text(field_id, &tokens[0])))
            } else {
                // Multiple tokens from single term - AND them together
                let mut bool_query = BooleanQuery::new();
                for token in &tokens {
                    bool_query = bool_query.must(TermQuery::text(field_id, token));
                }
                Ok(Box::new(bool_query))
            }
        } else if !self.default_fields.is_empty() {
            // Unqualified term: tokenize and search across default fields
            let tokenizer = self.get_tokenizer(self.default_fields[0]);
            let tokens: Vec<String> = tokenizer
                .tokenize(term)
                .into_iter()
                .map(|t| t.text.to_lowercase())
                .collect();

            if tokens.is_empty() {
                return Err("No tokens in term".to_string());
            }

            // Build SHOULD query across all default fields for each token
            let mut bool_query = BooleanQuery::new();
            for token in &tokens {
                for &field_id in &self.default_fields {
                    bool_query = bool_query.should(TermQuery::text(field_id, token));
                }
            }
            Ok(Box::new(bool_query))
        } else {
            Err("No field specified and no default fields configured".to_string())
        }
    }

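    /// Build a phrase query as an AND of its tokens (simplified matching: token
    /// positions are not checked).
    ///
    /// For example (illustrative), `"hello world"` against the `title` field becomes
    /// MUST(title:hello, title:world); with no field and several default fields, that
    /// AND is built per field and the per-field queries are OR'd together.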
    fn build_phrase_query(
        &self,
        field: Option<&str>,
        phrase: &str,
    ) -> Result<Box<dyn Query>, String> {
        // For phrase queries, tokenize and create AND query of terms
        let field_id = if let Some(field_name) = field {
            self.schema
                .get_field(field_name)
                .ok_or_else(|| format!("Unknown field: {}", field_name))?
        } else if !self.default_fields.is_empty() {
            self.default_fields[0]
        } else {
            return Err("No field specified and no default fields configured".to_string());
        };

        let tokenizer = self.get_tokenizer(field_id);
        let tokens: Vec<String> = tokenizer
            .tokenize(phrase)
            .into_iter()
            .map(|t| t.text.to_lowercase())
            .collect();

        if tokens.is_empty() {
            return Err("No tokens in phrase".to_string());
        }

        if tokens.len() == 1 {
            return Ok(Box::new(TermQuery::text(field_id, &tokens[0])));
        }

        // Create AND query for all tokens (simplified phrase matching)
        let mut bool_query = BooleanQuery::new();
        for token in &tokens {
            bool_query = bool_query.must(TermQuery::text(field_id, token));
        }

        // If no field specified and multiple default fields, wrap in OR
        if field.is_none() && self.default_fields.len() > 1 {
            let mut outer = BooleanQuery::new();
            for &f in &self.default_fields {
                let tokenizer = self.get_tokenizer(f);
                let tokens: Vec<String> = tokenizer
                    .tokenize(phrase)
                    .into_iter()
                    .map(|t| t.text.to_lowercase())
                    .collect();

                let mut field_query = BooleanQuery::new();
                for token in &tokens {
                    field_query = field_query.must(TermQuery::text(f, token));
                }
                outer = outer.should(field_query);
            }
            return Ok(Box::new(outer));
        }

        Ok(Box::new(bool_query))
    }

    fn get_tokenizer(&self, field: Field) -> BoxedTokenizer {
        // Get tokenizer name from schema field entry, falling back to "default"
        let tokenizer_name = self
            .schema
            .get_field_entry(field)
            .and_then(|entry| entry.tokenizer.as_deref())
            .unwrap_or("default");

        self.tokenizers
            .get(tokenizer_name)
            .unwrap_or_else(|| Box::new(crate::tokenizer::LowercaseTokenizer))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::SchemaBuilder;
    use crate::tokenizer::TokenizerRegistry;

    fn setup() -> (Arc<Schema>, Vec<Field>, Arc<TokenizerRegistry>) {
        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let body = builder.add_text_field("body", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());
        (schema, vec![title, body], tokenizers)
    }

    #[test]
    fn test_simple_term() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should parse without error - creates BooleanQuery across default fields
        let _query = parser.parse("rust").unwrap();
    }

    #[test]
    fn test_field_term() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should parse field:term syntax
        let _query = parser.parse("title:rust").unwrap();
    }

    #[test]
    fn test_boolean_and() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should parse AND boolean query
        let _query = parser.parse("rust AND programming").unwrap();
    }

    #[test]
    fn test_match_query() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should tokenize and create boolean query
        let _query = parser.parse("hello world").unwrap();
    }

    #[test]
    fn test_phrase_query() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should parse quoted phrase
        let _query = parser.parse("\"hello world\"").unwrap();
    }

    #[test]
    fn test_boolean_or() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should parse OR boolean query
        let _query = parser.parse("rust OR python").unwrap();
    }

    #[test]
    fn test_complex_query() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        // Should parse complex boolean with grouping
        let _query = parser.parse("(rust OR python) AND programming").unwrap();
    }

    #[test]
    fn test_router_exclusive_mode() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let _title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"^doi:(10\.\d{4,}/[^\s]+)$".to_string(),
            substitution: "doi://{1}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Exclusive,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![], tokenizers, router);

        // Should route DOI query to uri field
        let _query = parser.parse("doi:10.1234/test.123").unwrap();
    }

    #[test]
    fn test_router_additional_mode() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"#(\d+)".to_string(),
            substitution: "{1}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Additional,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![title], tokenizers, router);

        // Should route to both uri field and default fields
        let _query = parser.parse("#42").unwrap();
    }

    #[test]
    fn test_router_no_match_falls_through() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"^doi:".to_string(),
            substitution: "{0}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Exclusive,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![title], tokenizers, router);

        // Should NOT match and fall through to normal parsing
        let _query = parser.parse("rust programming").unwrap();
    }

    #[test]
    fn test_router_invalid_target_field() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let _title = builder.add_text_field("title", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"test".to_string(),
            substitution: "{0}".to_string(),
            target_field: "nonexistent".to_string(),
            mode: RoutingMode::Exclusive,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![], tokenizers, router);

        // Should fail because target field doesn't exist
        let result = parser.parse("test");
        assert!(result.is_err());
        let err = result.err().unwrap();
        assert!(err.contains("Unknown target field"));
    }
}
641}