hermes_core/dsl/ql/
mod.rs

1//! Query language parser using pest
2//!
3//! Supports:
4//! - Term queries: `rust` or `title:rust`
5//! - Phrase queries: `"hello world"` or `title:"hello world"`
6//! - Boolean operators: `AND`, `OR`, `NOT` (or `&&`, `||`, `-`)
7//! - Grouping: `(rust OR python) AND programming`
8//! - Default fields for unqualified terms
9
10use pest::Parser;
11use pest_derive::Parser;
12use std::sync::Arc;
13
14use super::query_field_router::{QueryFieldRouter, RoutingMode};
15use super::schema::{Field, Schema};
16use crate::query::{BooleanQuery, Query, TermQuery};
17use crate::tokenizer::{BoxedTokenizer, TokenizerRegistry};
18
/// Pest-generated parser type; the grammar rules (`Rule::query`, `Rule::or_expr`,
/// etc.) are derived at compile time from `dsl/ql/ql.pest`.
#[derive(Parser)]
#[grammar = "dsl/ql/ql.pest"]
struct QueryParser;
22
/// Parsed query that can be converted to a Query trait object
///
/// Intermediate AST produced by the pest grammar, before schema-aware
/// query construction resolves field names and tokenizes terms.
#[derive(Debug, Clone)]
pub enum ParsedQuery {
    /// Single term, optionally field-qualified (`field:term`).
    Term {
        field: Option<String>,
        term: String,
    },
    /// Quoted phrase, optionally field-qualified (`field:"..."`);
    /// the surrounding quotes are already stripped.
    Phrase {
        field: Option<String>,
        phrase: String,
    },
    /// Conjunction of sub-queries (`AND` / `&&`).
    And(Vec<ParsedQuery>),
    /// Disjunction of sub-queries (`OR` / `||`).
    Or(Vec<ParsedQuery>),
    /// Negated sub-query (`NOT` / `-`).
    Not(Box<ParsedQuery>),
}
38
/// Query language parser with schema awareness
pub struct QueryLanguageParser {
    /// Schema used to resolve `field:` qualifiers to field ids.
    schema: Arc<Schema>,
    /// Fields searched when a term carries no explicit field qualifier.
    default_fields: Vec<Field>,
    /// Registry used to look up each field's configured tokenizer.
    tokenizers: Arc<TokenizerRegistry>,
    /// Optional query field router for routing queries based on regex patterns
    field_router: Option<QueryFieldRouter>,
}
47
48impl QueryLanguageParser {
49    pub fn new(
50        schema: Arc<Schema>,
51        default_fields: Vec<Field>,
52        tokenizers: Arc<TokenizerRegistry>,
53    ) -> Self {
54        Self {
55            schema,
56            default_fields,
57            tokenizers,
58            field_router: None,
59        }
60    }
61
62    /// Create a parser with a query field router
63    pub fn with_router(
64        schema: Arc<Schema>,
65        default_fields: Vec<Field>,
66        tokenizers: Arc<TokenizerRegistry>,
67        router: QueryFieldRouter,
68    ) -> Self {
69        Self {
70            schema,
71            default_fields,
72            tokenizers,
73            field_router: Some(router),
74        }
75    }
76
77    /// Set the query field router
78    pub fn set_router(&mut self, router: QueryFieldRouter) {
79        self.field_router = Some(router);
80    }
81
82    /// Get the query field router
83    pub fn router(&self) -> Option<&QueryFieldRouter> {
84        self.field_router.as_ref()
85    }
86
87    /// Parse a query string into a Query
88    ///
89    /// Supports query language syntax (field:term, AND, OR, NOT, grouping)
90    /// and plain text (tokenized and searched across default fields).
91    ///
92    /// If a query field router is configured, the query is first checked against
93    /// routing rules. If a rule matches:
94    /// - In exclusive mode: only the target field is queried with the substituted value
95    /// - In additional mode: both the target field and default fields are queried
96    pub fn parse(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
97        let query_str = query_str.trim();
98        if query_str.is_empty() {
99            return Err("Empty query".to_string());
100        }
101
102        // Check if query matches any routing rules
103        if let Some(router) = &self.field_router {
104            if let Some(routed) = router.route(query_str) {
105                return self.build_routed_query(&routed.query, &routed.target_field, routed.mode, query_str);
106            }
107        }
108
109        // No routing match - parse normally
110        self.parse_normal(query_str)
111    }
112
113    /// Build a query from a routed match
114    fn build_routed_query(
115        &self,
116        routed_query: &str,
117        target_field: &str,
118        mode: RoutingMode,
119        original_query: &str,
120    ) -> Result<Box<dyn Query>, String> {
121        // Validate target field exists
122        let _field_id = self
123            .schema
124            .get_field(target_field)
125            .ok_or_else(|| format!("Unknown target field: {}", target_field))?;
126
127        // Build query for the target field with the substituted value
128        let target_query = self.build_term_query(Some(target_field), routed_query)?;
129
130        match mode {
131            RoutingMode::Exclusive => {
132                // Only query the target field
133                Ok(target_query)
134            }
135            RoutingMode::Additional => {
136                // Query both target field and default fields
137                let mut bool_query = BooleanQuery::new();
138                bool_query = bool_query.should(target_query);
139
140                // Also parse the original query against default fields
141                if let Ok(default_query) = self.parse_normal(original_query) {
142                    bool_query = bool_query.should(default_query);
143                }
144
145                Ok(Box::new(bool_query))
146            }
147        }
148    }
149
150    /// Parse query without routing (normal parsing path)
151    fn parse_normal(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
152        // Try parsing as query language first
153        match self.parse_query_string(query_str) {
154            Ok(parsed) => self.build_query(&parsed),
155            Err(_) => {
156                // If grammar parsing fails, treat as plain text
157                // Split by whitespace and create OR of terms
158                self.parse_plain_text(query_str)
159            }
160        }
161    }
162
163    /// Parse plain text as implicit OR of tokenized terms
164    fn parse_plain_text(&self, text: &str) -> Result<Box<dyn Query>, String> {
165        if self.default_fields.is_empty() {
166            return Err("No default fields configured".to_string());
167        }
168
169        let tokenizer = self.get_tokenizer(self.default_fields[0]);
170        let tokens: Vec<String> = tokenizer
171            .tokenize(text)
172            .into_iter()
173            .map(|t| t.text.to_lowercase())
174            .collect();
175
176        if tokens.is_empty() {
177            return Err("No tokens in query".to_string());
178        }
179
180        let mut bool_query = BooleanQuery::new();
181        for token in &tokens {
182            for &field_id in &self.default_fields {
183                bool_query = bool_query.should(TermQuery::text(field_id, token));
184            }
185        }
186        Ok(Box::new(bool_query))
187    }
188
189    fn parse_query_string(&self, query_str: &str) -> Result<ParsedQuery, String> {
190        let pairs = QueryParser::parse(Rule::query, query_str)
191            .map_err(|e| format!("Parse error: {}", e))?;
192
193        let query_pair = pairs.into_iter().next().ok_or("No query found")?;
194
195        // query = { SOI ~ or_expr ~ EOI }
196        self.parse_or_expr(query_pair.into_inner().next().unwrap())
197    }
198
199    fn parse_or_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
200        let mut inner = pair.into_inner();
201        let first = self.parse_and_expr(inner.next().unwrap())?;
202
203        let rest: Vec<ParsedQuery> = inner
204            .filter(|p| p.as_rule() == Rule::and_expr)
205            .map(|p| self.parse_and_expr(p))
206            .collect::<Result<Vec<_>, _>>()?;
207
208        if rest.is_empty() {
209            Ok(first)
210        } else {
211            let mut all = vec![first];
212            all.extend(rest);
213            Ok(ParsedQuery::Or(all))
214        }
215    }
216
217    fn parse_and_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
218        let mut inner = pair.into_inner();
219        let first = self.parse_primary(inner.next().unwrap())?;
220
221        let rest: Vec<ParsedQuery> = inner
222            .filter(|p| p.as_rule() == Rule::primary)
223            .map(|p| self.parse_primary(p))
224            .collect::<Result<Vec<_>, _>>()?;
225
226        if rest.is_empty() {
227            Ok(first)
228        } else {
229            let mut all = vec![first];
230            all.extend(rest);
231            Ok(ParsedQuery::And(all))
232        }
233    }
234
235    fn parse_primary(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
236        let mut negated = false;
237        let mut inner_query = None;
238
239        for inner in pair.into_inner() {
240            match inner.as_rule() {
241                Rule::not_op => negated = true,
242                Rule::group => {
243                    let or_expr = inner.into_inner().next().unwrap();
244                    inner_query = Some(self.parse_or_expr(or_expr)?);
245                }
246                Rule::phrase_query => {
247                    inner_query = Some(self.parse_phrase_query(inner)?);
248                }
249                Rule::term_query => {
250                    inner_query = Some(self.parse_term_query(inner)?);
251                }
252                _ => {}
253            }
254        }
255
256        let query = inner_query.ok_or("No query in primary")?;
257
258        if negated {
259            Ok(ParsedQuery::Not(Box::new(query)))
260        } else {
261            Ok(query)
262        }
263    }
264
265    fn parse_term_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
266        let mut field = None;
267        let mut term = String::new();
268
269        for inner in pair.into_inner() {
270            match inner.as_rule() {
271                Rule::field_spec => {
272                    field = Some(inner.into_inner().next().unwrap().as_str().to_string());
273                }
274                Rule::term => {
275                    term = inner.as_str().to_string();
276                }
277                _ => {}
278            }
279        }
280
281        Ok(ParsedQuery::Term { field, term })
282    }
283
284    fn parse_phrase_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
285        let mut field = None;
286        let mut phrase = String::new();
287
288        for inner in pair.into_inner() {
289            match inner.as_rule() {
290                Rule::field_spec => {
291                    field = Some(inner.into_inner().next().unwrap().as_str().to_string());
292                }
293                Rule::quoted_string => {
294                    let s = inner.as_str();
295                    phrase = s[1..s.len() - 1].to_string();
296                }
297                _ => {}
298            }
299        }
300
301        Ok(ParsedQuery::Phrase { field, phrase })
302    }
303
304    fn build_query(&self, parsed: &ParsedQuery) -> Result<Box<dyn Query>, String> {
305        match parsed {
306            ParsedQuery::Term { field, term } => self.build_term_query(field.as_deref(), term),
307            ParsedQuery::Phrase { field, phrase } => {
308                self.build_phrase_query(field.as_deref(), phrase)
309            }
310            ParsedQuery::And(queries) => {
311                let mut bool_query = BooleanQuery::new();
312                for q in queries {
313                    bool_query = bool_query.must(self.build_query(q)?);
314                }
315                Ok(Box::new(bool_query))
316            }
317            ParsedQuery::Or(queries) => {
318                let mut bool_query = BooleanQuery::new();
319                for q in queries {
320                    bool_query = bool_query.should(self.build_query(q)?);
321                }
322                Ok(Box::new(bool_query))
323            }
324            ParsedQuery::Not(inner) => {
325                // NOT query needs a context - wrap in a match-all with must_not
326                let mut bool_query = BooleanQuery::new();
327                bool_query = bool_query.must_not(self.build_query(inner)?);
328                Ok(Box::new(bool_query))
329            }
330        }
331    }
332
333    fn build_term_query(&self, field: Option<&str>, term: &str) -> Result<Box<dyn Query>, String> {
334        if let Some(field_name) = field {
335            // Field-qualified term: tokenize using field's tokenizer
336            let field_id = self
337                .schema
338                .get_field(field_name)
339                .ok_or_else(|| format!("Unknown field: {}", field_name))?;
340            let tokenizer = self.get_tokenizer(field_id);
341            let tokens: Vec<String> = tokenizer
342                .tokenize(term)
343                .into_iter()
344                .map(|t| t.text.to_lowercase())
345                .collect();
346
347            if tokens.is_empty() {
348                return Err("No tokens in term".to_string());
349            }
350
351            if tokens.len() == 1 {
352                Ok(Box::new(TermQuery::text(field_id, &tokens[0])))
353            } else {
354                // Multiple tokens from single term - AND them together
355                let mut bool_query = BooleanQuery::new();
356                for token in &tokens {
357                    bool_query = bool_query.must(TermQuery::text(field_id, token));
358                }
359                Ok(Box::new(bool_query))
360            }
361        } else if !self.default_fields.is_empty() {
362            // Unqualified term: tokenize and search across default fields
363            let tokenizer = self.get_tokenizer(self.default_fields[0]);
364            let tokens: Vec<String> = tokenizer
365                .tokenize(term)
366                .into_iter()
367                .map(|t| t.text.to_lowercase())
368                .collect();
369
370            if tokens.is_empty() {
371                return Err("No tokens in term".to_string());
372            }
373
374            // Build SHOULD query across all default fields for each token
375            let mut bool_query = BooleanQuery::new();
376            for token in &tokens {
377                for &field_id in &self.default_fields {
378                    bool_query = bool_query.should(TermQuery::text(field_id, token));
379                }
380            }
381            Ok(Box::new(bool_query))
382        } else {
383            Err("No field specified and no default fields configured".to_string())
384        }
385    }
386
387    fn build_phrase_query(
388        &self,
389        field: Option<&str>,
390        phrase: &str,
391    ) -> Result<Box<dyn Query>, String> {
392        // For phrase queries, tokenize and create AND query of terms
393        let field_id = if let Some(field_name) = field {
394            self.schema
395                .get_field(field_name)
396                .ok_or_else(|| format!("Unknown field: {}", field_name))?
397        } else if !self.default_fields.is_empty() {
398            self.default_fields[0]
399        } else {
400            return Err("No field specified and no default fields configured".to_string());
401        };
402
403        let tokenizer = self.get_tokenizer(field_id);
404        let tokens: Vec<String> = tokenizer
405            .tokenize(phrase)
406            .into_iter()
407            .map(|t| t.text.to_lowercase())
408            .collect();
409
410        if tokens.is_empty() {
411            return Err("No tokens in phrase".to_string());
412        }
413
414        if tokens.len() == 1 {
415            return Ok(Box::new(TermQuery::text(field_id, &tokens[0])));
416        }
417
418        // Create AND query for all tokens (simplified phrase matching)
419        let mut bool_query = BooleanQuery::new();
420        for token in &tokens {
421            bool_query = bool_query.must(TermQuery::text(field_id, token));
422        }
423
424        // If no field specified and multiple default fields, wrap in OR
425        if field.is_none() && self.default_fields.len() > 1 {
426            let mut outer = BooleanQuery::new();
427            for &f in &self.default_fields {
428                let tokenizer = self.get_tokenizer(f);
429                let tokens: Vec<String> = tokenizer
430                    .tokenize(phrase)
431                    .into_iter()
432                    .map(|t| t.text.to_lowercase())
433                    .collect();
434
435                let mut field_query = BooleanQuery::new();
436                for token in &tokens {
437                    field_query = field_query.must(TermQuery::text(f, token));
438                }
439                outer = outer.should(field_query);
440            }
441            return Ok(Box::new(outer));
442        }
443
444        Ok(Box::new(bool_query))
445    }
446
447    fn get_tokenizer(&self, field: Field) -> BoxedTokenizer {
448        // Get tokenizer name from schema field entry, fallback to "default"
449        let tokenizer_name = self
450            .schema
451            .get_field_entry(field)
452            .and_then(|entry| entry.tokenizer.as_deref())
453            .unwrap_or("default");
454
455        self.tokenizers
456            .get(tokenizer_name)
457            .unwrap_or_else(|| Box::new(crate::tokenizer::LowercaseTokenizer))
458    }
459}
460
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::SchemaBuilder;
    use crate::tokenizer::TokenizerRegistry;

    /// Standard fixture: "title" and "body" text fields, both used as defaults.
    fn setup() -> (Arc<Schema>, Vec<Field>, Arc<TokenizerRegistry>) {
        let mut sb = SchemaBuilder::default();
        let title = sb.add_text_field("title", true, true);
        let body = sb.add_text_field("body", true, true);
        (
            Arc::new(sb.build()),
            vec![title, body],
            Arc::new(TokenizerRegistry::default()),
        )
    }

    /// Parser over the standard fixture with no routing configured.
    fn default_parser() -> QueryLanguageParser {
        let (schema, defaults, registry) = setup();
        QueryLanguageParser::new(schema, defaults, registry)
    }

    #[test]
    fn test_simple_term() {
        // A bare term expands to a BooleanQuery across the default fields.
        default_parser().parse("rust").unwrap();
    }

    #[test]
    fn test_field_term() {
        // `field:term` syntax resolves the field against the schema.
        default_parser().parse("title:rust").unwrap();
    }

    #[test]
    fn test_boolean_and() {
        // `AND` produces a boolean conjunction.
        default_parser().parse("rust AND programming").unwrap();
    }

    #[test]
    fn test_match_query() {
        // Plain text is tokenized into an implicit boolean query.
        default_parser().parse("hello world").unwrap();
    }

    #[test]
    fn test_phrase_query() {
        // A quoted phrase parses successfully.
        default_parser().parse("\"hello world\"").unwrap();
    }

    #[test]
    fn test_boolean_or() {
        // `OR` produces a boolean disjunction.
        default_parser().parse("rust OR python").unwrap();
    }

    #[test]
    fn test_complex_query() {
        // Grouping composes with boolean operators.
        default_parser().parse("(rust OR python) AND programming").unwrap();
    }

    #[test]
    fn test_router_exclusive_mode() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut sb = SchemaBuilder::default();
        sb.add_text_field("title", true, true);
        sb.add_text_field("uri", true, true);
        let schema = Arc::new(sb.build());

        let rule = QueryRouterRule {
            pattern: r"^doi:(10\.\d{4,}/[^\s]+)$".to_string(),
            substitution: "doi://{1}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Exclusive,
        };
        let router = QueryFieldRouter::from_rules(&[rule]).unwrap();

        let qp = QueryLanguageParser::with_router(
            schema,
            vec![],
            Arc::new(TokenizerRegistry::default()),
            router,
        );

        // A DOI-shaped query is rewritten and routed to the uri field.
        qp.parse("doi:10.1234/test.123").unwrap();
    }

    #[test]
    fn test_router_additional_mode() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut sb = SchemaBuilder::default();
        let title = sb.add_text_field("title", true, true);
        sb.add_text_field("uri", true, true);
        let schema = Arc::new(sb.build());

        let rule = QueryRouterRule {
            pattern: r"#(\d+)".to_string(),
            substitution: "{1}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Additional,
        };
        let router = QueryFieldRouter::from_rules(&[rule]).unwrap();

        let qp = QueryLanguageParser::with_router(
            schema,
            vec![title],
            Arc::new(TokenizerRegistry::default()),
            router,
        );

        // Routes to the uri field AND the default fields.
        qp.parse("#42").unwrap();
    }

    #[test]
    fn test_router_no_match_falls_through() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut sb = SchemaBuilder::default();
        let title = sb.add_text_field("title", true, true);
        sb.add_text_field("uri", true, true);
        let schema = Arc::new(sb.build());

        let rule = QueryRouterRule {
            pattern: r"^doi:".to_string(),
            substitution: "{0}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Exclusive,
        };
        let router = QueryFieldRouter::from_rules(&[rule]).unwrap();

        let qp = QueryLanguageParser::with_router(
            schema,
            vec![title],
            Arc::new(TokenizerRegistry::default()),
            router,
        );

        // No rule matches, so the query takes the normal parsing path.
        qp.parse("rust programming").unwrap();
    }

    #[test]
    fn test_router_invalid_target_field() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut sb = SchemaBuilder::default();
        sb.add_text_field("title", true, true);
        let schema = Arc::new(sb.build());

        let rule = QueryRouterRule {
            pattern: r"test".to_string(),
            substitution: "{0}".to_string(),
            target_field: "nonexistent".to_string(),
            mode: RoutingMode::Exclusive,
        };
        let router = QueryFieldRouter::from_rules(&[rule]).unwrap();

        let qp = QueryLanguageParser::with_router(
            schema,
            vec![],
            Arc::new(TokenizerRegistry::default()),
            router,
        );

        // Routing matched, but the target field is absent from the schema.
        let result = qp.parse("test");
        assert!(result.is_err());
        assert!(result.err().unwrap().contains("Unknown target field"));
    }
}