1use pest::Parser;
11use pest_derive::Parser;
12use std::sync::Arc;
13
14use super::query_field_router::{QueryFieldRouter, RoutingMode};
15use super::schema::{Field, Schema};
16use crate::query::{BooleanQuery, Query, TermQuery};
17use crate::tokenizer::{BoxedTokenizer, TokenizerRegistry};
18
/// Pest-generated parser for the query DSL.
///
/// The `Rule` enum used throughout this module is derived from the
/// grammar file `dsl/ql/ql.pest` at compile time.
#[derive(Parser)]
#[grammar = "dsl/ql/ql.pest"]
struct QueryParser;
22
/// Intermediate AST produced by the grammar, before field names are
/// resolved against the schema.
#[derive(Debug, Clone)]
pub enum ParsedQuery {
    /// A single term, optionally field-qualified (`field:term`).
    Term {
        field: Option<String>,
        term: String,
    },
    /// A quoted phrase, optionally field-qualified (`field:"a b"`).
    Phrase {
        field: Option<String>,
        phrase: String,
    },
    /// Approximate-nearest-neighbor search on a dense vector field:
    /// `field:ann([v1, v2, ...], nprobe=N, rerank=M)`.
    Ann {
        field: String,
        vector: Vec<f32>,
        nprobe: usize,
        rerank: usize,
    },
    /// Sparse-vector search: `field:sparse({index: weight, ...})`.
    Sparse {
        field: String,
        vector: Vec<(u32, f32)>,
    },
    /// Conjunction (AND) of sub-queries.
    And(Vec<ParsedQuery>),
    /// Disjunction (OR) of sub-queries.
    Or(Vec<ParsedQuery>),
    /// Negation (NOT) of a sub-query.
    Not(Box<ParsedQuery>),
}
50
/// Parses the textual query DSL into executable [`Query`] objects,
/// resolving field names against the schema and tokenizing terms with
/// the per-field tokenizers.
pub struct QueryLanguageParser {
    // Schema used to resolve field names to field ids.
    schema: Arc<Schema>,
    // Fields searched when a term carries no explicit field prefix.
    default_fields: Vec<Field>,
    // Registry of named tokenizers; the per-field choice comes from the schema.
    tokenizers: Arc<TokenizerRegistry>,
    // Optional pattern-based router that can redirect a whole query to a
    // specific field before normal parsing (see `parse`).
    field_router: Option<QueryFieldRouter>,
}
59
60impl QueryLanguageParser {
61 pub fn new(
62 schema: Arc<Schema>,
63 default_fields: Vec<Field>,
64 tokenizers: Arc<TokenizerRegistry>,
65 ) -> Self {
66 Self {
67 schema,
68 default_fields,
69 tokenizers,
70 field_router: None,
71 }
72 }
73
74 pub fn with_router(
76 schema: Arc<Schema>,
77 default_fields: Vec<Field>,
78 tokenizers: Arc<TokenizerRegistry>,
79 router: QueryFieldRouter,
80 ) -> Self {
81 Self {
82 schema,
83 default_fields,
84 tokenizers,
85 field_router: Some(router),
86 }
87 }
88
89 pub fn set_router(&mut self, router: QueryFieldRouter) {
91 self.field_router = Some(router);
92 }
93
94 pub fn router(&self) -> Option<&QueryFieldRouter> {
96 self.field_router.as_ref()
97 }
98
99 pub fn parse(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
109 let query_str = query_str.trim();
110 if query_str.is_empty() {
111 return Err("Empty query".to_string());
112 }
113
114 if let Some(router) = &self.field_router
116 && let Some(routed) = router.route(query_str)
117 {
118 return self.build_routed_query(
119 &routed.query,
120 &routed.target_field,
121 routed.mode,
122 query_str,
123 );
124 }
125
126 self.parse_normal(query_str)
128 }
129
130 fn build_routed_query(
132 &self,
133 routed_query: &str,
134 target_field: &str,
135 mode: RoutingMode,
136 original_query: &str,
137 ) -> Result<Box<dyn Query>, String> {
138 let _field_id = self
140 .schema
141 .get_field(target_field)
142 .ok_or_else(|| format!("Unknown target field: {}", target_field))?;
143
144 let target_query = self.build_term_query(Some(target_field), routed_query)?;
146
147 match mode {
148 RoutingMode::Exclusive => {
149 Ok(target_query)
151 }
152 RoutingMode::Additional => {
153 let mut bool_query = BooleanQuery::new();
155 bool_query = bool_query.should(target_query);
156
157 if let Ok(default_query) = self.parse_normal(original_query) {
159 bool_query = bool_query.should(default_query);
160 }
161
162 Ok(Box::new(bool_query))
163 }
164 }
165 }
166
167 fn parse_normal(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
169 match self.parse_query_string(query_str) {
171 Ok(parsed) => self.build_query(&parsed),
172 Err(_) => {
173 self.parse_plain_text(query_str)
176 }
177 }
178 }
179
180 fn parse_plain_text(&self, text: &str) -> Result<Box<dyn Query>, String> {
182 if self.default_fields.is_empty() {
183 return Err("No default fields configured".to_string());
184 }
185
186 let tokenizer = self.get_tokenizer(self.default_fields[0]);
187 let tokens: Vec<String> = tokenizer
188 .tokenize(text)
189 .into_iter()
190 .map(|t| t.text.to_lowercase())
191 .collect();
192
193 if tokens.is_empty() {
194 return Err("No tokens in query".to_string());
195 }
196
197 let mut bool_query = BooleanQuery::new();
198 for token in &tokens {
199 for &field_id in &self.default_fields {
200 bool_query = bool_query.should(TermQuery::text(field_id, token));
201 }
202 }
203 Ok(Box::new(bool_query))
204 }
205
206 fn parse_query_string(&self, query_str: &str) -> Result<ParsedQuery, String> {
207 let pairs = QueryParser::parse(Rule::query, query_str)
208 .map_err(|e| format!("Parse error: {}", e))?;
209
210 let query_pair = pairs.into_iter().next().ok_or("No query found")?;
211
212 self.parse_or_expr(query_pair.into_inner().next().unwrap())
214 }
215
216 fn parse_or_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
217 let mut inner = pair.into_inner();
218 let first = self.parse_and_expr(inner.next().unwrap())?;
219
220 let rest: Vec<ParsedQuery> = inner
221 .filter(|p| p.as_rule() == Rule::and_expr)
222 .map(|p| self.parse_and_expr(p))
223 .collect::<Result<Vec<_>, _>>()?;
224
225 if rest.is_empty() {
226 Ok(first)
227 } else {
228 let mut all = vec![first];
229 all.extend(rest);
230 Ok(ParsedQuery::Or(all))
231 }
232 }
233
234 fn parse_and_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
235 let mut inner = pair.into_inner();
236 let first = self.parse_primary(inner.next().unwrap())?;
237
238 let rest: Vec<ParsedQuery> = inner
239 .filter(|p| p.as_rule() == Rule::primary)
240 .map(|p| self.parse_primary(p))
241 .collect::<Result<Vec<_>, _>>()?;
242
243 if rest.is_empty() {
244 Ok(first)
245 } else {
246 let mut all = vec![first];
247 all.extend(rest);
248 Ok(ParsedQuery::And(all))
249 }
250 }
251
252 fn parse_primary(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
253 let mut negated = false;
254 let mut inner_query = None;
255
256 for inner in pair.into_inner() {
257 match inner.as_rule() {
258 Rule::not_op => negated = true,
259 Rule::group => {
260 let or_expr = inner.into_inner().next().unwrap();
261 inner_query = Some(self.parse_or_expr(or_expr)?);
262 }
263 Rule::ann_query => {
264 inner_query = Some(self.parse_ann_query(inner)?);
265 }
266 Rule::sparse_query => {
267 inner_query = Some(self.parse_sparse_query(inner)?);
268 }
269 Rule::phrase_query => {
270 inner_query = Some(self.parse_phrase_query(inner)?);
271 }
272 Rule::term_query => {
273 inner_query = Some(self.parse_term_query(inner)?);
274 }
275 _ => {}
276 }
277 }
278
279 let query = inner_query.ok_or("No query in primary")?;
280
281 if negated {
282 Ok(ParsedQuery::Not(Box::new(query)))
283 } else {
284 Ok(query)
285 }
286 }
287
288 fn parse_term_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
289 let mut field = None;
290 let mut term = String::new();
291
292 for inner in pair.into_inner() {
293 match inner.as_rule() {
294 Rule::field_spec => {
295 field = Some(inner.into_inner().next().unwrap().as_str().to_string());
296 }
297 Rule::term => {
298 term = inner.as_str().to_string();
299 }
300 _ => {}
301 }
302 }
303
304 Ok(ParsedQuery::Term { field, term })
305 }
306
307 fn parse_phrase_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
308 let mut field = None;
309 let mut phrase = String::new();
310
311 for inner in pair.into_inner() {
312 match inner.as_rule() {
313 Rule::field_spec => {
314 field = Some(inner.into_inner().next().unwrap().as_str().to_string());
315 }
316 Rule::quoted_string => {
317 let s = inner.as_str();
318 phrase = s[1..s.len() - 1].to_string();
319 }
320 _ => {}
321 }
322 }
323
324 Ok(ParsedQuery::Phrase { field, phrase })
325 }
326
327 fn parse_ann_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
329 let mut field = String::new();
330 let mut vector = Vec::new();
331 let mut nprobe = 32usize;
332 let mut rerank = 3usize;
333
334 for inner in pair.into_inner() {
335 match inner.as_rule() {
336 Rule::field_spec => {
337 field = inner.into_inner().next().unwrap().as_str().to_string();
338 }
339 Rule::vector_array => {
340 for num in inner.into_inner() {
341 if num.as_rule() == Rule::number
342 && let Ok(v) = num.as_str().parse::<f32>()
343 {
344 vector.push(v);
345 }
346 }
347 }
348 Rule::ann_params => {
349 for param in inner.into_inner() {
350 if param.as_rule() == Rule::ann_param {
351 let param_str = param.as_str();
353 if let Some(eq_pos) = param_str.find('=') {
354 let name = ¶m_str[..eq_pos];
355 let value = ¶m_str[eq_pos + 1..];
356 let val: usize = value.parse().unwrap_or(0);
357 match name {
358 "nprobe" => nprobe = val,
359 "rerank" => rerank = val,
360 _ => {}
361 }
362 }
363 }
364 }
365 }
366 _ => {}
367 }
368 }
369
370 Ok(ParsedQuery::Ann {
371 field,
372 vector,
373 nprobe,
374 rerank,
375 })
376 }
377
378 fn parse_sparse_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
380 let mut field = String::new();
381 let mut vector = Vec::new();
382
383 for inner in pair.into_inner() {
384 match inner.as_rule() {
385 Rule::field_spec => {
386 field = inner.into_inner().next().unwrap().as_str().to_string();
387 }
388 Rule::sparse_map => {
389 for entry in inner.into_inner() {
390 if entry.as_rule() == Rule::sparse_entry {
391 let mut entry_inner = entry.into_inner();
392 if let (Some(idx), Some(weight)) =
393 (entry_inner.next(), entry_inner.next())
394 && let (Ok(i), Ok(w)) =
395 (idx.as_str().parse::<u32>(), weight.as_str().parse::<f32>())
396 {
397 vector.push((i, w));
398 }
399 }
400 }
401 }
402 _ => {}
403 }
404 }
405
406 Ok(ParsedQuery::Sparse { field, vector })
407 }
408
409 fn build_query(&self, parsed: &ParsedQuery) -> Result<Box<dyn Query>, String> {
410 use crate::query::{DenseVectorQuery, SparseVectorQuery};
411
412 match parsed {
413 ParsedQuery::Term { field, term } => self.build_term_query(field.as_deref(), term),
414 ParsedQuery::Phrase { field, phrase } => {
415 self.build_phrase_query(field.as_deref(), phrase)
416 }
417 ParsedQuery::Ann {
418 field,
419 vector,
420 nprobe,
421 rerank,
422 } => {
423 let field_id = self
424 .schema
425 .get_field(field)
426 .ok_or_else(|| format!("Unknown field: {}", field))?;
427 let query = DenseVectorQuery::new(field_id, vector.clone())
428 .with_nprobe(*nprobe)
429 .with_rerank_factor(*rerank);
430 Ok(Box::new(query))
431 }
432 ParsedQuery::Sparse { field, vector } => {
433 let field_id = self
434 .schema
435 .get_field(field)
436 .ok_or_else(|| format!("Unknown field: {}", field))?;
437 let query = SparseVectorQuery::new(field_id, vector.clone());
438 Ok(Box::new(query))
439 }
440 ParsedQuery::And(queries) => {
441 let mut bool_query = BooleanQuery::new();
442 for q in queries {
443 bool_query = bool_query.must(self.build_query(q)?);
444 }
445 Ok(Box::new(bool_query))
446 }
447 ParsedQuery::Or(queries) => {
448 let mut bool_query = BooleanQuery::new();
449 for q in queries {
450 bool_query = bool_query.should(self.build_query(q)?);
451 }
452 Ok(Box::new(bool_query))
453 }
454 ParsedQuery::Not(inner) => {
455 let mut bool_query = BooleanQuery::new();
457 bool_query = bool_query.must_not(self.build_query(inner)?);
458 Ok(Box::new(bool_query))
459 }
460 }
461 }
462
463 fn build_term_query(&self, field: Option<&str>, term: &str) -> Result<Box<dyn Query>, String> {
464 if let Some(field_name) = field {
465 let field_id = self
467 .schema
468 .get_field(field_name)
469 .ok_or_else(|| format!("Unknown field: {}", field_name))?;
470 let tokenizer = self.get_tokenizer(field_id);
471 let tokens: Vec<String> = tokenizer
472 .tokenize(term)
473 .into_iter()
474 .map(|t| t.text.to_lowercase())
475 .collect();
476
477 if tokens.is_empty() {
478 return Err("No tokens in term".to_string());
479 }
480
481 if tokens.len() == 1 {
482 Ok(Box::new(TermQuery::text(field_id, &tokens[0])))
483 } else {
484 let mut bool_query = BooleanQuery::new();
486 for token in &tokens {
487 bool_query = bool_query.must(TermQuery::text(field_id, token));
488 }
489 Ok(Box::new(bool_query))
490 }
491 } else if !self.default_fields.is_empty() {
492 let tokenizer = self.get_tokenizer(self.default_fields[0]);
494 let tokens: Vec<String> = tokenizer
495 .tokenize(term)
496 .into_iter()
497 .map(|t| t.text.to_lowercase())
498 .collect();
499
500 if tokens.is_empty() {
501 return Err("No tokens in term".to_string());
502 }
503
504 let mut bool_query = BooleanQuery::new();
506 for token in &tokens {
507 for &field_id in &self.default_fields {
508 bool_query = bool_query.should(TermQuery::text(field_id, token));
509 }
510 }
511 Ok(Box::new(bool_query))
512 } else {
513 Err("No field specified and no default fields configured".to_string())
514 }
515 }
516
    /// Builds a phrase query.
    ///
    /// NOTE(review): true positional phrase matching is not implemented
    /// here — the phrase is approximated as an AND of its tokens.
    fn build_phrase_query(
        &self,
        field: Option<&str>,
        phrase: &str,
    ) -> Result<Box<dyn Query>, String> {
        // Resolve the target field: explicit field, else the first
        // default field, else error out.
        let field_id = if let Some(field_name) = field {
            self.schema
                .get_field(field_name)
                .ok_or_else(|| format!("Unknown field: {}", field_name))?
        } else if !self.default_fields.is_empty() {
            self.default_fields[0]
        } else {
            return Err("No field specified and no default fields configured".to_string());
        };

        let tokenizer = self.get_tokenizer(field_id);
        let tokens: Vec<String> = tokenizer
            .tokenize(phrase)
            .into_iter()
            .map(|t| t.text.to_lowercase())
            .collect();

        if tokens.is_empty() {
            return Err("No tokens in phrase".to_string());
        }

        // A single-token phrase degenerates to a plain term query on the
        // resolved field. Note this early return fires BEFORE the
        // multi-default-field branch below, so a one-token unfielded
        // phrase searches only the first default field.
        if tokens.len() == 1 {
            return Ok(Box::new(TermQuery::text(field_id, &tokens[0])));
        }

        // Conjunction of tokens on the single resolved field.
        let mut bool_query = BooleanQuery::new();
        for token in &tokens {
            bool_query = bool_query.must(TermQuery::text(field_id, token));
        }

        // When no field was given and several defaults exist, discard the
        // single-field conjunction above and instead OR a per-field
        // conjunction across every default field (each field re-tokenizes
        // the phrase with its own tokenizer).
        if field.is_none() && self.default_fields.len() > 1 {
            let mut outer = BooleanQuery::new();
            for &f in &self.default_fields {
                let tokenizer = self.get_tokenizer(f);
                let tokens: Vec<String> = tokenizer
                    .tokenize(phrase)
                    .into_iter()
                    .map(|t| t.text.to_lowercase())
                    .collect();

                // NOTE(review): if a field's tokenizer yields no tokens,
                // this pushes an empty conjunction into `outer` — confirm
                // BooleanQuery treats an empty clause as match-nothing.
                let mut field_query = BooleanQuery::new();
                for token in &tokens {
                    field_query = field_query.must(TermQuery::text(f, token));
                }
                outer = outer.should(field_query);
            }
            return Ok(Box::new(outer));
        }

        Ok(Box::new(bool_query))
    }
576
577 fn get_tokenizer(&self, field: Field) -> BoxedTokenizer {
578 let tokenizer_name = self
580 .schema
581 .get_field_entry(field)
582 .and_then(|entry| entry.tokenizer.as_deref())
583 .unwrap_or("default");
584
585 self.tokenizers
586 .get(tokenizer_name)
587 .unwrap_or_else(|| Box::new(crate::tokenizer::LowercaseTokenizer))
588 }
589}
590
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::SchemaBuilder;
    use crate::tokenizer::TokenizerRegistry;

    // Builds a two-text-field schema ("title", "body") with a default
    // tokenizer registry, shared by most tests below.
    fn setup() -> (Arc<Schema>, Vec<Field>, Arc<TokenizerRegistry>) {
        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let body = builder.add_text_field("body", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());
        (schema, vec![title, body], tokenizers)
    }

    // A bare term parses without error.
    #[test]
    fn test_simple_term() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("rust").unwrap();
    }

    // A field-qualified term (field:term) parses without error.
    #[test]
    fn test_field_term() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("title:rust").unwrap();
    }

    // Explicit AND between two terms parses without error.
    #[test]
    fn test_boolean_and() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("rust AND programming").unwrap();
    }

    // Multi-word free text parses without error.
    #[test]
    fn test_match_query() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("hello world").unwrap();
    }

    // A quoted phrase parses without error.
    #[test]
    fn test_phrase_query() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("\"hello world\"").unwrap();
    }

    // Explicit OR between two terms parses without error.
    #[test]
    fn test_boolean_or() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("rust OR python").unwrap();
    }

    // Parenthesized grouping combined with AND parses without error.
    #[test]
    fn test_complex_query() {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);

        let _query = parser.parse("(rust OR python) AND programming").unwrap();
    }

    // An exclusive routing rule redirects a matching DOI query to the
    // target field without consulting the default fields.
    #[test]
    fn test_router_exclusive_mode() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let _title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"^doi:(10\.\d{4,}/[^\s]+)$".to_string(),
            substitution: "doi://{1}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Exclusive,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![], tokenizers, router);

        let _query = parser.parse("doi:10.1234/test.123").unwrap();
    }

    // An additional routing rule ORs the routed clause with the default
    // parse of the original query.
    #[test]
    fn test_router_additional_mode() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"#(\d+)".to_string(),
            substitution: "{1}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Additional,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![title], tokenizers, router);

        let _query = parser.parse("#42").unwrap();
    }

    // A query that matches no routing rule falls through to normal parsing.
    #[test]
    fn test_router_no_match_falls_through() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"^doi:".to_string(),
            substitution: "{0}".to_string(),
            target_field: "uri".to_string(),
            mode: RoutingMode::Exclusive,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![title], tokenizers, router);

        let _query = parser.parse("rust programming").unwrap();
    }

    // A routing rule targeting a field missing from the schema surfaces
    // an "Unknown target field" error.
    #[test]
    fn test_router_invalid_target_field() {
        use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};

        let mut builder = SchemaBuilder::default();
        let _title = builder.add_text_field("title", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let router = QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: r"test".to_string(),
            substitution: "{0}".to_string(),
            target_field: "nonexistent".to_string(),
            mode: RoutingMode::Exclusive,
        }])
        .unwrap();

        let parser = QueryLanguageParser::with_router(schema, vec![], tokenizers, router);

        let result = parser.parse("test");
        assert!(result.is_err());
        let err = result.err().unwrap();
        assert!(err.contains("Unknown target field"));
    }

    // The ANN syntax lowers to ParsedQuery::Ann with the given vector
    // and nprobe, keeping the default rerank of 3.
    #[test]
    fn test_parse_ann_query() {
        let mut builder = SchemaBuilder::default();
        let embedding = builder.add_dense_vector_field("embedding", 128, true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let parser = QueryLanguageParser::new(schema, vec![embedding], tokenizers);

        let result = parser.parse_query_string("embedding:ann([1.0, 2.0, 3.0], nprobe=32)");
        assert!(result.is_ok(), "Failed to parse ANN query: {:?}", result);

        if let Ok(ParsedQuery::Ann {
            field,
            vector,
            nprobe,
            rerank,
        }) = result
        {
            assert_eq!(field, "embedding");
            assert_eq!(vector, vec![1.0, 2.0, 3.0]);
            assert_eq!(nprobe, 32);
            assert_eq!(rerank, 3); } else {
            panic!("Expected Ann query, got: {:?}", result);
        }
    }

    // The sparse syntax lowers to ParsedQuery::Sparse with the given
    // (index, weight) pairs in source order.
    #[test]
    fn test_parse_sparse_query() {
        let mut builder = SchemaBuilder::default();
        let sparse = builder.add_text_field("sparse", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());

        let parser = QueryLanguageParser::new(schema, vec![sparse], tokenizers);

        let result = parser.parse_query_string("sparse:sparse({1: 0.5, 5: 0.3})");
        assert!(result.is_ok(), "Failed to parse sparse query: {:?}", result);

        if let Ok(ParsedQuery::Sparse { field, vector }) = result {
            assert_eq!(field, "sparse");
            assert_eq!(vector, vec![(1, 0.5), (5, 0.3)]);
        } else {
            panic!("Expected Sparse query, got: {:?}", result);
        }
    }
}