1use pest::Parser;
11use pest_derive::Parser;
12use std::sync::Arc;
13
14use super::query_field_router::{QueryFieldRouter, RoutingMode};
15use super::schema::{Field, Schema};
16use crate::query::{BooleanQuery, Query, TermQuery};
17use crate::tokenizer::{BoxedTokenizer, TokenizerRegistry};
18
19#[derive(Parser)]
20#[grammar = "dsl/ql/ql.pest"]
21struct QueryParser;
22
/// Syntax tree produced from a query string before it is turned into an
/// executable query.
#[derive(Debug, Clone)]
pub enum ParsedQuery {
    /// A bare term, optionally restricted to a named field (`field:term`).
    Term {
        /// Field name as written in the query, or `None` when unqualified.
        field: Option<String>,
        /// Raw term text as it appeared in the query.
        term: String,
    },
    /// A quoted string, optionally restricted to a named field.
    Phrase {
        /// Field name as written in the query, or `None` when unqualified.
        field: Option<String>,
        /// Phrase text with the surrounding quote characters removed.
        phrase: String,
    },
    /// Every sub-query must match.
    And(Vec<ParsedQuery>),
    /// Sub-queries are alternatives.
    Or(Vec<ParsedQuery>),
    /// The wrapped sub-query must not match.
    Not(Box<ParsedQuery>),
}
38
39pub struct QueryLanguageParser {
41 schema: Arc<Schema>,
42 default_fields: Vec<Field>,
43 tokenizers: Arc<TokenizerRegistry>,
44 field_router: Option<QueryFieldRouter>,
46}
47
48impl QueryLanguageParser {
49 pub fn new(
50 schema: Arc<Schema>,
51 default_fields: Vec<Field>,
52 tokenizers: Arc<TokenizerRegistry>,
53 ) -> Self {
54 Self {
55 schema,
56 default_fields,
57 tokenizers,
58 field_router: None,
59 }
60 }
61
62 pub fn with_router(
64 schema: Arc<Schema>,
65 default_fields: Vec<Field>,
66 tokenizers: Arc<TokenizerRegistry>,
67 router: QueryFieldRouter,
68 ) -> Self {
69 Self {
70 schema,
71 default_fields,
72 tokenizers,
73 field_router: Some(router),
74 }
75 }
76
77 pub fn set_router(&mut self, router: QueryFieldRouter) {
79 self.field_router = Some(router);
80 }
81
82 pub fn router(&self) -> Option<&QueryFieldRouter> {
84 self.field_router.as_ref()
85 }
86
87 pub fn parse(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
97 let query_str = query_str.trim();
98 if query_str.is_empty() {
99 return Err("Empty query".to_string());
100 }
101
102 if let Some(router) = &self.field_router {
104 if let Some(routed) = router.route(query_str) {
105 return self.build_routed_query(&routed.query, &routed.target_field, routed.mode, query_str);
106 }
107 }
108
109 self.parse_normal(query_str)
111 }
112
113 fn build_routed_query(
115 &self,
116 routed_query: &str,
117 target_field: &str,
118 mode: RoutingMode,
119 original_query: &str,
120 ) -> Result<Box<dyn Query>, String> {
121 let _field_id = self
123 .schema
124 .get_field(target_field)
125 .ok_or_else(|| format!("Unknown target field: {}", target_field))?;
126
127 let target_query = self.build_term_query(Some(target_field), routed_query)?;
129
130 match mode {
131 RoutingMode::Exclusive => {
132 Ok(target_query)
134 }
135 RoutingMode::Additional => {
136 let mut bool_query = BooleanQuery::new();
138 bool_query = bool_query.should(target_query);
139
140 if let Ok(default_query) = self.parse_normal(original_query) {
142 bool_query = bool_query.should(default_query);
143 }
144
145 Ok(Box::new(bool_query))
146 }
147 }
148 }
149
150 fn parse_normal(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
152 match self.parse_query_string(query_str) {
154 Ok(parsed) => self.build_query(&parsed),
155 Err(_) => {
156 self.parse_plain_text(query_str)
159 }
160 }
161 }
162
163 fn parse_plain_text(&self, text: &str) -> Result<Box<dyn Query>, String> {
165 if self.default_fields.is_empty() {
166 return Err("No default fields configured".to_string());
167 }
168
169 let tokenizer = self.get_tokenizer(self.default_fields[0]);
170 let tokens: Vec<String> = tokenizer
171 .tokenize(text)
172 .into_iter()
173 .map(|t| t.text.to_lowercase())
174 .collect();
175
176 if tokens.is_empty() {
177 return Err("No tokens in query".to_string());
178 }
179
180 let mut bool_query = BooleanQuery::new();
181 for token in &tokens {
182 for &field_id in &self.default_fields {
183 bool_query = bool_query.should(TermQuery::text(field_id, token));
184 }
185 }
186 Ok(Box::new(bool_query))
187 }
188
189 fn parse_query_string(&self, query_str: &str) -> Result<ParsedQuery, String> {
190 let pairs = QueryParser::parse(Rule::query, query_str)
191 .map_err(|e| format!("Parse error: {}", e))?;
192
193 let query_pair = pairs.into_iter().next().ok_or("No query found")?;
194
195 self.parse_or_expr(query_pair.into_inner().next().unwrap())
197 }
198
199 fn parse_or_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
200 let mut inner = pair.into_inner();
201 let first = self.parse_and_expr(inner.next().unwrap())?;
202
203 let rest: Vec<ParsedQuery> = inner
204 .filter(|p| p.as_rule() == Rule::and_expr)
205 .map(|p| self.parse_and_expr(p))
206 .collect::<Result<Vec<_>, _>>()?;
207
208 if rest.is_empty() {
209 Ok(first)
210 } else {
211 let mut all = vec![first];
212 all.extend(rest);
213 Ok(ParsedQuery::Or(all))
214 }
215 }
216
217 fn parse_and_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
218 let mut inner = pair.into_inner();
219 let first = self.parse_primary(inner.next().unwrap())?;
220
221 let rest: Vec<ParsedQuery> = inner
222 .filter(|p| p.as_rule() == Rule::primary)
223 .map(|p| self.parse_primary(p))
224 .collect::<Result<Vec<_>, _>>()?;
225
226 if rest.is_empty() {
227 Ok(first)
228 } else {
229 let mut all = vec![first];
230 all.extend(rest);
231 Ok(ParsedQuery::And(all))
232 }
233 }
234
235 fn parse_primary(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
236 let mut negated = false;
237 let mut inner_query = None;
238
239 for inner in pair.into_inner() {
240 match inner.as_rule() {
241 Rule::not_op => negated = true,
242 Rule::group => {
243 let or_expr = inner.into_inner().next().unwrap();
244 inner_query = Some(self.parse_or_expr(or_expr)?);
245 }
246 Rule::phrase_query => {
247 inner_query = Some(self.parse_phrase_query(inner)?);
248 }
249 Rule::term_query => {
250 inner_query = Some(self.parse_term_query(inner)?);
251 }
252 _ => {}
253 }
254 }
255
256 let query = inner_query.ok_or("No query in primary")?;
257
258 if negated {
259 Ok(ParsedQuery::Not(Box::new(query)))
260 } else {
261 Ok(query)
262 }
263 }
264
265 fn parse_term_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
266 let mut field = None;
267 let mut term = String::new();
268
269 for inner in pair.into_inner() {
270 match inner.as_rule() {
271 Rule::field_spec => {
272 field = Some(inner.into_inner().next().unwrap().as_str().to_string());
273 }
274 Rule::term => {
275 term = inner.as_str().to_string();
276 }
277 _ => {}
278 }
279 }
280
281 Ok(ParsedQuery::Term { field, term })
282 }
283
284 fn parse_phrase_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
285 let mut field = None;
286 let mut phrase = String::new();
287
288 for inner in pair.into_inner() {
289 match inner.as_rule() {
290 Rule::field_spec => {
291 field = Some(inner.into_inner().next().unwrap().as_str().to_string());
292 }
293 Rule::quoted_string => {
294 let s = inner.as_str();
295 phrase = s[1..s.len() - 1].to_string();
296 }
297 _ => {}
298 }
299 }
300
301 Ok(ParsedQuery::Phrase { field, phrase })
302 }
303
304 fn build_query(&self, parsed: &ParsedQuery) -> Result<Box<dyn Query>, String> {
305 match parsed {
306 ParsedQuery::Term { field, term } => self.build_term_query(field.as_deref(), term),
307 ParsedQuery::Phrase { field, phrase } => {
308 self.build_phrase_query(field.as_deref(), phrase)
309 }
310 ParsedQuery::And(queries) => {
311 let mut bool_query = BooleanQuery::new();
312 for q in queries {
313 bool_query = bool_query.must(self.build_query(q)?);
314 }
315 Ok(Box::new(bool_query))
316 }
317 ParsedQuery::Or(queries) => {
318 let mut bool_query = BooleanQuery::new();
319 for q in queries {
320 bool_query = bool_query.should(self.build_query(q)?);
321 }
322 Ok(Box::new(bool_query))
323 }
324 ParsedQuery::Not(inner) => {
325 let mut bool_query = BooleanQuery::new();
327 bool_query = bool_query.must_not(self.build_query(inner)?);
328 Ok(Box::new(bool_query))
329 }
330 }
331 }
332
333 fn build_term_query(&self, field: Option<&str>, term: &str) -> Result<Box<dyn Query>, String> {
334 if let Some(field_name) = field {
335 let field_id = self
337 .schema
338 .get_field(field_name)
339 .ok_or_else(|| format!("Unknown field: {}", field_name))?;
340 let tokenizer = self.get_tokenizer(field_id);
341 let tokens: Vec<String> = tokenizer
342 .tokenize(term)
343 .into_iter()
344 .map(|t| t.text.to_lowercase())
345 .collect();
346
347 if tokens.is_empty() {
348 return Err("No tokens in term".to_string());
349 }
350
351 if tokens.len() == 1 {
352 Ok(Box::new(TermQuery::text(field_id, &tokens[0])))
353 } else {
354 let mut bool_query = BooleanQuery::new();
356 for token in &tokens {
357 bool_query = bool_query.must(TermQuery::text(field_id, token));
358 }
359 Ok(Box::new(bool_query))
360 }
361 } else if !self.default_fields.is_empty() {
362 let tokenizer = self.get_tokenizer(self.default_fields[0]);
364 let tokens: Vec<String> = tokenizer
365 .tokenize(term)
366 .into_iter()
367 .map(|t| t.text.to_lowercase())
368 .collect();
369
370 if tokens.is_empty() {
371 return Err("No tokens in term".to_string());
372 }
373
374 let mut bool_query = BooleanQuery::new();
376 for token in &tokens {
377 for &field_id in &self.default_fields {
378 bool_query = bool_query.should(TermQuery::text(field_id, token));
379 }
380 }
381 Ok(Box::new(bool_query))
382 } else {
383 Err("No field specified and no default fields configured".to_string())
384 }
385 }
386
387 fn build_phrase_query(
388 &self,
389 field: Option<&str>,
390 phrase: &str,
391 ) -> Result<Box<dyn Query>, String> {
392 let field_id = if let Some(field_name) = field {
394 self.schema
395 .get_field(field_name)
396 .ok_or_else(|| format!("Unknown field: {}", field_name))?
397 } else if !self.default_fields.is_empty() {
398 self.default_fields[0]
399 } else {
400 return Err("No field specified and no default fields configured".to_string());
401 };
402
403 let tokenizer = self.get_tokenizer(field_id);
404 let tokens: Vec<String> = tokenizer
405 .tokenize(phrase)
406 .into_iter()
407 .map(|t| t.text.to_lowercase())
408 .collect();
409
410 if tokens.is_empty() {
411 return Err("No tokens in phrase".to_string());
412 }
413
414 if tokens.len() == 1 {
415 return Ok(Box::new(TermQuery::text(field_id, &tokens[0])));
416 }
417
418 let mut bool_query = BooleanQuery::new();
420 for token in &tokens {
421 bool_query = bool_query.must(TermQuery::text(field_id, token));
422 }
423
424 if field.is_none() && self.default_fields.len() > 1 {
426 let mut outer = BooleanQuery::new();
427 for &f in &self.default_fields {
428 let tokenizer = self.get_tokenizer(f);
429 let tokens: Vec<String> = tokenizer
430 .tokenize(phrase)
431 .into_iter()
432 .map(|t| t.text.to_lowercase())
433 .collect();
434
435 let mut field_query = BooleanQuery::new();
436 for token in &tokens {
437 field_query = field_query.must(TermQuery::text(f, token));
438 }
439 outer = outer.should(field_query);
440 }
441 return Ok(Box::new(outer));
442 }
443
444 Ok(Box::new(bool_query))
445 }
446
447 fn get_tokenizer(&self, field: Field) -> BoxedTokenizer {
448 let tokenizer_name = self
450 .schema
451 .get_field_entry(field)
452 .and_then(|entry| entry.tokenizer.as_deref())
453 .unwrap_or("default");
454
455 self.tokenizers
456 .get(tokenizer_name)
457 .unwrap_or_else(|| Box::new(crate::tokenizer::LowercaseTokenizer))
458 }
459}
460
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};
    use crate::dsl::SchemaBuilder;
    use crate::tokenizer::TokenizerRegistry;

    /// Schema with `title` and `body` text fields, both used as default
    /// fields, plus a default tokenizer registry.
    fn setup() -> (Arc<Schema>, Vec<Field>, Arc<TokenizerRegistry>) {
        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let body = builder.add_text_field("body", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());
        (schema, vec![title, body], tokenizers)
    }

    /// Router-less parser over the `setup()` schema.
    fn default_parser() -> QueryLanguageParser {
        let (schema, default_fields, tokenizers) = setup();
        QueryLanguageParser::new(schema, default_fields, tokenizers)
    }

    /// Shorthand for constructing a single routing rule.
    fn router_rule(
        pattern: &str,
        substitution: &str,
        target_field: &str,
        mode: RoutingMode,
    ) -> QueryRouterRule {
        QueryRouterRule {
            pattern: pattern.to_string(),
            substitution: substitution.to_string(),
            target_field: target_field.to_string(),
            mode,
        }
    }

    /// Parser with a one-rule router over a schema made of the text fields
    /// `fields` (added in order); the entries of `fields` that also appear
    /// in `default_fields` become the parser's default fields.
    fn routed_parser(
        fields: &[&'static str],
        default_fields: &[&'static str],
        rule: QueryRouterRule,
    ) -> QueryLanguageParser {
        let mut builder = SchemaBuilder::default();
        let mut defaults = Vec::new();
        for &name in fields {
            let field = builder.add_text_field(name, true, true);
            if default_fields.contains(&name) {
                defaults.push(field);
            }
        }
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());
        let router = QueryFieldRouter::from_rules(&[rule]).unwrap();

        QueryLanguageParser::with_router(schema, defaults, tokenizers, router)
    }

    #[test]
    fn test_empty_query_is_rejected() {
        let parser = default_parser();

        assert_eq!(parser.parse("   ").err().as_deref(), Some("Empty query"));
    }

    #[test]
    fn test_simple_term() {
        let _query = default_parser().parse("rust").unwrap();
    }

    #[test]
    fn test_field_term() {
        let _query = default_parser().parse("title:rust").unwrap();
    }

    #[test]
    fn test_boolean_and() {
        let _query = default_parser().parse("rust AND programming").unwrap();
    }

    #[test]
    fn test_match_query() {
        let _query = default_parser().parse("hello world").unwrap();
    }

    #[test]
    fn test_phrase_query() {
        let _query = default_parser().parse("\"hello world\"").unwrap();
    }

    #[test]
    fn test_boolean_or() {
        let _query = default_parser().parse("rust OR python").unwrap();
    }

    #[test]
    fn test_complex_query() {
        let _query = default_parser()
            .parse("(rust OR python) AND programming")
            .unwrap();
    }

    #[test]
    fn test_router_exclusive_mode() {
        let parser = routed_parser(
            &["title", "uri"],
            &[],
            router_rule(
                r"^doi:(10\.\d{4,}/[^\s]+)$",
                "doi://{1}",
                "uri",
                RoutingMode::Exclusive,
            ),
        );

        let _query = parser.parse("doi:10.1234/test.123").unwrap();
    }

    #[test]
    fn test_router_additional_mode() {
        let parser = routed_parser(
            &["title", "uri"],
            &["title"],
            router_rule(r"#(\d+)", "{1}", "uri", RoutingMode::Additional),
        );

        let _query = parser.parse("#42").unwrap();
    }

    #[test]
    fn test_router_no_match_falls_through() {
        let parser = routed_parser(
            &["title", "uri"],
            &["title"],
            router_rule(r"^doi:", "{0}", "uri", RoutingMode::Exclusive),
        );

        let _query = parser.parse("rust programming").unwrap();
    }

    #[test]
    fn test_router_invalid_target_field() {
        let parser = routed_parser(
            &["title"],
            &[],
            router_rule(r"test", "{0}", "nonexistent", RoutingMode::Exclusive),
        );

        let err = parser
            .parse("test")
            .err()
            .expect("routing to an unknown field must fail");
        assert!(err.contains("Unknown target field"));
    }
}