1use pest::Parser;
11use pest_derive::Parser;
12use std::sync::Arc;
13
14use super::query_field_router::{QueryFieldRouter, RoutingMode};
15use super::schema::{Field, Schema};
16use crate::query::{BooleanQuery, Query, TermQuery};
17use crate::tokenizer::{BoxedTokenizer, TokenizerRegistry};
18
/// Pest parser for the query language.
///
/// The `Parser` derive builds the parser from the grammar file named in the
/// `grammar` attribute; the rest of this file walks the resulting parse tree
/// through the `Rule` values it yields.
#[derive(Parser)]
#[grammar = "dsl/ql/ql.pest"]
struct QueryParser;
22
/// Syntax tree produced from a query string, before it is turned into
/// executable `Query` objects.
#[derive(Debug, Clone)]
pub enum ParsedQuery {
    /// A bare term, optionally scoped to a named field (`field` is `None`
    /// when the query did not name one).
    Term {
        field: Option<String>,
        term: String,
    },
    /// A quoted phrase, optionally scoped to a named field. `phrase` holds
    /// the text without its first and last character (the delimiters).
    Phrase {
        field: Option<String>,
        phrase: String,
    },
    /// All sub-queries must match.
    And(Vec<ParsedQuery>),
    /// Any sub-query may match.
    Or(Vec<ParsedQuery>),
    /// The sub-query must not match.
    Not(Box<ParsedQuery>),
}
38
/// Parses query strings into executable `Query` objects.
pub struct QueryLanguageParser {
    /// Schema used to resolve field names and per-field tokenizer settings.
    schema: Arc<Schema>,
    /// Fields searched when a term or phrase does not name a field.
    default_fields: Vec<Field>,
    /// Registry the per-field tokenizers are looked up in by name.
    tokenizers: Arc<TokenizerRegistry>,
    /// Optional router consulted by `parse` before normal parsing.
    field_router: Option<QueryFieldRouter>,
}
47
48impl QueryLanguageParser {
/// Creates a parser without a field router.
///
/// `default_fields` are the fields searched when a query term does not name
/// a field explicitly.
pub fn new(
    schema: Arc<Schema>,
    default_fields: Vec<Field>,
    tokenizers: Arc<TokenizerRegistry>,
) -> Self {
    Self {
        schema,
        default_fields,
        tokenizers,
        field_router: None,
    }
}
61
62 pub fn with_router(
64 schema: Arc<Schema>,
65 default_fields: Vec<Field>,
66 tokenizers: Arc<TokenizerRegistry>,
67 router: QueryFieldRouter,
68 ) -> Self {
69 Self {
70 schema,
71 default_fields,
72 tokenizers,
73 field_router: Some(router),
74 }
75 }
76
/// Installs `router`, replacing any router that was configured before.
pub fn set_router(&mut self, router: QueryFieldRouter) {
    self.field_router = Some(router);
}
81
/// Returns the configured field router, or `None` when none is set.
pub fn router(&self) -> Option<&QueryFieldRouter> {
    self.field_router.as_ref()
}
86
87 pub fn parse(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
97 let query_str = query_str.trim();
98 if query_str.is_empty() {
99 return Err("Empty query".to_string());
100 }
101
102 if let Some(router) = &self.field_router
104 && let Some(routed) = router.route(query_str)
105 {
106 return self.build_routed_query(
107 &routed.query,
108 &routed.target_field,
109 routed.mode,
110 query_str,
111 );
112 }
113
114 self.parse_normal(query_str)
116 }
117
118 fn build_routed_query(
120 &self,
121 routed_query: &str,
122 target_field: &str,
123 mode: RoutingMode,
124 original_query: &str,
125 ) -> Result<Box<dyn Query>, String> {
126 let _field_id = self
128 .schema
129 .get_field(target_field)
130 .ok_or_else(|| format!("Unknown target field: {}", target_field))?;
131
132 let target_query = self.build_term_query(Some(target_field), routed_query)?;
134
135 match mode {
136 RoutingMode::Exclusive => {
137 Ok(target_query)
139 }
140 RoutingMode::Additional => {
141 let mut bool_query = BooleanQuery::new();
143 bool_query = bool_query.should(target_query);
144
145 if let Ok(default_query) = self.parse_normal(original_query) {
147 bool_query = bool_query.should(default_query);
148 }
149
150 Ok(Box::new(bool_query))
151 }
152 }
153 }
154
155 fn parse_normal(&self, query_str: &str) -> Result<Box<dyn Query>, String> {
157 match self.parse_query_string(query_str) {
159 Ok(parsed) => self.build_query(&parsed),
160 Err(_) => {
161 self.parse_plain_text(query_str)
164 }
165 }
166 }
167
168 fn parse_plain_text(&self, text: &str) -> Result<Box<dyn Query>, String> {
170 if self.default_fields.is_empty() {
171 return Err("No default fields configured".to_string());
172 }
173
174 let tokenizer = self.get_tokenizer(self.default_fields[0]);
175 let tokens: Vec<String> = tokenizer
176 .tokenize(text)
177 .into_iter()
178 .map(|t| t.text.to_lowercase())
179 .collect();
180
181 if tokens.is_empty() {
182 return Err("No tokens in query".to_string());
183 }
184
185 let mut bool_query = BooleanQuery::new();
186 for token in &tokens {
187 for &field_id in &self.default_fields {
188 bool_query = bool_query.should(TermQuery::text(field_id, token));
189 }
190 }
191 Ok(Box::new(bool_query))
192 }
193
194 fn parse_query_string(&self, query_str: &str) -> Result<ParsedQuery, String> {
195 let pairs = QueryParser::parse(Rule::query, query_str)
196 .map_err(|e| format!("Parse error: {}", e))?;
197
198 let query_pair = pairs.into_iter().next().ok_or("No query found")?;
199
200 self.parse_or_expr(query_pair.into_inner().next().unwrap())
202 }
203
204 fn parse_or_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
205 let mut inner = pair.into_inner();
206 let first = self.parse_and_expr(inner.next().unwrap())?;
207
208 let rest: Vec<ParsedQuery> = inner
209 .filter(|p| p.as_rule() == Rule::and_expr)
210 .map(|p| self.parse_and_expr(p))
211 .collect::<Result<Vec<_>, _>>()?;
212
213 if rest.is_empty() {
214 Ok(first)
215 } else {
216 let mut all = vec![first];
217 all.extend(rest);
218 Ok(ParsedQuery::Or(all))
219 }
220 }
221
222 fn parse_and_expr(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
223 let mut inner = pair.into_inner();
224 let first = self.parse_primary(inner.next().unwrap())?;
225
226 let rest: Vec<ParsedQuery> = inner
227 .filter(|p| p.as_rule() == Rule::primary)
228 .map(|p| self.parse_primary(p))
229 .collect::<Result<Vec<_>, _>>()?;
230
231 if rest.is_empty() {
232 Ok(first)
233 } else {
234 let mut all = vec![first];
235 all.extend(rest);
236 Ok(ParsedQuery::And(all))
237 }
238 }
239
240 fn parse_primary(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
241 let mut negated = false;
242 let mut inner_query = None;
243
244 for inner in pair.into_inner() {
245 match inner.as_rule() {
246 Rule::not_op => negated = true,
247 Rule::group => {
248 let or_expr = inner.into_inner().next().unwrap();
249 inner_query = Some(self.parse_or_expr(or_expr)?);
250 }
251 Rule::phrase_query => {
252 inner_query = Some(self.parse_phrase_query(inner)?);
253 }
254 Rule::term_query => {
255 inner_query = Some(self.parse_term_query(inner)?);
256 }
257 _ => {}
258 }
259 }
260
261 let query = inner_query.ok_or("No query in primary")?;
262
263 if negated {
264 Ok(ParsedQuery::Not(Box::new(query)))
265 } else {
266 Ok(query)
267 }
268 }
269
270 fn parse_term_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
271 let mut field = None;
272 let mut term = String::new();
273
274 for inner in pair.into_inner() {
275 match inner.as_rule() {
276 Rule::field_spec => {
277 field = Some(inner.into_inner().next().unwrap().as_str().to_string());
278 }
279 Rule::term => {
280 term = inner.as_str().to_string();
281 }
282 _ => {}
283 }
284 }
285
286 Ok(ParsedQuery::Term { field, term })
287 }
288
289 fn parse_phrase_query(&self, pair: pest::iterators::Pair<Rule>) -> Result<ParsedQuery, String> {
290 let mut field = None;
291 let mut phrase = String::new();
292
293 for inner in pair.into_inner() {
294 match inner.as_rule() {
295 Rule::field_spec => {
296 field = Some(inner.into_inner().next().unwrap().as_str().to_string());
297 }
298 Rule::quoted_string => {
299 let s = inner.as_str();
300 phrase = s[1..s.len() - 1].to_string();
301 }
302 _ => {}
303 }
304 }
305
306 Ok(ParsedQuery::Phrase { field, phrase })
307 }
308
309 fn build_query(&self, parsed: &ParsedQuery) -> Result<Box<dyn Query>, String> {
310 match parsed {
311 ParsedQuery::Term { field, term } => self.build_term_query(field.as_deref(), term),
312 ParsedQuery::Phrase { field, phrase } => {
313 self.build_phrase_query(field.as_deref(), phrase)
314 }
315 ParsedQuery::And(queries) => {
316 let mut bool_query = BooleanQuery::new();
317 for q in queries {
318 bool_query = bool_query.must(self.build_query(q)?);
319 }
320 Ok(Box::new(bool_query))
321 }
322 ParsedQuery::Or(queries) => {
323 let mut bool_query = BooleanQuery::new();
324 for q in queries {
325 bool_query = bool_query.should(self.build_query(q)?);
326 }
327 Ok(Box::new(bool_query))
328 }
329 ParsedQuery::Not(inner) => {
330 let mut bool_query = BooleanQuery::new();
332 bool_query = bool_query.must_not(self.build_query(inner)?);
333 Ok(Box::new(bool_query))
334 }
335 }
336 }
337
338 fn build_term_query(&self, field: Option<&str>, term: &str) -> Result<Box<dyn Query>, String> {
339 if let Some(field_name) = field {
340 let field_id = self
342 .schema
343 .get_field(field_name)
344 .ok_or_else(|| format!("Unknown field: {}", field_name))?;
345 let tokenizer = self.get_tokenizer(field_id);
346 let tokens: Vec<String> = tokenizer
347 .tokenize(term)
348 .into_iter()
349 .map(|t| t.text.to_lowercase())
350 .collect();
351
352 if tokens.is_empty() {
353 return Err("No tokens in term".to_string());
354 }
355
356 if tokens.len() == 1 {
357 Ok(Box::new(TermQuery::text(field_id, &tokens[0])))
358 } else {
359 let mut bool_query = BooleanQuery::new();
361 for token in &tokens {
362 bool_query = bool_query.must(TermQuery::text(field_id, token));
363 }
364 Ok(Box::new(bool_query))
365 }
366 } else if !self.default_fields.is_empty() {
367 let tokenizer = self.get_tokenizer(self.default_fields[0]);
369 let tokens: Vec<String> = tokenizer
370 .tokenize(term)
371 .into_iter()
372 .map(|t| t.text.to_lowercase())
373 .collect();
374
375 if tokens.is_empty() {
376 return Err("No tokens in term".to_string());
377 }
378
379 let mut bool_query = BooleanQuery::new();
381 for token in &tokens {
382 for &field_id in &self.default_fields {
383 bool_query = bool_query.should(TermQuery::text(field_id, token));
384 }
385 }
386 Ok(Box::new(bool_query))
387 } else {
388 Err("No field specified and no default fields configured".to_string())
389 }
390 }
391
392 fn build_phrase_query(
393 &self,
394 field: Option<&str>,
395 phrase: &str,
396 ) -> Result<Box<dyn Query>, String> {
397 let field_id = if let Some(field_name) = field {
399 self.schema
400 .get_field(field_name)
401 .ok_or_else(|| format!("Unknown field: {}", field_name))?
402 } else if !self.default_fields.is_empty() {
403 self.default_fields[0]
404 } else {
405 return Err("No field specified and no default fields configured".to_string());
406 };
407
408 let tokenizer = self.get_tokenizer(field_id);
409 let tokens: Vec<String> = tokenizer
410 .tokenize(phrase)
411 .into_iter()
412 .map(|t| t.text.to_lowercase())
413 .collect();
414
415 if tokens.is_empty() {
416 return Err("No tokens in phrase".to_string());
417 }
418
419 if tokens.len() == 1 {
420 return Ok(Box::new(TermQuery::text(field_id, &tokens[0])));
421 }
422
423 let mut bool_query = BooleanQuery::new();
425 for token in &tokens {
426 bool_query = bool_query.must(TermQuery::text(field_id, token));
427 }
428
429 if field.is_none() && self.default_fields.len() > 1 {
431 let mut outer = BooleanQuery::new();
432 for &f in &self.default_fields {
433 let tokenizer = self.get_tokenizer(f);
434 let tokens: Vec<String> = tokenizer
435 .tokenize(phrase)
436 .into_iter()
437 .map(|t| t.text.to_lowercase())
438 .collect();
439
440 let mut field_query = BooleanQuery::new();
441 for token in &tokens {
442 field_query = field_query.must(TermQuery::text(f, token));
443 }
444 outer = outer.should(field_query);
445 }
446 return Ok(Box::new(outer));
447 }
448
449 Ok(Box::new(bool_query))
450 }
451
452 fn get_tokenizer(&self, field: Field) -> BoxedTokenizer {
453 let tokenizer_name = self
455 .schema
456 .get_field_entry(field)
457 .and_then(|entry| entry.tokenizer.as_deref())
458 .unwrap_or("default");
459
460 self.tokenizers
461 .get(tokenizer_name)
462 .unwrap_or_else(|| Box::new(crate::tokenizer::LowercaseTokenizer))
463 }
464}
465
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dsl::query_field_router::{QueryFieldRouter, QueryRouterRule, RoutingMode};
    use crate::dsl::SchemaBuilder;
    use crate::tokenizer::TokenizerRegistry;

    /// Schema with `title` and `body` text fields, both used as defaults.
    fn setup() -> (Arc<Schema>, Vec<Field>, Arc<TokenizerRegistry>) {
        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let body = builder.add_text_field("body", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());
        (schema, vec![title, body], tokenizers)
    }

    /// Parses `query` with a router-less parser over the `setup()` schema
    /// and panics if parsing fails.
    fn assert_parses(query: &str) {
        let (schema, default_fields, tokenizers) = setup();
        let parser = QueryLanguageParser::new(schema, default_fields, tokenizers);
        let _query = parser.parse(query).unwrap();
    }

    /// Schema with `title` and `uri` text fields; returns the `title` field.
    fn title_uri_schema() -> (Arc<Schema>, Field) {
        let mut builder = SchemaBuilder::default();
        let title = builder.add_text_field("title", true, true);
        let _uri = builder.add_text_field("uri", true, true);
        (Arc::new(builder.build()), title)
    }

    /// Router holding exactly one rule.
    fn single_rule_router(
        pattern: &str,
        substitution: &str,
        target_field: &str,
        mode: RoutingMode,
    ) -> QueryFieldRouter {
        QueryFieldRouter::from_rules(&[QueryRouterRule {
            pattern: pattern.to_string(),
            substitution: substitution.to_string(),
            target_field: target_field.to_string(),
            mode,
        }])
        .unwrap()
    }

    #[test]
    fn test_simple_term() {
        assert_parses("rust");
    }

    #[test]
    fn test_field_term() {
        assert_parses("title:rust");
    }

    #[test]
    fn test_boolean_and() {
        assert_parses("rust AND programming");
    }

    #[test]
    fn test_match_query() {
        assert_parses("hello world");
    }

    #[test]
    fn test_phrase_query() {
        assert_parses("\"hello world\"");
    }

    #[test]
    fn test_boolean_or() {
        assert_parses("rust OR python");
    }

    #[test]
    fn test_complex_query() {
        assert_parses("(rust OR python) AND programming");
    }

    #[test]
    fn test_router_exclusive_mode() {
        let (schema, _title) = title_uri_schema();
        let tokenizers = Arc::new(TokenizerRegistry::default());
        let router = single_rule_router(
            r"^doi:(10\.\d{4,}/[^\s]+)$",
            "doi://{1}",
            "uri",
            RoutingMode::Exclusive,
        );

        // No default fields: an exclusive route must not need them.
        let parser = QueryLanguageParser::with_router(schema, vec![], tokenizers, router);

        let _query = parser.parse("doi:10.1234/test.123").unwrap();
    }

    #[test]
    fn test_router_additional_mode() {
        let (schema, title) = title_uri_schema();
        let tokenizers = Arc::new(TokenizerRegistry::default());
        let router = single_rule_router(r"#(\d+)", "{1}", "uri", RoutingMode::Additional);

        let parser = QueryLanguageParser::with_router(schema, vec![title], tokenizers, router);

        let _query = parser.parse("#42").unwrap();
    }

    #[test]
    fn test_router_no_match_falls_through() {
        let (schema, title) = title_uri_schema();
        let tokenizers = Arc::new(TokenizerRegistry::default());
        let router = single_rule_router(r"^doi:", "{0}", "uri", RoutingMode::Exclusive);

        let parser = QueryLanguageParser::with_router(schema, vec![title], tokenizers, router);

        let _query = parser.parse("rust programming").unwrap();
    }

    #[test]
    fn test_router_invalid_target_field() {
        let mut builder = SchemaBuilder::default();
        let _title = builder.add_text_field("title", true, true);
        let schema = Arc::new(builder.build());
        let tokenizers = Arc::new(TokenizerRegistry::default());
        let router = single_rule_router(r"test", "{0}", "nonexistent", RoutingMode::Exclusive);

        let parser = QueryLanguageParser::with_router(schema, vec![], tokenizers, router);

        let result = parser.parse("test");
        assert!(result.is_err());
        let err = result.err().unwrap();
        assert!(err.contains("Unknown target field"));
    }
}