1use std::collections::HashSet;
9
10use crate::analysis::token::Token;
11
/// A processing stage that rewrites a list of tokens in place.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
pub trait TokenFilter: Send + Sync {
    /// Applies this filter to `tokens`, mutating the vector directly.
    ///
    /// The implementations in this file variously change token text, remove
    /// tokens, or add new tokens.
    fn apply(&self, tokens: &mut Vec<Token>);
}
20
21pub struct LowercaseFilter;
29
30impl TokenFilter for LowercaseFilter {
31 fn apply(&self, tokens: &mut Vec<Token>) {
32 for token in tokens.iter_mut() {
33 let lowered = token.text.to_lowercase();
35 if lowered != token.text {
36 token.text = lowered;
37 }
38 }
39 }
40}
41
42pub struct StopFilter {
50 stop_words: HashSet<String>,
51}
52
53impl StopFilter {
54 pub fn new(words: impl IntoIterator<Item = impl Into<String>>) -> Self {
56 Self {
57 stop_words: words.into_iter().map(Into::into).collect(),
58 }
59 }
60
61 pub fn english() -> Self {
65 Self::new(ENGLISH_STOP_WORDS.iter().copied())
66 }
67}
68
69impl TokenFilter for StopFilter {
70 fn apply(&self, tokens: &mut Vec<Token>) {
71 tokens.retain(|token| !self.stop_words.contains(&token.text));
72 }
73}
74
/// Built-in English stop words used by `StopFilter::english`.
///
/// Every entry is lowercase; `StopFilter` compares token text against these
/// entries exactly, so differently-cased tokens do not match.
const ENGLISH_STOP_WORDS: &[&str] = &[
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with",
];
81
82pub struct StemmerFilter {
92 algorithm: rust_stemmers::Algorithm,
93}
94
95impl StemmerFilter {
96 pub fn new(algorithm: rust_stemmers::Algorithm) -> Self {
98 Self { algorithm }
99 }
100
101 pub fn english() -> Self {
103 Self::new(rust_stemmers::Algorithm::English)
104 }
105}
106
107impl TokenFilter for StemmerFilter {
108 fn apply(&self, tokens: &mut Vec<Token>) {
109 let stemmer = rust_stemmers::Stemmer::create(self.algorithm);
110 for token in tokens.iter_mut() {
111 let stemmed = stemmer.stem(&token.text);
112 if stemmed != token.text {
113 token.text = stemmed.into_owned();
114 }
115 }
116 }
117}
118
119pub use rust_stemmers::Algorithm as StemmerAlgorithm;
122
/// Token filter that folds non-ASCII characters in token text to ASCII
/// replacements where a mapping is known.
pub struct AsciiFoldingFilter {
    /// When `true`, the original token is kept and a folded copy is added;
    /// when `false`, the token's text is overwritten with the folded form.
    pub preserve_original: bool,
}

impl AsciiFoldingFilter {
    /// Creates a folding filter with the given `preserve_original` setting.
    pub fn new(preserve_original: bool) -> Self {
        AsciiFoldingFilter { preserve_original }
    }
}
141
142impl TokenFilter for AsciiFoldingFilter {
143 fn apply(&self, tokens: &mut Vec<Token>) {
144 if self.preserve_original {
145 let mut extra = Vec::new();
146 for token in tokens.iter_mut() {
147 let folded = ascii_fold(&token.text);
148 if folded != token.text {
149 extra.push(Token {
150 text: folded,
151 offset_from: token.offset_from,
152 offset_to: token.offset_to,
153 position: token.position, });
155 }
156 }
157 tokens.extend(extra);
158 } else {
159 for token in tokens.iter_mut() {
160 let folded = ascii_fold(&token.text);
161 if folded != token.text {
162 token.text = folded;
163 }
164 }
165 }
166 }
167}
168
/// Returns a copy of `s` in which every non-ASCII character with a known
/// ASCII replacement has been substituted.
///
/// ASCII characters and characters without a mapping are copied through
/// unchanged. A single input character may expand to several output
/// characters (for example `ß` becomes `ss`).
fn ascii_fold(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii() {
            result.push(ch);
        } else if let Some(replacement) = fold_char(ch) {
            result.push_str(replacement);
        } else if let Some(ascii) = fold_fullwidth(ch) {
            result.push(ascii);
        } else {
            // No known folding: keep the character as is. Pushing the `char`
            // directly avoids allocating (and previously leaking) a string
            // for every unmapped character.
            result.push(ch);
        }
    }
    result
}

/// Looks up the ASCII replacement for a Latin letter with diacritics or a
/// Latin ligature.
///
/// Returns `None` when `ch` is not in the table.
fn fold_char(ch: char) -> Option<&'static str> {
    let folded = match ch {
        // Latin-1 Supplement.
        '\u{00C0}'..='\u{00C5}' => "A",
        '\u{00C6}' => "AE",
        '\u{00C7}' => "C",
        '\u{00C8}'..='\u{00CB}' => "E",
        '\u{00CC}'..='\u{00CF}' => "I",
        '\u{00D0}' => "D",
        '\u{00D1}' => "N",
        '\u{00D2}'..='\u{00D6}' => "O",
        '\u{00D8}' => "O",
        '\u{00D9}'..='\u{00DC}' => "U",
        '\u{00DD}' => "Y",
        '\u{00DE}' => "TH",
        '\u{00DF}' => "ss",
        '\u{00E0}'..='\u{00E5}' => "a",
        '\u{00E6}' => "ae",
        '\u{00E7}' => "c",
        '\u{00E8}'..='\u{00EB}' => "e",
        '\u{00EC}'..='\u{00EF}' => "i",
        '\u{00F0}' => "d",
        '\u{00F1}' => "n",
        '\u{00F2}'..='\u{00F6}' => "o",
        '\u{00F8}' => "o",
        '\u{00F9}'..='\u{00FC}' => "u",
        '\u{00FD}' | '\u{00FF}' => "y",
        '\u{00FE}' => "th",

        // Latin Extended-A.
        '\u{0100}' | '\u{0102}' | '\u{0104}' => "A",
        '\u{0101}' | '\u{0103}' | '\u{0105}' => "a",
        '\u{0106}' | '\u{0108}' | '\u{010A}' | '\u{010C}' => "C",
        '\u{0107}' | '\u{0109}' | '\u{010B}' | '\u{010D}' => "c",
        '\u{010E}' | '\u{0110}' => "D",
        '\u{010F}' | '\u{0111}' => "d",
        '\u{0112}' | '\u{0114}' | '\u{0116}' | '\u{0118}' | '\u{011A}' => "E",
        '\u{0113}' | '\u{0115}' | '\u{0117}' | '\u{0119}' | '\u{011B}' => "e",
        '\u{011C}' | '\u{011E}' | '\u{0120}' | '\u{0122}' => "G",
        '\u{011D}' | '\u{011F}' | '\u{0121}' | '\u{0123}' => "g",
        '\u{0124}' | '\u{0126}' => "H",
        '\u{0125}' | '\u{0127}' => "h",
        '\u{0128}' | '\u{012A}' | '\u{012C}' | '\u{012E}' | '\u{0130}' => "I",
        '\u{0129}' | '\u{012B}' | '\u{012D}' | '\u{012F}' | '\u{0131}' => "i",
        '\u{0132}' => "IJ",
        '\u{0133}' => "ij",
        '\u{0134}' => "J",
        '\u{0135}' => "j",
        '\u{0136}' => "K",
        '\u{0137}' | '\u{0138}' => "k",
        '\u{0139}' | '\u{013B}' | '\u{013D}' | '\u{013F}' | '\u{0141}' => "L",
        '\u{013A}' | '\u{013C}' | '\u{013E}' | '\u{0140}' | '\u{0142}' => "l",
        '\u{0143}' | '\u{0145}' | '\u{0147}' | '\u{014A}' => "N",
        '\u{0144}' | '\u{0146}' | '\u{0148}' | '\u{0149}' | '\u{014B}' => "n",
        '\u{014C}' | '\u{014E}' | '\u{0150}' => "O",
        '\u{014D}' | '\u{014F}' | '\u{0151}' => "o",
        '\u{0152}' => "OE",
        '\u{0153}' => "oe",
        '\u{0154}' | '\u{0156}' | '\u{0158}' => "R",
        '\u{0155}' | '\u{0157}' | '\u{0159}' => "r",
        '\u{015A}' | '\u{015C}' | '\u{015E}' | '\u{0160}' => "S",
        '\u{015B}' | '\u{015D}' | '\u{015F}' | '\u{0161}' => "s",
        '\u{0162}' | '\u{0164}' | '\u{0166}' => "T",
        '\u{0163}' | '\u{0165}' | '\u{0167}' => "t",
        '\u{0168}' | '\u{016A}' | '\u{016C}' | '\u{016E}' | '\u{0170}' | '\u{0172}' => "U",
        '\u{0169}' | '\u{016B}' | '\u{016D}' | '\u{016F}' | '\u{0171}' | '\u{0173}' => "u",
        '\u{0174}' => "W",
        '\u{0175}' => "w",
        '\u{0176}' => "Y",
        '\u{0177}' => "y",
        '\u{0178}' => "Y",
        '\u{0179}' | '\u{017B}' | '\u{017D}' => "Z",
        '\u{017A}' | '\u{017C}' | '\u{017E}' => "z",

        // Latin Extended-B: S and T with comma below. U+021A/U+021B are the
        // T forms, so they fold to "T"/"t" (they were previously grouped with
        // the S forms by mistake).
        '\u{0218}' => "S",
        '\u{0219}' => "s",
        '\u{021A}' => "T",
        '\u{021B}' => "t",
        '\u{01A0}' | '\u{01A2}' => "O",
        '\u{01A1}' | '\u{01A3}' => "o",
        '\u{01AF}' => "U",
        '\u{01B0}' => "u",

        _ => return None,
    };
    Some(folded)
}

/// Maps a fullwidth Latin letter (U+FF21..=U+FF3A, U+FF41..=U+FF5A) to the
/// ASCII letter at the same alphabet index.
///
/// Returns `None` for any other character.
fn fold_fullwidth(ch: char) -> Option<char> {
    let (first_code, first_ascii) = match ch {
        '\u{FF21}'..='\u{FF3A}' => (0xFF21, b'A'),
        '\u{FF41}'..='\u{FF5A}' => (0xFF41, b'a'),
        _ => return None,
    };
    // `delta` is the letter's index within the alphabet (0..=25).
    let delta = u32::from(ch) - first_code;
    char::from_u32(u32::from(first_ascii) + delta)
}
359
/// Token filter that replaces each token with its character n-grams.
pub struct NGramTokenFilter {
    /// Smallest gram length, counted in `char`s.
    pub min_gram: usize,
    /// Largest gram length, counted in `char`s.
    pub max_gram: usize,
}

impl NGramTokenFilter {
    /// Creates a filter emitting grams of every length in `min_gram..=max_gram`.
    pub fn new(min_gram: usize, max_gram: usize) -> Self {
        NGramTokenFilter { min_gram, max_gram }
    }
}
378
379impl TokenFilter for NGramTokenFilter {
380 fn apply(&self, tokens: &mut Vec<Token>) {
381 let original = std::mem::take(tokens);
382 for token in &original {
383 let chars: Vec<(usize, char)> = token.text.char_indices().collect();
384 for n in self.min_gram..=self.max_gram {
385 if n > chars.len() {
386 break;
387 }
388 for i in 0..=chars.len() - n {
389 let start = chars[i].0;
390 let end = if i + n < chars.len() {
391 chars[i + n].0
392 } else {
393 token.text.len()
394 };
395 tokens.push(Token {
396 text: token.text[start..end].to_string(),
397 offset_from: token.offset_from,
398 offset_to: token.offset_to,
399 position: token.position,
400 });
401 }
402 }
403 }
404 }
405}
406
/// Token filter that replaces each token with prefixes ("edge n-grams") of
/// its text.
pub struct EdgeNGramTokenFilter {
    /// Smallest prefix length, counted in `char`s.
    pub min_gram: usize,
    /// Largest prefix length, counted in `char`s.
    pub max_gram: usize,
    /// When `true`, the unmodified token is also emitted if none of the
    /// emitted prefixes already covers its whole text.
    pub preserve_original: bool,
}

impl EdgeNGramTokenFilter {
    /// Creates a filter emitting prefixes of every length in
    /// `min_gram..=max_gram`, optionally keeping the original token.
    pub fn new(min_gram: usize, max_gram: usize, preserve_original: bool) -> Self {
        EdgeNGramTokenFilter {
            preserve_original,
            max_gram,
            min_gram,
        }
    }
}
427
428impl TokenFilter for EdgeNGramTokenFilter {
429 fn apply(&self, tokens: &mut Vec<Token>) {
430 let original = std::mem::take(tokens);
431 for token in &original {
432 let chars: Vec<(usize, char)> = token.text.char_indices().collect();
433 let mut emitted_original = false;
434 for n in self.min_gram..=self.max_gram.min(chars.len()) {
435 let end = if n < chars.len() {
436 chars[n].0
437 } else {
438 token.text.len()
439 };
440 if n == chars.len() {
441 emitted_original = true;
442 }
443 tokens.push(Token {
444 text: token.text[..end].to_string(),
445 offset_from: token.offset_from,
446 offset_to: token.offset_to,
447 position: token.position,
448 });
449 }
450 if self.preserve_original && !emitted_original {
451 tokens.push(token.clone());
452 }
453 }
454 }
455}
456
/// Token filter that adds synonym tokens alongside matching tokens.
pub struct SynonymFilter {
    /// Maps a lowercased term to the synonyms emitted when that term is seen.
    /// Each list is free of duplicates and never contains its own key.
    synonym_map: std::collections::HashMap<String, Vec<String>>,
}

impl SynonymFilter {
    /// Builds a synonym filter from textual rules.
    ///
    /// Each rule is trimmed; empty rules and rules starting with `#` are
    /// skipped. All terms are trimmed and lowercased, and empty terms are
    /// ignored. Two rule shapes are understood:
    ///
    /// * `a, b => x, y` — every term on the left maps to every term on the
    ///   right.
    /// * `a, b, c` — with `expand == true`, every term maps to all the other
    ///   terms; with `expand == false`, every term after the first maps to the
    ///   first term only.
    ///
    /// Rules accumulate: a term appearing in several rules collects the
    /// synonyms of all of them. A synonym is recorded at most once per term
    /// and a term is never recorded as its own synonym, so overlapping rules
    /// cannot make the filter emit duplicate tokens.
    pub fn new(rules: &[String], expand: bool) -> Self {
        let mut synonym_map: std::collections::HashMap<String, Vec<String>> =
            std::collections::HashMap::new();

        for rule in rules {
            let rule = rule.trim();
            if rule.is_empty() || rule.starts_with('#') {
                continue;
            }

            if let Some((left, right)) = rule.split_once("=>") {
                // Explicit mapping: left-hand terms map to right-hand terms.
                let targets = Self::parse_terms(right);
                for source in Self::parse_terms(left) {
                    Self::add_synonyms(&mut synonym_map, &source, &targets);
                }
            } else {
                let terms = Self::parse_terms(rule);
                if expand {
                    // Equivalence: every term maps to all the others
                    // (`add_synonyms` skips the term itself).
                    for term in &terms {
                        Self::add_synonyms(&mut synonym_map, term, &terms);
                    }
                } else if let Some((canonical, variants)) = terms.split_first() {
                    // Contraction: later terms map to the first term only.
                    for variant in variants {
                        Self::add_synonyms(
                            &mut synonym_map,
                            variant,
                            std::slice::from_ref(canonical),
                        );
                    }
                }
            }
        }

        Self { synonym_map }
    }

    /// Splits a comma-separated list into trimmed, lowercased, non-empty terms.
    fn parse_terms(list: &str) -> Vec<String> {
        list.split(',')
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .collect()
    }

    /// Appends `synonyms` to the entry for `term`, preserving first-seen order
    /// and skipping `term` itself and anything already recorded.
    fn add_synonyms(
        map: &mut std::collections::HashMap<String, Vec<String>>,
        term: &str,
        synonyms: &[String],
    ) {
        let known = map.entry(term.to_string()).or_default();
        for synonym in synonyms {
            if synonym.as_str() != term && !known.contains(synonym) {
                known.push(synonym.clone());
            }
        }
    }
}
534
535impl TokenFilter for SynonymFilter {
536 fn apply(&self, tokens: &mut Vec<Token>) {
537 let mut extra = Vec::new();
538 for token in tokens.iter() {
539 if let Some(synonyms) = self.synonym_map.get(&token.text) {
540 for syn in synonyms {
541 extra.push(Token {
542 text: syn.clone(),
543 offset_from: token.offset_from,
544 offset_to: token.offset_to,
545 position: token.position, });
547 }
548 }
549 }
550 tokens.extend(extra);
551 }
552}
553
/// Token filter that joins runs of adjacent tokens into "shingles".
pub struct ShingleFilter {
    /// Smallest number of tokens per shingle.
    pub min_size: usize,
    /// Largest number of tokens per shingle.
    pub max_size: usize,
    /// When `true`, each input token is also emitted on its own.
    pub output_unigrams: bool,
    /// Text placed between the joined token texts.
    pub separator: String,
    /// Placeholder text; the `apply` implementation in this file does not
    /// read it.
    pub filler_token: String,
}

impl ShingleFilter {
    /// Creates a shingle filter with a single-space separator and `_` as the
    /// filler token.
    pub fn new(min_size: usize, max_size: usize, output_unigrams: bool) -> Self {
        let separator = String::from(" ");
        let filler_token = String::from("_");
        ShingleFilter {
            min_size,
            max_size,
            output_unigrams,
            separator,
            filler_token,
        }
    }
}
582
583impl TokenFilter for ShingleFilter {
584 fn apply(&self, tokens: &mut Vec<Token>) {
585 if tokens.is_empty() {
586 return;
587 }
588
589 let original = tokens.clone();
590 let mut result = Vec::new();
591
592 for (i, token) in original.iter().enumerate() {
593 if self.output_unigrams {
594 result.push(token.clone());
595 }
596
597 for size in self.min_size..=self.max_size {
599 if i + size > original.len() {
600 break;
601 }
602 let shingle_tokens = &original[i..i + size];
603 let shingle_text: String = shingle_tokens
604 .iter()
605 .map(|t| t.text.as_str())
606 .collect::<Vec<_>>()
607 .join(&self.separator);
608
609 result.push(Token {
610 text: shingle_text,
611 offset_from: shingle_tokens.first().unwrap().offset_from,
612 offset_to: shingle_tokens.last().unwrap().offset_to,
613 position: token.position,
614 });
615 }
616 }
617
618 *tokens = result;
619 }
620}
621
// Unit tests for the token filters defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds one token per word: offsets are `0..word.len()` for every token
    /// and the position is the word's index in `words`.
    fn make_tokens(words: &[&str]) -> Vec<Token> {
        words
            .iter()
            .enumerate()
            .map(|(i, w)| Token::new(*w, 0, w.len(), i as u32))
            .collect()
    }

    // --- LowercaseFilter ---

    #[test]
    fn lowercase_basic() {
        let mut tokens = make_tokens(&["Hello", "WORLD", "TeSt"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[2].text, "test");
    }

    #[test]
    fn lowercase_already_lower() {
        let mut tokens = make_tokens(&["hello", "world"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn lowercase_unicode() {
        let mut tokens = make_tokens(&["CAFÉ", "Ñoño"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(tokens[0].text, "café");
        assert_eq!(tokens[1].text, "ñoño");
    }

    #[test]
    fn lowercase_preserves_positions() {
        let mut tokens = make_tokens(&["A", "B", "C"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].position, 1);
        assert_eq!(tokens[2].position, 2);
    }

    #[test]
    fn lowercase_empty() {
        let mut tokens: Vec<Token> = Vec::new();
        LowercaseFilter.apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    // --- StopFilter ---

    #[test]
    fn stop_removes_stop_words() {
        let mut tokens = make_tokens(&["the", "quick", "brown", "fox"]);
        StopFilter::english().apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["quick", "brown", "fox"]);
    }

    #[test]
    fn stop_preserves_positions() {
        let mut tokens = make_tokens(&["the", "quick", "brown", "fox"]);
        StopFilter::english().apply(&mut tokens);
        // "the" (position 0) is removed; survivors keep their original positions.
        assert_eq!(tokens[0].position, 1);
        assert_eq!(tokens[1].position, 2);
    }

    #[test]
    fn stop_all_removed() {
        let mut tokens = make_tokens(&["the", "a", "is", "it"]);
        StopFilter::english().apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    #[test]
    fn stop_none_removed() {
        let mut tokens = make_tokens(&["quick", "brown", "fox"]);
        StopFilter::english().apply(&mut tokens);
        assert_eq!(tokens.len(), 3);
    }

    #[test]
    fn stop_custom_words() {
        let mut tokens = make_tokens(&["hello", "world", "goodbye"]);
        let filter = StopFilter::new(["hello", "goodbye"]);
        filter.apply(&mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "world");
    }

    #[test]
    fn stop_case_sensitive() {
        let mut tokens = make_tokens(&["The", "quick"]);
        StopFilter::english().apply(&mut tokens);
        // Matching is exact, so capitalized "The" is not removed.
        assert_eq!(tokens.len(), 2);
    }

    // --- StemmerFilter ---

    #[test]
    fn stemmer_english_basic() {
        let mut tokens = make_tokens(&["running", "cats", "easily"]);
        StemmerFilter::english().apply(&mut tokens);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "cat");
        assert_eq!(tokens[2].text, "easili");
    }

    #[test]
    fn stemmer_already_stemmed() {
        let mut tokens = make_tokens(&["run", "cat"]);
        StemmerFilter::english().apply(&mut tokens);
        assert_eq!(tokens[0].text, "run");
        assert_eq!(tokens[1].text, "cat");
    }

    #[test]
    fn stemmer_preserves_positions() {
        let mut tokens = make_tokens(&["running", "jumping"]);
        StemmerFilter::english().apply(&mut tokens);
        assert_eq!(tokens[0].position, 0);
        assert_eq!(tokens[1].position, 1);
    }

    #[test]
    fn stemmer_empty() {
        let mut tokens: Vec<Token> = Vec::new();
        StemmerFilter::english().apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    // --- AsciiFoldingFilter ---

    #[test]
    fn asciifolding_basic() {
        let mut tokens = make_tokens(&["café", "résumé", "naïve"]);
        AsciiFoldingFilter::new(false).apply(&mut tokens);
        assert_eq!(tokens[0].text, "cafe");
        assert_eq!(tokens[1].text, "resume");
        assert_eq!(tokens[2].text, "naive");
    }

    #[test]
    fn asciifolding_no_change() {
        let mut tokens = make_tokens(&["hello", "world"]);
        AsciiFoldingFilter::new(false).apply(&mut tokens);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
    }

    #[test]
    fn asciifolding_preserve_original() {
        let mut tokens = make_tokens(&["café"]);
        AsciiFoldingFilter::new(true).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        // Both the original and the folded form must be present.
        assert!(texts.contains(&"café"));
        assert!(texts.contains(&"cafe"));
    }

    #[test]
    fn asciifolding_german() {
        let mut tokens = make_tokens(&["über", "straße"]);
        AsciiFoldingFilter::new(false).apply(&mut tokens);
        assert_eq!(tokens[0].text, "uber");
        assert_eq!(tokens[1].text, "strasse");
    }

    #[test]
    fn asciifolding_ligatures() {
        let mut tokens = make_tokens(&["Æneid", "œuvre"]);
        AsciiFoldingFilter::new(false).apply(&mut tokens);
        assert_eq!(tokens[0].text, "AEneid");
        assert_eq!(tokens[1].text, "oeuvre");
    }

    // --- NGramTokenFilter ---

    #[test]
    fn ngram_filter_basic() {
        let mut tokens = make_tokens(&["quick"]);
        NGramTokenFilter::new(2, 3).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        // All 2-grams left to right, then all 3-grams.
        assert_eq!(texts, vec!["qu", "ui", "ic", "ck", "qui", "uic", "ick"]);
    }

    #[test]
    fn ngram_filter_empty() {
        let mut tokens: Vec<Token> = Vec::new();
        NGramTokenFilter::new(2, 3).apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    // --- EdgeNGramTokenFilter ---

    #[test]
    fn edge_ngram_filter_basic() {
        let mut tokens = make_tokens(&["quick"]);
        EdgeNGramTokenFilter::new(2, 4, false).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["qu", "qui", "quic"]);
    }

    #[test]
    fn edge_ngram_filter_preserve_original() {
        let mut tokens = make_tokens(&["quick"]);
        EdgeNGramTokenFilter::new(2, 3, true).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        // The full token follows its prefixes because no prefix covers it.
        assert_eq!(texts, vec!["qu", "qui", "quick"]);
    }

    // --- SynonymFilter ---

    #[test]
    fn synonym_equivalent() {
        let filter = SynonymFilter::new(&["quick, fast, speedy".to_string()], true);
        let mut tokens = make_tokens(&["quick"]);
        filter.apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"fast"));
        assert!(texts.contains(&"speedy"));
    }

    #[test]
    fn synonym_explicit() {
        let filter = SynonymFilter::new(&["big => large".to_string()], true);
        let mut tokens = make_tokens(&["big"]);
        filter.apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        // The original token is kept and the mapped term is added.
        assert!(texts.contains(&"big"));
        assert!(texts.contains(&"large"));
    }

    #[test]
    fn synonym_no_match() {
        let filter = SynonymFilter::new(&["quick, fast".to_string()], true);
        let mut tokens = make_tokens(&["slow"]);
        filter.apply(&mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "slow");
    }

    #[test]
    fn synonym_expand_false() {
        let filter = SynonymFilter::new(&["quick, fast, speedy".to_string()], false);
        let mut tokens = make_tokens(&["fast"]);
        filter.apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert!(texts.contains(&"fast"));
        // Without expansion, "fast" maps to the first term of the rule.
        assert!(texts.contains(&"quick"));
    }

    #[test]
    fn synonym_same_position() {
        let filter = SynonymFilter::new(&["quick, fast".to_string()], true);
        let mut tokens = make_tokens(&["quick"]);
        filter.apply(&mut tokens);
        // Synonyms share the position of the token that produced them.
        assert!(tokens.iter().all(|t| t.position == 0));
    }

    // --- ShingleFilter ---

    #[test]
    fn shingle_basic() {
        let mut tokens = make_tokens(&["the", "quick", "brown", "fox"]);
        ShingleFilter::new(2, 2, false).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["the quick", "quick brown", "brown fox"]);
    }

    #[test]
    fn shingle_with_unigrams() {
        let mut tokens = make_tokens(&["the", "quick", "brown"]);
        ShingleFilter::new(2, 2, true).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        // Each unigram precedes the shingles that start at the same token.
        assert_eq!(
            texts,
            vec!["the", "the quick", "quick", "quick brown", "brown"]
        );
    }

    #[test]
    fn shingle_trigrams() {
        let mut tokens = make_tokens(&["a", "b", "c", "d"]);
        ShingleFilter::new(3, 3, false).apply(&mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["a b c", "b c d"]);
    }

    #[test]
    fn shingle_empty() {
        let mut tokens: Vec<Token> = Vec::new();
        ShingleFilter::new(2, 2, false).apply(&mut tokens);
        assert!(tokens.is_empty());
    }
}