1use alloc::string::{String, ToString};
18use alloc::vec::Vec;
19
20use spg_storage::{TsLexeme, TsQueryAst};
21
22use crate::eval::EvalError;
23
24#[derive(Debug, Clone, Copy, PartialEq, Eq)]
26pub enum TsConfig {
27 Simple,
30 English,
33}
34
35impl TsConfig {
36 pub fn from_name(name: &str) -> Option<Self> {
41 let bare = name.strip_prefix("pg_catalog.").unwrap_or(name);
42 match bare.to_ascii_lowercase().as_str() {
43 "simple" => Some(Self::Simple),
44 "english" => Some(Self::English),
45 _ => None,
46 }
47 }
48}
49
50pub fn to_tsvector(config: TsConfig, text: &str) -> Vec<TsLexeme> {
55 let mut out: Vec<TsLexeme> = Vec::new();
56 let mut position: u16 = 0;
57 for token in tokenize(text) {
58 let lex = match config {
59 TsConfig::Simple => token,
60 TsConfig::English => {
61 if is_english_stopword(&token) {
62 position = position.saturating_add(1);
66 continue;
67 }
68 porter_stem(&token)
69 }
70 };
71 if lex.is_empty() {
72 continue;
73 }
74 position = position.saturating_add(1);
75 match out.binary_search_by(|l| l.word.as_str().cmp(lex.as_str())) {
76 Ok(idx) => {
77 if !out[idx].positions.contains(&position) {
78 out[idx].positions.push(position);
79 }
80 }
81 Err(idx) => {
82 out.insert(
83 idx,
84 TsLexeme {
85 word: lex,
86 positions: alloc::vec![position],
87 weight: 0,
88 },
89 );
90 }
91 }
92 }
93 out
94}
95
96pub fn plainto_tsquery(config: TsConfig, text: &str) -> TsQueryAst {
102 let lexs = collect_lexemes(config, text);
103 fold_and(&lexs)
104}
105
106pub fn phraseto_tsquery(config: TsConfig, text: &str) -> TsQueryAst {
109 let lexs = collect_lexemes(config, text);
110 fold_phrase(&lexs)
111}
112
113pub fn websearch_to_tsquery(config: TsConfig, text: &str) -> TsQueryAst {
120 let mut tokens = web_tokens(text);
121 for t in &mut tokens {
123 match t {
124 WebToken::Term(s) | WebToken::NotTerm(s) => {
125 let lexs = collect_lexemes(config, s);
126 *s = lexs.join(" ");
127 }
128 WebToken::Phrase(words) => {
129 let mut combined = String::new();
130 for w in words.iter() {
131 if !combined.is_empty() {
132 combined.push(' ');
133 }
134 combined.push_str(w);
135 }
136 let lexs = collect_lexemes(config, &combined);
137 *words = lexs;
138 }
139 WebToken::Or => {}
140 }
141 }
142 let mut or_groups: Vec<Vec<TsQueryAst>> = alloc::vec![Vec::new()];
144 let mut i = 0;
145 while i < tokens.len() {
146 match &tokens[i] {
147 WebToken::Or => {
148 or_groups.push(Vec::new());
149 }
150 WebToken::Term(s) => {
151 if !s.is_empty() {
152 let node = fold_and(&split_words(s));
153 or_groups.last_mut().unwrap().push(node);
154 }
155 }
156 WebToken::NotTerm(s) => {
157 if !s.is_empty() {
158 let node = TsQueryAst::Not(alloc::boxed::Box::new(fold_and(&split_words(s))));
159 or_groups.last_mut().unwrap().push(node);
160 }
161 }
162 WebToken::Phrase(words) => {
163 if !words.is_empty() {
164 or_groups.last_mut().unwrap().push(fold_phrase(words));
165 }
166 }
167 }
168 i += 1;
169 }
170 let group_nodes: Vec<TsQueryAst> = or_groups
171 .into_iter()
172 .filter_map(|g| {
173 if g.is_empty() {
174 None
175 } else {
176 let mut it = g.into_iter();
177 let first = it.next().unwrap();
178 Some(it.fold(first, |acc, n| {
179 TsQueryAst::And(alloc::boxed::Box::new(acc), alloc::boxed::Box::new(n))
180 }))
181 }
182 })
183 .collect();
184 if group_nodes.is_empty() {
185 return TsQueryAst::Term {
186 word: String::new(),
187 weight_mask: 0,
188 };
189 }
190 let mut it = group_nodes.into_iter();
191 let first = it.next().unwrap();
192 it.fold(first, |acc, n| {
193 TsQueryAst::Or(alloc::boxed::Box::new(acc), alloc::boxed::Box::new(n))
194 })
195}
196
197pub fn to_tsquery(config: TsConfig, text: &str) -> Result<TsQueryAst, EvalError> {
202 let mut ast = crate::eval::decode_tsquery_external(text)?;
203 stem_tsquery_in_place(&mut ast, config);
204 Ok(ast)
205}
206
207fn stem_tsquery_in_place(ast: &mut TsQueryAst, config: TsConfig) {
208 match ast {
209 TsQueryAst::Term { word, .. } => {
210 let lower = word.to_lowercase();
211 *word = match config {
212 TsConfig::Simple => lower,
213 TsConfig::English => porter_stem(&lower),
214 };
215 }
216 TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
217 stem_tsquery_in_place(a, config);
218 stem_tsquery_in_place(b, config);
219 }
220 TsQueryAst::Not(x) => stem_tsquery_in_place(x, config),
221 TsQueryAst::Phrase { left, right, .. } => {
222 stem_tsquery_in_place(left, config);
223 stem_tsquery_in_place(right, config);
224 }
225 }
226}
227
228fn collect_lexemes(config: TsConfig, text: &str) -> Vec<String> {
229 let mut out: Vec<String> = Vec::new();
230 for token in tokenize(text) {
231 match config {
232 TsConfig::Simple => out.push(token),
233 TsConfig::English => {
234 if is_english_stopword(&token) {
235 continue;
236 }
237 let stemmed = porter_stem(&token);
238 if !stemmed.is_empty() {
239 out.push(stemmed);
240 }
241 }
242 }
243 }
244 out
245}
246
247fn split_words(s: &str) -> Vec<String> {
248 s.split_whitespace().map(|w| w.to_string()).collect()
249}
250
251fn fold_and(lexs: &[String]) -> TsQueryAst {
252 if lexs.is_empty() {
253 return TsQueryAst::Term {
254 word: String::new(),
255 weight_mask: 0,
256 };
257 }
258 let mut it = lexs.iter();
259 let first = TsQueryAst::Term {
260 word: it.next().unwrap().clone(),
261 weight_mask: 0,
262 };
263 it.fold(first, |acc, w| {
264 TsQueryAst::And(
265 alloc::boxed::Box::new(acc),
266 alloc::boxed::Box::new(TsQueryAst::Term {
267 word: w.clone(),
268 weight_mask: 0,
269 }),
270 )
271 })
272}
273
274fn fold_phrase(lexs: &[String]) -> TsQueryAst {
275 if lexs.is_empty() {
276 return TsQueryAst::Term {
277 word: String::new(),
278 weight_mask: 0,
279 };
280 }
281 let mut it = lexs.iter();
282 let first = TsQueryAst::Term {
283 word: it.next().unwrap().clone(),
284 weight_mask: 0,
285 };
286 it.fold(first, |acc, w| TsQueryAst::Phrase {
287 left: alloc::boxed::Box::new(acc),
288 right: alloc::boxed::Box::new(TsQueryAst::Term {
289 word: w.clone(),
290 weight_mask: 0,
291 }),
292 distance: 1,
293 })
294}
295
296#[must_use]
305pub fn ts_query_matches(vec: &[TsLexeme], query: &TsQueryAst) -> bool {
306 match query {
307 TsQueryAst::Term { word, .. } => contains_lexeme(vec, word),
308 TsQueryAst::And(a, b) => ts_query_matches(vec, a) && ts_query_matches(vec, b),
309 TsQueryAst::Or(a, b) => ts_query_matches(vec, a) || ts_query_matches(vec, b),
310 TsQueryAst::Not(x) => !ts_query_matches(vec, x),
311 TsQueryAst::Phrase {
312 left,
313 right,
314 distance,
315 } => phrase_match(vec, left, right, *distance),
316 }
317}
318
319fn contains_lexeme(vec: &[TsLexeme], word: &str) -> bool {
320 vec.binary_search_by(|l| l.word.as_str().cmp(word)).is_ok()
321}
322
323fn phrase_positions(vec: &[TsLexeme], q: &TsQueryAst) -> Vec<u16> {
328 match q {
329 TsQueryAst::Term { word, .. } => {
330 match vec.binary_search_by(|l| l.word.as_str().cmp(word)) {
331 Ok(idx) => vec[idx].positions.clone(),
332 Err(_) => Vec::new(),
333 }
334 }
335 TsQueryAst::Phrase {
336 left,
337 right,
338 distance,
339 } => {
340 let lp = phrase_positions(vec, left);
341 let rp = phrase_positions(vec, right);
342 let mut out = Vec::new();
343 for l in &lp {
344 let target = l.saturating_add(*distance);
345 if rp.binary_search(&target).is_ok() {
346 out.push(target);
347 }
348 }
349 out.sort_unstable();
350 out.dedup();
351 out
352 }
353 _ => {
356 if ts_query_matches(vec, q) {
357 alloc::vec![u16::MAX]
358 } else {
359 Vec::new()
360 }
361 }
362 }
363}
364
365fn phrase_match(vec: &[TsLexeme], left: &TsQueryAst, right: &TsQueryAst, distance: u16) -> bool {
366 let lp = phrase_positions(vec, left);
367 let rp = phrase_positions(vec, right);
368 lp.iter().any(|l| {
369 let target = l.saturating_add(distance);
370 rp.binary_search(&target).is_ok()
371 })
372}
373
374#[must_use]
379pub fn ts_rank(vec: &[TsLexeme], query: &TsQueryAst) -> f32 {
380 let mut score = 0.0f32;
381 let mut unique_terms = 0usize;
382 collect_rank_terms(query, vec, &mut score, &mut unique_terms);
383 if unique_terms == 0 {
384 return 0.0;
385 }
386 let denom = 1.0 + ln_approx(unique_terms as f32);
387 score / denom
388}
389
390#[must_use]
395pub fn ts_rank_cd(vec: &[TsLexeme], query: &TsQueryAst) -> f32 {
396 let mut matched_positions: Vec<u16> = Vec::new();
397 let mut score = 0.0f32;
398 let mut unique_terms = 0usize;
399 collect_cd_positions(
400 query,
401 vec,
402 &mut matched_positions,
403 &mut score,
404 &mut unique_terms,
405 );
406 if matched_positions.is_empty() || unique_terms == 0 {
407 return 0.0;
408 }
409 matched_positions.sort_unstable();
410 matched_positions.dedup();
411 if matched_positions.len() == 1 {
414 return score / (1.0 + ln_approx(unique_terms as f32));
415 }
416 let gaps: u32 = matched_positions
417 .windows(2)
418 .map(|w| u32::from(w[1] - w[0]))
419 .sum();
420 let avg_gap = (gaps as f32) / ((matched_positions.len() - 1) as f32);
421 let density = 1.0 / avg_gap.max(1.0);
422 score * density / (1.0 + ln_approx(unique_terms as f32))
423}
424
425fn ln_approx(x: f32) -> f32 {
429 if x <= 0.0 {
430 return 0.0;
431 }
432 let xd = f64::from(x);
433 let bits = xd.to_bits();
434 let exponent_raw = ((bits >> 52) & 0x7ff) as i64;
435 let exponent = exponent_raw - 1023;
436 let mantissa_bits = (bits & 0x000f_ffff_ffff_ffff) | 0x3ff0_0000_0000_0000;
437 let mantissa = f64::from_bits(mantissa_bits);
438 let t = (mantissa - 1.0) / (mantissa + 1.0);
439 let t2 = t * t;
440 let ln_mantissa = 2.0 * (t + t2 * t / 3.0 + t2 * t2 * t / 5.0 + t2 * t2 * t2 * t / 7.0);
441 let ln = (exponent as f64) * core::f64::consts::LN_2 + ln_mantissa;
442 ln as f32
443}
444
445fn weight_factor(w: u8) -> f32 {
446 match w {
448 3 => 1.0,
449 2 => 0.4,
450 1 => 0.2,
451 _ => 0.1,
452 }
453}
454
455fn collect_rank_terms(query: &TsQueryAst, vec: &[TsLexeme], score: &mut f32, n: &mut usize) {
456 match query {
457 TsQueryAst::Term { word, .. } => {
458 *n += 1;
459 if let Ok(idx) = vec.binary_search_by(|l| l.word.as_str().cmp(word.as_str())) {
460 let w = vec[idx].weight;
461 let occurrences = vec[idx].positions.len().max(1) as f32;
462 *score += weight_factor(w) * occurrences;
463 }
464 }
465 TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
466 collect_rank_terms(a, vec, score, n);
467 collect_rank_terms(b, vec, score, n);
468 }
469 TsQueryAst::Not(_) => {
470 }
472 TsQueryAst::Phrase { left, right, .. } => {
473 collect_rank_terms(left, vec, score, n);
474 collect_rank_terms(right, vec, score, n);
475 }
476 }
477}
478
479fn collect_cd_positions(
480 query: &TsQueryAst,
481 vec: &[TsLexeme],
482 positions: &mut Vec<u16>,
483 score: &mut f32,
484 n: &mut usize,
485) {
486 match query {
487 TsQueryAst::Term { word, .. } => {
488 *n += 1;
489 if let Ok(idx) = vec.binary_search_by(|l| l.word.as_str().cmp(word.as_str())) {
490 positions.extend_from_slice(&vec[idx].positions);
491 let w = vec[idx].weight;
492 let occurrences = vec[idx].positions.len().max(1) as f32;
493 *score += weight_factor(w) * occurrences;
494 }
495 }
496 TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
497 collect_cd_positions(a, vec, positions, score, n);
498 collect_cd_positions(b, vec, positions, score, n);
499 }
500 TsQueryAst::Not(_) => {}
501 TsQueryAst::Phrase { left, right, .. } => {
502 collect_cd_positions(left, vec, positions, score, n);
503 collect_cd_positions(right, vec, positions, score, n);
504 }
505 }
506}
507
508pub fn tokenize(text: &str) -> Vec<String> {
512 let mut out = Vec::new();
513 let mut cur = String::new();
514 for c in text.chars() {
515 if c.is_alphanumeric() || c == '_' {
516 for lc in c.to_lowercase() {
517 cur.push(lc);
518 }
519 } else if !cur.is_empty() {
520 out.push(core::mem::take(&mut cur));
521 }
522 }
523 if !cur.is_empty() {
524 out.push(cur);
525 }
526 out
527}
528
529enum WebToken {
530 Term(String),
531 NotTerm(String),
532 Phrase(Vec<String>),
533 Or,
534}
535
536fn web_tokens(text: &str) -> Vec<WebToken> {
539 let mut out = Vec::new();
540 let bytes = text.as_bytes();
541 let mut i = 0;
542 while i < bytes.len() {
543 let b = bytes[i];
544 if b.is_ascii_whitespace() {
545 i += 1;
546 continue;
547 }
548 if b == b'"' {
549 i += 1;
550 let start = i;
551 while i < bytes.len() && bytes[i] != b'"' {
552 i += 1;
553 }
554 let phrase_text = &text[start..i];
555 let words: Vec<String> = phrase_text
556 .split_whitespace()
557 .map(|w| w.to_string())
558 .collect();
559 out.push(WebToken::Phrase(words));
560 if i < bytes.len() {
561 i += 1; }
563 continue;
564 }
565 let negate = b == b'-';
566 if negate {
567 i += 1;
568 }
569 let start = i;
570 while i < bytes.len() && !bytes[i].is_ascii_whitespace() && bytes[i] != b'"' {
571 i += 1;
572 }
573 let word = &text[start..i];
574 if word.eq_ignore_ascii_case("or") && !negate {
575 out.push(WebToken::Or);
576 } else if negate {
577 out.push(WebToken::NotTerm(word.to_string()));
578 } else {
579 out.push(WebToken::Term(word.to_string()));
580 }
581 }
582 out
583}
584
585pub fn is_english_stopword(word: &str) -> bool {
588 matches!(
589 word,
590 "i" | "me"
591 | "my"
592 | "myself"
593 | "we"
594 | "our"
595 | "ours"
596 | "ourselves"
597 | "you"
598 | "your"
599 | "yours"
600 | "yourself"
601 | "yourselves"
602 | "he"
603 | "him"
604 | "his"
605 | "himself"
606 | "she"
607 | "her"
608 | "hers"
609 | "herself"
610 | "it"
611 | "its"
612 | "itself"
613 | "they"
614 | "them"
615 | "their"
616 | "theirs"
617 | "themselves"
618 | "what"
619 | "which"
620 | "who"
621 | "whom"
622 | "this"
623 | "that"
624 | "these"
625 | "those"
626 | "am"
627 | "is"
628 | "are"
629 | "was"
630 | "were"
631 | "be"
632 | "been"
633 | "being"
634 | "have"
635 | "has"
636 | "had"
637 | "having"
638 | "do"
639 | "does"
640 | "did"
641 | "doing"
642 | "a"
643 | "an"
644 | "the"
645 | "and"
646 | "but"
647 | "if"
648 | "or"
649 | "because"
650 | "as"
651 | "until"
652 | "while"
653 | "of"
654 | "at"
655 | "by"
656 | "for"
657 | "with"
658 | "about"
659 | "against"
660 | "between"
661 | "into"
662 | "through"
663 | "during"
664 | "before"
665 | "after"
666 | "above"
667 | "below"
668 | "to"
669 | "from"
670 | "up"
671 | "down"
672 | "in"
673 | "out"
674 | "on"
675 | "off"
676 | "over"
677 | "under"
678 | "again"
679 | "further"
680 | "then"
681 | "once"
682 | "here"
683 | "there"
684 | "when"
685 | "where"
686 | "why"
687 | "how"
688 | "all"
689 | "any"
690 | "both"
691 | "each"
692 | "few"
693 | "more"
694 | "most"
695 | "other"
696 | "some"
697 | "such"
698 | "no"
699 | "nor"
700 | "not"
701 | "only"
702 | "own"
703 | "same"
704 | "so"
705 | "than"
706 | "too"
707 | "very"
708 | "s"
709 | "t"
710 | "can"
711 | "will"
712 | "just"
713 | "don"
714 | "should"
715 | "now"
716 )
717}
718
719pub fn porter_stem(word: &str) -> String {
728 if !word.is_ascii() {
729 return word.to_string();
730 }
731 let bytes: Vec<u8> = word.bytes().collect();
732 if bytes.len() <= 2 {
733 return word.to_string();
734 }
735 let mut b = bytes;
736 step1a(&mut b);
737 step1b(&mut b);
738 step1c(&mut b);
739 step2(&mut b);
740 step3(&mut b);
741 step4(&mut b);
742 step5a(&mut b);
743 step5b(&mut b);
744 String::from_utf8(b).expect("porter stem produced non-UTF8 bytes")
746}
747
748fn is_vowel(b: &[u8], i: usize) -> bool {
749 match b[i] {
750 b'a' | b'e' | b'i' | b'o' | b'u' => true,
751 b'y' => i > 0 && !is_vowel(b, i - 1),
752 _ => false,
753 }
754}
755
756fn measure(b: &[u8]) -> usize {
758 let mut m = 0;
759 let mut prev_vowel = false;
760 let mut started = false;
761 for i in 0..b.len() {
762 let v = is_vowel(b, i);
763 if started && prev_vowel && !v {
764 m += 1;
765 }
766 prev_vowel = v;
767 started = true;
768 }
769 m
770}
771
772fn has_vowel(b: &[u8]) -> bool {
773 (0..b.len()).any(|i| is_vowel(b, i))
774}
775
776fn ends_with(b: &[u8], suf: &[u8]) -> bool {
777 b.len() >= suf.len() && &b[b.len() - suf.len()..] == suf
778}
779
780fn replace_suffix(b: &mut Vec<u8>, suf_len: usize, new_suf: &[u8]) {
781 let new_len = b.len() - suf_len;
782 b.truncate(new_len);
783 b.extend_from_slice(new_suf);
784}
785
786fn measure_stem(b: &[u8], suf_len: usize) -> usize {
787 measure(&b[..b.len() - suf_len])
788}
789
790fn step1a(b: &mut Vec<u8>) {
791 if ends_with(b, b"sses") {
792 replace_suffix(b, 4, b"ss");
793 } else if ends_with(b, b"ies") {
794 replace_suffix(b, 3, b"i");
795 } else if ends_with(b, b"ss") {
796 } else if ends_with(b, b"s") {
798 replace_suffix(b, 1, b"");
799 }
800}
801
802fn step1b_post(b: &mut Vec<u8>) {
803 if ends_with(b, b"at") {
804 replace_suffix(b, 2, b"ate");
805 } else if ends_with(b, b"bl") {
806 replace_suffix(b, 2, b"ble");
807 } else if ends_with(b, b"iz") {
808 replace_suffix(b, 2, b"ize");
809 } else if b.len() >= 2 && b[b.len() - 1] == b[b.len() - 2] {
810 let last = b[b.len() - 1];
811 if !matches!(last, b'l' | b's' | b'z') {
812 b.pop();
813 }
814 } else if cvc(b) {
815 b.extend_from_slice(b"e");
816 }
817}
818
819fn cvc(b: &[u8]) -> bool {
820 if b.len() < 3 {
821 return false;
822 }
823 let l = b.len();
824 if !(is_vowel(b, l - 2) && !is_vowel(b, l - 3) && !is_vowel(b, l - 1)) {
825 return false;
826 }
827 !matches!(b[l - 1], b'w' | b'x' | b'y')
828}
829
830fn step1b(b: &mut Vec<u8>) {
831 if ends_with(b, b"eed") {
832 if measure_stem(b, 3) > 0 {
833 replace_suffix(b, 3, b"ee");
834 }
835 return;
836 }
837 if ends_with(b, b"ed") {
838 let stem_has_vowel = has_vowel(&b[..b.len() - 2]);
839 if stem_has_vowel {
840 replace_suffix(b, 2, b"");
841 step1b_post(b);
842 }
843 return;
844 }
845 if ends_with(b, b"ing") {
846 let stem_has_vowel = has_vowel(&b[..b.len() - 3]);
847 if stem_has_vowel {
848 replace_suffix(b, 3, b"");
849 step1b_post(b);
850 }
851 }
852}
853
854fn step1c(b: &mut Vec<u8>) {
855 if ends_with(b, b"y") && has_vowel(&b[..b.len() - 1]) {
856 replace_suffix(b, 1, b"i");
857 }
858}
859
860const STEP2_RULES: &[(&[u8], &[u8])] = &[
861 (b"ational", b"ate"),
862 (b"tional", b"tion"),
863 (b"enci", b"ence"),
864 (b"anci", b"ance"),
865 (b"izer", b"ize"),
866 (b"abli", b"able"),
867 (b"alli", b"al"),
868 (b"entli", b"ent"),
869 (b"eli", b"e"),
870 (b"ousli", b"ous"),
871 (b"ization", b"ize"),
872 (b"ation", b"ate"),
873 (b"ator", b"ate"),
874 (b"alism", b"al"),
875 (b"iveness", b"ive"),
876 (b"fulness", b"ful"),
877 (b"ousness", b"ous"),
878 (b"aliti", b"al"),
879 (b"iviti", b"ive"),
880 (b"biliti", b"ble"),
881];
882
883fn step2(b: &mut Vec<u8>) {
884 for (suf, repl) in STEP2_RULES {
885 if ends_with(b, suf) && measure_stem(b, suf.len()) > 0 {
886 replace_suffix(b, suf.len(), repl);
887 return;
888 }
889 }
890}
891
892const STEP3_RULES: &[(&[u8], &[u8])] = &[
893 (b"icate", b"ic"),
894 (b"ative", b""),
895 (b"alize", b"al"),
896 (b"iciti", b"ic"),
897 (b"ical", b"ic"),
898 (b"ful", b""),
899 (b"ness", b""),
900];
901
902fn step3(b: &mut Vec<u8>) {
903 for (suf, repl) in STEP3_RULES {
904 if ends_with(b, suf) && measure_stem(b, suf.len()) > 0 {
905 replace_suffix(b, suf.len(), repl);
906 return;
907 }
908 }
909}
910
911const STEP4_RULES: &[&[u8]] = &[
912 b"al", b"ance", b"ence", b"er", b"ic", b"able", b"ible", b"ant", b"ement", b"ment", b"ent",
913 b"ou", b"ism", b"ate", b"iti", b"ous", b"ive", b"ize",
914];
915
916fn step4(b: &mut Vec<u8>) {
917 if ends_with(b, b"ion") && measure_stem(b, 3) > 1 {
919 let stem = &b[..b.len() - 3];
920 if matches!(stem.last(), Some(b's') | Some(b't')) {
921 replace_suffix(b, 3, b"");
922 return;
923 }
924 }
925 for suf in STEP4_RULES {
926 if ends_with(b, suf) && measure_stem(b, suf.len()) > 1 {
927 replace_suffix(b, suf.len(), b"");
928 return;
929 }
930 }
931}
932
933fn step5a(b: &mut Vec<u8>) {
934 if ends_with(b, b"e") {
935 let m = measure_stem(b, 1);
936 if m > 1 || (m == 1 && !cvc(&b[..b.len() - 1])) {
937 replace_suffix(b, 1, b"");
938 }
939 }
940}
941
942fn step5b(b: &mut Vec<u8>) {
943 if b.len() >= 2 && b[b.len() - 1] == b'l' && b[b.len() - 2] == b'l' && measure(b) > 1 {
944 b.pop();
945 }
946}
947
948#[cfg(test)]
949mod tests {
950 use super::*;
951
952 #[test]
953 fn porter_simple_cases() {
954 assert_eq!(porter_stem("caresses"), "caress");
955 assert_eq!(porter_stem("ponies"), "poni");
956 assert_eq!(porter_stem("ties"), "ti");
957 assert_eq!(porter_stem("cats"), "cat");
958 assert_eq!(porter_stem("running"), "run");
959 assert_eq!(porter_stem("happy"), "happi");
960 assert_eq!(porter_stem("relational"), "relat");
961 assert_eq!(porter_stem("conditional"), "condit");
962 assert_eq!(porter_stem("hopefulness"), "hope");
963 }
964
965 #[test]
966 fn english_drops_stopwords_and_stems() {
967 let v = to_tsvector(
968 TsConfig::English,
969 "The quick brown foxes are jumping over the lazy dogs",
970 );
971 let words: Vec<&str> = v.iter().map(|l| l.word.as_str()).collect();
972 assert!(words.contains(&"fox"), "expected `fox`, got {words:?}");
976 assert!(words.contains(&"jump"), "expected `jump`, got {words:?}");
977 assert!(words.contains(&"dog"), "expected `dog`, got {words:?}");
978 assert!(!words.contains(&"the"), "stopword `the` leaked: {words:?}");
979 assert!(!words.contains(&"are"), "stopword `are` leaked: {words:?}");
980 }
981
982 #[test]
983 fn simple_preserves_words() {
984 let v = to_tsvector(TsConfig::Simple, "The Quick brown Foxes");
985 let words: Vec<&str> = v.iter().map(|l| l.word.as_str()).collect();
986 assert_eq!(words, alloc::vec!["brown", "foxes", "quick", "the"]);
988 }
989
990 #[test]
991 fn plainto_tsquery_drops_stopwords() {
992 let q = plainto_tsquery(TsConfig::English, "the quick brown fox");
993 let s = crate::eval::format_tsquery(&q);
995 assert_eq!(s, "'quick' & 'brown' & 'fox'");
996 }
997
998 #[test]
999 fn to_tsquery_stems_terms() {
1000 let q = to_tsquery(TsConfig::English, "running & jumps").unwrap();
1001 let s = crate::eval::format_tsquery(&q);
1002 assert_eq!(s, "'run' & 'jump'");
1003 }
1004}