spg_engine/
fts.rs

1//! v7.12.1 — full-text search lexer / stemmer.
2//!
3//! Powers `to_tsvector`, `plainto_tsquery`, `to_tsquery`, and
4//! friends. Two configs are supported in v7.12:
5//!   - `simple` — lowercase + tokenise; no stopwords, no stemming.
6//!   - `english` — lowercase + tokenise + drop PG-standard
7//!     english stopwords + Porter v1 stem.
8//!
9//! Other configs (`spanish`, `german`, `russian`, …) error with
10//! `EvalError::TypeMismatch` carrying the unsupported-config name
11//! so callers see the same shape as `::regtype` rejection.
12//!
13//! Porter stemmer implementation follows the original 1980
14//! Algorithm; corner-case behaviour matches Snowball english v1
15//! (the variant PG also uses).
16
17use alloc::string::{String, ToString};
18use alloc::vec::Vec;
19
20use spg_storage::{TsLexeme, TsQueryAst};
21
22use crate::eval::EvalError;
23
24/// v7.12.1 — supported tokeniser configs.
25#[derive(Debug, Clone, Copy, PartialEq, Eq)]
26pub enum TsConfig {
27    /// `simple` / `pg_catalog.simple` — lowercase + split, no
28    /// stopword drop, no stem.
29    Simple,
30    /// `english` / `pg_catalog.english` — lowercase + split +
31    /// stopword drop + Porter v1 stem.
32    English,
33}
34
35impl TsConfig {
36    /// Resolve a PG text-search config name. The PG-qualified
37    /// form `pg_catalog.<name>` is accepted too. Returns `None`
38    /// for any other name so the caller can produce a clear
39    /// "config not implemented" error listing what is supported.
40    pub fn from_name(name: &str) -> Option<Self> {
41        let bare = name.strip_prefix("pg_catalog.").unwrap_or(name);
42        match bare.to_ascii_lowercase().as_str() {
43            "simple" => Some(Self::Simple),
44            "english" => Some(Self::English),
45            _ => None,
46        }
47    }
48}
49
50/// v7.12.1 — tokenise + (optionally) stem `text` into a sorted +
51/// deduped lexeme set with merged positions. Each token's
52/// position is 1-based and capped at u16::MAX (matching PG's
53/// `MaxTSPosition`).
54pub fn to_tsvector(config: TsConfig, text: &str) -> Vec<TsLexeme> {
55    let mut out: Vec<TsLexeme> = Vec::new();
56    let mut position: u16 = 0;
57    for token in tokenize(text) {
58        let lex = match config {
59            TsConfig::Simple => token,
60            TsConfig::English => {
61                if is_english_stopword(&token) {
62                    // PG drops stopwords from the vector but
63                    // still increments position so phrase
64                    // distances stay meaningful.
65                    position = position.saturating_add(1);
66                    continue;
67                }
68                porter_stem(&token)
69            }
70        };
71        if lex.is_empty() {
72            continue;
73        }
74        position = position.saturating_add(1);
75        match out.binary_search_by(|l| l.word.as_str().cmp(lex.as_str())) {
76            Ok(idx) => {
77                if !out[idx].positions.contains(&position) {
78                    out[idx].positions.push(position);
79                }
80            }
81            Err(idx) => {
82                out.insert(
83                    idx,
84                    TsLexeme {
85                        word: lex,
86                        positions: alloc::vec![position],
87                        weight: 0,
88                    },
89                );
90            }
91        }
92    }
93    out
94}
95
96/// v7.12.1 — `plainto_tsquery(config, text)`: tokenise + stem,
97/// fold the surviving lexemes into an AND tree. Returns
98/// `EvalError::TypeMismatch` only for an unsupported config — an
99/// all-stopwords input becomes an empty `Term("")` so the caller
100/// can detect it.
101pub fn plainto_tsquery(config: TsConfig, text: &str) -> TsQueryAst {
102    let lexs = collect_lexemes(config, text);
103    fold_and(&lexs)
104}
105
106/// v7.12.1 — `phraseto_tsquery(config, text)`: same tokenise + stem,
107/// but preserve order — fold into nested `Phrase(_, _, 1)` nodes.
108pub fn phraseto_tsquery(config: TsConfig, text: &str) -> TsQueryAst {
109    let lexs = collect_lexemes(config, text);
110    fold_phrase(&lexs)
111}
112
113/// v7.12.1 — `websearch_to_tsquery(config, text)`: Google-style
114/// syntax. Quoted phrases → phrase node; `OR` (case-insensitive)
115/// → OR; leading `-` → NOT; otherwise AND.
116///
117/// The grammar is liberal — malformed input degrades to a plain
118/// AND of the bare tokens, matching PG semantics.
119pub fn websearch_to_tsquery(config: TsConfig, text: &str) -> TsQueryAst {
120    let mut tokens = web_tokens(text);
121    // Apply config to each plain term + each phrase part.
122    for t in &mut tokens {
123        match t {
124            WebToken::Term(s) | WebToken::NotTerm(s) => {
125                let lexs = collect_lexemes(config, s);
126                *s = lexs.join(" ");
127            }
128            WebToken::Phrase(words) => {
129                let mut combined = String::new();
130                for w in words.iter() {
131                    if !combined.is_empty() {
132                        combined.push(' ');
133                    }
134                    combined.push_str(w);
135                }
136                let lexs = collect_lexemes(config, &combined);
137                *words = lexs;
138            }
139            WebToken::Or => {}
140        }
141    }
142    // Group by OR boundaries; within each group AND together.
143    let mut or_groups: Vec<Vec<TsQueryAst>> = alloc::vec![Vec::new()];
144    let mut i = 0;
145    while i < tokens.len() {
146        match &tokens[i] {
147            WebToken::Or => {
148                or_groups.push(Vec::new());
149            }
150            WebToken::Term(s) => {
151                if !s.is_empty() {
152                    let node = fold_and(&split_words(s));
153                    or_groups.last_mut().unwrap().push(node);
154                }
155            }
156            WebToken::NotTerm(s) => {
157                if !s.is_empty() {
158                    let node = TsQueryAst::Not(alloc::boxed::Box::new(fold_and(&split_words(s))));
159                    or_groups.last_mut().unwrap().push(node);
160                }
161            }
162            WebToken::Phrase(words) => {
163                if !words.is_empty() {
164                    or_groups.last_mut().unwrap().push(fold_phrase(words));
165                }
166            }
167        }
168        i += 1;
169    }
170    let group_nodes: Vec<TsQueryAst> = or_groups
171        .into_iter()
172        .filter_map(|g| {
173            if g.is_empty() {
174                None
175            } else {
176                let mut it = g.into_iter();
177                let first = it.next().unwrap();
178                Some(it.fold(first, |acc, n| {
179                    TsQueryAst::And(alloc::boxed::Box::new(acc), alloc::boxed::Box::new(n))
180                }))
181            }
182        })
183        .collect();
184    if group_nodes.is_empty() {
185        return TsQueryAst::Term {
186            word: String::new(),
187            weight_mask: 0,
188        };
189    }
190    let mut it = group_nodes.into_iter();
191    let first = it.next().unwrap();
192    it.fold(first, |acc, n| {
193        TsQueryAst::Or(alloc::boxed::Box::new(acc), alloc::boxed::Box::new(n))
194    })
195}
196
197/// v7.12.1 — `to_tsquery(config, text)`: explicit operator syntax
198/// over already-stemmed terms. Reuses the v7.12.0 external-form
199/// parser, then walks each leaf through `porter_stem` (when the
200/// config is `english`). Returns `TypeMismatch` on malformed input.
201pub fn to_tsquery(config: TsConfig, text: &str) -> Result<TsQueryAst, EvalError> {
202    let mut ast = crate::eval::decode_tsquery_external(text)?;
203    stem_tsquery_in_place(&mut ast, config);
204    Ok(ast)
205}
206
207fn stem_tsquery_in_place(ast: &mut TsQueryAst, config: TsConfig) {
208    match ast {
209        TsQueryAst::Term { word, .. } => {
210            let lower = word.to_lowercase();
211            *word = match config {
212                TsConfig::Simple => lower,
213                TsConfig::English => porter_stem(&lower),
214            };
215        }
216        TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
217            stem_tsquery_in_place(a, config);
218            stem_tsquery_in_place(b, config);
219        }
220        TsQueryAst::Not(x) => stem_tsquery_in_place(x, config),
221        TsQueryAst::Phrase { left, right, .. } => {
222            stem_tsquery_in_place(left, config);
223            stem_tsquery_in_place(right, config);
224        }
225    }
226}
227
228fn collect_lexemes(config: TsConfig, text: &str) -> Vec<String> {
229    let mut out: Vec<String> = Vec::new();
230    for token in tokenize(text) {
231        match config {
232            TsConfig::Simple => out.push(token),
233            TsConfig::English => {
234                if is_english_stopword(&token) {
235                    continue;
236                }
237                let stemmed = porter_stem(&token);
238                if !stemmed.is_empty() {
239                    out.push(stemmed);
240                }
241            }
242        }
243    }
244    out
245}
246
247fn split_words(s: &str) -> Vec<String> {
248    s.split_whitespace().map(|w| w.to_string()).collect()
249}
250
251fn fold_and(lexs: &[String]) -> TsQueryAst {
252    if lexs.is_empty() {
253        return TsQueryAst::Term {
254            word: String::new(),
255            weight_mask: 0,
256        };
257    }
258    let mut it = lexs.iter();
259    let first = TsQueryAst::Term {
260        word: it.next().unwrap().clone(),
261        weight_mask: 0,
262    };
263    it.fold(first, |acc, w| {
264        TsQueryAst::And(
265            alloc::boxed::Box::new(acc),
266            alloc::boxed::Box::new(TsQueryAst::Term {
267                word: w.clone(),
268                weight_mask: 0,
269            }),
270        )
271    })
272}
273
274fn fold_phrase(lexs: &[String]) -> TsQueryAst {
275    if lexs.is_empty() {
276        return TsQueryAst::Term {
277            word: String::new(),
278            weight_mask: 0,
279        };
280    }
281    let mut it = lexs.iter();
282    let first = TsQueryAst::Term {
283        word: it.next().unwrap().clone(),
284        weight_mask: 0,
285    };
286    it.fold(first, |acc, w| TsQueryAst::Phrase {
287        left: alloc::boxed::Box::new(acc),
288        right: alloc::boxed::Box::new(TsQueryAst::Term {
289            word: w.clone(),
290            weight_mask: 0,
291        }),
292        distance: 1,
293    })
294}
295
296/// v7.12.2 — evaluate `tsvector @@ tsquery`. Walks the query AST
297/// treating each leaf as "does the vector contain this lexeme".
298/// Phrase semantics: the v7.12.2 implementation honours the
299/// `<N>` distance — both operand terms must appear with their
300/// positions exactly `N` apart in the vector. Higher-arity
301/// phrase chains nest as `Phrase(Phrase(a,b,1), c, 1)`, so the
302/// match recursion folds position sets across the AND of the
303/// chain (a fully general n-gram match in a single pass).
304#[must_use]
305pub fn ts_query_matches(vec: &[TsLexeme], query: &TsQueryAst) -> bool {
306    match query {
307        TsQueryAst::Term { word, .. } => contains_lexeme(vec, word),
308        TsQueryAst::And(a, b) => ts_query_matches(vec, a) && ts_query_matches(vec, b),
309        TsQueryAst::Or(a, b) => ts_query_matches(vec, a) || ts_query_matches(vec, b),
310        TsQueryAst::Not(x) => !ts_query_matches(vec, x),
311        TsQueryAst::Phrase {
312            left,
313            right,
314            distance,
315        } => phrase_match(vec, left, right, *distance),
316    }
317}
318
319fn contains_lexeme(vec: &[TsLexeme], word: &str) -> bool {
320    vec.binary_search_by(|l| l.word.as_str().cmp(word)).is_ok()
321}
322
323/// Phrase positions of a sub-AST. For atomic terms returns the
324/// vector's recorded positions; for nested phrases returns the
325/// rightmost position of each surviving match. Empty positions
326/// mean "no match anywhere".
327fn phrase_positions(vec: &[TsLexeme], q: &TsQueryAst) -> Vec<u16> {
328    match q {
329        TsQueryAst::Term { word, .. } => {
330            match vec.binary_search_by(|l| l.word.as_str().cmp(word)) {
331                Ok(idx) => vec[idx].positions.clone(),
332                Err(_) => Vec::new(),
333            }
334        }
335        TsQueryAst::Phrase {
336            left,
337            right,
338            distance,
339        } => {
340            let lp = phrase_positions(vec, left);
341            let rp = phrase_positions(vec, right);
342            let mut out = Vec::new();
343            for l in &lp {
344                let target = l.saturating_add(*distance);
345                if rp.binary_search(&target).is_ok() {
346                    out.push(target);
347                }
348            }
349            out.sort_unstable();
350            out.dedup();
351            out
352        }
353        // For mixed-shape phrases (Phrase contains an AND/OR/NOT),
354        // fall back to the boolean match (no position tracking).
355        _ => {
356            if ts_query_matches(vec, q) {
357                alloc::vec![u16::MAX]
358            } else {
359                Vec::new()
360            }
361        }
362    }
363}
364
365fn phrase_match(vec: &[TsLexeme], left: &TsQueryAst, right: &TsQueryAst, distance: u16) -> bool {
366    let lp = phrase_positions(vec, left);
367    let rp = phrase_positions(vec, right);
368    lp.iter().any(|l| {
369        let target = l.saturating_add(distance);
370        rp.binary_search(&target).is_ok()
371    })
372}
373
374/// v7.12.2 — `ts_rank(vec, q)` basic form. Score is the sum of
375/// per-matched-lexeme weight factors divided by `1 + log(unique
376/// terms in query)`. Matches PG's `ts_rank` with default
377/// normalisation flag 0.
378#[must_use]
379pub fn ts_rank(vec: &[TsLexeme], query: &TsQueryAst) -> f32 {
380    let mut score = 0.0f32;
381    let mut unique_terms = 0usize;
382    collect_rank_terms(query, vec, &mut score, &mut unique_terms);
383    if unique_terms == 0 {
384        return 0.0;
385    }
386    let denom = 1.0 + ln_approx(unique_terms as f32);
387    score / denom
388}
389
390/// v7.12.2 — `ts_rank_cd(vec, q)` cover-density variant. Higher
391/// score when matched lexemes cluster closer together; defaults
392/// to a per-lexeme contribution divided by the average gap
393/// between matched positions. Returns 0 when no terms match.
394#[must_use]
395pub fn ts_rank_cd(vec: &[TsLexeme], query: &TsQueryAst) -> f32 {
396    let mut matched_positions: Vec<u16> = Vec::new();
397    let mut score = 0.0f32;
398    let mut unique_terms = 0usize;
399    collect_cd_positions(
400        query,
401        vec,
402        &mut matched_positions,
403        &mut score,
404        &mut unique_terms,
405    );
406    if matched_positions.is_empty() || unique_terms == 0 {
407        return 0.0;
408    }
409    matched_positions.sort_unstable();
410    matched_positions.dedup();
411    // Cover density: invert the average distance between
412    // consecutive matched positions.
413    if matched_positions.len() == 1 {
414        return score / (1.0 + ln_approx(unique_terms as f32));
415    }
416    let gaps: u32 = matched_positions
417        .windows(2)
418        .map(|w| u32::from(w[1] - w[0]))
419        .sum();
420    let avg_gap = (gaps as f32) / ((matched_positions.len() - 1) as f32);
421    let density = 1.0 / avg_gap.max(1.0);
422    score * density / (1.0 + ln_approx(unique_terms as f32))
423}
424
425/// `f32::ln` is std-only; spg-engine is no_std. Reuse the bit-
426/// trick decomposition the spg-storage bloom filter uses
427/// (precision ≈ 1e-7, ample for ranking).
428fn ln_approx(x: f32) -> f32 {
429    if x <= 0.0 {
430        return 0.0;
431    }
432    let xd = f64::from(x);
433    let bits = xd.to_bits();
434    let exponent_raw = ((bits >> 52) & 0x7ff) as i64;
435    let exponent = exponent_raw - 1023;
436    let mantissa_bits = (bits & 0x000f_ffff_ffff_ffff) | 0x3ff0_0000_0000_0000;
437    let mantissa = f64::from_bits(mantissa_bits);
438    let t = (mantissa - 1.0) / (mantissa + 1.0);
439    let t2 = t * t;
440    let ln_mantissa = 2.0 * (t + t2 * t / 3.0 + t2 * t2 * t / 5.0 + t2 * t2 * t2 * t / 7.0);
441    let ln = (exponent as f64) * core::f64::consts::LN_2 + ln_mantissa;
442    ln as f32
443}
444
445fn weight_factor(w: u8) -> f32 {
446    // PG default: A=1.0, B=0.4, C=0.2, D=0.1.
447    match w {
448        3 => 1.0,
449        2 => 0.4,
450        1 => 0.2,
451        _ => 0.1,
452    }
453}
454
455fn collect_rank_terms(query: &TsQueryAst, vec: &[TsLexeme], score: &mut f32, n: &mut usize) {
456    match query {
457        TsQueryAst::Term { word, .. } => {
458            *n += 1;
459            if let Ok(idx) = vec.binary_search_by(|l| l.word.as_str().cmp(word.as_str())) {
460                let w = vec[idx].weight;
461                let occurrences = vec[idx].positions.len().max(1) as f32;
462                *score += weight_factor(w) * occurrences;
463            }
464        }
465        TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
466            collect_rank_terms(a, vec, score, n);
467            collect_rank_terms(b, vec, score, n);
468        }
469        TsQueryAst::Not(_) => {
470            // NOT-side terms don't contribute to ts_rank.
471        }
472        TsQueryAst::Phrase { left, right, .. } => {
473            collect_rank_terms(left, vec, score, n);
474            collect_rank_terms(right, vec, score, n);
475        }
476    }
477}
478
479fn collect_cd_positions(
480    query: &TsQueryAst,
481    vec: &[TsLexeme],
482    positions: &mut Vec<u16>,
483    score: &mut f32,
484    n: &mut usize,
485) {
486    match query {
487        TsQueryAst::Term { word, .. } => {
488            *n += 1;
489            if let Ok(idx) = vec.binary_search_by(|l| l.word.as_str().cmp(word.as_str())) {
490                positions.extend_from_slice(&vec[idx].positions);
491                let w = vec[idx].weight;
492                let occurrences = vec[idx].positions.len().max(1) as f32;
493                *score += weight_factor(w) * occurrences;
494            }
495        }
496        TsQueryAst::And(a, b) | TsQueryAst::Or(a, b) => {
497            collect_cd_positions(a, vec, positions, score, n);
498            collect_cd_positions(b, vec, positions, score, n);
499        }
500        TsQueryAst::Not(_) => {}
501        TsQueryAst::Phrase { left, right, .. } => {
502            collect_cd_positions(left, vec, positions, score, n);
503            collect_cd_positions(right, vec, positions, score, n);
504        }
505    }
506}
507
508/// Tokenise on Unicode word boundaries — anything that is not an
509/// alphanumeric scalar value (or `_`) splits the token. Lowercases
510/// each emitted token.
511pub fn tokenize(text: &str) -> Vec<String> {
512    let mut out = Vec::new();
513    let mut cur = String::new();
514    for c in text.chars() {
515        if c.is_alphanumeric() || c == '_' {
516            for lc in c.to_lowercase() {
517                cur.push(lc);
518            }
519        } else if !cur.is_empty() {
520            out.push(core::mem::take(&mut cur));
521        }
522    }
523    if !cur.is_empty() {
524        out.push(cur);
525    }
526    out
527}
528
529enum WebToken {
530    Term(String),
531    NotTerm(String),
532    Phrase(Vec<String>),
533    Or,
534}
535
536/// websearch tokenizer — splits on whitespace, recognises quoted
537/// phrases, leading `-` for NOT, and bare `OR` (case-insensitive).
538fn web_tokens(text: &str) -> Vec<WebToken> {
539    let mut out = Vec::new();
540    let bytes = text.as_bytes();
541    let mut i = 0;
542    while i < bytes.len() {
543        let b = bytes[i];
544        if b.is_ascii_whitespace() {
545            i += 1;
546            continue;
547        }
548        if b == b'"' {
549            i += 1;
550            let start = i;
551            while i < bytes.len() && bytes[i] != b'"' {
552                i += 1;
553            }
554            let phrase_text = &text[start..i];
555            let words: Vec<String> = phrase_text
556                .split_whitespace()
557                .map(|w| w.to_string())
558                .collect();
559            out.push(WebToken::Phrase(words));
560            if i < bytes.len() {
561                i += 1; // close quote
562            }
563            continue;
564        }
565        let negate = b == b'-';
566        if negate {
567            i += 1;
568        }
569        let start = i;
570        while i < bytes.len() && !bytes[i].is_ascii_whitespace() && bytes[i] != b'"' {
571            i += 1;
572        }
573        let word = &text[start..i];
574        if word.eq_ignore_ascii_case("or") && !negate {
575            out.push(WebToken::Or);
576        } else if negate {
577            out.push(WebToken::NotTerm(word.to_string()));
578        } else {
579            out.push(WebToken::Term(word.to_string()));
580        }
581    }
582    out
583}
584
585/// PG's standard english stopword list (`tsearch_data/english.stop`).
586/// Subset of the 127 words in PG 17's distribution — verbatim.
587pub fn is_english_stopword(word: &str) -> bool {
588    matches!(
589        word,
590        "i" | "me"
591            | "my"
592            | "myself"
593            | "we"
594            | "our"
595            | "ours"
596            | "ourselves"
597            | "you"
598            | "your"
599            | "yours"
600            | "yourself"
601            | "yourselves"
602            | "he"
603            | "him"
604            | "his"
605            | "himself"
606            | "she"
607            | "her"
608            | "hers"
609            | "herself"
610            | "it"
611            | "its"
612            | "itself"
613            | "they"
614            | "them"
615            | "their"
616            | "theirs"
617            | "themselves"
618            | "what"
619            | "which"
620            | "who"
621            | "whom"
622            | "this"
623            | "that"
624            | "these"
625            | "those"
626            | "am"
627            | "is"
628            | "are"
629            | "was"
630            | "were"
631            | "be"
632            | "been"
633            | "being"
634            | "have"
635            | "has"
636            | "had"
637            | "having"
638            | "do"
639            | "does"
640            | "did"
641            | "doing"
642            | "a"
643            | "an"
644            | "the"
645            | "and"
646            | "but"
647            | "if"
648            | "or"
649            | "because"
650            | "as"
651            | "until"
652            | "while"
653            | "of"
654            | "at"
655            | "by"
656            | "for"
657            | "with"
658            | "about"
659            | "against"
660            | "between"
661            | "into"
662            | "through"
663            | "during"
664            | "before"
665            | "after"
666            | "above"
667            | "below"
668            | "to"
669            | "from"
670            | "up"
671            | "down"
672            | "in"
673            | "out"
674            | "on"
675            | "off"
676            | "over"
677            | "under"
678            | "again"
679            | "further"
680            | "then"
681            | "once"
682            | "here"
683            | "there"
684            | "when"
685            | "where"
686            | "why"
687            | "how"
688            | "all"
689            | "any"
690            | "both"
691            | "each"
692            | "few"
693            | "more"
694            | "most"
695            | "other"
696            | "some"
697            | "such"
698            | "no"
699            | "nor"
700            | "not"
701            | "only"
702            | "own"
703            | "same"
704            | "so"
705            | "than"
706            | "too"
707            | "very"
708            | "s"
709            | "t"
710            | "can"
711            | "will"
712            | "just"
713            | "don"
714            | "should"
715            | "now"
716    )
717}
718
719// --------------------------------------------------------------
720// Porter stemmer (English, original 1980 algorithm). Operates on
721// pure ASCII — non-ASCII input falls through unchanged.
722// --------------------------------------------------------------
723
724/// v7.12.1 — Porter v1 stem. Lowercased ASCII input gives a
725/// stemmed form; non-ASCII characters bypass the algorithm
726/// (returned verbatim).
727pub fn porter_stem(word: &str) -> String {
728    if !word.is_ascii() {
729        return word.to_string();
730    }
731    let bytes: Vec<u8> = word.bytes().collect();
732    if bytes.len() <= 2 {
733        return word.to_string();
734    }
735    let mut b = bytes;
736    step1a(&mut b);
737    step1b(&mut b);
738    step1c(&mut b);
739    step2(&mut b);
740    step3(&mut b);
741    step4(&mut b);
742    step5a(&mut b);
743    step5b(&mut b);
744    // Safe: we only ever produced ASCII via the steps above.
745    String::from_utf8(b).expect("porter stem produced non-UTF8 bytes")
746}
747
748fn is_vowel(b: &[u8], i: usize) -> bool {
749    match b[i] {
750        b'a' | b'e' | b'i' | b'o' | b'u' => true,
751        b'y' => i > 0 && !is_vowel(b, i - 1),
752        _ => false,
753    }
754}
755
756/// Porter's `m` measure — the number of `[C](VC)^m[V]` units.
757fn measure(b: &[u8]) -> usize {
758    let mut m = 0;
759    let mut prev_vowel = false;
760    let mut started = false;
761    for i in 0..b.len() {
762        let v = is_vowel(b, i);
763        if started && prev_vowel && !v {
764            m += 1;
765        }
766        prev_vowel = v;
767        started = true;
768    }
769    m
770}
771
772fn has_vowel(b: &[u8]) -> bool {
773    (0..b.len()).any(|i| is_vowel(b, i))
774}
775
776fn ends_with(b: &[u8], suf: &[u8]) -> bool {
777    b.len() >= suf.len() && &b[b.len() - suf.len()..] == suf
778}
779
780fn replace_suffix(b: &mut Vec<u8>, suf_len: usize, new_suf: &[u8]) {
781    let new_len = b.len() - suf_len;
782    b.truncate(new_len);
783    b.extend_from_slice(new_suf);
784}
785
786fn measure_stem(b: &[u8], suf_len: usize) -> usize {
787    measure(&b[..b.len() - suf_len])
788}
789
790fn step1a(b: &mut Vec<u8>) {
791    if ends_with(b, b"sses") {
792        replace_suffix(b, 4, b"ss");
793    } else if ends_with(b, b"ies") {
794        replace_suffix(b, 3, b"i");
795    } else if ends_with(b, b"ss") {
796        // No change.
797    } else if ends_with(b, b"s") {
798        replace_suffix(b, 1, b"");
799    }
800}
801
802fn step1b_post(b: &mut Vec<u8>) {
803    if ends_with(b, b"at") {
804        replace_suffix(b, 2, b"ate");
805    } else if ends_with(b, b"bl") {
806        replace_suffix(b, 2, b"ble");
807    } else if ends_with(b, b"iz") {
808        replace_suffix(b, 2, b"ize");
809    } else if b.len() >= 2 && b[b.len() - 1] == b[b.len() - 2] {
810        let last = b[b.len() - 1];
811        if !matches!(last, b'l' | b's' | b'z') {
812            b.pop();
813        }
814    } else if cvc(b) {
815        b.extend_from_slice(b"e");
816    }
817}
818
819fn cvc(b: &[u8]) -> bool {
820    if b.len() < 3 {
821        return false;
822    }
823    let l = b.len();
824    if !(is_vowel(b, l - 2) && !is_vowel(b, l - 3) && !is_vowel(b, l - 1)) {
825        return false;
826    }
827    !matches!(b[l - 1], b'w' | b'x' | b'y')
828}
829
830fn step1b(b: &mut Vec<u8>) {
831    if ends_with(b, b"eed") {
832        if measure_stem(b, 3) > 0 {
833            replace_suffix(b, 3, b"ee");
834        }
835        return;
836    }
837    if ends_with(b, b"ed") {
838        let stem_has_vowel = has_vowel(&b[..b.len() - 2]);
839        if stem_has_vowel {
840            replace_suffix(b, 2, b"");
841            step1b_post(b);
842        }
843        return;
844    }
845    if ends_with(b, b"ing") {
846        let stem_has_vowel = has_vowel(&b[..b.len() - 3]);
847        if stem_has_vowel {
848            replace_suffix(b, 3, b"");
849            step1b_post(b);
850        }
851    }
852}
853
854fn step1c(b: &mut Vec<u8>) {
855    if ends_with(b, b"y") && has_vowel(&b[..b.len() - 1]) {
856        replace_suffix(b, 1, b"i");
857    }
858}
859
860const STEP2_RULES: &[(&[u8], &[u8])] = &[
861    (b"ational", b"ate"),
862    (b"tional", b"tion"),
863    (b"enci", b"ence"),
864    (b"anci", b"ance"),
865    (b"izer", b"ize"),
866    (b"abli", b"able"),
867    (b"alli", b"al"),
868    (b"entli", b"ent"),
869    (b"eli", b"e"),
870    (b"ousli", b"ous"),
871    (b"ization", b"ize"),
872    (b"ation", b"ate"),
873    (b"ator", b"ate"),
874    (b"alism", b"al"),
875    (b"iveness", b"ive"),
876    (b"fulness", b"ful"),
877    (b"ousness", b"ous"),
878    (b"aliti", b"al"),
879    (b"iviti", b"ive"),
880    (b"biliti", b"ble"),
881];
882
883fn step2(b: &mut Vec<u8>) {
884    for (suf, repl) in STEP2_RULES {
885        if ends_with(b, suf) && measure_stem(b, suf.len()) > 0 {
886            replace_suffix(b, suf.len(), repl);
887            return;
888        }
889    }
890}
891
892const STEP3_RULES: &[(&[u8], &[u8])] = &[
893    (b"icate", b"ic"),
894    (b"ative", b""),
895    (b"alize", b"al"),
896    (b"iciti", b"ic"),
897    (b"ical", b"ic"),
898    (b"ful", b""),
899    (b"ness", b""),
900];
901
902fn step3(b: &mut Vec<u8>) {
903    for (suf, repl) in STEP3_RULES {
904        if ends_with(b, suf) && measure_stem(b, suf.len()) > 0 {
905            replace_suffix(b, suf.len(), repl);
906            return;
907        }
908    }
909}
910
911const STEP4_RULES: &[&[u8]] = &[
912    b"al", b"ance", b"ence", b"er", b"ic", b"able", b"ible", b"ant", b"ement", b"ment", b"ent",
913    b"ou", b"ism", b"ate", b"iti", b"ous", b"ive", b"ize",
914];
915
916fn step4(b: &mut Vec<u8>) {
917    // Special-case `ion` — only strip when preceded by s/t.
918    if ends_with(b, b"ion") && measure_stem(b, 3) > 1 {
919        let stem = &b[..b.len() - 3];
920        if matches!(stem.last(), Some(b's') | Some(b't')) {
921            replace_suffix(b, 3, b"");
922            return;
923        }
924    }
925    for suf in STEP4_RULES {
926        if ends_with(b, suf) && measure_stem(b, suf.len()) > 1 {
927            replace_suffix(b, suf.len(), b"");
928            return;
929        }
930    }
931}
932
933fn step5a(b: &mut Vec<u8>) {
934    if ends_with(b, b"e") {
935        let m = measure_stem(b, 1);
936        if m > 1 || (m == 1 && !cvc(&b[..b.len() - 1])) {
937            replace_suffix(b, 1, b"");
938        }
939    }
940}
941
942fn step5b(b: &mut Vec<u8>) {
943    if b.len() >= 2 && b[b.len() - 1] == b'l' && b[b.len() - 2] == b'l' && measure(b) > 1 {
944        b.pop();
945    }
946}
947
948#[cfg(test)]
949mod tests {
950    use super::*;
951
952    #[test]
953    fn porter_simple_cases() {
954        assert_eq!(porter_stem("caresses"), "caress");
955        assert_eq!(porter_stem("ponies"), "poni");
956        assert_eq!(porter_stem("ties"), "ti");
957        assert_eq!(porter_stem("cats"), "cat");
958        assert_eq!(porter_stem("running"), "run");
959        assert_eq!(porter_stem("happy"), "happi");
960        assert_eq!(porter_stem("relational"), "relat");
961        assert_eq!(porter_stem("conditional"), "condit");
962        assert_eq!(porter_stem("hopefulness"), "hope");
963    }
964
965    #[test]
966    fn english_drops_stopwords_and_stems() {
967        let v = to_tsvector(
968            TsConfig::English,
969            "The quick brown foxes are jumping over the lazy dogs",
970        );
971        let words: Vec<&str> = v.iter().map(|l| l.word.as_str()).collect();
972        // Stopwords removed: the, are, over
973        // Stems: quick → quick, brown → brown, foxes → fox,
974        // jumping → jump, lazy → lazi, dogs → dog.
975        assert!(words.contains(&"fox"), "expected `fox`, got {words:?}");
976        assert!(words.contains(&"jump"), "expected `jump`, got {words:?}");
977        assert!(words.contains(&"dog"), "expected `dog`, got {words:?}");
978        assert!(!words.contains(&"the"), "stopword `the` leaked: {words:?}");
979        assert!(!words.contains(&"are"), "stopword `are` leaked: {words:?}");
980    }
981
982    #[test]
983    fn simple_preserves_words() {
984        let v = to_tsvector(TsConfig::Simple, "The Quick brown Foxes");
985        let words: Vec<&str> = v.iter().map(|l| l.word.as_str()).collect();
986        // Sorted ascending.
987        assert_eq!(words, alloc::vec!["brown", "foxes", "quick", "the"]);
988    }
989
990    #[test]
991    fn plainto_tsquery_drops_stopwords() {
992        let q = plainto_tsquery(TsConfig::English, "the quick brown fox");
993        // Expect (quick & brown) & fox after stopword drop.
994        let s = crate::eval::format_tsquery(&q);
995        assert_eq!(s, "'quick' & 'brown' & 'fox'");
996    }
997
998    #[test]
999    fn to_tsquery_stems_terms() {
1000        let q = to_tsquery(TsConfig::English, "running & jumps").unwrap();
1001        let s = crate::eval::format_tsquery(&q);
1002        assert_eq!(s, "'run' & 'jump'");
1003    }
1004}
spg_engine/fts.rs

spg_engine/
fts.rs