Skip to main content

citadel_sql/
fts.rs

1//! Full-text search: tsvector/tsquery binary codec + evaluator.
2//!
3//! Wire format is canonical: identical content produces byte-identical bytes,
4//! enabling hash + equality + B-tree ordering without re-parsing.
5
6use crate::error::{Result, SqlError};
7use std::sync::Arc;
8
/// Highest token position a tsvector can record; also the mask that selects
/// the position bits (the low 14) of a packed `u16`.
pub const MAX_POSITION: u16 = (1 << 14) - 1;
/// Maximum number of positions kept for a single lexeme.
pub const MAX_POSITIONS_PER_LEXEME: u16 = 0xFF;

/// Bit in the tsvector header flags byte: set when at least one position was
/// rejected while the vector was being built.
pub const TSV_FLAG_POSITION_OVERFLOW: u8 = 1 << 0;
13
/// Relevance class attached to each stored position.
///
/// The discriminant is the 2-bit value kept in the top two bits of a packed
/// position, so `D` is 0 and `A` is 3.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Weight {
    D = 0,
    C = 1,
    B = 2,
    A = 3,
}

impl Weight {
    /// Returns the weight shifted into bits 14..=15, ready to be OR-ed with a
    /// position.
    pub fn as_bits(self) -> u16 {
        let raw = self as u16;
        raw << 14
    }

    /// Reads the weight from bits 14..=15 of `packed`; the lower bits are
    /// ignored.
    pub fn from_bits(packed: u16) -> Self {
        // Indexed by discriminant; `packed >> 14` is always in 0..=3.
        const BY_TAG: [Weight; 4] = [Weight::D, Weight::C, Weight::B, Weight::A];
        BY_TAG[usize::from(packed >> 14)]
    }

    /// Single-letter name of the weight (`'A'`..`'D'`).
    pub fn label(self) -> char {
        // Same discriminant-indexed layout as `from_bits`.
        char::from(b"DCBA"[self as usize])
    }
}
45
46#[inline]
47pub fn pack_position(pos: u16, weight: Weight) -> u16 {
48    weight.as_bits() | (pos & MAX_POSITION)
49}
50
51#[inline]
52pub fn unpack_position(packed: u16) -> (u16, Weight) {
53    (packed & MAX_POSITION, Weight::from_bits(packed))
54}
55
/// A lexeme together with the packed positions at which it occurs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexemePos {
    /// Raw lexeme bytes.
    pub lexeme: Vec<u8>,
    /// Packed (pos|weight) values; sorted ascending.
    pub positions: Vec<u16>,
}
61
62#[derive(Debug, Clone)]
63pub struct TsVectorBuilder {
64    by_lex: std::collections::BTreeMap<Vec<u8>, Vec<u16>>,
65    overflowed: bool,
66}
67
68impl Default for TsVectorBuilder {
69    fn default() -> Self {
70        Self::new()
71    }
72}
73
74impl TsVectorBuilder {
75    pub fn new() -> Self {
76        Self {
77            by_lex: std::collections::BTreeMap::new(),
78            overflowed: false,
79        }
80    }
81
82    pub fn push(&mut self, lexeme: &[u8], position: u16, weight: Weight) {
83        if position == 0 || position > MAX_POSITION {
84            self.overflowed = true;
85            return;
86        }
87        let entry = self.by_lex.entry(lexeme.to_vec()).or_default();
88        if entry.len() >= MAX_POSITIONS_PER_LEXEME as usize {
89            return;
90        }
91        let packed = pack_position(position, weight);
92        let key = (position, weight as u8);
93        let insert_at = entry
94            .binary_search_by(|p| {
95                let (pp, pw) = unpack_position(*p);
96                (pp, pw as u8).cmp(&key)
97            })
98            .unwrap_or_else(|e| e);
99        if insert_at < entry.len() {
100            let (ep, ew) = unpack_position(entry[insert_at]);
101            if ep == position && ew == weight {
102                return;
103            }
104        }
105        entry.insert(insert_at, packed);
106    }
107
108    pub fn push_no_position(&mut self, lexeme: &[u8]) {
109        self.by_lex.entry(lexeme.to_vec()).or_default();
110    }
111
112    pub fn build(self) -> Arc<[u8]> {
113        let mut buf = Vec::with_capacity(8 + self.by_lex.len() * 16);
114        let flags = if self.overflowed {
115            TSV_FLAG_POSITION_OVERFLOW
116        } else {
117            0
118        };
119        buf.push(flags);
120        buf.extend_from_slice(&(self.by_lex.len() as u32).to_le_bytes());
121        for (lex, positions) in self.by_lex {
122            buf.extend_from_slice(&(lex.len() as u16).to_le_bytes());
123            buf.extend_from_slice(&lex);
124            buf.extend_from_slice(&(positions.len() as u16).to_le_bytes());
125            for p in positions {
126                buf.extend_from_slice(&p.to_le_bytes());
127            }
128        }
129        Arc::from(buf)
130    }
131}
132
133pub struct TsVectorReader<'a> {
134    bytes: &'a [u8],
135    pos: usize,
136    remaining: u32,
137}
138
139impl<'a> TsVectorReader<'a> {
140    pub fn open(bytes: &'a [u8]) -> Result<(u8, Self)> {
141        if bytes.is_empty() {
142            return Err(SqlError::InvalidValue("empty tsvector".into()));
143        }
144        let flags = bytes[0];
145        if bytes.len() < 5 {
146            return Err(SqlError::InvalidValue("truncated tsvector header".into()));
147        }
148        let count = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
149        Ok((
150            flags,
151            Self {
152                bytes,
153                pos: 5,
154                remaining: count,
155            },
156        ))
157    }
158}
159
160impl<'a> Iterator for TsVectorReader<'a> {
161    type Item = Result<(&'a [u8], Vec<u16>)>;
162
163    fn next(&mut self) -> Option<Self::Item> {
164        if self.remaining == 0 {
165            return None;
166        }
167        self.remaining -= 1;
168        if self.pos + 2 > self.bytes.len() {
169            return Some(Err(SqlError::InvalidValue("truncated tsvector lex".into())));
170        }
171        let lex_len = u16::from_le_bytes([self.bytes[self.pos], self.bytes[self.pos + 1]]) as usize;
172        self.pos += 2;
173        if self.pos + lex_len > self.bytes.len() {
174            return Some(Err(SqlError::InvalidValue("truncated lex bytes".into())));
175        }
176        let lex = &self.bytes[self.pos..self.pos + lex_len];
177        self.pos += lex_len;
178        if self.pos + 2 > self.bytes.len() {
179            return Some(Err(SqlError::InvalidValue("truncated pos count".into())));
180        }
181        let pc = u16::from_le_bytes([self.bytes[self.pos], self.bytes[self.pos + 1]]) as usize;
182        self.pos += 2;
183        if self.pos + pc * 2 > self.bytes.len() {
184            return Some(Err(SqlError::InvalidValue("truncated positions".into())));
185        }
186        let mut positions = Vec::with_capacity(pc);
187        for _ in 0..pc {
188            positions.push(u16::from_le_bytes([
189                self.bytes[self.pos],
190                self.bytes[self.pos + 1],
191            ]));
192            self.pos += 2;
193        }
194        Some(Ok((lex, positions)))
195    }
196}
197
198pub fn tsvector_overflowed(bytes: &[u8]) -> bool {
199    !bytes.is_empty() && bytes[0] & TSV_FLAG_POSITION_OVERFLOW != 0
200}
201
202pub fn tsvector_display(bytes: &[u8]) -> String {
203    let (_flags, reader) = match TsVectorReader::open(bytes) {
204        Ok(v) => v,
205        Err(_) => return "<invalid tsvector>".into(),
206    };
207    let mut out = String::new();
208    let mut first = true;
209    for item in reader {
210        let (lex, positions) = match item {
211            Ok(v) => v,
212            Err(_) => return "<invalid tsvector>".into(),
213        };
214        if !first {
215            out.push(' ');
216        }
217        first = false;
218        out.push('\'');
219        out.push_str(&String::from_utf8_lossy(lex));
220        out.push('\'');
221        if !positions.is_empty() {
222            out.push(':');
223            for (i, packed) in positions.iter().enumerate() {
224                if i > 0 {
225                    out.push(',');
226                }
227                let (p, w) = unpack_position(*packed);
228                out.push_str(&p.to_string());
229                if w != Weight::D {
230                    out.push(w.label());
231                }
232            }
233        }
234    }
235    out
236}
237
/// Parsed tsquery expression tree.
///
/// Serialised in preorder; every node starts with a one-byte tag:
/// ```text
/// [u8 tag]
///   0 Lexeme [u16 len][bytes][u8 weight_mask][u8 flags(prefix=1)]
///   1 And    [child][child]
///   2 Or     [child][child]
///   3 Not    [child]
///   4 Phrase [u16 distance][child][child]
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TsQueryAst {
    /// A single search term.
    Lexeme {
        /// Raw lexeme bytes.
        lexeme: Vec<u8>,
        /// Accepted weights as a bit set (A=8, B=4, C=2, D=1); 0 means no
        /// weight restriction.
        weight_mask: u8,
        /// When set, the term matches any lexeme that starts with `lexeme`.
        prefix: bool,
    },
    /// Both operands must match.
    And(Box<TsQueryAst>, Box<TsQueryAst>),
    /// At least one operand must match.
    Or(Box<TsQueryAst>, Box<TsQueryAst>),
    /// The operand must not match.
    Not(Box<TsQueryAst>),
    /// `right` must occur exactly `distance` positions after `left`.
    Phrase {
        distance: u16,
        left: Box<TsQueryAst>,
        right: Box<TsQueryAst>,
    },
}
263
/// Wire tag of a `Lexeme` node.
pub const TSQ_TAG_LEXEME: u8 = 0;
/// Wire tag of an `And` node.
pub const TSQ_TAG_AND: u8 = 1;
/// Wire tag of an `Or` node.
pub const TSQ_TAG_OR: u8 = 2;
/// Wire tag of a `Not` node.
pub const TSQ_TAG_NOT: u8 = 3;
/// Wire tag of a `Phrase` node.
pub const TSQ_TAG_PHRASE: u8 = 4;

/// Bit in a lexeme node's flags byte that marks a prefix match.
pub const TSQ_FLAG_PREFIX: u8 = 1 << 0;
271
272impl TsQueryAst {
273    pub fn encode(&self) -> Arc<[u8]> {
274        let mut buf = Vec::new();
275        self.encode_into(&mut buf);
276        Arc::from(buf)
277    }
278
279    fn encode_into(&self, buf: &mut Vec<u8>) {
280        match self {
281            TsQueryAst::Lexeme {
282                lexeme,
283                weight_mask,
284                prefix,
285            } => {
286                buf.push(TSQ_TAG_LEXEME);
287                buf.extend_from_slice(&(lexeme.len() as u16).to_le_bytes());
288                buf.extend_from_slice(lexeme);
289                buf.push(*weight_mask);
290                buf.push(if *prefix { TSQ_FLAG_PREFIX } else { 0 });
291            }
292            TsQueryAst::And(l, r) => {
293                buf.push(TSQ_TAG_AND);
294                l.encode_into(buf);
295                r.encode_into(buf);
296            }
297            TsQueryAst::Or(l, r) => {
298                buf.push(TSQ_TAG_OR);
299                l.encode_into(buf);
300                r.encode_into(buf);
301            }
302            TsQueryAst::Not(c) => {
303                buf.push(TSQ_TAG_NOT);
304                c.encode_into(buf);
305            }
306            TsQueryAst::Phrase {
307                distance,
308                left,
309                right,
310            } => {
311                buf.push(TSQ_TAG_PHRASE);
312                buf.extend_from_slice(&distance.to_le_bytes());
313                left.encode_into(buf);
314                right.encode_into(buf);
315            }
316        }
317    }
318
319    pub fn decode(bytes: &[u8]) -> Result<Self> {
320        let mut cursor = 0;
321        let ast = Self::decode_at(bytes, &mut cursor)?;
322        if cursor != bytes.len() {
323            return Err(SqlError::InvalidValue("trailing tsquery bytes".into()));
324        }
325        Ok(ast)
326    }
327
328    fn decode_at(bytes: &[u8], cursor: &mut usize) -> Result<Self> {
329        if *cursor >= bytes.len() {
330            return Err(SqlError::InvalidValue("truncated tsquery".into()));
331        }
332        let tag = bytes[*cursor];
333        *cursor += 1;
334        match tag {
335            TSQ_TAG_LEXEME => {
336                if *cursor + 2 > bytes.len() {
337                    return Err(SqlError::InvalidValue("truncated tsquery lex".into()));
338                }
339                let len = u16::from_le_bytes([bytes[*cursor], bytes[*cursor + 1]]) as usize;
340                *cursor += 2;
341                if *cursor + len + 2 > bytes.len() {
342                    return Err(SqlError::InvalidValue("truncated tsquery lex body".into()));
343                }
344                let lexeme = bytes[*cursor..*cursor + len].to_vec();
345                *cursor += len;
346                let weight_mask = bytes[*cursor];
347                let flags = bytes[*cursor + 1];
348                *cursor += 2;
349                Ok(TsQueryAst::Lexeme {
350                    lexeme,
351                    weight_mask,
352                    prefix: flags & TSQ_FLAG_PREFIX != 0,
353                })
354            }
355            TSQ_TAG_AND => {
356                let l = Self::decode_at(bytes, cursor)?;
357                let r = Self::decode_at(bytes, cursor)?;
358                Ok(TsQueryAst::And(Box::new(l), Box::new(r)))
359            }
360            TSQ_TAG_OR => {
361                let l = Self::decode_at(bytes, cursor)?;
362                let r = Self::decode_at(bytes, cursor)?;
363                Ok(TsQueryAst::Or(Box::new(l), Box::new(r)))
364            }
365            TSQ_TAG_NOT => {
366                let c = Self::decode_at(bytes, cursor)?;
367                Ok(TsQueryAst::Not(Box::new(c)))
368            }
369            TSQ_TAG_PHRASE => {
370                if *cursor + 2 > bytes.len() {
371                    return Err(SqlError::InvalidValue("truncated phrase distance".into()));
372                }
373                let distance = u16::from_le_bytes([bytes[*cursor], bytes[*cursor + 1]]);
374                *cursor += 2;
375                let l = Self::decode_at(bytes, cursor)?;
376                let r = Self::decode_at(bytes, cursor)?;
377                Ok(TsQueryAst::Phrase {
378                    distance,
379                    left: Box::new(l),
380                    right: Box::new(r),
381                })
382            }
383            other => Err(SqlError::InvalidValue(format!(
384                "unknown tsquery tag: {other}"
385            ))),
386        }
387    }
388}
389
390pub fn tsquery_display(bytes: &[u8]) -> String {
391    match TsQueryAst::decode(bytes) {
392        Ok(ast) => display_ast(&ast),
393        Err(_) => "<invalid tsquery>".into(),
394    }
395}
396
397fn display_ast(ast: &TsQueryAst) -> String {
398    match ast {
399        TsQueryAst::Lexeme {
400            lexeme,
401            weight_mask,
402            prefix,
403        } => {
404            let mut s = format!("'{}'", String::from_utf8_lossy(lexeme));
405            if *prefix || *weight_mask != 0 {
406                s.push(':');
407                if *prefix {
408                    s.push('*');
409                }
410                for (bit, label) in [(8, 'A'), (4, 'B'), (2, 'C'), (1, 'D')] {
411                    if weight_mask & bit != 0 {
412                        s.push(label);
413                    }
414                }
415            }
416            s
417        }
418        TsQueryAst::And(l, r) => format!("{} & {}", display_ast(l), display_ast(r)),
419        TsQueryAst::Or(l, r) => format!("({} | {})", display_ast(l), display_ast(r)),
420        TsQueryAst::Not(c) => format!("!{}", display_ast(c)),
421        TsQueryAst::Phrase {
422            distance,
423            left,
424            right,
425        } => {
426            if *distance == 1 {
427                format!("{} <-> {}", display_ast(left), display_ast(right))
428            } else {
429                format!(
430                    "{} <{}> {}",
431                    display_ast(left),
432                    distance,
433                    display_ast(right)
434                )
435            }
436        }
437    }
438}
439
440pub fn parse_tsquery(input: &str) -> Result<TsQueryAst> {
441    let mut p = TsQueryParser::new(input);
442    let ast = p.parse_or()?;
443    p.skip_ws();
444    if p.cursor < p.input.len() {
445        return Err(SqlError::InvalidValue(format!(
446            "unexpected trailing input in tsquery: {}",
447            &p.input[p.cursor..]
448        )));
449    }
450    Ok(ast)
451}
452
453struct TsQueryParser<'a> {
454    input: &'a str,
455    cursor: usize,
456}
457
458impl<'a> TsQueryParser<'a> {
459    fn new(input: &'a str) -> Self {
460        Self { input, cursor: 0 }
461    }
462
463    fn skip_ws(&mut self) {
464        let bytes = self.input.as_bytes();
465        while self.cursor < bytes.len() && bytes[self.cursor].is_ascii_whitespace() {
466            self.cursor += 1;
467        }
468    }
469
470    fn peek(&self) -> Option<u8> {
471        self.input.as_bytes().get(self.cursor).copied()
472    }
473
474    fn eat(&mut self, c: u8) -> bool {
475        if self.peek() == Some(c) {
476            self.cursor += 1;
477            true
478        } else {
479            false
480        }
481    }
482
483    fn parse_or(&mut self) -> Result<TsQueryAst> {
484        let mut left = self.parse_and()?;
485        loop {
486            self.skip_ws();
487            if !self.eat(b'|') {
488                break;
489            }
490            let right = self.parse_and()?;
491            left = TsQueryAst::Or(Box::new(left), Box::new(right));
492        }
493        Ok(left)
494    }
495
496    fn parse_and(&mut self) -> Result<TsQueryAst> {
497        let mut left = self.parse_not()?;
498        loop {
499            self.skip_ws();
500            if !self.eat(b'&') {
501                break;
502            }
503            let right = self.parse_not()?;
504            left = TsQueryAst::And(Box::new(left), Box::new(right));
505        }
506        Ok(left)
507    }
508
509    fn parse_not(&mut self) -> Result<TsQueryAst> {
510        self.skip_ws();
511        if self.eat(b'!') {
512            let inner = self.parse_not()?;
513            return Ok(TsQueryAst::Not(Box::new(inner)));
514        }
515        self.parse_phrase()
516    }
517
518    fn parse_phrase(&mut self) -> Result<TsQueryAst> {
519        let mut left = self.parse_atom()?;
520        loop {
521            self.skip_ws();
522            if self.peek() != Some(b'<') {
523                break;
524            }
525            let dist = self.parse_phrase_distance()?;
526            let right = self.parse_atom()?;
527            left = TsQueryAst::Phrase {
528                distance: dist,
529                left: Box::new(left),
530                right: Box::new(right),
531            };
532        }
533        Ok(left)
534    }
535
536    fn parse_phrase_distance(&mut self) -> Result<u16> {
537        if !self.eat(b'<') {
538            return Err(SqlError::InvalidValue("expected '<'".into()));
539        }
540        if self.eat(b'-') {
541            if !self.eat(b'>') {
542                return Err(SqlError::InvalidValue("expected '<->' phrase op".into()));
543            }
544            return Ok(1);
545        }
546        let start = self.cursor;
547        while let Some(c) = self.peek() {
548            if c.is_ascii_digit() {
549                self.cursor += 1;
550            } else {
551                break;
552            }
553        }
554        if start == self.cursor {
555            return Err(SqlError::InvalidValue(
556                "expected distance after '<' in phrase op".into(),
557            ));
558        }
559        let dist_str = &self.input[start..self.cursor];
560        let dist: u16 = dist_str
561            .parse()
562            .map_err(|_| SqlError::InvalidValue(format!("invalid phrase distance: {dist_str}")))?;
563        if !(1..=MAX_POSITION).contains(&dist) {
564            return Err(SqlError::InvalidValue(format!(
565                "phrase distance {dist} out of range 1..={MAX_POSITION}"
566            )));
567        }
568        if !self.eat(b'>') {
569            return Err(SqlError::InvalidValue("expected '>' after distance".into()));
570        }
571        Ok(dist)
572    }
573
574    fn parse_atom(&mut self) -> Result<TsQueryAst> {
575        self.skip_ws();
576        if self.eat(b'(') {
577            let inner = self.parse_or()?;
578            self.skip_ws();
579            if !self.eat(b')') {
580                return Err(SqlError::InvalidValue("missing closing paren".into()));
581            }
582            return Ok(inner);
583        }
584        let lexeme = self.parse_lexeme_word()?;
585        let (weight_mask, prefix) = self.parse_weight_and_prefix()?;
586        Ok(TsQueryAst::Lexeme {
587            lexeme: lexeme.into_bytes(),
588            weight_mask,
589            prefix,
590        })
591    }
592
593    fn parse_lexeme_word(&mut self) -> Result<String> {
594        self.skip_ws();
595        if self.eat(b'\'') {
596            let start = self.cursor;
597            let bytes = self.input.as_bytes();
598            while self.cursor < bytes.len() && bytes[self.cursor] != b'\'' {
599                self.cursor += 1;
600            }
601            if self.cursor >= bytes.len() {
602                return Err(SqlError::InvalidValue(
603                    "unterminated quoted lexeme in tsquery".into(),
604                ));
605            }
606            let word = self.input[start..self.cursor].to_string();
607            self.cursor += 1; // closing quote
608            if word.is_empty() {
609                return Err(SqlError::InvalidValue("empty lexeme in tsquery".into()));
610            }
611            return Ok(word);
612        }
613        let start = self.cursor;
614        for (i, ch) in self.input[self.cursor..].char_indices() {
615            if ch.is_alphanumeric() || ch == '_' {
616                self.cursor = start + i + ch.len_utf8();
617            } else {
618                break;
619            }
620        }
621        if self.cursor == start {
622            return Err(SqlError::InvalidValue(format!(
623                "expected lexeme at: {}",
624                &self.input[self.cursor..]
625            )));
626        }
627        Ok(self.input[start..self.cursor].to_string())
628    }
629
630    fn parse_weight_and_prefix(&mut self) -> Result<(u8, bool)> {
631        if !self.eat(b':') {
632            return Ok((0, false));
633        }
634        let mut prefix = false;
635        let mut mask: u8 = 0;
636        loop {
637            match self.peek() {
638                Some(b'*') => {
639                    prefix = true;
640                    self.cursor += 1;
641                }
642                Some(c) if matches!(c, b'A' | b'B' | b'C' | b'D' | b'a' | b'b' | b'c' | b'd') => {
643                    let bit = match c.to_ascii_uppercase() {
644                        b'A' => 0b1000,
645                        b'B' => 0b0100,
646                        b'C' => 0b0010,
647                        b'D' => 0b0001,
648                        _ => unreachable!(),
649                    };
650                    mask |= bit;
651                    self.cursor += 1;
652                }
653                _ => break,
654            }
655        }
656        Ok((mask, prefix))
657    }
658}
659
660pub fn op_match(tsvector_bytes: &[u8], tsquery_bytes: &[u8]) -> Result<crate::types::Value> {
661    let ast = TsQueryAst::decode(tsquery_bytes)?;
662    let (flags, reader) = TsVectorReader::open(tsvector_bytes)?;
663    let mut entries: Vec<(Vec<u8>, Vec<u16>)> = Vec::new();
664    for item in reader {
665        let (lex, positions) = item?;
666        entries.push((lex.to_vec(), positions));
667    }
668    let overflowed = flags & TSV_FLAG_POSITION_OVERFLOW != 0;
669    let matched = eval_match(&ast, &entries, overflowed)?;
670    Ok(crate::types::Value::Boolean(matched))
671}
672
673fn eval_match(ast: &TsQueryAst, entries: &[(Vec<u8>, Vec<u16>)], overflowed: bool) -> Result<bool> {
674    match ast {
675        TsQueryAst::Lexeme {
676            lexeme,
677            weight_mask,
678            prefix,
679        } => Ok(!collect_lex_positions(entries, lexeme, *weight_mask, *prefix).is_empty()),
680        TsQueryAst::And(l, r) => {
681            Ok(eval_match(l, entries, overflowed)? && eval_match(r, entries, overflowed)?)
682        }
683        TsQueryAst::Or(l, r) => {
684            Ok(eval_match(l, entries, overflowed)? || eval_match(r, entries, overflowed)?)
685        }
686        TsQueryAst::Not(c) => Ok(!eval_match(c, entries, overflowed)?),
687        TsQueryAst::Phrase {
688            distance,
689            left,
690            right,
691        } => {
692            if overflowed {
693                return Err(SqlError::Unsupported(
694                    "tsvector position overflow; phrase queries unreliable".into(),
695                ));
696            }
697            let left_pos = phrase_positions(left, entries)?;
698            let right_pos = phrase_positions(right, entries)?;
699            Ok(positions_at_offset(&left_pos, &right_pos, *distance))
700        }
701    }
702}
703
704fn phrase_positions(ast: &TsQueryAst, entries: &[(Vec<u8>, Vec<u16>)]) -> Result<Vec<u16>> {
705    match ast {
706        TsQueryAst::Lexeme {
707            lexeme,
708            weight_mask,
709            prefix,
710        } => Ok(collect_lex_positions(
711            entries,
712            lexeme,
713            *weight_mask,
714            *prefix,
715        )),
716        TsQueryAst::Phrase {
717            distance,
718            left,
719            right,
720        } => {
721            let lp = phrase_positions(left, entries)?;
722            let rp = phrase_positions(right, entries)?;
723            Ok(positions_pairing_right(&lp, &rp, *distance))
724        }
725        _ => Err(SqlError::Unsupported(
726            "tsquery: AND/OR/NOT inside phrase operator not supported".into(),
727        )),
728    }
729}
730
731fn positions_at_offset(left: &[u16], right: &[u16], distance: u16) -> bool {
732    if left.is_empty() || right.is_empty() {
733        return false;
734    }
735    let mut i = 0;
736    let mut j = 0;
737    while i < left.len() && j < right.len() {
738        let lp = left[i] & MAX_POSITION;
739        let rp = right[j] & MAX_POSITION;
740        if rp == lp.saturating_add(distance) {
741            return true;
742        }
743        if rp < lp + distance {
744            j += 1;
745        } else {
746            i += 1;
747        }
748    }
749    false
750}
751
752fn positions_pairing_right(left: &[u16], right: &[u16], distance: u16) -> Vec<u16> {
753    let mut out = Vec::new();
754    let mut i = 0;
755    let mut j = 0;
756    while i < left.len() && j < right.len() {
757        let lp = left[i] & MAX_POSITION;
758        let rp = right[j] & MAX_POSITION;
759        let target = lp.saturating_add(distance);
760        if rp == target {
761            if out.last().copied() != Some(right[j]) {
762                out.push(right[j]);
763            }
764            j += 1;
765        } else if rp < target {
766            j += 1;
767        } else {
768            i += 1;
769        }
770    }
771    out
772}
773
774fn collect_lex_positions(
775    entries: &[(Vec<u8>, Vec<u16>)],
776    query_lex: &[u8],
777    weight_mask: u8,
778    prefix: bool,
779) -> Vec<u16> {
780    let mut out: Vec<u16> = Vec::new();
781    let weight_to_bit = |w: Weight| -> u8 {
782        match w {
783            Weight::A => 0b1000,
784            Weight::B => 0b0100,
785            Weight::C => 0b0010,
786            Weight::D => 0b0001,
787        }
788    };
789    let collect_from = |positions: &[u16], out: &mut Vec<u16>| {
790        for &p in positions {
791            if weight_mask != 0 {
792                let (_pos, w) = unpack_position(p);
793                if weight_to_bit(w) & weight_mask == 0 {
794                    continue;
795                }
796            }
797            out.push(p);
798        }
799    };
800    if prefix {
801        let start = entries.partition_point(|(lex, _)| lex.as_slice() < query_lex);
802        for (lex, positions) in entries[start..].iter() {
803            if !lex.starts_with(query_lex) {
804                break;
805            }
806            collect_from(positions, &mut out);
807        }
808        out.sort_unstable();
809        out.dedup();
810    } else {
811        if let Ok(i) = entries.binary_search_by(|(lex, _)| lex.as_slice().cmp(query_lex)) {
812            collect_from(&entries[i].1, &mut out);
813        }
814    }
815    out
816}
817
818pub fn fn_length_tsvector(bytes: &[u8]) -> Result<crate::types::Value> {
819    let (_flags, reader) = TsVectorReader::open(bytes)?;
820    let count = reader.count() as i64;
821    Ok(crate::types::Value::Integer(count))
822}
823
824pub fn fn_numnode(bytes: &[u8]) -> Result<crate::types::Value> {
825    let ast = TsQueryAst::decode(bytes)?;
826    Ok(crate::types::Value::Integer(count_nodes(&ast) as i64))
827}
828
829fn count_nodes(ast: &TsQueryAst) -> usize {
830    match ast {
831        TsQueryAst::Lexeme { .. } => 1,
832        TsQueryAst::And(l, r) | TsQueryAst::Or(l, r) => 1 + count_nodes(l) + count_nodes(r),
833        TsQueryAst::Not(c) => 1 + count_nodes(c),
834        TsQueryAst::Phrase { left, right, .. } => 1 + count_nodes(left) + count_nodes(right),
835    }
836}
837
838fn weight_default(w: Weight) -> f64 {
839    match w {
840        Weight::A => 1.0,
841        Weight::B => 0.4,
842        Weight::C => 0.2,
843        Weight::D => 0.1,
844    }
845}
846
847pub fn fn_ts_rank(tsv: &[u8], tsq: &[u8], norm: i64) -> Result<crate::types::Value> {
848    let entries = decode_entries(tsv)?;
849    let ast = TsQueryAst::decode(tsq)?;
850    let mut score = 0.0_f64;
851    accumulate_rank(&ast, &entries, &mut score, true);
852    score = apply_norm(score, &entries, norm);
853    Ok(crate::types::Value::Real(score))
854}
855
856pub fn fn_ts_rank_cd(tsv: &[u8], tsq: &[u8], norm: i64) -> Result<crate::types::Value> {
857    let entries = decode_entries(tsv)?;
858    let ast = TsQueryAst::decode(tsq)?;
859    let mut atom_lists: Vec<Vec<u16>> = Vec::new();
860    collect_positive_atoms(&ast, &entries, &mut atom_lists);
861    if atom_lists.is_empty() || atom_lists.iter().any(|l| l.is_empty()) {
862        return Ok(crate::types::Value::Real(0.0));
863    }
864    let score = shortest_cover_score(&atom_lists);
865    let score = apply_norm(score, &entries, norm);
866    Ok(crate::types::Value::Real(score))
867}
868
869fn decode_entries(tsv: &[u8]) -> Result<Vec<(Vec<u8>, Vec<u16>)>> {
870    let (_flags, reader) = TsVectorReader::open(tsv)?;
871    let mut out = Vec::new();
872    for item in reader {
873        let (lex, positions) = item?;
874        out.push((lex.to_vec(), positions));
875    }
876    Ok(out)
877}
878
879fn accumulate_rank(
880    ast: &TsQueryAst,
881    entries: &[(Vec<u8>, Vec<u16>)],
882    out: &mut f64,
883    positive: bool,
884) {
885    match ast {
886        TsQueryAst::Lexeme {
887            lexeme,
888            weight_mask,
889            prefix,
890        } => {
891            if !positive {
892                return;
893            }
894            let positions = collect_lex_positions(entries, lexeme, *weight_mask, *prefix);
895            if positions.is_empty() {
896                return;
897            }
898            let weight_sum: f64 = positions
899                .iter()
900                .map(|&p| weight_default(Weight::from_bits(p)))
901                .sum();
902            let tf = (positions.len() as f64).ln_1p();
903            *out += weight_sum * (1.0 + tf);
904        }
905        TsQueryAst::And(l, r) | TsQueryAst::Or(l, r) => {
906            accumulate_rank(l, entries, out, positive);
907            accumulate_rank(r, entries, out, positive);
908        }
909        TsQueryAst::Not(c) => accumulate_rank(c, entries, out, !positive),
910        TsQueryAst::Phrase { left, right, .. } => {
911            accumulate_rank(left, entries, out, positive);
912            accumulate_rank(right, entries, out, positive);
913        }
914    }
915}
916
917fn collect_positive_atoms(
918    ast: &TsQueryAst,
919    entries: &[(Vec<u8>, Vec<u16>)],
920    out: &mut Vec<Vec<u16>>,
921) {
922    match ast {
923        TsQueryAst::Lexeme {
924            lexeme,
925            weight_mask,
926            prefix,
927        } => {
928            let positions = collect_lex_positions(entries, lexeme, *weight_mask, *prefix);
929            out.push(positions);
930        }
931        TsQueryAst::And(l, r)
932        | TsQueryAst::Or(l, r)
933        | TsQueryAst::Phrase {
934            left: l, right: r, ..
935        } => {
936            collect_positive_atoms(l, entries, out);
937            collect_positive_atoms(r, entries, out);
938        }
939        TsQueryAst::Not(_) => {} // negated atoms don't contribute to cover
940    }
941}
942
943fn shortest_cover_score(atom_lists: &[Vec<u16>]) -> f64 {
944    if atom_lists.is_empty() {
945        return 0.0;
946    }
947    let mut events: Vec<(u16, usize, u16)> = Vec::new();
948    for (i, list) in atom_lists.iter().enumerate() {
949        for &packed in list {
950            let (pos, _w) = unpack_position(packed);
951            events.push((pos, i, packed));
952        }
953    }
954    events.sort_unstable_by_key(|e| e.0);
955
956    let k = atom_lists.len();
957    let mut count_per_atom = vec![0usize; k];
958    let mut covered_count = 0usize;
959    let mut best_score = 0.0_f64;
960    let mut l = 0usize;
961    for r in 0..events.len() {
962        let ai = events[r].1;
963        if count_per_atom[ai] == 0 {
964            covered_count += 1;
965        }
966        count_per_atom[ai] += 1;
967        while covered_count == k {
968            let window_len = (events[r].0 - events[l].0 + 1) as f64;
969            let weight_sum: f64 = events[l..=r]
970                .iter()
971                .map(|e| weight_default(Weight::from_bits(e.2)))
972                .sum();
973            let candidate = weight_sum / window_len;
974            if candidate > best_score {
975                best_score = candidate;
976            }
977            let la = events[l].1;
978            count_per_atom[la] -= 1;
979            if count_per_atom[la] == 0 {
980                covered_count -= 1;
981            }
982            l += 1;
983        }
984    }
985    best_score
986}
987
/// Applies the rank-normalisation bit flags in `norm` to `score`.
///
/// `entries` is the decoded document: one `(lexeme, packed positions)` pair
/// per distinct lexeme. Two sizes are derived from it: the total number of
/// stored positions ("length") and the number of entries ("unique").
///
/// Flags are applied in this order, each only when its guard holds:
/// * `1`  - divide by `1 + ln(length)` (length > 1)
/// * `2`  - divide by `length` (length > 0)
/// * `8`  - divide by `unique` (unique > 0)
/// * `16` - divide by `1 + ln(unique)` (unique > 1)
/// * `32` - replace the score with `score / (score + 1)`
///
/// Any other bit is ignored.
fn apply_norm(mut score: f64, entries: &[(Vec<u8>, Vec<u16>)], norm: i64) -> f64 {
    let has = |bit: i64| norm & bit != 0;
    let length = entries
        .iter()
        .fold(0usize, |total, (_, positions)| total + positions.len()) as f64;
    let unique = entries.len() as f64;

    if has(1) && length > 1.0 {
        score /= 1.0 + length.ln();
    }
    if has(2) && length > 0.0 {
        score /= length;
    }
    if has(8) && unique > 0.0 {
        score /= unique;
    }
    if has(16) && unique > 1.0 {
        score /= 1.0 + unique.ln();
    }
    // Must stay last: it uses the score as already scaled by the flags above.
    if has(32) {
        score /= score + 1.0;
    }
    score
}
1008
/// One word produced by [`tokenize`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Token {
    /// Normalised (and, for the English configuration, stemmed) word.
    /// Empty when `stopped` is set.
    pub lexeme: String,
    /// 1-based ordinal of the word in the input, or `MAX_POSITION + 1` once
    /// the ordinal no longer fits in the position range.
    pub position: u16,
    /// `true` when the word was recognised as a stop-word; such tokens still
    /// occupy a position.
    pub stopped: bool,
}
1015
/// Text search configuration used to turn raw text into lexemes.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenizerKind {
    /// Normalise and lowercase only; every word is kept as-is.
    Simple,
    /// Drop English stop-words and stem the remaining words.
    English,
}
1021
1022impl TokenizerKind {
1023    pub fn from_name(name: &str) -> Result<Self> {
1024        match name.to_ascii_lowercase().as_str() {
1025            "simple" => Ok(TokenizerKind::Simple),
1026            "english" | "pg_catalog.english" => Ok(TokenizerKind::English),
1027            other => Err(SqlError::Unsupported(format!(
1028                "unknown text search configuration: {other}"
1029            ))),
1030        }
1031    }
1032
1033    pub fn as_config_id(self) -> u8 {
1034        match self {
1035            TokenizerKind::Simple => 0,
1036            TokenizerKind::English => 1,
1037        }
1038    }
1039
1040    pub fn from_config_id(id: u8) -> Result<Self> {
1041        match id {
1042            0 => Ok(TokenizerKind::Simple),
1043            1 => Ok(TokenizerKind::English),
1044            _ => Err(SqlError::InvalidValue(format!(
1045                "unknown FTS config_id: {id}"
1046            ))),
1047        }
1048    }
1049}
1050
// Vendored from `postgres/src/backend/snowball/stopwords/english.stop`.
// Must stay in ascending byte order: `is_english_stopword` binary-searches it.
#[rustfmt::skip] // one group per initial letter keeps the table compact
const ENGLISH_STOP_WORDS: &[&str] = &[
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are",
    "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "can",
    "did", "do", "does", "doing", "don", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself",
    "his", "how",
    "i", "if", "in", "into", "is", "it", "its", "itself",
    "just",
    "me", "more", "most", "my", "myself",
    "no", "nor", "not", "now",
    "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over",
    "own",
    "s", "same", "she", "should", "so", "some", "such",
    "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there",
    "these", "they", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will",
    "with",
    "you", "your", "yours", "yourself", "yourselves",
];

/// Reports whether `word` is in the English stop-word table.
///
/// The comparison is exact, so callers pass an already lowercased word.
fn is_english_stopword(word: &str) -> bool {
    matches!(ENGLISH_STOP_WORDS.binary_search(&word), Ok(_))
}
1185
1186pub fn tokenize(kind: TokenizerKind, text: &str) -> Vec<Token> {
1187    use unicode_normalization::UnicodeNormalization;
1188    use unicode_segmentation::UnicodeSegmentation;
1189
1190    let normalized: String = text.nfkc().collect();
1191    let lowered = normalized.to_lowercase();
1192
1193    let mut out = Vec::new();
1194    let mut position: u32 = 0;
1195    for word in lowered.unicode_words() {
1196        position += 1;
1197        let pos_u16 = if position <= MAX_POSITION as u32 {
1198            position as u16
1199        } else {
1200            MAX_POSITION + 1 // signals overflow downstream
1201        };
1202        let mut stopped = false;
1203        let lexeme = match kind {
1204            TokenizerKind::Simple => word.to_string(),
1205            TokenizerKind::English => {
1206                if is_english_stopword(word) {
1207                    stopped = true;
1208                    String::new()
1209                } else {
1210                    tantivy_stemmers::algorithms::english_porter_2(word).into_owned()
1211                }
1212            }
1213        };
1214        if lexeme.is_empty() && !stopped {
1215            position -= 1;
1216            continue;
1217        }
1218        out.push(Token {
1219            lexeme,
1220            position: pos_u16,
1221            stopped,
1222        });
1223    }
1224    out
1225}
1226
1227fn stem_one(kind: TokenizerKind, word: &str) -> Option<String> {
1228    use unicode_normalization::UnicodeNormalization;
1229    let normalized: String = word.nfkc().collect();
1230    let lowered = normalized.to_lowercase();
1231    match kind {
1232        TokenizerKind::Simple => Some(lowered),
1233        TokenizerKind::English => {
1234            if is_english_stopword(&lowered) {
1235                None
1236            } else {
1237                Some(tantivy_stemmers::algorithms::english_porter_2(&lowered).into_owned())
1238            }
1239        }
1240    }
1241}
1242
1243pub fn fn_to_tsvector_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1244    let tokens = tokenize(kind, text);
1245    let mut b = TsVectorBuilder::new();
1246    for t in tokens {
1247        if t.stopped {
1248            continue;
1249        }
1250        b.push(t.lexeme.as_bytes(), t.position, Weight::D);
1251    }
1252    Ok(crate::types::Value::TsVector(b.build()))
1253}
1254
1255pub fn fn_to_tsvector(text: &str) -> Result<crate::types::Value> {
1256    fn_to_tsvector_with(TokenizerKind::English, text)
1257}
1258
1259pub fn fn_to_tsquery_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1260    let raw = parse_tsquery(text)?;
1261    let stemmed = stem_ast(&raw, kind)?;
1262    Ok(crate::types::Value::TsQuery(stemmed.encode()))
1263}
1264
1265fn stem_ast(ast: &TsQueryAst, kind: TokenizerKind) -> Result<TsQueryAst> {
1266    Ok(match ast {
1267        TsQueryAst::Lexeme {
1268            lexeme,
1269            weight_mask,
1270            prefix,
1271        } => {
1272            let s = std::str::from_utf8(lexeme)
1273                .map_err(|_| SqlError::InvalidValue("tsquery lexeme has invalid UTF-8".into()))?;
1274            let stemmed = stem_one(kind, s).ok_or_else(|| {
1275                SqlError::InvalidValue(format!("tsquery: lexeme '{s}' is a stop-word"))
1276            })?;
1277            TsQueryAst::Lexeme {
1278                lexeme: stemmed.into_bytes(),
1279                weight_mask: *weight_mask,
1280                prefix: *prefix,
1281            }
1282        }
1283        TsQueryAst::And(l, r) => {
1284            TsQueryAst::And(Box::new(stem_ast(l, kind)?), Box::new(stem_ast(r, kind)?))
1285        }
1286        TsQueryAst::Or(l, r) => {
1287            TsQueryAst::Or(Box::new(stem_ast(l, kind)?), Box::new(stem_ast(r, kind)?))
1288        }
1289        TsQueryAst::Not(c) => TsQueryAst::Not(Box::new(stem_ast(c, kind)?)),
1290        TsQueryAst::Phrase {
1291            distance,
1292            left,
1293            right,
1294        } => TsQueryAst::Phrase {
1295            distance: *distance,
1296            left: Box::new(stem_ast(left, kind)?),
1297            right: Box::new(stem_ast(right, kind)?),
1298        },
1299    })
1300}
1301
1302pub fn fn_plainto_tsquery_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1303    let tokens = tokenize(kind, text);
1304    let lexemes: Vec<Vec<u8>> = tokens
1305        .into_iter()
1306        .filter(|t| !t.stopped && !t.lexeme.is_empty())
1307        .map(|t| t.lexeme.into_bytes())
1308        .collect();
1309    let ast = and_chain(&lexemes)?;
1310    Ok(crate::types::Value::TsQuery(ast.encode()))
1311}
1312
1313fn and_chain(lexemes: &[Vec<u8>]) -> Result<TsQueryAst> {
1314    if lexemes.is_empty() {
1315        return Err(SqlError::InvalidValue(
1316            "tsquery would be empty (input had only stop-words?)".into(),
1317        ));
1318    }
1319    let mut iter = lexemes.iter().map(|l| TsQueryAst::Lexeme {
1320        lexeme: l.clone(),
1321        weight_mask: 0,
1322        prefix: false,
1323    });
1324    let first = iter.next().unwrap();
1325    Ok(iter.fold(first, |acc, x| TsQueryAst::And(Box::new(acc), Box::new(x))))
1326}
1327
1328pub fn fn_phraseto_tsquery_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1329    let tokens = tokenize(kind, text);
1330    let mut lex_positions: Vec<(Vec<u8>, u16)> = Vec::new();
1331    for t in tokens {
1332        if t.stopped || t.lexeme.is_empty() {
1333            continue;
1334        }
1335        lex_positions.push((t.lexeme.into_bytes(), t.position));
1336    }
1337    if lex_positions.is_empty() {
1338        return Err(SqlError::InvalidValue(
1339            "tsquery would be empty (input had only stop-words?)".into(),
1340        ));
1341    }
1342    let mut iter = lex_positions.into_iter();
1343    let (first_lex, mut prev_pos) = iter.next().unwrap();
1344    let mut acc = TsQueryAst::Lexeme {
1345        lexeme: first_lex,
1346        weight_mask: 0,
1347        prefix: false,
1348    };
1349    for (lex, pos) in iter {
1350        let dist = pos.saturating_sub(prev_pos).max(1);
1351        let right = TsQueryAst::Lexeme {
1352            lexeme: lex,
1353            weight_mask: 0,
1354            prefix: false,
1355        };
1356        acc = TsQueryAst::Phrase {
1357            distance: dist,
1358            left: Box::new(acc),
1359            right: Box::new(right),
1360        };
1361        prev_pos = pos;
1362    }
1363    Ok(crate::types::Value::TsQuery(acc.encode()))
1364}
1365
1366pub fn fn_websearch_to_tsquery_with(
1367    kind: TokenizerKind,
1368    text: &str,
1369) -> Result<crate::types::Value> {
1370    let mut groups: Vec<TsQueryAst> = Vec::new();
1371    let mut current_terms: Vec<TsQueryAst> = Vec::new();
1372    let mut cursor = 0usize;
1373    let bytes = text.as_bytes();
1374
1375    let flush_group = |terms: &mut Vec<TsQueryAst>, groups: &mut Vec<TsQueryAst>| {
1376        if terms.is_empty() {
1377            return;
1378        }
1379        let mut iter = std::mem::take(terms).into_iter();
1380        let first = iter.next().unwrap();
1381        let combined = iter.fold(first, |acc, x| TsQueryAst::And(Box::new(acc), Box::new(x)));
1382        groups.push(combined);
1383    };
1384
1385    while cursor < bytes.len() {
1386        while cursor < bytes.len() && bytes[cursor].is_ascii_whitespace() {
1387            cursor += 1;
1388        }
1389        if cursor >= bytes.len() {
1390            break;
1391        }
1392        let negate = if bytes[cursor] == b'-' {
1393            cursor += 1;
1394            true
1395        } else {
1396            false
1397        };
1398        if cursor < bytes.len() && bytes[cursor] == b'"' {
1399            cursor += 1;
1400            let start = cursor;
1401            while cursor < bytes.len() && bytes[cursor] != b'"' {
1402                cursor += 1;
1403            }
1404            let inner = &text[start..cursor];
1405            if cursor < bytes.len() {
1406                cursor += 1; // closing quote
1407            }
1408            if let Ok(crate::types::Value::TsQuery(q)) = fn_phraseto_tsquery_with(kind, inner) {
1409                let mut ast = TsQueryAst::decode(&q)?;
1410                if negate {
1411                    ast = TsQueryAst::Not(Box::new(ast));
1412                }
1413                current_terms.push(ast);
1414            }
1415            continue;
1416        }
1417        let start = cursor;
1418        while cursor < bytes.len() && !bytes[cursor].is_ascii_whitespace() {
1419            cursor += 1;
1420        }
1421        let word = &text[start..cursor];
1422        if word.eq_ignore_ascii_case("or") {
1423            flush_group(&mut current_terms, &mut groups);
1424            continue;
1425        }
1426        if let Some(stemmed) = stem_one(kind, word) {
1427            let mut ast = TsQueryAst::Lexeme {
1428                lexeme: stemmed.into_bytes(),
1429                weight_mask: 0,
1430                prefix: false,
1431            };
1432            if negate {
1433                ast = TsQueryAst::Not(Box::new(ast));
1434            }
1435            current_terms.push(ast);
1436        }
1437    }
1438    flush_group(&mut current_terms, &mut groups);
1439    if groups.is_empty() {
1440        return Err(SqlError::InvalidValue(
1441            "tsquery would be empty (input had only stop-words?)".into(),
1442        ));
1443    }
1444    let mut iter = groups.into_iter();
1445    let first = iter.next().unwrap();
1446    let combined = iter.fold(first, |acc, x| TsQueryAst::Or(Box::new(acc), Box::new(x)));
1447    Ok(crate::types::Value::TsQuery(combined.encode()))
1448}
1449
1450pub fn fn_ts_headline_with(
1451    kind: TokenizerKind,
1452    text: &str,
1453    tsq_bytes: &[u8],
1454) -> Result<crate::types::Value> {
1455    use unicode_segmentation::UnicodeSegmentation;
1456    let ast = TsQueryAst::decode(tsq_bytes)?;
1457    let positive_lexemes = collect_query_atoms(&ast);
1458    let mut out = String::with_capacity(text.len() + 16);
1459    let mut last_end = 0usize;
1460    for (idx, word) in text.split_word_bound_indices() {
1461        let word_lower: String = word.to_lowercase();
1462        let stemmed = stem_one(kind, &word_lower);
1463        let matched = stemmed.as_ref().is_some_and(|s| {
1464            positive_lexemes
1465                .iter()
1466                .any(|q| q == s.as_bytes() || word_lower.as_bytes() == q)
1467        });
1468        if matched {
1469            out.push_str(&text[last_end..idx]);
1470            out.push_str("<b>");
1471            out.push_str(word);
1472            out.push_str("</b>");
1473            last_end = idx + word.len();
1474        }
1475    }
1476    out.push_str(&text[last_end..]);
1477    Ok(crate::types::Value::Text(out.into()))
1478}
1479
1480fn collect_query_atoms(ast: &TsQueryAst) -> Vec<Vec<u8>> {
1481    let mut out = Vec::new();
1482    fn walk(ast: &TsQueryAst, positive: bool, out: &mut Vec<Vec<u8>>) {
1483        match ast {
1484            TsQueryAst::Lexeme { lexeme, .. } => {
1485                if positive {
1486                    out.push(lexeme.clone());
1487                }
1488            }
1489            TsQueryAst::And(l, r) | TsQueryAst::Or(l, r) => {
1490                walk(l, positive, out);
1491                walk(r, positive, out);
1492            }
1493            TsQueryAst::Not(c) => walk(c, !positive, out),
1494            TsQueryAst::Phrase { left, right, .. } => {
1495                walk(left, positive, out);
1496                walk(right, positive, out);
1497            }
1498        }
1499    }
1500    walk(ast, true, &mut out);
1501    out
1502}
1503
1504pub fn fn_ts_lexize_with(kind: TokenizerKind, word: &str) -> Result<crate::types::Value> {
1505    match stem_one(kind, word) {
1506        Some(s) => Ok(crate::types::Value::Text(s.into())),
1507        None => Ok(crate::types::Value::Null),
1508    }
1509}
1510
1511pub fn fn_to_tsquery(text: &str) -> Result<crate::types::Value> {
1512    fn_to_tsquery_with(TokenizerKind::English, text)
1513}
1514
1515pub fn fn_plainto_tsquery(text: &str) -> Result<crate::types::Value> {
1516    fn_plainto_tsquery_with(TokenizerKind::English, text)
1517}
1518
1519pub fn fn_phraseto_tsquery(text: &str) -> Result<crate::types::Value> {
1520    fn_phraseto_tsquery_with(TokenizerKind::English, text)
1521}
1522
1523pub fn fn_websearch_to_tsquery(text: &str) -> Result<crate::types::Value> {
1524    fn_websearch_to_tsquery_with(TokenizerKind::English, text)
1525}
1526
1527pub fn fn_ts_headline(text: &str, tsq: &[u8]) -> Result<crate::types::Value> {
1528    fn_ts_headline_with(TokenizerKind::English, text, tsq)
1529}
1530
1531pub fn fn_ts_lexize(word: &str) -> Result<crate::types::Value> {
1532    fn_ts_lexize_with(TokenizerKind::English, word)
1533}
1534
1535pub fn parse_weight_char(s: &str) -> Result<Weight> {
1536    let bytes = s.as_bytes();
1537    if bytes.len() == 1 {
1538        match bytes[0].to_ascii_uppercase() {
1539            b'A' => return Ok(Weight::A),
1540            b'B' => return Ok(Weight::B),
1541            b'C' => return Ok(Weight::C),
1542            b'D' => return Ok(Weight::D),
1543            _ => {}
1544        }
1545    }
1546    Err(SqlError::InvalidValue(format!(
1547        "unrecognized weight: {}",
1548        bytes.first().copied().unwrap_or(0)
1549    )))
1550}
1551
1552pub fn fn_setweight(tsv: &[u8], weight: Weight) -> Result<crate::types::Value> {
1553    let (_flags, reader) = TsVectorReader::open(tsv)?;
1554    let mut b = TsVectorBuilder::new();
1555    for item in reader {
1556        let (lex, positions) = item?;
1557        if positions.is_empty() {
1558            b.push_no_position(lex);
1559            continue;
1560        }
1561        for packed in positions {
1562            let pos = packed & MAX_POSITION;
1563            b.push(lex, pos, weight);
1564        }
1565    }
1566    Ok(crate::types::Value::TsVector(b.build()))
1567}
1568
1569#[cfg(test)]
1570#[path = "fts_tests.rs"]
1571mod tests;