1use crate::error::{Result, SqlError};
7use std::sync::Arc;
8
/// Largest word position a packed entry can carry: the low 14 bits of a `u16`.
pub const MAX_POSITION: u16 = (1 << 14) - 1;
/// Cap on how many positions are retained for one lexeme.
pub const MAX_POSITIONS_PER_LEXEME: u16 = u8::MAX as u16;

/// Bit of the tsvector header flag byte that records a rejected
/// (zero or above `MAX_POSITION`) position.
pub const TSV_FLAG_POSITION_OVERFLOW: u8 = 1 << 0;
13
/// Weight class of a lexeme occurrence, stored in the top two bits of a
/// packed position.
///
/// The discriminants are the raw two-bit values: `D` is 0 and `A` is 3.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Weight {
    D = 0,
    C = 1,
    B = 2,
    A = 3,
}

impl Weight {
    /// Number of low bits that sit below the weight bits in a packed value.
    const SHIFT: u32 = 14;

    /// Returns the weight moved into the top two bits of a `u16`.
    pub fn as_bits(self) -> u16 {
        u16::from(self as u8) << Self::SHIFT
    }

    /// Extracts the weight from the top two bits of `packed`; the low 14 bits
    /// are ignored.
    pub fn from_bits(packed: u16) -> Self {
        // A u16 shifted right by 14 can only be 0..=3, so the index is always valid.
        const BY_DISCRIMINANT: [Weight; 4] = [Weight::D, Weight::C, Weight::B, Weight::A];
        BY_DISCRIMINANT[usize::from(packed >> Self::SHIFT)]
    }

    /// Single-letter label (`'A'`..`'D'`) used when rendering a weight.
    pub fn label(self) -> char {
        // Indexed by discriminant: D = 0 .. A = 3.
        char::from(b"DCBA"[self as usize])
    }
}
45
46#[inline]
47pub fn pack_position(pos: u16, weight: Weight) -> u16 {
48 weight.as_bits() | (pos & MAX_POSITION)
49}
50
51#[inline]
52pub fn unpack_position(packed: u16) -> (u16, Weight) {
53 (packed & MAX_POSITION, Weight::from_bits(packed))
54}
55
/// A lexeme paired with its list of `u16` position entries.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LexemePos {
    /// Raw lexeme bytes.
    pub lexeme: Vec<u8>,
    /// Position entries — presumably packed the way `pack_position` produces
    /// them; confirm against the code that builds this struct.
    pub positions: Vec<u16>,
}
61
62#[derive(Debug, Clone)]
63pub struct TsVectorBuilder {
64 by_lex: std::collections::BTreeMap<Vec<u8>, Vec<u16>>,
65 overflowed: bool,
66}
67
68impl Default for TsVectorBuilder {
69 fn default() -> Self {
70 Self::new()
71 }
72}
73
74impl TsVectorBuilder {
75 pub fn new() -> Self {
76 Self {
77 by_lex: std::collections::BTreeMap::new(),
78 overflowed: false,
79 }
80 }
81
82 pub fn push(&mut self, lexeme: &[u8], position: u16, weight: Weight) {
83 if position == 0 || position > MAX_POSITION {
84 self.overflowed = true;
85 return;
86 }
87 let entry = self.by_lex.entry(lexeme.to_vec()).or_default();
88 if entry.len() >= MAX_POSITIONS_PER_LEXEME as usize {
89 return;
90 }
91 let packed = pack_position(position, weight);
92 let key = (position, weight as u8);
93 let insert_at = entry
94 .binary_search_by(|p| {
95 let (pp, pw) = unpack_position(*p);
96 (pp, pw as u8).cmp(&key)
97 })
98 .unwrap_or_else(|e| e);
99 if insert_at < entry.len() {
100 let (ep, ew) = unpack_position(entry[insert_at]);
101 if ep == position && ew == weight {
102 return;
103 }
104 }
105 entry.insert(insert_at, packed);
106 }
107
108 pub fn push_no_position(&mut self, lexeme: &[u8]) {
109 self.by_lex.entry(lexeme.to_vec()).or_default();
110 }
111
112 pub fn build(self) -> Arc<[u8]> {
113 let mut buf = Vec::with_capacity(8 + self.by_lex.len() * 16);
114 let flags = if self.overflowed {
115 TSV_FLAG_POSITION_OVERFLOW
116 } else {
117 0
118 };
119 buf.push(flags);
120 buf.extend_from_slice(&(self.by_lex.len() as u32).to_le_bytes());
121 for (lex, positions) in self.by_lex {
122 buf.extend_from_slice(&(lex.len() as u16).to_le_bytes());
123 buf.extend_from_slice(&lex);
124 buf.extend_from_slice(&(positions.len() as u16).to_le_bytes());
125 for p in positions {
126 buf.extend_from_slice(&p.to_le_bytes());
127 }
128 }
129 Arc::from(buf)
130 }
131}
132
133pub struct TsVectorReader<'a> {
134 bytes: &'a [u8],
135 pos: usize,
136 remaining: u32,
137}
138
139impl<'a> TsVectorReader<'a> {
140 pub fn open(bytes: &'a [u8]) -> Result<(u8, Self)> {
141 if bytes.is_empty() {
142 return Err(SqlError::InvalidValue("empty tsvector".into()));
143 }
144 let flags = bytes[0];
145 if bytes.len() < 5 {
146 return Err(SqlError::InvalidValue("truncated tsvector header".into()));
147 }
148 let count = u32::from_le_bytes(bytes[1..5].try_into().unwrap());
149 Ok((
150 flags,
151 Self {
152 bytes,
153 pos: 5,
154 remaining: count,
155 },
156 ))
157 }
158}
159
160impl<'a> Iterator for TsVectorReader<'a> {
161 type Item = Result<(&'a [u8], Vec<u16>)>;
162
163 fn next(&mut self) -> Option<Self::Item> {
164 if self.remaining == 0 {
165 return None;
166 }
167 self.remaining -= 1;
168 if self.pos + 2 > self.bytes.len() {
169 return Some(Err(SqlError::InvalidValue("truncated tsvector lex".into())));
170 }
171 let lex_len = u16::from_le_bytes([self.bytes[self.pos], self.bytes[self.pos + 1]]) as usize;
172 self.pos += 2;
173 if self.pos + lex_len > self.bytes.len() {
174 return Some(Err(SqlError::InvalidValue("truncated lex bytes".into())));
175 }
176 let lex = &self.bytes[self.pos..self.pos + lex_len];
177 self.pos += lex_len;
178 if self.pos + 2 > self.bytes.len() {
179 return Some(Err(SqlError::InvalidValue("truncated pos count".into())));
180 }
181 let pc = u16::from_le_bytes([self.bytes[self.pos], self.bytes[self.pos + 1]]) as usize;
182 self.pos += 2;
183 if self.pos + pc * 2 > self.bytes.len() {
184 return Some(Err(SqlError::InvalidValue("truncated positions".into())));
185 }
186 let mut positions = Vec::with_capacity(pc);
187 for _ in 0..pc {
188 positions.push(u16::from_le_bytes([
189 self.bytes[self.pos],
190 self.bytes[self.pos + 1],
191 ]));
192 self.pos += 2;
193 }
194 Some(Ok((lex, positions)))
195 }
196}
197
198pub fn tsvector_overflowed(bytes: &[u8]) -> bool {
199 !bytes.is_empty() && bytes[0] & TSV_FLAG_POSITION_OVERFLOW != 0
200}
201
202pub fn tsvector_display(bytes: &[u8]) -> String {
203 let (_flags, reader) = match TsVectorReader::open(bytes) {
204 Ok(v) => v,
205 Err(_) => return "<invalid tsvector>".into(),
206 };
207 let mut out = String::new();
208 let mut first = true;
209 for item in reader {
210 let (lex, positions) = match item {
211 Ok(v) => v,
212 Err(_) => return "<invalid tsvector>".into(),
213 };
214 if !first {
215 out.push(' ');
216 }
217 first = false;
218 out.push('\'');
219 out.push_str(&String::from_utf8_lossy(lex));
220 out.push('\'');
221 if !positions.is_empty() {
222 out.push(':');
223 for (i, packed) in positions.iter().enumerate() {
224 if i > 0 {
225 out.push(',');
226 }
227 let (p, w) = unpack_position(*packed);
228 out.push_str(&p.to_string());
229 if w != Weight::D {
230 out.push(w.label());
231 }
232 }
233 }
234 }
235 out
236}
237
238#[derive(Debug, Clone, PartialEq, Eq)]
248pub enum TsQueryAst {
249 Lexeme {
250 lexeme: Vec<u8>,
251 weight_mask: u8,
252 prefix: bool,
253 },
254 And(Box<TsQueryAst>, Box<TsQueryAst>),
255 Or(Box<TsQueryAst>, Box<TsQueryAst>),
256 Not(Box<TsQueryAst>),
257 Phrase {
258 distance: u16,
259 left: Box<TsQueryAst>,
260 right: Box<TsQueryAst>,
261 },
262}
263
264pub const TSQ_TAG_LEXEME: u8 = 0;
265pub const TSQ_TAG_AND: u8 = 1;
266pub const TSQ_TAG_OR: u8 = 2;
267pub const TSQ_TAG_NOT: u8 = 3;
268pub const TSQ_TAG_PHRASE: u8 = 4;
269
270pub const TSQ_FLAG_PREFIX: u8 = 0x01;
271
272impl TsQueryAst {
273 pub fn encode(&self) -> Arc<[u8]> {
274 let mut buf = Vec::new();
275 self.encode_into(&mut buf);
276 Arc::from(buf)
277 }
278
279 fn encode_into(&self, buf: &mut Vec<u8>) {
280 match self {
281 TsQueryAst::Lexeme {
282 lexeme,
283 weight_mask,
284 prefix,
285 } => {
286 buf.push(TSQ_TAG_LEXEME);
287 buf.extend_from_slice(&(lexeme.len() as u16).to_le_bytes());
288 buf.extend_from_slice(lexeme);
289 buf.push(*weight_mask);
290 buf.push(if *prefix { TSQ_FLAG_PREFIX } else { 0 });
291 }
292 TsQueryAst::And(l, r) => {
293 buf.push(TSQ_TAG_AND);
294 l.encode_into(buf);
295 r.encode_into(buf);
296 }
297 TsQueryAst::Or(l, r) => {
298 buf.push(TSQ_TAG_OR);
299 l.encode_into(buf);
300 r.encode_into(buf);
301 }
302 TsQueryAst::Not(c) => {
303 buf.push(TSQ_TAG_NOT);
304 c.encode_into(buf);
305 }
306 TsQueryAst::Phrase {
307 distance,
308 left,
309 right,
310 } => {
311 buf.push(TSQ_TAG_PHRASE);
312 buf.extend_from_slice(&distance.to_le_bytes());
313 left.encode_into(buf);
314 right.encode_into(buf);
315 }
316 }
317 }
318
319 pub fn decode(bytes: &[u8]) -> Result<Self> {
320 let mut cursor = 0;
321 let ast = Self::decode_at(bytes, &mut cursor)?;
322 if cursor != bytes.len() {
323 return Err(SqlError::InvalidValue("trailing tsquery bytes".into()));
324 }
325 Ok(ast)
326 }
327
328 fn decode_at(bytes: &[u8], cursor: &mut usize) -> Result<Self> {
329 if *cursor >= bytes.len() {
330 return Err(SqlError::InvalidValue("truncated tsquery".into()));
331 }
332 let tag = bytes[*cursor];
333 *cursor += 1;
334 match tag {
335 TSQ_TAG_LEXEME => {
336 if *cursor + 2 > bytes.len() {
337 return Err(SqlError::InvalidValue("truncated tsquery lex".into()));
338 }
339 let len = u16::from_le_bytes([bytes[*cursor], bytes[*cursor + 1]]) as usize;
340 *cursor += 2;
341 if *cursor + len + 2 > bytes.len() {
342 return Err(SqlError::InvalidValue("truncated tsquery lex body".into()));
343 }
344 let lexeme = bytes[*cursor..*cursor + len].to_vec();
345 *cursor += len;
346 let weight_mask = bytes[*cursor];
347 let flags = bytes[*cursor + 1];
348 *cursor += 2;
349 Ok(TsQueryAst::Lexeme {
350 lexeme,
351 weight_mask,
352 prefix: flags & TSQ_FLAG_PREFIX != 0,
353 })
354 }
355 TSQ_TAG_AND => {
356 let l = Self::decode_at(bytes, cursor)?;
357 let r = Self::decode_at(bytes, cursor)?;
358 Ok(TsQueryAst::And(Box::new(l), Box::new(r)))
359 }
360 TSQ_TAG_OR => {
361 let l = Self::decode_at(bytes, cursor)?;
362 let r = Self::decode_at(bytes, cursor)?;
363 Ok(TsQueryAst::Or(Box::new(l), Box::new(r)))
364 }
365 TSQ_TAG_NOT => {
366 let c = Self::decode_at(bytes, cursor)?;
367 Ok(TsQueryAst::Not(Box::new(c)))
368 }
369 TSQ_TAG_PHRASE => {
370 if *cursor + 2 > bytes.len() {
371 return Err(SqlError::InvalidValue("truncated phrase distance".into()));
372 }
373 let distance = u16::from_le_bytes([bytes[*cursor], bytes[*cursor + 1]]);
374 *cursor += 2;
375 let l = Self::decode_at(bytes, cursor)?;
376 let r = Self::decode_at(bytes, cursor)?;
377 Ok(TsQueryAst::Phrase {
378 distance,
379 left: Box::new(l),
380 right: Box::new(r),
381 })
382 }
383 other => Err(SqlError::InvalidValue(format!(
384 "unknown tsquery tag: {other}"
385 ))),
386 }
387 }
388}
389
390pub fn tsquery_display(bytes: &[u8]) -> String {
391 match TsQueryAst::decode(bytes) {
392 Ok(ast) => display_ast(&ast),
393 Err(_) => "<invalid tsquery>".into(),
394 }
395}
396
397fn display_ast(ast: &TsQueryAst) -> String {
398 match ast {
399 TsQueryAst::Lexeme {
400 lexeme,
401 weight_mask,
402 prefix,
403 } => {
404 let mut s = format!("'{}'", String::from_utf8_lossy(lexeme));
405 if *prefix || *weight_mask != 0 {
406 s.push(':');
407 if *prefix {
408 s.push('*');
409 }
410 for (bit, label) in [(8, 'A'), (4, 'B'), (2, 'C'), (1, 'D')] {
411 if weight_mask & bit != 0 {
412 s.push(label);
413 }
414 }
415 }
416 s
417 }
418 TsQueryAst::And(l, r) => format!("{} & {}", display_ast(l), display_ast(r)),
419 TsQueryAst::Or(l, r) => format!("({} | {})", display_ast(l), display_ast(r)),
420 TsQueryAst::Not(c) => format!("!{}", display_ast(c)),
421 TsQueryAst::Phrase {
422 distance,
423 left,
424 right,
425 } => {
426 if *distance == 1 {
427 format!("{} <-> {}", display_ast(left), display_ast(right))
428 } else {
429 format!(
430 "{} <{}> {}",
431 display_ast(left),
432 distance,
433 display_ast(right)
434 )
435 }
436 }
437 }
438}
439
440pub fn parse_tsquery(input: &str) -> Result<TsQueryAst> {
441 let mut p = TsQueryParser::new(input);
442 let ast = p.parse_or()?;
443 p.skip_ws();
444 if p.cursor < p.input.len() {
445 return Err(SqlError::InvalidValue(format!(
446 "unexpected trailing input in tsquery: {}",
447 &p.input[p.cursor..]
448 )));
449 }
450 Ok(ast)
451}
452
453struct TsQueryParser<'a> {
454 input: &'a str,
455 cursor: usize,
456}
457
458impl<'a> TsQueryParser<'a> {
459 fn new(input: &'a str) -> Self {
460 Self { input, cursor: 0 }
461 }
462
463 fn skip_ws(&mut self) {
464 let bytes = self.input.as_bytes();
465 while self.cursor < bytes.len() && bytes[self.cursor].is_ascii_whitespace() {
466 self.cursor += 1;
467 }
468 }
469
470 fn peek(&self) -> Option<u8> {
471 self.input.as_bytes().get(self.cursor).copied()
472 }
473
474 fn eat(&mut self, c: u8) -> bool {
475 if self.peek() == Some(c) {
476 self.cursor += 1;
477 true
478 } else {
479 false
480 }
481 }
482
483 fn parse_or(&mut self) -> Result<TsQueryAst> {
484 let mut left = self.parse_and()?;
485 loop {
486 self.skip_ws();
487 if !self.eat(b'|') {
488 break;
489 }
490 let right = self.parse_and()?;
491 left = TsQueryAst::Or(Box::new(left), Box::new(right));
492 }
493 Ok(left)
494 }
495
496 fn parse_and(&mut self) -> Result<TsQueryAst> {
497 let mut left = self.parse_not()?;
498 loop {
499 self.skip_ws();
500 if !self.eat(b'&') {
501 break;
502 }
503 let right = self.parse_not()?;
504 left = TsQueryAst::And(Box::new(left), Box::new(right));
505 }
506 Ok(left)
507 }
508
509 fn parse_not(&mut self) -> Result<TsQueryAst> {
510 self.skip_ws();
511 if self.eat(b'!') {
512 let inner = self.parse_not()?;
513 return Ok(TsQueryAst::Not(Box::new(inner)));
514 }
515 self.parse_phrase()
516 }
517
518 fn parse_phrase(&mut self) -> Result<TsQueryAst> {
519 let mut left = self.parse_atom()?;
520 loop {
521 self.skip_ws();
522 if self.peek() != Some(b'<') {
523 break;
524 }
525 let dist = self.parse_phrase_distance()?;
526 let right = self.parse_atom()?;
527 left = TsQueryAst::Phrase {
528 distance: dist,
529 left: Box::new(left),
530 right: Box::new(right),
531 };
532 }
533 Ok(left)
534 }
535
536 fn parse_phrase_distance(&mut self) -> Result<u16> {
537 if !self.eat(b'<') {
538 return Err(SqlError::InvalidValue("expected '<'".into()));
539 }
540 if self.eat(b'-') {
541 if !self.eat(b'>') {
542 return Err(SqlError::InvalidValue("expected '<->' phrase op".into()));
543 }
544 return Ok(1);
545 }
546 let start = self.cursor;
547 while let Some(c) = self.peek() {
548 if c.is_ascii_digit() {
549 self.cursor += 1;
550 } else {
551 break;
552 }
553 }
554 if start == self.cursor {
555 return Err(SqlError::InvalidValue(
556 "expected distance after '<' in phrase op".into(),
557 ));
558 }
559 let dist_str = &self.input[start..self.cursor];
560 let dist: u16 = dist_str
561 .parse()
562 .map_err(|_| SqlError::InvalidValue(format!("invalid phrase distance: {dist_str}")))?;
563 if !(1..=MAX_POSITION).contains(&dist) {
564 return Err(SqlError::InvalidValue(format!(
565 "phrase distance {dist} out of range 1..={MAX_POSITION}"
566 )));
567 }
568 if !self.eat(b'>') {
569 return Err(SqlError::InvalidValue("expected '>' after distance".into()));
570 }
571 Ok(dist)
572 }
573
574 fn parse_atom(&mut self) -> Result<TsQueryAst> {
575 self.skip_ws();
576 if self.eat(b'(') {
577 let inner = self.parse_or()?;
578 self.skip_ws();
579 if !self.eat(b')') {
580 return Err(SqlError::InvalidValue("missing closing paren".into()));
581 }
582 return Ok(inner);
583 }
584 let lexeme = self.parse_lexeme_word()?;
585 let (weight_mask, prefix) = self.parse_weight_and_prefix()?;
586 Ok(TsQueryAst::Lexeme {
587 lexeme: lexeme.into_bytes(),
588 weight_mask,
589 prefix,
590 })
591 }
592
593 fn parse_lexeme_word(&mut self) -> Result<String> {
594 self.skip_ws();
595 if self.eat(b'\'') {
596 let start = self.cursor;
597 let bytes = self.input.as_bytes();
598 while self.cursor < bytes.len() && bytes[self.cursor] != b'\'' {
599 self.cursor += 1;
600 }
601 if self.cursor >= bytes.len() {
602 return Err(SqlError::InvalidValue(
603 "unterminated quoted lexeme in tsquery".into(),
604 ));
605 }
606 let word = self.input[start..self.cursor].to_string();
607 self.cursor += 1; if word.is_empty() {
609 return Err(SqlError::InvalidValue("empty lexeme in tsquery".into()));
610 }
611 return Ok(word);
612 }
613 let start = self.cursor;
614 for (i, ch) in self.input[self.cursor..].char_indices() {
615 if ch.is_alphanumeric() || ch == '_' {
616 self.cursor = start + i + ch.len_utf8();
617 } else {
618 break;
619 }
620 }
621 if self.cursor == start {
622 return Err(SqlError::InvalidValue(format!(
623 "expected lexeme at: {}",
624 &self.input[self.cursor..]
625 )));
626 }
627 Ok(self.input[start..self.cursor].to_string())
628 }
629
630 fn parse_weight_and_prefix(&mut self) -> Result<(u8, bool)> {
631 if !self.eat(b':') {
632 return Ok((0, false));
633 }
634 let mut prefix = false;
635 let mut mask: u8 = 0;
636 loop {
637 match self.peek() {
638 Some(b'*') => {
639 prefix = true;
640 self.cursor += 1;
641 }
642 Some(c) if matches!(c, b'A' | b'B' | b'C' | b'D' | b'a' | b'b' | b'c' | b'd') => {
643 let bit = match c.to_ascii_uppercase() {
644 b'A' => 0b1000,
645 b'B' => 0b0100,
646 b'C' => 0b0010,
647 b'D' => 0b0001,
648 _ => unreachable!(),
649 };
650 mask |= bit;
651 self.cursor += 1;
652 }
653 _ => break,
654 }
655 }
656 Ok((mask, prefix))
657 }
658}
659
660pub fn op_match(tsvector_bytes: &[u8], tsquery_bytes: &[u8]) -> Result<crate::types::Value> {
661 let ast = TsQueryAst::decode(tsquery_bytes)?;
662 let (flags, reader) = TsVectorReader::open(tsvector_bytes)?;
663 let mut entries: Vec<(Vec<u8>, Vec<u16>)> = Vec::new();
664 for item in reader {
665 let (lex, positions) = item?;
666 entries.push((lex.to_vec(), positions));
667 }
668 let overflowed = flags & TSV_FLAG_POSITION_OVERFLOW != 0;
669 let matched = eval_match(&ast, &entries, overflowed)?;
670 Ok(crate::types::Value::Boolean(matched))
671}
672
673fn eval_match(ast: &TsQueryAst, entries: &[(Vec<u8>, Vec<u16>)], overflowed: bool) -> Result<bool> {
674 match ast {
675 TsQueryAst::Lexeme {
676 lexeme,
677 weight_mask,
678 prefix,
679 } => Ok(!collect_lex_positions(entries, lexeme, *weight_mask, *prefix).is_empty()),
680 TsQueryAst::And(l, r) => {
681 Ok(eval_match(l, entries, overflowed)? && eval_match(r, entries, overflowed)?)
682 }
683 TsQueryAst::Or(l, r) => {
684 Ok(eval_match(l, entries, overflowed)? || eval_match(r, entries, overflowed)?)
685 }
686 TsQueryAst::Not(c) => Ok(!eval_match(c, entries, overflowed)?),
687 TsQueryAst::Phrase {
688 distance,
689 left,
690 right,
691 } => {
692 if overflowed {
693 return Err(SqlError::Unsupported(
694 "tsvector position overflow; phrase queries unreliable".into(),
695 ));
696 }
697 let left_pos = phrase_positions(left, entries)?;
698 let right_pos = phrase_positions(right, entries)?;
699 Ok(positions_at_offset(&left_pos, &right_pos, *distance))
700 }
701 }
702}
703
704fn phrase_positions(ast: &TsQueryAst, entries: &[(Vec<u8>, Vec<u16>)]) -> Result<Vec<u16>> {
705 match ast {
706 TsQueryAst::Lexeme {
707 lexeme,
708 weight_mask,
709 prefix,
710 } => Ok(collect_lex_positions(
711 entries,
712 lexeme,
713 *weight_mask,
714 *prefix,
715 )),
716 TsQueryAst::Phrase {
717 distance,
718 left,
719 right,
720 } => {
721 let lp = phrase_positions(left, entries)?;
722 let rp = phrase_positions(right, entries)?;
723 Ok(positions_pairing_right(&lp, &rp, *distance))
724 }
725 _ => Err(SqlError::Unsupported(
726 "tsquery: AND/OR/NOT inside phrase operator not supported".into(),
727 )),
728 }
729}
730
731fn positions_at_offset(left: &[u16], right: &[u16], distance: u16) -> bool {
732 if left.is_empty() || right.is_empty() {
733 return false;
734 }
735 let mut i = 0;
736 let mut j = 0;
737 while i < left.len() && j < right.len() {
738 let lp = left[i] & MAX_POSITION;
739 let rp = right[j] & MAX_POSITION;
740 if rp == lp.saturating_add(distance) {
741 return true;
742 }
743 if rp < lp + distance {
744 j += 1;
745 } else {
746 i += 1;
747 }
748 }
749 false
750}
751
752fn positions_pairing_right(left: &[u16], right: &[u16], distance: u16) -> Vec<u16> {
753 let mut out = Vec::new();
754 let mut i = 0;
755 let mut j = 0;
756 while i < left.len() && j < right.len() {
757 let lp = left[i] & MAX_POSITION;
758 let rp = right[j] & MAX_POSITION;
759 let target = lp.saturating_add(distance);
760 if rp == target {
761 if out.last().copied() != Some(right[j]) {
762 out.push(right[j]);
763 }
764 j += 1;
765 } else if rp < target {
766 j += 1;
767 } else {
768 i += 1;
769 }
770 }
771 out
772}
773
774fn collect_lex_positions(
775 entries: &[(Vec<u8>, Vec<u16>)],
776 query_lex: &[u8],
777 weight_mask: u8,
778 prefix: bool,
779) -> Vec<u16> {
780 let mut out: Vec<u16> = Vec::new();
781 let weight_to_bit = |w: Weight| -> u8 {
782 match w {
783 Weight::A => 0b1000,
784 Weight::B => 0b0100,
785 Weight::C => 0b0010,
786 Weight::D => 0b0001,
787 }
788 };
789 let collect_from = |positions: &[u16], out: &mut Vec<u16>| {
790 for &p in positions {
791 if weight_mask != 0 {
792 let (_pos, w) = unpack_position(p);
793 if weight_to_bit(w) & weight_mask == 0 {
794 continue;
795 }
796 }
797 out.push(p);
798 }
799 };
800 if prefix {
801 let start = entries.partition_point(|(lex, _)| lex.as_slice() < query_lex);
802 for (lex, positions) in entries[start..].iter() {
803 if !lex.starts_with(query_lex) {
804 break;
805 }
806 collect_from(positions, &mut out);
807 }
808 out.sort_unstable();
809 out.dedup();
810 } else {
811 if let Ok(i) = entries.binary_search_by(|(lex, _)| lex.as_slice().cmp(query_lex)) {
812 collect_from(&entries[i].1, &mut out);
813 }
814 }
815 out
816}
817
818pub fn fn_length_tsvector(bytes: &[u8]) -> Result<crate::types::Value> {
819 let (_flags, reader) = TsVectorReader::open(bytes)?;
820 let count = reader.count() as i64;
821 Ok(crate::types::Value::Integer(count))
822}
823
824pub fn fn_numnode(bytes: &[u8]) -> Result<crate::types::Value> {
825 let ast = TsQueryAst::decode(bytes)?;
826 Ok(crate::types::Value::Integer(count_nodes(&ast) as i64))
827}
828
829fn count_nodes(ast: &TsQueryAst) -> usize {
830 match ast {
831 TsQueryAst::Lexeme { .. } => 1,
832 TsQueryAst::And(l, r) | TsQueryAst::Or(l, r) => 1 + count_nodes(l) + count_nodes(r),
833 TsQueryAst::Not(c) => 1 + count_nodes(c),
834 TsQueryAst::Phrase { left, right, .. } => 1 + count_nodes(left) + count_nodes(right),
835 }
836}
837
838fn weight_default(w: Weight) -> f64 {
839 match w {
840 Weight::A => 1.0,
841 Weight::B => 0.4,
842 Weight::C => 0.2,
843 Weight::D => 0.1,
844 }
845}
846
847pub fn fn_ts_rank(tsv: &[u8], tsq: &[u8], norm: i64) -> Result<crate::types::Value> {
848 let entries = decode_entries(tsv)?;
849 let ast = TsQueryAst::decode(tsq)?;
850 let mut score = 0.0_f64;
851 accumulate_rank(&ast, &entries, &mut score, true);
852 score = apply_norm(score, &entries, norm);
853 Ok(crate::types::Value::Real(score))
854}
855
856pub fn fn_ts_rank_cd(tsv: &[u8], tsq: &[u8], norm: i64) -> Result<crate::types::Value> {
857 let entries = decode_entries(tsv)?;
858 let ast = TsQueryAst::decode(tsq)?;
859 let mut atom_lists: Vec<Vec<u16>> = Vec::new();
860 collect_positive_atoms(&ast, &entries, &mut atom_lists);
861 if atom_lists.is_empty() || atom_lists.iter().any(|l| l.is_empty()) {
862 return Ok(crate::types::Value::Real(0.0));
863 }
864 let score = shortest_cover_score(&atom_lists);
865 let score = apply_norm(score, &entries, norm);
866 Ok(crate::types::Value::Real(score))
867}
868
869fn decode_entries(tsv: &[u8]) -> Result<Vec<(Vec<u8>, Vec<u16>)>> {
870 let (_flags, reader) = TsVectorReader::open(tsv)?;
871 let mut out = Vec::new();
872 for item in reader {
873 let (lex, positions) = item?;
874 out.push((lex.to_vec(), positions));
875 }
876 Ok(out)
877}
878
879fn accumulate_rank(
880 ast: &TsQueryAst,
881 entries: &[(Vec<u8>, Vec<u16>)],
882 out: &mut f64,
883 positive: bool,
884) {
885 match ast {
886 TsQueryAst::Lexeme {
887 lexeme,
888 weight_mask,
889 prefix,
890 } => {
891 if !positive {
892 return;
893 }
894 let positions = collect_lex_positions(entries, lexeme, *weight_mask, *prefix);
895 if positions.is_empty() {
896 return;
897 }
898 let weight_sum: f64 = positions
899 .iter()
900 .map(|&p| weight_default(Weight::from_bits(p)))
901 .sum();
902 let tf = (positions.len() as f64).ln_1p();
903 *out += weight_sum * (1.0 + tf);
904 }
905 TsQueryAst::And(l, r) | TsQueryAst::Or(l, r) => {
906 accumulate_rank(l, entries, out, positive);
907 accumulate_rank(r, entries, out, positive);
908 }
909 TsQueryAst::Not(c) => accumulate_rank(c, entries, out, !positive),
910 TsQueryAst::Phrase { left, right, .. } => {
911 accumulate_rank(left, entries, out, positive);
912 accumulate_rank(right, entries, out, positive);
913 }
914 }
915}
916
917fn collect_positive_atoms(
918 ast: &TsQueryAst,
919 entries: &[(Vec<u8>, Vec<u16>)],
920 out: &mut Vec<Vec<u16>>,
921) {
922 match ast {
923 TsQueryAst::Lexeme {
924 lexeme,
925 weight_mask,
926 prefix,
927 } => {
928 let positions = collect_lex_positions(entries, lexeme, *weight_mask, *prefix);
929 out.push(positions);
930 }
931 TsQueryAst::And(l, r)
932 | TsQueryAst::Or(l, r)
933 | TsQueryAst::Phrase {
934 left: l, right: r, ..
935 } => {
936 collect_positive_atoms(l, entries, out);
937 collect_positive_atoms(r, entries, out);
938 }
939 TsQueryAst::Not(_) => {} }
941}
942
943fn shortest_cover_score(atom_lists: &[Vec<u16>]) -> f64 {
944 if atom_lists.is_empty() {
945 return 0.0;
946 }
947 let mut events: Vec<(u16, usize, u16)> = Vec::new();
948 for (i, list) in atom_lists.iter().enumerate() {
949 for &packed in list {
950 let (pos, _w) = unpack_position(packed);
951 events.push((pos, i, packed));
952 }
953 }
954 events.sort_unstable_by_key(|e| e.0);
955
956 let k = atom_lists.len();
957 let mut count_per_atom = vec![0usize; k];
958 let mut covered_count = 0usize;
959 let mut best_score = 0.0_f64;
960 let mut l = 0usize;
961 for r in 0..events.len() {
962 let ai = events[r].1;
963 if count_per_atom[ai] == 0 {
964 covered_count += 1;
965 }
966 count_per_atom[ai] += 1;
967 while covered_count == k {
968 let window_len = (events[r].0 - events[l].0 + 1) as f64;
969 let weight_sum: f64 = events[l..=r]
970 .iter()
971 .map(|e| weight_default(Weight::from_bits(e.2)))
972 .sum();
973 let candidate = weight_sum / window_len;
974 if candidate > best_score {
975 best_score = candidate;
976 }
977 let la = events[l].1;
978 count_per_atom[la] -= 1;
979 if count_per_atom[la] == 0 {
980 covered_count -= 1;
981 }
982 l += 1;
983 }
984 }
985 best_score
986}
987
/// Applies the rank normalization selected by the bit flags in `norm`.
///
/// `doc_len` is the total number of stored positions and `unique` the number
/// of lexemes. Flags are applied in this order:
/// * `1`  — divide by `1 + ln(doc_len)` (only when `doc_len > 1`);
/// * `2`  — divide by `doc_len` (only when non-zero);
/// * `8`  — divide by `unique` (only when non-zero);
/// * `16` — divide by `1 + ln(unique)` (only when `unique > 1`);
/// * `32` — replace the score with `score / (score + 1)`.
/// Other bits are ignored.
fn apply_norm(mut score: f64, entries: &[(Vec<u8>, Vec<u16>)], norm: i64) -> f64 {
    let wants = |bit: i64| norm & bit != 0;
    let doc_len = entries
        .iter()
        .map(|(_, positions)| positions.len())
        .sum::<usize>() as f64;
    let unique = entries.len() as f64;

    if wants(1) && doc_len > 1.0 {
        score /= 1.0 + doc_len.ln();
    }
    if wants(2) && doc_len > 0.0 {
        score /= doc_len;
    }
    if wants(8) && unique > 0.0 {
        score /= unique;
    }
    if wants(16) && unique > 1.0 {
        score /= 1.0 + unique.ln();
    }
    if wants(32) {
        score /= score + 1.0;
    }
    score
}
1008
/// One word produced by `tokenize`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Token {
    /// Normalized (and, for English, stemmed) word; empty for stop-words.
    pub lexeme: String,
    /// 1-based word position, or `MAX_POSITION + 1` for every word beyond
    /// `MAX_POSITION`.
    pub position: u16,
    /// `true` when the word was recognized as a stop-word.
    pub stopped: bool,
}
1015
1016#[derive(Debug, Clone, Copy, PartialEq, Eq)]
1017pub enum TokenizerKind {
1018 Simple,
1019 English,
1020}
1021
1022impl TokenizerKind {
1023 pub fn from_name(name: &str) -> Result<Self> {
1024 match name.to_ascii_lowercase().as_str() {
1025 "simple" => Ok(TokenizerKind::Simple),
1026 "english" | "pg_catalog.english" => Ok(TokenizerKind::English),
1027 other => Err(SqlError::Unsupported(format!(
1028 "unknown text search configuration: {other}"
1029 ))),
1030 }
1031 }
1032
1033 pub fn as_config_id(self) -> u8 {
1034 match self {
1035 TokenizerKind::Simple => 0,
1036 TokenizerKind::English => 1,
1037 }
1038 }
1039
1040 pub fn from_config_id(id: u8) -> Result<Self> {
1041 match id {
1042 0 => Ok(TokenizerKind::Simple),
1043 1 => Ok(TokenizerKind::English),
1044 _ => Err(SqlError::InvalidValue(format!(
1045 "unknown FTS config_id: {id}"
1046 ))),
1047 }
1048 }
1049}
1050
/// English stop-words, one row per initial letter.
///
/// The list MUST stay in ascending byte order because `is_english_stopword`
/// looks words up with a binary search.
#[rustfmt::skip]
const ENGLISH_STOP_WORDS: &[&str] = &[
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "can",
    "did", "do", "does", "doing", "don", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how",
    "i", "if", "in", "into", "is", "it", "its", "itself",
    "just",
    "me", "more", "most", "my", "myself",
    "no", "nor", "not", "now",
    "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own",
    "s", "same", "she", "should", "so", "some", "such",
    "t", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they",
    "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with",
    "you", "your", "yours", "yourself", "yourselves",
];
1181
1182fn is_english_stopword(word: &str) -> bool {
1183 ENGLISH_STOP_WORDS.binary_search(&word).is_ok()
1184}
1185
1186pub fn tokenize(kind: TokenizerKind, text: &str) -> Vec<Token> {
1187 use unicode_normalization::UnicodeNormalization;
1188 use unicode_segmentation::UnicodeSegmentation;
1189
1190 let normalized: String = text.nfkc().collect();
1191 let lowered = normalized.to_lowercase();
1192
1193 let mut out = Vec::new();
1194 let mut position: u32 = 0;
1195 for word in lowered.unicode_words() {
1196 position += 1;
1197 let pos_u16 = if position <= MAX_POSITION as u32 {
1198 position as u16
1199 } else {
1200 MAX_POSITION + 1 };
1202 let mut stopped = false;
1203 let lexeme = match kind {
1204 TokenizerKind::Simple => word.to_string(),
1205 TokenizerKind::English => {
1206 if is_english_stopword(word) {
1207 stopped = true;
1208 String::new()
1209 } else {
1210 tantivy_stemmers::algorithms::english_porter_2(word).into_owned()
1211 }
1212 }
1213 };
1214 if lexeme.is_empty() && !stopped {
1215 position -= 1;
1216 continue;
1217 }
1218 out.push(Token {
1219 lexeme,
1220 position: pos_u16,
1221 stopped,
1222 });
1223 }
1224 out
1225}
1226
1227fn stem_one(kind: TokenizerKind, word: &str) -> Option<String> {
1228 use unicode_normalization::UnicodeNormalization;
1229 let normalized: String = word.nfkc().collect();
1230 let lowered = normalized.to_lowercase();
1231 match kind {
1232 TokenizerKind::Simple => Some(lowered),
1233 TokenizerKind::English => {
1234 if is_english_stopword(&lowered) {
1235 None
1236 } else {
1237 Some(tantivy_stemmers::algorithms::english_porter_2(&lowered).into_owned())
1238 }
1239 }
1240 }
1241}
1242
1243pub fn fn_to_tsvector_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1244 let tokens = tokenize(kind, text);
1245 let mut b = TsVectorBuilder::new();
1246 for t in tokens {
1247 if t.stopped {
1248 continue;
1249 }
1250 b.push(t.lexeme.as_bytes(), t.position, Weight::D);
1251 }
1252 Ok(crate::types::Value::TsVector(b.build()))
1253}
1254
1255pub fn fn_to_tsvector(text: &str) -> Result<crate::types::Value> {
1256 fn_to_tsvector_with(TokenizerKind::English, text)
1257}
1258
1259pub fn fn_to_tsquery_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1260 let raw = parse_tsquery(text)?;
1261 let stemmed = stem_ast(&raw, kind)?;
1262 Ok(crate::types::Value::TsQuery(stemmed.encode()))
1263}
1264
1265fn stem_ast(ast: &TsQueryAst, kind: TokenizerKind) -> Result<TsQueryAst> {
1266 Ok(match ast {
1267 TsQueryAst::Lexeme {
1268 lexeme,
1269 weight_mask,
1270 prefix,
1271 } => {
1272 let s = std::str::from_utf8(lexeme)
1273 .map_err(|_| SqlError::InvalidValue("tsquery lexeme has invalid UTF-8".into()))?;
1274 let stemmed = stem_one(kind, s).ok_or_else(|| {
1275 SqlError::InvalidValue(format!("tsquery: lexeme '{s}' is a stop-word"))
1276 })?;
1277 TsQueryAst::Lexeme {
1278 lexeme: stemmed.into_bytes(),
1279 weight_mask: *weight_mask,
1280 prefix: *prefix,
1281 }
1282 }
1283 TsQueryAst::And(l, r) => {
1284 TsQueryAst::And(Box::new(stem_ast(l, kind)?), Box::new(stem_ast(r, kind)?))
1285 }
1286 TsQueryAst::Or(l, r) => {
1287 TsQueryAst::Or(Box::new(stem_ast(l, kind)?), Box::new(stem_ast(r, kind)?))
1288 }
1289 TsQueryAst::Not(c) => TsQueryAst::Not(Box::new(stem_ast(c, kind)?)),
1290 TsQueryAst::Phrase {
1291 distance,
1292 left,
1293 right,
1294 } => TsQueryAst::Phrase {
1295 distance: *distance,
1296 left: Box::new(stem_ast(left, kind)?),
1297 right: Box::new(stem_ast(right, kind)?),
1298 },
1299 })
1300}
1301
1302pub fn fn_plainto_tsquery_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1303 let tokens = tokenize(kind, text);
1304 let lexemes: Vec<Vec<u8>> = tokens
1305 .into_iter()
1306 .filter(|t| !t.stopped && !t.lexeme.is_empty())
1307 .map(|t| t.lexeme.into_bytes())
1308 .collect();
1309 let ast = and_chain(&lexemes)?;
1310 Ok(crate::types::Value::TsQuery(ast.encode()))
1311}
1312
1313fn and_chain(lexemes: &[Vec<u8>]) -> Result<TsQueryAst> {
1314 if lexemes.is_empty() {
1315 return Err(SqlError::InvalidValue(
1316 "tsquery would be empty (input had only stop-words?)".into(),
1317 ));
1318 }
1319 let mut iter = lexemes.iter().map(|l| TsQueryAst::Lexeme {
1320 lexeme: l.clone(),
1321 weight_mask: 0,
1322 prefix: false,
1323 });
1324 let first = iter.next().unwrap();
1325 Ok(iter.fold(first, |acc, x| TsQueryAst::And(Box::new(acc), Box::new(x))))
1326}
1327
1328pub fn fn_phraseto_tsquery_with(kind: TokenizerKind, text: &str) -> Result<crate::types::Value> {
1329 let tokens = tokenize(kind, text);
1330 let mut lex_positions: Vec<(Vec<u8>, u16)> = Vec::new();
1331 for t in tokens {
1332 if t.stopped || t.lexeme.is_empty() {
1333 continue;
1334 }
1335 lex_positions.push((t.lexeme.into_bytes(), t.position));
1336 }
1337 if lex_positions.is_empty() {
1338 return Err(SqlError::InvalidValue(
1339 "tsquery would be empty (input had only stop-words?)".into(),
1340 ));
1341 }
1342 let mut iter = lex_positions.into_iter();
1343 let (first_lex, mut prev_pos) = iter.next().unwrap();
1344 let mut acc = TsQueryAst::Lexeme {
1345 lexeme: first_lex,
1346 weight_mask: 0,
1347 prefix: false,
1348 };
1349 for (lex, pos) in iter {
1350 let dist = pos.saturating_sub(prev_pos).max(1);
1351 let right = TsQueryAst::Lexeme {
1352 lexeme: lex,
1353 weight_mask: 0,
1354 prefix: false,
1355 };
1356 acc = TsQueryAst::Phrase {
1357 distance: dist,
1358 left: Box::new(acc),
1359 right: Box::new(right),
1360 };
1361 prev_pos = pos;
1362 }
1363 Ok(crate::types::Value::TsQuery(acc.encode()))
1364}
1365
1366pub fn fn_websearch_to_tsquery_with(
1367 kind: TokenizerKind,
1368 text: &str,
1369) -> Result<crate::types::Value> {
1370 let mut groups: Vec<TsQueryAst> = Vec::new();
1371 let mut current_terms: Vec<TsQueryAst> = Vec::new();
1372 let mut cursor = 0usize;
1373 let bytes = text.as_bytes();
1374
1375 let flush_group = |terms: &mut Vec<TsQueryAst>, groups: &mut Vec<TsQueryAst>| {
1376 if terms.is_empty() {
1377 return;
1378 }
1379 let mut iter = std::mem::take(terms).into_iter();
1380 let first = iter.next().unwrap();
1381 let combined = iter.fold(first, |acc, x| TsQueryAst::And(Box::new(acc), Box::new(x)));
1382 groups.push(combined);
1383 };
1384
1385 while cursor < bytes.len() {
1386 while cursor < bytes.len() && bytes[cursor].is_ascii_whitespace() {
1387 cursor += 1;
1388 }
1389 if cursor >= bytes.len() {
1390 break;
1391 }
1392 let negate = if bytes[cursor] == b'-' {
1393 cursor += 1;
1394 true
1395 } else {
1396 false
1397 };
1398 if cursor < bytes.len() && bytes[cursor] == b'"' {
1399 cursor += 1;
1400 let start = cursor;
1401 while cursor < bytes.len() && bytes[cursor] != b'"' {
1402 cursor += 1;
1403 }
1404 let inner = &text[start..cursor];
1405 if cursor < bytes.len() {
1406 cursor += 1; }
1408 if let Ok(crate::types::Value::TsQuery(q)) = fn_phraseto_tsquery_with(kind, inner) {
1409 let mut ast = TsQueryAst::decode(&q)?;
1410 if negate {
1411 ast = TsQueryAst::Not(Box::new(ast));
1412 }
1413 current_terms.push(ast);
1414 }
1415 continue;
1416 }
1417 let start = cursor;
1418 while cursor < bytes.len() && !bytes[cursor].is_ascii_whitespace() {
1419 cursor += 1;
1420 }
1421 let word = &text[start..cursor];
1422 if word.eq_ignore_ascii_case("or") {
1423 flush_group(&mut current_terms, &mut groups);
1424 continue;
1425 }
1426 if let Some(stemmed) = stem_one(kind, word) {
1427 let mut ast = TsQueryAst::Lexeme {
1428 lexeme: stemmed.into_bytes(),
1429 weight_mask: 0,
1430 prefix: false,
1431 };
1432 if negate {
1433 ast = TsQueryAst::Not(Box::new(ast));
1434 }
1435 current_terms.push(ast);
1436 }
1437 }
1438 flush_group(&mut current_terms, &mut groups);
1439 if groups.is_empty() {
1440 return Err(SqlError::InvalidValue(
1441 "tsquery would be empty (input had only stop-words?)".into(),
1442 ));
1443 }
1444 let mut iter = groups.into_iter();
1445 let first = iter.next().unwrap();
1446 let combined = iter.fold(first, |acc, x| TsQueryAst::Or(Box::new(acc), Box::new(x)));
1447 Ok(crate::types::Value::TsQuery(combined.encode()))
1448}
1449
1450pub fn fn_ts_headline_with(
1451 kind: TokenizerKind,
1452 text: &str,
1453 tsq_bytes: &[u8],
1454) -> Result<crate::types::Value> {
1455 use unicode_segmentation::UnicodeSegmentation;
1456 let ast = TsQueryAst::decode(tsq_bytes)?;
1457 let positive_lexemes = collect_query_atoms(&ast);
1458 let mut out = String::with_capacity(text.len() + 16);
1459 let mut last_end = 0usize;
1460 for (idx, word) in text.split_word_bound_indices() {
1461 let word_lower: String = word.to_lowercase();
1462 let stemmed = stem_one(kind, &word_lower);
1463 let matched = stemmed.as_ref().is_some_and(|s| {
1464 positive_lexemes
1465 .iter()
1466 .any(|q| q == s.as_bytes() || word_lower.as_bytes() == q)
1467 });
1468 if matched {
1469 out.push_str(&text[last_end..idx]);
1470 out.push_str("<b>");
1471 out.push_str(word);
1472 out.push_str("</b>");
1473 last_end = idx + word.len();
1474 }
1475 }
1476 out.push_str(&text[last_end..]);
1477 Ok(crate::types::Value::Text(out.into()))
1478}
1479
1480fn collect_query_atoms(ast: &TsQueryAst) -> Vec<Vec<u8>> {
1481 let mut out = Vec::new();
1482 fn walk(ast: &TsQueryAst, positive: bool, out: &mut Vec<Vec<u8>>) {
1483 match ast {
1484 TsQueryAst::Lexeme { lexeme, .. } => {
1485 if positive {
1486 out.push(lexeme.clone());
1487 }
1488 }
1489 TsQueryAst::And(l, r) | TsQueryAst::Or(l, r) => {
1490 walk(l, positive, out);
1491 walk(r, positive, out);
1492 }
1493 TsQueryAst::Not(c) => walk(c, !positive, out),
1494 TsQueryAst::Phrase { left, right, .. } => {
1495 walk(left, positive, out);
1496 walk(right, positive, out);
1497 }
1498 }
1499 }
1500 walk(ast, true, &mut out);
1501 out
1502}
1503
1504pub fn fn_ts_lexize_with(kind: TokenizerKind, word: &str) -> Result<crate::types::Value> {
1505 match stem_one(kind, word) {
1506 Some(s) => Ok(crate::types::Value::Text(s.into())),
1507 None => Ok(crate::types::Value::Null),
1508 }
1509}
1510
1511pub fn fn_to_tsquery(text: &str) -> Result<crate::types::Value> {
1512 fn_to_tsquery_with(TokenizerKind::English, text)
1513}
1514
1515pub fn fn_plainto_tsquery(text: &str) -> Result<crate::types::Value> {
1516 fn_plainto_tsquery_with(TokenizerKind::English, text)
1517}
1518
1519pub fn fn_phraseto_tsquery(text: &str) -> Result<crate::types::Value> {
1520 fn_phraseto_tsquery_with(TokenizerKind::English, text)
1521}
1522
1523pub fn fn_websearch_to_tsquery(text: &str) -> Result<crate::types::Value> {
1524 fn_websearch_to_tsquery_with(TokenizerKind::English, text)
1525}
1526
1527pub fn fn_ts_headline(text: &str, tsq: &[u8]) -> Result<crate::types::Value> {
1528 fn_ts_headline_with(TokenizerKind::English, text, tsq)
1529}
1530
1531pub fn fn_ts_lexize(word: &str) -> Result<crate::types::Value> {
1532 fn_ts_lexize_with(TokenizerKind::English, word)
1533}
1534
1535pub fn parse_weight_char(s: &str) -> Result<Weight> {
1536 let bytes = s.as_bytes();
1537 if bytes.len() == 1 {
1538 match bytes[0].to_ascii_uppercase() {
1539 b'A' => return Ok(Weight::A),
1540 b'B' => return Ok(Weight::B),
1541 b'C' => return Ok(Weight::C),
1542 b'D' => return Ok(Weight::D),
1543 _ => {}
1544 }
1545 }
1546 Err(SqlError::InvalidValue(format!(
1547 "unrecognized weight: {}",
1548 bytes.first().copied().unwrap_or(0)
1549 )))
1550}
1551
1552pub fn fn_setweight(tsv: &[u8], weight: Weight) -> Result<crate::types::Value> {
1553 let (_flags, reader) = TsVectorReader::open(tsv)?;
1554 let mut b = TsVectorBuilder::new();
1555 for item in reader {
1556 let (lex, positions) = item?;
1557 if positions.is_empty() {
1558 b.push_no_position(lex);
1559 continue;
1560 }
1561 for packed in positions {
1562 let pos = packed & MAX_POSITION;
1563 b.push(lex, pos, weight);
1564 }
1565 }
1566 Ok(crate::types::Value::TsVector(b.build()))
1567}
1568
1569pub fn fn_setweight_selective(
1571 tsv: &[u8],
1572 weight: Weight,
1573 filter: &[crate::types::Value],
1574) -> Result<crate::types::Value> {
1575 let mut filter_set: std::collections::HashSet<Vec<u8>> = std::collections::HashSet::new();
1576 for v in filter {
1577 match v {
1578 crate::types::Value::Text(s) => {
1579 filter_set.insert(s.as_bytes().to_vec());
1580 }
1581 crate::types::Value::Null => continue,
1582 other => {
1583 return Err(crate::error::SqlError::TypeMismatch {
1584 expected: "TEXT[]".into(),
1585 got: other.data_type().to_string(),
1586 });
1587 }
1588 }
1589 }
1590 let (_flags, reader) = TsVectorReader::open(tsv)?;
1591 let mut b = TsVectorBuilder::new();
1592 for item in reader {
1593 let (lex, positions) = item?;
1594 let should_reweight = filter_set.contains(lex);
1595 if positions.is_empty() {
1596 b.push_no_position(lex);
1597 continue;
1598 }
1599 for packed in positions {
1600 let pos = packed & MAX_POSITION;
1601 if should_reweight {
1602 b.push(lex, pos, weight);
1603 } else {
1604 let (_p, w) = unpack_position(packed);
1605 b.push(lex, pos, w);
1606 }
1607 }
1608 }
1609 Ok(crate::types::Value::TsVector(b.build()))
1610}
1611
1612pub fn fn_strip(tsv: &[u8]) -> Result<crate::types::Value> {
1614 let (_flags, reader) = TsVectorReader::open(tsv)?;
1615 let mut b = TsVectorBuilder::new();
1616 for item in reader {
1617 let (lex, _positions) = item?;
1618 b.push_no_position(lex);
1619 }
1620 Ok(crate::types::Value::TsVector(b.build()))
1621}
1622
1623pub fn op_concat(a: &[u8], b: &[u8]) -> Result<crate::types::Value> {
1625 let (_, reader_a) = TsVectorReader::open(a)?;
1626 let (_, reader_b) = TsVectorReader::open(b)?;
1627 let mut builder = TsVectorBuilder::new();
1628 for item in reader_a {
1629 let (lex, positions) = item?;
1630 if positions.is_empty() {
1631 builder.push_no_position(lex);
1632 continue;
1633 }
1634 for packed in positions {
1635 let (pos, w) = unpack_position(packed);
1636 builder.push(lex, pos, w);
1637 }
1638 }
1639 for item in reader_b {
1640 let (lex, positions) = item?;
1641 if positions.is_empty() {
1642 builder.push_no_position(lex);
1643 continue;
1644 }
1645 for packed in positions {
1646 let (pos, w) = unpack_position(packed);
1647 builder.push(lex, pos, w);
1648 }
1649 }
1650 Ok(crate::types::Value::TsVector(builder.build()))
1651}
1652
1653#[cfg(test)]
1654#[path = "fts_tests.rs"]
1655mod tests;