postgres_types_extra/
pg_tsquery.rs

1/*
2[0, 0, 0, 5, 2, 3, 1, 0, 0, 99, 97, 116, 0, 2, 2, 1, 0, 0, 114, 97, 116, 0, 1, 0, 0, 102, 97, 116, 0] == 'fat' & 'rat' | 'cat'
3[0, 0, 0, 5, -- 5 items
42, 3, -- operator 3 '|'
5text:entry -> weight -> prefix -> string 'cat' -> terminated by 0
61, 0, 0, 99, 97, 116, 0,
7operator:entry -> operator 2 '&''
82, 2,
9text:entry -> weight -> prefix -> string 'rat' -> terminated by 0
101, 0, 0, 114, 97, 116, 0,
11text:entry -> weight -> prefix -> string 'fat' -> terminated by 0
121, 0, 0, 102, 97, 116, 0]
13*/
14
15/*
16To understand how the raw data in the format | & eleph ! bird & | cat dog lazi is converted to the output 'lazi' & ( 'dog' | 'cat' ) | !'bird' & 'eleph' in PostgreSQL's ts_query, let's break down the process.
17
181. Understanding the Raw Data Format
19The raw data represents a postfix (or Reverse Polish Notation - RPN) expression where operators come after their operands. Here's how the provided string maps to this notation:
20
21Operands: lazi, dog, cat, bird, eleph
22Operators: |, &, ! (OR, AND, NOT)
23The order of the elements indicates how these operands and operators should be combined:
24
25| & eleph ! bird & | cat dog lazi
26lazi, dog, cat, bird, eleph are the individual terms.
27|, &, ! are the logical operators, and their position in the sequence determines how the terms are logically combined.
282. Processing Postfix Notation
29PostgreSQL’s ts_query engine processes the input in postfix notation by following these rules:
30
31Operands (Values) are pushed onto a stack.
32When an operator is encountered, it pops the required number of operands from the stack, applies the operator, and then pushes the result back onto the stack.
33Given the postfix expression | & eleph ! bird & | cat dog lazi, let's break down how PostgreSQL would process it:
34
35Step-by-Step Evaluation
36Push lazi onto the stack.
37Push dog onto the stack.
38Push cat onto the stack.
39Apply | (OR): Pop dog and cat, combine them as 'dog' | 'cat', and push the result back onto the stack.
40Stack: ['lazi', "'dog' | 'cat'"]
41Apply & (AND): Pop lazi and the result of the previous OR operation, combine them as 'lazi' & ('dog' | 'cat'), and push the result back onto the stack.
42Stack: ["'lazi' & ('dog' | 'cat')"]
43Push bird onto the stack.
44Apply ! (NOT): Pop bird, apply NOT, resulting in !'bird', and push the result back onto the stack.
45Stack: ["'lazi' & ('dog' | 'cat')", "!'bird'"]
46Push eleph onto the stack.
47Apply & (AND): Pop !'bird' and eleph, combine them as !'bird' & 'eleph', and push the result back onto the stack.
48Stack: ["'lazi' & ('dog' | 'cat')", "!'bird' & 'eleph'"]
49Apply | (OR): Pop the two previous results from the stack, combine them as ('lazi' & ('dog' | 'cat')) | (!'bird' & 'eleph'), and push the final result back onto the stack.
50Final Result
51The final expression is:
52
53Copy code
54'lazi' & ('dog' | 'cat') | !'bird' & 'eleph'
55This represents how the raw data is evaluated in PostgreSQL's ts_query engine to produce the correct infix expression.
56
573. How PostgreSQL Handles This Internally
58PostgreSQL's tsquery uses an internal tree structure to represent the query. Each node can be an operand (a term in the query) or an operator (AND, OR, NOT).
59When parsing a query, PostgreSQL constructs this tree based on operator precedence and parentheses (if any).
60Infix Conversion: PostgreSQL traverses this tree to convert the internal representation into a human-readable infix expression (with operators placed between operands, and proper parentheses to reflect precedence).
61Summary
62The raw postfix expression | & eleph ! bird & | cat dog lazi is processed by PostgreSQL's ts_query engine, which uses a stack-based approach to apply operators in the correct order.
63This processing ultimately results in the infix expression 'lazi' & ('dog' | 'cat') | !'bird' & 'eleph', correctly reflecting operator precedence and logical grouping.
64This method of parsing and evaluation ensures that the logical relationships between the terms are preserved in the final query representation.
65*/
66
67use bigdecimal::ToPrimitive;
68use byteorder::{NetworkEndian, ReadBytesExt};
69use bytes::BufMut;
70use postgres_types::{FromSql, IsNull, ToSql, Type, to_sql_checked};
71use std::error::Error;
72use std::io::{BufRead, Cursor};
73use std::{fmt, str};
74
75#[derive(Clone, Debug, PartialEq)]
76pub struct PgTsQuery {
77    pub entries: Vec<Entry>,
78}
79
80#[derive(Copy, Clone, Eq, PartialEq, Debug, PartialOrd)]
81pub enum Operators {
82    Not = 1,
83    And = 2,
84    Or = 3,
85    Phrase = 4,
86}
87
88impl From<Operators> for i8 {
89    fn from(val: Operators) -> Self {
90        match val {
91            Operators::Not => 1,
92            Operators::And => 2,
93            Operators::Or => 3,
94            Operators::Phrase => 4,
95        }
96    }
97}
98
99impl TryFrom<i8> for Operators {
100    type Error = Box<dyn Error>;
101
102    fn try_from(value: i8) -> Result<Self, Self::Error> {
103        match value {
104            1 => Ok(Operators::Not),
105            2 => Ok(Operators::And),
106            3 => Ok(Operators::Or),
107            4 => Ok(Operators::Phrase),
108            _ => Err("Invalid type".into()),
109        }
110    }
111}
112
113#[derive(Clone, Debug, Copy, PartialEq)]
114pub struct Operator {
115    pub operator: Operators,
116    pub distance: Option<i16>,
117}
118
119#[allow(dead_code)]
120#[derive(Clone, Debug, PartialEq)]
121pub struct Value {
122    pub weight: u8,
123    pub text: String,
124    pub prefix: u8,
125    pub distance: i16,
126}
127
128#[derive(Clone, Debug, PartialEq)]
129pub enum Entry {
130    Operator(Operator),
131    Value(Value),
132}
133
134#[derive(Copy, Clone, Eq, PartialEq, Debug)]
135pub enum EntryType {
136    Value = 1,
137    Operator = 2,
138}
139
140impl From<EntryType> for u8 {
141    fn from(val: EntryType) -> Self {
142        match val {
143            EntryType::Value => 1,
144            EntryType::Operator => 2,
145        }
146    }
147}
148
149impl TryFrom<u8> for EntryType {
150    type Error = Box<dyn Error>;
151
152    fn try_from(value: u8) -> Result<Self, Self::Error> {
153        match value {
154            0 => Ok(EntryType::Value),
155            2 => Ok(EntryType::Operator),
156            _ => Err("Invalid type".into()),
157        }
158    }
159}
160
161impl<'a> FromSql<'a> for PgTsQuery {
162    fn from_sql(_: &Type, raw: &'a [u8]) -> Result<Self, Box<dyn Error + Sync + Send>> {
163        let ts_query = raw.try_into().unwrap();
164
165        Ok(ts_query)
166    }
167    fn accepts(ty: &Type) -> bool {
168        ty.name() == "tsquery"
169    }
170}
171
172impl fmt::Display for PgTsQuery {
173    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
174        write!(f, "{}", infix_string(self.entries.clone()))?;
175        Ok(())
176    }
177}
178
179impl TryFrom<&[u8]> for PgTsQuery {
180    type Error = Box<dyn Error>;
181
182    /// Decode binary data into [`TsQuery`] based on the binary data format defined in
183    /// https://github.com/postgres/postgres/blob/252dcb32397f64a5e1ceac05b29a271ab19aa960/src/backend/utils/adt/tsquery.c#L1174
184    fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
185        let mut reader = Cursor::new(bytes);
186        let size = bytes.len().to_u64().unwrap();
187
188        //First 4 bytes tell you about the content length
189        //eg: 'postgr' & 'data' & 'tobon' = 5
190        //eg: 'postgr' & 'data' == 0003
191        let count = reader.read_u32::<NetworkEndian>()?;
192
193        let mut entries = Vec::<Entry>::with_capacity(count as usize);
194
195        for _ in 0..count {
196            let entry_type = reader.read_u8()?;
197
198            if entry_type == 2 {
199                let operator_type: Operators = reader.read_i8()?.try_into()?;
200                //If the operator is in the last position then error
201                if reader.position() == (size - 1) {
202                    return Err("Invalid tsquery: invalid pointer to right operand".into());
203                }
204
205                if operator_type == Operators::Phrase {
206                    //If the operator is a phrase i.e. 4 then read the next two bytes as the distance <3>
207                    // The below value gives you ['quick' <2> 'fox' & 'lazi' <3> 'dog']
208                    //[0, 0, 0, 7, 2, 2, <2, 4, 0, 3,> 1, 0, 0, 100, 111, 103, 0, 1, 0, 0, 108, 97, 122, 105, 0, <2, 4, 0, 2,> 1, 0, 0, 102, 111, 120, 0, 1, 0, 0, 113, 117, 105, 99, 107, 0]
209                    entries.push(Entry::Operator(Operator {
210                        operator: operator_type,
211                        distance: Some(reader.read_i16::<NetworkEndian>()?),
212                    }));
213                } else {
214                    entries.push(Entry::Operator(Operator {
215                        operator: operator_type,
216                        distance: None,
217                    }));
218                }
219
220                // println!("{:?}", operator_type);
221            } else if entry_type == 1 {
222                // The entry point of the string is '1' byte
223                // ['fat':AB & 'rat':B] = [0, 0, 0, 3, 2, 2, text start--[1, 4, 0, 114, 97, 116, 0,]--text end    text start --[1, 12, 0, 102, 97, 116, 0]--text end]
224                // text start--[1, 4, 0, 114, 97, 116, 0,]--text end
225                // the second byte 4 == weight B check table below
226                // text start --[1, 12, 0, 102, 97, 116, 0]--text end
227                // the second byte 12 == weight AB check table below
228                let weight = reader.read_u8()?;
229
230                // [0, 0, 0, 1, ----[1, 0, 1, 112, 111, 115, 116, 103, 114, 0]] == 'postgr':*
231                // the prefix byte 1 = true meaning add * as prefix
232                let prefix = reader.read_u8()?;
233
234                let mut text = String::new().into_bytes();
235                reader.read_until(0, &mut text)?;
236                text.pop();
237                let text_utf8 = str::from_utf8(&text)?;
238
239                //this will form distace telling after how many characters to put the operators
240                let val_len = text.len().to_i16().unwrap();
241
242                if weight > 15 {
243                    return Err(format!("Invalid tsquery: invalid weight={weight}").into());
244                }
245                if val_len > ((1 << 11) - 1) {
246                    return Err(
247                        format!("Invalid tsquery: operand too long: length={val_len}").into(),
248                    );
249                }
250
251                entries.push(Entry::Value(Value {
252                    weight,
253                    text: text_utf8.to_owned(),
254                    prefix,
255                    distance: val_len + 1,
256                }));
257            }
258        }
259
260        Ok(PgTsQuery { entries })
261    }
262}
263
264fn infix_string(mut entries: Vec<Entry>) -> String {
265    // println!("{:?}", entries);
266    let mut stack: Vec<String> = Vec::new();
267    let len = entries.len().to_u32().unwrap();
268    let mut priority = 0;
269
270    for num in 0..len {
271        match entries.pop() {
272            Some(entry) => match entry {
273                Entry::Operator(op) => match op.operator {
274                    Operators::Not => {
275                        if let Some(operand) = stack.pop() {
276                            stack.push(format!("!{operand}"));
277                            priority = 1;
278                        }
279                    }
280                    Operators::And => {
281                        if let (Some(right), Some(left)) = (stack.pop(), stack.pop()) {
282                            match 2.cmp(&priority) {
283                                std::cmp::Ordering::Less => {
284                                    // println!("Less");
285                                    stack.push(format!("{left} & {right}"));
286                                }
287                                std::cmp::Ordering::Equal => {
288                                    // println!("Equal");
289
290                                    stack.push(format!("{left} & {right}"));
291                                }
292                                std::cmp::Ordering::Greater => {
293                                    // println!("Greater");
294
295                                    if num == len - 1 {
296                                        stack.push(format!("{left} & {right}"));
297                                    } else {
298                                        stack.push(format!("({left} & {right})"));
299                                    }
300                                }
301                            }
302                            priority = 2;
303                        }
304                    }
305                    Operators::Or => {
306                        if let (Some(right), Some(left)) = (stack.pop(), stack.pop()) {
307                            match 3.cmp(&priority) {
308                                std::cmp::Ordering::Less => {
309                                    stack.push(format!("{left} | {right}"));
310                                }
311                                std::cmp::Ordering::Equal => {
312                                    stack.push(format!("{left} | {right}"));
313                                }
314                                std::cmp::Ordering::Greater => {
315                                    if num == len - 1 {
316                                        stack.push(format!("{left} | {right}"));
317                                    } else {
318                                        stack.push(format!("({left} | {right})"));
319                                    }
320                                }
321                            }
322                            priority = 3;
323                        }
324                    }
325
326                    Operators::Phrase => {
327                        if let (Some(right), Some(left)) = (stack.pop(), stack.pop()) {
328                            let op_str = match op.distance {
329                                Some(1) | None => "<->".to_string(),
330                                Some(distance) => format!("<{distance}>"),
331                            };
332                            stack.push(format!("{left} {op_str} {right}"));
333                            priority = 4;
334                        }
335                    }
336                },
337                Entry::Value(val) => {
338                    // println!("{:?}", val);
339                    if val.prefix > 0 {
340                        stack.push(format!("'{}':*", val.text));
341                    } else if let Some(weight) = weight_to_string(val.weight) {
342                        stack.push(format!("'{}':{}", val.text, weight));
343                    } else {
344                        stack.push(format!("'{}'", val.text));
345                    }
346                }
347            },
348            None => {
349                eprintln!("No entry after popped!");
350            }
351        }
352    }
353
354    stack.join(" ").to_string()
355}
356
357fn weight_to_string(weight: u8) -> Option<&'static str> {
358    match weight {
359        0 => None,
360        8 => Some("A"),
361        4 => Some("B"),
362        2 => Some("C"),
363        1 => Some("D"),
364        12 => Some("AB"),
365        10 => Some("AC"),
366        9 => Some("AD"),
367        6 => Some("BC"),
368        5 => Some("BD"),
369        3 => Some("CD"),
370        14 => Some("ABC"),
371        13 => Some("ABD"),
372        11 => Some("ACD"),
373        7 => Some("BCD"),
374        15 => Some("ABCD"),
375        _ => None,
376    }
377}
378
379impl ToSql for PgTsQuery {
380    fn to_sql(
381        &self,
382        _: &postgres_types::Type,
383        out: &mut bytes::BytesMut,
384    ) -> Result<IsNull, Box<dyn std::error::Error + Sync + Send>> {
385        // Write the count of entries (4 bytes, network endian)
386        out.put_u32(self.entries.len() as u32);
387
388        // Write each entry
389        for entry in &self.entries {
390            match entry {
391                Entry::Operator(op) => {
392                    // Entry type: 2 for operator
393                    out.put_u8(EntryType::Operator.into());
394
395                    // Operator type
396                    out.put_i8(op.operator.into());
397
398                    // If it's a phrase operator, write the distance
399                    if op.operator == Operators::Phrase {
400                        out.put_i16(op.distance.unwrap_or(1));
401                    }
402                }
403                Entry::Value(val) => {
404                    // Entry type: 1 for value
405                    out.put_u8(EntryType::Value.into());
406
407                    // Weight
408                    out.put_u8(val.weight);
409
410                    // Prefix flag
411                    out.put_u8(val.prefix);
412
413                    // Text (null-terminated string)
414                    out.put_slice(val.text.as_bytes());
415                    out.put_u8(0); // null terminator
416                }
417            }
418        }
419
420        Ok(IsNull::No)
421    }
422
423    fn accepts(ty: &postgres_types::Type) -> bool {
424        ty.name() == "tsquery"
425    }
426
427    to_sql_checked!();
428}