postgres_types_extra/pg_tsquery.rs
1/*
2[0, 0, 0, 5, 2, 3, 1, 0, 0, 99, 97, 116, 0, 2, 2, 1, 0, 0, 114, 97, 116, 0, 1, 0, 0, 102, 97, 116, 0] == 'fat' & 'rat' | 'cat'
3[0, 0, 0, 5, -- 5 items
42, 3, -- operator 3 '|'
5text:entry -> weight -> prefix -> string 'cat' -> terminated by 0
61, 0, 0, 99, 97, 116, 0,
7operator:entry -> operator 2 '&''
82, 2,
9text:entry -> weight -> prefix -> string 'rat' -> terminated by 0
101, 0, 0, 114, 97, 116, 0,
11text:entry -> weight -> prefix -> string 'fat' -> terminated by 0
121, 0, 0, 102, 97, 116, 0]
13*/
14
15/*
16To understand how the raw data in the format | & eleph ! bird & | cat dog lazi is converted to the output 'lazi' & ( 'dog' | 'cat' ) | !'bird' & 'eleph' in PostgreSQL's ts_query, let's break down the process.
17
181. Understanding the Raw Data Format
19The raw data represents a postfix (or Reverse Polish Notation - RPN) expression where operators come after their operands. Here's how the provided string maps to this notation:
20
21Operands: lazi, dog, cat, bird, eleph
22Operators: |, &, ! (OR, AND, NOT)
23The order of the elements indicates how these operands and operators should be combined:
24
25| & eleph ! bird & | cat dog lazi
26lazi, dog, cat, bird, eleph are the individual terms.
27|, &, ! are the logical operators, and their position in the sequence determines how the terms are logically combined.
282. Processing Postfix Notation
29PostgreSQL’s ts_query engine processes the input in postfix notation by following these rules:
30
31Operands (Values) are pushed onto a stack.
32When an operator is encountered, it pops the required number of operands from the stack, applies the operator, and then pushes the result back onto the stack.
33Given the postfix expression | & eleph ! bird & | cat dog lazi, let's break down how PostgreSQL would process it:
34
35Step-by-Step Evaluation
36Push lazi onto the stack.
37Push dog onto the stack.
38Push cat onto the stack.
39Apply | (OR): Pop dog and cat, combine them as 'dog' | 'cat', and push the result back onto the stack.
40Stack: ['lazi', "'dog' | 'cat'"]
41Apply & (AND): Pop lazi and the result of the previous OR operation, combine them as 'lazi' & ('dog' | 'cat'), and push the result back onto the stack.
42Stack: ["'lazi' & ('dog' | 'cat')"]
43Push bird onto the stack.
44Apply ! (NOT): Pop bird, apply NOT, resulting in !'bird', and push the result back onto the stack.
45Stack: ["'lazi' & ('dog' | 'cat')", "!'bird'"]
46Push eleph onto the stack.
47Apply & (AND): Pop !'bird' and eleph, combine them as !'bird' & 'eleph', and push the result back onto the stack.
48Stack: ["'lazi' & ('dog' | 'cat')", "!'bird' & 'eleph'"]
49Apply | (OR): Pop the two previous results from the stack, combine them as ('lazi' & ('dog' | 'cat')) | (!'bird' & 'eleph'), and push the final result back onto the stack.
50Final Result
51The final expression is:
52
53Copy code
54'lazi' & ('dog' | 'cat') | !'bird' & 'eleph'
55This represents how the raw data is evaluated in PostgreSQL's ts_query engine to produce the correct infix expression.
56
573. How PostgreSQL Handles This Internally
58PostgreSQL's tsquery uses an internal tree structure to represent the query. Each node can be an operand (a term in the query) or an operator (AND, OR, NOT).
59When parsing a query, PostgreSQL constructs this tree based on operator precedence and parentheses (if any).
60Infix Conversion: PostgreSQL traverses this tree to convert the internal representation into a human-readable infix expression (with operators placed between operands, and proper parentheses to reflect precedence).
61Summary
62The raw postfix expression | & eleph ! bird & | cat dog lazi is processed by PostgreSQL's ts_query engine, which uses a stack-based approach to apply operators in the correct order.
63This processing ultimately results in the infix expression 'lazi' & ('dog' | 'cat') | !'bird' & 'eleph', correctly reflecting operator precedence and logical grouping.
64This method of parsing and evaluation ensures that the logical relationships between the terms are preserved in the final query representation.
65*/
66
67use bigdecimal::ToPrimitive;
68use byteorder::{NetworkEndian, ReadBytesExt};
69use bytes::BufMut;
70use postgres_types::{FromSql, IsNull, ToSql, Type, to_sql_checked};
71use std::error::Error;
72use std::io::{BufRead, Cursor};
73use std::{fmt, str};
74
75#[derive(Clone, Debug, PartialEq)]
76pub struct PgTsQuery {
77 pub entries: Vec<Entry>,
78}
79
80#[derive(Copy, Clone, Eq, PartialEq, Debug, PartialOrd)]
81pub enum Operators {
82 Not = 1,
83 And = 2,
84 Or = 3,
85 Phrase = 4,
86}
87
88impl From<Operators> for i8 {
89 fn from(val: Operators) -> Self {
90 match val {
91 Operators::Not => 1,
92 Operators::And => 2,
93 Operators::Or => 3,
94 Operators::Phrase => 4,
95 }
96 }
97}
98
99impl TryFrom<i8> for Operators {
100 type Error = Box<dyn Error>;
101
102 fn try_from(value: i8) -> Result<Self, Self::Error> {
103 match value {
104 1 => Ok(Operators::Not),
105 2 => Ok(Operators::And),
106 3 => Ok(Operators::Or),
107 4 => Ok(Operators::Phrase),
108 _ => Err("Invalid type".into()),
109 }
110 }
111}
112
113#[derive(Clone, Debug, Copy, PartialEq)]
114pub struct Operator {
115 pub operator: Operators,
116 pub distance: Option<i16>,
117}
118
119#[allow(dead_code)]
120#[derive(Clone, Debug, PartialEq)]
121pub struct Value {
122 pub weight: u8,
123 pub text: String,
124 pub prefix: u8,
125 pub distance: i16,
126}
127
128#[derive(Clone, Debug, PartialEq)]
129pub enum Entry {
130 Operator(Operator),
131 Value(Value),
132}
133
134#[derive(Copy, Clone, Eq, PartialEq, Debug)]
135pub enum EntryType {
136 Value = 1,
137 Operator = 2,
138}
139
140impl From<EntryType> for u8 {
141 fn from(val: EntryType) -> Self {
142 match val {
143 EntryType::Value => 1,
144 EntryType::Operator => 2,
145 }
146 }
147}
148
149impl TryFrom<u8> for EntryType {
150 type Error = Box<dyn Error>;
151
152 fn try_from(value: u8) -> Result<Self, Self::Error> {
153 match value {
154 0 => Ok(EntryType::Value),
155 2 => Ok(EntryType::Operator),
156 _ => Err("Invalid type".into()),
157 }
158 }
159}
160
161impl<'a> FromSql<'a> for PgTsQuery {
162 fn from_sql(_: &Type, raw: &'a [u8]) -> Result<Self, Box<dyn Error + Sync + Send>> {
163 let ts_query = raw.try_into().unwrap();
164
165 Ok(ts_query)
166 }
167 fn accepts(ty: &Type) -> bool {
168 ty.name() == "tsquery"
169 }
170}
171
172impl fmt::Display for PgTsQuery {
173 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
174 write!(f, "{}", infix_string(self.entries.clone()))?;
175 Ok(())
176 }
177}
178
179impl TryFrom<&[u8]> for PgTsQuery {
180 type Error = Box<dyn Error>;
181
182 /// Decode binary data into [`TsQuery`] based on the binary data format defined in
183 /// https://github.com/postgres/postgres/blob/252dcb32397f64a5e1ceac05b29a271ab19aa960/src/backend/utils/adt/tsquery.c#L1174
184 fn try_from(bytes: &[u8]) -> Result<Self, Self::Error> {
185 let mut reader = Cursor::new(bytes);
186 let size = bytes.len().to_u64().unwrap();
187
188 //First 4 bytes tell you about the content length
189 //eg: 'postgr' & 'data' & 'tobon' = 5
190 //eg: 'postgr' & 'data' == 0003
191 let count = reader.read_u32::<NetworkEndian>()?;
192
193 let mut entries = Vec::<Entry>::with_capacity(count as usize);
194
195 for _ in 0..count {
196 let entry_type = reader.read_u8()?;
197
198 if entry_type == 2 {
199 let operator_type: Operators = reader.read_i8()?.try_into()?;
200 //If the operator is in the last position then error
201 if reader.position() == (size - 1) {
202 return Err("Invalid tsquery: invalid pointer to right operand".into());
203 }
204
205 if operator_type == Operators::Phrase {
206 //If the operator is a phrase i.e. 4 then read the next two bytes as the distance <3>
207 // The below value gives you ['quick' <2> 'fox' & 'lazi' <3> 'dog']
208 //[0, 0, 0, 7, 2, 2, <2, 4, 0, 3,> 1, 0, 0, 100, 111, 103, 0, 1, 0, 0, 108, 97, 122, 105, 0, <2, 4, 0, 2,> 1, 0, 0, 102, 111, 120, 0, 1, 0, 0, 113, 117, 105, 99, 107, 0]
209 entries.push(Entry::Operator(Operator {
210 operator: operator_type,
211 distance: Some(reader.read_i16::<NetworkEndian>()?),
212 }));
213 } else {
214 entries.push(Entry::Operator(Operator {
215 operator: operator_type,
216 distance: None,
217 }));
218 }
219
220 // println!("{:?}", operator_type);
221 } else if entry_type == 1 {
222 // The entry point of the string is '1' byte
223 // ['fat':AB & 'rat':B] = [0, 0, 0, 3, 2, 2, text start--[1, 4, 0, 114, 97, 116, 0,]--text end text start --[1, 12, 0, 102, 97, 116, 0]--text end]
224 // text start--[1, 4, 0, 114, 97, 116, 0,]--text end
225 // the second byte 4 == weight B check table below
226 // text start --[1, 12, 0, 102, 97, 116, 0]--text end
227 // the second byte 12 == weight AB check table below
228 let weight = reader.read_u8()?;
229
230 // [0, 0, 0, 1, ----[1, 0, 1, 112, 111, 115, 116, 103, 114, 0]] == 'postgr':*
231 // the prefix byte 1 = true meaning add * as prefix
232 let prefix = reader.read_u8()?;
233
234 let mut text = String::new().into_bytes();
235 reader.read_until(0, &mut text)?;
236 text.pop();
237 let text_utf8 = str::from_utf8(&text)?;
238
239 //this will form distace telling after how many characters to put the operators
240 let val_len = text.len().to_i16().unwrap();
241
242 if weight > 15 {
243 return Err(format!("Invalid tsquery: invalid weight={weight}").into());
244 }
245 if val_len > ((1 << 11) - 1) {
246 return Err(
247 format!("Invalid tsquery: operand too long: length={val_len}").into(),
248 );
249 }
250
251 entries.push(Entry::Value(Value {
252 weight,
253 text: text_utf8.to_owned(),
254 prefix,
255 distance: val_len + 1,
256 }));
257 }
258 }
259
260 Ok(PgTsQuery { entries })
261 }
262}
263
264fn infix_string(mut entries: Vec<Entry>) -> String {
265 // println!("{:?}", entries);
266 let mut stack: Vec<String> = Vec::new();
267 let len = entries.len().to_u32().unwrap();
268 let mut priority = 0;
269
270 for num in 0..len {
271 match entries.pop() {
272 Some(entry) => match entry {
273 Entry::Operator(op) => match op.operator {
274 Operators::Not => {
275 if let Some(operand) = stack.pop() {
276 stack.push(format!("!{operand}"));
277 priority = 1;
278 }
279 }
280 Operators::And => {
281 if let (Some(right), Some(left)) = (stack.pop(), stack.pop()) {
282 match 2.cmp(&priority) {
283 std::cmp::Ordering::Less => {
284 // println!("Less");
285 stack.push(format!("{left} & {right}"));
286 }
287 std::cmp::Ordering::Equal => {
288 // println!("Equal");
289
290 stack.push(format!("{left} & {right}"));
291 }
292 std::cmp::Ordering::Greater => {
293 // println!("Greater");
294
295 if num == len - 1 {
296 stack.push(format!("{left} & {right}"));
297 } else {
298 stack.push(format!("({left} & {right})"));
299 }
300 }
301 }
302 priority = 2;
303 }
304 }
305 Operators::Or => {
306 if let (Some(right), Some(left)) = (stack.pop(), stack.pop()) {
307 match 3.cmp(&priority) {
308 std::cmp::Ordering::Less => {
309 stack.push(format!("{left} | {right}"));
310 }
311 std::cmp::Ordering::Equal => {
312 stack.push(format!("{left} | {right}"));
313 }
314 std::cmp::Ordering::Greater => {
315 if num == len - 1 {
316 stack.push(format!("{left} | {right}"));
317 } else {
318 stack.push(format!("({left} | {right})"));
319 }
320 }
321 }
322 priority = 3;
323 }
324 }
325
326 Operators::Phrase => {
327 if let (Some(right), Some(left)) = (stack.pop(), stack.pop()) {
328 let op_str = match op.distance {
329 Some(1) | None => "<->".to_string(),
330 Some(distance) => format!("<{distance}>"),
331 };
332 stack.push(format!("{left} {op_str} {right}"));
333 priority = 4;
334 }
335 }
336 },
337 Entry::Value(val) => {
338 // println!("{:?}", val);
339 if val.prefix > 0 {
340 stack.push(format!("'{}':*", val.text));
341 } else if let Some(weight) = weight_to_string(val.weight) {
342 stack.push(format!("'{}':{}", val.text, weight));
343 } else {
344 stack.push(format!("'{}'", val.text));
345 }
346 }
347 },
348 None => {
349 eprintln!("No entry after popped!");
350 }
351 }
352 }
353
354 stack.join(" ").to_string()
355}
356
357fn weight_to_string(weight: u8) -> Option<&'static str> {
358 match weight {
359 0 => None,
360 8 => Some("A"),
361 4 => Some("B"),
362 2 => Some("C"),
363 1 => Some("D"),
364 12 => Some("AB"),
365 10 => Some("AC"),
366 9 => Some("AD"),
367 6 => Some("BC"),
368 5 => Some("BD"),
369 3 => Some("CD"),
370 14 => Some("ABC"),
371 13 => Some("ABD"),
372 11 => Some("ACD"),
373 7 => Some("BCD"),
374 15 => Some("ABCD"),
375 _ => None,
376 }
377}
378
379impl ToSql for PgTsQuery {
380 fn to_sql(
381 &self,
382 _: &postgres_types::Type,
383 out: &mut bytes::BytesMut,
384 ) -> Result<IsNull, Box<dyn std::error::Error + Sync + Send>> {
385 // Write the count of entries (4 bytes, network endian)
386 out.put_u32(self.entries.len() as u32);
387
388 // Write each entry
389 for entry in &self.entries {
390 match entry {
391 Entry::Operator(op) => {
392 // Entry type: 2 for operator
393 out.put_u8(EntryType::Operator.into());
394
395 // Operator type
396 out.put_i8(op.operator.into());
397
398 // If it's a phrase operator, write the distance
399 if op.operator == Operators::Phrase {
400 out.put_i16(op.distance.unwrap_or(1));
401 }
402 }
403 Entry::Value(val) => {
404 // Entry type: 1 for value
405 out.put_u8(EntryType::Value.into());
406
407 // Weight
408 out.put_u8(val.weight);
409
410 // Prefix flag
411 out.put_u8(val.prefix);
412
413 // Text (null-terminated string)
414 out.put_slice(val.text.as_bytes());
415 out.put_u8(0); // null terminator
416 }
417 }
418 }
419
420 Ok(IsNull::No)
421 }
422
423 fn accepts(ty: &postgres_types::Type) -> bool {
424 ty.name() == "tsquery"
425 }
426
427 to_sql_checked!();
428}