summa_core/components/
summa_document.rs

1use std::net::IpAddr;
2use std::str::{from_utf8, FromStr};
3
4use base64::Engine;
5use serde_json::{json, Value as JsonValue};
6use tantivy::schema::{Facet, FieldType, IntoIpv6Addr, OwnedValue, Schema};
7use tantivy::tokenizer::PreTokenizedString;
8use tantivy::{DateTime, TantivyDocument};
9use time::format_description::well_known::Rfc3339;
10use time::OffsetDateTime;
11
12use crate::errors::{Error, SummaResult, ValidationError};
13use crate::page_rank::quantize_page_rank;
14use crate::utils::current_time;
15
16/// Wrapper for carrying `tantivy::Document` from various sources
17pub enum SummaDocument<'a> {
18    BoundJsonBytes((&'a Schema, &'a [u8])),
19    UnboundJsonBytes(&'a [u8]),
20    TantivyDocument(TantivyDocument),
21}
22
23/// Possible error that may occur while parsing a field value
24/// At this point the JSON is known to be valid.
25#[derive(thiserror::Error, Debug)]
26pub enum ValueParsingError {
27    #[error("overflow_error: <expected: {expected}, got: {json}>")]
28    OverflowError { expected: &'static str, json: JsonValue },
29    #[error("type_error: <expected: {expected}, got: {json}")]
30    TypeError { expected: &'static str, json: JsonValue },
31    #[error("invalid_base64: {base64}")]
32    InvalidBase64 { base64: String },
33    #[error("null_value_error")]
34    NullValueError,
35    #[error("parse_error: {json}, error: {error}")]
36    ParseError { error: String, json: serde_json::Value },
37}
38
39/// Error that may happen when deserializing
40/// a document from JSON.
41#[derive(thiserror::Error, Debug)]
42pub enum DocumentParsingError {
43    /// The payload given is not valid JSON.
44    #[error("The provided string is not valid JSON")]
45    InvalidJson(String),
46    /// One of the value node could not be parsed.
47    #[error("The field '{0:?}' could not be parsed: {1:?}")]
48    ValueError(String, ValueParsingError),
49}
50
51pub fn process_dynamic_fields(schema: &Schema, json_object: &mut serde_json::Map<String, JsonValue>, skip_updated_at_modification: bool) {
52    if schema.get_field("page_rank").is_ok() && schema.get_field("quantized_page_rank").is_ok() {
53        if let Some(page_rank_value) = json_object.get_mut("page_rank") {
54            if let Some(v) = page_rank_value.as_f64() {
55                json_object.insert("quantized_page_rank".to_string(), json!(quantize_page_rank(v)));
56            }
57        }
58    }
59    if schema.get_field("updated_at").is_ok() && !skip_updated_at_modification {
60        json_object.insert("updated_at".to_string(), json!(current_time()));
61    }
62}
63
64/// Parse single json value
65#[inline]
66pub fn value_from_json(field_type: &FieldType, json: JsonValue) -> Result<OwnedValue, ValueParsingError> {
67    match json {
68        JsonValue::String(field_text) => match *field_type {
69            FieldType::Date(_) => {
70                let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339).map_err(|_err| ValueParsingError::TypeError {
71                    expected: "rfc3339 format",
72                    json: JsonValue::String(field_text),
73                })?;
74                Ok(DateTime::from_utc(dt_with_fixed_tz).into())
75            }
76            FieldType::Str(_) => Ok(OwnedValue::Str(field_text)),
77            FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(ValueParsingError::TypeError {
78                expected: "an integer",
79                json: JsonValue::String(field_text),
80            }),
81            FieldType::Bool(_) => Err(ValueParsingError::TypeError {
82                expected: "a boolean",
83                json: JsonValue::String(field_text),
84            }),
85            FieldType::Facet(_) => Ok(OwnedValue::Facet(Facet::from(&field_text))),
86            FieldType::Bytes(_) => base64::engine::general_purpose::STANDARD
87                .decode(&field_text)
88                .map(OwnedValue::Bytes)
89                .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
90            FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
91                expected: "a json object",
92                json: JsonValue::String(field_text),
93            }),
94            FieldType::IpAddr(_) => {
95                let ip_addr: IpAddr = IpAddr::from_str(&field_text).map_err(|err| ValueParsingError::ParseError {
96                    error: err.to_string(),
97                    json: JsonValue::String(field_text),
98                })?;
99                Ok(OwnedValue::IpAddr(ip_addr.into_ipv6_addr()))
100            }
101        },
102        JsonValue::Number(field_val_num) => match field_type {
103            FieldType::I64(_) | FieldType::Date(_) => {
104                if let Some(field_val_i64) = field_val_num.as_i64() {
105                    Ok(OwnedValue::I64(field_val_i64))
106                } else {
107                    Err(ValueParsingError::OverflowError {
108                        expected: "an i64 int",
109                        json: JsonValue::Number(field_val_num),
110                    })
111                }
112            }
113            FieldType::U64(_) => {
114                if let Some(field_val_u64) = field_val_num.as_u64() {
115                    Ok(OwnedValue::U64(field_val_u64))
116                } else {
117                    Err(ValueParsingError::OverflowError {
118                        expected: "u64",
119                        json: JsonValue::Number(field_val_num),
120                    })
121                }
122            }
123            FieldType::F64(_) => {
124                if let Some(field_val_f64) = field_val_num.as_f64() {
125                    Ok(OwnedValue::F64(field_val_f64))
126                } else {
127                    Err(ValueParsingError::OverflowError {
128                        expected: "a f64",
129                        json: JsonValue::Number(field_val_num),
130                    })
131                }
132            }
133            FieldType::Bool(_) => Err(ValueParsingError::TypeError {
134                expected: "a boolean",
135                json: JsonValue::Number(field_val_num),
136            }),
137            FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => Err(ValueParsingError::TypeError {
138                expected: "a string",
139                json: JsonValue::Number(field_val_num),
140            }),
141            FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
142                expected: "a json object",
143                json: JsonValue::Number(field_val_num),
144            }),
145            FieldType::IpAddr(_) => Err(ValueParsingError::TypeError {
146                expected: "a string with an ip addr",
147                json: JsonValue::Number(field_val_num),
148            }),
149        },
150        JsonValue::Object(json_map) => match field_type {
151            FieldType::Str(_) => {
152                if let Ok(tok_str_val) = serde_json::from_value::<PreTokenizedString>(serde_json::Value::Object(json_map.clone())) {
153                    Ok(OwnedValue::PreTokStr(tok_str_val))
154                } else {
155                    Err(ValueParsingError::TypeError {
156                        expected: "a string or an pretokenized string",
157                        json: JsonValue::Object(json_map),
158                    })
159                }
160            }
161            FieldType::JsonObject(_) => Ok(OwnedValue::from(json_map)),
162            _ => Err(ValueParsingError::TypeError {
163                expected: field_type.value_type().name(),
164                json: JsonValue::Object(json_map),
165            }),
166        },
167        JsonValue::Bool(json_bool_val) => match field_type {
168            FieldType::Bool(_) => Ok(OwnedValue::Bool(json_bool_val)),
169            _ => Err(ValueParsingError::TypeError {
170                expected: field_type.value_type().name(),
171                json: JsonValue::Bool(json_bool_val),
172            }),
173        },
174        JsonValue::Null => Err(ValueParsingError::NullValueError),
175        _ => Err(ValueParsingError::TypeError {
176            expected: field_type.value_type().name(),
177            json: json.clone(),
178        }),
179    }
180}
181
182impl<'a> SummaDocument<'a> {
183    pub fn bound_with(self, schema: &'a Schema) -> SummaDocument {
184        match self {
185            SummaDocument::UnboundJsonBytes(json_bytes) => SummaDocument::BoundJsonBytes((schema, json_bytes)),
186            SummaDocument::BoundJsonBytes((_, json_bytes)) => SummaDocument::BoundJsonBytes((schema, json_bytes)),
187            other => other,
188        }
189    }
190
191    /// Build a document object from a json-object.
192    pub fn parse_and_setup_document(schema: &Schema, doc_json: &str, skip_updated_at_modification: bool) -> SummaResult<TantivyDocument> {
193        let mut json_obj: serde_json::Map<String, JsonValue> =
194            serde_json::from_str(doc_json).map_err(|_| DocumentParsingError::InvalidJson(doc_json.to_owned()))?;
195        process_dynamic_fields(schema, &mut json_obj, skip_updated_at_modification);
196        Self::json_object_to_doc(schema, json_obj)
197    }
198
199    /// Build a document object from a json-object.
200    pub fn json_object_to_doc(schema: &Schema, json_obj: serde_json::Map<String, JsonValue>) -> SummaResult<TantivyDocument> {
201        let mut doc = TantivyDocument::default();
202        for (field_name, json_value) in json_obj {
203            if let Ok(field) = schema.get_field(&field_name) {
204                let field_entry = schema.get_field_entry(field);
205                let field_type = field_entry.field_type();
206                match json_value {
207                    JsonValue::Array(json_items) => {
208                        for json_item in json_items {
209                            match value_from_json(field_type, json_item) {
210                                Ok(value) => doc.add_field_value(field, &value),
211                                Err(ValueParsingError::NullValueError) => continue,
212                                Err(error) => return Err(DocumentParsingError::ValueError(field_name.to_owned(), error).into()),
213                            }
214                        }
215                    }
216                    _ => match value_from_json(field_type, json_value) {
217                        Ok(value) => doc.add_field_value(field, &value),
218                        Err(ValueParsingError::NullValueError) => continue,
219                        Err(error) => return Err(DocumentParsingError::ValueError(field_name.to_owned(), error).into()),
220                    },
221                }
222            }
223        }
224        Ok(doc)
225    }
226
227    pub fn parse_json_bytes(schema: &Schema, json_bytes: &[u8], skip_updated_at_modification: bool) -> SummaResult<TantivyDocument> {
228        let text_document = from_utf8(json_bytes).map_err(ValidationError::Utf8)?;
229        let parsed_document = Self::parse_and_setup_document(schema, text_document, skip_updated_at_modification)?;
230        Ok(parsed_document)
231    }
232}
233
234impl<'a> TryInto<TantivyDocument> for SummaDocument<'a> {
235    type Error = Error;
236
237    fn try_into(self) -> SummaResult<TantivyDocument> {
238        match self {
239            SummaDocument::BoundJsonBytes((schema, json_bytes)) => Self::parse_json_bytes(schema, json_bytes, false),
240            SummaDocument::UnboundJsonBytes(_) => Err(Error::UnboundDocument),
241            SummaDocument::TantivyDocument(document) => Ok(document),
242        }
243    }
244}
245
246impl<'a> From<&'a Vec<u8>> for SummaDocument<'a> {
247    fn from(v: &'a Vec<u8>) -> Self {
248        SummaDocument::UnboundJsonBytes(v)
249    }
250}