1use std::net::IpAddr;
2use std::str::{from_utf8, FromStr};
3
4use base64::Engine;
5use serde_json::{json, Value as JsonValue};
6use tantivy::schema::{Facet, FieldType, IntoIpv6Addr, OwnedValue, Schema};
7use tantivy::tokenizer::PreTokenizedString;
8use tantivy::{DateTime, TantivyDocument};
9use time::format_description::well_known::Rfc3339;
10use time::OffsetDateTime;
11
12use crate::errors::{Error, SummaResult, ValidationError};
13use crate::page_rank::quantize_page_rank;
14use crate::utils::current_time;
15
16pub enum SummaDocument<'a> {
18 BoundJsonBytes((&'a Schema, &'a [u8])),
19 UnboundJsonBytes(&'a [u8]),
20 TantivyDocument(TantivyDocument),
21}
22
23#[derive(thiserror::Error, Debug)]
26pub enum ValueParsingError {
27 #[error("overflow_error: <expected: {expected}, got: {json}>")]
28 OverflowError { expected: &'static str, json: JsonValue },
29 #[error("type_error: <expected: {expected}, got: {json}")]
30 TypeError { expected: &'static str, json: JsonValue },
31 #[error("invalid_base64: {base64}")]
32 InvalidBase64 { base64: String },
33 #[error("null_value_error")]
34 NullValueError,
35 #[error("parse_error: {json}, error: {error}")]
36 ParseError { error: String, json: serde_json::Value },
37}
38
39#[derive(thiserror::Error, Debug)]
42pub enum DocumentParsingError {
43 #[error("The provided string is not valid JSON")]
45 InvalidJson(String),
46 #[error("The field '{0:?}' could not be parsed: {1:?}")]
48 ValueError(String, ValueParsingError),
49}
50
51pub fn process_dynamic_fields(schema: &Schema, json_object: &mut serde_json::Map<String, JsonValue>, skip_updated_at_modification: bool) {
52 if schema.get_field("page_rank").is_ok() && schema.get_field("quantized_page_rank").is_ok() {
53 if let Some(page_rank_value) = json_object.get_mut("page_rank") {
54 if let Some(v) = page_rank_value.as_f64() {
55 json_object.insert("quantized_page_rank".to_string(), json!(quantize_page_rank(v)));
56 }
57 }
58 }
59 if schema.get_field("updated_at").is_ok() && !skip_updated_at_modification {
60 json_object.insert("updated_at".to_string(), json!(current_time()));
61 }
62}
63
64#[inline]
66pub fn value_from_json(field_type: &FieldType, json: JsonValue) -> Result<OwnedValue, ValueParsingError> {
67 match json {
68 JsonValue::String(field_text) => match *field_type {
69 FieldType::Date(_) => {
70 let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339).map_err(|_err| ValueParsingError::TypeError {
71 expected: "rfc3339 format",
72 json: JsonValue::String(field_text),
73 })?;
74 Ok(DateTime::from_utc(dt_with_fixed_tz).into())
75 }
76 FieldType::Str(_) => Ok(OwnedValue::Str(field_text)),
77 FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => Err(ValueParsingError::TypeError {
78 expected: "an integer",
79 json: JsonValue::String(field_text),
80 }),
81 FieldType::Bool(_) => Err(ValueParsingError::TypeError {
82 expected: "a boolean",
83 json: JsonValue::String(field_text),
84 }),
85 FieldType::Facet(_) => Ok(OwnedValue::Facet(Facet::from(&field_text))),
86 FieldType::Bytes(_) => base64::engine::general_purpose::STANDARD
87 .decode(&field_text)
88 .map(OwnedValue::Bytes)
89 .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
90 FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
91 expected: "a json object",
92 json: JsonValue::String(field_text),
93 }),
94 FieldType::IpAddr(_) => {
95 let ip_addr: IpAddr = IpAddr::from_str(&field_text).map_err(|err| ValueParsingError::ParseError {
96 error: err.to_string(),
97 json: JsonValue::String(field_text),
98 })?;
99 Ok(OwnedValue::IpAddr(ip_addr.into_ipv6_addr()))
100 }
101 },
102 JsonValue::Number(field_val_num) => match field_type {
103 FieldType::I64(_) | FieldType::Date(_) => {
104 if let Some(field_val_i64) = field_val_num.as_i64() {
105 Ok(OwnedValue::I64(field_val_i64))
106 } else {
107 Err(ValueParsingError::OverflowError {
108 expected: "an i64 int",
109 json: JsonValue::Number(field_val_num),
110 })
111 }
112 }
113 FieldType::U64(_) => {
114 if let Some(field_val_u64) = field_val_num.as_u64() {
115 Ok(OwnedValue::U64(field_val_u64))
116 } else {
117 Err(ValueParsingError::OverflowError {
118 expected: "u64",
119 json: JsonValue::Number(field_val_num),
120 })
121 }
122 }
123 FieldType::F64(_) => {
124 if let Some(field_val_f64) = field_val_num.as_f64() {
125 Ok(OwnedValue::F64(field_val_f64))
126 } else {
127 Err(ValueParsingError::OverflowError {
128 expected: "a f64",
129 json: JsonValue::Number(field_val_num),
130 })
131 }
132 }
133 FieldType::Bool(_) => Err(ValueParsingError::TypeError {
134 expected: "a boolean",
135 json: JsonValue::Number(field_val_num),
136 }),
137 FieldType::Str(_) | FieldType::Facet(_) | FieldType::Bytes(_) => Err(ValueParsingError::TypeError {
138 expected: "a string",
139 json: JsonValue::Number(field_val_num),
140 }),
141 FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
142 expected: "a json object",
143 json: JsonValue::Number(field_val_num),
144 }),
145 FieldType::IpAddr(_) => Err(ValueParsingError::TypeError {
146 expected: "a string with an ip addr",
147 json: JsonValue::Number(field_val_num),
148 }),
149 },
150 JsonValue::Object(json_map) => match field_type {
151 FieldType::Str(_) => {
152 if let Ok(tok_str_val) = serde_json::from_value::<PreTokenizedString>(serde_json::Value::Object(json_map.clone())) {
153 Ok(OwnedValue::PreTokStr(tok_str_val))
154 } else {
155 Err(ValueParsingError::TypeError {
156 expected: "a string or an pretokenized string",
157 json: JsonValue::Object(json_map),
158 })
159 }
160 }
161 FieldType::JsonObject(_) => Ok(OwnedValue::from(json_map)),
162 _ => Err(ValueParsingError::TypeError {
163 expected: field_type.value_type().name(),
164 json: JsonValue::Object(json_map),
165 }),
166 },
167 JsonValue::Bool(json_bool_val) => match field_type {
168 FieldType::Bool(_) => Ok(OwnedValue::Bool(json_bool_val)),
169 _ => Err(ValueParsingError::TypeError {
170 expected: field_type.value_type().name(),
171 json: JsonValue::Bool(json_bool_val),
172 }),
173 },
174 JsonValue::Null => Err(ValueParsingError::NullValueError),
175 _ => Err(ValueParsingError::TypeError {
176 expected: field_type.value_type().name(),
177 json: json.clone(),
178 }),
179 }
180}
181
182impl<'a> SummaDocument<'a> {
183 pub fn bound_with(self, schema: &'a Schema) -> SummaDocument {
184 match self {
185 SummaDocument::UnboundJsonBytes(json_bytes) => SummaDocument::BoundJsonBytes((schema, json_bytes)),
186 SummaDocument::BoundJsonBytes((_, json_bytes)) => SummaDocument::BoundJsonBytes((schema, json_bytes)),
187 other => other,
188 }
189 }
190
191 pub fn parse_and_setup_document(schema: &Schema, doc_json: &str, skip_updated_at_modification: bool) -> SummaResult<TantivyDocument> {
193 let mut json_obj: serde_json::Map<String, JsonValue> =
194 serde_json::from_str(doc_json).map_err(|_| DocumentParsingError::InvalidJson(doc_json.to_owned()))?;
195 process_dynamic_fields(schema, &mut json_obj, skip_updated_at_modification);
196 Self::json_object_to_doc(schema, json_obj)
197 }
198
199 pub fn json_object_to_doc(schema: &Schema, json_obj: serde_json::Map<String, JsonValue>) -> SummaResult<TantivyDocument> {
201 let mut doc = TantivyDocument::default();
202 for (field_name, json_value) in json_obj {
203 if let Ok(field) = schema.get_field(&field_name) {
204 let field_entry = schema.get_field_entry(field);
205 let field_type = field_entry.field_type();
206 match json_value {
207 JsonValue::Array(json_items) => {
208 for json_item in json_items {
209 match value_from_json(field_type, json_item) {
210 Ok(value) => doc.add_field_value(field, &value),
211 Err(ValueParsingError::NullValueError) => continue,
212 Err(error) => return Err(DocumentParsingError::ValueError(field_name.to_owned(), error).into()),
213 }
214 }
215 }
216 _ => match value_from_json(field_type, json_value) {
217 Ok(value) => doc.add_field_value(field, &value),
218 Err(ValueParsingError::NullValueError) => continue,
219 Err(error) => return Err(DocumentParsingError::ValueError(field_name.to_owned(), error).into()),
220 },
221 }
222 }
223 }
224 Ok(doc)
225 }
226
227 pub fn parse_json_bytes(schema: &Schema, json_bytes: &[u8], skip_updated_at_modification: bool) -> SummaResult<TantivyDocument> {
228 let text_document = from_utf8(json_bytes).map_err(ValidationError::Utf8)?;
229 let parsed_document = Self::parse_and_setup_document(schema, text_document, skip_updated_at_modification)?;
230 Ok(parsed_document)
231 }
232}
233
234impl<'a> TryInto<TantivyDocument> for SummaDocument<'a> {
235 type Error = Error;
236
237 fn try_into(self) -> SummaResult<TantivyDocument> {
238 match self {
239 SummaDocument::BoundJsonBytes((schema, json_bytes)) => Self::parse_json_bytes(schema, json_bytes, false),
240 SummaDocument::UnboundJsonBytes(_) => Err(Error::UnboundDocument),
241 SummaDocument::TantivyDocument(document) => Ok(document),
242 }
243 }
244}
245
246impl<'a> From<&'a Vec<u8>> for SummaDocument<'a> {
247 fn from(v: &'a Vec<u8>) -> Self {
248 SummaDocument::UnboundJsonBytes(v)
249 }
250}