1use std::borrow::Borrow;
2use std::fmt::Write;
3
4use arrow::array::*;
5use arrow::bitmap::BitmapBuilder;
6use arrow::datatypes::{ArrowDataType, IntervalUnit};
7use arrow::offset::{Offset, Offsets};
8use arrow::temporal_conversions;
9use arrow::types::NativeType;
10use num_traits::NumCast;
11use simd_json::{BorrowedValue, StaticNode};
12
13use super::*;
14
/// A JSON `null` value. `deserialize_struct` pushes a reference to it for every
/// field that a row does not provide, and for every field of a `null` row.
const JSON_NULL_VALUE: BorrowedValue = BorrowedValue::Static(StaticNode::Null);
16
17fn deserialize_boolean_into<'a, A: Borrow<BorrowedValue<'a>>>(
18 target: &mut MutableBooleanArray,
19 rows: &[A],
20) -> PolarsResult<()> {
21 let mut err_idx = rows.len();
22 let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
23 BorrowedValue::Static(StaticNode::Bool(v)) => Some(v),
24 BorrowedValue::Static(StaticNode::Null) => None,
25 _ => {
26 err_idx = if err_idx == rows.len() { i } else { err_idx };
27 None
28 },
29 });
30 target.extend_trusted_len(iter);
31 check_err_idx(rows, err_idx, "boolean")
32}
33
34fn deserialize_primitive_into<'a, T: NativeType + NumCast, A: Borrow<BorrowedValue<'a>>>(
35 target: &mut MutablePrimitiveArray<T>,
36 rows: &[A],
37) -> PolarsResult<()> {
38 let mut err_idx = rows.len();
39 let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
40 BorrowedValue::Static(StaticNode::I64(v)) => T::from(*v),
41 BorrowedValue::Static(StaticNode::U64(v)) => T::from(*v),
42 BorrowedValue::Static(StaticNode::F64(v)) => T::from(*v),
43 BorrowedValue::Static(StaticNode::Bool(v)) => T::from(*v as u8),
44 BorrowedValue::Static(StaticNode::Null) => None,
45 _ => {
46 err_idx = if err_idx == rows.len() { i } else { err_idx };
47 None
48 },
49 });
50 target.extend_trusted_len(iter);
51 check_err_idx(rows, err_idx, "numeric")
52}
53
54fn deserialize_binary<'a, A: Borrow<BorrowedValue<'a>>>(
55 rows: &[A],
56) -> PolarsResult<BinaryArray<i64>> {
57 let mut err_idx = rows.len();
58 let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
59 BorrowedValue::String(v) => Some(v.as_bytes()),
60 BorrowedValue::Static(StaticNode::Null) => None,
61 _ => {
62 err_idx = if err_idx == rows.len() { i } else { err_idx };
63 None
64 },
65 });
66 let out = BinaryArray::from_trusted_len_iter(iter);
67 check_err_idx(rows, err_idx, "binary")?;
68 Ok(out)
69}
70
71fn deserialize_utf8_into<'a, O: Offset, A: Borrow<BorrowedValue<'a>>>(
72 target: &mut MutableUtf8Array<O>,
73 rows: &[A],
74) -> PolarsResult<()> {
75 let mut err_idx = rows.len();
76 let mut scratch = String::new();
77 for (i, row) in rows.iter().enumerate() {
78 match row.borrow() {
79 BorrowedValue::String(v) => target.push(Some(v.as_ref())),
80 BorrowedValue::Static(StaticNode::Bool(v)) => {
81 target.push(Some(if *v { "true" } else { "false" }))
82 },
83 BorrowedValue::Static(StaticNode::Null) => target.push_null(),
84 BorrowedValue::Static(node) => {
85 write!(scratch, "{node}").unwrap();
86 target.push(Some(scratch.as_str()));
87 scratch.clear();
88 },
89 _ => {
90 err_idx = if err_idx == rows.len() { i } else { err_idx };
91 },
92 }
93 }
94 check_err_idx(rows, err_idx, "string")
95}
96
97fn deserialize_utf8view_into<'a, A: Borrow<BorrowedValue<'a>>>(
98 target: &mut MutableBinaryViewArray<str>,
99 rows: &[A],
100) -> PolarsResult<()> {
101 let mut err_idx = rows.len();
102 let mut scratch = String::new();
103 for (i, row) in rows.iter().enumerate() {
104 match row.borrow() {
105 BorrowedValue::String(v) => target.push_value(v.as_ref()),
106 BorrowedValue::Static(StaticNode::Bool(v)) => {
107 target.push_value(if *v { "true" } else { "false" })
108 },
109 BorrowedValue::Static(StaticNode::Null) => target.push_null(),
110 BorrowedValue::Static(node) => {
111 write!(scratch, "{node}").unwrap();
112 target.push_value(scratch.as_str());
113 scratch.clear();
114 },
115 _ => {
116 err_idx = if err_idx == rows.len() { i } else { err_idx };
117 },
118 }
119 }
120 check_err_idx(rows, err_idx, "string")
121}
122
123fn deserialize_list<'a, A: Borrow<BorrowedValue<'a>>>(
124 rows: &[A],
125 dtype: ArrowDataType,
126 allow_extra_fields_in_struct: bool,
127) -> PolarsResult<ListArray<i64>> {
128 let mut err_idx = rows.len();
129 let child = ListArray::<i64>::get_child_type(&dtype);
130
131 let mut validity = BitmapBuilder::with_capacity(rows.len());
132 let mut offsets = Offsets::<i64>::with_capacity(rows.len());
133 let mut inner = vec![];
134 rows.iter()
135 .enumerate()
136 .for_each(|(i, row)| match row.borrow() {
137 BorrowedValue::Array(value) => {
138 inner.extend(value.iter());
139 validity.push(true);
140 offsets
141 .try_push(value.len())
142 .expect("List offset is too large :/");
143 },
144 BorrowedValue::Static(StaticNode::Null) => {
145 validity.push(false);
146 offsets.extend_constant(1)
147 },
148 value @ (BorrowedValue::Static(_) | BorrowedValue::String(_)) => {
149 inner.push(value);
150 validity.push(true);
151 offsets.try_push(1).expect("List offset is too large :/");
152 },
153 _ => {
154 err_idx = if err_idx == rows.len() { i } else { err_idx };
155 },
156 });
157
158 check_err_idx(rows, err_idx, "list")?;
159
160 let values = _deserialize(&inner, child.clone(), allow_extra_fields_in_struct)?;
161
162 Ok(ListArray::<i64>::new(
163 dtype,
164 offsets.into(),
165 values,
166 validity.into_opt_validity(),
167 ))
168}
169
170fn deserialize_struct<'a, A: Borrow<BorrowedValue<'a>>>(
171 rows: &[A],
172 dtype: ArrowDataType,
173 allow_extra_fields_in_struct: bool,
174) -> PolarsResult<StructArray> {
175 let mut err_idx = rows.len();
176 let fields = StructArray::get_fields(&dtype);
177
178 let mut out_values = fields
179 .iter()
180 .map(|f| (f.name.as_str(), (f.dtype(), vec![])))
181 .collect::<PlHashMap<_, _>>();
182
183 let mut validity = BitmapBuilder::with_capacity(rows.len());
184 let mut extra_field = None;
186
187 rows.iter().enumerate().for_each(|(i, row)| {
188 match row.borrow() {
189 BorrowedValue::Object(values) => {
190 let mut n_matched = 0usize;
191 for (&key, &mut (_, ref mut inner)) in out_values.iter_mut() {
192 if let Some(v) = values.get(key) {
193 n_matched += 1;
194 inner.push(v)
195 } else {
196 inner.push(&JSON_NULL_VALUE)
197 }
198 }
199
200 validity.push(true);
201
202 if n_matched < values.len() && extra_field.is_none() {
203 for k in values.keys() {
204 if !out_values.contains_key(k.as_ref()) {
205 extra_field = Some(k.as_ref())
206 }
207 }
208 }
209 },
210 BorrowedValue::Static(StaticNode::Null) => {
211 out_values
212 .iter_mut()
213 .for_each(|(_, (_, inner))| inner.push(&JSON_NULL_VALUE));
214 validity.push(false);
215 },
216 _ => {
217 err_idx = if err_idx == rows.len() { i } else { err_idx };
218 },
219 };
220 });
221
222 if let Some(v) = extra_field {
223 if !allow_extra_fields_in_struct {
224 polars_bail!(
225 ComputeError:
226 "extra field in struct data: {}, consider increasing infer_schema_length, or \
227 manually specifying the full schema to ignore extra fields",
228 v
229 )
230 }
231 }
232
233 check_err_idx(rows, err_idx, "struct")?;
234
235 let values = fields
237 .iter()
238 .map(|fld| {
239 let (dtype, vals) = out_values.get(fld.name.as_str()).unwrap();
240 _deserialize(vals, (*dtype).clone(), allow_extra_fields_in_struct)
241 })
242 .collect::<PolarsResult<Vec<_>>>()?;
243
244 Ok(StructArray::new(
245 dtype.clone(),
246 rows.len(),
247 values,
248 validity.into_opt_validity(),
249 ))
250}
251
252fn fill_array_from<B, T, A>(
253 f: fn(&mut MutablePrimitiveArray<T>, &[B]) -> PolarsResult<()>,
254 dtype: ArrowDataType,
255 rows: &[B],
256) -> PolarsResult<Box<dyn Array>>
257where
258 T: NativeType,
259 A: From<MutablePrimitiveArray<T>> + Array,
260{
261 let mut array = MutablePrimitiveArray::<T>::with_capacity(rows.len()).to(dtype);
262 f(&mut array, rows)?;
263 Ok(Box::new(A::from(array)))
264}
265
/// Common constructor interface for the mutable array builders that
/// `fill_generic_array_from` allocates before filling them.
pub(crate) trait Container {
    /// Creates a new container; each implementor forwards `capacity` to its own
    /// constructor.
    fn with_capacity(capacity: usize) -> Self
    where
        Self: Sized;
}
274
275impl<O: Offset> Container for MutableBinaryArray<O> {
276 fn with_capacity(capacity: usize) -> Self {
277 MutableBinaryArray::with_capacity(capacity)
278 }
279}
280
impl Container for MutableBooleanArray {
    /// Forwards to the constructor of the same name on `MutableBooleanArray`.
    fn with_capacity(capacity: usize) -> Self {
        MutableBooleanArray::with_capacity(capacity)
    }
}
286
impl Container for MutableFixedSizeBinaryArray {
    /// Forwards to the two-argument constructor of `MutableFixedSizeBinaryArray`,
    /// passing `capacity` first and `0` second.
    // NOTE(review): confirm the argument order of the inherent constructor
    // (element size vs. capacity); it is not visible from this file.
    fn with_capacity(capacity: usize) -> Self {
        MutableFixedSizeBinaryArray::with_capacity(capacity, 0)
    }
}
292
293impl Container for MutableBinaryViewArray<str> {
294 fn with_capacity(capacity: usize) -> Self
295 where
296 Self: Sized,
297 {
298 MutableBinaryViewArray::with_capacity(capacity)
299 }
300}
301
302impl<O: Offset, M: MutableArray + Default + 'static> Container for MutableListArray<O, M> {
303 fn with_capacity(capacity: usize) -> Self {
304 MutableListArray::with_capacity(capacity)
305 }
306}
307
308impl<T: NativeType> Container for MutablePrimitiveArray<T> {
309 fn with_capacity(capacity: usize) -> Self {
310 MutablePrimitiveArray::with_capacity(capacity)
311 }
312}
313
314impl<O: Offset> Container for MutableUtf8Array<O> {
315 fn with_capacity(capacity: usize) -> Self {
316 MutableUtf8Array::with_capacity(capacity)
317 }
318}
319
320fn fill_generic_array_from<B, M, A>(
321 f: fn(&mut M, &[B]) -> PolarsResult<()>,
322 rows: &[B],
323) -> PolarsResult<Box<dyn Array>>
324where
325 M: Container,
326 A: From<M> + Array,
327{
328 let mut array = M::with_capacity(rows.len());
329 f(&mut array, rows)?;
330 Ok(Box::new(A::from(array)))
331}
332
333pub(crate) fn _deserialize<'a, A: Borrow<BorrowedValue<'a>>>(
334 rows: &[A],
335 dtype: ArrowDataType,
336 allow_extra_fields_in_struct: bool,
337) -> PolarsResult<Box<dyn Array>> {
338 match &dtype {
339 ArrowDataType::Null => {
340 if let Some(err_idx) = (0..rows.len())
341 .find(|i| !matches!(rows[*i].borrow(), BorrowedValue::Static(StaticNode::Null)))
342 {
343 check_err_idx(rows, err_idx, "null")?;
344 }
345
346 Ok(Box::new(NullArray::new(dtype, rows.len())))
347 },
348 ArrowDataType::Boolean => {
349 fill_generic_array_from::<_, _, BooleanArray>(deserialize_boolean_into, rows)
350 },
351 ArrowDataType::Int8 => {
352 fill_array_from::<_, _, PrimitiveArray<i8>>(deserialize_primitive_into, dtype, rows)
353 },
354 ArrowDataType::Int16 => {
355 fill_array_from::<_, _, PrimitiveArray<i16>>(deserialize_primitive_into, dtype, rows)
356 },
357 ArrowDataType::Int32
358 | ArrowDataType::Date32
359 | ArrowDataType::Time32(_)
360 | ArrowDataType::Interval(IntervalUnit::YearMonth) => {
361 fill_array_from::<_, _, PrimitiveArray<i32>>(deserialize_primitive_into, dtype, rows)
362 },
363 ArrowDataType::Interval(IntervalUnit::DayTime) => {
364 unimplemented!("There is no natural representation of DayTime in JSON.")
365 },
366 ArrowDataType::Int64
367 | ArrowDataType::Date64
368 | ArrowDataType::Time64(_)
369 | ArrowDataType::Duration(_) => {
370 fill_array_from::<_, _, PrimitiveArray<i64>>(deserialize_primitive_into, dtype, rows)
371 },
372 ArrowDataType::Timestamp(tu, tz) => {
373 let mut err_idx = rows.len();
374 let iter = rows.iter().enumerate().map(|(i, row)| match row.borrow() {
375 BorrowedValue::Static(StaticNode::I64(v)) => Some(*v),
376 BorrowedValue::String(v) => match (tu, tz) {
377 (_, None) => {
378 polars_compute::cast::temporal::utf8_to_naive_timestamp_scalar(v, "%+", tu)
379 },
380 (_, Some(tz)) => {
381 let tz = temporal_conversions::parse_offset(tz.as_str()).unwrap();
382 temporal_conversions::utf8_to_timestamp_scalar(v, "%+", &tz, tu)
383 },
384 },
385 BorrowedValue::Static(StaticNode::Null) => None,
386 _ => {
387 err_idx = if err_idx == rows.len() { i } else { err_idx };
388 None
389 },
390 });
391 let out = Box::new(Int64Array::from_iter(iter).to(dtype));
392 check_err_idx(rows, err_idx, "timestamp")?;
393 Ok(out)
394 },
395 ArrowDataType::UInt8 => {
396 fill_array_from::<_, _, PrimitiveArray<u8>>(deserialize_primitive_into, dtype, rows)
397 },
398 ArrowDataType::UInt16 => {
399 fill_array_from::<_, _, PrimitiveArray<u16>>(deserialize_primitive_into, dtype, rows)
400 },
401 ArrowDataType::UInt32 => {
402 fill_array_from::<_, _, PrimitiveArray<u32>>(deserialize_primitive_into, dtype, rows)
403 },
404 ArrowDataType::UInt64 => {
405 fill_array_from::<_, _, PrimitiveArray<u64>>(deserialize_primitive_into, dtype, rows)
406 },
407 ArrowDataType::Float16 => unreachable!(),
408 ArrowDataType::Float32 => {
409 fill_array_from::<_, _, PrimitiveArray<f32>>(deserialize_primitive_into, dtype, rows)
410 },
411 ArrowDataType::Float64 => {
412 fill_array_from::<_, _, PrimitiveArray<f64>>(deserialize_primitive_into, dtype, rows)
413 },
414 ArrowDataType::LargeUtf8 => {
415 fill_generic_array_from::<_, _, Utf8Array<i64>>(deserialize_utf8_into, rows)
416 },
417 ArrowDataType::Utf8View => {
418 fill_generic_array_from::<_, _, Utf8ViewArray>(deserialize_utf8view_into, rows)
419 },
420 ArrowDataType::LargeList(_) => Ok(Box::new(deserialize_list(
421 rows,
422 dtype,
423 allow_extra_fields_in_struct,
424 )?)),
425 ArrowDataType::LargeBinary => Ok(Box::new(deserialize_binary(rows)?)),
426 ArrowDataType::Struct(_) => Ok(Box::new(deserialize_struct(
427 rows,
428 dtype,
429 allow_extra_fields_in_struct,
430 )?)),
431 _ => todo!(),
432 }
433}
434
435pub fn deserialize(
436 json: &BorrowedValue,
437 dtype: ArrowDataType,
438 allow_extra_fields_in_struct: bool,
439) -> PolarsResult<Box<dyn Array>> {
440 match json {
441 BorrowedValue::Array(rows) => match dtype {
442 ArrowDataType::LargeList(inner) => {
443 _deserialize(rows, inner.dtype, allow_extra_fields_in_struct)
444 },
445 _ => todo!("read an Array from a non-Array data type"),
446 },
447 _ => _deserialize(&[json], dtype, allow_extra_fields_in_struct),
448 }
449}
450
451fn check_err_idx<'a>(
452 rows: &[impl Borrow<BorrowedValue<'a>>],
453 err_idx: usize,
454 type_name: &'static str,
455) -> PolarsResult<()> {
456 if err_idx != rows.len() {
457 polars_bail!(
458 ComputeError:
459 r#"error deserializing value "{:?}" as {}. \
460 Try increasing `infer_schema_length` or specifying a schema.
461 "#,
462 rows[err_idx].borrow(), type_name,
463 )
464 }
465
466 Ok(())
467}