re_types_builder/codegen/rust/deserializer.rs
1use arrow2::datatypes::DataType;
2use proc_macro2::{Literal, TokenStream};
3use quote::{format_ident, quote};
4
5use crate::{
6 codegen::rust::{
7 arrow::{is_backed_by_arrow_buffer, quote_fqname_as_type_path, ArrowDataTypeTokenizer},
8 util::{is_tuple_struct_from_obj, quote_comment},
9 },
10 ArrowRegistry, Object, Objects,
11};
12
13// ---
14
15/// This generates code that deserializes a runtime Arrow payload into the specified `obj`, taking
16/// Arrow-transparency into account.
17///
18/// This short-circuits on error using the `try` (`?`) operator: the outer scope must be one that
19/// returns a `Result<_, DeserializationError>`!
20///
21/// There is a 1:1 relationship between `quote_arrow_deserializer` and `Loggable::from_arrow_opt`:
22/// ```ignore
23/// fn from_arrow_opt(data: &dyn ::arrow::array::Array) -> DeserializationResult<Vec<Option<Self>>> {
24/// Ok(#quoted_deserializer)
25/// }
26/// ```
27///
28/// This tells you two things:
29/// - The runtime Arrow payload is always held in a variable `data`, identified as `data_src` below.
30/// - The returned `TokenStream` must always instantiates a `Vec<Option<Self>>`.
31///
32/// ## Performance vs validation
33/// The deserializers are designed for maximum performance, assuming the incoming data is correct.
34/// If the data is not correct, the deserializers will return an error, but never panic or crash.
35///
36/// TODO(#5305): Currently we're doing a lot of checking for exact matches.
37/// We should instead assume data is correct and handle errors gracefully.
38///
39/// ## Understanding datatypes
40///
41/// There are three (!) datatypes involved in the deserialization process:
42/// - The object's native Rust type, which was derived from its IDL definition by the codegen
43/// framework.
44/// - The object's Arrow datatype, which was also derived from its IDL definition.
45/// - The runtime payload's advertised Arrow datatype.
46///
47/// The deserialization process is _entirely_ driven by our own compile-time IDL-derived definitions:
48/// the runtime payload's advertised Arrow datatype is only ever used as a mean of checking whether
49/// the data we receive can be coerced one way or another into something that fit our schema.
50///
51/// In some places that coercion can be very strict (if the data doesn't match exactly, we abort
52/// with a runtime error) while in other it might be more relaxed for performance reasons
53/// (e.g. ignore the fact that the data has a bitmap altogether).
54pub fn quote_arrow_deserializer(
55 arrow_registry: &ArrowRegistry,
56 objects: &Objects,
57 obj: &Object,
58) -> TokenStream {
59 // Runtime identifier of the variable holding the Arrow payload (`&dyn ::arrow::array::Array`).
60 let data_src = format_ident!("arrow_data");
61
62 let datatype = &arrow_registry.get(&obj.fqname);
63 let quoted_self_datatype = quote! { Self::arrow_datatype() };
64
65 let obj_fqname = obj.fqname.as_str();
66 let is_enum = obj.is_enum();
67 let is_arrow_transparent = obj.datatype.is_none();
68 let is_tuple_struct = is_tuple_struct_from_obj(obj);
69
70 if is_enum {
71 // An enum is very similar to a transparent type.
72
73 // As a transparent type, it's not clear what this does or
74 // where it should come from. Also, it's not used in the internal
75 // implementation of `quote_arrow_field_deserializer` anyways.
76 // TODO(#6819): If we get rid of nullable components this will likely need to change.
77 let is_nullable = true; // Will be ignored
78
79 let obj_field_fqname = format!("{obj_fqname}#enum");
80
81 let quoted_deserializer = quote_arrow_field_deserializer(
82 objects,
83 datatype.to_logical_type(),
84 "ed_self_datatype, // we are transparent, so the datatype of `Self` is the datatype of our contents
85 is_nullable,
86 &obj_field_fqname,
87 &data_src,
88 InnerRepr::NativeIterable,
89 );
90
91 let quoted_branches = obj.fields.iter().map(|obj_field| {
92 let quoted_obj_field_type = format_ident!("{}", obj_field.name);
93
94 // We should never hit this unwrap or it means the enum-processing at
95 // the fbs layer is totally broken.
96 let enum_value = obj_field.enum_value.unwrap();
97 let quoted_enum_value = proc_macro2::Literal::u8_unsuffixed(enum_value);
98
99 quote! {
100 Some(#quoted_enum_value) => Ok(Some(Self::#quoted_obj_field_type))
101 }
102 });
103
104 // TODO(jleibs): We should be able to do this with try_from instead.
105 let quoted_remapping = quote! {
106 .map(|typ| {
107 match typ {
108 // The actual enum variants
109 #(#quoted_branches,)*
110 None => Ok(None),
111 Some(invalid) => Err(DeserializationError::missing_union_arm(
112 #quoted_self_datatype, "<invalid>", invalid as _,
113 )),
114 }
115 })
116 };
117
118 quote! {
119 #quoted_deserializer
120 #quoted_remapping
121 // NOTE: implicit Vec<Result> to Result<Vec>
122 .collect::<DeserializationResult<Vec<Option<_>>>>()
123 .with_context(#obj_fqname)?
124 }
125 } else if is_arrow_transparent {
126 // NOTE: Arrow transparent objects must have a single field, no more no less.
127 // The semantic pass would have failed already if this wasn't the case.
128 let obj_field = &obj.fields[0];
129 let obj_field_fqname = obj_field.fqname.as_str();
130
131 let data_dst = format_ident!(
132 "{}",
133 if is_tuple_struct {
134 "data0"
135 } else {
136 obj_field.name.as_str()
137 }
138 );
139
140 let field_datatype = arrow_registry.get(&obj_field.fqname);
141
142 let quoted_deserializer = quote_arrow_field_deserializer(
143 objects,
144 &field_datatype,
145 "ed_self_datatype, // we are transparent, so the datatype of `Self` is the datatype of our contents
146 obj_field.is_nullable,
147 obj_field_fqname,
148 &data_src,
149 InnerRepr::NativeIterable,
150 );
151
152 let quoted_unwrapping = if obj_field.is_nullable {
153 quote!(.map(Ok))
154 } else {
155 // error context is appended below during final collection
156 quote!(.map(|v| v.ok_or_else(DeserializationError::missing_data)))
157 };
158
159 let quoted_remapping = if is_tuple_struct {
160 quote!(.map(|res| res.map(|v| Some(Self(v)))))
161 } else {
162 quote!(.map(|res| res.map(|#data_dst| Some(Self { #data_dst }))))
163 };
164
165 quote! {
166 #quoted_deserializer
167 #quoted_unwrapping
168 #quoted_remapping
169 // NOTE: implicit Vec<Result> to Result<Vec>
170 .collect::<DeserializationResult<Vec<Option<_>>>>()
171 // NOTE: double context so the user can see the transparent shenanigans going on in the
172 // error.
173 .with_context(#obj_field_fqname)
174 .with_context(#obj_fqname)?
175 }
176 } else {
177 // NOTE: This can only be struct or union/enum at this point.
178 match datatype.to_logical_type() {
179 DataType::Struct(_) => {
180 let data_src_fields = format_ident!("{data_src}_fields");
181 let data_src_arrays = format_ident!("{data_src}_arrays");
182
183 let quoted_field_deserializers = obj.fields.iter().map(|obj_field| {
184 let field_name = &obj_field.name;
185 let data_dst = format_ident!("{}", obj_field.name);
186 let field_datatype = &arrow_registry.get(&obj_field.fqname);
187
188 let quoted_deserializer = quote_arrow_field_deserializer(
189 objects,
190 field_datatype,
191 "e_datatype(field_datatype),
192 obj_field.is_nullable,
193 obj_field.fqname.as_str(),
194 &data_src,
195 InnerRepr::NativeIterable,
196 );
197
198 quote! {
199 let #data_dst = {
200 // NOTE: `arrays_by_name` is a runtime collection of all of the input's
201 // payload's struct fields, while `#field_name` is the field we're
202 // looking for at comptime… there's no guarantee it's actually there at
203 // runtime!
204 if !arrays_by_name.contains_key(#field_name) {
205 return Err(DeserializationError::missing_struct_field(
206 #quoted_self_datatype, #field_name,
207 )).with_context(#obj_fqname);
208 }
209
210 // NOTE: The indexing by name is safe: checked above.
211 let #data_src = &**arrays_by_name[#field_name];
212 #quoted_deserializer
213 }
214 }
215 });
216
217 // NOTE: Collecting because we need it more than once.
218 let quoted_field_names = obj
219 .fields
220 .iter()
221 .map(|field| format_ident!("{}", field.name))
222 .collect::<Vec<_>>();
223
224 let quoted_unwrappings = obj.fields.iter().map(|obj_field| {
225 let obj_field_fqname = obj_field.fqname.as_str();
226 let quoted_obj_field_name = format_ident!("{}", obj_field.name);
227 if obj_field.is_nullable {
228 quote!(#quoted_obj_field_name)
229 } else {
230 quote! {
231 #quoted_obj_field_name: #quoted_obj_field_name
232 .ok_or_else(DeserializationError::missing_data)
233 .with_context(#obj_field_fqname)?
234 }
235 }
236 });
237
238 let quoted_downcast = {
239 let cast_as = quote!(arrow::array::StructArray);
240 quote_array_downcast(obj_fqname, &data_src, cast_as, "ed_self_datatype)
241 };
242 quote! {{
243 let #data_src = #quoted_downcast?;
244 if #data_src.is_empty() {
245 // NOTE: The outer container is empty and so we already know that the end result
246 // is also going to be an empty vec.
247 // Early out right now rather than waste time computing possibly many empty
248 // datastructures for all of our children.
249 Vec::new()
250 } else {
251 let (#data_src_fields, #data_src_arrays) = (#data_src.fields(), #data_src.columns());
252
253 let arrays_by_name: ::std::collections::HashMap<_, _> = #data_src_fields
254 .iter()
255 .map(|field| field.name().as_str())
256 .zip(#data_src_arrays)
257 .collect();
258
259 #(#quoted_field_deserializers;)*
260
261 ZipValidity::new_with_validity(
262 ::itertools::izip!(#(#quoted_field_names),*),
263 #data_src.nulls(),
264 )
265 .map(|opt| opt.map(|(#(#quoted_field_names),*)| Ok(Self { #(#quoted_unwrappings,)* })).transpose())
266 // NOTE: implicit Vec<Result> to Result<Vec>
267 .collect::<DeserializationResult<Vec<_>>>()
268 .with_context(#obj_fqname)?
269 }
270 }}
271 }
272
273 DataType::Union(_, _, arrow2::datatypes::UnionMode::Sparse) => {
274 // We use sparse arrow unions for c-style enums, which means only 8 bits is required for each field,
275 // and nulls are encoded with a special 0-index `_null_markers` variant.
276
277 let data_src_types = format_ident!("{data_src}_type_ids");
278
279 let obj_fqname = obj.fqname.as_str();
280 let quoted_branches = obj.fields.iter().enumerate().map(|(typ, obj_field)| {
281 let arrow_type_index = Literal::i8_unsuffixed(typ as i8 + 1); // 0 is reserved for `_null_markers`
282
283 let quoted_obj_field_type = format_ident!("{}", obj_field.name);
284 quote! {
285 #arrow_type_index => Ok(Some(Self::#quoted_obj_field_type))
286 }
287 });
288
289 let quoted_downcast = {
290 let cast_as = quote!(arrow::array::UnionArray);
291 quote_array_downcast(obj_fqname, &data_src, &cast_as, "ed_self_datatype)
292 };
293
294 quote! {{
295 let #data_src = #quoted_downcast?;
296 let #data_src_types = #data_src.type_ids();
297
298 #data_src_types
299 .iter()
300 .map(|typ| {
301 match typ {
302 0 => Ok(None),
303
304 // The actual enum variants
305 #(#quoted_branches,)*
306
307 _ => Err(DeserializationError::missing_union_arm(
308 #quoted_self_datatype, "<invalid>", *typ as _,
309 )),
310 }
311 })
312 // NOTE: implicit Vec<Result> to Result<Vec>
313 .collect::<DeserializationResult<Vec<_>>>()
314 .with_context(#obj_fqname)?
315 }}
316 }
317
318 DataType::Union(_, _, arrow2::datatypes::UnionMode::Dense) => {
319 // We use dense arrow unions for proper sum-type unions.
320 // Nulls are encoded with a special 0-index `_null_markers` variant.
321
322 let data_src_type_ids = format_ident!("{data_src}_type_ids");
323 let data_src_offsets = format_ident!("{data_src}_offsets");
324
325 let quoted_field_deserializers = obj
326 .fields
327 .iter()
328 .enumerate()
329 .filter(|(_, obj_field)| {
330 // For unit fields we don't have to collect any data.
331 obj_field.typ != crate::Type::Unit
332 })
333 .map(|(type_id, obj_field)| {
334 let data_dst = format_ident!("{}", obj_field.snake_case_name());
335
336 let field_datatype = &arrow_registry.get(&obj_field.fqname);
337 let quoted_deserializer = quote_arrow_field_deserializer(
338 objects,
339 field_datatype,
340 "e_datatype(field_datatype),
341 obj_field.is_nullable,
342 obj_field.fqname.as_str(),
343 &data_src,
344 InnerRepr::NativeIterable,
345 );
346
347 let type_id = Literal::usize_unsuffixed(type_id + 1); // NOTE: +1 to account for `_null_markers` virtual arm
348
349 quote! {
350 let #data_dst = {
351 // `.child()` will panic if the given `type_id` doesn't exist,
352 // which could happen if the number of union arms has changed
353 // between serialization and deserialization.
354 // There is no simple way to check for this using `arrow-rs`
355 // (no access to `UnionArray::fields` as of arrow 54:
356 // https://docs.rs/arrow/latest/arrow/array/struct.UnionArray.html)
357 //
358 // Still, we're planning on removing arrow unions entirely, so this is… fine.
359 // TODO(#6388): stop using arrow unions, and remove this peril
360 let #data_src = #data_src.child(#type_id).as_ref();
361 #quoted_deserializer.collect::<Vec<_>>()
362 }
363 }
364 });
365
366 let obj_fqname = obj.fqname.as_str();
367 let quoted_branches = obj.fields.iter().enumerate().map(|(typ, obj_field)| {
368 let typ = typ as i8 + 1; // NOTE: +1 to account for `_null_markers` virtual arm
369
370 let obj_field_fqname = obj_field.fqname.as_str();
371 let quoted_obj_field_name = format_ident!("{}", obj_field.snake_case_name());
372 let quoted_obj_field_type = format_ident!("{}", obj_field.pascal_case_name());
373
374 if obj_field.typ == crate::Type::Unit {
375 // TODO(andreas): Should we check there's enough nulls on the null array?
376 return quote! {
377 #typ => Self::#quoted_obj_field_type
378 };
379 }
380
381 let quoted_unwrap = if obj_field.is_nullable {
382 quote!()
383 } else {
384 quote! {
385 .ok_or_else(DeserializationError::missing_data)
386 .with_context(#obj_field_fqname)?
387 }
388 };
389
390 quote! {
391 #typ => Self::#quoted_obj_field_type({
392 // NOTE: It is absolutely crucial we explicitly handle the
393 // boundchecks manually first, otherwise rustc completely chokes
394 // when indexing the data (as in: a 100x perf drop)!
395 if offset as usize >= #quoted_obj_field_name.len() {
396 return Err(DeserializationError::offset_oob(
397 offset as _, #quoted_obj_field_name.len()
398 )).with_context(#obj_field_fqname);
399 }
400
401 // Safety: all checked above.
402 #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
403 unsafe { #quoted_obj_field_name.get_unchecked(offset as usize) }
404 .clone()
405 #quoted_unwrap
406 })
407 }
408 });
409
410 let quoted_downcast = {
411 let cast_as = quote!(arrow::array::UnionArray);
412 quote_array_downcast(obj_fqname, &data_src, &cast_as, "ed_self_datatype)
413 };
414
415 quote! {{
416 let #data_src = #quoted_downcast?;
417 if #data_src.is_empty() {
418 // NOTE: The outer container is empty and so we already know that the end result
419 // is also going to be an empty vec.
420 // Early out right now rather than waste time computing possibly many empty
421 // datastructures for all of our children.
422 Vec::new()
423 } else {
424 let #data_src_type_ids = #data_src.type_ids();
425
426 let #data_src_offsets = #data_src.offsets()
427 // NOTE: expected dense union, got a sparse one instead
428 .ok_or_else(|| {
429 let expected = #quoted_self_datatype;
430 let actual = #data_src.data_type().clone();
431 DeserializationError::datatype_mismatch(expected, actual)
432 }).with_context(#obj_fqname)?;
433
434 if #data_src_type_ids.len() != #data_src_offsets.len() {
435 // NOTE: need one offset array per union arm!
436 return Err(DeserializationError::offset_slice_oob(
437 (0, #data_src_type_ids.len()), #data_src_offsets.len(),
438 )).with_context(#obj_fqname);
439 }
440
441 #(#quoted_field_deserializers;)*
442
443 #data_src_type_ids
444 .iter()
445 .enumerate()
446 .map(|(i, typ)| {
447 // NOTE: Array indexing is safe, checked above.
448 let offset = #data_src_offsets[i];
449
450 if *typ == 0 {
451 Ok(None)
452 } else {
453 Ok(Some(match typ {
454 #(#quoted_branches,)*
455 _ => {
456 return Err(DeserializationError::missing_union_arm(
457 #quoted_self_datatype, "<invalid>", *typ as _,
458 ));
459 }
460 }))
461 }
462 })
463 // NOTE: implicit Vec<Result> to Result<Vec>
464 .collect::<DeserializationResult<Vec<_>>>()
465 .with_context(#obj_fqname)?
466 }
467 }}
468 }
469
470 _ => unimplemented!("{datatype:#?}"),
471 }
472 }
473}
474
/// How the inner elements of a list-like field are exposed by the generated deserializer.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum InnerRepr {
    /// The inner elements of the field should be exposed as `Buffer<T>`.
    /// This is only applicable when `T` is an arrow primitive.
    BufferT,

    /// The inner elements of the field should be exposed as an iterable of `T`.
    NativeIterable,
}
484
485/// This generates code that deserializes a runtime Arrow payload according to the specified `datatype`.
486///
487/// The `datatype` comes from our compile-time Arrow registry, not from the runtime payload!
488/// If the datatype happens to be a struct or union, this will merely inject a runtime call to
489/// `Loggable::from_arrow_opt` and call it a day, preventing code bloat.
490///
491/// `data_src` is the runtime identifier of the variable holding the Arrow payload (`&dyn ::arrow::array::Array`).
492/// The returned `TokenStream` always instantiates a `Vec<Option<T>>`.
493///
494/// This short-circuits on error using the `try` (`?`) operator: the outer scope must be one that
495/// returns a `Result<_, DeserializationError>`!
496#[allow(clippy::too_many_arguments)]
497fn quote_arrow_field_deserializer(
498 objects: &Objects,
499 datatype: &DataType,
500 quoted_datatype: &TokenStream,
501 is_nullable: bool,
502 obj_field_fqname: &str,
503 data_src: &proc_macro2::Ident, // &dyn ::arrow::array::Array
504 inner_repr: InnerRepr,
505) -> TokenStream {
506 _ = is_nullable; // not yet used, will be needed very soon
507
508 // If the inner object is an enum, then dispatch to its deserializer.
509 if let DataType::Extension(fqname, _, _) = datatype {
510 if objects.get(fqname).is_some_and(|obj| obj.is_enum()) {
511 let fqname_use = quote_fqname_as_type_path(fqname);
512 return quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter());
513 }
514 }
515
516 match datatype.to_logical_type() {
517 DataType::Int8
518 | DataType::Int16
519 | DataType::Int32
520 | DataType::Int64
521 | DataType::UInt8
522 | DataType::UInt16
523 | DataType::UInt32
524 | DataType::UInt64
525 | DataType::Float16
526 | DataType::Float32
527 | DataType::Float64
528 | DataType::Boolean
529 | DataType::Null => {
530 let quoted_iter_transparency =
531 quote_iterator_transparency(objects, datatype, IteratorKind::OptionValue, None);
532
533 let quoted_downcast = {
534 let cast_as = format!("{:?}", datatype.to_logical_type()).replace("DataType::", "");
535 let cast_as = format_ident!("{cast_as}Array");
536 quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
537 };
538
539 match inner_repr {
540 InnerRepr::BufferT => quote! {
541 #quoted_downcast?
542 .values()
543 },
544 InnerRepr::NativeIterable => quote! {
545 #quoted_downcast?
546 .into_iter() // NOTE: automatically checks the bitmap on our behalf
547 #quoted_iter_transparency
548 },
549 }
550 }
551
552 DataType::Utf8 => {
553 let quoted_downcast = {
554 let cast_as = quote!(StringArray);
555 quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
556 };
557
558 let quoted_iter_transparency = quote_iterator_transparency(
559 objects,
560 datatype,
561 IteratorKind::ResultOptionValue,
562 quote!(::re_types_core::ArrowString::from).into(),
563 );
564
565 let data_src_buf = format_ident!("{data_src}_buf");
566
567 quote! {{
568 let #data_src = #quoted_downcast?;
569 let #data_src_buf = #data_src.values();
570
571 let offsets = #data_src.offsets();
572 ZipValidity::new_with_validity(
573 offsets.windows(2),
574 #data_src.nulls(),
575 )
576 .map(|elem| elem.map(|window| {
577 // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
578
579 let start = window[0] as usize;
580 let end = window[1] as usize;
581 let len = end - start;
582
583 // NOTE: It is absolutely crucial we explicitly handle the
584 // boundchecks manually first, otherwise rustc completely chokes
585 // when slicing the data (as in: a 100x perf drop)!
586 if #data_src_buf.len() < end {
587 // error context is appended below during final collection
588 return Err(DeserializationError::offset_slice_oob(
589 (start, end), #data_src_buf.len(),
590 ));
591 }
592 #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] // TODO(apache/arrow-rs#6900): slice_with_length_unchecked unsafe when https://github.com/apache/arrow-rs/pull/6901 is merged and released
593 let data = #data_src_buf.slice_with_length(start, len);
594
595 Ok(data)
596 }).transpose()
597 )
598 #quoted_iter_transparency
599 // NOTE: implicit Vec<Result> to Result<Vec>
600 .collect::<DeserializationResult<Vec<Option<_>>>>()
601 .with_context(#obj_field_fqname)?
602 .into_iter()
603 }}
604 }
605
606 DataType::FixedSizeList(inner, length) => {
607 let data_src_inner = format_ident!("{data_src}_inner");
608 let quoted_inner = quote_arrow_field_deserializer(
609 objects,
610 inner.data_type(),
611 "e_datatype(inner.data_type()),
612 inner.is_nullable,
613 obj_field_fqname,
614 &data_src_inner,
615 InnerRepr::NativeIterable,
616 );
617
618 let quoted_downcast = {
619 let cast_as = quote!(arrow::array::FixedSizeListArray);
620 quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
621 };
622
623 let quoted_iter_transparency = quote_iterator_transparency(
624 objects,
625 datatype,
626 IteratorKind::ResultOptionValue,
627 None,
628 );
629
630 let comment_note_unwrap =
631 quote_comment("NOTE: Unwrapping cannot fail: the length must be correct.");
632
633 quote! {{
634 let #data_src = #quoted_downcast?;
635 if #data_src.is_empty() {
636 // NOTE: The outer container is empty and so we already know that the end result
637 // is also going to be an empty vec.
638 // Early out right now rather than waste time computing possibly many empty
639 // datastructures for all of our children.
640 Vec::new()
641 } else {
642 let offsets = (0..).step_by(#length).zip((#length..).step_by(#length).take(#data_src.len()));
643
644 let #data_src_inner = {
645 let #data_src_inner = &**#data_src.values();
646 #quoted_inner.collect::<Vec<_>>()
647 };
648
649 ZipValidity::new_with_validity(offsets, #data_src.nulls())
650 .map(|elem| elem.map(|(start, end): (usize, usize)| {
651 // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
652
653 // We're manually generating our own offsets in this case, thus length
654 // must be correct.
655 debug_assert!(end - start == #length);
656
657 // NOTE: It is absolutely crucial we explicitly handle the
658 // boundchecks manually first, otherwise rustc completely chokes
659 // when slicing the data (as in: a 100x perf drop)!
660 if #data_src_inner.len() < end {
661 // error context is appended below during final collection
662 return Err(DeserializationError::offset_slice_oob(
663 (start, end), #data_src_inner.len(),
664 ));
665 }
666 // Safety: all checked above.
667 #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
668 let data = unsafe { #data_src_inner.get_unchecked(start..end) };
669
670 // NOTE: The call to `Option::unwrap_or_default` is very important here.
671 //
672 // Since we can only get here if the outer entry is marked as
673 // non-null, the only possible reason for the default() path
674 // to be taken is because the inner field itself is nullable and
675 // happens to have one or more nullable values in the referenced
676 // slice.
677 // This is perfectly fine, and when it happens, we need to fill the
678 // resulting vec with some data, hence default().
679 //
680 // This does have a subtle implication though!
681 // Since we never even look at the inner field's data when the outer
682 // entry is null, it means we won't notice it if illegal/malformed/corrupt
683 // in any way.
684 // It is important that we turn a blind eye here, because most SDKs in
685 // the ecosystem will put illegal data (e.g. null entries in an array of
686 // non-null floats) in the inner buffer if the outer entry itself
687 // is null.
688 //
689 // TODO(#2875): use MaybeUninit rather than requiring a default impl
690 let data = data.iter().cloned().map(Option::unwrap_or_default);
691 // The following would be the correct thing to do, but costs us way
692 // too much performance-wise for something that only applies to
693 // malformed inputs.
694 //
695 // // NOTE: We don't support nullable inner elements in our IDL, so
696 // // this can only be a case of malformed data.
697 // .map(|opt| opt.ok_or_else(DeserializationError::missing_data))
698 // .collect::<DeserializationResult<Vec<_>>>()?;
699
700 #comment_note_unwrap
701 #[allow(clippy::unwrap_used)]
702 Ok(array_init::from_iter(data).unwrap())
703 }).transpose()
704 )
705 #quoted_iter_transparency
706 // NOTE: implicit Vec<Result> to Result<Vec>
707 .collect::<DeserializationResult<Vec<Option<_>>>>()?
708 }
709 .into_iter()
710 }}
711 }
712
713 DataType::List(inner) => {
714 let data_src_inner = format_ident!("{data_src}_inner");
715
716 let inner_repr = if is_backed_by_arrow_buffer(inner.data_type()) {
717 InnerRepr::BufferT
718 } else {
719 InnerRepr::NativeIterable
720 };
721
722 let quoted_inner = quote_arrow_field_deserializer(
723 objects,
724 inner.data_type(),
725 "e_datatype(inner.data_type()),
726 inner.is_nullable,
727 obj_field_fqname,
728 &data_src_inner,
729 inner_repr,
730 );
731
732 let quoted_downcast = {
733 let cast_as = quote!(arrow::array::ListArray);
734 quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
735 };
736 let quoted_collect_inner = match inner_repr {
737 InnerRepr::BufferT => quote!(),
738 InnerRepr::NativeIterable => quote!(.collect::<Vec<_>>()),
739 };
740
741 let quoted_inner_data_range = match inner_repr {
742 InnerRepr::BufferT => {
743 quote! {
744 #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] // TODO(apache/arrow-rs#6900): unsafe slice_unchecked when https://github.com/apache/arrow-rs/pull/6901 is merged and released
745 let data = #data_src_inner.clone().slice(start, end - start);
746 let data = ::re_types_core::ArrowBuffer::from(data);
747 }
748 }
749 InnerRepr::NativeIterable => quote! {
750 #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
751 let data = unsafe { #data_src_inner.get_unchecked(start..end) };
752
753 // NOTE: The call to `Option::unwrap_or_default` is very important here.
754 //
755 // Since we can only get here if the outer oob is marked as
756 // non-null, the only possible reason for the default() path
757 // to be taken is because the inner field itself is nullable and
758 // happens to have one or more nullable values in the referenced
759 // slice.
760 // This is perfectly fine, and when it happens, we need to fill the
761 // resulting vec with some data, hence default().
762 //
763 // This does have a subtle implication though!
764 // Since we never even look at the inner field's data when the outer
765 // entry is null, it means we won't notice it if illegal/malformed/corrupt
766 // in any way.
767 // It is important that we turn a blind eye here, because most SDKs in
768 // the ecosystem will put illegal data (e.g. null entries in an array of
769 // non-null floats) in the inner buffer if the outer entry itself
770 // is null.
771 //
772 // TODO(#2875): use MaybeUninit rather than requiring a default impl
773 let data = data.iter().cloned().map(Option::unwrap_or_default).collect();
774 // The following would be the correct thing to do, but costs us way
775 // too much performance-wise for something that only applies to
776 // malformed inputs.
777 //
778 // // NOTE: We don't support nullable inner elements in our IDL, so
779 // // this can only be a case of malformed data.
780 // .map(|opt| opt.ok_or_else(DeserializationError::missing_data))
781 // .collect::<DeserializationResult<Vec<_>>>()?;
782 },
783 };
784
785 quote! {{
786 let #data_src = #quoted_downcast?;
787 if #data_src.is_empty() {
788 // NOTE: The outer container is empty and so we already know that the end result
789 // is also going to be an empty vec.
790 // Early out right now rather than waste time computing possibly many empty
791 // datastructures for all of our children.
792 Vec::new()
793 } else {
794 let #data_src_inner = {
795 let #data_src_inner = &**#data_src.values();
796 #quoted_inner #quoted_collect_inner
797 };
798
799 let offsets = #data_src.offsets();
800 ZipValidity::new_with_validity(
801 offsets.windows(2),
802 #data_src.nulls(),
803 )
804 .map(|elem| elem.map(|window| {
805 // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
806
807 let start = window[0] as usize;
808 let end = window[1] as usize;
809
810 // NOTE: It is absolutely crucial we explicitly handle the
811 // boundchecks manually first, otherwise rustc completely chokes
812 // when slicing the data (as in: a 100x perf drop)!
813 if #data_src_inner.len() < end {
814 // error context is appended below during final collection
815 return Err(DeserializationError::offset_slice_oob(
816 (start, end), #data_src_inner.len(),
817 ));
818 }
819
820 #quoted_inner_data_range
821
822 Ok(data)
823 }).transpose()
824 )
825 // NOTE: implicit Vec<Result> to Result<Vec>
826 .collect::<DeserializationResult<Vec<Option<_>>>>()?
827 }
828 .into_iter()
829 }}
830 }
831
832 DataType::Struct(_) | DataType::Union(_, _, _) => {
833 let DataType::Extension(fqname, _, _) = datatype else {
834 unreachable!()
835 };
836 let fqname_use = quote_fqname_as_type_path(fqname);
837 quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter())
838 }
839
840 _ => unimplemented!("{datatype:#?}"),
841 }
842}
843
844fn quote_datatype(datatype: &DataType) -> TokenStream {
845 let is_recursive = false;
846 let expected = ArrowDataTypeTokenizer(datatype, is_recursive);
847 quote! { #expected }
848}
849
850/// Generates tokens that downcast the runtime Arrow array identifier by `arr` as `cast_as`, making sure
851/// to inject proper error handling.
852fn quote_array_downcast(
853 location: impl AsRef<str>,
854 arr: &syn::Ident,
855 cast_as: impl quote::ToTokens,
856 quoted_expected_datatype: &TokenStream,
857) -> TokenStream {
858 let location = location.as_ref();
859 let cast_as = cast_as.to_token_stream();
860 quote! {
861 #arr
862 .as_any()
863 .downcast_ref::<#cast_as>()
864 .ok_or_else(|| {
865 let expected = #quoted_expected_datatype;
866 let actual = #arr.data_type().clone();
867 DeserializationError::datatype_mismatch(expected, actual)
868 })
869 .with_context(#location)
870 }
871}
872
/// The item shape of the iterator handed to `quote_iterator_transparency`, i.e. how many
/// `Option`/`Result` layers have to be mapped through to reach the value.
#[derive(Clone, Copy, Debug)]
#[allow(dead_code)]
enum IteratorKind {
    /// Items are `DeserializationResult<Option<T>>`.
    ResultOptionValue,

    /// Items are `Option<DeserializationResult<T>>`.
    OptionResultValue,

    /// Items are `Option<T>`.
    OptionValue,

    /// Items are `DeserializationResult<T>`.
    ResultValue,

    /// Items are plain `T`.
    Value,
}
891
892/// This generates code that maps the data in an iterator in order to apply the Arrow transparency
893/// rules to it, if necessary.
894///
895/// This can often become a very difficult job due to all the affixes that might be involved:
896/// fallibility, nullability, transparency, tuple structs…
897/// This function will just do the right thing.
898///
899/// If `extra_wrapper` is specified, this will also wrap the resulting data in `$extra_wrapper(data)`.
900///
901/// Have a look around in this file for examples of use.
902fn quote_iterator_transparency(
903 objects: &Objects,
904 datatype: &DataType,
905 iter_kind: IteratorKind,
906 extra_wrapper: Option<TokenStream>,
907) -> TokenStream {
908 #![allow(clippy::collapsible_else_if)]
909
910 let inner_obj = if let DataType::Extension(fqname, _, _) = datatype {
911 Some(&objects[fqname])
912 } else {
913 None
914 };
915 let inner_is_arrow_transparent = inner_obj.is_some_and(|obj| obj.datatype.is_none());
916
917 if inner_is_arrow_transparent {
918 let inner_obj = inner_obj.as_ref().unwrap();
919 let quoted_inner_obj_type = quote_fqname_as_type_path(&inner_obj.fqname);
920
921 let is_tuple_struct = is_tuple_struct_from_obj(inner_obj);
922 let quoted_data_dst = format_ident!(
923 "{}",
924 if is_tuple_struct {
925 "data0"
926 } else {
927 inner_obj.fields[0].name.as_str()
928 }
929 );
930
931 let quoted_binding = if is_tuple_struct {
932 if let Some(extra_wrapper) = extra_wrapper {
933 quote!(|v| #quoted_inner_obj_type(#extra_wrapper(v)))
934 } else {
935 quote!(#quoted_inner_obj_type)
936 }
937 } else {
938 if let Some(extra_wrapper) = extra_wrapper {
939 quote!(|#quoted_data_dst| #quoted_inner_obj_type { #quoted_data_dst: #extra_wrapper(v) })
940 } else {
941 quote!(|#quoted_data_dst| #quoted_inner_obj_type { #quoted_data_dst })
942 }
943 };
944
945 match iter_kind {
946 IteratorKind::ResultOptionValue | IteratorKind::OptionResultValue => {
947 quote!(.map(|res_or_opt| res_or_opt.map(|res_or_opt| res_or_opt.map(#quoted_binding))))
948 }
949 IteratorKind::OptionValue | IteratorKind::ResultValue => {
950 quote!(.map(|res_or_opt| res_or_opt.map(#quoted_binding)))
951 }
952 IteratorKind::Value => quote!(.map(#quoted_binding)),
953 }
954 } else {
955 if let Some(extra_wrapper) = extra_wrapper {
956 let quoted_binding = quote!(|v| #extra_wrapper(v));
957 match iter_kind {
958 IteratorKind::ResultOptionValue | IteratorKind::OptionResultValue => {
959 quote!(.map(|res_or_opt| res_or_opt.map(|res_or_opt| res_or_opt.map(#quoted_binding))))
960 }
961 IteratorKind::OptionValue | IteratorKind::ResultValue => {
962 quote!(.map(|res_or_opt| res_or_opt.map(#quoted_binding)))
963 }
964 IteratorKind::Value => quote!(.map(#quoted_binding)),
965 }
966 } else {
967 quote!()
968 }
969 }
970}
971
972/// This generates code that deserializes a runtime Arrow payload into the specified `obj`, taking
973/// Arrow-transparency into account.
974///
975/// It contains additional performance optimizations based on the inner-type being a non-nullable primitive
976/// allowing us to map directly to slices rather than iterating. The ability to use this optimization is
977/// determined by [`should_optimize_buffer_slice_deserialize`].
978///
979/// There is a 1:1 relationship between `quote_arrow_deserializer_buffer_slice` and `Loggable::from_arrow`:
980/// ```ignore
981/// fn from_arrow(data: &dyn ::arrow::array::Array) -> DeserializationResult<Vec<Self>> {
982/// Ok(#quoted_deserializer_)
983/// }
984/// ```
985///
986/// See [`quote_arrow_deserializer_buffer_slice`] for additional information.
987pub fn quote_arrow_deserializer_buffer_slice(
988 arrow_registry: &ArrowRegistry,
989 objects: &Objects,
990 obj: &Object,
991) -> TokenStream {
992 // Runtime identifier of the variable holding the Arrow payload (`&dyn ::arrow::array::Array`).
993 let data_src = format_ident!("arrow_data");
994
995 let datatype = &arrow_registry.get(&obj.fqname);
996
997 let is_arrow_transparent = obj.datatype.is_none();
998 let is_tuple_struct = is_tuple_struct_from_obj(obj);
999
1000 if is_arrow_transparent {
1001 // NOTE: Arrow transparent objects must have a single field, no more no less.
1002 // The semantic pass would have failed already if this wasn't the case.
1003 debug_assert!(obj.fields.len() == 1);
1004 let obj_field = &obj.fields[0];
1005 let obj_field_fqname = obj_field.fqname.as_str();
1006
1007 let data_dst = format_ident!(
1008 "{}",
1009 if is_tuple_struct {
1010 "data0"
1011 } else {
1012 obj_field.name.as_str()
1013 }
1014 );
1015
1016 let datatype = arrow_registry.get(&obj_field.fqname);
1017 let deserizlized_as_slice = quote_arrow_field_deserializer_buffer_slice(
1018 &datatype,
1019 obj_field.is_nullable,
1020 obj_field_fqname,
1021 &data_src,
1022 );
1023
1024 let quoted_iter_transparency =
1025 quote_iterator_transparency(objects, &datatype, IteratorKind::Value, None);
1026 let quoted_iter_transparency = quote!(.copied() #quoted_iter_transparency);
1027
1028 let quoted_remapping = if is_tuple_struct {
1029 quote!(.map(Self))
1030 } else {
1031 quote!(.map(|#data_dst| Self { #data_dst }))
1032 };
1033
1034 quote! {{
1035 let slice = #deserizlized_as_slice;
1036
1037 {
1038 // NOTE(#3850): Don't add a profile scope here: the profiler overhead is too big for this fast function.
1039 // re_tracing::profile_scope!("collect");
1040
1041 slice
1042 .iter()
1043 #quoted_iter_transparency
1044 #quoted_remapping
1045 .collect::<Vec<_>>()
1046 }
1047 }}
1048 } else {
1049 unimplemented!("{datatype:#?}")
1050 }
1051}
1052
1053/// This generates code that deserializes a runtime Arrow payload according to the specified `datatype`.
1054///
1055/// It contains additional performance optimizations based on the inner-type being a non-nullable primitive
1056/// allowing us to map directly to slices rather than iterating. The ability to use this optimization is
1057/// determined by [`should_optimize_buffer_slice_deserialize`].
1058///
1059/// See [`quote_arrow_field_deserializer`] for additional information.
1060fn quote_arrow_field_deserializer_buffer_slice(
1061 datatype: &DataType,
1062 is_nullable: bool,
1063 obj_field_fqname: &str,
1064 data_src: &proc_macro2::Ident, // &dyn ::arrow::array::Array
1065) -> TokenStream {
1066 _ = is_nullable; // not yet used, will be needed very soon
1067
1068 match datatype.to_logical_type() {
1069 DataType::Int8
1070 | DataType::Int16
1071 | DataType::Int32
1072 | DataType::Int64
1073 | DataType::UInt8
1074 | DataType::UInt16
1075 | DataType::UInt32
1076 | DataType::UInt64
1077 | DataType::Float16
1078 | DataType::Float32
1079 | DataType::Float64 => {
1080 let quoted_downcast = {
1081 let cast_as = format!("{:?}", datatype.to_logical_type()).replace("DataType::", "");
1082 let cast_as = format_ident!("{cast_as}Array"); // e.g. `Uint32Array`
1083 quote_array_downcast(
1084 obj_field_fqname,
1085 data_src,
1086 cast_as,
1087 "e_datatype(datatype),
1088 )
1089 };
1090
1091 quote! {
1092 #quoted_downcast?
1093 .values()
1094 .as_ref()
1095 }
1096 }
1097
1098 DataType::FixedSizeList(inner, length) => {
1099 let data_src_inner = format_ident!("{data_src}_inner");
1100 let quoted_inner = quote_arrow_field_deserializer_buffer_slice(
1101 inner.data_type(),
1102 inner.is_nullable,
1103 obj_field_fqname,
1104 &data_src_inner,
1105 );
1106
1107 let quoted_downcast = {
1108 let cast_as = quote!(arrow::array::FixedSizeListArray);
1109 quote_array_downcast(
1110 obj_field_fqname,
1111 data_src,
1112 cast_as,
1113 "e_datatype(datatype),
1114 )
1115 };
1116
1117 quote! {{
1118 let #data_src = #quoted_downcast?;
1119
1120 let #data_src_inner = &**#data_src.values();
1121 bytemuck::cast_slice::<_, [_; #length]>(#quoted_inner)
1122 }}
1123 }
1124
1125 _ => unimplemented!("{datatype:#?}"),
1126 }
1127}
1128
1129/// Whether or not this object allows for the buffer-slice optimizations.
1130///
1131/// These optimizations require the outer type to be non-nullable and made up exclusively
1132/// of primitive types.
1133///
1134/// Note that nullabillity is kind of weird since it's technically a property of the field
1135/// rather than the datatype.
1136/// Components can only be used by archetypes so they should never be nullable, but for datatypes
1137/// we might need both.
1138///
1139/// This should always be checked before using [`quote_arrow_deserializer_buffer_slice`].
1140pub fn should_optimize_buffer_slice_deserialize(
1141 obj: &Object,
1142 arrow_registry: &ArrowRegistry,
1143) -> bool {
1144 let is_arrow_transparent = obj.datatype.is_none();
1145 if is_arrow_transparent {
1146 let typ = arrow_registry.get(&obj.fqname);
1147 let obj_field = &obj.fields[0];
1148 !obj_field.is_nullable && should_optimize_buffer_slice_deserialize_datatype(&typ)
1149 } else {
1150 false
1151 }
1152}
1153
1154/// Whether or not this datatype allows for the buffer slice optimizations.
1155fn should_optimize_buffer_slice_deserialize_datatype(typ: &DataType) -> bool {
1156 match typ {
1157 DataType::Int8
1158 | DataType::Int16
1159 | DataType::Int32
1160 | DataType::Int64
1161 | DataType::UInt8
1162 | DataType::UInt16
1163 | DataType::UInt32
1164 | DataType::UInt64
1165 | DataType::Float16
1166 | DataType::Float32
1167 | DataType::Float64 => true,
1168 DataType::Extension(_, typ, _) => should_optimize_buffer_slice_deserialize_datatype(typ),
1169 DataType::FixedSizeList(field, _) => {
1170 should_optimize_buffer_slice_deserialize_datatype(field.data_type())
1171 }
1172 _ => false,
1173 }
1174}