re_types_builder/codegen/rust/
deserializer.rs

1use arrow2::datatypes::DataType;
2use proc_macro2::{Literal, TokenStream};
3use quote::{format_ident, quote};
4
5use crate::{
6    codegen::rust::{
7        arrow::{is_backed_by_arrow_buffer, quote_fqname_as_type_path, ArrowDataTypeTokenizer},
8        util::{is_tuple_struct_from_obj, quote_comment},
9    },
10    ArrowRegistry, Object, Objects,
11};
12
13// ---
14
/// This generates code that deserializes a runtime Arrow payload into the specified `obj`, taking
/// Arrow-transparency into account.
///
/// This short-circuits on error using the `try` (`?`) operator: the outer scope must be one that
/// returns a `Result<_, DeserializationError>`!
///
/// There is a 1:1 relationship between `quote_arrow_deserializer` and `Loggable::from_arrow_opt`:
/// ```ignore
/// fn from_arrow_opt(arrow_data: &dyn ::arrow::array::Array) -> DeserializationResult<Vec<Option<Self>>> {
///     Ok(#quoted_deserializer)
/// }
/// ```
///
/// This tells you two things:
/// - The runtime Arrow payload is always held in a variable `arrow_data`, identified as `data_src`
///   below.
/// - The returned `TokenStream` must always instantiate a `Vec<Option<Self>>`.
///
/// ## Performance vs validation
/// The deserializers are designed for maximum performance, assuming the incoming data is correct.
/// If the data is not correct, the deserializers will return an error, but never panic or crash.
///
/// TODO(#5305): Currently we're doing a lot of checking for exact matches.
/// We should instead assume data is correct and handle errors gracefully.
///
/// ## Understanding datatypes
///
/// There are three (!) datatypes involved in the deserialization process:
/// - The object's native Rust type, which was derived from its IDL definition by the codegen
///   framework.
/// - The object's Arrow datatype, which was also derived from its IDL definition.
/// - The runtime payload's advertised Arrow datatype.
///
/// The deserialization process is _entirely_ driven by our own compile-time IDL-derived definitions:
/// the runtime payload's advertised Arrow datatype is only ever used as a means of checking whether
/// the data we receive can be coerced one way or another into something that fits our schema.
///
/// In some places that coercion can be very strict (if the data doesn't match exactly, we abort
/// with a runtime error) while in others it might be more relaxed for performance reasons
/// (e.g. ignore the fact that the data has a bitmap altogether).
///
/// ## Panics
///
/// This function itself (as opposed to the code it generates) panics if:
/// - `obj` is an enum and one of its fields has no `enum_value`,
/// - `obj` is Arrow-transparent but has no fields,
/// - `obj` is neither an enum nor Arrow-transparent, and its logical Arrow datatype is something
///   other than a struct, a sparse union or a dense union.
pub fn quote_arrow_deserializer(
    arrow_registry: &ArrowRegistry,
    objects: &Objects,
    obj: &Object,
) -> TokenStream {
    // Runtime identifier of the variable holding the Arrow payload (`&dyn ::arrow::array::Array`).
    let data_src = format_ident!("arrow_data");

    let datatype = &arrow_registry.get(&obj.fqname);
    let quoted_self_datatype = quote! { Self::arrow_datatype() };

    let obj_fqname = obj.fqname.as_str();
    let is_enum = obj.is_enum();
    let is_arrow_transparent = obj.datatype.is_none();
    let is_tuple_struct = is_tuple_struct_from_obj(obj);

    // Three code paths, checked in this order: enums, Arrow-transparent objects, and finally
    // everything else (structs and unions), which is dispatched on the logical Arrow datatype.
    if is_enum {
        // An enum is very similar to a transparent type.

        // As a transparent type, it's not clear what this does or
        // where it should come from. Also, it's not used in the internal
        // implementation of `quote_arrow_field_deserializer` anyways.
        // TODO(#6819): If we get rid of nullable components this will likely need to change.
        let is_nullable = true; // Will be ignored

        // Synthetic field name, only used to give error contexts something to point at.
        let obj_field_fqname = format!("{obj_fqname}#enum");

        // NOTE(review): we pass the _logical_ datatype here; presumably this keeps
        // `quote_arrow_field_deserializer` from seeing an extension type that refers to an enum
        // and dispatching straight back to `from_arrow_opt` — confirm.
        let quoted_deserializer = quote_arrow_field_deserializer(
            objects,
            datatype.to_logical_type(),
            &quoted_self_datatype, // we are transparent, so the datatype of `Self` is the datatype of our contents
            is_nullable,
            &obj_field_fqname,
            &data_src,
            InnerRepr::NativeIterable,
        );

        // One `Some(<value>) => Ok(Some(Self::<Variant>))` match arm per enum variant.
        let quoted_branches = obj.fields.iter().map(|obj_field| {
            let quoted_obj_field_type = format_ident!("{}", obj_field.name);

            // We should never hit this unwrap or it means the enum-processing at
            // the fbs layer is totally broken.
            let enum_value = obj_field.enum_value.unwrap();
            let quoted_enum_value = proc_macro2::Literal::u8_unsuffixed(enum_value);

            quote! {
                Some(#quoted_enum_value) => Ok(Some(Self::#quoted_obj_field_type))
            }
        });

        // Maps each raw optional value to its variant: nulls stay `None`, unknown values become
        // a `missing_union_arm` error.
        // TODO(jleibs): We should be able to do this with try_from instead.
        let quoted_remapping = quote! {
            .map(|typ| {
                match typ {
                    // The actual enum variants
                    #(#quoted_branches,)*
                    None => Ok(None),
                    Some(invalid) => Err(DeserializationError::missing_union_arm(
                        #quoted_self_datatype, "<invalid>", invalid as _,
                    )),
                }
            })
        };

        quote! {
            #quoted_deserializer
            #quoted_remapping
            // NOTE: implicit Vec<Result> to Result<Vec>
            .collect::<DeserializationResult<Vec<Option<_>>>>()
            .with_context(#obj_fqname)?
        }
    } else if is_arrow_transparent {
        // NOTE: Arrow transparent objects must have a single field, no more no less.
        // The semantic pass would have failed already if this wasn't the case.
        let obj_field = &obj.fields[0];
        let obj_field_fqname = obj_field.fqname.as_str();

        // Name of the binding used to rebuild `Self` in the generated code. Tuple structs have no
        // field name to reuse, hence the `data0` placeholder (which the tuple-struct remapping
        // below does not actually reference).
        let data_dst = format_ident!(
            "{}",
            if is_tuple_struct {
                "data0"
            } else {
                obj_field.name.as_str()
            }
        );

        let field_datatype = arrow_registry.get(&obj_field.fqname);

        let quoted_deserializer = quote_arrow_field_deserializer(
            objects,
            &field_datatype,
            &quoted_self_datatype, // we are transparent, so the datatype of `Self` is the datatype of our contents
            obj_field.is_nullable,
            obj_field_fqname,
            &data_src,
            InnerRepr::NativeIterable,
        );

        // Turns every item into a `Result`: nullable fields keep their `Option` as-is, while
        // non-nullable fields turn a missing value into a `missing_data` error.
        let quoted_unwrapping = if obj_field.is_nullable {
            quote!(.map(Ok))
        } else {
            // error context is appended below during final collection
            quote!(.map(|v| v.ok_or_else(DeserializationError::missing_data)))
        };

        // Wraps the inner value back into `Self`, using either tuple or named-field syntax.
        let quoted_remapping = if is_tuple_struct {
            quote!(.map(|res| res.map(|v| Some(Self(v)))))
        } else {
            quote!(.map(|res| res.map(|#data_dst| Some(Self { #data_dst }))))
        };

        quote! {
            #quoted_deserializer
            #quoted_unwrapping
            #quoted_remapping
            // NOTE: implicit Vec<Result> to Result<Vec>
            .collect::<DeserializationResult<Vec<Option<_>>>>()
            // NOTE: double context so the user can see the transparent shenanigans going on in the
            // error.
            .with_context(#obj_field_fqname)
            .with_context(#obj_fqname)?
        }
    } else {
        // NOTE: This can only be struct or union/enum at this point.
        match datatype.to_logical_type() {
            DataType::Struct(_) => {
                let data_src_fields = format_ident!("{data_src}_fields");
                let data_src_arrays = format_ident!("{data_src}_arrays");

                // One `let <field> = { … };` statement per struct field, each of which looks up
                // the field's array by name and deserializes it.
                let quoted_field_deserializers = obj.fields.iter().map(|obj_field| {
                    let field_name = &obj_field.name;
                    let data_dst = format_ident!("{}", obj_field.name);
                    let field_datatype = &arrow_registry.get(&obj_field.fqname);

                    let quoted_deserializer = quote_arrow_field_deserializer(
                        objects,
                        field_datatype,
                        &quote_datatype(field_datatype),
                        obj_field.is_nullable,
                        obj_field.fqname.as_str(),
                        &data_src,
                        InnerRepr::NativeIterable,
                    );

                    quote! {
                        let #data_dst = {
                            // NOTE: `arrays_by_name` is a runtime collection of all of the input's
                            // payload's struct fields, while `#field_name` is the field we're
                            // looking for at comptime… there's no guarantee it's actually there at
                            // runtime!
                            if !arrays_by_name.contains_key(#field_name) {
                                return Err(DeserializationError::missing_struct_field(
                                    #quoted_self_datatype, #field_name,
                                )).with_context(#obj_fqname);
                            }

                            // NOTE: The indexing by name is safe: checked above.
                            let #data_src = &**arrays_by_name[#field_name];
                             #quoted_deserializer
                        }
                    }
                });

                // NOTE: Collecting because we need it more than once.
                let quoted_field_names = obj
                    .fields
                    .iter()
                    .map(|field| format_ident!("{}", field.name))
                    .collect::<Vec<_>>();

                // Field initializers for the final `Self { … }` expression: nullable fields are
                // passed through using field-init shorthand, non-nullable ones are unwrapped into
                // a `missing_data` error first.
                let quoted_unwrappings = obj.fields.iter().map(|obj_field| {
                    let obj_field_fqname = obj_field.fqname.as_str();
                    let quoted_obj_field_name = format_ident!("{}", obj_field.name);
                    if obj_field.is_nullable {
                        quote!(#quoted_obj_field_name)
                    } else {
                        quote! {
                            #quoted_obj_field_name: #quoted_obj_field_name
                                .ok_or_else(DeserializationError::missing_data)
                                .with_context(#obj_field_fqname)?
                        }
                    }
                });

                let quoted_downcast = {
                    let cast_as = quote!(arrow::array::StructArray);
                    quote_array_downcast(obj_fqname, &data_src, cast_as, &quoted_self_datatype)
                };
                quote! {{
                    let #data_src = #quoted_downcast?;
                    if #data_src.is_empty() {
                        // NOTE: The outer container is empty and so we already know that the end result
                        // is also going to be an empty vec.
                        // Early out right now rather than waste time computing possibly many empty
                        // datastructures for all of our children.
                        Vec::new()
                    } else {
                        let (#data_src_fields, #data_src_arrays) = (#data_src.fields(), #data_src.columns());

                        let arrays_by_name: ::std::collections::HashMap<_, _> = #data_src_fields
                            .iter()
                            .map(|field| field.name().as_str())
                            .zip(#data_src_arrays)
                            .collect();

                        #(#quoted_field_deserializers;)*

                        // Walk all the per-field iterators in lockstep, gated by the struct's own
                        // validity.
                        ZipValidity::new_with_validity(
                            ::itertools::izip!(#(#quoted_field_names),*),
                            #data_src.nulls(),
                        )
                        .map(|opt| opt.map(|(#(#quoted_field_names),*)| Ok(Self { #(#quoted_unwrappings,)* })).transpose())
                        // NOTE: implicit Vec<Result> to Result<Vec>
                        .collect::<DeserializationResult<Vec<_>>>()
                        .with_context(#obj_fqname)?
                    }
                }}
            }

            DataType::Union(_, _, arrow2::datatypes::UnionMode::Sparse) => {
                // We use sparse arrow unions for c-style enums, which means only 8 bits is required for each field,
                // and nulls are encoded with a special 0-index `_null_markers` variant.

                let data_src_types = format_ident!("{data_src}_type_ids");

                let obj_fqname = obj.fqname.as_str();
                // One `<type id> => Ok(Some(Self::<Variant>))` match arm per field, in declaration
                // order.
                let quoted_branches = obj.fields.iter().enumerate().map(|(typ, obj_field)| {
                    let arrow_type_index = Literal::i8_unsuffixed(typ as i8 + 1); // 0 is reserved for `_null_markers`

                    let quoted_obj_field_type = format_ident!("{}", obj_field.name);
                    quote! {
                        #arrow_type_index => Ok(Some(Self::#quoted_obj_field_type))
                    }
                });

                let quoted_downcast = {
                    let cast_as = quote!(arrow::array::UnionArray);
                    quote_array_downcast(obj_fqname, &data_src, &cast_as, &quoted_self_datatype)
                };

                quote! {{
                    let #data_src = #quoted_downcast?;
                    let #data_src_types = #data_src.type_ids();

                    #data_src_types
                        .iter()
                        .map(|typ| {
                            match typ {
                                // The `_null_markers` arm.
                                0 => Ok(None),

                                // The actual enum variants
                                #(#quoted_branches,)*

                                _ => Err(DeserializationError::missing_union_arm(
                                    #quoted_self_datatype, "<invalid>", *typ as _,
                                )),
                            }
                        })
                        // NOTE: implicit Vec<Result> to Result<Vec>
                        .collect::<DeserializationResult<Vec<_>>>()
                        .with_context(#obj_fqname)?
                }}
            }

            DataType::Union(_, _, arrow2::datatypes::UnionMode::Dense) => {
                // We use dense arrow unions for proper sum-type unions.
                // Nulls are encoded with a special 0-index `_null_markers` variant.

                let data_src_type_ids = format_ident!("{data_src}_type_ids");
                let data_src_offsets = format_ident!("{data_src}_offsets");

                // One `let <arm> = { … };` statement per non-unit union arm, each of which fully
                // deserializes the arm's child array into a `Vec`.
                let quoted_field_deserializers = obj
                    .fields
                    .iter()
                    .enumerate()
                    .filter(|(_, obj_field)| {
                        // For unit fields we don't have to collect any data.
                        obj_field.typ != crate::Type::Unit
                    })
                    .map(|(type_id, obj_field)| {
                        let data_dst = format_ident!("{}", obj_field.snake_case_name());

                        let field_datatype = &arrow_registry.get(&obj_field.fqname);
                        let quoted_deserializer = quote_arrow_field_deserializer(
                            objects,
                            field_datatype,
                            &quote_datatype(field_datatype),
                            obj_field.is_nullable,
                            obj_field.fqname.as_str(),
                            &data_src,
                            InnerRepr::NativeIterable,
                        );

                        let type_id = Literal::usize_unsuffixed(type_id + 1); // NOTE: +1 to account for `_null_markers` virtual arm

                        quote! {
                            let #data_dst = {
                                // `.child()` will panic if the given `type_id` doesn't exist,
                                // which could happen if the number of union arms has changed
                                // between serialization and deserialization.
                                // There is no simple way to check for this using `arrow-rs`
                                // (no access to `UnionArray::fields` as of arrow 54:
                                // https://docs.rs/arrow/latest/arrow/array/struct.UnionArray.html)
                                //
                                // Still, we're planning on removing arrow unions entirely, so this is… fine.
                                // TODO(#6388): stop using arrow unions, and remove this peril
                                let #data_src = #data_src.child(#type_id).as_ref();
                                #quoted_deserializer.collect::<Vec<_>>()
                            }
                        }
                    });

                let obj_fqname = obj.fqname.as_str();
                // One match arm per union arm (unit arms included), each of which picks the value
                // at `offset` out of the corresponding pre-deserialized `Vec`.
                let quoted_branches = obj.fields.iter().enumerate().map(|(typ, obj_field)| {
                    let typ = typ as i8 + 1; // NOTE: +1 to account for `_null_markers` virtual arm

                    let obj_field_fqname = obj_field.fqname.as_str();
                    let quoted_obj_field_name = format_ident!("{}", obj_field.snake_case_name());
                    let quoted_obj_field_type = format_ident!("{}", obj_field.pascal_case_name());

                    if obj_field.typ == crate::Type::Unit {
                        // Unit arms carry no data: there is nothing to look up at `offset`.
                        // TODO(andreas): Should we check there's enough nulls on the null array?
                        return quote! {
                            #typ => Self::#quoted_obj_field_type
                        };
                    }

                    let quoted_unwrap = if obj_field.is_nullable {
                        quote!()
                    } else {
                        quote! {
                            .ok_or_else(DeserializationError::missing_data)
                            .with_context(#obj_field_fqname)?
                        }
                    };

                    quote! {
                        #typ => Self::#quoted_obj_field_type({
                            // NOTE: It is absolutely crucial we explicitly handle the
                            // boundchecks manually first, otherwise rustc completely chokes
                            // when indexing the data (as in: a 100x perf drop)!
                            if offset as usize >= #quoted_obj_field_name.len() {
                                return Err(DeserializationError::offset_oob(
                                    offset as _, #quoted_obj_field_name.len()
                                )).with_context(#obj_field_fqname);
                            }

                            // Safety: all checked above.
                            #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
                            unsafe { #quoted_obj_field_name.get_unchecked(offset as usize) }
                                .clone()
                                #quoted_unwrap
                        })
                    }
                });

                let quoted_downcast = {
                    let cast_as = quote!(arrow::array::UnionArray);
                    quote_array_downcast(obj_fqname, &data_src, &cast_as, &quoted_self_datatype)
                };

                quote! {{
                    let #data_src = #quoted_downcast?;
                    if #data_src.is_empty() {
                        // NOTE: The outer container is empty and so we already know that the end result
                        // is also going to be an empty vec.
                        // Early out right now rather than waste time computing possibly many empty
                        // datastructures for all of our children.
                        Vec::new()
                    } else {
                        let #data_src_type_ids = #data_src.type_ids();

                        let #data_src_offsets = #data_src.offsets()
                            // NOTE: expected dense union, got a sparse one instead
                            .ok_or_else(|| {
                                let expected = #quoted_self_datatype;
                                let actual = #data_src.data_type().clone();
                                DeserializationError::datatype_mismatch(expected, actual)
                            }).with_context(#obj_fqname)?;

                        if #data_src_type_ids.len() != #data_src_offsets.len() {
                            // NOTE: need exactly one offset per type id, since the offsets are
                            // indexed by type id position below!
                            return Err(DeserializationError::offset_slice_oob(
                                (0, #data_src_type_ids.len()), #data_src_offsets.len(),
                            )).with_context(#obj_fqname);
                        }

                        #(#quoted_field_deserializers;)*

                        #data_src_type_ids
                            .iter()
                            .enumerate()
                            .map(|(i, typ)| {
                                // NOTE: Array indexing is safe, checked above.
                                let offset = #data_src_offsets[i];

                                // Type id 0 is the `_null_markers` arm.
                                if *typ == 0 {
                                    Ok(None)
                                } else {
                                    Ok(Some(match typ {
                                        #(#quoted_branches,)*
                                        _ => {
                                            return Err(DeserializationError::missing_union_arm(
                                                #quoted_self_datatype, "<invalid>", *typ as _,
                                            ));
                                        }
                                    }))
                                }
                            })
                            // NOTE: implicit Vec<Result> to Result<Vec>
                            .collect::<DeserializationResult<Vec<_>>>()
                            .with_context(#obj_fqname)?
                    }
                }}
            }

            _ => unimplemented!("{datatype:#?}"),
        }
    }
}
474
/// Selects how the inner elements of a deserialized field are exposed by the generated code.
///
/// This is a fieldless, `Copy` flag. It derives `Debug` and `Eq` in addition to `PartialEq` so
/// that values can be printed in diagnostics and used in `assert_eq!`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum InnerRepr {
    /// The inner elements of the field should be exposed as `Buffer<T>`.
    ///
    /// This is only applicable when `T` is an arrow primitive.
    BufferT,

    /// The inner elements of the field should be exposed as an iterable of `T`.
    NativeIterable,
}
484
485/// This generates code that deserializes a runtime Arrow payload according to the specified `datatype`.
486///
487/// The `datatype` comes from our compile-time Arrow registry, not from the runtime payload!
488/// If the datatype happens to be a struct or union, this will merely inject a runtime call to
489/// `Loggable::from_arrow_opt` and call it a day, preventing code bloat.
490///
491/// `data_src` is the runtime identifier of the variable holding the Arrow payload (`&dyn ::arrow::array::Array`).
492/// The returned `TokenStream` always instantiates a `Vec<Option<T>>`.
493///
494/// This short-circuits on error using the `try` (`?`) operator: the outer scope must be one that
495/// returns a `Result<_, DeserializationError>`!
496#[allow(clippy::too_many_arguments)]
497fn quote_arrow_field_deserializer(
498    objects: &Objects,
499    datatype: &DataType,
500    quoted_datatype: &TokenStream,
501    is_nullable: bool,
502    obj_field_fqname: &str,
503    data_src: &proc_macro2::Ident, // &dyn ::arrow::array::Array
504    inner_repr: InnerRepr,
505) -> TokenStream {
506    _ = is_nullable; // not yet used, will be needed very soon
507
508    // If the inner object is an enum, then dispatch to its deserializer.
509    if let DataType::Extension(fqname, _, _) = datatype {
510        if objects.get(fqname).is_some_and(|obj| obj.is_enum()) {
511            let fqname_use = quote_fqname_as_type_path(fqname);
512            return quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter());
513        }
514    }
515
516    match datatype.to_logical_type() {
517        DataType::Int8
518        | DataType::Int16
519        | DataType::Int32
520        | DataType::Int64
521        | DataType::UInt8
522        | DataType::UInt16
523        | DataType::UInt32
524        | DataType::UInt64
525        | DataType::Float16
526        | DataType::Float32
527        | DataType::Float64
528        | DataType::Boolean
529        | DataType::Null => {
530            let quoted_iter_transparency =
531                quote_iterator_transparency(objects, datatype, IteratorKind::OptionValue, None);
532
533            let quoted_downcast = {
534                let cast_as = format!("{:?}", datatype.to_logical_type()).replace("DataType::", "");
535                let cast_as = format_ident!("{cast_as}Array");
536                quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
537            };
538
539            match inner_repr {
540                InnerRepr::BufferT => quote! {
541                    #quoted_downcast?
542                    .values()
543                },
544                InnerRepr::NativeIterable => quote! {
545                    #quoted_downcast?
546                        .into_iter() // NOTE: automatically checks the bitmap on our behalf
547                        #quoted_iter_transparency
548                },
549            }
550        }
551
552        DataType::Utf8 => {
553            let quoted_downcast = {
554                let cast_as = quote!(StringArray);
555                quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
556            };
557
558            let quoted_iter_transparency = quote_iterator_transparency(
559                objects,
560                datatype,
561                IteratorKind::ResultOptionValue,
562                quote!(::re_types_core::ArrowString::from).into(),
563            );
564
565            let data_src_buf = format_ident!("{data_src}_buf");
566
567            quote! {{
568                let #data_src = #quoted_downcast?;
569                let #data_src_buf = #data_src.values();
570
571                let offsets = #data_src.offsets();
572                ZipValidity::new_with_validity(
573                    offsets.windows(2),
574                    #data_src.nulls(),
575                )
576                .map(|elem| elem.map(|window| {
577                        // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
578
579                        let start = window[0] as usize;
580                        let end = window[1] as usize;
581                        let len = end - start;
582
583                        // NOTE: It is absolutely crucial we explicitly handle the
584                        // boundchecks manually first, otherwise rustc completely chokes
585                        // when slicing the data (as in: a 100x perf drop)!
586                        if #data_src_buf.len() < end {
587                            // error context is appended below during final collection
588                            return Err(DeserializationError::offset_slice_oob(
589                                (start, end), #data_src_buf.len(),
590                            ));
591                        }
592                        #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] // TODO(apache/arrow-rs#6900): slice_with_length_unchecked unsafe when https://github.com/apache/arrow-rs/pull/6901 is merged and released
593                        let data = #data_src_buf.slice_with_length(start, len);
594
595                        Ok(data)
596                    }).transpose()
597                )
598                #quoted_iter_transparency
599                // NOTE: implicit Vec<Result> to Result<Vec>
600                .collect::<DeserializationResult<Vec<Option<_>>>>()
601                .with_context(#obj_field_fqname)?
602                .into_iter()
603            }}
604        }
605
606        DataType::FixedSizeList(inner, length) => {
607            let data_src_inner = format_ident!("{data_src}_inner");
608            let quoted_inner = quote_arrow_field_deserializer(
609                objects,
610                inner.data_type(),
611                &quote_datatype(inner.data_type()),
612                inner.is_nullable,
613                obj_field_fqname,
614                &data_src_inner,
615                InnerRepr::NativeIterable,
616            );
617
618            let quoted_downcast = {
619                let cast_as = quote!(arrow::array::FixedSizeListArray);
620                quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
621            };
622
623            let quoted_iter_transparency = quote_iterator_transparency(
624                objects,
625                datatype,
626                IteratorKind::ResultOptionValue,
627                None,
628            );
629
630            let comment_note_unwrap =
631                quote_comment("NOTE: Unwrapping cannot fail: the length must be correct.");
632
633            quote! {{
634                let #data_src = #quoted_downcast?;
635                if #data_src.is_empty() {
636                    // NOTE: The outer container is empty and so we already know that the end result
637                    // is also going to be an empty vec.
638                    // Early out right now rather than waste time computing possibly many empty
639                    // datastructures for all of our children.
640                    Vec::new()
641                } else {
642                    let offsets = (0..).step_by(#length).zip((#length..).step_by(#length).take(#data_src.len()));
643
644                    let #data_src_inner = {
645                        let #data_src_inner = &**#data_src.values();
646                        #quoted_inner.collect::<Vec<_>>()
647                    };
648
649                    ZipValidity::new_with_validity(offsets, #data_src.nulls())
650                        .map(|elem| elem.map(|(start, end): (usize, usize)| {
651                                // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
652
653                                // We're manually generating our own offsets in this case, thus length
654                                // must be correct.
655                                debug_assert!(end - start == #length);
656
657                                // NOTE: It is absolutely crucial we explicitly handle the
658                                // boundchecks manually first, otherwise rustc completely chokes
659                                // when slicing the data (as in: a 100x perf drop)!
660                                if #data_src_inner.len() < end {
661                                    // error context is appended below during final collection
662                                    return Err(DeserializationError::offset_slice_oob(
663                                        (start, end), #data_src_inner.len(),
664                                    ));
665                                }
666                                // Safety: all checked above.
667                                #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
668                                let data = unsafe { #data_src_inner.get_unchecked(start..end) };
669
670                                // NOTE: The call to `Option::unwrap_or_default` is very important here.
671                                //
672                                // Since we can only get here if the outer entry is marked as
673                                // non-null, the only possible reason for the default() path
674                                // to be taken is because the inner field itself is nullable and
675                                // happens to have one or more nullable values in the referenced
676                                // slice.
677                                // This is perfectly fine, and when it happens, we need to fill the
678                                // resulting vec with some data, hence default().
679                                //
680                                // This does have a subtle implication though!
681                                // Since we never even look at the inner field's data when the outer
682                                // entry is null, it means we won't notice it if illegal/malformed/corrupt
683                                // in any way.
684                                // It is important that we turn a blind eye here, because most SDKs in
685                                // the ecosystem will put illegal data (e.g. null entries in an array of
686                                // non-null floats) in the inner buffer if the outer entry itself
687                                // is null.
688                                //
689                                // TODO(#2875): use MaybeUninit rather than requiring a default impl
690                                let data = data.iter().cloned().map(Option::unwrap_or_default);
691                                // The following would be the correct thing to do, but costs us way
692                                // too much performance-wise for something that only applies to
693                                // malformed inputs.
694                                //
695                                // // NOTE: We don't support nullable inner elements in our IDL, so
696                                // // this can only be a case of malformed data.
697                                // .map(|opt| opt.ok_or_else(DeserializationError::missing_data))
698                                // .collect::<DeserializationResult<Vec<_>>>()?;
699
700                                #comment_note_unwrap
701                                #[allow(clippy::unwrap_used)]
702                                Ok(array_init::from_iter(data).unwrap())
703                            }).transpose()
704                        )
705                        #quoted_iter_transparency
706                        // NOTE: implicit Vec<Result> to Result<Vec>
707                        .collect::<DeserializationResult<Vec<Option<_>>>>()?
708                }
709                .into_iter()
710            }}
711        }
712
713        DataType::List(inner) => {
714            let data_src_inner = format_ident!("{data_src}_inner");
715
716            let inner_repr = if is_backed_by_arrow_buffer(inner.data_type()) {
717                InnerRepr::BufferT
718            } else {
719                InnerRepr::NativeIterable
720            };
721
722            let quoted_inner = quote_arrow_field_deserializer(
723                objects,
724                inner.data_type(),
725                &quote_datatype(inner.data_type()),
726                inner.is_nullable,
727                obj_field_fqname,
728                &data_src_inner,
729                inner_repr,
730            );
731
732            let quoted_downcast = {
733                let cast_as = quote!(arrow::array::ListArray);
734                quote_array_downcast(obj_field_fqname, data_src, cast_as, quoted_datatype)
735            };
736            let quoted_collect_inner = match inner_repr {
737                InnerRepr::BufferT => quote!(),
738                InnerRepr::NativeIterable => quote!(.collect::<Vec<_>>()),
739            };
740
741            let quoted_inner_data_range = match inner_repr {
742                InnerRepr::BufferT => {
743                    quote! {
744                        #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)] // TODO(apache/arrow-rs#6900): unsafe slice_unchecked when https://github.com/apache/arrow-rs/pull/6901 is merged and released
745                        let data = #data_src_inner.clone().slice(start,  end - start);
746                        let data = ::re_types_core::ArrowBuffer::from(data);
747                    }
748                }
749                InnerRepr::NativeIterable => quote! {
750                    #[allow(unsafe_code, clippy::undocumented_unsafe_blocks)]
751                    let data = unsafe { #data_src_inner.get_unchecked(start..end) };
752
753                    // NOTE: The call to `Option::unwrap_or_default` is very important here.
754                    //
755                    // Since we can only get here if the outer oob is marked as
756                    // non-null, the only possible reason for the default() path
757                    // to be taken is because the inner field itself is nullable and
758                    // happens to have one or more nullable values in the referenced
759                    // slice.
760                    // This is perfectly fine, and when it happens, we need to fill the
761                    // resulting vec with some data, hence default().
762                    //
763                    // This does have a subtle implication though!
764                    // Since we never even look at the inner field's data when the outer
765                    // entry is null, it means we won't notice it if illegal/malformed/corrupt
766                    // in any way.
767                    // It is important that we turn a blind eye here, because most SDKs in
768                    // the ecosystem will put illegal data (e.g. null entries in an array of
769                    // non-null floats) in the inner buffer if the outer entry itself
770                    // is null.
771                    //
772                    // TODO(#2875): use MaybeUninit rather than requiring a default impl
773                    let data = data.iter().cloned().map(Option::unwrap_or_default).collect();
774                        // The following would be the correct thing to do, but costs us way
775                        // too much performance-wise for something that only applies to
776                        // malformed inputs.
777                        //
778                        // // NOTE: We don't support nullable inner elements in our IDL, so
779                        // // this can only be a case of malformed data.
780                        // .map(|opt| opt.ok_or_else(DeserializationError::missing_data))
781                        // .collect::<DeserializationResult<Vec<_>>>()?;
782                },
783            };
784
785            quote! {{
786                let #data_src = #quoted_downcast?;
787                if #data_src.is_empty() {
788                    // NOTE: The outer container is empty and so we already know that the end result
789                    // is also going to be an empty vec.
790                    // Early out right now rather than waste time computing possibly many empty
791                    // datastructures for all of our children.
792                    Vec::new()
793                } else {
794                    let #data_src_inner = {
795                        let #data_src_inner = &**#data_src.values();
796                        #quoted_inner #quoted_collect_inner
797                    };
798
799                    let offsets = #data_src.offsets();
800                    ZipValidity::new_with_validity(
801                        offsets.windows(2),
802                        #data_src.nulls(),
803                    )
804                    .map(|elem| elem.map(|window| {
805                            // NOTE: Do _not_ use `Buffer::sliced`, it panics on malformed inputs.
806
807                            let start = window[0] as usize;
808                            let end = window[1] as usize;
809
810                            // NOTE: It is absolutely crucial we explicitly handle the
811                            // boundchecks manually first, otherwise rustc completely chokes
812                            // when slicing the data (as in: a 100x perf drop)!
813                            if #data_src_inner.len() < end {
814                                // error context is appended below during final collection
815                                return Err(DeserializationError::offset_slice_oob(
816                                    (start, end), #data_src_inner.len(),
817                                ));
818                            }
819
820                            #quoted_inner_data_range
821
822                            Ok(data)
823                        }).transpose()
824                    )
825                    // NOTE: implicit Vec<Result> to Result<Vec>
826                    .collect::<DeserializationResult<Vec<Option<_>>>>()?
827                }
828                .into_iter()
829            }}
830        }
831
832        DataType::Struct(_) | DataType::Union(_, _, _) => {
833            let DataType::Extension(fqname, _, _) = datatype else {
834                unreachable!()
835            };
836            let fqname_use = quote_fqname_as_type_path(fqname);
837            quote!(#fqname_use::from_arrow_opt(#data_src).with_context(#obj_field_fqname)?.into_iter())
838        }
839
840        _ => unimplemented!("{datatype:#?}"),
841    }
842}
843
844fn quote_datatype(datatype: &DataType) -> TokenStream {
845    let is_recursive = false;
846    let expected = ArrowDataTypeTokenizer(datatype, is_recursive);
847    quote! { #expected }
848}
849
850/// Generates tokens that downcast the runtime Arrow array identifier by `arr` as `cast_as`, making sure
851/// to inject proper error handling.
852fn quote_array_downcast(
853    location: impl AsRef<str>,
854    arr: &syn::Ident,
855    cast_as: impl quote::ToTokens,
856    quoted_expected_datatype: &TokenStream,
857) -> TokenStream {
858    let location = location.as_ref();
859    let cast_as = cast_as.to_token_stream();
860    quote! {
861        #arr
862            .as_any()
863            .downcast_ref::<#cast_as>()
864            .ok_or_else(|| {
865                let expected = #quoted_expected_datatype;
866                let actual = #arr.data_type().clone();
867                DeserializationError::datatype_mismatch(expected, actual)
868            })
869            .with_context(#location)
870    }
871}
872
/// Describes the item type of the iterator that [`quote_iterator_transparency`] has to map over.
///
/// The kind determines how many layers of `.map(..)` must be peeled before the innermost value
/// `T` can be reached.
// Not every variant is necessarily constructed by the codegen (presumably why `dead_code` is
// allowed here).
#[allow(dead_code)]
#[derive(Clone, Copy, Debug)]
enum IteratorKind {
    /// Items are `DeserializationResult<Option<T>>`: two layers around `T`.
    ResultOptionValue,

    /// Items are `Option<DeserializationResult<T>>`: two layers around `T`.
    OptionResultValue,

    /// Items are `Option<T>`: a single layer around `T`.
    OptionValue,

    /// Items are `DeserializationResult<T>`: a single layer around `T`.
    ResultValue,

    /// Items are bare `T` values.
    Value,
}
891
892/// This generates code that maps the data in an iterator in order to apply the Arrow transparency
893/// rules to it, if necessary.
894///
895/// This can often become a very difficult job due to all the affixes that might be involved:
896/// fallibility, nullability, transparency, tuple structs…
897/// This function will just do the right thing.
898///
899/// If `extra_wrapper` is specified, this will also wrap the resulting data in `$extra_wrapper(data)`.
900///
901/// Have a look around in this file for examples of use.
902fn quote_iterator_transparency(
903    objects: &Objects,
904    datatype: &DataType,
905    iter_kind: IteratorKind,
906    extra_wrapper: Option<TokenStream>,
907) -> TokenStream {
908    #![allow(clippy::collapsible_else_if)]
909
910    let inner_obj = if let DataType::Extension(fqname, _, _) = datatype {
911        Some(&objects[fqname])
912    } else {
913        None
914    };
915    let inner_is_arrow_transparent = inner_obj.is_some_and(|obj| obj.datatype.is_none());
916
917    if inner_is_arrow_transparent {
918        let inner_obj = inner_obj.as_ref().unwrap();
919        let quoted_inner_obj_type = quote_fqname_as_type_path(&inner_obj.fqname);
920
921        let is_tuple_struct = is_tuple_struct_from_obj(inner_obj);
922        let quoted_data_dst = format_ident!(
923            "{}",
924            if is_tuple_struct {
925                "data0"
926            } else {
927                inner_obj.fields[0].name.as_str()
928            }
929        );
930
931        let quoted_binding = if is_tuple_struct {
932            if let Some(extra_wrapper) = extra_wrapper {
933                quote!(|v| #quoted_inner_obj_type(#extra_wrapper(v)))
934            } else {
935                quote!(#quoted_inner_obj_type)
936            }
937        } else {
938            if let Some(extra_wrapper) = extra_wrapper {
939                quote!(|#quoted_data_dst| #quoted_inner_obj_type { #quoted_data_dst: #extra_wrapper(v) })
940            } else {
941                quote!(|#quoted_data_dst| #quoted_inner_obj_type { #quoted_data_dst })
942            }
943        };
944
945        match iter_kind {
946            IteratorKind::ResultOptionValue | IteratorKind::OptionResultValue => {
947                quote!(.map(|res_or_opt| res_or_opt.map(|res_or_opt| res_or_opt.map(#quoted_binding))))
948            }
949            IteratorKind::OptionValue | IteratorKind::ResultValue => {
950                quote!(.map(|res_or_opt| res_or_opt.map(#quoted_binding)))
951            }
952            IteratorKind::Value => quote!(.map(#quoted_binding)),
953        }
954    } else {
955        if let Some(extra_wrapper) = extra_wrapper {
956            let quoted_binding = quote!(|v| #extra_wrapper(v));
957            match iter_kind {
958                IteratorKind::ResultOptionValue | IteratorKind::OptionResultValue => {
959                    quote!(.map(|res_or_opt| res_or_opt.map(|res_or_opt| res_or_opt.map(#quoted_binding))))
960                }
961                IteratorKind::OptionValue | IteratorKind::ResultValue => {
962                    quote!(.map(|res_or_opt| res_or_opt.map(#quoted_binding)))
963                }
964                IteratorKind::Value => quote!(.map(#quoted_binding)),
965            }
966        } else {
967            quote!()
968        }
969    }
970}
971
972/// This generates code that deserializes a runtime Arrow payload into the specified `obj`, taking
973/// Arrow-transparency into account.
974///
975/// It contains additional performance optimizations based on the inner-type being a non-nullable primitive
976/// allowing us to map directly to slices rather than iterating. The ability to use this optimization is
977/// determined by [`should_optimize_buffer_slice_deserialize`].
978///
979/// There is a 1:1 relationship between `quote_arrow_deserializer_buffer_slice` and `Loggable::from_arrow`:
980/// ```ignore
981/// fn from_arrow(data: &dyn ::arrow::array::Array) -> DeserializationResult<Vec<Self>> {
982///     Ok(#quoted_deserializer_)
983/// }
984/// ```
985///
986/// See [`quote_arrow_deserializer_buffer_slice`] for additional information.
987pub fn quote_arrow_deserializer_buffer_slice(
988    arrow_registry: &ArrowRegistry,
989    objects: &Objects,
990    obj: &Object,
991) -> TokenStream {
992    // Runtime identifier of the variable holding the Arrow payload (`&dyn ::arrow::array::Array`).
993    let data_src = format_ident!("arrow_data");
994
995    let datatype = &arrow_registry.get(&obj.fqname);
996
997    let is_arrow_transparent = obj.datatype.is_none();
998    let is_tuple_struct = is_tuple_struct_from_obj(obj);
999
1000    if is_arrow_transparent {
1001        // NOTE: Arrow transparent objects must have a single field, no more no less.
1002        // The semantic pass would have failed already if this wasn't the case.
1003        debug_assert!(obj.fields.len() == 1);
1004        let obj_field = &obj.fields[0];
1005        let obj_field_fqname = obj_field.fqname.as_str();
1006
1007        let data_dst = format_ident!(
1008            "{}",
1009            if is_tuple_struct {
1010                "data0"
1011            } else {
1012                obj_field.name.as_str()
1013            }
1014        );
1015
1016        let datatype = arrow_registry.get(&obj_field.fqname);
1017        let deserizlized_as_slice = quote_arrow_field_deserializer_buffer_slice(
1018            &datatype,
1019            obj_field.is_nullable,
1020            obj_field_fqname,
1021            &data_src,
1022        );
1023
1024        let quoted_iter_transparency =
1025            quote_iterator_transparency(objects, &datatype, IteratorKind::Value, None);
1026        let quoted_iter_transparency = quote!(.copied() #quoted_iter_transparency);
1027
1028        let quoted_remapping = if is_tuple_struct {
1029            quote!(.map(Self))
1030        } else {
1031            quote!(.map(|#data_dst| Self { #data_dst }))
1032        };
1033
1034        quote! {{
1035            let slice = #deserizlized_as_slice;
1036
1037            {
1038                // NOTE(#3850): Don't add a profile scope here: the profiler overhead is too big for this fast function.
1039                // re_tracing::profile_scope!("collect");
1040
1041                slice
1042                    .iter()
1043                    #quoted_iter_transparency
1044                    #quoted_remapping
1045                    .collect::<Vec<_>>()
1046            }
1047        }}
1048    } else {
1049        unimplemented!("{datatype:#?}")
1050    }
1051}
1052
1053/// This generates code that deserializes a runtime Arrow payload according to the specified `datatype`.
1054///
1055/// It contains additional performance optimizations based on the inner-type being a non-nullable primitive
1056/// allowing us to map directly to slices rather than iterating. The ability to use this optimization is
1057/// determined by [`should_optimize_buffer_slice_deserialize`].
1058///
1059/// See [`quote_arrow_field_deserializer`] for additional information.
1060fn quote_arrow_field_deserializer_buffer_slice(
1061    datatype: &DataType,
1062    is_nullable: bool,
1063    obj_field_fqname: &str,
1064    data_src: &proc_macro2::Ident, // &dyn ::arrow::array::Array
1065) -> TokenStream {
1066    _ = is_nullable; // not yet used, will be needed very soon
1067
1068    match datatype.to_logical_type() {
1069        DataType::Int8
1070        | DataType::Int16
1071        | DataType::Int32
1072        | DataType::Int64
1073        | DataType::UInt8
1074        | DataType::UInt16
1075        | DataType::UInt32
1076        | DataType::UInt64
1077        | DataType::Float16
1078        | DataType::Float32
1079        | DataType::Float64 => {
1080            let quoted_downcast = {
1081                let cast_as = format!("{:?}", datatype.to_logical_type()).replace("DataType::", "");
1082                let cast_as = format_ident!("{cast_as}Array"); // e.g. `Uint32Array`
1083                quote_array_downcast(
1084                    obj_field_fqname,
1085                    data_src,
1086                    cast_as,
1087                    &quote_datatype(datatype),
1088                )
1089            };
1090
1091            quote! {
1092                #quoted_downcast?
1093                .values()
1094                .as_ref()
1095            }
1096        }
1097
1098        DataType::FixedSizeList(inner, length) => {
1099            let data_src_inner = format_ident!("{data_src}_inner");
1100            let quoted_inner = quote_arrow_field_deserializer_buffer_slice(
1101                inner.data_type(),
1102                inner.is_nullable,
1103                obj_field_fqname,
1104                &data_src_inner,
1105            );
1106
1107            let quoted_downcast = {
1108                let cast_as = quote!(arrow::array::FixedSizeListArray);
1109                quote_array_downcast(
1110                    obj_field_fqname,
1111                    data_src,
1112                    cast_as,
1113                    &quote_datatype(datatype),
1114                )
1115            };
1116
1117            quote! {{
1118                let #data_src = #quoted_downcast?;
1119
1120                let #data_src_inner = &**#data_src.values();
1121                bytemuck::cast_slice::<_, [_; #length]>(#quoted_inner)
1122            }}
1123        }
1124
1125        _ => unimplemented!("{datatype:#?}"),
1126    }
1127}
1128
1129/// Whether or not this object allows for the buffer-slice optimizations.
1130///
1131/// These optimizations require the outer type to be non-nullable and made up exclusively
1132/// of primitive types.
1133///
1134/// Note that nullabillity is kind of weird since it's technically a property of the field
1135/// rather than the datatype.
1136/// Components can only be used by archetypes so they should never be nullable, but for datatypes
1137/// we might need both.
1138///
1139/// This should always be checked before using [`quote_arrow_deserializer_buffer_slice`].
1140pub fn should_optimize_buffer_slice_deserialize(
1141    obj: &Object,
1142    arrow_registry: &ArrowRegistry,
1143) -> bool {
1144    let is_arrow_transparent = obj.datatype.is_none();
1145    if is_arrow_transparent {
1146        let typ = arrow_registry.get(&obj.fqname);
1147        let obj_field = &obj.fields[0];
1148        !obj_field.is_nullable && should_optimize_buffer_slice_deserialize_datatype(&typ)
1149    } else {
1150        false
1151    }
1152}
1153
1154/// Whether or not this datatype allows for the buffer slice optimizations.
1155fn should_optimize_buffer_slice_deserialize_datatype(typ: &DataType) -> bool {
1156    match typ {
1157        DataType::Int8
1158        | DataType::Int16
1159        | DataType::Int32
1160        | DataType::Int64
1161        | DataType::UInt8
1162        | DataType::UInt16
1163        | DataType::UInt32
1164        | DataType::UInt64
1165        | DataType::Float16
1166        | DataType::Float32
1167        | DataType::Float64 => true,
1168        DataType::Extension(_, typ, _) => should_optimize_buffer_slice_deserialize_datatype(typ),
1169        DataType::FixedSizeList(field, _) => {
1170            should_optimize_buffer_slice_deserialize_datatype(field.data_type())
1171        }
1172        _ => false,
1173    }
1174}