df-derive-macros 0.3.0

Procedural derive macro implementation for df-derive.
Documentation
use proc_macro2::TokenStream;
use quote::quote;

use crate::codegen::external_paths::ExternalPaths;
use crate::ir::VecLayers;

use super::idents::{self, LayerIdents};
use super::shape_walk::{
    shape_assemble_list_stack, shape_freeze_offsets_buffers, shape_freeze_validity_bitmaps,
    shape_layer_wraps_clone,
};

pub(super) struct NestedMaterializeCtx<'a> {
    pub field_idx: usize,
    pub ty: &'a TokenStream,
    pub column_prefix: &'a str,
    pub flat: &'a syn::Ident,
    pub positions: Option<&'a syn::Ident>,
    pub total_len: TokenStream,
    pub wrapper: NestedWrapper<'a>,
    pub columnar_trait: &'a syn::Path,
    pub to_df_trait: &'a syn::Path,
    pub paths: &'a ExternalPaths,
}

#[derive(Clone, Copy)]
pub(super) enum NestedWrapper<'a> {
    None,
    List {
        shape: &'a VecLayers,
        layers: &'a [LayerIdents],
        arr_id_for_layer: fn(usize) -> syn::Ident,
    },
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(super) enum NestedMaterializeKind {
    LeafBare,
    LeafOptional,
    Vec { has_inner_option: bool },
}

pub(super) struct NestedMaterializeBranches {
    pub validity_freeze: TokenStream,
    pub offsets_freeze: TokenStream,
    pub df_decl: TokenStream,
    pub take_decl: TokenStream,
    pub consume_direct: TokenStream,
    pub consume_take: TokenStream,
    pub consume_empty: TokenStream,
    pub consume_all_absent: TokenStream,
}

pub(super) fn nested_materialize_dispatch(
    kind: NestedMaterializeKind,
    flat: &syn::Ident,
    total_len: &TokenStream,
    branches: NestedMaterializeBranches,
) -> TokenStream {
    let NestedMaterializeBranches {
        validity_freeze,
        offsets_freeze,
        df_decl,
        take_decl,
        consume_direct,
        consume_take,
        consume_empty,
        consume_all_absent,
    } = branches;

    match kind {
        NestedMaterializeKind::LeafBare => {
            quote! {
                #df_decl
                #consume_direct
            }
        }
        NestedMaterializeKind::LeafOptional => {
            quote! {
                if #flat.is_empty() {
                    #consume_all_absent
                } else if #flat.len() == #total_len {
                    #df_decl
                    #consume_direct
                } else {
                    #df_decl
                    #take_decl
                    #consume_take
                }
            }
        }
        NestedMaterializeKind::Vec {
            has_inner_option: true,
        } => {
            quote! {
                #validity_freeze
                if #total_len == 0 {
                    #offsets_freeze
                    #consume_empty
                } else if #flat.is_empty() {
                    #offsets_freeze
                    #consume_all_absent
                } else if #flat.len() == #total_len {
                    #df_decl
                    #offsets_freeze
                    #consume_direct
                } else {
                    #df_decl
                    #take_decl
                    #offsets_freeze
                    #consume_take
                }
            }
        }
        NestedMaterializeKind::Vec {
            has_inner_option: false,
        } => {
            quote! {
                #validity_freeze
                if #flat.is_empty() {
                    #offsets_freeze
                    #consume_empty
                } else {
                    #df_decl
                    #offsets_freeze
                    #consume_direct
                }
            }
        }
    }
}

pub(super) fn nested_df_decl(
    df: &syn::Ident,
    ty: &TokenStream,
    columnar_trait: &syn::Path,
    flat: &syn::Ident,
) -> TokenStream {
    let validate_nested_frame = idents::validate_nested_frame();
    quote! {
        let #df = <#ty as #columnar_trait>::columnar_from_refs(&#flat)?;
        #validate_nested_frame(&#df, #flat.len(), ::core::any::type_name::<#ty>())?;
    }
}

pub(super) fn nested_take_decl(
    take: &syn::Ident,
    positions: &syn::Ident,
    pp: &TokenStream,
) -> TokenStream {
    quote! {
        let #take: #pp::IdxCa =
            <#pp::IdxCa as #pp::NewChunkedArray<_, _>>::from_iter_options(
                "".into(),
                #positions.iter().copied(),
            );
    }
}

#[derive(Clone, Copy)]
pub(super) struct NestedColumnIdents<'a> {
    pub df: &'a syn::Ident,
    pub take: &'a syn::Ident,
    pub col_name: &'a syn::Ident,
    pub dtype: &'a syn::Ident,
    pub inner_full: &'a syn::Ident,
}

pub(super) fn build_inner_col_direct(ids: NestedColumnIdents<'_>) -> TokenStream {
    let validate_nested_column_dtype = idents::validate_nested_column_dtype();
    let NestedColumnIdents {
        df,
        col_name,
        dtype,
        inner_full,
        ..
    } = ids;
    quote! {{
        let #inner_full = #df.column(#col_name)?.as_materialized_series();
        #validate_nested_column_dtype(#inner_full, #col_name, #dtype)?;
        #inner_full.clone()
    }}
}

pub(super) fn build_inner_col_take(ids: NestedColumnIdents<'_>) -> TokenStream {
    let validate_nested_column_dtype = idents::validate_nested_column_dtype();
    let NestedColumnIdents {
        df,
        take,
        col_name,
        dtype,
        inner_full,
    } = ids;
    quote! {{
        let #inner_full = #df
            .column(#col_name)?
            .as_materialized_series();
        #validate_nested_column_dtype(#inner_full, #col_name, #dtype)?;
        #inner_full.take(&#take)?
    }}
}

pub(super) fn build_inner_col_empty(dtype: &syn::Ident, pp: &TokenStream) -> TokenStream {
    quote! {
        #pp::Series::new_empty("".into(), #dtype)
    }
}

pub(super) fn build_inner_col_all_absent(
    dtype: &syn::Ident,
    len: &TokenStream,
    pp: &TokenStream,
) -> TokenStream {
    quote! {
        #pp::Series::new_empty("".into(), #dtype)
            .extend_constant(#pp::AnyValue::Null, #len)?
    }
}

fn wrap_nested_column(
    wrapper: &NestedWrapper<'_>,
    inner_col_expr: &TokenStream,
    dtype: &syn::Ident,
    pp: &TokenStream,
    pa_root: &TokenStream,
) -> TokenStream {
    let NestedWrapper::List {
        shape,
        layers,
        arr_id_for_layer,
    } = wrapper
    else {
        return inner_col_expr.clone();
    };
    let inner_chunk = idents::nested_inner_chunk();
    let inner_col = idents::nested_inner_col();
    let inner_rech = idents::nested_inner_rech();
    let chunk_decl = quote! {
        let #inner_col: #pp::Series = #inner_col_expr;
        let #inner_rech = #inner_col.rechunk();
        let #inner_chunk: #pp::ArrayRef = #inner_rech.chunks()[0].clone();
    };
    let wrap_layers = shape_layer_wraps_clone(shape, layers);
    let stack = shape_assemble_list_stack(
        quote! { #inner_chunk },
        quote! { #inner_chunk.dtype().clone() },
        &wrap_layers,
        quote! { (*#dtype).clone() },
        pp,
        pa_root,
        arr_id_for_layer,
    );
    quote! {{
        #chunk_decl
        #stack
    }}
}

pub(super) fn materialize_nested_columns(ctx: &NestedMaterializeCtx<'_>) -> TokenStream {
    let pp = ctx.paths.prelude();
    let pa_root = ctx.paths.polars_arrow_root();
    let df = idents::nested_df(ctx.field_idx);
    let take = idents::nested_take(ctx.field_idx);
    let columns = idents::columns();
    let col_name = idents::nested_col_name();
    let dtype = idents::nested_col_dtype();
    let inner_full = idents::nested_inner_full();

    let column_idents = NestedColumnIdents {
        df: &df,
        take: &take,
        col_name: &col_name,
        dtype: &dtype,
        inner_full: &inner_full,
    };
    let inner_col_direct = build_inner_col_direct(column_idents);
    let inner_col_take = build_inner_col_take(column_idents);
    let inner_col_empty = build_inner_col_empty(&dtype, pp);
    let inner_col_all_absent = build_inner_col_all_absent(&dtype, &ctx.total_len, pp);

    let series_direct = wrap_nested_column(&ctx.wrapper, &inner_col_direct, &dtype, pp, pa_root);
    let series_take = wrap_nested_column(&ctx.wrapper, &inner_col_take, &dtype, pp, pa_root);
    let series_empty = wrap_nested_column(&ctx.wrapper, &inner_col_empty, &dtype, pp, pa_root);
    let series_all_absent =
        wrap_nested_column(&ctx.wrapper, &inner_col_all_absent, &dtype, pp, pa_root);

    let consume_direct = consume_nested_columns(
        &columns,
        ctx.column_prefix,
        ctx.to_df_trait,
        ctx.ty,
        &series_direct,
        pp,
    );
    let consume_take = consume_nested_columns(
        &columns,
        ctx.column_prefix,
        ctx.to_df_trait,
        ctx.ty,
        &series_take,
        pp,
    );
    let consume_empty = consume_nested_columns(
        &columns,
        ctx.column_prefix,
        ctx.to_df_trait,
        ctx.ty,
        &series_empty,
        pp,
    );
    let consume_all_absent = consume_nested_columns(
        &columns,
        ctx.column_prefix,
        ctx.to_df_trait,
        ctx.ty,
        &series_all_absent,
        pp,
    );

    let df_decl = nested_df_decl(&df, ctx.ty, ctx.columnar_trait, ctx.flat);
    let take_decl = ctx.positions.map_or_else(TokenStream::new, |positions| {
        nested_take_decl(&take, positions, pp)
    });

    let (kind, validity_freeze, offsets_freeze) = match ctx.wrapper {
        NestedWrapper::None => {
            let kind = if ctx.positions.is_some() {
                NestedMaterializeKind::LeafOptional
            } else {
                NestedMaterializeKind::LeafBare
            };
            (kind, TokenStream::new(), TokenStream::new())
        }
        NestedWrapper::List { shape, layers, .. } => (
            NestedMaterializeKind::Vec {
                has_inner_option: ctx.positions.is_some(),
            },
            shape_freeze_validity_bitmaps(shape, layers, pa_root),
            shape_freeze_offsets_buffers(layers, pa_root),
        ),
    };

    nested_materialize_dispatch(
        kind,
        ctx.flat,
        &ctx.total_len,
        NestedMaterializeBranches {
            validity_freeze,
            offsets_freeze,
            df_decl,
            take_decl,
            consume_direct,
            consume_take,
            consume_empty,
            consume_all_absent,
        },
    )
}

pub(super) fn consume_nested_columns(
    columns: &syn::Ident,
    parent_name: &str,
    to_df_trait: &syn::Path,
    ty: &TokenStream,
    series_expr: &TokenStream,
    pp: &TokenStream,
) -> TokenStream {
    let col_name = idents::nested_col_name();
    let dtype = idents::nested_col_dtype();
    let prefixed = idents::nested_prefixed_name();
    let inner = idents::nested_inner_series();
    let named = idents::field_named_series();
    quote! {
        for (#col_name, #dtype) in
            <#ty as #to_df_trait>::schema()?
        {
            let #col_name: &str = #col_name.as_str();
            let #dtype: &#pp::DataType = &#dtype;
            {
                let #prefixed = ::std::format!(
                    "{}.{}", #parent_name, #col_name,
                );
                let #inner: #pp::Series = #series_expr;
                let #named = #inner
                    .with_name(#prefixed.as_str().into());
                #columns.push(#named.into());
            }
        }
    }
}