Skip to main content

df_derive_macros/
lib.rs

1//! Proc-macro implementation crate for `df-derive`.
2//!
3//! Most users should depend on the `df-derive` facade, which re-exports
4//! this macro and the default runtime traits from `df-derive-core`. Depend
5//! on `df-derive-macros` directly when you want to target `paft`,
6//! `df-derive-core`, or a custom runtime without the facade.
7//!
8//! Explicit `#[df_derive(trait = "...")]` selects a custom runtime path.
9//! Explicit paths to the built-in `df_derive::dataframe::ToDataFrame` or
10//! `df_derive_core::dataframe::ToDataFrame` runtimes are treated as the
11//! default runtime and still use the runtime's hidden dependency re-exports.
12//! `columnar = "..."` may be provided alongside `trait = "..."`, and
13//! `decimal128_encode = "..."` may override decimal dispatch. Built-in
14//! dataframe runtime paths cannot be mixed with custom `columnar` paths.
15//! Without runtime overrides, discovery tries `df-derive`, `df-derive-core`,
16//! `paft-utils`, `paft`, then the `crate::core::dataframe` fallback.
17#![warn(missing_docs)]
18extern crate proc_macro;
19
20mod attrs;
21mod codegen;
22mod ir;
23mod lower;
24mod parser;
25mod type_analysis;
26use proc_macro::TokenStream;
27use syn::{DeriveInput, parse_macro_input};
28
29/// Derive `ToDataFrame` for structs and tuple structs to generate fast conversions to Polars.
30///
31/// What this macro generates (paths configurable via `#[df_derive(...)]`):
32///
33/// - An implementation of `ToDataFrame` for the annotated type `T` providing:
34///   - `fn to_dataframe(&self) -> PolarsResult<DataFrame>`
35///   - `fn empty_dataframe() -> PolarsResult<DataFrame>`
36///   - `fn schema() -> PolarsResult<Vec<(String, DataType)>>`
37/// - An implementation of `Columnar` for `T` providing
38///   `fn columnar_to_dataframe(items: &[Self]) -> PolarsResult<DataFrame>` and
39///   `fn columnar_from_refs(items: &[&Self]) -> PolarsResult<DataFrame>`.
40///   The direct slice method avoids the trait default's temporary ref-vector
41///   allocation on top-level batch conversion; the borrowed method remains
42///   available for nested and generic composition.
43///
44/// Supported shapes and types:
45///
46/// - Named and tuple structs (tuple fields are named `field_{index}`)
47/// - Nested structs are flattened using dot notation (e.g., `outer.inner`)
48/// - Wrappers `Option<T>` and `Vec<T>` in any nesting order, with `Vec<Struct>` producing multiple
49///   list columns with a `vec_field.subfield` prefix
50/// - Primitive types: `String`, `bool`, integer types including `i128`/`u128`,
51///   `std::num::NonZero*` integer types, `f32`, `f64`
52/// - `chrono::DateTime<Tz>` and `chrono::NaiveDateTime` (default:
53///   `Datetime(Milliseconds, None)`; override with `#[df_derive(time_unit = "ms"|"us"|"ns")]`).
54///   `DateTime<Tz>` stores the UTC instant; use `as_string` when the textual timezone/offset
55///   matters.
56/// - `chrono::NaiveDate` (`Date`, i32 days since 1970-01-01) and `chrono::NaiveTime`
57///   (`Time`, i64 ns since midnight); both have fixed encodings, no unit override.
58/// - `std::time::Duration`, `core::time::Duration`, and `chrono::Duration` (alias for
59///   `chrono::TimeDelta`) → `Duration(Nanoseconds)` by default; override with
60///   `#[df_derive(time_unit = "ms"|"us"|"ns")]`. Bare `Duration` is ambiguous and rejected.
61/// - Decimal backends written as bare `Decimal` or `rust_decimal::Decimal`
62///   (default: `Decimal(38, 10)`; override with
63///   `#[df_derive(decimal(precision = N, scale = N))]`). Custom backends opt in
64///   with explicit `decimal(...)` and a `Decimal128Encode` impl.
65///
66/// Attributes:
67///
68/// - Container-level: `#[df_derive(trait = "path::ToDataFrame")]` to set the `ToDataFrame` trait
69///   path; the `Columnar` and `Decimal128Encode` paths are inferred by replacing the last
70///   path segment with `Columnar` / `Decimal128Encode`. Optionally, set them explicitly with
71///   `#[df_derive(columnar = "path::Columnar")]` and
72///   `#[df_derive(decimal128_encode = "path::Decimal128Encode")]`. A `columnar` override
73///   must be paired with `trait` to avoid mixed-runtime impls. `decimal128_encode` is the
74///   dispatch point for `rust_decimal::Decimal` / `bigdecimal::BigDecimal` / other decimal
75///   backends — see "Custom decimal backends" in the README for the trait contract. Explicit
76///   paths to `df_derive::dataframe::ToDataFrame` or
77///   `df_derive_core::dataframe::ToDataFrame` keep using the default runtime's hidden
78///   dependency re-exports and cannot be paired with a custom `columnar` path;
79///   other explicit trait paths are treated as custom runtimes.
80/// - Field-level: `#[df_derive(skip)]` to omit a field from generated schema
81///   and `DataFrame` output. Skipped fields are not type-analyzed, so this can
82///   be used for caches, handles, source metadata, or other helper values that
83///   should remain on the Rust struct but not become columns. Mutually
84///   exclusive with conversion attributes.
85/// - Field-level: `#[df_derive(as_string)]` to stringify values via `Display` (e.g., enums) during
86///   conversion, resulting in `DataType::String` or `List<String>`. Generated encoders reuse a
87///   `String` scratch buffer per field; the column builder still copies the formatted bytes.
88/// - Field-level: `#[df_derive(as_str)]` to borrow `&str` via `AsRef<str>` for the duration of the
89///   conversion. Same column type as `as_string` but avoids `Display` formatting and the
90///   intermediate scratch buffer. The two attributes are mutually exclusive on a given field.
91/// - Field-level: `#[df_derive(as_binary)]` to route a `Vec<u8>`, `&[u8]`, or
92///   `Cow<'_, [u8]>` field through a Polars `Binary` column instead of the default
93///   `List(UInt8)` for `Vec<u8>`. Accepted shapes:
94///   `Vec<u8>`, `Option<Vec<u8>>`, `Vec<Vec<u8>>`, `Vec<Option<Vec<u8>>>`,
///   `Option<Vec<Vec<u8>>>`, and the same scalar/list shapes over `&[u8]` and `Cow<'_, [u8]>`.
///   Bare `u8`, `Option<u8>`, `Vec<Option<u8>>` (`BinaryView` cannot carry per-byte nulls), and
///   non-`u8` leaves are rejected at parse time. Mutually exclusive with `as_str`,
98///   `as_string`, `decimal(...)`, and `time_unit = "..."`.
99/// - Field-level: `#[df_derive(decimal(precision = N, scale = N))]` to choose the
100///   `Decimal(precision, scale)` dtype for a built-in decimal path or to explicitly opt a
101///   custom/generic decimal backend into `Decimal128Encode` dispatch. Polars requires
102///   `1 <= precision <= 38`; `scale` may not exceed `precision`.
103/// - Field-level: `#[df_derive(time_unit = "ms"|"us"|"ns")]` to choose the
104///   `Datetime(unit, None)` / `Duration(unit)` dtype for a temporal field. Accepted bases are
105///   `chrono::DateTime<Tz>`, `chrono::NaiveDateTime`, `std::time::Duration`,
106///   `core::time::Duration`, and `chrono::Duration`. The chrono / std call used to derive the
107///   i64 matches the chosen unit, so values are not silently truncated. `time_unit = "ns"` on
108///   `DateTime<Tz>` or `NaiveDateTime` is fallible on dates outside chrono's supported
109///   nanosecond range (~1677–2262); `time_unit = "ns"`/`"us"` on `chrono::Duration` is fallible
110///   when the duration overflows i64 in the chosen unit; on `std::time::Duration` every unit is
111///   fallible (the value type is `u128`). All failures surface as `PolarsError::ComputeError`
112///   rather than silently corrupting data. `time_unit` is rejected on `chrono::NaiveDate` and
113///   `chrono::NaiveTime` (both have fixed encodings).
114/// - The `decimal(...)` attribute can only be applied to decimal backend candidates: type paths
115///   named `Decimal`, custom struct types, or generic type parameters that implement
116///   `Decimal128Encode`. It cannot be combined with `as_str`/`as_string`/`time_unit` on the same
117///   field. The `time_unit = "..."` attribute is also mutually exclusive with
118///   `as_str`/`as_string`.
119///
120/// Notes:
121///
122/// - Enums are not supported for derive.
123/// - Generic structs are supported; the macro adds bounds only for the roles a
124///   generic parameter actually plays (`ToDataFrame + Columnar` for nested
125///   dataframe payloads, `AsRef<str>` for generic `as_str`, and
126///   `Decimal128Encode` for generic decimal backends). The unit type `()` is a
127///   valid generic payload (zero columns); direct `field: ()` fields are
128///   rejected.
129/// - All nested custom structs must also derive `ToDataFrame`.
130/// - Obvious direct self-recursive nested fields using `Self`, the bare
131///   deriving struct name, `self::Type`, or `crate::Type` are rejected after
132///   transparent wrapper peeling, including `Box<T>`/`Option<Box<T>>` forms
133///   and tuple fields containing the same.
134/// - Empty structs: `to_dataframe` yields a single-row, zero-column `DataFrame`; the columnar path
135///   yields a zero-column `DataFrame` with `items.len()` rows.
136#[proc_macro_derive(ToDataFrame, attributes(df_derive))]
137pub fn to_dataframe_derive(input: TokenStream) -> TokenStream {
138    // Parse the input tokens into a syntax tree
139    let ast = parse_macro_input!(input as DeriveInput);
140    let config = match codegen::build_macro_config(&ast) {
141        Ok(config) => config,
142        Err(e) => return e.to_compile_error().into(),
143    };
144
145    // Build the intermediate representation
146    let ir = match parser::parse_to_ir(&ast) {
147        Ok(ir) => ir,
148        Err(e) => return e.to_compile_error().into(),
149    };
150
151    // Delegate to the codegen orchestrator
152    let generated = codegen::generate_code(&ir, &config);
153    TokenStream::from(generated)
154}