df_derive_macros/lib.rs
1//! Proc-macro implementation crate for `df-derive`.
2//!
3//! Most users should depend on the `df-derive` facade, which re-exports
4//! this macro and the default runtime traits from `df-derive-core`. Depend
5//! on `df-derive-macros` directly when you want to target `paft`,
6//! `df-derive-core`, or a custom runtime without the facade.
7//!
8//! Explicit `#[df_derive(trait = "...")]` selects a custom runtime path.
9//! Explicit paths to the built-in `df_derive::dataframe::ToDataFrame` or
10//! `df_derive_core::dataframe::ToDataFrame` runtimes are treated as the
11//! default runtime and still use the runtime's hidden dependency re-exports.
12//! `columnar = "..."` may be provided alongside `trait = "..."`, and
13//! `decimal128_encode = "..."` may override decimal dispatch. Built-in
14//! dataframe runtime paths cannot be mixed with custom `columnar` paths.
15//! Without runtime overrides, discovery tries `df-derive`, `df-derive-core`,
16//! `paft-utils`, `paft`, then the `crate::core::dataframe` fallback.
17#![warn(missing_docs)]
18extern crate proc_macro;
19
20mod attrs;
21mod codegen;
22mod ir;
23mod lower;
24mod parser;
25mod type_analysis;
26use proc_macro::TokenStream;
27use syn::{DeriveInput, parse_macro_input};
28
29/// Derive `ToDataFrame` for structs and tuple structs to generate fast conversions to Polars.
30///
31/// What this macro generates (paths configurable via `#[df_derive(...)]`):
32///
33/// - An implementation of `ToDataFrame` for the annotated type `T` providing:
34/// - `fn to_dataframe(&self) -> PolarsResult<DataFrame>`
35/// - `fn empty_dataframe() -> PolarsResult<DataFrame>`
36/// - `fn schema() -> PolarsResult<Vec<(String, DataType)>>`
37/// - An implementation of `Columnar` for `T` providing
38/// `fn columnar_to_dataframe(items: &[Self]) -> PolarsResult<DataFrame>` and
39/// `fn columnar_from_refs(items: &[&Self]) -> PolarsResult<DataFrame>`.
40/// The direct slice method avoids the trait default's temporary ref-vector
41/// allocation on top-level batch conversion; the borrowed method remains
42/// available for nested and generic composition.
43///
44/// Supported shapes and types:
45///
46/// - Named and tuple structs (tuple fields are named `field_{index}`)
47/// - Nested structs are flattened using dot notation (e.g., `outer.inner`)
48/// - Wrappers `Option<T>` and `Vec<T>` in any nesting order, with `Vec<Struct>` producing multiple
49/// list columns with a `vec_field.subfield` prefix
50/// - Primitive types: `String`, `bool`, integer types including `i128`/`u128`,
51/// `std::num::NonZero*` integer types, `f32`, `f64`
52/// - `chrono::DateTime<Tz>` and `chrono::NaiveDateTime` (default:
53/// `Datetime(Milliseconds, None)`; override with `#[df_derive(time_unit = "ms"|"us"|"ns")]`).
54/// `DateTime<Tz>` stores the UTC instant; use `as_string` when the textual timezone/offset
55/// matters.
56/// - `chrono::NaiveDate` (`Date`, i32 days since 1970-01-01) and `chrono::NaiveTime`
57/// (`Time`, i64 ns since midnight); both have fixed encodings, no unit override.
58/// - `std::time::Duration`, `core::time::Duration`, and `chrono::Duration` (alias for
59/// `chrono::TimeDelta`) → `Duration(Nanoseconds)` by default; override with
60/// `#[df_derive(time_unit = "ms"|"us"|"ns")]`. Bare `Duration` is ambiguous and rejected.
61/// - Decimal backends written as bare `Decimal` or `rust_decimal::Decimal`
62/// (default: `Decimal(38, 10)`; override with
63/// `#[df_derive(decimal(precision = N, scale = N))]`). Custom backends opt in
64/// with explicit `decimal(...)` and a `Decimal128Encode` impl.
65///
66/// Attributes:
67///
68/// - Container-level: `#[df_derive(trait = "path::ToDataFrame")]` to set the `ToDataFrame` trait
69/// path; the `Columnar` and `Decimal128Encode` paths are inferred by replacing the last
70/// path segment with `Columnar` / `Decimal128Encode`. Optionally, set them explicitly with
71/// `#[df_derive(columnar = "path::Columnar")]` and
72/// `#[df_derive(decimal128_encode = "path::Decimal128Encode")]`. A `columnar` override
73/// must be paired with `trait` to avoid mixed-runtime impls. `decimal128_encode` is the
74/// dispatch point for `rust_decimal::Decimal` / `bigdecimal::BigDecimal` / other decimal
75/// backends — see "Custom decimal backends" in the README for the trait contract. Explicit
76/// paths to `df_derive::dataframe::ToDataFrame` or
77/// `df_derive_core::dataframe::ToDataFrame` keep using the default runtime's hidden
78/// dependency re-exports and cannot be paired with a custom `columnar` path;
79/// other explicit trait paths are treated as custom runtimes.
80/// - Field-level: `#[df_derive(skip)]` to omit a field from generated schema
81/// and `DataFrame` output. Skipped fields are not type-analyzed, so this can
82/// be used for caches, handles, source metadata, or other helper values that
83/// should remain on the Rust struct but not become columns. Mutually
84/// exclusive with conversion attributes.
85/// - Field-level: `#[df_derive(as_string)]` to stringify values via `Display` (e.g., enums) during
86/// conversion, resulting in `DataType::String` or `List<String>`. Generated encoders reuse a
87/// `String` scratch buffer per field; the column builder still copies the formatted bytes.
88/// - Field-level: `#[df_derive(as_str)]` to borrow `&str` via `AsRef<str>` for the duration of the
89/// conversion. Same column type as `as_string` but avoids `Display` formatting and the
90/// intermediate scratch buffer. The two attributes are mutually exclusive on a given field.
91/// - Field-level: `#[df_derive(as_binary)]` to route a `Vec<u8>`, `&[u8]`, or
92/// `Cow<'_, [u8]>` field through a Polars `Binary` column instead of the default
93/// `List(UInt8)` for `Vec<u8>`. Accepted shapes:
94/// `Vec<u8>`, `Option<Vec<u8>>`, `Vec<Vec<u8>>`, `Vec<Option<Vec<u8>>>`,
95/// `Option<Vec<Vec<u8>>>`, and the same scalar/list shapes over `&[u8]` and `Cow<'_, [u8]>` —
96/// bare `u8`, `Option<u8>`, `Vec<Option<u8>>` (`BinaryView` cannot carry per-byte nulls), and
97/// non-`u8` leaves are rejected at parse time. Mutually exclusive with `as_str`,
98/// `as_string`, `decimal(...)`, and `time_unit = "..."`.
99/// - Field-level: `#[df_derive(decimal(precision = N, scale = N))]` to choose the
100/// `Decimal(precision, scale)` dtype for a built-in decimal path or to explicitly opt a
101/// custom/generic decimal backend into `Decimal128Encode` dispatch. Polars requires
102/// `1 <= precision <= 38`; `scale` may not exceed `precision`.
103/// - Field-level: `#[df_derive(time_unit = "ms"|"us"|"ns")]` to choose the
104/// `Datetime(unit, None)` / `Duration(unit)` dtype for a temporal field. Accepted bases are
105/// `chrono::DateTime<Tz>`, `chrono::NaiveDateTime`, `std::time::Duration`,
106/// `core::time::Duration`, and `chrono::Duration`. The chrono / std call used to derive the
107/// i64 matches the chosen unit, so values are not silently truncated. `time_unit = "ns"` on
108/// `DateTime<Tz>` or `NaiveDateTime` is fallible on dates outside chrono's supported
109/// nanosecond range (~1677–2262); `time_unit = "ns"`/`"us"` on `chrono::Duration` is fallible
110/// when the duration overflows i64 in the chosen unit; on `std::time::Duration` every unit is
111/// fallible (the value type is `u128`). All failures surface as `PolarsError::ComputeError`
112/// rather than silently corrupting data. `time_unit` is rejected on `chrono::NaiveDate` and
113/// `chrono::NaiveTime` (both have fixed encodings).
114/// - The `decimal(...)` attribute can only be applied to decimal backend candidates: type paths
115/// named `Decimal`, custom struct types, or generic type parameters that implement
116/// `Decimal128Encode`. It cannot be combined with `as_str`/`as_string`/`time_unit` on the same
117/// field. The `time_unit = "..."` attribute is also mutually exclusive with
118/// `as_str`/`as_string`.
119///
120/// Notes:
121///
122/// - Enums are not supported for derive.
123/// - Generic structs are supported; the macro adds bounds only for the roles a
124/// generic parameter actually plays (`ToDataFrame + Columnar` for nested
125/// dataframe payloads, `AsRef<str>` for generic `as_str`, and
126/// `Decimal128Encode` for generic decimal backends). The unit type `()` is a
127/// valid generic payload (zero columns); direct `field: ()` fields are
128/// rejected.
129/// - All nested custom structs must also derive `ToDataFrame`.
130/// - Obvious direct self-recursive nested fields using `Self`, the bare
131/// deriving struct name, `self::Type`, or `crate::Type` are rejected after
132/// transparent wrapper peeling, including `Box<T>`/`Option<Box<T>>` forms
133/// and tuple fields containing the same.
134/// - Empty structs: `to_dataframe` yields a single-row, zero-column `DataFrame`; the columnar path
135/// yields a zero-column `DataFrame` with `items.len()` rows.
136#[proc_macro_derive(ToDataFrame, attributes(df_derive))]
137pub fn to_dataframe_derive(input: TokenStream) -> TokenStream {
138 // Parse the input tokens into a syntax tree
139 let ast = parse_macro_input!(input as DeriveInput);
140 let config = match codegen::build_macro_config(&ast) {
141 Ok(config) => config,
142 Err(e) => return e.to_compile_error().into(),
143 };
144
145 // Build the intermediate representation
146 let ir = match parser::parse_to_ir(&ast) {
147 Ok(ir) => ir,
148 Err(e) => return e.to_compile_error().into(),
149 };
150
151 // Delegate to the codegen orchestrator
152 let generated = codegen::generate_code(&ir, &config);
153 TokenStream::from(generated)
154}