Skip to main content

daft_ext/abi/
arrow.rs

1//! Arrow C Data Interface and C Stream Interface types.
2//!
3//! These are `#[repr(C)]` definitions matching the official Arrow specs:
4//! - <https://arrow.apache.org/docs/format/CDataInterface.html>
5//! - <https://arrow.apache.org/docs/format/CStreamInterface.html>
6//!
7//! By owning these types, `daft-ext` has **zero** dependency on any
8//! Arrow Rust implementation (arrow-rs, arrow2, etc.).
9
10use std::ffi::{c_char, c_int, c_void};
11
12/// An Arrow array paired with its schema (C Data Interface).
13///
14/// Note: This is a convenience type rather than passing (ArrowArray, ArrowSchema).
15#[repr(C)]
16pub struct ArrowData {
17    pub schema: ArrowSchema,
18    pub array: ArrowArray,
19}
20
/// `ArrowSchema` from the Arrow C Data Interface.
///
/// See: <https://arrow.apache.org/docs/format/CDataInterface.html#the-arrowschema-structure>
///
/// The struct is `#[repr(C)]` and mirrors the layout given by the spec, so
/// the order and types of the fields must not be changed.
///
/// **Ownership:** This type has no `Drop` impl. Callers must invoke the
/// `release` callback (if `Some`) before dropping to free resources.
#[repr(C)]
#[derive(Debug)]
pub struct ArrowSchema {
    pub format: *const c_char,
    pub name: *const c_char,
    pub metadata: *const c_char,
    pub flags: i64,
    pub n_children: i64,
    pub children: *mut *mut ArrowSchema,
    pub dictionary: *mut ArrowSchema,
    /// Release callback; `None` marks the schema as released.
    pub release: Option<unsafe extern "C" fn(schema: *mut ArrowSchema)>,
    pub private_data: *mut c_void,
}

impl ArrowSchema {
    /// Create an empty (released) schema.
    ///
    /// Every pointer is null, every counter is zero and `release` is `None`.
    #[must_use]
    pub fn empty() -> Self {
        Self {
            format: std::ptr::null(),
            name: std::ptr::null(),
            metadata: std::ptr::null(),
            flags: 0,
            n_children: 0,
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: None,
            private_data: std::ptr::null_mut(),
        }
    }

    /// Whether this schema has been released (release callback is None).
    #[must_use]
    pub fn is_released(&self) -> bool {
        self.release.is_none()
    }

    /// Panic unless `T` has the same size and alignment as `Self`.
    ///
    /// Shared by every conversion below. Matching size and alignment is a
    /// necessary sanity check only: it does not prove that the fields of `T`
    /// line up with ours, which remains the caller's `# Safety` obligation.
    #[track_caller]
    fn assert_same_layout<T>() {
        assert_eq!(
            std::mem::size_of::<T>(),
            std::mem::size_of::<Self>(),
            "ArrowSchema size mismatch"
        );
        assert_eq!(
            std::mem::align_of::<T>(),
            std::mem::align_of::<Self>(),
            "ArrowSchema align mismatch"
        );
    }

    /// Borrow a foreign C Data Interface schema as ours (zero-copy).
    ///
    /// # Safety
    ///
    /// `T` must have the Arrow C Data Interface `ArrowSchema` memory layout.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn from_raw<T>(ptr: &T) -> &Self {
        Self::assert_same_layout::<T>();
        // SAFETY: `ptr` is a valid reference, size and alignment were checked
        // above, and the caller guarantees that `T` has our field layout.
        unsafe { &*std::ptr::from_ref(ptr).cast::<Self>() }
    }

    /// Mutably borrow a foreign C Data Interface schema as ours (zero-copy).
    ///
    /// # Safety
    ///
    /// `T` must have the Arrow C Data Interface `ArrowSchema` memory layout.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn from_raw_mut<T>(ptr: &mut T) -> &mut Self {
        Self::assert_same_layout::<T>();
        // SAFETY: `ptr` is a valid exclusive reference, size and alignment
        // were checked above, and the caller guarantees that `T` has our
        // field layout.
        unsafe { &mut *std::ptr::from_mut(ptr).cast::<Self>() }
    }

    /// Borrow ours as a foreign C Data Interface schema type (zero-copy).
    ///
    /// # Safety
    ///
    /// `T` must have the Arrow C Data Interface `ArrowSchema` memory layout.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn as_raw<T>(&self) -> &T {
        Self::assert_same_layout::<T>();
        // SAFETY: `self` is a valid reference, size and alignment were
        // checked above, and the caller guarantees that `T` has our field
        // layout.
        unsafe { &*std::ptr::from_ref(self).cast::<T>() }
    }

    /// Take ownership of a foreign C Data Interface schema.
    ///
    /// The destructor of `val` (if any) is suppressed and its bytes are moved
    /// into the returned value.
    ///
    /// # Safety
    ///
    /// - `T` must have the Arrow C Data Interface `ArrowSchema` memory layout.
    /// - Ownership transfers — the caller must not use or drop the original.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    #[must_use = "the returned schema has no `Drop` impl; discarding it leaks the foreign resource"]
    pub unsafe fn from_owned<T>(val: T) -> Self {
        Self::assert_same_layout::<T>();
        // Suppress `T`'s destructor: the bytes now belong to the result.
        let val = std::mem::ManuallyDrop::new(val);
        // SAFETY: `val` is a live, initialized `T` whose size and alignment
        // match `Self`; the caller guarantees the field layout. The source is
        // never touched again, so the bitwise copy is a move.
        unsafe { std::ptr::read((&raw const *val).cast::<Self>()) }
    }

    /// Convert into a foreign C Data Interface schema type.
    ///
    /// # Safety
    ///
    /// - `T` must have the Arrow C Data Interface `ArrowSchema` memory layout.
    /// - Ownership transfers — the caller must not use or drop the original.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn into_owned<T>(self) -> T {
        Self::assert_same_layout::<T>();
        let val = std::mem::ManuallyDrop::new(self);
        // SAFETY: `val` is a live, initialized `Self` whose size and
        // alignment match `T`; the caller guarantees the field layout. The
        // source is never touched again, so the bitwise copy is a move.
        unsafe { std::ptr::read((&raw const *val).cast::<T>()) }
    }
}

// SAFETY: ArrowSchema is a plain C struct with raw pointers.
// Only Send — concurrent &-access isn't safe per the Arrow C Data Interface spec.
unsafe impl Send for ArrowSchema {}
165
/// ArrowArray C Data Interface: array (columnar data).
///
/// See: <https://arrow.apache.org/docs/format/CDataInterface.html#the-arrowarray-structure>
///
/// The struct is `#[repr(C)]` and mirrors the layout given by the spec, so
/// the order and types of the fields must not be changed.
///
/// **Ownership:** This type has no `Drop` impl. Callers must invoke the
/// `release` callback (if `Some`) before dropping to free resources.
#[repr(C)]
#[derive(Debug)]
pub struct ArrowArray {
    pub length: i64,
    pub null_count: i64,
    pub offset: i64,
    pub n_buffers: i64,
    pub n_children: i64,
    pub buffers: *mut *const c_void,
    pub children: *mut *mut ArrowArray,
    pub dictionary: *mut ArrowArray,
    /// Release callback; `None` marks the array as released.
    pub release: Option<unsafe extern "C" fn(array: *mut ArrowArray)>,
    pub private_data: *mut c_void,
}

impl ArrowArray {
    /// Create an empty (released) array.
    ///
    /// Every pointer is null, every counter is zero and `release` is `None`.
    #[must_use]
    pub fn empty() -> Self {
        Self {
            length: 0,
            null_count: 0,
            offset: 0,
            n_buffers: 0,
            n_children: 0,
            buffers: std::ptr::null_mut(),
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: None,
            private_data: std::ptr::null_mut(),
        }
    }

    /// Whether this array has been released (release callback is None).
    #[must_use]
    pub fn is_released(&self) -> bool {
        self.release.is_none()
    }

    /// Panic unless `T` has the same size and alignment as `Self`.
    ///
    /// Shared by every conversion below. Matching size and alignment is a
    /// necessary sanity check only: it does not prove that the fields of `T`
    /// line up with ours, which remains the caller's `# Safety` obligation.
    #[track_caller]
    fn assert_same_layout<T>() {
        assert_eq!(
            std::mem::size_of::<T>(),
            std::mem::size_of::<Self>(),
            "ArrowArray size mismatch"
        );
        assert_eq!(
            std::mem::align_of::<T>(),
            std::mem::align_of::<Self>(),
            "ArrowArray align mismatch"
        );
    }

    /// Borrow a foreign C Data Interface array as ours (zero-copy).
    ///
    /// # Safety
    ///
    /// `T` must have the Arrow C Data Interface `ArrowArray` memory layout.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn from_raw<T>(ptr: &T) -> &Self {
        Self::assert_same_layout::<T>();
        // SAFETY: `ptr` is a valid reference, size and alignment were checked
        // above, and the caller guarantees that `T` has our field layout.
        unsafe { &*std::ptr::from_ref(ptr).cast::<Self>() }
    }

    /// Mutably borrow a foreign C Data Interface array as ours (zero-copy).
    ///
    /// # Safety
    ///
    /// `T` must have the Arrow C Data Interface `ArrowArray` memory layout.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn from_raw_mut<T>(ptr: &mut T) -> &mut Self {
        Self::assert_same_layout::<T>();
        // SAFETY: `ptr` is a valid exclusive reference, size and alignment
        // were checked above, and the caller guarantees that `T` has our
        // field layout.
        unsafe { &mut *std::ptr::from_mut(ptr).cast::<Self>() }
    }

    /// Borrow ours as a foreign C Data Interface array type (zero-copy).
    ///
    /// # Safety
    ///
    /// `T` must have the Arrow C Data Interface `ArrowArray` memory layout.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn as_raw<T>(&self) -> &T {
        Self::assert_same_layout::<T>();
        // SAFETY: `self` is a valid reference, size and alignment were
        // checked above, and the caller guarantees that `T` has our field
        // layout.
        unsafe { &*std::ptr::from_ref(self).cast::<T>() }
    }

    /// Take ownership of a foreign C Data Interface array.
    ///
    /// The destructor of `val` (if any) is suppressed and its bytes are moved
    /// into the returned value.
    ///
    /// # Safety
    ///
    /// - `T` must have the Arrow C Data Interface `ArrowArray` memory layout.
    /// - Ownership transfers — the caller must not use or drop the original.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    #[must_use = "the returned array has no `Drop` impl; discarding it leaks the foreign resource"]
    pub unsafe fn from_owned<T>(val: T) -> Self {
        Self::assert_same_layout::<T>();
        // Suppress `T`'s destructor: the bytes now belong to the result.
        let val = std::mem::ManuallyDrop::new(val);
        // SAFETY: `val` is a live, initialized `T` whose size and alignment
        // match `Self`; the caller guarantees the field layout. The source is
        // never touched again, so the bitwise copy is a move.
        unsafe { std::ptr::read((&raw const *val).cast::<Self>()) }
    }

    /// Convert into a foreign C Data Interface array type.
    ///
    /// # Safety
    ///
    /// - `T` must have the Arrow C Data Interface `ArrowArray` memory layout.
    /// - Ownership transfers — the caller must not use or drop the original.
    ///
    /// # Panics
    ///
    /// Panics if `T` and `Self` differ in size or alignment.
    pub unsafe fn into_owned<T>(self) -> T {
        Self::assert_same_layout::<T>();
        let val = std::mem::ManuallyDrop::new(self);
        // SAFETY: `val` is a live, initialized `Self` whose size and
        // alignment match `T`; the caller guarantees the field layout. The
        // source is never touched again, so the bitwise copy is a move.
        unsafe { std::ptr::read((&raw const *val).cast::<T>()) }
    }
}

// SAFETY: ArrowArray is a plain C struct with raw pointers.
// Only Send — concurrent &-access isn't safe per the Arrow C Data Interface spec.
unsafe impl Send for ArrowArray {}
312
313/// ArrowArray C Stream Interface is a streaming producer of Arrow record batches.
314///
315/// <https://arrow.apache.org/docs/format/CStreamInterface.html#the-arrowarraystream-structure>
316///
317/// **Ownership:** This type has no `Drop` impl. Callers must invoke the
318/// `release` callback (if `Some`) before dropping to free resources.
319#[repr(C)]
320pub struct ArrowArrayStream {
321    /// Get the schema of the stream.
322    ///
323    /// On success, writes to `*out` and returns 0.
324    /// On error, returns non-zero; caller may call `get_last_error`.
325    pub get_schema:
326        Option<unsafe extern "C" fn(stream: *mut ArrowArrayStream, out: *mut ArrowSchema) -> c_int>,
327
328    /// Get the next record batch.
329    ///
330    /// On success, writes to `*out` and returns 0.
331    /// End-of-stream is signaled by writing a released array (release == None).
332    /// On error, returns non-zero; caller may call `get_last_error`.
333    pub get_next:
334        Option<unsafe extern "C" fn(stream: *mut ArrowArrayStream, out: *mut ArrowArray) -> c_int>,
335
336    /// Get a human-readable error message for the last error.
337    ///
338    /// Returns a pointer to a null-terminated string, or null if no error.
339    /// The pointer is valid until the next call on this stream or until release.
340    pub get_last_error:
341        Option<unsafe extern "C" fn(stream: *mut ArrowArrayStream) -> *const c_char>,
342
343    /// Release the stream and all associated resources.
344    ///
345    /// After calling, the stream is in a released state (all pointers None/null).
346    pub release: Option<unsafe extern "C" fn(stream: *mut ArrowArrayStream)>,
347
348    /// Opaque producer-specific data.
349    pub private_data: *mut c_void,
350}
351
352impl ArrowArrayStream {
353    /// Create an empty (released) stream.
354    pub fn empty() -> Self {
355        Self {
356            get_schema: None,
357            get_next: None,
358            get_last_error: None,
359            release: None,
360            private_data: std::ptr::null_mut(),
361        }
362    }
363
364    /// Whether this stream has been released (release callback is None).
365    pub fn is_released(&self) -> bool {
366        self.release.is_none()
367    }
368}
369
370// SAFETY: ArrowArrayStream is a plain C struct with function pointers + opaque data.
371// Only Send — concurrent &-access isn't safe per the Arrow C Data Interface spec.
372unsafe impl Send for ArrowArrayStream {}
373
#[cfg(test)]
mod tests {
    use super::*;

    /// A mock "foreign" ArrowSchema with identical C layout.
    #[repr(C)]
    struct FakeSchema {
        format: *const c_char,
        name: *const c_char,
        metadata: *const c_char,
        flags: i64,
        n_children: i64,
        children: *mut *mut FakeSchema,
        dictionary: *mut FakeSchema,
        release: Option<unsafe extern "C" fn(schema: *mut FakeSchema)>,
        private_data: *mut c_void,
    }

    /// A mock "foreign" ArrowArray with identical C layout.
    #[repr(C)]
    struct FakeArray {
        length: i64,
        null_count: i64,
        offset: i64,
        n_buffers: i64,
        n_children: i64,
        buffers: *mut *const c_void,
        children: *mut *mut FakeArray,
        dictionary: *mut FakeArray,
        release: Option<unsafe extern "C" fn(array: *mut FakeArray)>,
        private_data: *mut c_void,
    }

    /// Build a released `FakeSchema` carrying the given `flags`.
    fn fake_schema(flags: i64) -> FakeSchema {
        FakeSchema {
            format: std::ptr::null(),
            name: std::ptr::null(),
            metadata: std::ptr::null(),
            flags,
            n_children: 0,
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: None,
            private_data: std::ptr::null_mut(),
        }
    }

    /// Build a released `FakeArray` with the given `length` and `null_count`.
    fn fake_array(length: i64, null_count: i64) -> FakeArray {
        FakeArray {
            length,
            null_count,
            offset: 0,
            n_buffers: 1,
            n_children: 0,
            buffers: std::ptr::null_mut(),
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: None,
            private_data: std::ptr::null_mut(),
        }
    }

    #[test]
    fn arrow_schema_empty() {
        let s = ArrowSchema::empty();
        assert!(s.is_released());
        assert!(s.format.is_null());
        assert!(s.name.is_null());
    }

    #[test]
    fn arrow_array_empty() {
        let a = ArrowArray::empty();
        assert!(a.is_released());
        assert_eq!(a.length, 0);
    }

    #[test]
    fn arrow_array_stream_empty() {
        let s = ArrowArrayStream::empty();
        assert!(s.is_released());
        assert!(s.private_data.is_null());
    }

    #[test]
    fn send_only() {
        fn assert_send<T: Send>() {}

        // Compile-time `!Sync` check. For a type that is not `Sync` only the
        // `()` impl applies, so the `_` below is inferred. If a type ever
        // became `Sync`, both impls would apply and inference would fail,
        // breaking the build.
        trait AmbiguousIfSync<Marker> {
            fn check() {}
        }
        impl<T: ?Sized> AmbiguousIfSync<()> for T {}
        impl<T: ?Sized + Sync> AmbiguousIfSync<u8> for T {}

        assert_send::<ArrowSchema>();
        assert_send::<ArrowArray>();
        assert_send::<ArrowArrayStream>();

        let _ = <ArrowSchema as AmbiguousIfSync<_>>::check;
        let _ = <ArrowArray as AmbiguousIfSync<_>>::check;
        let _ = <ArrowArrayStream as AmbiguousIfSync<_>>::check;
    }

    #[cfg(target_pointer_width = "64")]
    #[test]
    fn layout_sizes() {
        let ptr = std::mem::size_of::<usize>();
        assert_eq!(std::mem::size_of::<ArrowSchema>(), 9 * ptr);
        assert_eq!(std::mem::size_of::<ArrowArray>(), 10 * ptr);
        assert_eq!(std::mem::size_of::<ArrowArrayStream>(), 5 * ptr);
    }

    #[test]
    fn from_raw_schema() {
        let schema = ArrowSchema::empty();
        let borrowed: &FakeSchema = unsafe { schema.as_raw() };
        assert!(borrowed.release.is_none());

        let fake = fake_schema(42);
        let borrowed: &ArrowSchema = unsafe { ArrowSchema::from_raw(&fake) };
        assert_eq!(borrowed.flags, 42);
    }

    #[test]
    fn from_raw_mut_schema() {
        let mut fake = fake_schema(42);
        {
            let borrowed: &mut ArrowSchema = unsafe { ArrowSchema::from_raw_mut(&mut fake) };
            borrowed.flags = 7;
        }
        // The write went through to the foreign value: no copy was made.
        assert_eq!(fake.flags, 7);
    }

    #[test]
    fn from_raw_array() {
        let array = ArrowArray::empty();
        let borrowed: &FakeArray = unsafe { array.as_raw() };
        assert!(borrowed.release.is_none());

        let fake = fake_array(50, 3);
        let borrowed: &ArrowArray = unsafe { ArrowArray::from_raw(&fake) };
        assert_eq!(borrowed.length, 50);
        assert_eq!(borrowed.null_count, 3);
    }

    #[test]
    fn from_raw_mut_array() {
        let mut fake = fake_array(50, 3);
        {
            let borrowed: &mut ArrowArray = unsafe { ArrowArray::from_raw_mut(&mut fake) };
            borrowed.length = 8;
        }
        // The write went through to the foreign value: no copy was made.
        assert_eq!(fake.length, 8);
    }

    #[test]
    #[should_panic(expected = "ArrowSchema size mismatch")]
    fn schema_size_mismatch_panics() {
        let byte = 0u8;
        // The size check fires before the pointer is ever dereferenced.
        let _ = unsafe { ArrowSchema::from_raw(&byte) };
    }

    #[test]
    #[should_panic(expected = "ArrowArray size mismatch")]
    fn array_size_mismatch_panics() {
        let byte = 0u8;
        // The size check fires before the pointer is ever dereferenced.
        let _ = unsafe { ArrowArray::from_raw(&byte) };
    }

    #[test]
    fn owned_roundtrip_schema() {
        let original = ArrowSchema::empty();
        let foreign: FakeSchema = unsafe { original.into_owned() };
        let back: ArrowSchema = unsafe { ArrowSchema::from_owned(foreign) };
        assert!(back.is_released());
    }

    #[test]
    fn owned_roundtrip_array() {
        let original = ArrowArray {
            length: 100,
            null_count: 5,
            offset: 10,
            n_buffers: 2,
            n_children: 0,
            buffers: std::ptr::null_mut(),
            children: std::ptr::null_mut(),
            dictionary: std::ptr::null_mut(),
            release: None,
            private_data: std::ptr::null_mut(),
        };
        let foreign: FakeArray = unsafe { original.into_owned() };
        assert_eq!(foreign.length, 100);
        assert_eq!(foreign.null_count, 5);
        let back: ArrowArray = unsafe { ArrowArray::from_owned(foreign) };
        assert_eq!(back.length, 100);
        assert_eq!(back.null_count, 5);
    }
}