Skip to main content

quack_rs/vector/
reader.rs

1// SPDX-License-Identifier: MIT
2// Copyright 2026 Tom F. <https://github.com/tomtom215/>
3// My way of giving something small back to the open source community
4// and encouraging more Rust development!
5
6//! Safe typed reading from `DuckDB` data vectors.
7//!
8//! [`VectorReader`] provides safe access to the typed data in a `DuckDB` vector
9//! without requiring direct raw pointer manipulation.
10//!
11//! # Pitfalls solved
12//!
13//! - **L5**: Booleans are read as `u8 != 0`, never as `bool`, because `DuckDB`'s
14//!   C API does not guarantee the Rust `bool` invariant (must be 0 or 1).
15//!
16//! # Example
17//!
18//! ```rust,no_run
19//! use quack_rs::vector::VectorReader;
20//! use libduckdb_sys::{duckdb_data_chunk, duckdb_data_chunk_get_vector,
21//!                     duckdb_data_chunk_get_size};
22//!
23//! // Inside a DuckDB aggregate `update` callback:
24//! // let reader = unsafe { VectorReader::new(chunk, 0) };
25//! // for row in 0..reader.row_count() {
26//! //     if unsafe { reader.is_valid(row) } {
27//! //         let val = unsafe { reader.read_i64(row) };
28//! //     }
29//! // }
30//! ```
31
32use libduckdb_sys::{
33    duckdb_data_chunk, duckdb_data_chunk_get_size, duckdb_data_chunk_get_vector,
34    duckdb_validity_row_is_valid, duckdb_vector, duckdb_vector_get_data,
35    duckdb_vector_get_validity, idx_t,
36};
37
/// A typed reader for a single column in a `DuckDB` data chunk.
///
/// `VectorReader` captures the data-buffer pointer, the validity-mask pointer
/// and the row count of one `DuckDB` vector, and exposes typed accessor
/// methods for common `DuckDB` types on top of them.
///
/// # Lifetimes
///
/// The reader stores raw pointers into the data chunk, so the compiler does
/// not track the borrow. Do not call `duckdb_destroy_data_chunk` while a
/// `VectorReader` that references the chunk is live.
#[derive(Debug)]
pub struct VectorReader {
    /// Start of the vector's data buffer, addressed byte-wise.
    data: *const u8,
    /// Validity mask of the vector; a null pointer is treated as "every row is valid".
    validity: *mut u64,
    /// Number of rows that may be read through this reader.
    row_count: usize,
}
52
53impl VectorReader {
54    /// Creates a new `VectorReader` for the given column in a data chunk.
55    ///
56    /// # Safety
57    ///
58    /// - `chunk` must be a valid `duckdb_data_chunk` for the duration of this reader's lifetime.
59    /// - `col_idx` must be a valid column index in the chunk.
60    pub unsafe fn new(chunk: duckdb_data_chunk, col_idx: usize) -> Self {
61        // SAFETY: Caller guarantees chunk is valid.
62        let row_count = usize::try_from(unsafe { duckdb_data_chunk_get_size(chunk) }).unwrap_or(0);
63        // SAFETY: col_idx is valid per caller's contract.
64        let vector = unsafe { duckdb_data_chunk_get_vector(chunk, col_idx as idx_t) };
65        // SAFETY: vector is non-null for valid column indices.
66        let data = unsafe { duckdb_vector_get_data(vector) }.cast::<u8>();
67        // SAFETY: may be null if all values are valid (no NULLs); checked in is_valid.
68        let validity = unsafe { duckdb_vector_get_validity(vector) };
69        Self {
70            data,
71            validity,
72            row_count,
73        }
74    }
75
76    /// Creates a `VectorReader` directly from a raw `duckdb_vector` handle.
77    ///
78    /// Use this when you already have a child vector (e.g., from
79    /// [`StructVector::get_child`][crate::vector::complex::StructVector::get_child] or
80    /// [`ListVector::get_child`][crate::vector::complex::ListVector::get_child]).
81    ///
82    /// # Safety
83    ///
84    /// - `vector` must be a valid `duckdb_vector` for the duration of this reader's lifetime.
85    /// - `row_count` must equal the number of valid rows in the vector.
86    pub unsafe fn from_vector(vector: duckdb_vector, row_count: usize) -> Self {
87        // SAFETY: vector is valid per caller's contract.
88        let data = unsafe { duckdb_vector_get_data(vector) }.cast::<u8>();
89        let validity = unsafe { duckdb_vector_get_validity(vector) };
90        Self {
91            data,
92            validity,
93            row_count,
94        }
95    }
96
97    /// Returns the number of rows in this vector.
98    #[mutants::skip]
99    #[must_use]
100    #[inline]
101    pub const fn row_count(&self) -> usize {
102        self.row_count
103    }
104
105    /// Returns `true` if the value at row `idx` is not NULL.
106    ///
107    /// # Safety
108    ///
109    /// `idx` must be less than `self.row_count()`.
110    #[inline]
111    pub unsafe fn is_valid(&self, idx: usize) -> bool {
112        if self.validity.is_null() {
113            return true;
114        }
115        // SAFETY: validity is non-null and idx is in bounds per caller's contract.
116        unsafe { duckdb_validity_row_is_valid(self.validity, idx as idx_t) }
117    }
118
119    /// Reads an `i8` (TINYINT) value at row `idx`.
120    ///
121    /// # Safety
122    ///
123    /// - `idx` must be less than `self.row_count()`.
124    /// - The column must contain `TINYINT` data.
125    /// - The value at `idx` must not be NULL (check with [`is_valid`][Self::is_valid]).
126    #[inline]
127    pub const unsafe fn read_i8(&self, idx: usize) -> i8 {
128        // SAFETY: data points to valid TINYINT array, idx is in bounds.
129        unsafe { core::ptr::read_unaligned(self.data.add(idx).cast::<i8>()) }
130    }
131
132    /// Reads an `i16` (SMALLINT) value at row `idx`.
133    ///
134    /// # Safety
135    ///
136    /// - `idx` must be less than `self.row_count()`.
137    /// - The column must contain `SMALLINT` data.
138    #[inline]
139    pub const unsafe fn read_i16(&self, idx: usize) -> i16 {
140        // SAFETY: 2-byte read from valid SMALLINT vector.
141        unsafe { core::ptr::read_unaligned(self.data.add(idx * 2).cast::<i16>()) }
142    }
143
144    /// Reads an `i32` (INTEGER) value at row `idx`.
145    ///
146    /// # Safety
147    ///
148    /// See [`read_i8`][Self::read_i8].
149    #[inline]
150    pub const unsafe fn read_i32(&self, idx: usize) -> i32 {
151        // SAFETY: 4-byte read from valid INTEGER vector.
152        unsafe { core::ptr::read_unaligned(self.data.add(idx * 4).cast::<i32>()) }
153    }
154
155    /// Reads an `i64` (BIGINT / TIMESTAMP) value at row `idx`.
156    ///
157    /// # Safety
158    ///
159    /// See [`read_i8`][Self::read_i8].
160    #[inline]
161    pub const unsafe fn read_i64(&self, idx: usize) -> i64 {
162        // SAFETY: 8-byte read from valid BIGINT/TIMESTAMP vector.
163        unsafe { core::ptr::read_unaligned(self.data.add(idx * 8).cast::<i64>()) }
164    }
165
166    /// Reads a `u8` (UTINYINT) value at row `idx`.
167    ///
168    /// # Safety
169    ///
170    /// See [`read_i8`][Self::read_i8].
171    #[inline]
172    pub const unsafe fn read_u8(&self, idx: usize) -> u8 {
173        // SAFETY: 1-byte read from valid UTINYINT vector.
174        unsafe { *self.data.add(idx) }
175    }
176
177    /// Reads a `u16` (USMALLINT) value at row `idx`.
178    ///
179    /// # Safety
180    ///
181    /// See [`read_i8`][Self::read_i8].
182    #[inline]
183    pub const unsafe fn read_u16(&self, idx: usize) -> u16 {
184        // SAFETY: 2-byte read from valid USMALLINT vector.
185        unsafe { core::ptr::read_unaligned(self.data.add(idx * 2).cast::<u16>()) }
186    }
187
188    /// Reads a `u32` (UINTEGER) value at row `idx`.
189    ///
190    /// # Safety
191    ///
192    /// See [`read_i8`][Self::read_i8].
193    #[inline]
194    pub const unsafe fn read_u32(&self, idx: usize) -> u32 {
195        // SAFETY: 4-byte read from valid UINTEGER vector.
196        unsafe { core::ptr::read_unaligned(self.data.add(idx * 4).cast::<u32>()) }
197    }
198
199    /// Reads a `u64` (UBIGINT) value at row `idx`.
200    ///
201    /// # Safety
202    ///
203    /// See [`read_i8`][Self::read_i8].
204    #[inline]
205    pub const unsafe fn read_u64(&self, idx: usize) -> u64 {
206        // SAFETY: 8-byte read from valid UBIGINT vector.
207        unsafe { core::ptr::read_unaligned(self.data.add(idx * 8).cast::<u64>()) }
208    }
209
210    /// Reads an `f32` (FLOAT) value at row `idx`.
211    ///
212    /// # Safety
213    ///
214    /// See [`read_i8`][Self::read_i8].
215    #[inline]
216    pub const unsafe fn read_f32(&self, idx: usize) -> f32 {
217        // SAFETY: 4-byte read from valid FLOAT vector.
218        unsafe { core::ptr::read_unaligned(self.data.add(idx * 4).cast::<f32>()) }
219    }
220
221    /// Reads an `f64` (DOUBLE) value at row `idx`.
222    ///
223    /// # Safety
224    ///
225    /// See [`read_i8`][Self::read_i8].
226    #[inline]
227    pub const unsafe fn read_f64(&self, idx: usize) -> f64 {
228        // SAFETY: 8-byte read from valid DOUBLE vector.
229        unsafe { core::ptr::read_unaligned(self.data.add(idx * 8).cast::<f64>()) }
230    }
231
232    /// Reads a `bool` (BOOLEAN) value at row `idx`.
233    ///
234    /// # Pitfall L5: Defensive boolean reading
235    ///
236    /// This method reads the underlying byte as `u8` and compares with `!= 0`,
237    /// rather than casting directly to `bool`. `DuckDB`'s C API does not guarantee
238    /// the Rust `bool` invariant (must be exactly 0 or 1), so a direct cast could
239    /// cause undefined behaviour.
240    ///
241    /// # Safety
242    ///
243    /// - `idx` must be less than `self.row_count()`.
244    /// - The column must contain `BOOLEAN` data.
245    #[inline]
246    pub const unsafe fn read_bool(&self, idx: usize) -> bool {
247        // SAFETY: BOOLEAN data is stored as 1 byte per value.
248        // We read as u8 (not bool) to avoid UB if DuckDB sets non-0/1 values.
249        // This is Pitfall L5: always read boolean as u8 then compare != 0.
250        unsafe { *self.data.add(idx) != 0 }
251    }
252
253    /// Reads an `i128` (HUGEINT) value at row `idx`.
254    ///
255    /// `DuckDB` stores HUGEINT as `{ lower: u64, upper: i64 }` in little-endian
256    /// layout, totaling 16 bytes per value.
257    ///
258    /// # Safety
259    ///
260    /// - `idx` must be less than `self.row_count()`.
261    /// - The column must contain `HUGEINT` data.
262    /// - The value at `idx` must not be NULL (check with [`is_valid`][Self::is_valid]).
263    #[inline]
264    pub const unsafe fn read_i128(&self, idx: usize) -> i128 {
265        // SAFETY: HUGEINT is stored as { lower: u64, upper: i64 } = 16 bytes.
266        // DuckDB lays this out in little-endian order: lower at offset 0, upper at offset 8.
267        let base = unsafe { self.data.add(idx * 16) };
268        let lower = unsafe { core::ptr::read_unaligned(base.cast::<u64>()) };
269        let upper = unsafe { core::ptr::read_unaligned(base.add(8).cast::<i64>()) };
270        // Widening casts: u64→i128 and i64→i128 are always lossless.
271        #[allow(clippy::cast_lossless)]
272        let result = (upper as i128) << 64 | (lower as i128);
273        result
274    }
275
276    /// Reads a VARCHAR value at row `idx`.
277    ///
278    /// Returns an empty string if the data is not valid UTF-8 or if the internal
279    /// string pointer is null.
280    ///
281    /// # Pitfall P7
282    ///
283    /// `DuckDB` stores strings in a 16-byte `duckdb_string_t` with two formats
284    /// (inline for ≤ 12 bytes, pointer otherwise). This method handles both.
285    ///
286    /// # Safety
287    ///
288    /// - `idx` must be less than `self.row_count()`.
289    /// - The column must contain `VARCHAR` data.
290    /// - For pointer-format strings, the pointed-to heap memory must be valid
291    ///   for the lifetime of the returned `&str`.
292    pub unsafe fn read_str(&self, idx: usize) -> &str {
293        // SAFETY: Caller guarantees data is a VARCHAR vector and idx is in bounds.
294        unsafe { crate::vector::string::read_duck_string(self.data, idx) }
295    }
296
297    /// Reads a `BLOB` (binary) value at row `idx`.
298    ///
299    /// `DuckDB` stores BLOBs using the same 16-byte `duckdb_string_t` layout as
300    /// VARCHAR (inline for ≤12 bytes, pointer for larger values). The returned
301    /// slice borrows from the vector's data buffer.
302    ///
303    /// # Safety
304    ///
305    /// - `idx` must be less than `self.row_count()`.
306    /// - The column must contain `BLOB` data.
307    /// - The pointed-to memory must be valid for the lifetime of the returned slice.
308    pub unsafe fn read_blob(&self, idx: usize) -> &[u8] {
309        // SAFETY: BLOB uses the same duckdb_string_t layout as VARCHAR.
310        unsafe { crate::vector::string::read_duck_string(self.data, idx).as_bytes() }
311    }
312
313    /// Reads a `UUID` value at row `idx` as an `i128`.
314    ///
315    /// `DuckDB` stores UUID as a HUGEINT (128-bit integer). This is a semantic
316    /// alias for [`read_i128`][Self::read_i128].
317    ///
318    /// # Safety
319    ///
320    /// - `idx` must be less than `self.row_count()`.
321    /// - The column must contain `UUID` data.
322    #[inline]
323    pub const unsafe fn read_uuid(&self, idx: usize) -> i128 {
324        // SAFETY: UUID is stored as HUGEINT (i128).
325        unsafe { self.read_i128(idx) }
326    }
327
328    /// Reads a `DATE` value at row `idx` as days since the Unix epoch.
329    ///
330    /// `DuckDB` stores DATE as a 4-byte `i32` representing the number of days
331    /// since 1970-01-01. This is a semantic alias for [`read_i32`][Self::read_i32].
332    ///
333    /// # Safety
334    ///
335    /// - `idx` must be less than `self.row_count()`.
336    /// - The column must contain `DATE` data.
337    #[inline]
338    pub const unsafe fn read_date(&self, idx: usize) -> i32 {
339        // SAFETY: DATE is stored as i32 (days since epoch).
340        unsafe { self.read_i32(idx) }
341    }
342
343    /// Reads a `TIMESTAMP` value at row `idx` as microseconds since the Unix epoch.
344    ///
345    /// `DuckDB` stores TIMESTAMP as an 8-byte `i64` representing microseconds
346    /// since 1970-01-01 00:00:00 UTC. This is a semantic alias for
347    /// [`read_i64`][Self::read_i64].
348    ///
349    /// # Safety
350    ///
351    /// - `idx` must be less than `self.row_count()`.
352    /// - The column must contain `TIMESTAMP` data.
353    #[inline]
354    pub const unsafe fn read_timestamp(&self, idx: usize) -> i64 {
355        // SAFETY: TIMESTAMP is stored as i64 (microseconds since epoch).
356        unsafe { self.read_i64(idx) }
357    }
358
359    /// Reads a `TIME` value at row `idx` as microseconds since midnight.
360    ///
361    /// `DuckDB` stores TIME as an 8-byte `i64` representing microseconds since
362    /// midnight. This is a semantic alias for [`read_i64`][Self::read_i64].
363    ///
364    /// # Safety
365    ///
366    /// - `idx` must be less than `self.row_count()`.
367    /// - The column must contain `TIME` data.
368    #[inline]
369    pub const unsafe fn read_time(&self, idx: usize) -> i64 {
370        // SAFETY: TIME is stored as i64 (microseconds since midnight).
371        unsafe { self.read_i64(idx) }
372    }
373
374    /// Reads an `INTERVAL` value at row `idx`.
375    ///
376    /// Returns a [`DuckInterval`][crate::interval::DuckInterval] struct.
377    ///
378    /// # Pitfall P8
379    ///
380    /// The `INTERVAL` struct is 16 bytes: `{ months: i32, days: i32, micros: i64 }`.
381    /// This method handles the layout correctly using [`read_interval_at`][crate::interval::read_interval_at].
382    ///
383    /// # Safety
384    ///
385    /// - `idx` must be less than `self.row_count()`.
386    /// - The column must contain `INTERVAL` data.
387    #[inline]
388    pub const unsafe fn read_interval(&self, idx: usize) -> crate::interval::DuckInterval {
389        // SAFETY: data is a valid INTERVAL vector and idx is in bounds.
390        unsafe { crate::interval::read_interval_at(self.data, idx) }
391    }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a reader over `bytes` with no validity mask, so every row counts
    /// as valid. The caller must keep `bytes` alive while the reader is used.
    fn reader_over(bytes: &[u8], row_count: usize) -> VectorReader {
        VectorReader {
            data: bytes.as_ptr(),
            validity: std::ptr::null_mut(),
            row_count,
        }
    }

    /// Verify that `read_bool` treats any non-zero byte as `true`.
    #[test]
    fn bool_read_u8_pattern() {
        // Simulate a DuckDB BOOLEAN vector containing non-standard values
        // (2 and 255) that would be UB if reinterpreted as Rust `bool`.
        let data: [u8; 4] = [0, 1, 2, 255];
        let reader = reader_over(&data, data.len());

        let as_bools: Vec<bool> = (0..data.len())
            // SAFETY: every row is below `data.len()` and `data` outlives `reader`.
            .map(|row| unsafe { reader.read_bool(row) })
            .collect();
        assert_eq!(as_bools, [false, true, true, true]);
    }

    /// Verify that fixed-width reads step through the buffer one element at a time.
    #[test]
    fn read_i32_uses_four_byte_stride() {
        let values: [i32; 3] = [7, -1, i32::MIN];
        let bytes: Vec<u8> = values.iter().flat_map(|v| v.to_ne_bytes()).collect();
        let reader = reader_over(&bytes, values.len());

        for (row, expected) in values.iter().enumerate() {
            // SAFETY: `row` is below `values.len()` and `bytes` holds that many i32s.
            assert_eq!(unsafe { reader.read_i32(row) }, *expected);
        }
    }

    /// Verify that `read_i128` combines the lower (offset 0) and upper (offset 8) words.
    #[test]
    fn read_i128_combines_lower_and_upper_words() {
        let mut bytes = [0u8; 16];
        bytes[..8].copy_from_slice(&1u64.to_ne_bytes());
        bytes[8..].copy_from_slice(&(-1i64).to_ne_bytes());
        let reader = reader_over(&bytes, 1);

        // SAFETY: row 0 exists and `bytes` holds exactly one 16-byte HUGEINT.
        assert_eq!(unsafe { reader.read_i128(0) }, (-1i128 << 64) | 1);
    }

    #[test]
    fn row_count_is_zero_for_empty_state() {
        // This exercises the struct layout; actual DuckDB integration is in tests/
        let reader = VectorReader {
            data: std::ptr::null(),
            validity: std::ptr::null_mut(),
            row_count: 0,
        };
        assert_eq!(reader.row_count(), 0);
    }

    #[test]
    fn is_valid_when_validity_null() {
        // When validity is null, all rows are considered valid
        let reader = VectorReader {
            data: std::ptr::null(),
            validity: std::ptr::null_mut(),
            row_count: 5,
        };
        // SAFETY: rows 0 and 4 are in bounds (row_count = 5); validity is null,
        // so `is_valid` returns before touching any DuckDB API.
        assert!(unsafe { reader.is_valid(0) });
        assert!(unsafe { reader.is_valid(4) });
    }
}
435}