quack_rs/vector/reader.rs
1// SPDX-License-Identifier: MIT
2// Copyright 2026 Tom F. <https://github.com/tomtom215/>
3// My way of giving something small back to the open source community
4// and encouraging more Rust development!
5
6//! Safe typed reading from `DuckDB` data vectors.
7//!
8//! [`VectorReader`] provides safe access to the typed data in a `DuckDB` vector
9//! without requiring direct raw pointer manipulation.
10//!
11//! # Pitfalls solved
12//!
13//! - **L5**: Booleans are read as `u8 != 0`, never as `bool`, because `DuckDB`'s
14//! C API does not guarantee the Rust `bool` invariant (must be 0 or 1).
15//!
16//! # Example
17//!
18//! ```rust,no_run
19//! use quack_rs::vector::VectorReader;
20//! use libduckdb_sys::{duckdb_data_chunk, duckdb_data_chunk_get_vector,
21//! duckdb_data_chunk_get_size};
22//!
23//! // Inside a DuckDB aggregate `update` callback:
24//! // let reader = unsafe { VectorReader::new(chunk, 0) };
25//! // for row in 0..reader.row_count() {
26//! // if reader.is_valid(row) {
27//! // let val = unsafe { reader.read_i64(row) };
28//! // }
29//! // }
30//! ```
31
32use libduckdb_sys::{
33 duckdb_data_chunk, duckdb_data_chunk_get_size, duckdb_data_chunk_get_vector,
34 duckdb_validity_row_is_valid, duckdb_vector, duckdb_vector_get_data,
35 duckdb_vector_get_validity, idx_t,
36};
37
/// Read-only, typed view over one column (vector) of a `DuckDB` data chunk.
///
/// A `VectorReader` captures the raw data-buffer pointer, the optional
/// validity-mask pointer, and the row count of a single `DuckDB` vector, and
/// exposes per-type `read_*` accessors on top of them.
///
/// # Lifetimes
///
/// The reader does not own any of the memory it points to; it borrows from
/// the underlying data chunk. Do not call `duckdb_destroy_data_chunk` while a
/// `VectorReader` that references the chunk is still live.
pub struct VectorReader {
    /// Start of the vector's data buffer, addressed byte-wise.
    data: *const u8,
    /// Validity mask of the vector; null means every row is valid (no NULLs).
    validity: *mut u64,
    /// Number of rows readable through this reader.
    row_count: usize,
}
52
53impl VectorReader {
54 /// Creates a new `VectorReader` for the given column in a data chunk.
55 ///
56 /// # Safety
57 ///
58 /// - `chunk` must be a valid `duckdb_data_chunk` for the duration of this reader's lifetime.
59 /// - `col_idx` must be a valid column index in the chunk.
60 pub unsafe fn new(chunk: duckdb_data_chunk, col_idx: usize) -> Self {
61 // SAFETY: Caller guarantees chunk is valid.
62 let row_count = usize::try_from(unsafe { duckdb_data_chunk_get_size(chunk) }).unwrap_or(0);
63 // SAFETY: col_idx is valid per caller's contract.
64 let vector = unsafe { duckdb_data_chunk_get_vector(chunk, col_idx as idx_t) };
65 // SAFETY: vector is non-null for valid column indices.
66 let data = unsafe { duckdb_vector_get_data(vector) }.cast::<u8>();
67 // SAFETY: may be null if all values are valid (no NULLs); checked in is_valid.
68 let validity = unsafe { duckdb_vector_get_validity(vector) };
69 Self {
70 data,
71 validity,
72 row_count,
73 }
74 }
75
76 /// Creates a `VectorReader` directly from a raw `duckdb_vector` handle.
77 ///
78 /// Use this when you already have a child vector (e.g., from
79 /// [`StructVector::get_child`][crate::vector::complex::StructVector::get_child] or
80 /// [`ListVector::get_child`][crate::vector::complex::ListVector::get_child]).
81 ///
82 /// # Safety
83 ///
84 /// - `vector` must be a valid `duckdb_vector` for the duration of this reader's lifetime.
85 /// - `row_count` must equal the number of valid rows in the vector.
86 pub unsafe fn from_vector(vector: duckdb_vector, row_count: usize) -> Self {
87 // SAFETY: vector is valid per caller's contract.
88 let data = unsafe { duckdb_vector_get_data(vector) }.cast::<u8>();
89 let validity = unsafe { duckdb_vector_get_validity(vector) };
90 Self {
91 data,
92 validity,
93 row_count,
94 }
95 }
96
97 /// Returns the number of rows in this vector.
98 #[mutants::skip]
99 #[must_use]
100 #[inline]
101 pub const fn row_count(&self) -> usize {
102 self.row_count
103 }
104
105 /// Returns `true` if the value at row `idx` is not NULL.
106 ///
107 /// # Safety
108 ///
109 /// `idx` must be less than `self.row_count()`.
110 #[inline]
111 pub unsafe fn is_valid(&self, idx: usize) -> bool {
112 if self.validity.is_null() {
113 return true;
114 }
115 // SAFETY: validity is non-null and idx is in bounds per caller's contract.
116 unsafe { duckdb_validity_row_is_valid(self.validity, idx as idx_t) }
117 }
118
119 /// Reads an `i8` (TINYINT) value at row `idx`.
120 ///
121 /// # Safety
122 ///
123 /// - `idx` must be less than `self.row_count()`.
124 /// - The column must contain `TINYINT` data.
125 /// - The value at `idx` must not be NULL (check with [`is_valid`][Self::is_valid]).
126 #[inline]
127 pub const unsafe fn read_i8(&self, idx: usize) -> i8 {
128 // SAFETY: data points to valid TINYINT array, idx is in bounds.
129 unsafe { core::ptr::read_unaligned(self.data.add(idx).cast::<i8>()) }
130 }
131
132 /// Reads an `i16` (SMALLINT) value at row `idx`.
133 ///
134 /// # Safety
135 ///
136 /// - `idx` must be less than `self.row_count()`.
137 /// - The column must contain `SMALLINT` data.
138 #[inline]
139 pub const unsafe fn read_i16(&self, idx: usize) -> i16 {
140 // SAFETY: 2-byte read from valid SMALLINT vector.
141 unsafe { core::ptr::read_unaligned(self.data.add(idx * 2).cast::<i16>()) }
142 }
143
144 /// Reads an `i32` (INTEGER) value at row `idx`.
145 ///
146 /// # Safety
147 ///
148 /// See [`read_i8`][Self::read_i8].
149 #[inline]
150 pub const unsafe fn read_i32(&self, idx: usize) -> i32 {
151 // SAFETY: 4-byte read from valid INTEGER vector.
152 unsafe { core::ptr::read_unaligned(self.data.add(idx * 4).cast::<i32>()) }
153 }
154
155 /// Reads an `i64` (BIGINT / TIMESTAMP) value at row `idx`.
156 ///
157 /// # Safety
158 ///
159 /// See [`read_i8`][Self::read_i8].
160 #[inline]
161 pub const unsafe fn read_i64(&self, idx: usize) -> i64 {
162 // SAFETY: 8-byte read from valid BIGINT/TIMESTAMP vector.
163 unsafe { core::ptr::read_unaligned(self.data.add(idx * 8).cast::<i64>()) }
164 }
165
166 /// Reads a `u8` (UTINYINT) value at row `idx`.
167 ///
168 /// # Safety
169 ///
170 /// See [`read_i8`][Self::read_i8].
171 #[inline]
172 pub const unsafe fn read_u8(&self, idx: usize) -> u8 {
173 // SAFETY: 1-byte read from valid UTINYINT vector.
174 unsafe { *self.data.add(idx) }
175 }
176
177 /// Reads a `u16` (USMALLINT) value at row `idx`.
178 ///
179 /// # Safety
180 ///
181 /// See [`read_i8`][Self::read_i8].
182 #[inline]
183 pub const unsafe fn read_u16(&self, idx: usize) -> u16 {
184 // SAFETY: 2-byte read from valid USMALLINT vector.
185 unsafe { core::ptr::read_unaligned(self.data.add(idx * 2).cast::<u16>()) }
186 }
187
188 /// Reads a `u32` (UINTEGER) value at row `idx`.
189 ///
190 /// # Safety
191 ///
192 /// See [`read_i8`][Self::read_i8].
193 #[inline]
194 pub const unsafe fn read_u32(&self, idx: usize) -> u32 {
195 // SAFETY: 4-byte read from valid UINTEGER vector.
196 unsafe { core::ptr::read_unaligned(self.data.add(idx * 4).cast::<u32>()) }
197 }
198
199 /// Reads a `u64` (UBIGINT) value at row `idx`.
200 ///
201 /// # Safety
202 ///
203 /// See [`read_i8`][Self::read_i8].
204 #[inline]
205 pub const unsafe fn read_u64(&self, idx: usize) -> u64 {
206 // SAFETY: 8-byte read from valid UBIGINT vector.
207 unsafe { core::ptr::read_unaligned(self.data.add(idx * 8).cast::<u64>()) }
208 }
209
210 /// Reads an `f32` (FLOAT) value at row `idx`.
211 ///
212 /// # Safety
213 ///
214 /// See [`read_i8`][Self::read_i8].
215 #[inline]
216 pub const unsafe fn read_f32(&self, idx: usize) -> f32 {
217 // SAFETY: 4-byte read from valid FLOAT vector.
218 unsafe { core::ptr::read_unaligned(self.data.add(idx * 4).cast::<f32>()) }
219 }
220
221 /// Reads an `f64` (DOUBLE) value at row `idx`.
222 ///
223 /// # Safety
224 ///
225 /// See [`read_i8`][Self::read_i8].
226 #[inline]
227 pub const unsafe fn read_f64(&self, idx: usize) -> f64 {
228 // SAFETY: 8-byte read from valid DOUBLE vector.
229 unsafe { core::ptr::read_unaligned(self.data.add(idx * 8).cast::<f64>()) }
230 }
231
232 /// Reads a `bool` (BOOLEAN) value at row `idx`.
233 ///
234 /// # Pitfall L5: Defensive boolean reading
235 ///
236 /// This method reads the underlying byte as `u8` and compares with `!= 0`,
237 /// rather than casting directly to `bool`. `DuckDB`'s C API does not guarantee
238 /// the Rust `bool` invariant (must be exactly 0 or 1), so a direct cast could
239 /// cause undefined behaviour.
240 ///
241 /// # Safety
242 ///
243 /// - `idx` must be less than `self.row_count()`.
244 /// - The column must contain `BOOLEAN` data.
245 #[inline]
246 pub const unsafe fn read_bool(&self, idx: usize) -> bool {
247 // SAFETY: BOOLEAN data is stored as 1 byte per value.
248 // We read as u8 (not bool) to avoid UB if DuckDB sets non-0/1 values.
249 // This is Pitfall L5: always read boolean as u8 then compare != 0.
250 unsafe { *self.data.add(idx) != 0 }
251 }
252
253 /// Reads an `i128` (HUGEINT) value at row `idx`.
254 ///
255 /// `DuckDB` stores HUGEINT as `{ lower: u64, upper: i64 }` in little-endian
256 /// layout, totaling 16 bytes per value.
257 ///
258 /// # Safety
259 ///
260 /// - `idx` must be less than `self.row_count()`.
261 /// - The column must contain `HUGEINT` data.
262 /// - The value at `idx` must not be NULL (check with [`is_valid`][Self::is_valid]).
263 #[inline]
264 pub const unsafe fn read_i128(&self, idx: usize) -> i128 {
265 // SAFETY: HUGEINT is stored as { lower: u64, upper: i64 } = 16 bytes.
266 // DuckDB lays this out in little-endian order: lower at offset 0, upper at offset 8.
267 let base = unsafe { self.data.add(idx * 16) };
268 let lower = unsafe { core::ptr::read_unaligned(base.cast::<u64>()) };
269 let upper = unsafe { core::ptr::read_unaligned(base.add(8).cast::<i64>()) };
270 // Widening casts: u64→i128 and i64→i128 are always lossless.
271 #[allow(clippy::cast_lossless)]
272 let result = (upper as i128) << 64 | (lower as i128);
273 result
274 }
275
276 /// Reads a VARCHAR value at row `idx`.
277 ///
278 /// Returns an empty string if the data is not valid UTF-8 or if the internal
279 /// string pointer is null.
280 ///
281 /// # Pitfall P7
282 ///
283 /// `DuckDB` stores strings in a 16-byte `duckdb_string_t` with two formats
284 /// (inline for ≤ 12 bytes, pointer otherwise). This method handles both.
285 ///
286 /// # Safety
287 ///
288 /// - `idx` must be less than `self.row_count()`.
289 /// - The column must contain `VARCHAR` data.
290 /// - For pointer-format strings, the pointed-to heap memory must be valid
291 /// for the lifetime of the returned `&str`.
292 pub unsafe fn read_str(&self, idx: usize) -> &str {
293 // SAFETY: Caller guarantees data is a VARCHAR vector and idx is in bounds.
294 unsafe { crate::vector::string::read_duck_string(self.data, idx) }
295 }
296
297 /// Reads a `BLOB` (binary) value at row `idx`.
298 ///
299 /// `DuckDB` stores BLOBs using the same 16-byte `duckdb_string_t` layout as
300 /// VARCHAR (inline for ≤12 bytes, pointer for larger values). The returned
301 /// slice borrows from the vector's data buffer.
302 ///
303 /// # Safety
304 ///
305 /// - `idx` must be less than `self.row_count()`.
306 /// - The column must contain `BLOB` data.
307 /// - The pointed-to memory must be valid for the lifetime of the returned slice.
308 pub unsafe fn read_blob(&self, idx: usize) -> &[u8] {
309 // SAFETY: BLOB uses the same duckdb_string_t layout as VARCHAR.
310 unsafe { crate::vector::string::read_duck_string(self.data, idx).as_bytes() }
311 }
312
313 /// Reads a `UUID` value at row `idx` as an `i128`.
314 ///
315 /// `DuckDB` stores UUID as a HUGEINT (128-bit integer). This is a semantic
316 /// alias for [`read_i128`][Self::read_i128].
317 ///
318 /// # Safety
319 ///
320 /// - `idx` must be less than `self.row_count()`.
321 /// - The column must contain `UUID` data.
322 #[inline]
323 pub const unsafe fn read_uuid(&self, idx: usize) -> i128 {
324 // SAFETY: UUID is stored as HUGEINT (i128).
325 unsafe { self.read_i128(idx) }
326 }
327
328 /// Reads a `DATE` value at row `idx` as days since the Unix epoch.
329 ///
330 /// `DuckDB` stores DATE as a 4-byte `i32` representing the number of days
331 /// since 1970-01-01. This is a semantic alias for [`read_i32`][Self::read_i32].
332 ///
333 /// # Safety
334 ///
335 /// - `idx` must be less than `self.row_count()`.
336 /// - The column must contain `DATE` data.
337 #[inline]
338 pub const unsafe fn read_date(&self, idx: usize) -> i32 {
339 // SAFETY: DATE is stored as i32 (days since epoch).
340 unsafe { self.read_i32(idx) }
341 }
342
343 /// Reads a `TIMESTAMP` value at row `idx` as microseconds since the Unix epoch.
344 ///
345 /// `DuckDB` stores TIMESTAMP as an 8-byte `i64` representing microseconds
346 /// since 1970-01-01 00:00:00 UTC. This is a semantic alias for
347 /// [`read_i64`][Self::read_i64].
348 ///
349 /// # Safety
350 ///
351 /// - `idx` must be less than `self.row_count()`.
352 /// - The column must contain `TIMESTAMP` data.
353 #[inline]
354 pub const unsafe fn read_timestamp(&self, idx: usize) -> i64 {
355 // SAFETY: TIMESTAMP is stored as i64 (microseconds since epoch).
356 unsafe { self.read_i64(idx) }
357 }
358
359 /// Reads a `TIME` value at row `idx` as microseconds since midnight.
360 ///
361 /// `DuckDB` stores TIME as an 8-byte `i64` representing microseconds since
362 /// midnight. This is a semantic alias for [`read_i64`][Self::read_i64].
363 ///
364 /// # Safety
365 ///
366 /// - `idx` must be less than `self.row_count()`.
367 /// - The column must contain `TIME` data.
368 #[inline]
369 pub const unsafe fn read_time(&self, idx: usize) -> i64 {
370 // SAFETY: TIME is stored as i64 (microseconds since midnight).
371 unsafe { self.read_i64(idx) }
372 }
373
374 /// Reads an `INTERVAL` value at row `idx`.
375 ///
376 /// Returns a [`DuckInterval`][crate::interval::DuckInterval] struct.
377 ///
378 /// # Pitfall P8
379 ///
380 /// The `INTERVAL` struct is 16 bytes: `{ months: i32, days: i32, micros: i64 }`.
381 /// This method handles the layout correctly using [`read_interval_at`][crate::interval::read_interval_at].
382 ///
383 /// # Safety
384 ///
385 /// - `idx` must be less than `self.row_count()`.
386 /// - The column must contain `INTERVAL` data.
387 #[inline]
388 pub const unsafe fn read_interval(&self, idx: usize) -> crate::interval::DuckInterval {
389 // SAFETY: data is a valid INTERVAL vector and idx is in bounds.
390 unsafe { crate::interval::read_interval_at(self.data, idx) }
391 }
392}
393
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a reader over an in-memory byte buffer with no validity mask,
    /// so the typed readers can be exercised without a live `DuckDB` chunk.
    fn reader_over(bytes: &[u8], row_count: usize) -> VectorReader {
        VectorReader {
            data: bytes.as_ptr(),
            validity: std::ptr::null_mut(),
            row_count,
        }
    }

    /// Verify that `VectorReader` handles the boolean-as-u8 pattern correctly.
    #[test]
    fn bool_read_u8_pattern() {
        // Simulate a DuckDB BOOLEAN vector containing non-standard values
        // (2 and 255) and read it through `read_bool` itself, so the test
        // fails if the implementation ever stops using the `!= 0` comparison.
        let data: [u8; 4] = [0, 1, 2, 255];
        let reader = reader_over(&data, data.len());

        let as_bools: Vec<bool> = (0..reader.row_count())
            // SAFETY: every row index is below row_count and the buffer holds
            // one byte per row.
            .map(|row| unsafe { reader.read_bool(row) })
            .collect();
        assert_eq!(as_bools, [false, true, true, true]);
    }

    /// Fixed-width readers must step through the buffer by the element size.
    #[test]
    fn fixed_width_reads_use_element_stride() {
        let values: [i32; 3] = [10, -20, 30];
        let bytes: Vec<u8> = values.iter().flat_map(|v| v.to_ne_bytes()).collect();
        let reader = reader_over(&bytes, values.len());

        for (row, expected) in values.iter().enumerate() {
            // SAFETY: row is below row_count and the buffer holds 4 bytes per row.
            assert_eq!(unsafe { reader.read_i32(row) }, *expected);
        }
    }

    /// HUGEINT entries are `{ lower: u64, upper: i64 }`, lower word first.
    #[test]
    fn hugeint_combines_lower_and_upper_words() {
        let mut bytes = Vec::with_capacity(16);
        bytes.extend_from_slice(&1_u64.to_ne_bytes());
        bytes.extend_from_slice(&(-1_i64).to_ne_bytes());
        let reader = reader_over(&bytes, 1);

        // SAFETY: row 0 is in bounds and the buffer holds one 16-byte entry.
        let value = unsafe { reader.read_i128(0) };
        assert_eq!(value, -(1_i128 << 64) + 1);
    }

    #[test]
    fn row_count_is_zero_for_empty_state() {
        // This exercises the struct layout; actual DuckDB integration is in tests/
        let reader = VectorReader {
            data: std::ptr::null(),
            validity: std::ptr::null_mut(),
            row_count: 0,
        };
        assert_eq!(reader.row_count(), 0);
    }

    #[test]
    fn is_valid_when_validity_null() {
        // When validity is null, all rows are considered valid
        let reader = VectorReader {
            data: std::ptr::null(),
            validity: std::ptr::null_mut(),
            row_count: 5,
        };
        // SAFETY: rows 0 and 4 are in bounds (row_count = 5); validity is
        // null, so no DuckDB call is made.
        assert!(unsafe { reader.is_valid(0) });
        assert!(unsafe { reader.is_valid(4) });
    }
}
435}