quack-rs 0.12.0

Production-grade Rust SDK for building DuckDB loadable extensions
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
// SPDX-License-Identifier: MIT
// Copyright 2026 Tom F. <https://github.com/tomtom215/>
// My way of giving something small back to the open source community
// and encouraging more Rust development!

//! Safe typed reading from `DuckDB` data vectors.
//!
//! [`VectorReader`] provides safe access to the typed data in a `DuckDB` vector
//! without requiring direct raw pointer manipulation.
//!
//! # Pitfalls solved
//!
//! - **L5**: Booleans are read as `u8 != 0`, never as `bool`, because `DuckDB`'s
//!   C API does not guarantee the Rust `bool` invariant (must be 0 or 1).
//!
//! # Example
//!
//! ```rust,no_run
//! use quack_rs::vector::VectorReader;
//! use libduckdb_sys::{duckdb_data_chunk, duckdb_data_chunk_get_vector,
//!                     duckdb_data_chunk_get_size};
//!
//! // Inside a DuckDB aggregate `update` callback:
//! // let reader = unsafe { VectorReader::new(chunk, 0) };
//! // for row in 0..reader.row_count() {
//! //     if unsafe { reader.is_valid(row) } {
//! //         let val = unsafe { reader.read_i64(row) };
//! //     }
//! // }
//! ```

use libduckdb_sys::{
    duckdb_data_chunk, duckdb_data_chunk_get_size, duckdb_data_chunk_get_vector,
    duckdb_validity_row_is_valid, duckdb_vector, duckdb_vector_get_data,
    duckdb_vector_get_validity, idx_t,
};

/// A typed reader for a single column in a `DuckDB` data chunk.
///
/// `VectorReader` wraps a pointer to a `DuckDB` vector's data buffer and
/// provides typed accessor methods for common `DuckDB` types, so callers do
/// not have to do the pointer arithmetic themselves.
///
/// # Lifetimes
///
/// The reader stores raw pointers into the data chunk and carries no lifetime
/// parameter, so the compiler cannot enforce the borrow. Do not call
/// `duckdb_destroy_data_chunk` while a `VectorReader` that references the
/// chunk is live.
#[derive(Debug)]
pub struct VectorReader {
    /// Start of the vector's data buffer, addressed byte-wise.
    data: *const u8,
    /// Validity mask of the vector; null when every row is valid (no NULLs).
    validity: *mut u64,
    /// Number of rows the reader may address.
    row_count: usize,
}

impl VectorReader {
    /// Creates a new `VectorReader` for the given column in a data chunk.
    ///
    /// # Safety
    ///
    /// - `chunk` must be a valid `duckdb_data_chunk` for the duration of this reader's lifetime.
    /// - `col_idx` must be a valid column index in the chunk.
    pub unsafe fn new(chunk: duckdb_data_chunk, col_idx: usize) -> Self {
        // SAFETY: Caller guarantees chunk is valid.
        let row_count = usize::try_from(unsafe { duckdb_data_chunk_get_size(chunk) }).unwrap_or(0);
        // SAFETY: col_idx is valid per caller's contract.
        let vector = unsafe { duckdb_data_chunk_get_vector(chunk, col_idx as idx_t) };
        // SAFETY: vector is non-null for valid column indices.
        let data = unsafe { duckdb_vector_get_data(vector) }.cast::<u8>();
        // SAFETY: may be null if all values are valid (no NULLs); checked in is_valid.
        let validity = unsafe { duckdb_vector_get_validity(vector) };
        Self {
            data,
            validity,
            row_count,
        }
    }

    /// Wraps an existing raw `duckdb_vector` handle in a `VectorReader`.
    ///
    /// Intended for vectors obtained without going through a data chunk, such
    /// as the child vectors returned by
    /// [`StructVector::get_child`][crate::vector::complex::StructVector::get_child] or
    /// [`ListVector::get_child`][crate::vector::complex::ListVector::get_child].
    ///
    /// # Safety
    ///
    /// - `vector` must be a valid `duckdb_vector` for the duration of this reader's lifetime.
    /// - `row_count` must equal the number of valid rows in the vector.
    pub unsafe fn from_vector(vector: duckdb_vector, row_count: usize) -> Self {
        // SAFETY: the caller guarantees `vector` is a valid handle.
        let buffer = unsafe { duckdb_vector_get_data(vector) };
        // SAFETY: same handle as above. The returned mask may be null when the
        // vector holds no NULLs; `is_valid` checks for that.
        let mask = unsafe { duckdb_vector_get_validity(vector) };
        Self {
            data: buffer.cast::<u8>(),
            validity: mask,
            row_count,
        }
    }

    /// Returns the number of rows in this vector.
    #[mutants::skip]
    #[must_use]
    #[inline]
    pub const fn row_count(&self) -> usize {
        self.row_count
    }

    /// Reports whether row `idx` holds a value (`true`) or SQL NULL (`false`).
    ///
    /// A reader whose validity mask is null treats every row as valid.
    ///
    /// # Safety
    ///
    /// `idx` must be less than `self.row_count()`.
    #[inline]
    pub unsafe fn is_valid(&self, idx: usize) -> bool {
        // The mask is only consulted when it exists: `||` short-circuits on a
        // null mask, so the FFI call below never sees a null pointer.
        // SAFETY: `validity` is non-null on the right-hand side and `idx` is
        // in bounds per the caller's contract.
        self.validity.is_null()
            || unsafe { duckdb_validity_row_is_valid(self.validity, idx as idx_t) }
    }

    /// Returns the `i8` (TINYINT) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `TINYINT` data.
    /// - The value at `idx` must not be NULL (check with [`is_valid`][Self::is_valid]).
    #[inline]
    pub const unsafe fn read_i8(&self, idx: usize) -> i8 {
        // SAFETY: `data` points at a TINYINT array (one byte per row) and
        // `idx` is in bounds per the caller's contract.
        unsafe { self.data.add(idx).cast::<i8>().read_unaligned() }
    }

    /// Returns the `i16` (SMALLINT) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `SMALLINT` data.
    #[inline]
    pub const unsafe fn read_i16(&self, idx: usize) -> i16 {
        // Rows are packed back to back, two bytes each.
        let byte_offset = idx * core::mem::size_of::<i16>();
        // SAFETY: the offset stays inside the SMALLINT buffer because `idx`
        // is in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<i16>().read_unaligned() }
    }

    /// Returns the `i32` (INTEGER) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to an `INTEGER`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_i32(&self, idx: usize) -> i32 {
        // Rows are packed back to back, four bytes each.
        let byte_offset = idx * core::mem::size_of::<i32>();
        // SAFETY: the offset stays inside the INTEGER buffer because `idx`
        // is in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<i32>().read_unaligned() }
    }

    /// Returns the `i64` (BIGINT / TIMESTAMP) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `BIGINT` or
    /// `TIMESTAMP` column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_i64(&self, idx: usize) -> i64 {
        // Rows are packed back to back, eight bytes each.
        let byte_offset = idx * core::mem::size_of::<i64>();
        // SAFETY: the offset stays inside the BIGINT/TIMESTAMP buffer because
        // `idx` is in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<i64>().read_unaligned() }
    }

    /// Returns the `u8` (UTINYINT) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `UTINYINT`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_u8(&self, idx: usize) -> u8 {
        // SAFETY: `data` points at a UTINYINT array (one byte per row) and
        // `idx` is in bounds; a `u8` has no alignment requirement.
        unsafe { self.data.add(idx).read() }
    }

    /// Returns the `u16` (USMALLINT) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `USMALLINT`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_u16(&self, idx: usize) -> u16 {
        // Rows are packed back to back, two bytes each.
        let byte_offset = idx * core::mem::size_of::<u16>();
        // SAFETY: the offset stays inside the USMALLINT buffer because `idx`
        // is in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<u16>().read_unaligned() }
    }

    /// Returns the `u32` (UINTEGER) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `UINTEGER`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_u32(&self, idx: usize) -> u32 {
        // Rows are packed back to back, four bytes each.
        let byte_offset = idx * core::mem::size_of::<u32>();
        // SAFETY: the offset stays inside the UINTEGER buffer because `idx`
        // is in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<u32>().read_unaligned() }
    }

    /// Returns the `u64` (UBIGINT) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `UBIGINT`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_u64(&self, idx: usize) -> u64 {
        // Rows are packed back to back, eight bytes each.
        let byte_offset = idx * core::mem::size_of::<u64>();
        // SAFETY: the offset stays inside the UBIGINT buffer because `idx`
        // is in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<u64>().read_unaligned() }
    }

    /// Returns the `f32` (FLOAT) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `FLOAT`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_f32(&self, idx: usize) -> f32 {
        // Rows are packed back to back, four bytes each.
        let byte_offset = idx * core::mem::size_of::<f32>();
        // SAFETY: the offset stays inside the FLOAT buffer because `idx` is
        // in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<f32>().read_unaligned() }
    }

    /// Returns the `f64` (DOUBLE) stored at row `idx`.
    ///
    /// # Safety
    ///
    /// Same contract as [`read_i8`][Self::read_i8], applied to a `DOUBLE`
    /// column: `idx` must be in bounds and the value must not be NULL.
    #[inline]
    pub const unsafe fn read_f64(&self, idx: usize) -> f64 {
        // Rows are packed back to back, eight bytes each.
        let byte_offset = idx * core::mem::size_of::<f64>();
        // SAFETY: the offset stays inside the DOUBLE buffer because `idx` is
        // in bounds; the read tolerates any alignment.
        unsafe { self.data.add(byte_offset).cast::<f64>().read_unaligned() }
    }

    /// Returns the `bool` (BOOLEAN) stored at row `idx`.
    ///
    /// # Pitfall L5: Defensive boolean reading
    ///
    /// The stored byte is loaded as a `u8` and tested with `!= 0`; it is never
    /// reinterpreted as a Rust `bool`. `DuckDB`'s C API does not promise the
    /// Rust `bool` invariant (exactly 0 or 1), so a direct cast could be
    /// undefined behaviour. Any non-zero byte therefore reads as `true`.
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `BOOLEAN` data.
    #[inline]
    pub const unsafe fn read_bool(&self, idx: usize) -> bool {
        // SAFETY: BOOLEAN data is one byte per row and `idx` is in bounds.
        // The byte is loaded as `u8`, which is valid for every bit pattern.
        let raw_byte = unsafe { self.data.add(idx).read() };
        // Pitfall L5: compare against zero instead of transmuting to `bool`.
        raw_byte != 0
    }

    /// Returns the `i128` (HUGEINT) stored at row `idx`.
    ///
    /// `DuckDB` stores HUGEINT as `{ lower: u64, upper: i64 }`, 16 bytes per
    /// value, with `lower` first. The two halves are read separately and
    /// recombined as `(upper << 64) | lower`.
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `HUGEINT` data.
    /// - The value at `idx` must not be NULL (check with [`is_valid`][Self::is_valid]).
    #[inline]
    pub const unsafe fn read_i128(&self, idx: usize) -> i128 {
        // SAFETY: each HUGEINT occupies 16 bytes and `idx` is in bounds, so
        // the slot and both 8-byte halves lie inside the buffer.
        let slot = unsafe { self.data.add(idx * 16) };
        // SAFETY: `lower` sits at offset 0 of the slot.
        let low_half = unsafe { slot.cast::<u64>().read_unaligned() };
        // SAFETY: `upper` sits at offset 8 of the slot.
        let high_half = unsafe { slot.add(8).cast::<i64>().read_unaligned() };
        // Widening casts: u64→i128 zero-extends and i64→i128 sign-extends,
        // both without loss.
        #[allow(clippy::cast_lossless)]
        let combined = ((high_half as i128) << 64) | (low_half as i128);
        combined
    }

    /// Returns the VARCHAR stored at row `idx` as a string slice.
    ///
    /// The result is an empty string when the stored bytes are not valid UTF-8
    /// or when the internal string pointer is null.
    ///
    /// # Pitfall P7
    ///
    /// `DuckDB` stores strings in a 16-byte `duckdb_string_t` with two formats
    /// (inline for ≤ 12 bytes, pointer otherwise). Both are handled here.
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `VARCHAR` data.
    /// - For pointer-format strings, the pointed-to heap memory must be valid
    ///   for the lifetime of the returned `&str`.
    pub unsafe fn read_str(&self, idx: usize) -> &str {
        use crate::vector::string::read_duck_string;

        // SAFETY: the caller guarantees `data` is a VARCHAR buffer and that
        // `idx` is in bounds.
        unsafe { read_duck_string(self.data, idx) }
    }

    /// Reads a `BLOB` (binary) value at row `idx`.
    ///
    /// `DuckDB` stores BLOBs using the same 16-byte `duckdb_string_t` layout as
    /// VARCHAR (inline for ≤12 bytes, pointer for larger values). The returned
    /// slice borrows from the vector's data buffer.
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `BLOB` data.
    /// - The pointed-to memory must be valid for the lifetime of the returned slice.
    pub unsafe fn read_blob(&self, idx: usize) -> &[u8] {
        // SAFETY: BLOB uses the same duckdb_string_t layout as VARCHAR.
        unsafe { crate::vector::string::read_duck_string(self.data, idx).as_bytes() }
    }

    /// Reads a `UUID` value at row `idx` as an `i128`.
    ///
    /// `DuckDB` stores UUID as a HUGEINT (128-bit integer). This is a semantic
    /// alias for [`read_i128`][Self::read_i128].
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `UUID` data.
    #[inline]
    pub const unsafe fn read_uuid(&self, idx: usize) -> i128 {
        // SAFETY: UUID is stored as HUGEINT (i128).
        unsafe { self.read_i128(idx) }
    }

    /// Reads a `DATE` value at row `idx` as days since the Unix epoch.
    ///
    /// `DuckDB` stores DATE as a 4-byte `i32` representing the number of days
    /// since 1970-01-01. This is a semantic alias for [`read_i32`][Self::read_i32].
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `DATE` data.
    #[inline]
    pub const unsafe fn read_date(&self, idx: usize) -> i32 {
        // SAFETY: DATE is stored as i32 (days since epoch).
        unsafe { self.read_i32(idx) }
    }

    /// Reads a `TIMESTAMP` value at row `idx` as microseconds since the Unix epoch.
    ///
    /// `DuckDB` stores TIMESTAMP as an 8-byte `i64` representing microseconds
    /// since 1970-01-01 00:00:00 UTC. This is a semantic alias for
    /// [`read_i64`][Self::read_i64].
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `TIMESTAMP` data.
    #[inline]
    pub const unsafe fn read_timestamp(&self, idx: usize) -> i64 {
        // SAFETY: TIMESTAMP is stored as i64 (microseconds since epoch).
        unsafe { self.read_i64(idx) }
    }

    /// Reads a `TIME` value at row `idx` as microseconds since midnight.
    ///
    /// `DuckDB` stores TIME as an 8-byte `i64` representing microseconds since
    /// midnight. This is a semantic alias for [`read_i64`][Self::read_i64].
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `TIME` data.
    #[inline]
    pub const unsafe fn read_time(&self, idx: usize) -> i64 {
        // SAFETY: TIME is stored as i64 (microseconds since midnight).
        unsafe { self.read_i64(idx) }
    }

    /// Reads an `INTERVAL` value at row `idx`.
    ///
    /// Returns a [`DuckInterval`][crate::interval::DuckInterval] struct.
    ///
    /// # Pitfall P8
    ///
    /// The `INTERVAL` struct is 16 bytes: `{ months: i32, days: i32, micros: i64 }`.
    /// This method handles the layout correctly using [`read_interval_at`][crate::interval::read_interval_at].
    ///
    /// # Safety
    ///
    /// - `idx` must be less than `self.row_count()`.
    /// - The column must contain `INTERVAL` data.
    #[inline]
    pub const unsafe fn read_interval(&self, idx: usize) -> crate::interval::DuckInterval {
        // SAFETY: data is a valid INTERVAL vector and idx is in bounds.
        unsafe { crate::interval::read_interval_at(self.data, idx) }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Verify that `read_bool` maps every non-zero byte to `true`.
    #[test]
    fn bool_read_u8_pattern() {
        // Simulate a DuckDB BOOLEAN vector containing non-standard values
        // (2 and 255), which are not valid bit patterns for a Rust `bool`.
        let data: [u8; 4] = [0, 1, 2, 255];

        // A reader can be built straight from the struct fields, so the real
        // `read_bool` is exercised rather than a re-implementation of it.
        let reader = VectorReader {
            data: data.as_ptr(),
            validity: std::ptr::null_mut(),
            row_count: data.len(),
        };

        let as_bools: Vec<bool> = (0..reader.row_count())
            // SAFETY: every `row` is below `row_count`, which equals
            // `data.len()`, and `data` outlives `reader`.
            .map(|row| unsafe { reader.read_bool(row) })
            .collect();
        assert_eq!(as_bools, [false, true, true, true]);
    }

    #[test]
    fn row_count_is_zero_for_empty_state() {
        // This exercises the struct layout; actual DuckDB integration is in tests/
        let reader = VectorReader {
            data: std::ptr::null(),
            validity: std::ptr::null_mut(),
            row_count: 0,
        };
        assert_eq!(reader.row_count(), 0);
    }

    #[test]
    fn is_valid_when_validity_null() {
        // When validity is null, all rows are considered valid
        let reader = VectorReader {
            data: std::ptr::null(),
            validity: std::ptr::null_mut(),
            row_count: 5,
        };
        // SAFETY: rows 0 and 4 are in bounds (row_count = 5); validity is null,
        // so no FFI call is made.
        assert!(unsafe { reader.is_valid(0) });
        assert!(unsafe { reader.is_valid(4) });
    }
}