Skip to main content

quack_rs/vector/
string.rs

1// SPDX-License-Identifier: MIT
2// Copyright 2026 Tom F. <https://github.com/tomtom215/>
3// My way of giving something small back to the open source community
4// and encouraging more Rust development!
5
6//! `DuckDB` `VARCHAR` (`duckdb_string_t`) reading utilities.
7//!
8//! # Pitfall P7: Undocumented `duckdb_string_t` format
9//!
10//! `DuckDB` stores VARCHAR values in a 16-byte `duckdb_string_t` struct with two
11//! representations:
12//!
13//! - **Inline** (length ≤ 12): `[ len: u32 | data: [u8; 12] ]`
14//! - **Pointer** (length > 12): `[ len: u32 | prefix: [u8; 4] | ptr: *const u8 ]` (4 + 4 + 8 = 16 bytes on 64-bit targets)
15//!
16//! This is not documented in the Rust bindings. The layout was determined by
17//! reading `DuckDB`'s C source and confirmed by the duckdb-behavioral implementation.
18//!
19//! # Example
20//!
21//! ```rust
22//! use quack_rs::vector::string::{DuckStringView, read_duck_string};
23//!
24//! // A short string (inline case)
25//! let bytes: [u8; 16] = {
26//!     let mut b = [0u8; 16];
27//!     b[0] = 5; // length = 5
28//!     b[4..9].copy_from_slice(b"hello");
29//!     b
30//! };
31//! let view = DuckStringView::from_bytes(&bytes);
32//! assert_eq!(view.as_str(), Some("hello"));
33//! assert_eq!(view.len(), 5);
34//! ```
35
/// The size of a `duckdb_string_t` in bytes.
pub const DUCK_STRING_SIZE: usize = 16;

/// The maximum string length (in bytes) that is stored inline in a `duckdb_string_t`.
///
/// Strings longer than this use the pointer representation.
pub const DUCK_STRING_INLINE_MAX_LEN: usize = 12;

/// A parsed, non-allocating view of a single `duckdb_string_t` value.
///
/// The view keeps a borrow of the raw 16-byte struct together with the decoded
/// length field. The string content itself is resolved lazily by
/// [`DuckStringView::as_str`].
///
/// # Validity requirements
///
/// The raw bytes from which this view is created must outlive the view.
/// For pointer-format values (length > 12), [`DuckStringView::as_str`]
/// dereferences the address embedded in the raw bytes, so a view must only be
/// built from bytes whose embedded pointer is either null or refers to at least
/// `len()` readable bytes that stay alive for `'a`.
#[derive(Debug, Clone, Copy)]
pub struct DuckStringView<'a> {
    /// The raw 16-byte `duckdb_string_t` representation.
    bytes: &'a [u8],
    /// String length in bytes, decoded from the first four bytes of `bytes`.
    length: usize,
}

impl<'a> DuckStringView<'a> {
    /// Creates a `DuckStringView` from the raw 16-byte representation.
    ///
    /// The input is a fixed-size `[u8; 16]` reference, so the size is
    /// enforced at compile time.
    ///
    /// The length field (bytes 0–3) is decoded in the platform's native byte
    /// order because a `duckdb_string_t` is an in-memory struct, not a
    /// serialized format. On little-endian targets this is identical to a
    /// little-endian decode.
    #[must_use]
    pub const fn from_bytes(raw: &'a [u8; DUCK_STRING_SIZE]) -> Self {
        let length = u32::from_ne_bytes([raw[0], raw[1], raw[2], raw[3]]) as usize;
        Self { bytes: raw, length }
    }

    /// Returns the length of the string in bytes.
    #[must_use]
    #[inline]
    pub const fn len(&self) -> usize {
        self.length
    }

    /// Returns `true` if the string is empty.
    #[must_use]
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.length == 0
    }

    /// Returns the string as a UTF-8 `str` slice.
    ///
    /// Returns `None` if the content is not valid UTF-8, or if the value is in
    /// pointer format and its embedded pointer is null.
    ///
    /// The returned `&'a str` has the same lifetime as the underlying data —
    /// not the lifetime of `self` — so the result may outlive the view.
    ///
    /// # Validity requirements
    ///
    /// For pointer-format strings (length > 12), the pointer stored at byte
    /// offset 8 must point to at least `self.len()` bytes of string data that
    /// are live for lifetime `'a`. This method cannot verify that; it only
    /// rejects a null pointer.
    #[must_use]
    pub fn as_str(&self) -> Option<&'a str> {
        let content = self.as_bytes_unsafe()?;
        std::str::from_utf8(content).ok()
    }

    /// Returns the raw bytes of the string content.
    ///
    /// Returns `None` if the internal pointer (for long strings) is null.
    ///
    /// The returned bytes have lifetime `'a` (the lifetime of the underlying data).
    ///
    /// # Layout handling
    ///
    /// - Inline values (length ≤ 12): the content is the `length` bytes that
    ///   start at byte offset 4 of the struct.
    /// - Pointer values (length > 12): a native-endian, pointer-width address
    ///   is read starting at byte offset 8. Reading `size_of::<usize>()` bytes
    ///   (instead of a hard-coded 8) keeps this compiling on targets whose
    ///   pointers are not 8 bytes wide.
    ///
    /// # Safety (internal)
    ///
    /// This method dereferences the pointer stored in the `duckdb_string_t`
    /// for strings longer than 12 bytes. Whoever constructed the view must
    /// ensure the pointed-to data is still valid.
    fn as_bytes_unsafe(&self) -> Option<&'a [u8]> {
        /// Byte offset of the inline payload within the struct.
        const DATA_OFFSET: usize = 4;
        /// Byte offset of the heap pointer within the struct.
        const PTR_OFFSET: usize = 8;
        /// Width of a pointer on the compilation target.
        const PTR_WIDTH: usize = std::mem::size_of::<usize>();

        // Copy the reference out so every derived slice carries lifetime `'a`.
        let raw: &'a [u8] = self.bytes;

        if self.length <= DUCK_STRING_INLINE_MAX_LEN {
            // Inline case: at most 4 + 12 = 16 bytes, always inside `raw`.
            return Some(&raw[DATA_OFFSET..DATA_OFFSET + self.length]);
        }

        // Pointer case: decode the address exactly as it sits in memory.
        let mut ptr_bytes = [0u8; PTR_WIDTH];
        ptr_bytes.copy_from_slice(&raw[PTR_OFFSET..PTR_OFFSET + PTR_WIDTH]);
        let ptr = usize::from_ne_bytes(ptr_bytes) as *const u8;
        if ptr.is_null() {
            return None;
        }

        // SAFETY: `ptr` is non-null, and the creator of this view guarantees it
        // refers to at least `self.length` readable bytes valid for `'a`.
        Some(unsafe { std::slice::from_raw_parts(ptr, self.length) })
    }
}
133
134/// Reads a `DuckDB` `VARCHAR` value from a raw vector data pointer at a given row index.
135///
136/// Returns the string as a `&str` slice, or an empty string if the data is not
137/// valid UTF-8 or if the pointer is null.
138///
139/// # Pitfall P7
140///
141/// `DuckDB` strings have two storage formats:
142/// - **Inline** (≤ 12 bytes): stored directly in the 16-byte struct
143/// - **Pointer** (> 12 bytes): struct contains a pointer to heap-allocated data
144///
145/// This function handles both transparently.
146///
147/// # Safety
148///
149/// - `data` must point to a `DuckDB` VARCHAR vector's data buffer.
150/// - `idx` must be within bounds of the vector.
151/// - For pointer-format strings, the heap data pointed to must be valid for the
152///   duration of this function call and the returned `&str` slice.
153/// - The returned `&str` borrows from the `DuckDB` vector — do not destroy the
154///   data chunk while the returned reference is live.
155///
156/// # Example
157///
158/// ```rust,no_run
159/// use quack_rs::vector::string::read_duck_string;
160///
161/// // Inside a DuckDB aggregate callback:
162/// // let data = libduckdb_sys::duckdb_vector_get_data(vec) as *const u8;
163/// // let s = unsafe { read_duck_string(data, row_idx) };
164/// # let data: *const u8 = std::ptr::null();
165/// # let _ = data;
166/// ```
167pub unsafe fn read_duck_string<'a>(data: *const u8, idx: usize) -> &'a str {
168    // SAFETY: Each duckdb_string_t is exactly 16 bytes. The caller guarantees
169    // `data` is valid and `idx` is in bounds.
170    let str_ptr = unsafe { data.add(idx * DUCK_STRING_SIZE) };
171    // SAFETY: `str_ptr` points to the idx-th duckdb_string_t in the vector.
172    // The reference has lifetime 'a because it borrows from the raw pointer
173    // whose backing data lives for the vector's lifetime ('a per caller's contract).
174    let raw_bytes: &'a [u8; DUCK_STRING_SIZE] =
175        unsafe { &*str_ptr.cast::<[u8; DUCK_STRING_SIZE]>() };
176    // DuckStringView<'a> stores &'a [u8], so as_str() returns Option<&'a str>.
177    DuckStringView::from_bytes(raw_bytes).as_str().unwrap_or("")
178}
179
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// Builds the inline (length ≤ 12) representation of `s`.
    fn make_inline_bytes(s: &str) -> [u8; 16] {
        assert!(
            s.len() <= DUCK_STRING_INLINE_MAX_LEN,
            "use pointer format for long strings"
        );
        let mut bytes = [0u8; 16];
        let len = u32::try_from(s.len()).unwrap_or(u32::MAX);
        // The struct is an in-memory layout, so fields use native byte order.
        bytes[..4].copy_from_slice(&len.to_ne_bytes());
        bytes[4..4 + s.len()].copy_from_slice(s.as_bytes());
        bytes
    }

    /// Builds the pointer (length > 12) representation of `s`.
    ///
    /// `s` is `'static` so the embedded address stays valid for as long as the
    /// returned bytes are used.
    fn make_pointer_bytes(s: &'static str) -> [u8; 16] {
        assert!(
            s.len() > DUCK_STRING_INLINE_MAX_LEN,
            "use inline format for short strings"
        );
        let mut bytes = [0u8; 16];
        // Length field.
        bytes[..4].copy_from_slice(&u32::try_from(s.len()).unwrap_or(u32::MAX).to_ne_bytes());
        // Prefix (first 4 bytes of the string).
        bytes[4..8].copy_from_slice(&s.as_bytes()[..4]);
        // Pointer at offset 8, written at the target's pointer width so the
        // copy does not panic on targets where `usize` is not 8 bytes.
        let ptr_val = s.as_ptr() as usize;
        bytes[8..8 + size_of::<usize>()].copy_from_slice(&ptr_val.to_ne_bytes());
        bytes
    }

    #[test]
    fn empty_string_inline() {
        let bytes = make_inline_bytes("");
        let view = DuckStringView::from_bytes(&bytes);
        assert_eq!(view.len(), 0);
        assert!(view.is_empty());
        assert_eq!(view.as_str(), Some(""));
    }

    #[test]
    fn short_string_inline() {
        let bytes = make_inline_bytes("hello");
        let view = DuckStringView::from_bytes(&bytes);
        assert_eq!(view.len(), 5);
        assert!(!view.is_empty());
        assert_eq!(view.as_str(), Some("hello"));
    }

    #[test]
    fn max_inline_string() {
        let s = "abcdefghijkl"; // exactly 12 bytes
        assert_eq!(s.len(), DUCK_STRING_INLINE_MAX_LEN);
        let bytes = make_inline_bytes(s);
        let view = DuckStringView::from_bytes(&bytes);
        assert_eq!(view.len(), 12);
        assert_eq!(view.as_str(), Some(s));
    }

    #[test]
    fn invalid_utf8_inline_returns_none() {
        let mut bytes = [0u8; 16];
        bytes[..4].copy_from_slice(&2u32.to_ne_bytes());
        bytes[4] = 0xFF;
        bytes[5] = 0xFE;

        let view = DuckStringView::from_bytes(&bytes);
        assert_eq!(view.len(), 2);
        assert!(view.as_str().is_none());
    }

    #[test]
    fn pointer_format_string() {
        let long_str = "this is a longer string that exceeds 12 bytes";
        let bytes = make_pointer_bytes(long_str);

        let view = DuckStringView::from_bytes(&bytes);
        assert_eq!(view.len(), long_str.len());
        assert_eq!(view.as_str(), Some(long_str));
    }

    #[test]
    fn pointer_null_returns_none() {
        let mut bytes = [0u8; 16];
        // Write length > 12
        bytes[..4].copy_from_slice(&13u32.to_ne_bytes());
        // pointer bytes starting at offset 8 remain 0 (null pointer)

        let view = DuckStringView::from_bytes(&bytes);
        // Null pointer for long string should return None
        assert!(view.as_str().is_none());
    }

    #[test]
    fn read_duck_string_inline() {
        let bytes = make_inline_bytes("world");
        let data = bytes.as_ptr();
        // SAFETY: data points to a valid 16-byte inline string at idx 0.
        let s = unsafe { read_duck_string(data, 0) };
        assert_eq!(s, "world");
    }

    #[test]
    fn read_duck_string_pointer_format() {
        let long_str = "abcdefghijklmnopqrst"; // 20 bytes
        let bytes = make_pointer_bytes(long_str);

        // SAFETY: bytes is a valid pointer-format duckdb_string_t at idx 0.
        let s = unsafe { read_duck_string(bytes.as_ptr(), 0) };
        assert_eq!(s, long_str);
    }

    #[test]
    fn read_duck_string_nonzero_index() {
        // Two consecutive 16-byte slots, as in a real vector buffer.
        let mut buffer = [0u8; 32];
        buffer[..16].copy_from_slice(&make_inline_bytes("first"));
        buffer[16..].copy_from_slice(&make_inline_bytes("second"));

        // SAFETY: buffer holds two valid inline strings; idx 0 and 1 are in bounds.
        let (a, b) = unsafe {
            (
                read_duck_string(buffer.as_ptr(), 0),
                read_duck_string(buffer.as_ptr(), 1),
            )
        };
        assert_eq!(a, "first");
        assert_eq!(b, "second");
    }

    #[test]
    fn read_duck_string_invalid_utf8_returns_empty() {
        let mut bytes = [0u8; 16];
        bytes[..4].copy_from_slice(&1u32.to_ne_bytes());
        bytes[4] = 0xFF;

        // SAFETY: bytes is a 16-byte inline value at idx 0.
        let s = unsafe { read_duck_string(bytes.as_ptr(), 0) };
        assert_eq!(s, "");
    }

    #[test]
    fn read_duck_string_null_heap_pointer_returns_empty() {
        let mut bytes = [0u8; 16];
        bytes[..4].copy_from_slice(&13u32.to_ne_bytes());

        // SAFETY: bytes is a 16-byte pointer-format value whose pointer is
        // null; the null is detected before any dereference.
        let s = unsafe { read_duck_string(bytes.as_ptr(), 0) };
        assert_eq!(s, "");
    }

    #[test]
    fn duck_string_size_constant() {
        assert_eq!(DUCK_STRING_SIZE, 16);
    }

    #[test]
    fn duck_string_inline_max_len_constant() {
        assert_eq!(DUCK_STRING_INLINE_MAX_LEN, 12);
    }
}