quack_rs/vector/string.rs
1// SPDX-License-Identifier: MIT
2// Copyright 2026 Tom F. <https://github.com/tomtom215/>
3// My way of giving something small back to the open source community
4// and encouraging more Rust development!
5
6//! `DuckDB` `VARCHAR` (`duckdb_string_t`) reading utilities.
7//!
8//! # Pitfall P7: Undocumented `duckdb_string_t` format
9//!
10//! `DuckDB` stores VARCHAR values in a 16-byte `duckdb_string_t` struct with two
11//! representations:
12//!
13//! - **Inline** (length ≤ 12): `[ len: u32 | data: [u8; 12] ]`
//! - **Pointer** (length > 12): `[ len: u32 | prefix: [u8; 4] | ptr: *const u8 ]` (4 + 4 + 8 = 16 bytes on 64-bit targets)
15//!
16//! This is not documented in the Rust bindings. The layout was determined by
17//! reading `DuckDB`'s C source and confirmed by the duckdb-behavioral implementation.
18//!
19//! # Example
20//!
21//! ```rust
22//! use quack_rs::vector::string::{DuckStringView, read_duck_string};
23//!
24//! // A short string (inline case)
25//! let bytes: [u8; 16] = {
26//! let mut b = [0u8; 16];
27//! b[0] = 5; // length = 5
28//! b[4..9].copy_from_slice(b"hello");
29//! b
30//! };
31//! let view = DuckStringView::from_bytes(&bytes);
32//! assert_eq!(view.as_str(), Some("hello"));
33//! assert_eq!(view.len(), 5);
34//! ```
35
/// Total size, in bytes, of a single `duckdb_string_t` value.
pub const DUCK_STRING_SIZE: usize = 16;

/// Longest string, in bytes, that is stored inline in a `duckdb_string_t`.
///
/// This is what is left of the struct after the leading `u32` length field,
/// i.e. 12 bytes.
pub const DUCK_STRING_INLINE_MAX_LEN: usize = DUCK_STRING_SIZE - std::mem::size_of::<u32>();
41
42/// A parsed view of a `duckdb_string_t` value.
43///
44/// This type borrows from the raw vector data — it does not allocate.
45///
46/// # Safety
47///
48/// The `data` slice from which this view is created must outlive the view.
49/// For pointer-format strings, the pointed-to heap data must also be valid.
50#[derive(Debug, Clone, Copy)]
51pub struct DuckStringView<'a> {
52 bytes: &'a [u8],
53 length: usize,
54}
55
56impl<'a> DuckStringView<'a> {
57 /// Creates a `DuckStringView` from the raw 16-byte representation.
58 ///
59 /// The input is a fixed-size `[u8; 16]` reference, so the size is
60 /// enforced at compile time.
61 #[must_use]
62 pub const fn from_bytes(raw: &'a [u8; DUCK_STRING_SIZE]) -> Self {
63 let length = u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]) as usize;
64 Self { bytes: raw, length }
65 }
66
67 /// Returns the length of the string in bytes.
68 #[must_use]
69 #[inline]
70 pub const fn len(&self) -> usize {
71 self.length
72 }
73
74 /// Returns `true` if the string is empty.
75 #[must_use]
76 #[inline]
77 pub const fn is_empty(&self) -> bool {
78 self.length == 0
79 }
80
81 /// Returns the string as a UTF-8 `str` slice, or `None` if it is not valid UTF-8.
82 ///
83 /// The returned `&'a str` has the same lifetime as the underlying data slice —
84 /// not the lifetime of `self`. This allows the result to outlive the `DuckStringView`.
85 ///
86 /// # Safety
87 ///
88 /// For pointer-format strings (length > 12), the pointer stored at bytes 8–15
89 /// must be a valid pointer to at least `self.length` bytes of string data that
90 /// is live for lifetime `'a`.
91 #[must_use]
92 pub fn as_str(&self) -> Option<&'a str> {
93 let slice = self.as_bytes_unsafe()?;
94 std::str::from_utf8(slice).ok()
95 }
96
97 /// Returns the raw bytes of the string content.
98 ///
99 /// Returns `None` if the internal pointer (for long strings) is null.
100 ///
101 /// The returned bytes have lifetime `'a` (the lifetime of the underlying data).
102 ///
103 /// # Platform assumption
104 ///
105 /// The pointer-format branch reads bytes 8–15 as a `usize` pointer, which
106 /// assumes a 64-bit platform (8-byte pointers). `DuckDB` itself only supports
107 /// 64-bit platforms, so this is a safe assumption.
108 ///
109 /// # Safety (internal)
110 ///
111 /// This method dereferences the pointer stored in the `duckdb_string_t` struct
112 /// for strings longer than 12 bytes. The caller (i.e., the `DuckStringView`
113 /// constructor) must ensure the underlying vector data is still valid.
114 fn as_bytes_unsafe(&self) -> Option<&'a [u8]> {
115 if self.length <= DUCK_STRING_INLINE_MAX_LEN {
116 // Inline case: data starts at byte 4, length bytes follow
117 Some(&self.bytes[4..4 + self.length])
118 } else {
119 // Pointer case: bytes 8–15 contain the pointer (little-endian usize)
120 // SAFETY: For pointer-format strings, bytes 8..16 hold a valid pointer
121 // to heap memory allocated by DuckDB and valid for the vector's lifetime.
122 let ptr_bytes: [u8; 8] = self.bytes[8..16].try_into().ok()?;
123 let ptr_val = usize::from_le_bytes(ptr_bytes) as *const u8;
124 if ptr_val.is_null() {
125 return None;
126 }
127 // SAFETY: `ptr_val` is a DuckDB-managed pointer; the caller guarantees
128 // the underlying data is valid for the lifetime of the DuckStringView.
129 Some(unsafe { std::slice::from_raw_parts(ptr_val, self.length) })
130 }
131 }
132}
133
134/// Reads a `DuckDB` `VARCHAR` value from a raw vector data pointer at a given row index.
135///
136/// Returns the string as a `&str` slice, or an empty string if the data is not
137/// valid UTF-8 or if the pointer is null.
138///
139/// # Pitfall P7
140///
141/// `DuckDB` strings have two storage formats:
142/// - **Inline** (≤ 12 bytes): stored directly in the 16-byte struct
143/// - **Pointer** (> 12 bytes): struct contains a pointer to heap-allocated data
144///
145/// This function handles both transparently.
146///
147/// # Safety
148///
149/// - `data` must point to a `DuckDB` VARCHAR vector's data buffer.
150/// - `idx` must be within bounds of the vector.
151/// - For pointer-format strings, the heap data pointed to must be valid for the
152/// duration of this function call and the returned `&str` slice.
153/// - The returned `&str` borrows from the `DuckDB` vector — do not destroy the
154/// data chunk while the returned reference is live.
155///
156/// # Example
157///
158/// ```rust,no_run
159/// use quack_rs::vector::string::read_duck_string;
160///
161/// // Inside a DuckDB aggregate callback:
162/// // let data = libduckdb_sys::duckdb_vector_get_data(vec) as *const u8;
163/// // let s = unsafe { read_duck_string(data, row_idx) };
164/// # let data: *const u8 = std::ptr::null();
165/// # let _ = data;
166/// ```
167pub unsafe fn read_duck_string<'a>(data: *const u8, idx: usize) -> &'a str {
168 // SAFETY: Each duckdb_string_t is exactly 16 bytes. The caller guarantees
169 // `data` is valid and `idx` is in bounds.
170 let str_ptr = unsafe { data.add(idx * DUCK_STRING_SIZE) };
171 // SAFETY: `str_ptr` points to the idx-th duckdb_string_t in the vector.
172 // The reference has lifetime 'a because it borrows from the raw pointer
173 // whose backing data lives for the vector's lifetime ('a per caller's contract).
174 let raw_bytes: &'a [u8; DUCK_STRING_SIZE] =
175 unsafe { &*str_ptr.cast::<[u8; DUCK_STRING_SIZE]>() };
176 // DuckStringView<'a> stores &'a [u8], so as_str() returns Option<&'a str>.
177 DuckStringView::from_bytes(raw_bytes).as_str().unwrap_or("")
178}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `s` as an inline-format `duckdb_string_t`: a 4-byte length
    /// followed by the string bytes, zero-padded to 16 bytes.
    fn make_inline_bytes(s: &str) -> [u8; 16] {
        assert!(
            s.len() <= DUCK_STRING_INLINE_MAX_LEN,
            "use pointer format for long strings"
        );
        let encoded_len = u32::try_from(s.len()).unwrap_or(u32::MAX);
        let mut raw = [0u8; 16];
        let (header, payload) = raw.split_at_mut(4);
        header.copy_from_slice(&encoded_len.to_le_bytes());
        payload[..s.len()].copy_from_slice(s.as_bytes());
        raw
    }

    /// Encodes `s` as a pointer-format `duckdb_string_t`: length, 4-byte
    /// prefix, then the address of `s`'s bytes in bytes 8..16.
    ///
    /// The result is only meaningful while `s` is alive.
    fn make_pointer_bytes(s: &str) -> [u8; 16] {
        let encoded_len = u32::try_from(s.len()).unwrap_or(u32::MAX);
        let address = s.as_ptr() as usize;

        let mut raw = [0u8; 16];
        raw[..4].copy_from_slice(&encoded_len.to_le_bytes());
        raw[4..8].copy_from_slice(&s.as_bytes()[..4]);
        raw[8..16].copy_from_slice(&address.to_le_bytes());
        raw
    }

    #[test]
    fn empty_string_inline() {
        let raw = make_inline_bytes("");
        let view = DuckStringView::from_bytes(&raw);
        assert!(view.is_empty());
        assert_eq!(view.len(), 0);
        assert_eq!(view.as_str(), Some(""));
    }

    #[test]
    fn short_string_inline() {
        let raw = make_inline_bytes("hello");
        let view = DuckStringView::from_bytes(&raw);
        assert!(!view.is_empty());
        assert_eq!(view.len(), 5);
        assert_eq!(view.as_str(), Some("hello"));
    }

    #[test]
    fn max_inline_string() {
        // Exactly 12 bytes: the longest value that still uses the inline format.
        let text = "abcdefghijkl";
        assert_eq!(text.len(), DUCK_STRING_INLINE_MAX_LEN);

        let raw = make_inline_bytes(text);
        let view = DuckStringView::from_bytes(&raw);
        assert_eq!(view.len(), 12);
        assert_eq!(view.as_str(), Some(text));
    }

    #[test]
    fn pointer_format_string() {
        let text = "this is a longer string that exceeds 12 bytes";

        let raw = make_pointer_bytes(text);
        let view = DuckStringView::from_bytes(&raw);
        assert_eq!(view.len(), text.len());
        assert_eq!(view.as_str(), Some(text));
    }

    #[test]
    fn pointer_null_returns_none() {
        // Length > 12 selects the pointer format; bytes 8..16 stay zero,
        // i.e. a null pointer.
        let mut raw = [0u8; 16];
        raw[..4].copy_from_slice(&13u32.to_le_bytes());

        let view = DuckStringView::from_bytes(&raw);
        assert!(view.as_str().is_none());
    }

    #[test]
    fn read_duck_string_inline() {
        let raw = make_inline_bytes("world");
        // SAFETY: `raw` is a valid 16-byte inline string, addressed as row 0.
        let text = unsafe { read_duck_string(raw.as_ptr(), 0) };
        assert_eq!(text, "world");
    }

    #[test]
    fn read_duck_string_pointer_format() {
        let long_str = "abcdefghijklmnopqrst"; // 20 bytes
        let raw = make_pointer_bytes(long_str);

        // SAFETY: `raw` is a valid pointer-format duckdb_string_t at row 0 and
        // its pointer refers to `long_str`, which is `'static`.
        let text = unsafe { read_duck_string(raw.as_ptr(), 0) };
        assert_eq!(text, long_str);
    }

    #[test]
    fn duck_string_size_constant() {
        assert_eq!(DUCK_STRING_SIZE, 16);
    }

    #[test]
    fn duck_string_inline_max_len_constant() {
        assert_eq!(DUCK_STRING_INLINE_MAX_LEN, 12);
    }
}