Skip to main content

reifydb_type/value/container/
utf8.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2025 ReifyDB
3
4use std::{
5	fmt::{self, Debug},
6	result::Result as StdResult,
7	str,
8};
9
10use serde::{Deserialize, Deserializer, Serialize, Serializer};
11
12use crate::{
13	Result,
14	storage::{Cow, Storage},
15	value::{Value, container::varlen::VarlenContainer, r#type::Type},
16};
17
18pub struct Utf8Container<S: Storage = Cow> {
19	inner: VarlenContainer<S>,
20}
21
22impl<S: Storage> Clone for Utf8Container<S> {
23	fn clone(&self) -> Self {
24		Self {
25			inner: self.inner.clone(),
26		}
27	}
28}
29
30impl<S: Storage> Debug for Utf8Container<S>
31where
32	VarlenContainer<S>: Debug,
33{
34	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35		f.debug_struct("Utf8Container").field("inner", &self.inner).finish()
36	}
37}
38
39impl<S: Storage> PartialEq for Utf8Container<S>
40where
41	VarlenContainer<S>: PartialEq,
42{
43	fn eq(&self, other: &Self) -> bool {
44		self.inner == other.inner
45	}
46}
47
48impl Serialize for Utf8Container<Cow> {
49	fn serialize<Ser: Serializer>(&self, serializer: Ser) -> StdResult<Ser::Ok, Ser::Error> {
50		// Postcard wire compat with the previous `Vec<String>` form: the
51		// inner VarlenContainer encodes as a sequence of byte slices,
52		// which postcard serializes identically to a sequence of strings
53		// (length-prefixed length-prefixed bytes).
54		self.inner.serialize(serializer)
55	}
56}
57
58impl<'de> Deserialize<'de> for Utf8Container<Cow> {
59	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> StdResult<Self, D::Error> {
60		let inner = VarlenContainer::deserialize(deserializer)?;
61		Ok(Self {
62			inner,
63		})
64	}
65}
66
67impl Utf8Container<Cow> {
68	pub fn new(data: Vec<String>) -> Self {
69		Self::from_vec(data)
70	}
71
72	pub fn from_vec(data: Vec<String>) -> Self {
73		let inner = VarlenContainer::from_byte_slices(data.iter().map(|s| s.as_bytes()));
74		Self {
75			inner,
76		}
77	}
78
79	pub fn with_capacity(capacity: usize) -> Self {
80		// Heuristic: assume average ~16 bytes per string for the byte
81		// arena. This is just a starting capacity hint; the buffer
82		// grows on demand.
83		Self {
84			inner: VarlenContainer::with_capacity(capacity, capacity * 16),
85		}
86	}
87
88	/// Reconstruct from a Vec of owned Strings (for compatibility with
89	/// previous `from_raw_parts(Vec<String>)`).
90	pub fn from_raw_parts(data: Vec<String>) -> Self {
91		Self::from_vec(data)
92	}
93
94	/// Build directly from contiguous bytes + offsets (zero-copy from the
95	/// caller's perspective). Caller must ensure the bytes are valid UTF-8
96	/// and offsets are well-formed.
97	pub fn from_bytes_offsets(data: Vec<u8>, offsets: Vec<u64>) -> Self {
98		debug_assert!(str::from_utf8(&data).is_ok(), "Utf8Container data must be valid UTF-8");
99		Self {
100			inner: VarlenContainer::from_raw_parts(data, offsets),
101		}
102	}
103
104	/// Try to decompose into a `Vec<String>` for compatibility with code
105	/// paths that need owned strings. Always succeeds (allocates).
106	pub fn try_into_raw_parts(self) -> Option<Vec<String>> {
107		Some(self.iter().map(|s| s.unwrap().to_string()).collect())
108	}
109}
110
111impl<S: Storage> Utf8Container<S> {
112	pub fn from_inner(inner: VarlenContainer<S>) -> Self {
113		Self {
114			inner,
115		}
116	}
117
118	/// Construct from storage-generic data+offsets vectors. Used by arena
119	/// conversion. Caller must ensure data is valid UTF-8 and offsets are
120	/// well-formed (length >= 1, [0] == 0, monotonic non-decreasing,
121	/// last <= data.len()).
122	pub fn from_storage_parts(data: S::Vec<u8>, offsets: S::Vec<u64>) -> Self {
123		Self {
124			inner: VarlenContainer::from_storage_parts(data, offsets),
125		}
126	}
127
128	/// Borrow the inner data + offsets vectors.
129	pub fn data_storage(&self) -> &S::Vec<u8> {
130		self.inner.data()
131	}
132
133	pub fn offsets_storage(&self) -> &S::Vec<u64> {
134		self.inner.offsets_data()
135	}
136
137	pub fn len(&self) -> usize {
138		self.inner.len()
139	}
140
141	pub fn capacity(&self) -> usize {
142		self.inner.capacity()
143	}
144
145	pub fn is_empty(&self) -> bool {
146		self.inner.is_empty()
147	}
148
149	/// Storage-generic clear (works for any `S: Storage`).
150	pub fn clear(&mut self) {
151		self.inner.clear_generic();
152	}
153
154	/// Borrow the i-th string. UTF-8 validity is guaranteed by construction.
155	pub fn get(&self, index: usize) -> Option<&str> {
156		let bytes = self.inner.get_bytes(index)?;
157		// SAFETY: All push paths validate UTF-8 (push(&str) takes a
158		// validated &str; from_bytes_offsets debug-asserts validity).
159		// VarlenContainer never splits or rearranges bytes.
160		Some(unsafe { str::from_utf8_unchecked(bytes) })
161	}
162
163	pub fn is_defined(&self, idx: usize) -> bool {
164		idx < self.len()
165	}
166
167	pub fn is_fully_defined(&self) -> bool {
168		true
169	}
170
171	/// Borrow the underlying concatenated payload bytes. Used by the FFI
172	/// marshal path for zero-copy borrow.
173	pub fn data_bytes(&self) -> &[u8] {
174		self.inner.data_bytes()
175	}
176
177	/// Borrow the underlying offsets array (length = `len + 1`).
178	pub fn offsets(&self) -> &[u64] {
179		self.inner.offsets()
180	}
181
182	/// Borrow the underlying VarlenContainer (test/debug only).
183	pub fn inner(&self) -> &VarlenContainer<S> {
184		&self.inner
185	}
186
187	pub fn as_string(&self, index: usize) -> String {
188		self.get(index).map(str::to_string).unwrap_or_else(|| "none".to_string())
189	}
190
191	pub fn get_value(&self, index: usize) -> Value {
192		match self.get(index) {
193			Some(s) => Value::Utf8(s.to_string()),
194			None => Value::none_of(Type::Utf8),
195		}
196	}
197
198	/// Iterate strings as `Option<&str>`. Always Some for indices < len.
199	pub fn iter(&self) -> impl Iterator<Item = Option<&str>> + '_ {
200		(0..self.len()).map(|i| self.get(i))
201	}
202
203	/// Iterate strings as `&str` directly.
204	pub fn iter_str(&self) -> impl Iterator<Item = &str> + '_ {
205		(0..self.len()).map(|i| self.get(i).unwrap())
206	}
207}
208
209impl Utf8Container<Cow> {
210	pub fn push(&mut self, value: String) {
211		self.inner.push_bytes(value.as_bytes());
212	}
213
214	pub fn push_str(&mut self, value: &str) {
215		self.inner.push_bytes(value.as_bytes());
216	}
217
218	pub fn push_default(&mut self) {
219		self.inner.push_bytes(&[]);
220	}
221
222	pub fn extend(&mut self, other: &Self) -> Result<()> {
223		self.inner.extend_from(&other.inner);
224		Ok(())
225	}
226
227	pub fn slice(&self, start: usize, end: usize) -> Self {
228		Self {
229			inner: self.inner.slice(start, end),
230		}
231	}
232
233	pub fn filter(&mut self, mask: &<Cow as Storage>::BitVec) {
234		let bits: Vec<bool> = mask.iter().collect();
235		self.inner.filter_in_place(|i| bits.get(i).copied().unwrap_or(false));
236	}
237
238	pub fn reorder(&mut self, indices: &[usize]) {
239		self.inner.reorder_in_place(indices);
240	}
241
242	pub fn take(&self, num: usize) -> Self {
243		Self {
244			inner: self.inner.take_n(num),
245		}
246	}
247}
248
249impl Default for Utf8Container<Cow> {
250	fn default() -> Self {
251		Self::with_capacity(0)
252	}
253}
254
/// Unit tests for `Utf8Container`: construction, mutation, iteration, the
/// zero-copy byte/offset layout, and postcard wire compatibility.
#[cfg(test)]
pub mod tests {
	use postcard::to_allocvec as postcard_to_allocvec;

	use super::*;
	use crate::util::bitvec::BitVec;

	#[test]
	fn test_new() {
		let data = vec!["hello".to_string(), "world".to_string(), "test".to_string()];
		let container = Utf8Container::new(data);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("hello"));
		assert_eq!(container.get(1), Some("world"));
		assert_eq!(container.get(2), Some("test"));
	}

	#[test]
	fn test_from_vec() {
		let data = vec!["foo".to_string(), "bar".to_string(), "baz".to_string()];
		let container = Utf8Container::from_vec(data);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("foo"));
		assert_eq!(container.get(1), Some("bar"));
		assert_eq!(container.get(2), Some("baz"));

		for i in 0..3 {
			assert!(container.is_defined(i));
		}
	}

	#[test]
	fn test_with_capacity() {
		let container = Utf8Container::with_capacity(10);
		assert_eq!(container.len(), 0);
		assert!(container.is_empty());
		assert!(container.capacity() >= 10);
	}

	#[test]
	fn test_push() {
		let mut container = Utf8Container::with_capacity(3);

		container.push("first".to_string());
		container.push("second".to_string());
		container.push_default();

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("first"));
		assert_eq!(container.get(1), Some("second"));
		assert_eq!(container.get(2), Some(""));

		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
		assert!(container.is_defined(2));
	}

	#[test]
	fn test_extend() {
		let mut container1 = Utf8Container::from_vec(vec!["a".to_string(), "b".to_string()]);
		let container2 = Utf8Container::from_vec(vec!["c".to_string(), "d".to_string()]);

		container1.extend(&container2).unwrap();

		assert_eq!(container1.len(), 4);
		assert_eq!(container1.get(0), Some("a"));
		assert_eq!(container1.get(1), Some("b"));
		assert_eq!(container1.get(2), Some("c"));
		assert_eq!(container1.get(3), Some("d"));
	}

	#[test]
	fn test_iter() {
		let data = vec!["x".to_string(), "y".to_string(), "z".to_string()];
		let container = Utf8Container::new(data);

		let collected: Vec<Option<&str>> = container.iter().collect();
		assert_eq!(collected, vec![Some("x"), Some("y"), Some("z")]);
	}

	#[test]
	fn test_slice() {
		let container = Utf8Container::from_vec(vec![
			"one".to_string(),
			"two".to_string(),
			"three".to_string(),
			"four".to_string(),
		]);
		let sliced = container.slice(1, 3);

		assert_eq!(sliced.len(), 2);
		assert_eq!(sliced.get(0), Some("two"));
		assert_eq!(sliced.get(1), Some("three"));
	}

	#[test]
	fn test_filter() {
		let mut container = Utf8Container::from_vec(vec![
			"keep".to_string(),
			"drop".to_string(),
			"keep".to_string(),
			"drop".to_string(),
		]);
		let mask = BitVec::from_slice(&[true, false, true, false]);

		container.filter(&mask);

		assert_eq!(container.len(), 2);
		assert_eq!(container.get(0), Some("keep"));
		assert_eq!(container.get(1), Some("keep"));
	}

	#[test]
	fn test_reorder() {
		let mut container =
			Utf8Container::from_vec(vec!["first".to_string(), "second".to_string(), "third".to_string()]);
		let indices = [2, 0, 1];

		container.reorder(&indices);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("third"));
		assert_eq!(container.get(1), Some("first"));
		assert_eq!(container.get(2), Some("second"));
	}

	#[test]
	fn test_reorder_with_out_of_bounds() {
		let mut container = Utf8Container::from_vec(vec!["a".to_string(), "b".to_string()]);
		let indices = [1, 5, 0];

		container.reorder(&indices);

		// The out-of-range index 5 is expected to produce an empty string.
		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("b"));
		assert_eq!(container.get(1), Some(""));
		assert_eq!(container.get(2), Some("a"));
	}

	#[test]
	fn test_empty_strings() {
		let mut container = Utf8Container::with_capacity(2);
		container.push("".to_string());
		container.push_default();

		assert_eq!(container.len(), 2);
		assert_eq!(container.get(0), Some(""));
		assert_eq!(container.get(1), Some(""));

		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
	}

	#[test]
	fn test_default() {
		let container = Utf8Container::default();
		assert_eq!(container.len(), 0);
		assert!(container.is_empty());
	}

	#[test]
	fn test_data_bytes_and_offsets_match_zero_copy_layout() {
		let container = Utf8Container::from_vec(vec!["aa".to_string(), "bb".to_string()]);
		assert_eq!(container.data_bytes(), b"aabb");
		assert_eq!(container.offsets(), &[0u64, 2, 4]);
	}

	#[test]
	fn test_postcard_wire_compat() {
		// The postcard byte form must match what `Vec<String>` would
		// produce so on-disk state and CDC streams stay readable.
		let strings = vec!["hello".to_string(), "world".to_string()];
		let strings_bytes: Vec<u8> = postcard_to_allocvec(&strings).unwrap();

		let container = Utf8Container::from_vec(strings);
		let container_bytes: Vec<u8> = postcard_to_allocvec(&container).unwrap();

		assert_eq!(strings_bytes, container_bytes);
	}
}