Skip to main content

reifydb_value/value/container/
utf8.rs

1// SPDX-License-Identifier: MIT
2// Copyright (c) 2026 ReifyDB
3
4use std::{
5	fmt::{self, Debug},
6	result::Result as StdResult,
7	str,
8};
9
10use serde::{Deserialize, Deserializer, Serialize, Serializer};
11
12use crate::{
13	Result,
14	storage::{Cow, Storage},
15	value::{Value, container::varlen::VarlenContainer, value_type::ValueType},
16};
17
18pub struct Utf8Container<S: Storage = Cow> {
19	inner: VarlenContainer<S>,
20}
21
22impl<S: Storage> Clone for Utf8Container<S> {
23	fn clone(&self) -> Self {
24		Self {
25			inner: self.inner.clone(),
26		}
27	}
28}
29
30impl<S: Storage> Debug for Utf8Container<S>
31where
32	VarlenContainer<S>: Debug,
33{
34	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35		f.debug_struct("Utf8Container").field("inner", &self.inner).finish()
36	}
37}
38
39impl<S: Storage> PartialEq for Utf8Container<S>
40where
41	VarlenContainer<S>: PartialEq,
42{
43	fn eq(&self, other: &Self) -> bool {
44		self.inner == other.inner
45	}
46}
47
48impl Serialize for Utf8Container<Cow> {
49	fn serialize<Ser: Serializer>(&self, serializer: Ser) -> StdResult<Ser::Ok, Ser::Error> {
50		self.inner.serialize(serializer)
51	}
52}
53
54impl<'de> Deserialize<'de> for Utf8Container<Cow> {
55	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> StdResult<Self, D::Error> {
56		let inner = VarlenContainer::deserialize(deserializer)?;
57		Ok(Self {
58			inner,
59		})
60	}
61}
62
63impl Utf8Container<Cow> {
64	pub fn new(data: Vec<String>) -> Self {
65		Self::from_vec(data)
66	}
67
68	pub fn from_vec(data: Vec<String>) -> Self {
69		let inner = VarlenContainer::from_byte_slices(data.iter().map(|s| s.as_bytes()));
70		Self {
71			inner,
72		}
73	}
74
75	pub fn from_repeated_str(value: &str, count: usize) -> Self {
76		Self {
77			inner: VarlenContainer::from_repeated_bytes(value.as_bytes(), count),
78		}
79	}
80
81	pub fn with_capacity(capacity: usize) -> Self {
82		Self {
83			inner: VarlenContainer::with_capacity(capacity, capacity * 16),
84		}
85	}
86
87	pub fn from_raw_parts(data: Vec<String>) -> Self {
88		Self::from_vec(data)
89	}
90
91	pub fn from_bytes_offsets(data: Vec<u8>, offsets: Vec<u64>) -> Self {
92		debug_assert!(str::from_utf8(&data).is_ok(), "Utf8Container data must be valid UTF-8");
93		Self {
94			inner: VarlenContainer::from_raw_parts(data, offsets),
95		}
96	}
97
98	pub fn try_into_raw_parts(self) -> Option<Vec<String>> {
99		Some(self.iter().map(|s| s.unwrap().to_string()).collect())
100	}
101}
102
103impl<S: Storage> Utf8Container<S> {
104	pub fn from_inner(inner: VarlenContainer<S>) -> Self {
105		Self {
106			inner,
107		}
108	}
109
110	pub fn from_storage_parts(data: S::Vec<u8>, offsets: S::Vec<u64>) -> Self {
111		Self {
112			inner: VarlenContainer::from_storage_parts(data, offsets),
113		}
114	}
115
116	pub fn data_storage(&self) -> &S::Vec<u8> {
117		self.inner.data()
118	}
119
120	pub fn offsets_storage(&self) -> &S::Vec<u64> {
121		self.inner.offsets_data()
122	}
123
124	pub fn len(&self) -> usize {
125		self.inner.len()
126	}
127
128	pub fn capacity(&self) -> usize {
129		self.inner.capacity()
130	}
131
132	pub fn is_empty(&self) -> bool {
133		self.inner.is_empty()
134	}
135
136	pub fn clear(&mut self) {
137		self.inner.clear_generic();
138	}
139
140	pub fn get(&self, index: usize) -> Option<&str> {
141		let bytes = self.inner.get_bytes(index)?;
142		// SAFETY: All push paths validate UTF-8 (push(&str) takes a
143
144		Some(unsafe { str::from_utf8_unchecked(bytes) })
145	}
146
147	pub fn is_defined(&self, idx: usize) -> bool {
148		idx < self.len()
149	}
150
151	pub fn is_fully_defined(&self) -> bool {
152		true
153	}
154
155	pub fn data_bytes(&self) -> &[u8] {
156		self.inner.data_bytes()
157	}
158
159	pub fn offsets(&self) -> &[u64] {
160		self.inner.offsets()
161	}
162
163	pub fn inner(&self) -> &VarlenContainer<S> {
164		&self.inner
165	}
166
167	pub fn as_string(&self, index: usize) -> String {
168		self.get(index).map(str::to_string).unwrap_or_else(|| "none".to_string())
169	}
170
171	pub fn get_value(&self, index: usize) -> Value {
172		match self.get(index) {
173			Some(s) => Value::Utf8(s.to_string()),
174			None => Value::none_of(ValueType::Utf8),
175		}
176	}
177
178	pub fn iter(&self) -> impl Iterator<Item = Option<&str>> + '_ {
179		(0..self.len()).map(|i| self.get(i))
180	}
181
182	pub fn iter_str(&self) -> impl Iterator<Item = &str> + '_ {
183		(0..self.len()).map(|i| self.get(i).unwrap())
184	}
185}
186
187impl Utf8Container<Cow> {
188	pub fn push(&mut self, value: String) {
189		self.inner.push_bytes(value.as_bytes());
190	}
191
192	pub fn push_str(&mut self, value: &str) {
193		self.inner.push_bytes(value.as_bytes());
194	}
195
196	pub fn push_default(&mut self) {
197		self.inner.push_bytes(&[]);
198	}
199
200	pub fn extend(&mut self, other: &Self) -> Result<()> {
201		self.inner.extend_from(&other.inner);
202		Ok(())
203	}
204
205	pub fn slice(&self, start: usize, end: usize) -> Self {
206		Self {
207			inner: self.inner.slice(start, end),
208		}
209	}
210
211	pub fn filter(&mut self, mask: &<Cow as Storage>::BitVec) {
212		let bits: Vec<bool> = mask.iter().collect();
213		self.inner.filter_in_place(|i| bits.get(i).copied().unwrap_or(false));
214	}
215
216	pub fn reorder(&mut self, indices: &[usize]) {
217		self.inner.reorder_in_place(indices);
218	}
219
220	pub fn take(&self, num: usize) -> Self {
221		Self {
222			inner: self.inner.take_n(num),
223		}
224	}
225}
226
227impl Default for Utf8Container<Cow> {
228	fn default() -> Self {
229		Self::with_capacity(0)
230	}
231}
232
/// Unit tests for [`Utf8Container`] construction, mutation and wire format.
#[cfg(test)]
pub mod tests {
	use postcard::to_allocvec as postcard_to_allocvec;

	use super::*;
	use crate::util::bitvec::BitVec;

	#[test]
	fn test_new() {
		let data = vec!["hello".to_string(), "world".to_string(), "test".to_string()];
		let container = Utf8Container::new(data.clone());

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("hello"));
		assert_eq!(container.get(1), Some("world"));
		assert_eq!(container.get(2), Some("test"));
	}

	#[test]
	fn test_from_vec() {
		let data = vec!["foo".to_string(), "bar".to_string(), "baz".to_string()];
		let container = Utf8Container::from_vec(data);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("foo"));
		assert_eq!(container.get(1), Some("bar"));
		assert_eq!(container.get(2), Some("baz"));

		for i in 0..3 {
			assert!(container.is_defined(i));
		}
	}

	#[test]
	fn test_from_repeated_str() {
		let container = Utf8Container::from_repeated_str("mint", 3);
		let explicit =
			Utf8Container::from_vec(vec!["mint".to_string(), "mint".to_string(), "mint".to_string()]);
		assert_eq!(container, explicit);
		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("mint"));
		assert_eq!(container.get(2), Some("mint"));
		for i in 0..3 {
			assert!(container.is_defined(i));
		}
	}

	#[test]
	fn test_with_capacity() {
		let container = Utf8Container::with_capacity(10);
		assert_eq!(container.len(), 0);
		assert!(container.is_empty());
		assert!(container.capacity() >= 10);
	}

	#[test]
	fn test_push() {
		let mut container = Utf8Container::with_capacity(3);

		container.push("first".to_string());
		container.push("second".to_string());
		container.push_default();

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("first"));
		assert_eq!(container.get(1), Some("second"));
		// `push_default` appends an empty string.
		assert_eq!(container.get(2), Some(""));

		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
		assert!(container.is_defined(2));
	}

	#[test]
	fn test_extend() {
		let mut container1 = Utf8Container::from_vec(vec!["a".to_string(), "b".to_string()]);
		let container2 = Utf8Container::from_vec(vec!["c".to_string(), "d".to_string()]);

		container1.extend(&container2).unwrap();

		assert_eq!(container1.len(), 4);
		assert_eq!(container1.get(0), Some("a"));
		assert_eq!(container1.get(1), Some("b"));
		assert_eq!(container1.get(2), Some("c"));
		assert_eq!(container1.get(3), Some("d"));
	}

	#[test]
	fn test_iter() {
		let data = vec!["x".to_string(), "y".to_string(), "z".to_string()];
		let container = Utf8Container::new(data);

		let collected: Vec<Option<&str>> = container.iter().collect();
		assert_eq!(collected, vec![Some("x"), Some("y"), Some("z")]);
	}

	#[test]
	fn test_slice() {
		let container = Utf8Container::from_vec(vec![
			"one".to_string(),
			"two".to_string(),
			"three".to_string(),
			"four".to_string(),
		]);
		let sliced = container.slice(1, 3);

		assert_eq!(sliced.len(), 2);
		assert_eq!(sliced.get(0), Some("two"));
		assert_eq!(sliced.get(1), Some("three"));
	}

	#[test]
	fn test_filter() {
		let mut container = Utf8Container::from_vec(vec![
			"keep".to_string(),
			"drop".to_string(),
			"keep".to_string(),
			"drop".to_string(),
		]);
		let mask = BitVec::from_slice(&[true, false, true, false]);

		container.filter(&mask);

		assert_eq!(container.len(), 2);
		assert_eq!(container.get(0), Some("keep"));
		assert_eq!(container.get(1), Some("keep"));
	}

	#[test]
	fn test_reorder() {
		let mut container =
			Utf8Container::from_vec(vec!["first".to_string(), "second".to_string(), "third".to_string()]);
		let indices = [2, 0, 1];

		container.reorder(&indices);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("third"));
		assert_eq!(container.get(1), Some("first"));
		assert_eq!(container.get(2), Some("second"));
	}

	#[test]
	fn test_reorder_with_out_of_bounds() {
		let mut container = Utf8Container::from_vec(vec!["a".to_string(), "b".to_string()]);
		let indices = [1, 5, 0];

		container.reorder(&indices);

		// The out-of-range index 5 produces an empty string in its slot.
		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("b"));
		assert_eq!(container.get(1), Some(""));
		assert_eq!(container.get(2), Some("a"));
	}

	#[test]
	fn test_empty_strings() {
		let mut container = Utf8Container::with_capacity(2);
		container.push("".to_string());
		container.push_default();

		assert_eq!(container.len(), 2);
		assert_eq!(container.get(0), Some(""));
		assert_eq!(container.get(1), Some(""));

		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
	}

	#[test]
	fn test_default() {
		let container = Utf8Container::default();
		assert_eq!(container.len(), 0);
		assert!(container.is_empty());
	}

	#[test]
	fn test_data_bytes_and_offsets_match_zero_copy_layout() {
		let container = Utf8Container::from_vec(vec!["aa".to_string(), "bb".to_string()]);
		assert_eq!(container.data_bytes(), b"aabb");
		assert_eq!(container.offsets(), &[0u64, 2, 4]);
	}

	#[test]
	fn test_postcard_wire_compat() {
		// The postcard byte form must match what `Vec<String>` would
		// produce so on-disk state and CDC streams stay readable.
		let strings = vec!["hello".to_string(), "world".to_string()];
		let strings_bytes: Vec<u8> = postcard_to_allocvec(&strings).unwrap();

		let container = Utf8Container::from_vec(strings.clone());
		let container_bytes: Vec<u8> = postcard_to_allocvec(&container).unwrap();

		assert_eq!(strings_bytes, container_bytes);
	}
}