Skip to main content

reifydb_type/value/container/
utf8.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2025 ReifyDB
3
4use std::{
5	fmt::{self, Debug},
6	result::Result as StdResult,
7	str,
8};
9
10use serde::{Deserialize, Deserializer, Serialize, Serializer};
11
12use crate::{
13	Result,
14	storage::{Cow, Storage},
15	value::{Value, container::varlen::VarlenContainer, r#type::Type},
16};
17
18pub struct Utf8Container<S: Storage = Cow> {
19	inner: VarlenContainer<S>,
20}
21
22impl<S: Storage> Clone for Utf8Container<S> {
23	fn clone(&self) -> Self {
24		Self {
25			inner: self.inner.clone(),
26		}
27	}
28}
29
30impl<S: Storage> Debug for Utf8Container<S>
31where
32	VarlenContainer<S>: Debug,
33{
34	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35		f.debug_struct("Utf8Container").field("inner", &self.inner).finish()
36	}
37}
38
39impl<S: Storage> PartialEq for Utf8Container<S>
40where
41	VarlenContainer<S>: PartialEq,
42{
43	fn eq(&self, other: &Self) -> bool {
44		self.inner == other.inner
45	}
46}
47
48impl Serialize for Utf8Container<Cow> {
49	fn serialize<Ser: Serializer>(&self, serializer: Ser) -> StdResult<Ser::Ok, Ser::Error> {
50		self.inner.serialize(serializer)
51	}
52}
53
54impl<'de> Deserialize<'de> for Utf8Container<Cow> {
55	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> StdResult<Self, D::Error> {
56		let inner = VarlenContainer::deserialize(deserializer)?;
57		Ok(Self {
58			inner,
59		})
60	}
61}
62
63impl Utf8Container<Cow> {
64	pub fn new(data: Vec<String>) -> Self {
65		Self::from_vec(data)
66	}
67
68	pub fn from_vec(data: Vec<String>) -> Self {
69		let inner = VarlenContainer::from_byte_slices(data.iter().map(|s| s.as_bytes()));
70		Self {
71			inner,
72		}
73	}
74
75	pub fn with_capacity(capacity: usize) -> Self {
76		Self {
77			inner: VarlenContainer::with_capacity(capacity, capacity * 16),
78		}
79	}
80
81	pub fn from_raw_parts(data: Vec<String>) -> Self {
82		Self::from_vec(data)
83	}
84
85	pub fn from_bytes_offsets(data: Vec<u8>, offsets: Vec<u64>) -> Self {
86		debug_assert!(str::from_utf8(&data).is_ok(), "Utf8Container data must be valid UTF-8");
87		Self {
88			inner: VarlenContainer::from_raw_parts(data, offsets),
89		}
90	}
91
92	pub fn try_into_raw_parts(self) -> Option<Vec<String>> {
93		Some(self.iter().map(|s| s.unwrap().to_string()).collect())
94	}
95}
96
97impl<S: Storage> Utf8Container<S> {
98	pub fn from_inner(inner: VarlenContainer<S>) -> Self {
99		Self {
100			inner,
101		}
102	}
103
104	pub fn from_storage_parts(data: S::Vec<u8>, offsets: S::Vec<u64>) -> Self {
105		Self {
106			inner: VarlenContainer::from_storage_parts(data, offsets),
107		}
108	}
109
110	pub fn data_storage(&self) -> &S::Vec<u8> {
111		self.inner.data()
112	}
113
114	pub fn offsets_storage(&self) -> &S::Vec<u64> {
115		self.inner.offsets_data()
116	}
117
118	pub fn len(&self) -> usize {
119		self.inner.len()
120	}
121
122	pub fn capacity(&self) -> usize {
123		self.inner.capacity()
124	}
125
126	pub fn is_empty(&self) -> bool {
127		self.inner.is_empty()
128	}
129
130	pub fn clear(&mut self) {
131		self.inner.clear_generic();
132	}
133
134	pub fn get(&self, index: usize) -> Option<&str> {
135		let bytes = self.inner.get_bytes(index)?;
136		// SAFETY: All push paths validate UTF-8 (push(&str) takes a
137
138		Some(unsafe { str::from_utf8_unchecked(bytes) })
139	}
140
141	pub fn is_defined(&self, idx: usize) -> bool {
142		idx < self.len()
143	}
144
145	pub fn is_fully_defined(&self) -> bool {
146		true
147	}
148
149	pub fn data_bytes(&self) -> &[u8] {
150		self.inner.data_bytes()
151	}
152
153	pub fn offsets(&self) -> &[u64] {
154		self.inner.offsets()
155	}
156
157	pub fn inner(&self) -> &VarlenContainer<S> {
158		&self.inner
159	}
160
161	pub fn as_string(&self, index: usize) -> String {
162		self.get(index).map(str::to_string).unwrap_or_else(|| "none".to_string())
163	}
164
165	pub fn get_value(&self, index: usize) -> Value {
166		match self.get(index) {
167			Some(s) => Value::Utf8(s.to_string()),
168			None => Value::none_of(Type::Utf8),
169		}
170	}
171
172	pub fn iter(&self) -> impl Iterator<Item = Option<&str>> + '_ {
173		(0..self.len()).map(|i| self.get(i))
174	}
175
176	pub fn iter_str(&self) -> impl Iterator<Item = &str> + '_ {
177		(0..self.len()).map(|i| self.get(i).unwrap())
178	}
179}
180
181impl Utf8Container<Cow> {
182	pub fn push(&mut self, value: String) {
183		self.inner.push_bytes(value.as_bytes());
184	}
185
186	pub fn push_str(&mut self, value: &str) {
187		self.inner.push_bytes(value.as_bytes());
188	}
189
190	pub fn push_default(&mut self) {
191		self.inner.push_bytes(&[]);
192	}
193
194	pub fn extend(&mut self, other: &Self) -> Result<()> {
195		self.inner.extend_from(&other.inner);
196		Ok(())
197	}
198
199	pub fn slice(&self, start: usize, end: usize) -> Self {
200		Self {
201			inner: self.inner.slice(start, end),
202		}
203	}
204
205	pub fn filter(&mut self, mask: &<Cow as Storage>::BitVec) {
206		let bits: Vec<bool> = mask.iter().collect();
207		self.inner.filter_in_place(|i| bits.get(i).copied().unwrap_or(false));
208	}
209
210	pub fn reorder(&mut self, indices: &[usize]) {
211		self.inner.reorder_in_place(indices);
212	}
213
214	pub fn take(&self, num: usize) -> Self {
215		Self {
216			inner: self.inner.take_n(num),
217		}
218	}
219}
220
221impl Default for Utf8Container<Cow> {
222	fn default() -> Self {
223		Self::with_capacity(0)
224	}
225}
226
/// Unit tests for `Utf8Container` with the default `Cow` storage.
#[cfg(test)]
pub mod tests {
	use postcard::to_allocvec as postcard_to_allocvec;

	use super::*;
	use crate::util::bitvec::BitVec;

	#[test]
	fn test_new() {
		let data = vec!["hello".to_string(), "world".to_string(), "test".to_string()];
		let container = Utf8Container::new(data.clone());

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("hello"));
		assert_eq!(container.get(1), Some("world"));
		assert_eq!(container.get(2), Some("test"));
	}

	#[test]
	fn test_from_vec() {
		let data = vec!["foo".to_string(), "bar".to_string(), "baz".to_string()];
		let container = Utf8Container::from_vec(data);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("foo"));
		assert_eq!(container.get(1), Some("bar"));
		assert_eq!(container.get(2), Some("baz"));

		for i in 0..3 {
			assert!(container.is_defined(i));
		}
	}

	#[test]
	fn test_with_capacity() {
		let container = Utf8Container::with_capacity(10);
		assert_eq!(container.len(), 0);
		assert!(container.is_empty());
		assert!(container.capacity() >= 10);
	}

	#[test]
	fn test_push() {
		let mut container = Utf8Container::with_capacity(3);

		container.push("first".to_string());
		container.push("second".to_string());
		container.push_default();

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("first"));
		assert_eq!(container.get(1), Some("second"));
		assert_eq!(container.get(2), Some(""));

		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
		assert!(container.is_defined(2));
	}

	#[test]
	fn test_extend() {
		let mut container1 = Utf8Container::from_vec(vec!["a".to_string(), "b".to_string()]);
		let container2 = Utf8Container::from_vec(vec!["c".to_string(), "d".to_string()]);

		container1.extend(&container2).unwrap();

		assert_eq!(container1.len(), 4);
		assert_eq!(container1.get(0), Some("a"));
		assert_eq!(container1.get(1), Some("b"));
		assert_eq!(container1.get(2), Some("c"));
		assert_eq!(container1.get(3), Some("d"));
	}

	#[test]
	fn test_iter() {
		let data = vec!["x".to_string(), "y".to_string(), "z".to_string()];
		let container = Utf8Container::new(data);

		let collected: Vec<Option<&str>> = container.iter().collect();
		assert_eq!(collected, vec![Some("x"), Some("y"), Some("z")]);
	}

	#[test]
	fn test_slice() {
		let container = Utf8Container::from_vec(vec![
			"one".to_string(),
			"two".to_string(),
			"three".to_string(),
			"four".to_string(),
		]);
		let sliced = container.slice(1, 3);

		assert_eq!(sliced.len(), 2);
		assert_eq!(sliced.get(0), Some("two"));
		assert_eq!(sliced.get(1), Some("three"));
	}

	#[test]
	fn test_filter() {
		let mut container = Utf8Container::from_vec(vec![
			"keep".to_string(),
			"drop".to_string(),
			"keep".to_string(),
			"drop".to_string(),
		]);
		let mask = BitVec::from_slice(&[true, false, true, false]);

		container.filter(&mask);

		assert_eq!(container.len(), 2);
		assert_eq!(container.get(0), Some("keep"));
		assert_eq!(container.get(1), Some("keep"));
	}

	#[test]
	fn test_reorder() {
		let mut container =
			Utf8Container::from_vec(vec!["first".to_string(), "second".to_string(), "third".to_string()]);
		let indices = [2, 0, 1];

		container.reorder(&indices);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("third"));
		assert_eq!(container.get(1), Some("first"));
		assert_eq!(container.get(2), Some("second"));
	}

	// An index past the end of the source is expected to produce an empty
	// string in the reordered output.
	#[test]
	fn test_reorder_with_out_of_bounds() {
		let mut container = Utf8Container::from_vec(vec!["a".to_string(), "b".to_string()]);
		let indices = [1, 5, 0];

		container.reorder(&indices);

		assert_eq!(container.len(), 3);
		assert_eq!(container.get(0), Some("b"));
		assert_eq!(container.get(1), Some(""));
		assert_eq!(container.get(2), Some("a"));
	}

	#[test]
	fn test_empty_strings() {
		let mut container = Utf8Container::with_capacity(2);
		container.push("".to_string());
		container.push_default();

		assert_eq!(container.len(), 2);
		assert_eq!(container.get(0), Some(""));
		assert_eq!(container.get(1), Some(""));

		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
	}

	#[test]
	fn test_default() {
		let container = Utf8Container::default();
		assert_eq!(container.len(), 0);
		assert!(container.is_empty());
	}

	#[test]
	fn test_data_bytes_and_offsets_match_zero_copy_layout() {
		let container = Utf8Container::from_vec(vec!["aa".to_string(), "bb".to_string()]);
		assert_eq!(container.data_bytes(), b"aabb");
		assert_eq!(container.offsets(), &[0u64, 2, 4]);
	}

	#[test]
	fn test_postcard_wire_compat() {
		// The postcard byte form must match what `Vec<String>` would
		// produce so on-disk state and CDC streams stay readable.
		let strings = vec!["hello".to_string(), "world".to_string()];
		let strings_bytes: Vec<u8> = postcard_to_allocvec(&strings).unwrap();

		let container = Utf8Container::from_vec(strings.clone());
		let container_bytes: Vec<u8> = postcard_to_allocvec(&container).unwrap();

		assert_eq!(strings_bytes, container_bytes);
	}
}