vortex_array/arrays/varbin/
array.rs1use num_traits::AsPrimitive;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::{DType, IntegerPType, Nullability, match_each_integer_ptype};
7use vortex_error::{VortexExpect, VortexResult, vortex_ensure, vortex_err};
8
9use crate::arrays::varbin::builder::VarBinBuilder;
10use crate::stats::ArrayStats;
11use crate::validity::Validity;
12use crate::{Array, ArrayRef, ToCanonical};
13
14#[derive(Clone, Debug)]
15pub struct VarBinArray {
16 pub(super) dtype: DType,
17 bytes: ByteBuffer,
18 offsets: ArrayRef,
19 pub(super) validity: Validity,
20 pub(super) stats_set: ArrayStats,
21}
22
23impl VarBinArray {
24 pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
31 Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
32 }
33
34 pub fn try_new(
43 offsets: ArrayRef,
44 bytes: ByteBuffer,
45 dtype: DType,
46 validity: Validity,
47 ) -> VortexResult<Self> {
48 Self::validate(&offsets, &bytes, &dtype, &validity)?;
49
50 Ok(unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) })
52 }
53
54 pub unsafe fn new_unchecked(
83 offsets: ArrayRef,
84 bytes: ByteBuffer,
85 dtype: DType,
86 validity: Validity,
87 ) -> Self {
88 #[cfg(debug_assertions)]
89 Self::validate(&offsets, &bytes, &dtype, &validity)
90 .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
91
92 Self {
93 dtype,
94 bytes,
95 offsets,
96 validity,
97 stats_set: Default::default(),
98 }
99 }
100
101 pub fn validate(
105 offsets: &dyn Array,
106 bytes: &ByteBuffer,
107 dtype: &DType,
108 validity: &Validity,
109 ) -> VortexResult<()> {
110 vortex_ensure!(
112 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
113 MismatchedTypes: "non nullable int", offsets.dtype()
114 );
115
116 vortex_ensure!(
118 matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
119 MismatchedTypes: "utf8 or binary", dtype
120 );
121
122 vortex_ensure!(
124 dtype.is_nullable() != (validity == &Validity::NonNullable),
125 "incorrect validity {:?} for dtype {}",
126 validity,
127 dtype
128 );
129
130 vortex_ensure!(
132 !offsets.is_empty(),
133 "Offsets must have at least one element"
134 );
135
136 if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
138 vortex_ensure!(is_sorted, "offsets must be sorted");
139 }
140
141 let last_offset = offsets
142 .scalar_at(offsets.len() - 1)
143 .as_primitive()
144 .as_::<usize>()
145 .ok_or_else(|| vortex_err!("Last offset must be convertible to usize"))?;
146 vortex_ensure!(
147 last_offset <= bytes.len(),
148 "Last offset {} exceeds bytes length {}",
149 last_offset,
150 bytes.len()
151 );
152
153 if let Some(validity_len) = validity.maybe_len() {
155 vortex_ensure!(
156 validity_len == offsets.len() - 1,
157 "Validity length {} doesn't match array length {}",
158 validity_len,
159 offsets.len() - 1
160 );
161 }
162
163 if matches!(dtype, DType::Utf8(_)) {
165 let primitive_offsets = offsets.to_primitive();
166 match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
167 let offsets_slice = primitive_offsets.as_slice::<O>();
168 for (i, (start, end)) in offsets_slice
169 .windows(2)
170 .map(|o| (o[0].as_(), o[1].as_()))
171 .enumerate()
172 {
173 if validity.is_null(i) {
174 continue;
175 }
176
177 let string_bytes = &bytes.as_ref()[start..end];
178 simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
179 #[allow(clippy::unwrap_used)]
180 let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
182 vortex_err!("invalid utf-8: {err} at index {i}")
183 })?;
184 }
185 });
186 }
187
188 Ok(())
189 }
190
191 #[inline]
192 pub fn offsets(&self) -> &ArrayRef {
193 &self.offsets
194 }
195
196 #[inline]
204 pub fn bytes(&self) -> &ByteBuffer {
205 &self.bytes
206 }
207
208 pub fn sliced_bytes(&self) -> ByteBuffer {
211 let first_offset: usize = self.offset_at(0);
212 let last_offset = self.offset_at(self.len());
213
214 self.bytes().slice(first_offset..last_offset)
215 }
216
217 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
218 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
219 if size < u32::MAX as usize {
220 Self::from_vec_sized::<u32, T>(vec, dtype)
221 } else {
222 Self::from_vec_sized::<u64, T>(vec, dtype)
223 }
224 }
225
226 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
227 where
228 O: IntegerPType,
229 T: AsRef<[u8]>,
230 {
231 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
232 for v in vec {
233 builder.append_value(v.as_ref());
234 }
235 builder.finish(dtype)
236 }
237
238 #[allow(clippy::same_name_method)]
239 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
240 iter: I,
241 dtype: DType,
242 ) -> Self {
243 let iter = iter.into_iter();
244 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
245 for v in iter {
246 builder.append(v.as_ref().map(|o| o.as_ref()));
247 }
248 builder.finish(dtype)
249 }
250
251 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
252 iter: I,
253 dtype: DType,
254 ) -> Self {
255 let iter = iter.into_iter();
256 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
257 for v in iter {
258 builder.append_value(v);
259 }
260 builder.finish(dtype)
261 }
262
263 pub fn offset_at(&self, index: usize) -> usize {
269 assert!(
270 index <= self.len(),
271 "Index {index} out of bounds 0..={}",
272 self.len()
273 );
274
275 self.offsets()
276 .scalar_at(index)
277 .as_ref()
278 .try_into()
279 .vortex_expect("Failed to convert offset to usize")
280 }
281
282 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
286 let start = self.offset_at(index);
287 let end = self.offset_at(index + 1);
288
289 self.bytes().slice(start..end)
290 }
291
292 pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
295 (self.dtype, self.bytes, self.offsets, self.validity)
296 }
297}
298
299impl From<Vec<&[u8]>> for VarBinArray {
300 fn from(value: Vec<&[u8]>) -> Self {
301 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
302 }
303}
304
305impl From<Vec<Vec<u8>>> for VarBinArray {
306 fn from(value: Vec<Vec<u8>>) -> Self {
307 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
308 }
309}
310
311impl From<Vec<String>> for VarBinArray {
312 fn from(value: Vec<String>) -> Self {
313 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
314 }
315}
316
317impl From<Vec<&str>> for VarBinArray {
318 fn from(value: Vec<&str>) -> Self {
319 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
320 }
321}
322
323impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
324 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
325 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
326 }
327}
328
329impl FromIterator<Option<Vec<u8>>> for VarBinArray {
330 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
331 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
332 }
333}
334
335impl FromIterator<Option<String>> for VarBinArray {
336 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
337 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
338 }
339}
340
341impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
342 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
343 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
344 }
345}