vortex_array/arrays/varbin/
array.rs1use num_traits::AsPrimitive;
5use vortex_buffer::ByteBuffer;
6use vortex_dtype::DType;
7use vortex_dtype::IntegerPType;
8use vortex_dtype::Nullability;
9use vortex_dtype::match_each_integer_ptype;
10use vortex_error::VortexExpect;
11use vortex_error::VortexResult;
12use vortex_error::vortex_ensure;
13use vortex_error::vortex_err;
14
15use crate::Array;
16use crate::ArrayRef;
17use crate::ToCanonical;
18use crate::arrays::varbin::builder::VarBinBuilder;
19use crate::stats::ArrayStats;
20use crate::validity::Validity;
21
22#[derive(Clone, Debug)]
23pub struct VarBinArray {
24 pub(super) dtype: DType,
25 bytes: ByteBuffer,
26 offsets: ArrayRef,
27 pub(super) validity: Validity,
28 pub(super) stats_set: ArrayStats,
29}
30
31impl VarBinArray {
32 pub fn new(offsets: ArrayRef, bytes: ByteBuffer, dtype: DType, validity: Validity) -> Self {
39 Self::try_new(offsets, bytes, dtype, validity).vortex_expect("VarBinArray new")
40 }
41
42 pub fn try_new(
51 offsets: ArrayRef,
52 bytes: ByteBuffer,
53 dtype: DType,
54 validity: Validity,
55 ) -> VortexResult<Self> {
56 Self::validate(&offsets, &bytes, &dtype, &validity)?;
57
58 Ok(unsafe { Self::new_unchecked(offsets, bytes, dtype, validity) })
60 }
61
62 pub unsafe fn new_unchecked(
91 offsets: ArrayRef,
92 bytes: ByteBuffer,
93 dtype: DType,
94 validity: Validity,
95 ) -> Self {
96 #[cfg(debug_assertions)]
97 Self::validate(&offsets, &bytes, &dtype, &validity)
98 .vortex_expect("[Debug Assertion]: Invalid `VarBinArray` parameters");
99
100 Self {
101 dtype,
102 bytes,
103 offsets,
104 validity,
105 stats_set: Default::default(),
106 }
107 }
108
109 pub fn validate(
113 offsets: &dyn Array,
114 bytes: &ByteBuffer,
115 dtype: &DType,
116 validity: &Validity,
117 ) -> VortexResult<()> {
118 vortex_ensure!(
120 offsets.dtype().is_int() && !offsets.dtype().is_nullable(),
121 MismatchedTypes: "non nullable int", offsets.dtype()
122 );
123
124 vortex_ensure!(
126 matches!(dtype, DType::Binary(_) | DType::Utf8(_)),
127 MismatchedTypes: "utf8 or binary", dtype
128 );
129
130 vortex_ensure!(
132 dtype.is_nullable() != (validity == &Validity::NonNullable),
133 "incorrect validity {:?} for dtype {}",
134 validity,
135 dtype
136 );
137
138 vortex_ensure!(
140 !offsets.is_empty(),
141 "Offsets must have at least one element"
142 );
143
144 if let Some(is_sorted) = offsets.statistics().compute_is_sorted() {
146 vortex_ensure!(is_sorted, "offsets must be sorted");
147 }
148
149 let last_offset = offsets
150 .scalar_at(offsets.len() - 1)
151 .as_primitive()
152 .as_::<usize>()
153 .ok_or_else(|| vortex_err!("Last offset must be convertible to usize"))?;
154 vortex_ensure!(
155 last_offset <= bytes.len(),
156 "Last offset {} exceeds bytes length {}",
157 last_offset,
158 bytes.len()
159 );
160
161 if let Some(validity_len) = validity.maybe_len() {
163 vortex_ensure!(
164 validity_len == offsets.len() - 1,
165 "Validity length {} doesn't match array length {}",
166 validity_len,
167 offsets.len() - 1
168 );
169 }
170
171 if matches!(dtype, DType::Utf8(_)) {
173 let primitive_offsets = offsets.to_primitive();
174 match_each_integer_ptype!(primitive_offsets.dtype().as_ptype(), |O| {
175 let offsets_slice = primitive_offsets.as_slice::<O>();
176 for (i, (start, end)) in offsets_slice
177 .windows(2)
178 .map(|o| (o[0].as_(), o[1].as_()))
179 .enumerate()
180 {
181 if validity.is_null(i) {
182 continue;
183 }
184
185 let string_bytes = &bytes.as_ref()[start..end];
186 simdutf8::basic::from_utf8(string_bytes).map_err(|_| {
187 #[allow(clippy::unwrap_used)]
188 let err = simdutf8::compat::from_utf8(string_bytes).unwrap_err();
190 vortex_err!("invalid utf-8: {err} at index {i}")
191 })?;
192 }
193 });
194 }
195
196 Ok(())
197 }
198
199 #[inline]
200 pub fn offsets(&self) -> &ArrayRef {
201 &self.offsets
202 }
203
204 #[inline]
212 pub fn bytes(&self) -> &ByteBuffer {
213 &self.bytes
214 }
215
216 pub fn sliced_bytes(&self) -> ByteBuffer {
219 let first_offset: usize = self.offset_at(0);
220 let last_offset = self.offset_at(self.len());
221
222 self.bytes().slice(first_offset..last_offset)
223 }
224
225 pub fn from_vec<T: AsRef<[u8]>>(vec: Vec<T>, dtype: DType) -> Self {
226 let size: usize = vec.iter().map(|v| v.as_ref().len()).sum();
227 if size < u32::MAX as usize {
228 Self::from_vec_sized::<u32, T>(vec, dtype)
229 } else {
230 Self::from_vec_sized::<u64, T>(vec, dtype)
231 }
232 }
233
234 fn from_vec_sized<O, T>(vec: Vec<T>, dtype: DType) -> Self
235 where
236 O: IntegerPType,
237 T: AsRef<[u8]>,
238 {
239 let mut builder = VarBinBuilder::<O>::with_capacity(vec.len());
240 for v in vec {
241 builder.append_value(v.as_ref());
242 }
243 builder.finish(dtype)
244 }
245
246 #[expect(
247 clippy::same_name_method,
248 reason = "intentionally named from_iter like Iterator::from_iter"
249 )]
250 pub fn from_iter<T: AsRef<[u8]>, I: IntoIterator<Item = Option<T>>>(
251 iter: I,
252 dtype: DType,
253 ) -> Self {
254 let iter = iter.into_iter();
255 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
256 for v in iter {
257 builder.append(v.as_ref().map(|o| o.as_ref()));
258 }
259 builder.finish(dtype)
260 }
261
262 pub fn from_iter_nonnull<T: AsRef<[u8]>, I: IntoIterator<Item = T>>(
263 iter: I,
264 dtype: DType,
265 ) -> Self {
266 let iter = iter.into_iter();
267 let mut builder = VarBinBuilder::<u32>::with_capacity(iter.size_hint().0);
268 for v in iter {
269 builder.append_value(v);
270 }
271 builder.finish(dtype)
272 }
273
274 pub fn offset_at(&self, index: usize) -> usize {
280 assert!(
281 index <= self.len(),
282 "Index {index} out of bounds 0..={}",
283 self.len()
284 );
285
286 self.offsets()
287 .scalar_at(index)
288 .as_ref()
289 .try_into()
290 .vortex_expect("Failed to convert offset to usize")
291 }
292
293 pub fn bytes_at(&self, index: usize) -> ByteBuffer {
297 let start = self.offset_at(index);
298 let end = self.offset_at(index + 1);
299
300 self.bytes().slice(start..end)
301 }
302
303 pub fn into_parts(self) -> (DType, ByteBuffer, ArrayRef, Validity) {
306 (self.dtype, self.bytes, self.offsets, self.validity)
307 }
308}
309
310impl From<Vec<&[u8]>> for VarBinArray {
311 fn from(value: Vec<&[u8]>) -> Self {
312 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
313 }
314}
315
316impl From<Vec<Vec<u8>>> for VarBinArray {
317 fn from(value: Vec<Vec<u8>>) -> Self {
318 Self::from_vec(value, DType::Binary(Nullability::NonNullable))
319 }
320}
321
322impl From<Vec<String>> for VarBinArray {
323 fn from(value: Vec<String>) -> Self {
324 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
325 }
326}
327
328impl From<Vec<&str>> for VarBinArray {
329 fn from(value: Vec<&str>) -> Self {
330 Self::from_vec(value, DType::Utf8(Nullability::NonNullable))
331 }
332}
333
334impl From<Vec<Option<&[u8]>>> for VarBinArray {
335 fn from(value: Vec<Option<&[u8]>>) -> Self {
336 Self::from_iter(value, DType::Binary(Nullability::Nullable))
337 }
338}
339
340impl From<Vec<Option<Vec<u8>>>> for VarBinArray {
341 fn from(value: Vec<Option<Vec<u8>>>) -> Self {
342 Self::from_iter(value, DType::Binary(Nullability::Nullable))
343 }
344}
345
346impl From<Vec<Option<String>>> for VarBinArray {
347 fn from(value: Vec<Option<String>>) -> Self {
348 Self::from_iter(value, DType::Utf8(Nullability::Nullable))
349 }
350}
351
352impl From<Vec<Option<&str>>> for VarBinArray {
353 fn from(value: Vec<Option<&str>>) -> Self {
354 Self::from_iter(value, DType::Utf8(Nullability::Nullable))
355 }
356}
357
358impl<'a> FromIterator<Option<&'a [u8]>> for VarBinArray {
359 fn from_iter<T: IntoIterator<Item = Option<&'a [u8]>>>(iter: T) -> Self {
360 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
361 }
362}
363
364impl FromIterator<Option<Vec<u8>>> for VarBinArray {
365 fn from_iter<T: IntoIterator<Item = Option<Vec<u8>>>>(iter: T) -> Self {
366 Self::from_iter(iter, DType::Binary(Nullability::Nullable))
367 }
368}
369
370impl FromIterator<Option<String>> for VarBinArray {
371 fn from_iter<T: IntoIterator<Item = Option<String>>>(iter: T) -> Self {
372 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
373 }
374}
375
376impl<'a> FromIterator<Option<&'a str>> for VarBinArray {
377 fn from_iter<T: IntoIterator<Item = Option<&'a str>>>(iter: T) -> Self {
378 Self::from_iter(iter, DType::Utf8(Nullability::Nullable))
379 }
380}