1use crate::ArrayDigest;
2use arrow::{
3 array::{
4 Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray,
5 FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericStringArray,
6 LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, OffsetSizeTrait,
7 StringArray, StringViewArray,
8 },
9 buffer::NullBuffer,
10 datatypes::DataType,
11};
12use digest::{Digest, Output, OutputSizeUser};
13
14pub struct ArrayDigestV0<Dig: Digest> {
16 hasher: Dig,
17}
18
19impl<Dig: Digest> OutputSizeUser for ArrayDigestV0<Dig> {
22 type OutputSize = Dig::OutputSize;
23}
24
25impl<Dig: Digest> ArrayDigest for ArrayDigestV0<Dig> {
26 fn digest(array: &dyn Array) -> Output<Dig> {
27 let mut d = Self::new(array.data_type());
28 d.update(array, None);
29 d.finalize()
30 }
31
32 fn new(data_type: &DataType) -> Self {
33 let mut hasher = Dig::new();
34 crate::schema_digest::hash_data_type(data_type, &mut hasher);
35 Self { hasher }
36 }
37
38 fn update(&mut self, array: &dyn Array, parent_null_bitmap: Option<&NullBuffer>) {
39 let array_data = array.to_data();
40 let combined_null_bitmap_val =
41 crate::utils::maybe_combine_null_buffers(parent_null_bitmap, array_data.nulls());
42 let combined_null_bitmap = combined_null_bitmap_val.as_option();
43
44 let data_type = array.data_type();
45
46 #[inline]
47 fn unsupported(data_type: &DataType) -> ! {
48 unimplemented!("Type {} is not yet supported", data_type);
49 }
50
51 match data_type {
52 DataType::Null => unsupported(data_type),
53 DataType::Boolean => self.hash_array_bool(array, combined_null_bitmap),
54 DataType::Int8 | DataType::UInt8 => {
55 self.hash_fixed_size(array, 1, combined_null_bitmap)
56 }
57 DataType::Int16 | DataType::UInt16 => {
58 self.hash_fixed_size(array, 2, combined_null_bitmap)
59 }
60 DataType::Int32 | DataType::UInt32 => {
61 self.hash_fixed_size(array, 4, combined_null_bitmap)
62 }
63 DataType::Int64 | DataType::UInt64 => {
64 self.hash_fixed_size(array, 8, combined_null_bitmap)
65 }
66 DataType::Float16 => self.hash_fixed_size(array, 2, combined_null_bitmap),
67 DataType::Float32 => self.hash_fixed_size(array, 4, combined_null_bitmap),
68 DataType::Float64 => self.hash_fixed_size(array, 8, combined_null_bitmap),
69 DataType::Timestamp(_, _) => self.hash_fixed_size(array, 8, combined_null_bitmap),
70 DataType::Date32 => self.hash_fixed_size(array, 4, combined_null_bitmap),
71 DataType::Date64 => self.hash_fixed_size(array, 8, combined_null_bitmap),
72 DataType::Time32(_) => self.hash_fixed_size(array, 4, combined_null_bitmap),
73 DataType::Time64(_) => self.hash_fixed_size(array, 8, combined_null_bitmap),
74 DataType::Duration(_) => unsupported(data_type),
75 DataType::Interval(_) => unsupported(data_type),
76 DataType::Binary => self.hash_array_binary(
77 array.as_any().downcast_ref::<BinaryArray>().unwrap(),
78 combined_null_bitmap,
79 ),
80 DataType::LargeBinary => self.hash_array_binary(
81 array.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
82 combined_null_bitmap,
83 ),
84 DataType::BinaryView => self.hash_array_binary_view(
85 array.as_any().downcast_ref::<BinaryViewArray>().unwrap(),
86 combined_null_bitmap
87 ),
88 DataType::FixedSizeBinary(size) => {
89 self.hash_array_binary_fixed(
90 array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap(),
91 *size as usize,
92 combined_null_bitmap,
93 )
94 },
95 DataType::Utf8 => self.hash_array_string(
96 array.as_any().downcast_ref::<StringArray>().unwrap(),
97 combined_null_bitmap,
98 ),
99 DataType::LargeUtf8 => self.hash_array_string(
100 array.as_any().downcast_ref::<LargeStringArray>().unwrap(),
101 combined_null_bitmap,
102 ),
103 DataType::Utf8View => self.hash_array_string_view(
104 array.as_any().downcast_ref::<StringViewArray>().unwrap(),
105 combined_null_bitmap
106 ),
107 DataType::List(_) => self.hash_array_list(
108 array.as_any().downcast_ref::<ListArray>().unwrap(),
109 combined_null_bitmap,
110 ),
111 DataType::LargeList(_) => self.hash_array_list(
112 array.as_any().downcast_ref::<LargeListArray>().unwrap(),
113 combined_null_bitmap,
114 ),
115 DataType::ListView(_) | DataType::LargeListView(_) => unsupported(data_type),
116 DataType::FixedSizeList(..) => self.hash_array_list_fixed(
117 array.as_any().downcast_ref::<FixedSizeListArray>().unwrap(),
118 combined_null_bitmap,
119 ),
120 DataType::Struct(_) => panic!(
122 "Structs are currently flattened by RecordDigest and cannot be processed by ArrayDigest"
123 ),
124 DataType::Union(_, _) => unsupported(data_type),
125 DataType::Dictionary(..) => unsupported(data_type),
126 DataType::Decimal128(_, _) => self.hash_fixed_size(array, 16, combined_null_bitmap),
127 DataType::Decimal256(_, _) => self.hash_fixed_size(array, 32, combined_null_bitmap),
128 DataType::Map(..) => unsupported(data_type),
129 DataType::RunEndEncoded(..) => unsupported(data_type),
130 }
131 }
132
133 fn finalize(self) -> Output<Dig> {
134 self.hasher.finalize()
135 }
136}
137
138impl<Dig: Digest> ArrayDigestV0<Dig> {
141 const NULL_MARKER: [u8; 1] = [0];
142
143 fn hash_fixed_size(
144 &mut self,
145 array: &dyn Array,
146 item_size: usize,
147 null_bitmap: Option<&NullBuffer>,
148 ) {
149 let array_data = array.to_data();
150
151 assert_eq!(
153 array_data.buffers().len(),
154 1,
155 "Multiple buffers on a primitive type array"
156 );
157
158 let slice = {
159 let data_start = array_data.offset() * item_size;
160 let data_end = data_start + array_data.len() * item_size;
161 &array_data.buffers()[0].as_slice()[data_start..data_end]
162 };
163
164 match null_bitmap {
165 None => {
166 self.hasher.update(slice);
168 }
169 Some(null_bitmap) => {
170 for i in 0..array.len() {
172 if null_bitmap.is_valid(i) {
173 let pos = i * item_size;
174 self.hasher.update(&slice[pos..pos + item_size]);
175 } else {
176 self.hasher.update(Self::NULL_MARKER);
177 }
178 }
179 }
180 }
181 }
182
183 fn hash_array_bool(&mut self, array: &dyn Array, null_bitmap: Option<&NullBuffer>) {
185 let bool_array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
186
187 match null_bitmap {
188 None => {
189 for i in 0..bool_array.len() {
190 let value = unsafe { bool_array.value_unchecked(i) };
192 self.hasher.update([value as u8 + 1]);
193 }
194 }
195 Some(null_bitmap) => {
196 for i in 0..bool_array.len() {
197 if null_bitmap.is_valid(i) {
198 let value = unsafe { bool_array.value_unchecked(i) };
200 self.hasher.update([value as u8 + 1]);
201 } else {
202 self.hasher.update(Self::NULL_MARKER);
203 }
204 }
205 }
206 }
207 }
208
209 fn hash_array_string<OffsetSize: OffsetSizeTrait>(
210 &mut self,
211 array: &GenericStringArray<OffsetSize>,
212 null_bitmap: Option<&NullBuffer>,
213 ) {
214 match null_bitmap {
215 None => {
216 for i in 0..array.len() {
217 let s = array.value(i);
218 self.hasher.update((s.len() as u64).to_le_bytes());
219 self.hasher.update(s.as_bytes());
220 }
221 }
222 Some(null_bitmap) => {
223 for i in 0..array.len() {
224 if null_bitmap.is_valid(i) {
225 let s = array.value(i);
226 self.hasher.update((s.len() as u64).to_le_bytes());
227 self.hasher.update(s.as_bytes());
228 } else {
229 self.hasher.update(Self::NULL_MARKER);
230 }
231 }
232 }
233 }
234 }
235
236 fn hash_array_string_view(
237 &mut self,
238 array: &StringViewArray,
239 null_bitmap: Option<&NullBuffer>,
240 ) {
241 match null_bitmap {
242 None => {
243 for i in 0..array.len() {
244 let s = array.value(i);
245 self.hasher.update((s.len() as u64).to_le_bytes());
246 self.hasher.update(s.as_bytes());
247 }
248 }
249 Some(null_bitmap) => {
250 for i in 0..array.len() {
251 if null_bitmap.is_valid(i) {
252 let s = array.value(i);
253 self.hasher.update((s.len() as u64).to_le_bytes());
254 self.hasher.update(s.as_bytes());
255 } else {
256 self.hasher.update(Self::NULL_MARKER);
257 }
258 }
259 }
260 }
261 }
262
263 fn hash_array_binary<OffsetSize: OffsetSizeTrait>(
264 &mut self,
265 array: &GenericBinaryArray<OffsetSize>,
266 null_bitmap: Option<&NullBuffer>,
267 ) {
268 match null_bitmap {
269 None => {
270 for i in 0..array.len() {
271 let slice = array.value(i);
272 self.hasher.update((slice.len() as u64).to_le_bytes());
273 self.hasher.update(slice);
274 }
275 }
276 Some(null_bitmap) => {
277 for i in 0..array.len() {
278 if null_bitmap.is_valid(i) {
279 let slice = array.value(i);
280 self.hasher.update((slice.len() as u64).to_le_bytes());
281 self.hasher.update(slice);
282 } else {
283 self.hasher.update(Self::NULL_MARKER);
284 }
285 }
286 }
287 }
288 }
289
290 fn hash_array_binary_view(
291 &mut self,
292 array: &BinaryViewArray,
293 null_bitmap: Option<&NullBuffer>,
294 ) {
295 match null_bitmap {
296 None => {
297 for i in 0..array.len() {
298 let slice = array.value(i);
299 self.hasher.update((slice.len() as u64).to_le_bytes());
300 self.hasher.update(slice);
301 }
302 }
303 Some(null_bitmap) => {
304 for i in 0..array.len() {
305 if null_bitmap.is_valid(i) {
306 let slice = array.value(i);
307 self.hasher.update((slice.len() as u64).to_le_bytes());
308 self.hasher.update(slice);
309 } else {
310 self.hasher.update(Self::NULL_MARKER);
311 }
312 }
313 }
314 }
315 }
316
317 fn hash_array_binary_fixed(
318 &mut self,
319 array: &FixedSizeBinaryArray,
320 size: usize,
321 null_bitmap: Option<&NullBuffer>,
322 ) {
323 match null_bitmap {
324 None => {
325 for i in 0..array.len() {
326 let slice = array.value(i);
327 self.hasher.update((size as u64).to_le_bytes());
328 self.hasher.update(slice);
329 }
330 }
331 Some(null_bitmap) => {
332 for i in 0..array.len() {
333 if null_bitmap.is_valid(i) {
334 let slice = array.value(i);
335 self.hasher.update((size as u64).to_le_bytes());
336 self.hasher.update(slice);
337 } else {
338 self.hasher.update(Self::NULL_MARKER);
339 }
340 }
341 }
342 }
343 }
344
345 fn hash_array_list<Off: OffsetSizeTrait>(
346 &mut self,
347 array: &GenericListArray<Off>,
348 null_bitmap: Option<&NullBuffer>,
349 ) {
350 match null_bitmap {
351 None => {
352 for i in 0..array.len() {
353 let sub_array = array.value(i);
354 self.hasher.update((sub_array.len() as u64).to_le_bytes());
355 self.update(sub_array.as_ref(), None);
356 }
357 }
358 Some(null_bitmap) => {
359 for i in 0..array.len() {
360 if null_bitmap.is_valid(i) {
361 let sub_array = array.value(i);
362 self.hasher.update((sub_array.len() as u64).to_le_bytes());
363 self.update(sub_array.as_ref(), None);
364 } else {
365 self.hasher.update(Self::NULL_MARKER);
366 }
367 }
368 }
369 }
370 }
371
372 fn hash_array_list_fixed(
373 &mut self,
374 array: &FixedSizeListArray,
375 null_bitmap: Option<&NullBuffer>,
376 ) {
377 match null_bitmap {
378 None => {
379 for i in 0..array.len() {
380 let sub_array = array.value(i);
381 self.hasher.update((sub_array.len() as u64).to_le_bytes());
382 self.update(sub_array.as_ref(), None);
383 }
384 }
385 Some(null_bitmap) => {
386 for i in 0..array.len() {
387 if null_bitmap.is_valid(i) {
388 let sub_array = array.value(i);
389 self.hasher.update((sub_array.len() as u64).to_le_bytes());
390 self.update(sub_array.as_ref(), None);
391 } else {
392 self.hasher.update(Self::NULL_MARKER);
393 }
394 }
395 }
396 }
397 }
398}
399
400#[cfg(test)]
405mod tests {
406 use super::*;
407 use arrow::{
408 array::{
409 ArrayData, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int32Array, StringArray,
410 UInt32Array,
411 },
412 buffer::Buffer,
413 datatypes::Int32Type,
414 };
415 use sha3::Sha3_256;
416
417 #[test]
418 fn test_ints() {
419 assert_eq!(
420 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
421 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
422 );
423
424 assert_ne!(
425 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3]),),
426 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![
427 Some(1),
428 Some(2),
429 None,
430 Some(3)
431 ]),),
432 );
433 }
434
435 #[test]
436 fn test_int_array() {
437 assert_eq!(
438 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
439 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
440 );
441
442 assert_ne!(
443 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
444 ArrayDigestV0::<Sha3_256>::digest(&UInt32Array::from(vec![1, 2, 3])),
445 );
446 }
447
448 #[test]
449 fn test_bool_array() {
450 fn make_bool_array(data: Vec<u8>, len: usize, nulls: Option<Vec<u8>>) -> BooleanArray {
451 let builder = ArrayData::builder(DataType::Boolean)
452 .len(len)
453 .add_buffer(Buffer::from_vec(data));
454
455 let builder = if let Some(nulls) = nulls {
456 builder.null_bit_buffer(Some(Buffer::from_vec(nulls)))
457 } else {
458 builder
459 };
460
461 BooleanArray::from(builder.build().unwrap())
462 }
463
464 let array1 = make_bool_array(vec![0b0011_0101u8], 6, None);
465 let array2 = make_bool_array(vec![0b1111_0101u8], 6, None);
466
467 assert_eq!(
468 ArrayDigestV0::<Sha3_256>::digest(&array1),
469 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
470 true, false, true, false, true, true
471 ])),
472 );
473
474 assert_ne!(
475 ArrayDigestV0::<Sha3_256>::digest(&array1),
476 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
477 true, false, true, false, true, false
478 ])),
479 );
480
481 assert_ne!(
482 ArrayDigestV0::<Sha3_256>::digest(&array1),
483 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
484 false, false, true, false, true, true
485 ])),
486 );
487
488 assert_eq!(
489 ArrayDigestV0::<Sha3_256>::digest(&array1),
490 ArrayDigestV0::<Sha3_256>::digest(&array2),
491 );
492
493 let array3 = make_bool_array(vec![0b1111_0101u8], 6, Some(vec![0b1110_1110]));
494
495 assert_eq!(
496 ArrayDigestV0::<Sha3_256>::digest(&array3),
497 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
498 None,
499 Some(false),
500 Some(true),
501 Some(false),
502 None,
503 Some(true)
504 ])),
505 );
506 }
507
508 #[test]
509 fn test_string_array() {
510 assert_eq!(
511 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
512 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
513 );
514
515 assert_eq!(
516 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
517 ArrayDigestV0::<Sha3_256>::digest(&LargeStringArray::from(vec!["foo", "bar", "baz"])),
518 );
519
520 assert_ne!(
521 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
522 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "", "baz"])),
523 );
524
525 assert_ne!(
526 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec![
527 Some("foo"),
528 Some("bar"),
529 Some("baz")
530 ]),),
531 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec![
532 Some("foo"),
533 Some("bar"),
534 None,
535 Some("baz")
536 ]),),
537 );
538
539 assert_eq!(
541 ArrayDigestV0::<Sha3_256>::digest(&StringViewArray::from_iter_values(vec![
542 "foo", "bar", "baz"
543 ])),
544 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
545 );
546 }
547
548 #[test]
549 fn test_list_array() {
550 assert_eq!(
576 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
577 vec![
578 Some(vec![Some(0), Some(1), Some(2)]),
579 None,
580 Some(vec![Some(3), None, Some(4), Some(5)]),
581 Some(vec![Some(6), Some(7)]),
582 ]
583 )),
584 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
585 vec![
586 Some(vec![Some(0), Some(1), Some(2)]),
587 None,
588 Some(vec![Some(3), None, Some(4), Some(5)]),
589 Some(vec![Some(6), Some(7)]),
590 ]
591 )),
592 );
593
594 assert_ne!(
596 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
597 vec![
598 Some(vec![Some(0), Some(1), Some(2)]),
599 None,
600 Some(vec![Some(3), None, Some(4), Some(5)]),
601 Some(vec![Some(6), Some(7)]),
602 ]
603 )),
604 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
605 vec![
606 Some(vec![Some(0), Some(1), Some(2)]),
607 None,
608 Some(vec![Some(3), None, Some(4), Some(100)]),
609 Some(vec![Some(6), Some(7)]),
610 ]
611 )),
612 );
613
614 assert_ne!(
616 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
617 vec![
618 Some(vec![Some(0), Some(1), Some(2)]),
619 None,
620 Some(vec![Some(3), None, Some(4), Some(5)]),
621 Some(vec![Some(6), Some(7)]),
622 ]
623 )),
624 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
625 vec![
626 Some(vec![Some(0), Some(1), Some(2)]),
627 None,
628 Some(vec![Some(3), None, Some(4)]),
629 Some(vec![Some(5), Some(6), Some(7)]),
630 ]
631 )),
632 );
633 }
634
635 #[test]
636 fn test_binary_array() {
637 assert_eq!(
638 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
639 b"one", b"two", b"", b"three"
640 ])),
641 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
642 b"one", b"two", b"", b"three"
643 ])),
644 );
645
646 assert_ne!(
647 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
648 b"one", b"two", b"", b"three"
649 ])),
650 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
651 b"one", b"two", b"three"
652 ])),
653 );
654
655 assert_eq!(
656 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
657 b"one", b"two", b"", b"three"
658 ])),
659 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_opt_vec(vec![
660 Some(b"one"),
661 Some(b"two"),
662 Some(b""),
663 Some(b"three")
664 ])),
665 );
666
667 assert_ne!(
668 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
669 b"one", b"two", b"", b"three"
670 ])),
671 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_opt_vec(vec![
672 Some(b"one"),
673 Some(b"two"),
674 None,
675 Some(b"three")
676 ])),
677 );
678
679 assert_eq!(
680 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![b"one", b"two"])),
681 ArrayDigestV0::<Sha3_256>::digest(&FixedSizeBinaryArray::from(
682 ArrayData::builder(DataType::FixedSizeBinary(3))
683 .len(2)
684 .add_buffer(Buffer::from(b"onetwo"))
685 .build()
686 .unwrap()
687 )),
688 );
689
690 assert_eq!(
692 ArrayDigestV0::<Sha3_256>::digest(&BinaryViewArray::from_iter_values(vec![
693 b"one" as &[u8],
694 b"two",
695 b"",
696 b"three"
697 ])),
698 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
699 b"one", b"two", b"", b"three"
700 ])),
701 );
702 }
703}