1use crate::ArrayDigest;
2use arrow::{
3 array::{
4 Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray,
5 FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericStringArray,
6 LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, OffsetSizeTrait,
7 StringArray, StringViewArray,
8 },
9 buffer::NullBuffer,
10 datatypes::DataType,
11};
12use digest::{Digest, Output, OutputSizeUser};
13
14pub struct ArrayDigestV0<Dig: Digest> {
16 hasher: Dig,
17}
18
19impl<Dig: Digest> OutputSizeUser for ArrayDigestV0<Dig> {
22 type OutputSize = Dig::OutputSize;
23}
24
25impl<Dig: Digest> ArrayDigest for ArrayDigestV0<Dig> {
26 fn digest(array: &dyn Array) -> Output<Dig> {
27 let mut d = Self::new(array.data_type());
28 d.update(array, None);
29 d.finalize()
30 }
31
32 fn new(data_type: &DataType) -> Self {
33 let mut hasher = Dig::new();
34 crate::schema_digest::hash_data_type(data_type, &mut hasher);
35 Self { hasher }
36 }
37
38 fn update(&mut self, array: &dyn Array, parent_null_bitmap: Option<&NullBuffer>) {
39 let array_data = array.to_data();
40 let combined_null_bitmap_val =
41 crate::utils::maybe_combine_null_buffers(parent_null_bitmap, array_data.nulls());
42 let combined_null_bitmap = combined_null_bitmap_val.as_option();
43
44 let data_type = array.data_type();
45
46 #[inline]
47 fn unsupported(data_type: &DataType) -> ! {
48 unimplemented!("Type {} is not yet supported", data_type);
49 }
50
51 match data_type {
52 DataType::Null => unsupported(data_type),
53 DataType::Boolean => self.hash_array_bool(array, combined_null_bitmap),
54 DataType::Int8 | DataType::UInt8 => {
55 self.hash_fixed_size(array, 1, combined_null_bitmap)
56 }
57 DataType::Int16 | DataType::UInt16 => {
58 self.hash_fixed_size(array, 2, combined_null_bitmap)
59 }
60 DataType::Int32 | DataType::UInt32 => {
61 self.hash_fixed_size(array, 4, combined_null_bitmap)
62 }
63 DataType::Int64 | DataType::UInt64 => {
64 self.hash_fixed_size(array, 8, combined_null_bitmap)
65 }
66 DataType::Float16 => self.hash_fixed_size(array, 2, combined_null_bitmap),
67 DataType::Float32 => self.hash_fixed_size(array, 4, combined_null_bitmap),
68 DataType::Float64 => self.hash_fixed_size(array, 8, combined_null_bitmap),
69 DataType::Timestamp(_, _) => self.hash_fixed_size(array, 8, combined_null_bitmap),
70 DataType::Date32 => self.hash_fixed_size(array, 4, combined_null_bitmap),
71 DataType::Date64 => self.hash_fixed_size(array, 8, combined_null_bitmap),
72 DataType::Time32(_) => self.hash_fixed_size(array, 4, combined_null_bitmap),
73 DataType::Time64(_) => self.hash_fixed_size(array, 8, combined_null_bitmap),
74 DataType::Duration(_) => unsupported(data_type),
75 DataType::Interval(_) => unsupported(data_type),
76 DataType::Binary => self.hash_array_binary(
77 array.as_any().downcast_ref::<BinaryArray>().unwrap(),
78 combined_null_bitmap,
79 ),
80 DataType::LargeBinary => self.hash_array_binary(
81 array.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
82 combined_null_bitmap,
83 ),
84 DataType::BinaryView => self.hash_array_binary_view(
85 array.as_any().downcast_ref::<BinaryViewArray>().unwrap(),
86 combined_null_bitmap,
87 ),
88 DataType::FixedSizeBinary(size) => self.hash_array_binary_fixed(
89 array
90 .as_any()
91 .downcast_ref::<FixedSizeBinaryArray>()
92 .unwrap(),
93 *size as usize,
94 combined_null_bitmap,
95 ),
96 DataType::Utf8 => self.hash_array_string(
97 array.as_any().downcast_ref::<StringArray>().unwrap(),
98 combined_null_bitmap,
99 ),
100 DataType::LargeUtf8 => self.hash_array_string(
101 array.as_any().downcast_ref::<LargeStringArray>().unwrap(),
102 combined_null_bitmap,
103 ),
104 DataType::Utf8View => self.hash_array_string_view(
105 array.as_any().downcast_ref::<StringViewArray>().unwrap(),
106 combined_null_bitmap,
107 ),
108 DataType::List(_) => self.hash_array_list(
109 array.as_any().downcast_ref::<ListArray>().unwrap(),
110 combined_null_bitmap,
111 ),
112 DataType::LargeList(_) => self.hash_array_list(
113 array.as_any().downcast_ref::<LargeListArray>().unwrap(),
114 combined_null_bitmap,
115 ),
116 DataType::ListView(_) | DataType::LargeListView(_) => unsupported(data_type),
117 DataType::FixedSizeList(..) => self.hash_array_list_fixed(
118 array.as_any().downcast_ref::<FixedSizeListArray>().unwrap(),
119 combined_null_bitmap,
120 ),
121 DataType::Struct(_) => panic!(
123 "Structs are currently flattened by RecordDigest and cannot be processed by ArrayDigest"
124 ),
125 DataType::Union(_, _) => unsupported(data_type),
126 DataType::Dictionary(..) => unsupported(data_type),
127 DataType::Decimal32(_, _) => self.hash_fixed_size(array, 4, combined_null_bitmap),
128 DataType::Decimal64(_, _) => self.hash_fixed_size(array, 8, combined_null_bitmap),
129 DataType::Decimal128(_, _) => self.hash_fixed_size(array, 16, combined_null_bitmap),
130 DataType::Decimal256(_, _) => self.hash_fixed_size(array, 32, combined_null_bitmap),
131 DataType::Map(..) => unsupported(data_type),
132 DataType::RunEndEncoded(..) => unsupported(data_type),
133 }
134 }
135
136 fn finalize(self) -> Output<Dig> {
137 self.hasher.finalize()
138 }
139}
140
141impl<Dig: Digest> ArrayDigestV0<Dig> {
144 const NULL_MARKER: [u8; 1] = [0];
145
146 fn hash_fixed_size(
147 &mut self,
148 array: &dyn Array,
149 item_size: usize,
150 null_bitmap: Option<&NullBuffer>,
151 ) {
152 let array_data = array.to_data();
153
154 assert_eq!(
156 array_data.buffers().len(),
157 1,
158 "Multiple buffers on a primitive type array"
159 );
160
161 let slice = {
162 let data_start = array_data.offset() * item_size;
163 let data_end = data_start + array_data.len() * item_size;
164 &array_data.buffers()[0].as_slice()[data_start..data_end]
165 };
166
167 match null_bitmap {
168 None => {
169 self.hasher.update(slice);
171 }
172 Some(null_bitmap) => {
173 for i in 0..array.len() {
175 if null_bitmap.is_valid(i) {
176 let pos = i * item_size;
177 self.hasher.update(&slice[pos..pos + item_size]);
178 } else {
179 self.hasher.update(Self::NULL_MARKER);
180 }
181 }
182 }
183 }
184 }
185
186 fn hash_array_bool(&mut self, array: &dyn Array, null_bitmap: Option<&NullBuffer>) {
188 let bool_array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
189
190 match null_bitmap {
191 None => {
192 for i in 0..bool_array.len() {
193 let value = unsafe { bool_array.value_unchecked(i) };
195 self.hasher.update([value as u8 + 1]);
196 }
197 }
198 Some(null_bitmap) => {
199 for i in 0..bool_array.len() {
200 if null_bitmap.is_valid(i) {
201 let value = unsafe { bool_array.value_unchecked(i) };
203 self.hasher.update([value as u8 + 1]);
204 } else {
205 self.hasher.update(Self::NULL_MARKER);
206 }
207 }
208 }
209 }
210 }
211
212 fn hash_array_string<OffsetSize: OffsetSizeTrait>(
213 &mut self,
214 array: &GenericStringArray<OffsetSize>,
215 null_bitmap: Option<&NullBuffer>,
216 ) {
217 match null_bitmap {
218 None => {
219 for i in 0..array.len() {
220 let s = array.value(i);
221 self.hasher.update((s.len() as u64).to_le_bytes());
222 self.hasher.update(s.as_bytes());
223 }
224 }
225 Some(null_bitmap) => {
226 for i in 0..array.len() {
227 if null_bitmap.is_valid(i) {
228 let s = array.value(i);
229 self.hasher.update((s.len() as u64).to_le_bytes());
230 self.hasher.update(s.as_bytes());
231 } else {
232 self.hasher.update(Self::NULL_MARKER);
233 }
234 }
235 }
236 }
237 }
238
239 fn hash_array_string_view(
240 &mut self,
241 array: &StringViewArray,
242 null_bitmap: Option<&NullBuffer>,
243 ) {
244 match null_bitmap {
245 None => {
246 for i in 0..array.len() {
247 let s = array.value(i);
248 self.hasher.update((s.len() as u64).to_le_bytes());
249 self.hasher.update(s.as_bytes());
250 }
251 }
252 Some(null_bitmap) => {
253 for i in 0..array.len() {
254 if null_bitmap.is_valid(i) {
255 let s = array.value(i);
256 self.hasher.update((s.len() as u64).to_le_bytes());
257 self.hasher.update(s.as_bytes());
258 } else {
259 self.hasher.update(Self::NULL_MARKER);
260 }
261 }
262 }
263 }
264 }
265
266 fn hash_array_binary<OffsetSize: OffsetSizeTrait>(
267 &mut self,
268 array: &GenericBinaryArray<OffsetSize>,
269 null_bitmap: Option<&NullBuffer>,
270 ) {
271 match null_bitmap {
272 None => {
273 for i in 0..array.len() {
274 let slice = array.value(i);
275 self.hasher.update((slice.len() as u64).to_le_bytes());
276 self.hasher.update(slice);
277 }
278 }
279 Some(null_bitmap) => {
280 for i in 0..array.len() {
281 if null_bitmap.is_valid(i) {
282 let slice = array.value(i);
283 self.hasher.update((slice.len() as u64).to_le_bytes());
284 self.hasher.update(slice);
285 } else {
286 self.hasher.update(Self::NULL_MARKER);
287 }
288 }
289 }
290 }
291 }
292
293 fn hash_array_binary_view(
294 &mut self,
295 array: &BinaryViewArray,
296 null_bitmap: Option<&NullBuffer>,
297 ) {
298 match null_bitmap {
299 None => {
300 for i in 0..array.len() {
301 let slice = array.value(i);
302 self.hasher.update((slice.len() as u64).to_le_bytes());
303 self.hasher.update(slice);
304 }
305 }
306 Some(null_bitmap) => {
307 for i in 0..array.len() {
308 if null_bitmap.is_valid(i) {
309 let slice = array.value(i);
310 self.hasher.update((slice.len() as u64).to_le_bytes());
311 self.hasher.update(slice);
312 } else {
313 self.hasher.update(Self::NULL_MARKER);
314 }
315 }
316 }
317 }
318 }
319
320 fn hash_array_binary_fixed(
321 &mut self,
322 array: &FixedSizeBinaryArray,
323 size: usize,
324 null_bitmap: Option<&NullBuffer>,
325 ) {
326 match null_bitmap {
327 None => {
328 for i in 0..array.len() {
329 let slice = array.value(i);
330 self.hasher.update((size as u64).to_le_bytes());
331 self.hasher.update(slice);
332 }
333 }
334 Some(null_bitmap) => {
335 for i in 0..array.len() {
336 if null_bitmap.is_valid(i) {
337 let slice = array.value(i);
338 self.hasher.update((size as u64).to_le_bytes());
339 self.hasher.update(slice);
340 } else {
341 self.hasher.update(Self::NULL_MARKER);
342 }
343 }
344 }
345 }
346 }
347
348 fn hash_array_list<Off: OffsetSizeTrait>(
349 &mut self,
350 array: &GenericListArray<Off>,
351 null_bitmap: Option<&NullBuffer>,
352 ) {
353 match null_bitmap {
354 None => {
355 for i in 0..array.len() {
356 let sub_array = array.value(i);
357 self.hasher.update((sub_array.len() as u64).to_le_bytes());
358 self.update(sub_array.as_ref(), None);
359 }
360 }
361 Some(null_bitmap) => {
362 for i in 0..array.len() {
363 if null_bitmap.is_valid(i) {
364 let sub_array = array.value(i);
365 self.hasher.update((sub_array.len() as u64).to_le_bytes());
366 self.update(sub_array.as_ref(), None);
367 } else {
368 self.hasher.update(Self::NULL_MARKER);
369 }
370 }
371 }
372 }
373 }
374
375 fn hash_array_list_fixed(
376 &mut self,
377 array: &FixedSizeListArray,
378 null_bitmap: Option<&NullBuffer>,
379 ) {
380 match null_bitmap {
381 None => {
382 for i in 0..array.len() {
383 let sub_array = array.value(i);
384 self.hasher.update((sub_array.len() as u64).to_le_bytes());
385 self.update(sub_array.as_ref(), None);
386 }
387 }
388 Some(null_bitmap) => {
389 for i in 0..array.len() {
390 if null_bitmap.is_valid(i) {
391 let sub_array = array.value(i);
392 self.hasher.update((sub_array.len() as u64).to_le_bytes());
393 self.update(sub_array.as_ref(), None);
394 } else {
395 self.hasher.update(Self::NULL_MARKER);
396 }
397 }
398 }
399 }
400 }
401}
402
403#[cfg(test)]
408mod tests {
409 use super::*;
410 use arrow::{
411 array::{
412 ArrayData, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int32Array, StringArray,
413 UInt32Array,
414 },
415 buffer::Buffer,
416 datatypes::Int32Type,
417 };
418 use sha3::Sha3_256;
419
420 #[test]
421 fn test_ints() {
422 assert_eq!(
423 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
424 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
425 );
426
427 assert_ne!(
428 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3]),),
429 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![
430 Some(1),
431 Some(2),
432 None,
433 Some(3)
434 ]),),
435 );
436 }
437
438 #[test]
439 fn test_int_array() {
440 assert_eq!(
441 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
442 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
443 );
444
445 assert_ne!(
446 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
447 ArrayDigestV0::<Sha3_256>::digest(&UInt32Array::from(vec![1, 2, 3])),
448 );
449 }
450
451 #[test]
452 fn test_bool_array() {
453 fn make_bool_array(data: Vec<u8>, len: usize, nulls: Option<Vec<u8>>) -> BooleanArray {
454 let builder = ArrayData::builder(DataType::Boolean)
455 .len(len)
456 .add_buffer(Buffer::from_vec(data));
457
458 let builder = if let Some(nulls) = nulls {
459 builder.null_bit_buffer(Some(Buffer::from_vec(nulls)))
460 } else {
461 builder
462 };
463
464 BooleanArray::from(builder.build().unwrap())
465 }
466
467 let array1 = make_bool_array(vec![0b0011_0101u8], 6, None);
468 let array2 = make_bool_array(vec![0b1111_0101u8], 6, None);
469
470 assert_eq!(
471 ArrayDigestV0::<Sha3_256>::digest(&array1),
472 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
473 true, false, true, false, true, true
474 ])),
475 );
476
477 assert_ne!(
478 ArrayDigestV0::<Sha3_256>::digest(&array1),
479 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
480 true, false, true, false, true, false
481 ])),
482 );
483
484 assert_ne!(
485 ArrayDigestV0::<Sha3_256>::digest(&array1),
486 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
487 false, false, true, false, true, true
488 ])),
489 );
490
491 assert_eq!(
492 ArrayDigestV0::<Sha3_256>::digest(&array1),
493 ArrayDigestV0::<Sha3_256>::digest(&array2),
494 );
495
496 let array3 = make_bool_array(vec![0b1111_0101u8], 6, Some(vec![0b1110_1110]));
497
498 assert_eq!(
499 ArrayDigestV0::<Sha3_256>::digest(&array3),
500 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
501 None,
502 Some(false),
503 Some(true),
504 Some(false),
505 None,
506 Some(true)
507 ])),
508 );
509 }
510
511 #[test]
512 fn test_string_array() {
513 assert_eq!(
514 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
515 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
516 );
517
518 assert_eq!(
519 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
520 ArrayDigestV0::<Sha3_256>::digest(&LargeStringArray::from(vec!["foo", "bar", "baz"])),
521 );
522
523 assert_ne!(
524 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
525 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "", "baz"])),
526 );
527
528 assert_ne!(
529 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec![
530 Some("foo"),
531 Some("bar"),
532 Some("baz")
533 ]),),
534 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec![
535 Some("foo"),
536 Some("bar"),
537 None,
538 Some("baz")
539 ]),),
540 );
541
542 assert_eq!(
544 ArrayDigestV0::<Sha3_256>::digest(&StringViewArray::from_iter_values(vec![
545 "foo", "bar", "baz"
546 ])),
547 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
548 );
549 }
550
551 #[test]
552 fn test_list_array() {
553 assert_eq!(
579 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
580 vec![
581 Some(vec![Some(0), Some(1), Some(2)]),
582 None,
583 Some(vec![Some(3), None, Some(4), Some(5)]),
584 Some(vec![Some(6), Some(7)]),
585 ]
586 )),
587 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
588 vec![
589 Some(vec![Some(0), Some(1), Some(2)]),
590 None,
591 Some(vec![Some(3), None, Some(4), Some(5)]),
592 Some(vec![Some(6), Some(7)]),
593 ]
594 )),
595 );
596
597 assert_ne!(
599 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
600 vec![
601 Some(vec![Some(0), Some(1), Some(2)]),
602 None,
603 Some(vec![Some(3), None, Some(4), Some(5)]),
604 Some(vec![Some(6), Some(7)]),
605 ]
606 )),
607 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
608 vec![
609 Some(vec![Some(0), Some(1), Some(2)]),
610 None,
611 Some(vec![Some(3), None, Some(4), Some(100)]),
612 Some(vec![Some(6), Some(7)]),
613 ]
614 )),
615 );
616
617 assert_ne!(
619 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
620 vec![
621 Some(vec![Some(0), Some(1), Some(2)]),
622 None,
623 Some(vec![Some(3), None, Some(4), Some(5)]),
624 Some(vec![Some(6), Some(7)]),
625 ]
626 )),
627 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
628 vec![
629 Some(vec![Some(0), Some(1), Some(2)]),
630 None,
631 Some(vec![Some(3), None, Some(4)]),
632 Some(vec![Some(5), Some(6), Some(7)]),
633 ]
634 )),
635 );
636 }
637
638 #[test]
639 fn test_binary_array() {
640 assert_eq!(
641 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
642 b"one", b"two", b"", b"three"
643 ])),
644 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
645 b"one", b"two", b"", b"three"
646 ])),
647 );
648
649 assert_ne!(
650 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
651 b"one", b"two", b"", b"three"
652 ])),
653 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
654 b"one", b"two", b"three"
655 ])),
656 );
657
658 assert_eq!(
659 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
660 b"one", b"two", b"", b"three"
661 ])),
662 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_opt_vec(vec![
663 Some(b"one"),
664 Some(b"two"),
665 Some(b""),
666 Some(b"three")
667 ])),
668 );
669
670 assert_ne!(
671 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
672 b"one", b"two", b"", b"three"
673 ])),
674 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_opt_vec(vec![
675 Some(b"one"),
676 Some(b"two"),
677 None,
678 Some(b"three")
679 ])),
680 );
681
682 assert_eq!(
683 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![b"one", b"two"])),
684 ArrayDigestV0::<Sha3_256>::digest(&FixedSizeBinaryArray::from(
685 ArrayData::builder(DataType::FixedSizeBinary(3))
686 .len(2)
687 .add_buffer(Buffer::from(b"onetwo"))
688 .build()
689 .unwrap()
690 )),
691 );
692
693 assert_eq!(
695 ArrayDigestV0::<Sha3_256>::digest(&BinaryViewArray::from_iter_values(vec![
696 b"one" as &[u8],
697 b"two",
698 b"",
699 b"three"
700 ])),
701 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
702 b"one", b"two", b"", b"three"
703 ])),
704 );
705 }
706}