1use crate::ArrayDigest;
2use arrow::{
3 array::{
4 Array, BinaryArray, BinaryViewArray, BooleanArray, FixedSizeBinaryArray,
5 FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericStringArray,
6 LargeBinaryArray, LargeListArray, LargeStringArray, ListArray, OffsetSizeTrait,
7 StringArray, StringViewArray,
8 },
9 buffer::NullBuffer,
10 datatypes::DataType,
11};
12use digest::{Digest, Output, OutputSizeUser};
13
14pub struct ArrayDigestV0<Dig: Digest> {
16 hasher: Dig,
17}
18
19impl<Dig: Digest> OutputSizeUser for ArrayDigestV0<Dig> {
22 type OutputSize = Dig::OutputSize;
23}
24
25impl<Dig: Digest> ArrayDigest for ArrayDigestV0<Dig> {
26 fn digest(array: &dyn Array) -> Output<Dig> {
27 let mut d = Self::new(array.data_type());
28 d.update(array, None);
29 d.finalize()
30 }
31
32 fn new(data_type: &DataType) -> Self {
33 let mut hasher = Dig::new();
34 crate::schema_digest::hash_data_type(data_type, &mut hasher);
35 Self { hasher }
36 }
37
38 fn update(&mut self, array: &dyn Array, parent_null_bitmap: Option<&NullBuffer>) {
39 let array_data = array.to_data();
40 let combined_null_bitmap_val =
41 crate::utils::maybe_combine_null_buffers(parent_null_bitmap, array_data.nulls());
42 let combined_null_bitmap = combined_null_bitmap_val.as_option();
43
44 let data_type = array.data_type();
45
46 #[inline]
47 fn unsupported(data_type: &DataType) -> ! {
48 unimplemented!("Type {} is not yet supported", data_type);
49 }
50
51 match data_type {
52 DataType::Null => unsupported(data_type),
53 DataType::Boolean => self.hash_array_bool(array, combined_null_bitmap),
54 DataType::Int8 | DataType::UInt8 => {
55 self.hash_fixed_size(array, 1, combined_null_bitmap)
56 }
57 DataType::Int16 | DataType::UInt16 => {
58 self.hash_fixed_size(array, 2, combined_null_bitmap)
59 }
60 DataType::Int32 | DataType::UInt32 => {
61 self.hash_fixed_size(array, 4, combined_null_bitmap)
62 }
63 DataType::Int64 | DataType::UInt64 => {
64 self.hash_fixed_size(array, 8, combined_null_bitmap)
65 }
66 DataType::Float16 => self.hash_fixed_size(array, 2, combined_null_bitmap),
67 DataType::Float32 => self.hash_fixed_size(array, 4, combined_null_bitmap),
68 DataType::Float64 => self.hash_fixed_size(array, 8, combined_null_bitmap),
69 DataType::Timestamp(_, _) => self.hash_fixed_size(array, 8, combined_null_bitmap),
70 DataType::Date32 => self.hash_fixed_size(array, 4, combined_null_bitmap),
71 DataType::Date64 => self.hash_fixed_size(array, 8, combined_null_bitmap),
72 DataType::Time32(_) => self.hash_fixed_size(array, 4, combined_null_bitmap),
73 DataType::Time64(_) => self.hash_fixed_size(array, 8, combined_null_bitmap),
74 DataType::Duration(_) => unsupported(data_type),
75 DataType::Interval(_) => unsupported(data_type),
76 DataType::Binary => self.hash_array_binary(
77 array.as_any().downcast_ref::<BinaryArray>().unwrap(),
78 combined_null_bitmap,
79 ),
80 DataType::LargeBinary => self.hash_array_binary(
81 array.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
82 combined_null_bitmap,
83 ),
84 DataType::BinaryView => self.hash_array_binary_view(
85 array.as_any().downcast_ref::<BinaryViewArray>().unwrap(),
86 combined_null_bitmap
87 ),
88 DataType::FixedSizeBinary(size) => {
89 self.hash_array_binary_fixed(
90 array.as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap(),
91 *size as usize,
92 combined_null_bitmap,
93 )
94 },
95 DataType::Utf8 => self.hash_array_string(
96 array.as_any().downcast_ref::<StringArray>().unwrap(),
97 combined_null_bitmap,
98 ),
99 DataType::LargeUtf8 => self.hash_array_string(
100 array.as_any().downcast_ref::<LargeStringArray>().unwrap(),
101 combined_null_bitmap,
102 ),
103 DataType::Utf8View => self.hash_array_string_view(
104 array.as_any().downcast_ref::<StringViewArray>().unwrap(),
105 combined_null_bitmap
106 ),
107 DataType::List(_) => self.hash_array_list(
108 array.as_any().downcast_ref::<ListArray>().unwrap(),
109 combined_null_bitmap,
110 ),
111 DataType::LargeList(_) => self.hash_array_list(
112 array.as_any().downcast_ref::<LargeListArray>().unwrap(),
113 combined_null_bitmap,
114 ),
115 DataType::ListView(_) | DataType::LargeListView(_) => unsupported(data_type),
116 DataType::FixedSizeList(..) => self.hash_array_list_fixed(
117 array.as_any().downcast_ref::<FixedSizeListArray>().unwrap(),
118 combined_null_bitmap,
119 ),
120 DataType::Struct(_) => panic!(
122 "Structs are currently flattened by RecordDigest and cannot be processed by ArrayDigest"
123 ),
124 DataType::Union(_, _) => unsupported(data_type),
125 DataType::Dictionary(..) => unsupported(data_type),
126 DataType::Decimal32(_, _) => self.hash_fixed_size(array, 4, combined_null_bitmap),
127 DataType::Decimal64(_, _) => self.hash_fixed_size(array, 8, combined_null_bitmap),
128 DataType::Decimal128(_, _) => self.hash_fixed_size(array, 16, combined_null_bitmap),
129 DataType::Decimal256(_, _) => self.hash_fixed_size(array, 32, combined_null_bitmap),
130 DataType::Map(..) => unsupported(data_type),
131 DataType::RunEndEncoded(..) => unsupported(data_type),
132 }
133 }
134
135 fn finalize(self) -> Output<Dig> {
136 self.hasher.finalize()
137 }
138}
139
140impl<Dig: Digest> ArrayDigestV0<Dig> {
143 const NULL_MARKER: [u8; 1] = [0];
144
145 fn hash_fixed_size(
146 &mut self,
147 array: &dyn Array,
148 item_size: usize,
149 null_bitmap: Option<&NullBuffer>,
150 ) {
151 let array_data = array.to_data();
152
153 assert_eq!(
155 array_data.buffers().len(),
156 1,
157 "Multiple buffers on a primitive type array"
158 );
159
160 let slice = {
161 let data_start = array_data.offset() * item_size;
162 let data_end = data_start + array_data.len() * item_size;
163 &array_data.buffers()[0].as_slice()[data_start..data_end]
164 };
165
166 match null_bitmap {
167 None => {
168 self.hasher.update(slice);
170 }
171 Some(null_bitmap) => {
172 for i in 0..array.len() {
174 if null_bitmap.is_valid(i) {
175 let pos = i * item_size;
176 self.hasher.update(&slice[pos..pos + item_size]);
177 } else {
178 self.hasher.update(Self::NULL_MARKER);
179 }
180 }
181 }
182 }
183 }
184
185 fn hash_array_bool(&mut self, array: &dyn Array, null_bitmap: Option<&NullBuffer>) {
187 let bool_array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
188
189 match null_bitmap {
190 None => {
191 for i in 0..bool_array.len() {
192 let value = unsafe { bool_array.value_unchecked(i) };
194 self.hasher.update([value as u8 + 1]);
195 }
196 }
197 Some(null_bitmap) => {
198 for i in 0..bool_array.len() {
199 if null_bitmap.is_valid(i) {
200 let value = unsafe { bool_array.value_unchecked(i) };
202 self.hasher.update([value as u8 + 1]);
203 } else {
204 self.hasher.update(Self::NULL_MARKER);
205 }
206 }
207 }
208 }
209 }
210
211 fn hash_array_string<OffsetSize: OffsetSizeTrait>(
212 &mut self,
213 array: &GenericStringArray<OffsetSize>,
214 null_bitmap: Option<&NullBuffer>,
215 ) {
216 match null_bitmap {
217 None => {
218 for i in 0..array.len() {
219 let s = array.value(i);
220 self.hasher.update((s.len() as u64).to_le_bytes());
221 self.hasher.update(s.as_bytes());
222 }
223 }
224 Some(null_bitmap) => {
225 for i in 0..array.len() {
226 if null_bitmap.is_valid(i) {
227 let s = array.value(i);
228 self.hasher.update((s.len() as u64).to_le_bytes());
229 self.hasher.update(s.as_bytes());
230 } else {
231 self.hasher.update(Self::NULL_MARKER);
232 }
233 }
234 }
235 }
236 }
237
238 fn hash_array_string_view(
239 &mut self,
240 array: &StringViewArray,
241 null_bitmap: Option<&NullBuffer>,
242 ) {
243 match null_bitmap {
244 None => {
245 for i in 0..array.len() {
246 let s = array.value(i);
247 self.hasher.update((s.len() as u64).to_le_bytes());
248 self.hasher.update(s.as_bytes());
249 }
250 }
251 Some(null_bitmap) => {
252 for i in 0..array.len() {
253 if null_bitmap.is_valid(i) {
254 let s = array.value(i);
255 self.hasher.update((s.len() as u64).to_le_bytes());
256 self.hasher.update(s.as_bytes());
257 } else {
258 self.hasher.update(Self::NULL_MARKER);
259 }
260 }
261 }
262 }
263 }
264
265 fn hash_array_binary<OffsetSize: OffsetSizeTrait>(
266 &mut self,
267 array: &GenericBinaryArray<OffsetSize>,
268 null_bitmap: Option<&NullBuffer>,
269 ) {
270 match null_bitmap {
271 None => {
272 for i in 0..array.len() {
273 let slice = array.value(i);
274 self.hasher.update((slice.len() as u64).to_le_bytes());
275 self.hasher.update(slice);
276 }
277 }
278 Some(null_bitmap) => {
279 for i in 0..array.len() {
280 if null_bitmap.is_valid(i) {
281 let slice = array.value(i);
282 self.hasher.update((slice.len() as u64).to_le_bytes());
283 self.hasher.update(slice);
284 } else {
285 self.hasher.update(Self::NULL_MARKER);
286 }
287 }
288 }
289 }
290 }
291
292 fn hash_array_binary_view(
293 &mut self,
294 array: &BinaryViewArray,
295 null_bitmap: Option<&NullBuffer>,
296 ) {
297 match null_bitmap {
298 None => {
299 for i in 0..array.len() {
300 let slice = array.value(i);
301 self.hasher.update((slice.len() as u64).to_le_bytes());
302 self.hasher.update(slice);
303 }
304 }
305 Some(null_bitmap) => {
306 for i in 0..array.len() {
307 if null_bitmap.is_valid(i) {
308 let slice = array.value(i);
309 self.hasher.update((slice.len() as u64).to_le_bytes());
310 self.hasher.update(slice);
311 } else {
312 self.hasher.update(Self::NULL_MARKER);
313 }
314 }
315 }
316 }
317 }
318
319 fn hash_array_binary_fixed(
320 &mut self,
321 array: &FixedSizeBinaryArray,
322 size: usize,
323 null_bitmap: Option<&NullBuffer>,
324 ) {
325 match null_bitmap {
326 None => {
327 for i in 0..array.len() {
328 let slice = array.value(i);
329 self.hasher.update((size as u64).to_le_bytes());
330 self.hasher.update(slice);
331 }
332 }
333 Some(null_bitmap) => {
334 for i in 0..array.len() {
335 if null_bitmap.is_valid(i) {
336 let slice = array.value(i);
337 self.hasher.update((size as u64).to_le_bytes());
338 self.hasher.update(slice);
339 } else {
340 self.hasher.update(Self::NULL_MARKER);
341 }
342 }
343 }
344 }
345 }
346
347 fn hash_array_list<Off: OffsetSizeTrait>(
348 &mut self,
349 array: &GenericListArray<Off>,
350 null_bitmap: Option<&NullBuffer>,
351 ) {
352 match null_bitmap {
353 None => {
354 for i in 0..array.len() {
355 let sub_array = array.value(i);
356 self.hasher.update((sub_array.len() as u64).to_le_bytes());
357 self.update(sub_array.as_ref(), None);
358 }
359 }
360 Some(null_bitmap) => {
361 for i in 0..array.len() {
362 if null_bitmap.is_valid(i) {
363 let sub_array = array.value(i);
364 self.hasher.update((sub_array.len() as u64).to_le_bytes());
365 self.update(sub_array.as_ref(), None);
366 } else {
367 self.hasher.update(Self::NULL_MARKER);
368 }
369 }
370 }
371 }
372 }
373
374 fn hash_array_list_fixed(
375 &mut self,
376 array: &FixedSizeListArray,
377 null_bitmap: Option<&NullBuffer>,
378 ) {
379 match null_bitmap {
380 None => {
381 for i in 0..array.len() {
382 let sub_array = array.value(i);
383 self.hasher.update((sub_array.len() as u64).to_le_bytes());
384 self.update(sub_array.as_ref(), None);
385 }
386 }
387 Some(null_bitmap) => {
388 for i in 0..array.len() {
389 if null_bitmap.is_valid(i) {
390 let sub_array = array.value(i);
391 self.hasher.update((sub_array.len() as u64).to_le_bytes());
392 self.update(sub_array.as_ref(), None);
393 } else {
394 self.hasher.update(Self::NULL_MARKER);
395 }
396 }
397 }
398 }
399 }
400}
401
402#[cfg(test)]
407mod tests {
408 use super::*;
409 use arrow::{
410 array::{
411 ArrayData, BinaryArray, BooleanArray, FixedSizeBinaryArray, Int32Array, StringArray,
412 UInt32Array,
413 },
414 buffer::Buffer,
415 datatypes::Int32Type,
416 };
417 use sha3::Sha3_256;
418
419 #[test]
420 fn test_ints() {
421 assert_eq!(
422 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
423 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
424 );
425
426 assert_ne!(
427 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3]),),
428 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![
429 Some(1),
430 Some(2),
431 None,
432 Some(3)
433 ]),),
434 );
435 }
436
437 #[test]
438 fn test_int_array() {
439 assert_eq!(
440 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
441 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
442 );
443
444 assert_ne!(
445 ArrayDigestV0::<Sha3_256>::digest(&Int32Array::from(vec![1, 2, 3])),
446 ArrayDigestV0::<Sha3_256>::digest(&UInt32Array::from(vec![1, 2, 3])),
447 );
448 }
449
450 #[test]
451 fn test_bool_array() {
452 fn make_bool_array(data: Vec<u8>, len: usize, nulls: Option<Vec<u8>>) -> BooleanArray {
453 let builder = ArrayData::builder(DataType::Boolean)
454 .len(len)
455 .add_buffer(Buffer::from_vec(data));
456
457 let builder = if let Some(nulls) = nulls {
458 builder.null_bit_buffer(Some(Buffer::from_vec(nulls)))
459 } else {
460 builder
461 };
462
463 BooleanArray::from(builder.build().unwrap())
464 }
465
466 let array1 = make_bool_array(vec![0b0011_0101u8], 6, None);
467 let array2 = make_bool_array(vec![0b1111_0101u8], 6, None);
468
469 assert_eq!(
470 ArrayDigestV0::<Sha3_256>::digest(&array1),
471 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
472 true, false, true, false, true, true
473 ])),
474 );
475
476 assert_ne!(
477 ArrayDigestV0::<Sha3_256>::digest(&array1),
478 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
479 true, false, true, false, true, false
480 ])),
481 );
482
483 assert_ne!(
484 ArrayDigestV0::<Sha3_256>::digest(&array1),
485 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
486 false, false, true, false, true, true
487 ])),
488 );
489
490 assert_eq!(
491 ArrayDigestV0::<Sha3_256>::digest(&array1),
492 ArrayDigestV0::<Sha3_256>::digest(&array2),
493 );
494
495 let array3 = make_bool_array(vec![0b1111_0101u8], 6, Some(vec![0b1110_1110]));
496
497 assert_eq!(
498 ArrayDigestV0::<Sha3_256>::digest(&array3),
499 ArrayDigestV0::<Sha3_256>::digest(&BooleanArray::from(vec![
500 None,
501 Some(false),
502 Some(true),
503 Some(false),
504 None,
505 Some(true)
506 ])),
507 );
508 }
509
510 #[test]
511 fn test_string_array() {
512 assert_eq!(
513 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
514 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
515 );
516
517 assert_eq!(
518 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
519 ArrayDigestV0::<Sha3_256>::digest(&LargeStringArray::from(vec!["foo", "bar", "baz"])),
520 );
521
522 assert_ne!(
523 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
524 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "", "baz"])),
525 );
526
527 assert_ne!(
528 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec![
529 Some("foo"),
530 Some("bar"),
531 Some("baz")
532 ]),),
533 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec![
534 Some("foo"),
535 Some("bar"),
536 None,
537 Some("baz")
538 ]),),
539 );
540
541 assert_eq!(
543 ArrayDigestV0::<Sha3_256>::digest(&StringViewArray::from_iter_values(vec![
544 "foo", "bar", "baz"
545 ])),
546 ArrayDigestV0::<Sha3_256>::digest(&StringArray::from(vec!["foo", "bar", "baz"])),
547 );
548 }
549
550 #[test]
551 fn test_list_array() {
552 assert_eq!(
578 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
579 vec![
580 Some(vec![Some(0), Some(1), Some(2)]),
581 None,
582 Some(vec![Some(3), None, Some(4), Some(5)]),
583 Some(vec![Some(6), Some(7)]),
584 ]
585 )),
586 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
587 vec![
588 Some(vec![Some(0), Some(1), Some(2)]),
589 None,
590 Some(vec![Some(3), None, Some(4), Some(5)]),
591 Some(vec![Some(6), Some(7)]),
592 ]
593 )),
594 );
595
596 assert_ne!(
598 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
599 vec![
600 Some(vec![Some(0), Some(1), Some(2)]),
601 None,
602 Some(vec![Some(3), None, Some(4), Some(5)]),
603 Some(vec![Some(6), Some(7)]),
604 ]
605 )),
606 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
607 vec![
608 Some(vec![Some(0), Some(1), Some(2)]),
609 None,
610 Some(vec![Some(3), None, Some(4), Some(100)]),
611 Some(vec![Some(6), Some(7)]),
612 ]
613 )),
614 );
615
616 assert_ne!(
618 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
619 vec![
620 Some(vec![Some(0), Some(1), Some(2)]),
621 None,
622 Some(vec![Some(3), None, Some(4), Some(5)]),
623 Some(vec![Some(6), Some(7)]),
624 ]
625 )),
626 ArrayDigestV0::<Sha3_256>::digest(&ListArray::from_iter_primitive::<Int32Type, _, _>(
627 vec![
628 Some(vec![Some(0), Some(1), Some(2)]),
629 None,
630 Some(vec![Some(3), None, Some(4)]),
631 Some(vec![Some(5), Some(6), Some(7)]),
632 ]
633 )),
634 );
635 }
636
637 #[test]
638 fn test_binary_array() {
639 assert_eq!(
640 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
641 b"one", b"two", b"", b"three"
642 ])),
643 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
644 b"one", b"two", b"", b"three"
645 ])),
646 );
647
648 assert_ne!(
649 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
650 b"one", b"two", b"", b"three"
651 ])),
652 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
653 b"one", b"two", b"three"
654 ])),
655 );
656
657 assert_eq!(
658 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
659 b"one", b"two", b"", b"three"
660 ])),
661 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_opt_vec(vec![
662 Some(b"one"),
663 Some(b"two"),
664 Some(b""),
665 Some(b"three")
666 ])),
667 );
668
669 assert_ne!(
670 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
671 b"one", b"two", b"", b"three"
672 ])),
673 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_opt_vec(vec![
674 Some(b"one"),
675 Some(b"two"),
676 None,
677 Some(b"three")
678 ])),
679 );
680
681 assert_eq!(
682 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![b"one", b"two"])),
683 ArrayDigestV0::<Sha3_256>::digest(&FixedSizeBinaryArray::from(
684 ArrayData::builder(DataType::FixedSizeBinary(3))
685 .len(2)
686 .add_buffer(Buffer::from(b"onetwo"))
687 .build()
688 .unwrap()
689 )),
690 );
691
692 assert_eq!(
694 ArrayDigestV0::<Sha3_256>::digest(&BinaryViewArray::from_iter_values(vec![
695 b"one" as &[u8],
696 b"two",
697 b"",
698 b"three"
699 ])),
700 ArrayDigestV0::<Sha3_256>::digest(&BinaryArray::from_vec(vec![
701 b"one", b"two", b"", b"three"
702 ])),
703 );
704 }
705}