arrow_array/builder/
fixed_size_binary_dictionary_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, FixedSizeBinaryBuilder, PrimitiveBuilder};
19use crate::types::ArrowDictionaryKeyType;
20use crate::{Array, ArrayRef, DictionaryArray};
21use arrow_buffer::ArrowNativeType;
22use arrow_schema::DataType::FixedSizeBinary;
23use arrow_schema::{ArrowError, DataType};
24use hashbrown::HashTable;
25use std::any::Any;
26use std::sync::Arc;
27
28/// Builder for [`DictionaryArray`] of [`FixedSizeBinaryArray`]
29///
30/// The output array has a dictionary of unique, fixed-size binary values. The
31/// builder handles deduplication.
32///
33/// # Example
34/// ```
35/// # use arrow_array::builder::{FixedSizeBinaryDictionaryBuilder};
36/// # use arrow_array::array::{Array, FixedSizeBinaryArray};
37/// # use arrow_array::DictionaryArray;
38/// # use arrow_array::types::Int8Type;
39/// // Build 3 byte FixedBinaryArrays
40/// let byte_width = 3;
41/// let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
42/// builder.append("abc").unwrap();
43/// builder.append_null();
44/// builder.append(b"def").unwrap();
45/// builder.append(b"def").unwrap(); // duplicate value
46/// // Result is a Dictionary Array
47/// let array = builder.finish();
48/// let dict_array = array.as_any().downcast_ref::<DictionaryArray<Int8Type>>().unwrap();
49/// // The array represents "abc", null, "def", "def"
50/// assert_eq!(array.keys().len(), 4);
51/// // but there are only 2 unique values
52/// assert_eq!(array.values().len(), 2);
53/// let values = dict_array.values().as_any().downcast_ref::<FixedSizeBinaryArray>().unwrap();
54/// assert_eq!(values.value(0), "abc".as_bytes());
55/// assert_eq!(values.value(1), "def".as_bytes());
56/// ```
57///
58/// [`FixedSizeBinaryArray`]: crate::FixedSizeBinaryArray
59#[derive(Debug)]
60pub struct FixedSizeBinaryDictionaryBuilder<K>
61where
62    K: ArrowDictionaryKeyType,
63{
64    state: ahash::RandomState,
65    dedup: HashTable<usize>,
66
67    keys_builder: PrimitiveBuilder<K>,
68    values_builder: FixedSizeBinaryBuilder,
69    byte_width: i32,
70}
71
72impl<K> FixedSizeBinaryDictionaryBuilder<K>
73where
74    K: ArrowDictionaryKeyType,
75{
76    /// Creates a new `FixedSizeBinaryDictionaryBuilder`
77    pub fn new(byte_width: i32) -> Self {
78        let keys_builder = PrimitiveBuilder::new();
79        let values_builder = FixedSizeBinaryBuilder::new(byte_width);
80        Self {
81            state: Default::default(),
82            dedup: HashTable::with_capacity(keys_builder.capacity()),
83            keys_builder,
84            values_builder,
85            byte_width,
86        }
87    }
88
89    /// Creates a new `FixedSizeBinaryDictionaryBuilder` with the provided capacities
90    ///
91    /// `keys_capacity`: the number of keys, i.e. length of array to build
92    /// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
93    /// `byte_width`: the byte width for individual values in the values array
94    pub fn with_capacity(keys_capacity: usize, value_capacity: usize, byte_width: i32) -> Self {
95        Self {
96            state: Default::default(),
97            dedup: Default::default(),
98            keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
99            values_builder: FixedSizeBinaryBuilder::with_capacity(value_capacity, byte_width),
100            byte_width,
101        }
102    }
103}
104
105impl<K> ArrayBuilder for FixedSizeBinaryDictionaryBuilder<K>
106where
107    K: ArrowDictionaryKeyType,
108{
109    /// Returns the builder as an non-mutable `Any` reference.
110    fn as_any(&self) -> &dyn Any {
111        self
112    }
113
114    /// Returns the builder as an mutable `Any` reference.
115    fn as_any_mut(&mut self) -> &mut dyn Any {
116        self
117    }
118
119    /// Returns the boxed builder as a box of `Any`.
120    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
121        self
122    }
123
124    /// Returns the number of array slots in the builder
125    fn len(&self) -> usize {
126        self.keys_builder.len()
127    }
128
129    /// Builds the array and reset this builder.
130    fn finish(&mut self) -> ArrayRef {
131        Arc::new(self.finish())
132    }
133
134    /// Builds the array without resetting the builder.
135    fn finish_cloned(&self) -> ArrayRef {
136        Arc::new(self.finish_cloned())
137    }
138}
139
140impl<K> FixedSizeBinaryDictionaryBuilder<K>
141where
142    K: ArrowDictionaryKeyType,
143{
144    fn get_or_insert_key(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
145        let value_bytes: &[u8] = value.as_ref();
146
147        let state = &self.state;
148        let storage = &mut self.values_builder;
149        let hash = state.hash_one(value_bytes);
150
151        let idx = *self
152            .dedup
153            .entry(
154                hash,
155                |idx| value_bytes == get_bytes(storage, self.byte_width, *idx),
156                |idx| state.hash_one(get_bytes(storage, self.byte_width, *idx)),
157            )
158            .or_insert_with(|| {
159                let idx = storage.len();
160                let _ = storage.append_value(value);
161                idx
162            })
163            .get();
164
165        let key = K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?;
166
167        Ok(key)
168    }
169
170    /// Append a value to the array. Return an existing index
171    /// if already present in the values array or a new index if the
172    /// value is appended to the values array.
173    ///
174    /// Returns an error if the new index would overflow the key type.
175    pub fn append(&mut self, value: impl AsRef<[u8]>) -> Result<K::Native, ArrowError> {
176        if self.byte_width != value.as_ref().len() as i32 {
177            Err(ArrowError::InvalidArgumentError(format!(
178                "Invalid input length passed to FixedSizeBinaryBuilder. Expected {} got {}",
179                self.byte_width,
180                value.as_ref().len()
181            )))
182        } else {
183            let key = self.get_or_insert_key(value)?;
184            self.keys_builder.append_value(key);
185            Ok(key)
186        }
187    }
188
189    /// Appends a null slot into the builder
190    #[inline]
191    pub fn append_null(&mut self) {
192        self.keys_builder.append_null()
193    }
194
195    /// Appends `n` `null`s into the builder.
196    #[inline]
197    pub fn append_nulls(&mut self, n: usize) {
198        self.keys_builder.append_nulls(n);
199    }
200
201    /// Infallibly append a value to this builder
202    ///
203    /// # Panics
204    ///
205    /// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
206    pub fn append_value(&mut self, value: impl AsRef<[u8]>) {
207        self.append(value).expect("dictionary key overflow");
208    }
209
210    /// Builds the `DictionaryArray` and reset this builder.
211    pub fn finish(&mut self) -> DictionaryArray<K> {
212        self.dedup.clear();
213        let values = self.values_builder.finish();
214        let keys = self.keys_builder.finish();
215
216        let data_type = DataType::Dictionary(
217            Box::new(K::DATA_TYPE),
218            Box::new(FixedSizeBinary(self.byte_width)),
219        );
220
221        let builder = keys
222            .into_data()
223            .into_builder()
224            .data_type(data_type)
225            .child_data(vec![values.into_data()]);
226
227        DictionaryArray::from(unsafe { builder.build_unchecked() })
228    }
229
230    /// Builds the `DictionaryArray` without resetting the builder.
231    pub fn finish_cloned(&self) -> DictionaryArray<K> {
232        let values = self.values_builder.finish_cloned();
233        let keys = self.keys_builder.finish_cloned();
234
235        let data_type = DataType::Dictionary(
236            Box::new(K::DATA_TYPE),
237            Box::new(FixedSizeBinary(self.byte_width)),
238        );
239
240        let builder = keys
241            .into_data()
242            .into_builder()
243            .data_type(data_type)
244            .child_data(vec![values.into_data()]);
245
246        DictionaryArray::from(unsafe { builder.build_unchecked() })
247    }
248}
249
250fn get_bytes(values: &FixedSizeBinaryBuilder, byte_width: i32, idx: usize) -> &[u8] {
251    let values = values.values_slice();
252    let start = idx * byte_width.as_usize();
253    let end = idx * byte_width.as_usize() + byte_width.as_usize();
254    &values[start..end]
255}
256
257#[cfg(test)]
258mod tests {
259    use super::*;
260
261    use crate::types::Int8Type;
262    use crate::{FixedSizeBinaryArray, Int8Array};
263
264    #[test]
265    fn test_fixed_size_dictionary_builder() {
266        let values = ["abc", "def"];
267
268        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
269        assert_eq!(b.append(values[0]).unwrap(), 0);
270        b.append_null();
271        assert_eq!(b.append(values[1]).unwrap(), 1);
272        assert_eq!(b.append(values[1]).unwrap(), 1);
273        assert_eq!(b.append(values[0]).unwrap(), 0);
274        b.append_nulls(2);
275        assert_eq!(b.append(values[0]).unwrap(), 0);
276        let array = b.finish();
277
278        assert_eq!(
279            array.keys(),
280            &Int8Array::from(vec![
281                Some(0),
282                None,
283                Some(1),
284                Some(1),
285                Some(0),
286                None,
287                None,
288                Some(0)
289            ]),
290        );
291
292        // Values are polymorphic and so require a downcast.
293        let ava = array
294            .values()
295            .as_any()
296            .downcast_ref::<FixedSizeBinaryArray>()
297            .unwrap();
298
299        assert_eq!(ava.value(0), values[0].as_bytes());
300        assert_eq!(ava.value(1), values[1].as_bytes());
301    }
302
303    #[test]
304    fn test_fixed_size_dictionary_builder_wrong_size() {
305        let mut b = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
306        let err = b.append(b"too long").unwrap_err().to_string();
307        assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 8");
308        let err = b.append("").unwrap_err().to_string();
309        assert_eq!(err, "Invalid argument error: Invalid input length passed to FixedSizeBinaryBuilder. Expected 3 got 0");
310    }
311
312    #[test]
313    fn test_fixed_size_dictionary_builder_finish_cloned() {
314        let values = ["abc", "def", "ghi"];
315
316        let mut builder = FixedSizeBinaryDictionaryBuilder::<Int8Type>::new(3);
317
318        builder.append(values[0]).unwrap();
319        builder.append_null();
320        builder.append(values[1]).unwrap();
321        builder.append(values[1]).unwrap();
322        builder.append(values[0]).unwrap();
323        let mut array = builder.finish_cloned();
324
325        assert_eq!(
326            array.keys(),
327            &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
328        );
329
330        // Values are polymorphic and so require a downcast.
331        let ava = array
332            .values()
333            .as_any()
334            .downcast_ref::<FixedSizeBinaryArray>()
335            .unwrap();
336
337        assert_eq!(ava.value(0), values[0].as_bytes());
338        assert_eq!(ava.value(1), values[1].as_bytes());
339
340        builder.append(values[0]).unwrap();
341        builder.append(values[2]).unwrap();
342        builder.append(values[1]).unwrap();
343
344        array = builder.finish();
345
346        assert_eq!(
347            array.keys(),
348            &Int8Array::from(vec![
349                Some(0),
350                None,
351                Some(1),
352                Some(1),
353                Some(0),
354                Some(0),
355                Some(2),
356                Some(1)
357            ])
358        );
359
360        // Values are polymorphic and so require a downcast.
361        let ava2 = array
362            .values()
363            .as_any()
364            .downcast_ref::<FixedSizeBinaryArray>()
365            .unwrap();
366
367        assert_eq!(ava2.value(0), values[0].as_bytes());
368        assert_eq!(ava2.value(1), values[1].as_bytes());
369        assert_eq!(ava2.value(2), values[2].as_bytes());
370    }
371}