1use crate::types::GenericStringType;
19use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20use arrow_schema::{ArrowError, DataType};
21
/// A [`GenericByteArray`] specialised to [`GenericStringType`], i.e. an array of
/// UTF-8 strings whose offsets are stored as `OffsetSize`.
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26    #[deprecated(note = "please use `Self::DATA_TYPE` instead")]
28    pub const fn get_data_type() -> DataType {
29        Self::DATA_TYPE
30    }
31
32    pub fn num_chars(&self, i: usize) -> usize {
38        self.value(i).chars().count()
39    }
40
41    pub fn take_iter<'a>(
43        &'a self,
44        indexes: impl Iterator<Item = Option<usize>> + 'a,
45    ) -> impl Iterator<Item = Option<&'a str>> {
46        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
47    }
48
49    pub unsafe fn take_iter_unchecked<'a>(
54        &'a self,
55        indexes: impl Iterator<Item = Option<usize>> + 'a,
56    ) -> impl Iterator<Item = Option<&'a str>> {
57        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
58    }
59
60    pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
63        let (offsets, values, nulls) = v.into_parts();
64        Self::try_new(offsets, values, nulls)
65    }
66}
67
68impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
69    for GenericStringArray<OffsetSize>
70{
71    fn from(v: GenericListArray<OffsetSize>) -> Self {
72        GenericBinaryArray::<OffsetSize>::from(v).into()
73    }
74}
75
76impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
77    for GenericStringArray<OffsetSize>
78{
79    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
80        Self::try_from_binary(v).unwrap()
81    }
82}
83
84impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
85    fn from(v: Vec<Option<&str>>) -> Self {
86        v.into_iter().collect()
87    }
88}
89
90impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
91    fn from(v: Vec<&str>) -> Self {
92        Self::from_iter_values(v)
93    }
94}
95
96impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
97    fn from(v: Vec<Option<String>>) -> Self {
98        v.into_iter().collect()
99    }
100}
101
102impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
103    fn from(v: Vec<String>) -> Self {
104        Self::from_iter_values(v)
105    }
106}
107
/// A [`GenericStringArray`] using `i32` offsets.
///
/// The tests in this file pair it with `DataType::Utf8`.
pub type StringArray = GenericStringArray<i32>;
134
/// A [`GenericStringArray`] using `i64` offsets.
///
/// The tests in this file pair it with `DataType::LargeUtf8`.
pub type LargeStringArray = GenericStringArray<i64>;
161
162#[cfg(test)]
163mod tests {
164    use super::*;
165    use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
166    use crate::types::UInt8Type;
167    use crate::Array;
168    use arrow_buffer::Buffer;
169    use arrow_data::ArrayData;
170    use arrow_schema::Field;
171    use std::sync::Arc;
172
173    #[test]
174    fn test_string_array_from_u8_slice() {
175        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
176
177        let string_array = StringArray::from(values);
179
180        assert_eq!(3, string_array.len());
181        assert_eq!(0, string_array.null_count());
182        assert_eq!("hello", string_array.value(0));
183        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
184        assert_eq!("", string_array.value(1));
185        assert_eq!("", unsafe { string_array.value_unchecked(1) });
186        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
187        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
188            string_array.value_unchecked(2)
189        });
190        assert_eq!(20, string_array.value_length(2)); assert_eq!(8, string_array.num_chars(2));
192        for i in 0..3 {
193            assert!(string_array.is_valid(i));
194            assert!(!string_array.is_null(i));
195        }
196    }
197
198    #[test]
199    #[should_panic(expected = "StringArray expects DataType::Utf8")]
200    fn test_string_array_from_int() {
201        let array = LargeStringArray::from(vec!["a", "b"]);
202        drop(StringArray::from(array.into_data()));
203    }
204
205    #[test]
206    fn test_large_string_array_from_u8_slice() {
207        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
208
209        let string_array = LargeStringArray::from(values);
211
212        assert_eq!(3, string_array.len());
213        assert_eq!(0, string_array.null_count());
214        assert_eq!("hello", string_array.value(0));
215        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
216        assert_eq!("", string_array.value(1));
217        assert_eq!("", unsafe { string_array.value_unchecked(1) });
218        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
219        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
220            string_array.value_unchecked(2)
221        });
222        assert_eq!(5, string_array.value_offsets()[2]);
223        assert_eq!(20, string_array.value_length(2)); assert_eq!(8, string_array.num_chars(2));
225        for i in 0..3 {
226            assert!(string_array.is_valid(i));
227            assert!(!string_array.is_null(i));
228        }
229    }
230
231    #[test]
232    fn test_nested_string_array() {
233        let string_builder = StringBuilder::with_capacity(3, 10);
234        let mut list_of_string_builder = ListBuilder::new(string_builder);
235
236        list_of_string_builder.values().append_value("foo");
237        list_of_string_builder.values().append_value("bar");
238        list_of_string_builder.append(true);
239
240        list_of_string_builder.values().append_value("foobar");
241        list_of_string_builder.append(true);
242        let list_of_strings = list_of_string_builder.finish();
243
244        assert_eq!(list_of_strings.len(), 2);
245
246        let first_slot = list_of_strings.value(0);
247        let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
248        assert_eq!(first_list.len(), 2);
249        assert_eq!(first_list.value(0), "foo");
250        assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
251        assert_eq!(first_list.value(1), "bar");
252        assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
253
254        let second_slot = list_of_strings.value(1);
255        let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
256        assert_eq!(second_list.len(), 1);
257        assert_eq!(second_list.value(0), "foobar");
258        assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
259    }
260
261    #[test]
262    #[should_panic(
263        expected = "Trying to access an element at index 4 from a StringArray of length 3"
264    )]
265    fn test_string_array_get_value_index_out_of_bound() {
266        let values: [u8; 12] = [
267            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
268        ];
269        let offsets: [i32; 4] = [0, 5, 5, 12];
270        let array_data = ArrayData::builder(DataType::Utf8)
271            .len(3)
272            .add_buffer(Buffer::from_slice_ref(offsets))
273            .add_buffer(Buffer::from_slice_ref(values))
274            .build()
275            .unwrap();
276        let string_array = StringArray::from(array_data);
277        string_array.value(4);
278    }
279
280    #[test]
281    fn test_string_array_fmt_debug() {
282        let arr: StringArray = vec!["hello", "arrow"].into();
283        assert_eq!(
284            "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
285            format!("{arr:?}")
286        );
287    }
288
289    #[test]
290    fn test_large_string_array_fmt_debug() {
291        let arr: LargeStringArray = vec!["hello", "arrow"].into();
292        assert_eq!(
293            "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
294            format!("{arr:?}")
295        );
296    }
297
298    #[test]
299    fn test_string_array_from_iter() {
300        let data = [Some("hello"), None, Some("arrow")];
301        let data_vec = data.to_vec();
302        let array1 = StringArray::from(data_vec.clone());
304        let array2: StringArray = data_vec.clone().into_iter().collect();
306        let array3: StringArray = data_vec
308            .into_iter()
309            .map(|x| x.map(|s| s.to_string()))
310            .collect();
311        let array4: StringArray = data.iter().collect::<StringArray>();
313
314        assert_eq!(array1, array2);
315        assert_eq!(array2, array3);
316        assert_eq!(array3, array4);
317    }
318
319    #[test]
320    fn test_string_array_from_iter_values() {
321        let data = ["hello", "hello2"];
322        let array1 = StringArray::from_iter_values(data.iter());
323
324        assert_eq!(array1.value(0), "hello");
325        assert_eq!(array1.value(1), "hello2");
326
327        let data2 = ["goodbye".to_string(), "goodbye2".to_string()];
329        let array2 = StringArray::from_iter_values(data2.iter());
330
331        assert_eq!(array2.value(0), "goodbye");
332        assert_eq!(array2.value(1), "goodbye2");
333    }
334
335    #[test]
336    fn test_string_array_from_unbound_iter() {
337        let string_iter = (0..)
339            .scan(0usize, |pos, i| {
340                if *pos < 10 {
341                    *pos += 1;
342                    Some(Some(format!("value {i}")))
343                } else {
344                    None
346                }
347            })
348            .take(100);
350
351        let (_, upper_size_bound) = string_iter.size_hint();
352        assert_eq!(upper_size_bound, Some(100));
354        let string_array: StringArray = string_iter.collect();
355        assert_eq!(string_array.len(), 10);
357    }
358
359    #[test]
360    fn test_string_array_all_null() {
361        let data: Vec<Option<&str>> = vec![None];
362        let array = StringArray::from(data);
363        array
364            .into_data()
365            .validate_full()
366            .expect("All null array has valid array data");
367    }
368
369    #[test]
370    fn test_large_string_array_all_null() {
371        let data: Vec<Option<&str>> = vec![None];
372        let array = LargeStringArray::from(data);
373        array
374            .into_data()
375            .validate_full()
376            .expect("All null array has valid array data");
377    }
378
379    fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
380        let values = b"HelloArrowAndParquet";
381        let child_data = ArrayData::builder(DataType::UInt8)
383            .len(15)
384            .offset(5)
385            .add_buffer(Buffer::from(&values[..]))
386            .build()
387            .unwrap();
388
389        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
390        let null_buffer = Buffer::from_slice_ref([0b101]);
391        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
392            "item",
393            DataType::UInt8,
394            false,
395        )));
396
397        let array_data = ArrayData::builder(data_type)
399            .len(2)
400            .offset(1)
401            .add_buffer(Buffer::from_slice_ref(offsets))
402            .null_bit_buffer(Some(null_buffer))
403            .add_child_data(child_data)
404            .build()
405            .unwrap();
406        let list_array = GenericListArray::<O>::from(array_data);
407        let string_array = GenericStringArray::<O>::from(list_array);
408
409        assert_eq!(2, string_array.len());
410        assert_eq!(1, string_array.null_count());
411        assert!(string_array.is_null(0));
412        assert!(string_array.is_valid(1));
413        assert_eq!("Parquet", string_array.value(1));
414    }
415
    // Runs the generic list-to-string conversion check with `i32` offsets.
    #[test]
    fn test_string_array_from_list_array() {
        _test_generic_string_array_from_list_array::<i32>();
    }
420
    // Runs the generic list-to-string conversion check with `i64` offsets.
    #[test]
    fn test_large_string_array_from_list_array() {
        _test_generic_string_array_from_list_array::<i64>();
    }
425
426    fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
427        let values = b"HelloArrow";
428        let child_data = ArrayData::builder(DataType::UInt8)
429            .len(10)
430            .add_buffer(Buffer::from(&values[..]))
431            .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
432            .build()
433            .unwrap();
434
435        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
436
437        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
440            "item",
441            DataType::UInt8,
442            true,
443        )));
444
445        let array_data = ArrayData::builder(data_type)
447            .len(2)
448            .add_buffer(Buffer::from_slice_ref(offsets))
449            .add_child_data(child_data)
450            .build()
451            .unwrap();
452        let list_array = GenericListArray::<O>::from(array_data);
453        drop(GenericStringArray::<O>::from(list_array));
454    }
455
    // `i32`-offset run of the child-nulls helper; the conversion must panic
    // with the message below.
    #[test]
    #[should_panic(expected = "The child array cannot contain null values.")]
    fn test_string_array_from_list_array_with_child_nulls_failed() {
        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
    }
461
    // `i64`-offset run of the child-nulls helper; the conversion must panic
    // with the message below.
    #[test]
    #[should_panic(expected = "The child array cannot contain null values.")]
    fn test_large_string_array_from_list_array_with_child_nulls_failed() {
        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
    }
467
468    fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
469        let values = b"HelloArrow";
470        let child_data = ArrayData::builder(DataType::UInt16)
471            .len(5)
472            .add_buffer(Buffer::from(&values[..]))
473            .build()
474            .unwrap();
475
476        let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
477        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(Field::new(
478            "item",
479            DataType::UInt16,
480            false,
481        )));
482
483        let array_data = ArrayData::builder(data_type)
484            .len(2)
485            .add_buffer(Buffer::from_slice_ref(offsets))
486            .add_child_data(child_data)
487            .build()
488            .unwrap();
489        let list_array = GenericListArray::<O>::from(array_data);
490        drop(GenericStringArray::<O>::from(list_array));
491    }
492
    // `i32`-offset run of the wrong-child-type helper; the conversion must
    // panic with the message below.
    #[test]
    #[should_panic(
        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
    )]
    fn test_string_array_from_list_array_wrong_type() {
        _test_generic_string_array_from_list_array_wrong_type::<i32>();
    }
500
    // `i64`-offset run of the wrong-child-type helper; the conversion must
    // panic with the message below.
    #[test]
    #[should_panic(
        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
    )]
    fn test_large_string_array_from_list_array_wrong_type() {
        _test_generic_string_array_from_list_array_wrong_type::<i64>();
    }
508
509    #[test]
510    #[should_panic(
511        expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
512    )]
513    fn test_list_array_utf8_validation() {
514        let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
515        builder.values().append_value(0xFF);
516        builder.append(true);
517        let list = builder.finish();
518        let _ = StringArray::from(list);
519    }
520
521    #[test]
522    fn test_empty_offsets() {
523        let string = StringArray::from(
524            ArrayData::builder(DataType::Utf8)
525                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
526                .build()
527                .unwrap(),
528        );
529        assert_eq!(string.len(), 0);
530        assert_eq!(string.value_offsets(), &[0]);
531
532        let string = LargeStringArray::from(
533            ArrayData::builder(DataType::LargeUtf8)
534                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
535                .build()
536                .unwrap(),
537        );
538        assert_eq!(string.len(), 0);
539        assert_eq!(string.value_offsets(), &[0]);
540    }
541
542    #[test]
543    fn test_into_builder() {
544        let array: StringArray = vec!["hello", "arrow"].into();
545
546        let mut builder = array.into_builder().unwrap();
548
549        builder.append_value("rust");
550
551        let expected: StringArray = vec!["hello", "arrow", "rust"].into();
552        let array = builder.finish();
553        assert_eq!(expected, array);
554    }
555
556    #[test]
557    fn test_into_builder_err() {
558        let array: StringArray = vec!["hello", "arrow"].into();
559
560        let shared_array = array.clone();
562
563        let err_return = array.into_builder().unwrap_err();
564        assert_eq!(&err_return, &shared_array);
565    }
566}