1use arrow::array::{
21 Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray,
22 Int8Array, Int16Array, Int32Array, Int64Array, LargeStringArray, ListBuilder,
23 OffsetSizeTrait, StringArray, StringBuilder, UInt8Array, UInt16Array, UInt32Array,
24 UInt64Array,
25};
26use arrow::datatypes::{DataType, Field};
27
28use datafusion_common::utils::ListCoercion;
29use datafusion_common::{DataFusionError, Result, not_impl_err};
30
31use std::any::Any;
32use std::fmt::Write;
33
34use crate::utils::make_scalar_function;
35use arrow::array::{
36 GenericStringArray, StringArrayType, StringViewArray,
37 builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder},
38 cast::AsArray,
39};
40use arrow::compute::{can_cast_types, cast};
41use arrow::datatypes::DataType::{
42 Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View,
43};
44use datafusion_common::cast::{
45 as_fixed_size_list_array, as_large_list_array, as_list_array,
46};
47use datafusion_common::exec_err;
48use datafusion_common::types::logical_string;
49use datafusion_expr::{
50 ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue,
51 Documentation, ScalarUDFImpl, Signature, TypeSignature, TypeSignatureClass,
52 Volatility,
53};
54use datafusion_functions::downcast_arg;
55use datafusion_macros::user_doc;
56use std::sync::Arc;
57
// Registers `ArrayToString` through the project's `make_udf_expr_and_func!` macro.
// NOTE(review): the macro is defined outside this file; the per-argument roles noted
// below are inferred from position and naming only -- confirm against the macro.
make_udf_expr_and_func!(
    ArrayToString,   // UDF implementation type (declared below)
    array_to_string, // presumably the name of the generated expression helper
    array delimiter, // presumably the helper's argument names
    "converts each element to its text representation.", // presumably the helper's doc text
    array_to_string_udf // presumably the name of the generated UDF accessor
);
66
// The `user_doc` attribute carries the user-facing documentation for this function;
// `ScalarUDFImpl::documentation` below exposes it through `self.doc()`.
#[user_doc(
    doc_section(label = "Array Functions"),
    description = "Converts each element to its text representation.",
    syntax_example = "array_to_string(array, delimiter[, null_string])",
    sql_example = r#"```sql
> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ',');
+----------------------------------------------------+
| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) |
+----------------------------------------------------+
| 1,2,3,4,5,6,7,8 |
+----------------------------------------------------+
```"#,
    argument(
        name = "array",
        description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
    ),
    argument(name = "delimiter", description = "Array element separator."),
    argument(
        name = "null_string",
        description = "Optional. String to use for null values in the output. If not provided, nulls will be omitted."
    )
)]
#[derive(Debug, PartialEq, Eq, Hash)]
/// Scalar UDF implementation behind `array_to_string` and its aliases.
pub struct ArrayToString {
    /// Accepted argument shapes; handed out by `ScalarUDFImpl::signature`.
    signature: Signature,
    /// Alternative function names; handed out by `ScalarUDFImpl::aliases`.
    aliases: Vec<String>,
}
94
95impl Default for ArrayToString {
96 fn default() -> Self {
97 Self::new()
98 }
99}
100
101impl ArrayToString {
102 pub fn new() -> Self {
103 Self {
104 signature: Signature::one_of(
105 vec![
106 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
107 arguments: vec![
108 ArrayFunctionArgument::Array,
109 ArrayFunctionArgument::String,
110 ArrayFunctionArgument::String,
111 ],
112 array_coercion: Some(ListCoercion::FixedSizedListToList),
113 }),
114 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
115 arguments: vec![
116 ArrayFunctionArgument::Array,
117 ArrayFunctionArgument::String,
118 ],
119 array_coercion: Some(ListCoercion::FixedSizedListToList),
120 }),
121 ],
122 Volatility::Immutable,
123 ),
124 aliases: vec![
125 String::from("list_to_string"),
126 String::from("array_join"),
127 String::from("list_join"),
128 ],
129 }
130 }
131}
132
133impl ScalarUDFImpl for ArrayToString {
134 fn as_any(&self) -> &dyn Any {
135 self
136 }
137
138 fn name(&self) -> &str {
139 "array_to_string"
140 }
141
142 fn signature(&self) -> &Signature {
143 &self.signature
144 }
145
146 fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
147 Ok(Utf8)
148 }
149
150 fn invoke_with_args(
151 &self,
152 args: datafusion_expr::ScalarFunctionArgs,
153 ) -> Result<ColumnarValue> {
154 make_scalar_function(array_to_string_inner)(&args.args)
155 }
156
157 fn aliases(&self) -> &[String] {
158 &self.aliases
159 }
160
161 fn documentation(&self) -> Option<&Documentation> {
162 self.doc()
163 }
164}
165
// Registers `StringToArray` through the project's `make_udf_expr_and_func!` macro.
// NOTE(review): the macro is defined outside this file; the per-argument roles noted
// below are inferred from position and naming only -- confirm against the macro.
make_udf_expr_and_func!(
    StringToArray,   // UDF implementation type (declared below)
    string_to_array, // presumably the name of the generated expression helper
    string delimiter null_string, // presumably the helper's argument names
    "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", // presumably the helper's doc text
    string_to_array_udf // presumably the name of the generated UDF accessor
);
173
// The `user_doc` attribute carries the user-facing documentation for this function;
// `ScalarUDFImpl::documentation` below exposes it through `self.doc()`.
#[user_doc(
    doc_section(label = "Array Functions"),
    description = "Splits a string into an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.",
    syntax_example = "string_to_array(str, delimiter[, null_str])",
    sql_example = r#"```sql
> select string_to_array('abc##def', '##');
+-----------------------------------+
| string_to_array(Utf8('abc##def')) |
+-----------------------------------+
| ['abc', 'def'] |
+-----------------------------------+
> select string_to_array('abc def', ' ', 'def');
+---------------------------------------------+
| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) |
+---------------------------------------------+
| ['abc', NULL] |
+---------------------------------------------+
```"#,
    argument(name = "str", description = "String expression to split."),
    argument(name = "delimiter", description = "Delimiter string to split on."),
    argument(
        name = "null_str",
        description = "Substring values to be replaced with `NULL`."
    )
)]
#[derive(Debug, PartialEq, Eq, Hash)]
/// Scalar UDF implementation behind `string_to_array` and its alias.
pub(super) struct StringToArray {
    /// Accepted argument shapes; handed out by `ScalarUDFImpl::signature`.
    signature: Signature,
    /// Alternative function names; handed out by `ScalarUDFImpl::aliases`.
    aliases: Vec<String>,
}
204
205impl StringToArray {
206 pub fn new() -> Self {
207 Self {
208 signature: Signature::one_of(
209 vec![
210 TypeSignature::Coercible(vec![
211 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
212 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
213 ]),
214 TypeSignature::Coercible(vec![
215 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
216 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
217 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
218 ]),
219 ],
220 Volatility::Immutable,
221 ),
222 aliases: vec![String::from("string_to_list")],
223 }
224 }
225}
226
227impl ScalarUDFImpl for StringToArray {
228 fn as_any(&self) -> &dyn Any {
229 self
230 }
231
232 fn name(&self) -> &str {
233 "string_to_array"
234 }
235
236 fn signature(&self) -> &Signature {
237 &self.signature
238 }
239
240 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
241 Ok(List(Arc::new(Field::new_list_field(
242 arg_types[0].clone(),
243 true,
244 ))))
245 }
246
247 fn invoke_with_args(
248 &self,
249 args: datafusion_expr::ScalarFunctionArgs,
250 ) -> Result<ColumnarValue> {
251 let args = &args.args;
252 match args[0].data_type() {
253 Utf8 | Utf8View => make_scalar_function(string_to_array_inner::<i32>)(args),
254 LargeUtf8 => make_scalar_function(string_to_array_inner::<i64>)(args),
255 other => {
256 exec_err!("unsupported type for string_to_array function as {other:?}")
257 }
258 }
259 }
260
261 fn aliases(&self) -> &[String] {
262 &self.aliases
263 }
264
265 fn documentation(&self) -> Option<&Documentation> {
266 self.doc()
267 }
268}
269
270fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
271 if args.len() < 2 || args.len() > 3 {
272 return exec_err!("array_to_string expects two or three arguments");
273 }
274
275 let arr = &args[0];
276
277 let delimiters: Vec<Option<&str>> = match args[1].data_type() {
278 Utf8 => args[1].as_string::<i32>().iter().collect(),
279 Utf8View => args[1].as_string_view().iter().collect(),
280 LargeUtf8 => args[1].as_string::<i64>().iter().collect(),
281 other => {
282 return exec_err!(
283 "unsupported type for second argument to array_to_string function as {other:?}"
284 );
285 }
286 };
287
288 let null_strings: Vec<Option<&str>> = if args.len() == 3 {
289 match args[2].data_type() {
290 Utf8 => args[2].as_string::<i32>().iter().collect(),
291 Utf8View => args[2].as_string_view().iter().collect(),
292 LargeUtf8 => args[2].as_string::<i64>().iter().collect(),
293 other => {
294 return exec_err!(
295 "unsupported type for third argument to array_to_string function as {other:?}"
296 );
297 }
298 }
299 } else {
300 vec![None; args[0].len()]
303 };
304
305 let string_arr = match arr.data_type() {
306 List(_) => {
307 let list_array = as_list_array(&arr)?;
308 generate_string_array::<i32>(list_array, &delimiters, &null_strings)?
309 }
310 LargeList(_) => {
311 let list_array = as_large_list_array(&arr)?;
312 generate_string_array::<i64>(list_array, &delimiters, &null_strings)?
313 }
314 _ => return exec_err!("array_to_string expects list as first argument"),
316 };
317
318 Ok(Arc::new(string_arr))
319}
320
321fn generate_string_array<O: OffsetSizeTrait>(
322 list_arr: &GenericListArray<O>,
323 delimiters: &[Option<&str>],
324 null_strings: &[Option<&str>],
325) -> Result<StringArray> {
326 let mut builder = StringBuilder::with_capacity(list_arr.len(), 0);
327 let mut buf = String::new();
328
329 for ((arr, &delimiter), &null_string) in list_arr
330 .iter()
331 .zip(delimiters.iter())
332 .zip(null_strings.iter())
333 {
334 let (Some(arr), Some(delimiter)) = (arr, delimiter) else {
335 builder.append_null();
336 continue;
337 };
338
339 buf.clear();
340 let mut first = true;
341 compute_array_to_string(&mut buf, &arr, delimiter, null_string, &mut first)?;
342 builder.append_value(&buf);
343 }
344
345 Ok(builder.finish())
346}
347
/// Appends the text form of every element of `arr` to `buf`, separated by
/// `delimiter`.
///
/// List-typed arrays (`List`, `LargeList`, `FixedSizeList`) are walked
/// recursively, so nested lists are flattened into one delimited sequence.
/// Dictionary arrays are cast to their value type first. Leaf values are
/// written by `write_leaf_to_string`.
///
/// * `null_string` - text written for NULL elements; when `None`, NULLs are
///   skipped without emitting a delimiter.
/// * `first` - shared across the whole recursion; `true` until the first piece
///   of text has been written, which is how the function knows whether a
///   delimiter must precede the next piece.
///
/// # Errors
/// Propagates downcast and cast failures, and returns a not-implemented error
/// for leaf types that are neither handled directly nor castable to `Utf8`.
fn compute_array_to_string(
    buf: &mut String,
    arr: &ArrayRef,
    delimiter: &str,
    null_string: Option<&str>,
    first: &mut bool,
) -> Result<()> {
    // Walks one list-like array: recurse into each non-null child array, and
    // write `null_string` (if any) for a NULL child.
    macro_rules! handle_list {
        ($list_array:expr) => {
            for i in 0..$list_array.len() {
                if !$list_array.is_null(i) {
                    compute_array_to_string(
                        buf,
                        &$list_array.value(i),
                        delimiter,
                        null_string,
                        first,
                    )?;
                } else if let Some(ns) = null_string {
                    // Same separator rule as for leaf values.
                    if *first {
                        *first = false;
                    } else {
                        buf.push_str(delimiter);
                    }
                    buf.push_str(ns);
                }
            }
        };
    }

    match arr.data_type() {
        List(..) => {
            let list_array = as_list_array(arr)?;
            handle_list!(list_array);
            Ok(())
        }
        FixedSizeList(..) => {
            let list_array = as_fixed_size_list_array(arr)?;
            handle_list!(list_array);
            Ok(())
        }
        LargeList(..) => {
            let list_array = as_large_list_array(arr)?;
            handle_list!(list_array);
            Ok(())
        }
        Dictionary(_key_type, value_type) => {
            // Materialize the dictionary as its value type, then handle it like
            // any other array.
            let values = cast(arr, value_type.as_ref()).map_err(|e| {
                DataFusionError::from(e)
                    .context("Casting dictionary to values in compute_array_to_string")
            })?;
            compute_array_to_string(buf, &values, delimiter, null_string, first)
        }
        // An array of `Null` type writes nothing, even when `null_string` is set.
        Null => Ok(()),
        data_type => {
            // Each `*_leaf!` macro downcasts `arr` to the named array type and
            // hands it to `write_leaf_to_string` with a type-specific formatter.
            macro_rules! str_leaf {
                ($ARRAY_TYPE:ident) => {
                    write_leaf_to_string(
                        buf,
                        downcast_arg!(arr, $ARRAY_TYPE),
                        delimiter,
                        null_string,
                        first,
                        |buf, x: &str| buf.push_str(x),
                    )
                };
            }
            macro_rules! bool_leaf {
                ($ARRAY_TYPE:ident) => {
                    write_leaf_to_string(
                        buf,
                        downcast_arg!(arr, $ARRAY_TYPE),
                        delimiter,
                        null_string,
                        first,
                        |buf, x: bool| {
                            if x {
                                buf.push_str("true");
                            } else {
                                buf.push_str("false");
                            }
                        },
                    )
                };
            }
            macro_rules! int_leaf {
                ($ARRAY_TYPE:ident) => {
                    write_leaf_to_string(
                        buf,
                        downcast_arg!(arr, $ARRAY_TYPE),
                        delimiter,
                        null_string,
                        first,
                        |buf, x| {
                            // Integers are formatted through an `itoa` buffer.
                            let mut itoa_buf = itoa::Buffer::new();
                            buf.push_str(itoa_buf.format(x));
                        },
                    )
                };
            }
            macro_rules! float_leaf {
                ($ARRAY_TYPE:ident) => {
                    write_leaf_to_string(
                        buf,
                        downcast_arg!(arr, $ARRAY_TYPE),
                        delimiter,
                        null_string,
                        first,
                        |buf, x| {
                            // `buf` is a `String`, whose `fmt::Write` impl does not
                            // report errors, so this `unwrap` is not expected to fire.
                            write!(buf, "{}", x).unwrap();
                        },
                    )
                };
            }
            match data_type {
                Utf8 => str_leaf!(StringArray),
                Utf8View => str_leaf!(StringViewArray),
                LargeUtf8 => str_leaf!(LargeStringArray),
                DataType::Boolean => bool_leaf!(BooleanArray),
                DataType::Float32 => float_leaf!(Float32Array),
                DataType::Float64 => float_leaf!(Float64Array),
                DataType::Int8 => int_leaf!(Int8Array),
                DataType::Int16 => int_leaf!(Int16Array),
                DataType::Int32 => int_leaf!(Int32Array),
                DataType::Int64 => int_leaf!(Int64Array),
                DataType::UInt8 => int_leaf!(UInt8Array),
                DataType::UInt16 => int_leaf!(UInt16Array),
                DataType::UInt32 => int_leaf!(UInt32Array),
                DataType::UInt64 => int_leaf!(UInt64Array),
                // Fallback: any other type that arrow can cast to `Utf8` is cast
                // and then re-enters this function as a string leaf.
                data_type if can_cast_types(data_type, &Utf8) => {
                    let str_arr = cast(arr, &Utf8).map_err(|e| {
                        DataFusionError::from(e)
                            .context("Casting to string in array_to_string")
                    })?;
                    return compute_array_to_string(
                        buf,
                        &str_arr,
                        delimiter,
                        null_string,
                        first,
                    );
                }
                data_type => {
                    return not_impl_err!(
                        "Unsupported data type in array_to_string: {data_type}"
                    );
                }
            }
            Ok(())
        }
    }
}
507
/// Appends every element of `arr` to `buf`, separated by `delimiter`.
///
/// * `Some(value)` elements are rendered with `append`.
/// * `None` elements are rendered as `null_string` when one is given and are
///   skipped entirely (no delimiter, no text) otherwise.
/// * `first` is `true` while nothing has been written yet. The first emitted
///   piece clears it instead of writing a delimiter, so the flag can be
///   carried across several calls that build one string.
fn write_leaf_to_string<'a, A, T>(
    buf: &mut String,
    arr: &'a A,
    delimiter: &str,
    null_string: Option<&str>,
    first: &mut bool,
    append: impl Fn(&mut String, T),
) where
    &'a A: IntoIterator<Item = Option<T>>,
{
    /// Writes the delimiter before every emitted piece except the very first.
    fn push_separator(buf: &mut String, delimiter: &str, first: &mut bool) {
        if *first {
            *first = false;
        } else {
            buf.push_str(delimiter);
        }
    }

    for element in arr {
        match (element, null_string) {
            (Some(value), _) => {
                push_separator(buf, delimiter, first);
                append(buf, value);
            }
            (None, Some(placeholder)) => {
                push_separator(buf, delimiter, first);
                buf.push_str(placeholder);
            }
            // NULL without replacement text leaves no trace in the output.
            (None, None) => {}
        }
    }
}
540
541fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
545 if args.len() < 2 || args.len() > 3 {
546 return exec_err!("string_to_array expects two or three arguments");
547 }
548
549 match args[0].data_type() {
550 Utf8 => {
551 let string_array = args[0].as_string::<T>();
552 let builder = StringBuilder::with_capacity(
553 string_array.len(),
554 string_array.get_buffer_memory_size(),
555 );
556 string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(
557 args,
558 &string_array,
559 builder,
560 )
561 }
562 Utf8View => {
563 let string_array = args[0].as_string_view();
564 let builder = StringViewBuilder::with_capacity(string_array.len());
565 string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(
566 args,
567 &string_array,
568 builder,
569 )
570 }
571 LargeUtf8 => {
572 let string_array = args[0].as_string::<T>();
573 let builder = LargeStringBuilder::with_capacity(
574 string_array.len(),
575 string_array.get_buffer_memory_size(),
576 );
577 string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(
578 args,
579 &string_array,
580 builder,
581 )
582 }
583 other => exec_err!(
584 "unsupported type for first argument to string_to_array function as {other:?}"
585 ),
586 }
587}
588
589fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>(
590 args: &'a [ArrayRef],
591 string_array: &StringArrType,
592 string_builder: StringBuilderType,
593) -> Result<ArrayRef>
594where
595 StringArrType: StringArrayType<'a>,
596 StringBuilderType: StringArrayBuilderType,
597{
598 match args[1].data_type() {
599 Utf8 => {
600 let delimiter_array = args[1].as_string::<i32>();
601 if args.len() == 2 {
602 string_to_array_impl::<
603 StringArrType,
604 &GenericStringArray<i32>,
605 &StringViewArray,
606 StringBuilderType,
607 >(string_array, &delimiter_array, None, string_builder)
608 } else {
609 string_to_array_inner_3::<
610 StringArrType,
611 &GenericStringArray<i32>,
612 StringBuilderType,
613 >(args, string_array, &delimiter_array, string_builder)
614 }
615 }
616 Utf8View => {
617 let delimiter_array = args[1].as_string_view();
618
619 if args.len() == 2 {
620 string_to_array_impl::<
621 StringArrType,
622 &StringViewArray,
623 &StringViewArray,
624 StringBuilderType,
625 >(string_array, &delimiter_array, None, string_builder)
626 } else {
627 string_to_array_inner_3::<
628 StringArrType,
629 &StringViewArray,
630 StringBuilderType,
631 >(args, string_array, &delimiter_array, string_builder)
632 }
633 }
634 LargeUtf8 => {
635 let delimiter_array = args[1].as_string::<i64>();
636 if args.len() == 2 {
637 string_to_array_impl::<
638 StringArrType,
639 &GenericStringArray<i64>,
640 &StringViewArray,
641 StringBuilderType,
642 >(string_array, &delimiter_array, None, string_builder)
643 } else {
644 string_to_array_inner_3::<
645 StringArrType,
646 &GenericStringArray<i64>,
647 StringBuilderType,
648 >(args, string_array, &delimiter_array, string_builder)
649 }
650 }
651 other => exec_err!(
652 "unsupported type for second argument to string_to_array function as {other:?}"
653 ),
654 }
655}
656
657fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>(
658 args: &'a [ArrayRef],
659 string_array: &StringArrType,
660 delimiter_array: &DelimiterArrType,
661 string_builder: StringBuilderType,
662) -> Result<ArrayRef>
663where
664 StringArrType: StringArrayType<'a>,
665 DelimiterArrType: StringArrayType<'a>,
666 StringBuilderType: StringArrayBuilderType,
667{
668 match args[2].data_type() {
669 Utf8 => {
670 let null_type_array = Some(args[2].as_string::<i32>());
671 string_to_array_impl::<
672 StringArrType,
673 DelimiterArrType,
674 &GenericStringArray<i32>,
675 StringBuilderType,
676 >(
677 string_array,
678 delimiter_array,
679 null_type_array,
680 string_builder,
681 )
682 }
683 Utf8View => {
684 let null_type_array = Some(args[2].as_string_view());
685 string_to_array_impl::<
686 StringArrType,
687 DelimiterArrType,
688 &StringViewArray,
689 StringBuilderType,
690 >(
691 string_array,
692 delimiter_array,
693 null_type_array,
694 string_builder,
695 )
696 }
697 LargeUtf8 => {
698 let null_type_array = Some(args[2].as_string::<i64>());
699 string_to_array_impl::<
700 StringArrType,
701 DelimiterArrType,
702 &GenericStringArray<i64>,
703 StringBuilderType,
704 >(
705 string_array,
706 delimiter_array,
707 null_type_array,
708 string_builder,
709 )
710 }
711 other => {
712 exec_err!("unsupported type for string_to_array function as {other:?}")
713 }
714 }
715}
716
717fn string_to_array_impl<
718 'a,
719 StringArrType,
720 DelimiterArrType,
721 NullValueArrType,
722 StringBuilderType,
723>(
724 string_array: &StringArrType,
725 delimiter_array: &DelimiterArrType,
726 null_value_array: Option<NullValueArrType>,
727 string_builder: StringBuilderType,
728) -> Result<ArrayRef>
729where
730 StringArrType: StringArrayType<'a>,
731 DelimiterArrType: StringArrayType<'a>,
732 NullValueArrType: StringArrayType<'a>,
733 StringBuilderType: StringArrayBuilderType,
734{
735 let mut list_builder = ListBuilder::new(string_builder);
736
737 match null_value_array {
738 None => {
739 string_array.iter().zip(delimiter_array.iter()).for_each(
740 |(string, delimiter)| {
741 match (string, delimiter) {
742 (Some(string), Some("")) => {
743 list_builder.values().append_value(string);
744 list_builder.append(true);
745 }
746 (Some(string), Some(delimiter)) => {
747 string.split(delimiter).for_each(|s| {
748 list_builder.values().append_value(s);
749 });
750 list_builder.append(true);
751 }
752 (Some(string), None) => {
753 string.chars().map(|c| c.to_string()).for_each(|c| {
754 list_builder.values().append_value(c.as_str());
755 });
756 list_builder.append(true);
757 }
758 _ => list_builder.append(false), }
760 },
761 )
762 }
763 Some(null_value_array) => string_array
764 .iter()
765 .zip(delimiter_array.iter())
766 .zip(null_value_array.iter())
767 .for_each(|((string, delimiter), null_value)| {
768 match (string, delimiter) {
769 (Some(string), Some("")) => {
770 if Some(string) == null_value {
771 list_builder.values().append_null();
772 } else {
773 list_builder.values().append_value(string);
774 }
775 list_builder.append(true);
776 }
777 (Some(string), Some(delimiter)) => {
778 string.split(delimiter).for_each(|s| {
779 if Some(s) == null_value {
780 list_builder.values().append_null();
781 } else {
782 list_builder.values().append_value(s);
783 }
784 });
785 list_builder.append(true);
786 }
787 (Some(string), None) => {
788 string.chars().map(|c| c.to_string()).for_each(|c| {
789 if Some(c.as_str()) == null_value {
790 list_builder.values().append_null();
791 } else {
792 list_builder.values().append_value(c.as_str());
793 }
794 });
795 list_builder.append(true);
796 }
797 _ => list_builder.append(false), }
799 }),
800 };
801
802 let list_array = list_builder.finish();
803 Ok(Arc::new(list_array) as ArrayRef)
804}
805
/// Minimal common interface over the string builders used for the list
/// elements produced by `string_to_array`, so the splitting code can be generic
/// over the concrete builder.
trait StringArrayBuilderType: ArrayBuilder {
    /// Appends `val` as the next element.
    fn append_value(&mut self, val: &str);

    /// Appends a NULL as the next element.
    fn append_null(&mut self);
}
811
812impl StringArrayBuilderType for StringBuilder {
813 fn append_value(&mut self, val: &str) {
814 StringBuilder::append_value(self, val);
815 }
816
817 fn append_null(&mut self) {
818 StringBuilder::append_null(self);
819 }
820}
821
822impl StringArrayBuilderType for StringViewBuilder {
823 fn append_value(&mut self, val: &str) {
824 StringViewBuilder::append_value(self, val)
825 }
826
827 fn append_null(&mut self) {
828 StringViewBuilder::append_null(self)
829 }
830}
831
832impl StringArrayBuilderType for LargeStringBuilder {
833 fn append_value(&mut self, val: &str) {
834 LargeStringBuilder::append_value(self, val);
835 }
836
837 fn append_null(&mut self) {
838 LargeStringBuilder::append_null(self);
839 }
840}