1use arrow::array::{
21 Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray,
22 Int8Array, Int16Array, Int32Array, Int64Array, LargeStringArray, ListBuilder,
23 OffsetSizeTrait, StringArray, StringBuilder, UInt8Array, UInt16Array, UInt32Array,
24 UInt64Array,
25};
26use arrow::datatypes::{DataType, Field};
27
28use datafusion_common::utils::ListCoercion;
29use datafusion_common::{DataFusionError, Result, ScalarValue, not_impl_err};
30
31use std::fmt::{self, Write};
32
33use crate::utils::make_scalar_function;
34use arrow::array::{
35 StringArrayType, StringViewArray,
36 builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder},
37 cast::AsArray,
38};
39use arrow::compute::{can_cast_types, cast};
40use arrow::datatypes::DataType::{
41 Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View,
42};
43use datafusion_common::cast::{
44 as_fixed_size_list_array, as_large_list_array, as_list_array,
45};
46use datafusion_common::types::logical_string;
47use datafusion_common::{exec_datafusion_err, exec_err};
48use datafusion_expr::{
49 ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue,
50 Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
51 TypeSignatureClass, Volatility,
52};
53use datafusion_functions::downcast_arg;
54use datafusion_macros::user_doc;
55use std::sync::Arc;
56
// Registers `ArrayToString` through the crate's `make_udf_expr_and_func!` macro.
// NOTE(review): the macro is defined outside this file; judging by its arguments
// it presumably generates an `array_to_string(array, delimiter)` expression helper
// and an `array_to_string_udf` accessor — confirm against the macro definition.
make_udf_expr_and_func!(
    ArrayToString,
    array_to_string,
    array delimiter,
    "converts each element to its text representation.",
    array_to_string_udf
);
65
// Scalar UDF `array_to_string`: joins the elements of a (possibly nested) list
// into one string. The `#[user_doc]` attribute below carries the user-facing
// documentation; presumably it also generates the `doc()` method that
// `documentation()` calls — confirm against the macro.
#[user_doc(
    doc_section(label = "Array Functions"),
    description = "Converts each element to its text representation.",
    syntax_example = "array_to_string(array, delimiter[, null_string])",
    sql_example = r#"```sql
> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ',');
+----------------------------------------------------+
| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) |
+----------------------------------------------------+
| 1,2,3,4,5,6,7,8 |
+----------------------------------------------------+
```"#,
    argument(
        name = "array",
        description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
    ),
    argument(name = "delimiter", description = "Array element separator."),
    argument(
        name = "null_string",
        description = "Optional. String to use for null values in the output. If not provided, nulls will be omitted."
    )
)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct ArrayToString {
    /// Accepted argument shapes; returned by `signature()`.
    signature: Signature,
    /// Alternative function names; returned by `aliases()`.
    aliases: Vec<String>,
}
93
94impl Default for ArrayToString {
95 fn default() -> Self {
96 Self::new()
97 }
98}
99
100impl ArrayToString {
101 pub fn new() -> Self {
102 Self {
103 signature: Signature::one_of(
104 vec![
105 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
106 arguments: vec![
107 ArrayFunctionArgument::Array,
108 ArrayFunctionArgument::String,
109 ArrayFunctionArgument::String,
110 ],
111 array_coercion: Some(ListCoercion::FixedSizedListToList),
112 }),
113 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
114 arguments: vec![
115 ArrayFunctionArgument::Array,
116 ArrayFunctionArgument::String,
117 ],
118 array_coercion: Some(ListCoercion::FixedSizedListToList),
119 }),
120 ],
121 Volatility::Immutable,
122 ),
123 aliases: vec![
124 String::from("list_to_string"),
125 String::from("array_join"),
126 String::from("list_join"),
127 ],
128 }
129 }
130}
131
132impl ScalarUDFImpl for ArrayToString {
133 fn name(&self) -> &str {
134 "array_to_string"
135 }
136
137 fn signature(&self) -> &Signature {
138 &self.signature
139 }
140
141 fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
142 Ok(Utf8)
143 }
144
145 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
146 make_scalar_function(array_to_string_inner)(&args.args)
147 }
148
149 fn aliases(&self) -> &[String] {
150 &self.aliases
151 }
152
153 fn documentation(&self) -> Option<&Documentation> {
154 self.doc()
155 }
156}
157
// Registers `StringToArray` through the crate's `make_udf_expr_and_func!` macro.
// NOTE(review): the macro is defined outside this file; judging by its arguments
// it presumably generates a `string_to_array(string, delimiter, null_string)`
// expression helper and a `string_to_array_udf` accessor — confirm against the
// macro definition.
make_udf_expr_and_func!(
    StringToArray,
    string_to_array,
    string delimiter null_string,
    "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`",
    string_to_array_udf
);
165
// Scalar UDF `string_to_array`: splits a string into a list of substrings.
// The `#[user_doc]` attribute below carries the user-facing documentation;
// presumably it also generates the `doc()` method that `documentation()`
// calls — confirm against the macro.
#[user_doc(
    doc_section(label = "Array Functions"),
    description = "Splits a string into an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.",
    syntax_example = "string_to_array(str, delimiter[, null_str])",
    sql_example = r#"```sql
> select string_to_array('abc##def', '##');
+-----------------------------------+
| string_to_array(Utf8('abc##def')) |
+-----------------------------------+
| ['abc', 'def'] |
+-----------------------------------+
> select string_to_array('abc def', ' ', 'def');
+---------------------------------------------+
| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) |
+---------------------------------------------+
| ['abc', NULL] |
+---------------------------------------------+
```"#,
    argument(name = "str", description = "String expression to split."),
    argument(name = "delimiter", description = "Delimiter string to split on."),
    argument(
        name = "null_str",
        description = "Substring values to be replaced with `NULL`."
    )
)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct StringToArray {
    /// Accepted argument shapes; returned by `signature()`.
    signature: Signature,
    /// Alternative function names; returned by `aliases()`.
    aliases: Vec<String>,
}
196
197impl Default for StringToArray {
198 fn default() -> Self {
199 Self::new()
200 }
201}
202
203impl StringToArray {
204 pub fn new() -> Self {
205 Self {
206 signature: Signature::one_of(
207 vec![
208 TypeSignature::Coercible(vec![
209 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
210 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
211 ]),
212 TypeSignature::Coercible(vec![
213 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
214 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
215 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
216 ]),
217 ],
218 Volatility::Immutable,
219 ),
220 aliases: vec![String::from("string_to_list")],
221 }
222 }
223}
224
225impl ScalarUDFImpl for StringToArray {
226 fn name(&self) -> &str {
227 "string_to_array"
228 }
229
230 fn signature(&self) -> &Signature {
231 &self.signature
232 }
233
234 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
235 Ok(List(Arc::new(Field::new_list_field(
236 arg_types[0].clone(),
237 true,
238 ))))
239 }
240
241 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
242 let ScalarFunctionArgs { args, .. } = args;
243
244 let delimiter_is_scalar = matches!(&args[1], ColumnarValue::Scalar(_));
245 let null_str_is_scalar = args
246 .get(2)
247 .is_none_or(|a| matches!(a, ColumnarValue::Scalar(_)));
248
249 if !(delimiter_is_scalar && null_str_is_scalar) {
250 return make_scalar_function(string_to_array_fallback)(&args);
251 }
252
253 let delimiter = match &args[1] {
255 ColumnarValue::Scalar(s) => s.try_as_str().ok_or_else(|| {
256 exec_datafusion_err!(
257 "unsupported type for string_to_array delimiter: {:?}",
258 args[1].data_type()
259 )
260 })?,
261 _ => unreachable!("delimiter must be scalar in this branch"),
262 };
263 let null_value = match args.get(2) {
264 Some(ColumnarValue::Scalar(s)) => s.try_as_str().ok_or_else(|| {
265 exec_datafusion_err!(
266 "unsupported type for string_to_array null_str: {:?}",
267 args[2].data_type()
268 )
269 })?,
270 _ => None,
271 };
272
273 let (all_scalar, string_array) = match &args[0] {
274 ColumnarValue::Array(a) => (false, Arc::clone(a)),
275 ColumnarValue::Scalar(s) => (true, s.to_array_of_size(1)?),
276 };
277
278 let result = match string_array.data_type() {
279 Utf8 => {
280 let arr = string_array.as_string::<i32>();
281 let builder =
282 StringBuilder::with_capacity(arr.len(), arr.get_buffer_memory_size());
283 string_to_array_scalar_args(&arr, delimiter, null_value, builder)
284 }
285 Utf8View => {
286 let arr = string_array.as_string_view();
287 let builder = StringViewBuilder::with_capacity(arr.len());
288 string_to_array_scalar_args(&arr, delimiter, null_value, builder)
289 }
290 LargeUtf8 => {
291 let arr = string_array.as_string::<i64>();
292 let builder = LargeStringBuilder::with_capacity(
293 arr.len(),
294 arr.get_buffer_memory_size(),
295 );
296 string_to_array_scalar_args(&arr, delimiter, null_value, builder)
297 }
298 other => {
299 exec_err!("unsupported type for string_to_array function as {other:?}")
300 }
301 }?;
302
303 if all_scalar {
304 ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar)
305 } else {
306 Ok(ColumnarValue::Array(result))
307 }
308 }
309
310 fn aliases(&self) -> &[String] {
311 &self.aliases
312 }
313
314 fn documentation(&self) -> Option<&Documentation> {
315 self.doc()
316 }
317}
318
319#[inline(always)]
321fn append_part(
322 builder: &mut impl StringArrayBuilderType,
323 value: &str,
324 null_value: Option<&str>,
325) {
326 if null_value == Some(value) {
327 builder.append_null();
328 } else {
329 builder.append_value(value);
330 }
331}
332
333fn string_to_array_scalar_args<'a, StringArrType, StringBuilderType>(
336 string_array: &StringArrType,
337 delimiter: Option<&str>,
338 null_value: Option<&str>,
339 string_builder: StringBuilderType,
340) -> Result<ArrayRef>
341where
342 StringArrType: StringArrayType<'a>,
343 StringBuilderType: StringArrayBuilderType,
344{
345 let mut list_builder = ListBuilder::new(string_builder);
346
347 match delimiter {
348 Some("") => {
349 for i in 0..string_array.len() {
352 if string_array.is_null(i) {
353 list_builder.append(false);
354 continue;
355 }
356 let string = string_array.value(i);
357 if !string.is_empty() {
358 append_part(list_builder.values(), string, null_value);
359 }
360 list_builder.append(true);
361 }
362 }
363 Some(delimiter) => {
364 let finder = memchr::memmem::Finder::new(delimiter.as_bytes());
368 let delim_len = delimiter.len();
369
370 for i in 0..string_array.len() {
371 if string_array.is_null(i) {
372 list_builder.append(false);
373 continue;
374 }
375 let string = string_array.value(i);
376 if !string.is_empty() {
377 let bytes = string.as_bytes();
378 let mut start = 0;
379 for pos in finder.find_iter(bytes) {
380 append_part(
381 list_builder.values(),
382 &string[start..pos],
383 null_value,
384 );
385 start = pos + delim_len;
386 }
387 append_part(list_builder.values(), &string[start..], null_value);
390 }
391 list_builder.append(true);
392 }
393 }
394 None => {
395 for i in 0..string_array.len() {
397 if string_array.is_null(i) {
398 list_builder.append(false);
399 continue;
400 }
401 let string = string_array.value(i);
402 for (pos, c) in string.char_indices() {
403 append_part(
404 list_builder.values(),
405 &string[pos..pos + c.len_utf8()],
406 null_value,
407 );
408 }
409 list_builder.append(true);
410 }
411 }
412 }
413
414 Ok(Arc::new(list_builder.finish()) as ArrayRef)
415}
416
417fn string_to_array_fallback(args: &[ArrayRef]) -> Result<ArrayRef> {
420 let null_value_array = args.get(2);
421
422 match args[0].data_type() {
423 Utf8 => {
424 let arr = args[0].as_string::<i32>();
425 let builder =
426 StringBuilder::with_capacity(arr.len(), arr.get_buffer_memory_size());
427 string_to_array_column_args(&arr, &args[1], null_value_array, builder)
428 }
429 Utf8View => {
430 let arr = args[0].as_string_view();
431 let builder = StringViewBuilder::with_capacity(arr.len());
432 string_to_array_column_args(&arr, &args[1], null_value_array, builder)
433 }
434 LargeUtf8 => {
435 let arr = args[0].as_string::<i64>();
436 let builder = LargeStringBuilder::with_capacity(
437 arr.len(),
438 arr.get_buffer_memory_size(),
439 );
440 string_to_array_column_args(&arr, &args[1], null_value_array, builder)
441 }
442 other => exec_err!("unsupported type for string_to_array function as {other:?}"),
443 }
444}
445
446fn string_to_array_column_args<'a, StringArrType, StringBuilderType>(
447 string_array: &StringArrType,
448 delimiter_array: &ArrayRef,
449 null_value_array: Option<&ArrayRef>,
450 string_builder: StringBuilderType,
451) -> Result<ArrayRef>
452where
453 StringArrType: StringArrayType<'a>,
454 StringBuilderType: StringArrayBuilderType,
455{
456 let mut list_builder = ListBuilder::new(string_builder);
457
458 for i in 0..string_array.len() {
459 if string_array.is_null(i) {
460 list_builder.append(false);
461 continue;
462 }
463
464 let string = string_array.value(i);
465 let delimiter = get_str_value(delimiter_array, i);
466 let null_value = null_value_array.and_then(|arr| get_str_value(arr, i));
467
468 match delimiter {
469 Some("") => {
470 if !string.is_empty() {
471 append_part(list_builder.values(), string, null_value);
472 }
473 }
474 Some(delimiter) => {
475 if !string.is_empty() {
476 for part in string.split(delimiter) {
477 append_part(list_builder.values(), part, null_value);
478 }
479 }
480 }
481 None => {
482 for (pos, c) in string.char_indices() {
483 append_part(
484 list_builder.values(),
485 &string[pos..pos + c.len_utf8()],
486 null_value,
487 );
488 }
489 }
490 }
491
492 list_builder.append(true);
493 }
494
495 Ok(Arc::new(list_builder.finish()) as ArrayRef)
496}
497
498fn get_str_value(array: &ArrayRef, i: usize) -> Option<&str> {
500 if array.is_null(i) {
501 return None;
502 }
503 match array.data_type() {
504 Utf8 => Some(array.as_string::<i32>().value(i)),
505 LargeUtf8 => Some(array.as_string::<i64>().value(i)),
506 Utf8View => Some(array.as_string_view().value(i)),
507 other => {
508 debug_assert!(false, "unexpected type in get_str_value: {other:?}");
509 None
510 }
511 }
512}
513
514fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
515 if args.len() < 2 || args.len() > 3 {
516 return exec_err!("array_to_string expects two or three arguments");
517 }
518
519 let arr = &args[0];
520
521 let delimiters: Vec<Option<&str>> = match args[1].data_type() {
522 Utf8 => args[1].as_string::<i32>().iter().collect(),
523 Utf8View => args[1].as_string_view().iter().collect(),
524 LargeUtf8 => args[1].as_string::<i64>().iter().collect(),
525 other => {
526 return exec_err!(
527 "unsupported type for second argument to array_to_string function as {other:?}"
528 );
529 }
530 };
531
532 let null_strings: Vec<Option<&str>> = if args.len() == 3 {
533 match args[2].data_type() {
534 Utf8 => args[2].as_string::<i32>().iter().collect(),
535 Utf8View => args[2].as_string_view().iter().collect(),
536 LargeUtf8 => args[2].as_string::<i64>().iter().collect(),
537 other => {
538 return exec_err!(
539 "unsupported type for third argument to array_to_string function as {other:?}"
540 );
541 }
542 }
543 } else {
544 vec![None; args[0].len()]
547 };
548
549 let string_arr = match arr.data_type() {
550 List(_) => {
551 let list_array = as_list_array(&arr)?;
552 generate_string_array::<i32>(list_array, &delimiters, &null_strings)?
553 }
554 LargeList(_) => {
555 let list_array = as_large_list_array(&arr)?;
556 generate_string_array::<i64>(list_array, &delimiters, &null_strings)?
557 }
558 _ => return exec_err!("array_to_string expects list as first argument"),
560 };
561
562 Ok(Arc::new(string_arr))
563}
564
565fn generate_string_array<O: OffsetSizeTrait>(
566 list_arr: &GenericListArray<O>,
567 delimiters: &[Option<&str>],
568 null_strings: &[Option<&str>],
569) -> Result<StringArray> {
570 let mut builder = StringBuilder::with_capacity(list_arr.len(), 0);
571
572 for ((arr, &delimiter), &null_string) in list_arr
573 .iter()
574 .zip(delimiters.iter())
575 .zip(null_strings.iter())
576 {
577 let (Some(arr), Some(delimiter)) = (arr, delimiter) else {
578 builder.append_null();
579 continue;
580 };
581
582 let mut first = true;
583 compute_array_to_string(&mut builder, &arr, delimiter, null_string, &mut first)?;
584 builder.append_value("");
585 }
586
587 Ok(builder.finish())
588}
589
590fn compute_array_to_string(
591 w: &mut impl Write,
592 arr: &ArrayRef,
593 delimiter: &str,
594 null_string: Option<&str>,
595 first: &mut bool,
596) -> Result<()> {
597 macro_rules! handle_list {
599 ($list_array:expr) => {
600 for i in 0..$list_array.len() {
601 if !$list_array.is_null(i) {
602 compute_array_to_string(
603 w,
604 &$list_array.value(i),
605 delimiter,
606 null_string,
607 first,
608 )?;
609 } else if let Some(ns) = null_string {
610 if *first {
611 *first = false;
612 } else {
613 w.write_str(delimiter)?;
614 }
615 w.write_str(ns)?;
616 }
617 }
618 };
619 }
620
621 match arr.data_type() {
622 List(..) => {
623 let list_array = as_list_array(arr)?;
624 handle_list!(list_array);
625 Ok(())
626 }
627 FixedSizeList(..) => {
628 let list_array = as_fixed_size_list_array(arr)?;
629 handle_list!(list_array);
630 Ok(())
631 }
632 LargeList(..) => {
633 let list_array = as_large_list_array(arr)?;
634 handle_list!(list_array);
635 Ok(())
636 }
637 Dictionary(_key_type, value_type) => {
638 let values = cast(arr, value_type.as_ref()).map_err(|e| {
641 DataFusionError::from(e)
642 .context("Casting dictionary to values in compute_array_to_string")
643 })?;
644 compute_array_to_string(w, &values, delimiter, null_string, first)
645 }
646 Null => Ok(()),
647 data_type => {
648 macro_rules! str_leaf {
649 ($ARRAY_TYPE:ident) => {
650 write_leaf_to_string(
651 w,
652 downcast_arg!(arr, $ARRAY_TYPE),
653 delimiter,
654 null_string,
655 first,
656 |w, x: &str| w.write_str(x),
657 )?
658 };
659 }
660 macro_rules! bool_leaf {
661 ($ARRAY_TYPE:ident) => {
662 write_leaf_to_string(
663 w,
664 downcast_arg!(arr, $ARRAY_TYPE),
665 delimiter,
666 null_string,
667 first,
668 |w, x: bool| {
669 if x {
670 w.write_str("true")
671 } else {
672 w.write_str("false")
673 }
674 },
675 )?
676 };
677 }
678 macro_rules! int_leaf {
679 ($ARRAY_TYPE:ident) => {
680 write_leaf_to_string(
681 w,
682 downcast_arg!(arr, $ARRAY_TYPE),
683 delimiter,
684 null_string,
685 first,
686 |w, x| {
687 let mut itoa_buf = itoa::Buffer::new();
688 w.write_str(itoa_buf.format(x))
689 },
690 )?
691 };
692 }
693 macro_rules! float_leaf {
694 ($ARRAY_TYPE:ident) => {
695 write_leaf_to_string(
696 w,
697 downcast_arg!(arr, $ARRAY_TYPE),
698 delimiter,
699 null_string,
700 first,
701 |w, x| write!(w, "{}", x),
706 )?
707 };
708 }
709 match data_type {
710 Utf8 => str_leaf!(StringArray),
711 Utf8View => str_leaf!(StringViewArray),
712 LargeUtf8 => str_leaf!(LargeStringArray),
713 DataType::Boolean => bool_leaf!(BooleanArray),
714 DataType::Float32 => float_leaf!(Float32Array),
715 DataType::Float64 => float_leaf!(Float64Array),
716 DataType::Int8 => int_leaf!(Int8Array),
717 DataType::Int16 => int_leaf!(Int16Array),
718 DataType::Int32 => int_leaf!(Int32Array),
719 DataType::Int64 => int_leaf!(Int64Array),
720 DataType::UInt8 => int_leaf!(UInt8Array),
721 DataType::UInt16 => int_leaf!(UInt16Array),
722 DataType::UInt32 => int_leaf!(UInt32Array),
723 DataType::UInt64 => int_leaf!(UInt64Array),
724 data_type if can_cast_types(data_type, &Utf8) => {
725 let str_arr = cast(arr, &Utf8).map_err(|e| {
726 DataFusionError::from(e)
727 .context("Casting to string in array_to_string")
728 })?;
729 return compute_array_to_string(
730 w,
731 &str_arr,
732 delimiter,
733 null_string,
734 first,
735 );
736 }
737 data_type => {
738 return not_impl_err!(
739 "Unsupported data type in array_to_string: {data_type}"
740 );
741 }
742 }
743 Ok(())
744 }
745 }
746}
747
748fn write_leaf_to_string<'a, W: Write, A, T>(
753 w: &mut W,
754 arr: &'a A,
755 delimiter: &str,
756 null_string: Option<&str>,
757 first: &mut bool,
758 append: impl Fn(&mut W, T) -> fmt::Result,
759) -> Result<()>
760where
761 &'a A: IntoIterator<Item = Option<T>>,
762{
763 for x in arr {
764 if x.is_none() && null_string.is_none() {
766 continue;
767 }
768
769 if *first {
770 *first = false;
771 } else {
772 w.write_str(delimiter)?;
773 }
774
775 match x {
776 Some(x) => append(w, x)?,
777 None => w.write_str(null_string.unwrap())?,
778 }
779 }
780 Ok(())
781}
782
/// Common interface over the string builders used in this file
/// (`StringBuilder`, `StringViewBuilder`, `LargeStringBuilder`), so the
/// splitting code can be generic over the output string type.
trait StringArrayBuilderType: ArrayBuilder {
    /// Appends `val` as the next (non-null) value.
    fn append_value(&mut self, val: &str);

    /// Appends a NULL as the next value.
    fn append_null(&mut self);
}
788
// Delegates to `StringBuilder`'s own methods. The type-qualified call form is
// intended to select the builder's method of the same name rather than this
// trait method.
impl StringArrayBuilderType for StringBuilder {
    fn append_value(&mut self, val: &str) {
        StringBuilder::append_value(self, val);
    }

    fn append_null(&mut self) {
        StringBuilder::append_null(self);
    }
}
798
// Delegates to `StringViewBuilder`'s own methods. The type-qualified call form
// is intended to select the builder's method of the same name rather than this
// trait method.
impl StringArrayBuilderType for StringViewBuilder {
    fn append_value(&mut self, val: &str) {
        StringViewBuilder::append_value(self, val)
    }

    fn append_null(&mut self) {
        StringViewBuilder::append_null(self)
    }
}
808
// Delegates to `LargeStringBuilder`'s own methods. The type-qualified call form
// is intended to select the builder's method of the same name rather than this
// trait method.
impl StringArrayBuilderType for LargeStringBuilder {
    fn append_value(&mut self, val: &str) {
        LargeStringBuilder::append_value(self, val);
    }

    fn append_null(&mut self) {
        LargeStringBuilder::append_null(self);
    }
}