1use arrow::array::{
21 Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray,
22 Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder,
23 OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array,
24 UInt8Array,
25};
26use arrow::datatypes::{DataType, Field};
27
28use datafusion_common::{
29 internal_datafusion_err, not_impl_err, plan_err, DataFusionError, Result,
30};
31
32use std::any::Any;
33
34use crate::utils::make_scalar_function;
35use arrow::array::{
36 builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder},
37 cast::AsArray,
38 GenericStringArray, StringArrayType, StringViewArray,
39};
40use arrow::compute::cast;
41use arrow::datatypes::DataType::{
42 Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View,
43};
44use datafusion_common::cast::{as_large_list_array, as_list_array};
45use datafusion_common::exec_err;
46use datafusion_common::types::logical_string;
47use datafusion_expr::{
48 Coercion, ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature,
49 TypeSignatureClass, Volatility,
50};
51use datafusion_functions::{downcast_arg, downcast_named_arg};
52use datafusion_macros::user_doc;
53use std::sync::Arc;
54
/// Dispatches on an Arrow `DataType` and expands the caller's
/// `array_function!` macro with the concrete array type that matches it.
///
/// An `array_function!($ARRAY_TYPE:ident)` macro must be in scope at the call
/// site; every arm simply forwards to it. Types without an arm produce a
/// `not_impl_err!`.
///
/// Two forms are accepted:
///
/// * `call_array_function!(data_type, false)` never dispatches list types.
/// * `call_array_function!(data_type, include_list)` additionally dispatches
///   `DataType::List` to `ListArray`, but only when `include_list` evaluates
///   to `true`. A flag that is `false` at run time falls through to the
///   "unsupported" error, matching the literal `false` form.
macro_rules! call_array_function {
    ($DATATYPE:expr, false) => {
        match $DATATYPE {
            DataType::Utf8 => array_function!(StringArray),
            DataType::Utf8View => array_function!(StringViewArray),
            DataType::LargeUtf8 => array_function!(LargeStringArray),
            DataType::Boolean => array_function!(BooleanArray),
            DataType::Float32 => array_function!(Float32Array),
            DataType::Float64 => array_function!(Float64Array),
            DataType::Int8 => array_function!(Int8Array),
            DataType::Int16 => array_function!(Int16Array),
            DataType::Int32 => array_function!(Int32Array),
            DataType::Int64 => array_function!(Int64Array),
            DataType::UInt8 => array_function!(UInt8Array),
            DataType::UInt16 => array_function!(UInt16Array),
            DataType::UInt32 => array_function!(UInt32Array),
            DataType::UInt64 => array_function!(UInt64Array),
            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
        }
    };
    ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{
        match $DATATYPE {
            // Honor the flag: without this guard the argument was ignored and
            // lists were always dispatched by this arm.
            DataType::List(_) if $INCLUDE_LIST => array_function!(ListArray),
            DataType::Utf8 => array_function!(StringArray),
            DataType::Utf8View => array_function!(StringViewArray),
            DataType::LargeUtf8 => array_function!(LargeStringArray),
            DataType::Boolean => array_function!(BooleanArray),
            DataType::Float32 => array_function!(Float32Array),
            DataType::Float64 => array_function!(Float64Array),
            DataType::Int8 => array_function!(Int8Array),
            DataType::Int16 => array_function!(Int16Array),
            DataType::Int32 => array_function!(Int32Array),
            DataType::Int64 => array_function!(Int64Array),
            DataType::UInt8 => array_function!(UInt8Array),
            DataType::UInt16 => array_function!(UInt16Array),
            DataType::UInt32 => array_function!(UInt32Array),
            DataType::UInt64 => array_function!(UInt64Array),
            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
        }
    }};
}
96
/// Appends the textual form of every element of `$ARRAY` to the string buffer
/// `$ARG` and evaluates to `Ok($ARG)`.
///
/// `$ARRAY` is first downcast to `$ARRAY_TYPE` with `downcast_arg!`. Each
/// non-null element is written with `to_string()` followed by `$DELIMITER`.
/// A null element is written as `$NULL_STRING` followed by `$DELIMITER` when
/// `$WITH_NULL_STRING` is true, and skipped entirely otherwise. The buffer
/// therefore ends with a trailing delimiter whenever anything was written;
/// removing it is left to the caller.
macro_rules! to_string {
    ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{
        let typed_array = downcast_arg!($ARRAY, $ARRAY_TYPE);
        for element in typed_array {
            if let Some(value) = element {
                $ARG.push_str(&value.to_string());
                $ARG.push_str($DELIMITER);
            } else if $WITH_NULL_STRING {
                $ARG.push_str($NULL_STRING);
                $ARG.push_str($DELIMITER);
            }
        }
        Ok($ARG)
    }};
}
117
118make_udf_expr_and_func!(
120 ArrayToString,
121 array_to_string,
122 array delimiter, "converts each element to its text representation.", array_to_string_udf );
126
127#[user_doc(
128 doc_section(label = "Array Functions"),
129 description = "Converts each element to its text representation.",
130 syntax_example = "array_to_string(array, delimiter[, null_string])",
131 sql_example = r#"```sql
132> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ',');
133+----------------------------------------------------+
134| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) |
135+----------------------------------------------------+
136| 1,2,3,4,5,6,7,8 |
137+----------------------------------------------------+
138```"#,
139 argument(
140 name = "array",
141 description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
142 ),
143 argument(name = "delimiter", description = "Array element separator."),
144 argument(
145 name = "null_string",
146 description = "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior."
147 )
148)]
149#[derive(Debug)]
150pub struct ArrayToString {
151 signature: Signature,
152 aliases: Vec<String>,
153}
154
155impl Default for ArrayToString {
156 fn default() -> Self {
157 Self::new()
158 }
159}
160
161impl ArrayToString {
162 pub fn new() -> Self {
163 Self {
164 signature: Signature::variadic_any(Volatility::Immutable),
165 aliases: vec![
166 String::from("list_to_string"),
167 String::from("array_join"),
168 String::from("list_join"),
169 ],
170 }
171 }
172}
173
174impl ScalarUDFImpl for ArrayToString {
175 fn as_any(&self) -> &dyn Any {
176 self
177 }
178
179 fn name(&self) -> &str {
180 "array_to_string"
181 }
182
183 fn signature(&self) -> &Signature {
184 &self.signature
185 }
186
187 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
188 Ok(match arg_types[0] {
189 List(_) | LargeList(_) | FixedSizeList(_, _) => Utf8,
190 _ => {
191 return plan_err!("The array_to_string function can only accept List/LargeList/FixedSizeList.");
192 }
193 })
194 }
195
196 fn invoke_with_args(
197 &self,
198 args: datafusion_expr::ScalarFunctionArgs,
199 ) -> Result<ColumnarValue> {
200 make_scalar_function(array_to_string_inner)(&args.args)
201 }
202
203 fn aliases(&self) -> &[String] {
204 &self.aliases
205 }
206
207 fn documentation(&self) -> Option<&Documentation> {
208 self.doc()
209 }
210}
211
212make_udf_expr_and_func!(
213 StringToArray,
214 string_to_array,
215 string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", string_to_array_udf );
219
220#[user_doc(
221 doc_section(label = "Array Functions"),
222 description = "Splits a string into an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.",
223 syntax_example = "string_to_array(str, delimiter[, null_str])",
224 sql_example = r#"```sql
225> select string_to_array('abc##def', '##');
226+-----------------------------------+
227| string_to_array(Utf8('abc##def')) |
228+-----------------------------------+
229| ['abc', 'def'] |
230+-----------------------------------+
231> select string_to_array('abc def', ' ', 'def');
232+---------------------------------------------+
233| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) |
234+---------------------------------------------+
235| ['abc', NULL] |
236+---------------------------------------------+
237```"#,
238 argument(name = "str", description = "String expression to split."),
239 argument(name = "delimiter", description = "Delimiter string to split on."),
240 argument(
241 name = "null_str",
242 description = "Substring values to be replaced with `NULL`."
243 )
244)]
245#[derive(Debug)]
246pub(super) struct StringToArray {
247 signature: Signature,
248 aliases: Vec<String>,
249}
250
251impl StringToArray {
252 pub fn new() -> Self {
253 Self {
254 signature: Signature::one_of(
255 vec![
256 TypeSignature::Coercible(vec![
257 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
258 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
259 ]),
260 TypeSignature::Coercible(vec![
261 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
262 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
263 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
264 ]),
265 ],
266 Volatility::Immutable,
267 ),
268 aliases: vec![String::from("string_to_list")],
269 }
270 }
271}
272
273impl ScalarUDFImpl for StringToArray {
274 fn as_any(&self) -> &dyn Any {
275 self
276 }
277
278 fn name(&self) -> &str {
279 "string_to_array"
280 }
281
282 fn signature(&self) -> &Signature {
283 &self.signature
284 }
285
286 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
287 Ok(match arg_types[0] {
288 Utf8 | Utf8View | LargeUtf8 => {
289 List(Arc::new(Field::new_list_field(arg_types[0].clone(), true)))
290 }
291 _ => {
292 return plan_err!(
293 "The string_to_array function can only accept Utf8, Utf8View or LargeUtf8."
294 );
295 }
296 })
297 }
298
299 fn invoke_with_args(
300 &self,
301 args: datafusion_expr::ScalarFunctionArgs,
302 ) -> Result<ColumnarValue> {
303 let args = &args.args;
304 match args[0].data_type() {
305 Utf8 | Utf8View => make_scalar_function(string_to_array_inner::<i32>)(args),
306 LargeUtf8 => make_scalar_function(string_to_array_inner::<i64>)(args),
307 other => {
308 exec_err!("unsupported type for string_to_array function as {other:?}")
309 }
310 }
311 }
312
313 fn aliases(&self) -> &[String] {
314 &self.aliases
315 }
316
317 fn documentation(&self) -> Option<&Documentation> {
318 self.doc()
319 }
320}
321
322pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
324 if args.len() < 2 || args.len() > 3 {
325 return exec_err!("array_to_string expects two or three arguments");
326 }
327
328 let arr = &args[0];
329
330 let delimiters: Vec<Option<&str>> = match args[1].data_type() {
331 Utf8 => args[1].as_string::<i32>().iter().collect(),
332 Utf8View => args[1].as_string_view().iter().collect(),
333 LargeUtf8 => args[1].as_string::<i64>().iter().collect(),
334 other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
335 };
336
337 let mut null_string = String::from("");
338 let mut with_null_string = false;
339 if args.len() == 3 {
340 null_string = match args[2].data_type() {
341 Utf8 => args[2].as_string::<i32>().value(0).to_string(),
342 Utf8View => args[2].as_string_view().value(0).to_string(),
343 LargeUtf8 => args[2].as_string::<i64>().value(0).to_string(),
344 other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
345 };
346 with_null_string = true;
347 }
348
349 fn compute_array_to_string(
352 arg: &mut String,
353 arr: ArrayRef,
354 delimiter: String,
355 null_string: String,
356 with_null_string: bool,
357 ) -> Result<&mut String> {
358 match arr.data_type() {
359 List(..) => {
360 let list_array = as_list_array(&arr)?;
361 for i in 0..list_array.len() {
362 compute_array_to_string(
363 arg,
364 list_array.value(i),
365 delimiter.clone(),
366 null_string.clone(),
367 with_null_string,
368 )?;
369 }
370
371 Ok(arg)
372 }
373 LargeList(..) => {
374 let list_array = as_large_list_array(&arr)?;
375 for i in 0..list_array.len() {
376 compute_array_to_string(
377 arg,
378 list_array.value(i),
379 delimiter.clone(),
380 null_string.clone(),
381 with_null_string,
382 )?;
383 }
384
385 Ok(arg)
386 }
387 Dictionary(_key_type, value_type) => {
388 let values = cast(&arr, value_type.as_ref()).map_err(|e| {
391 DataFusionError::from(e).context(
392 "Casting dictionary to values in compute_array_to_string",
393 )
394 })?;
395 compute_array_to_string(
396 arg,
397 values,
398 delimiter,
399 null_string,
400 with_null_string,
401 )
402 }
403 Null => Ok(arg),
404 data_type => {
405 macro_rules! array_function {
406 ($ARRAY_TYPE:ident) => {
407 to_string!(
408 arg,
409 arr,
410 &delimiter,
411 &null_string,
412 with_null_string,
413 $ARRAY_TYPE
414 )
415 };
416 }
417 call_array_function!(data_type, false)
418 }
419 }
420 }
421
422 fn generate_string_array<O: OffsetSizeTrait>(
423 list_arr: &GenericListArray<O>,
424 delimiters: Vec<Option<&str>>,
425 null_string: String,
426 with_null_string: bool,
427 ) -> Result<StringArray> {
428 let mut res: Vec<Option<String>> = Vec::new();
429 for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) {
430 if let (Some(arr), Some(delimiter)) = (arr, delimiter) {
431 let mut arg = String::from("");
432 let s = compute_array_to_string(
433 &mut arg,
434 arr,
435 delimiter.to_string(),
436 null_string.clone(),
437 with_null_string,
438 )?
439 .clone();
440
441 if let Some(s) = s.strip_suffix(delimiter) {
442 res.push(Some(s.to_string()));
443 } else {
444 res.push(Some(s));
445 }
446 } else {
447 res.push(None);
448 }
449 }
450
451 Ok(StringArray::from(res))
452 }
453
454 let arr_type = arr.data_type();
455 let string_arr = match arr_type {
456 List(_) | FixedSizeList(_, _) => {
457 let list_array = as_list_array(&arr)?;
458 generate_string_array::<i32>(
459 list_array,
460 delimiters,
461 null_string,
462 with_null_string,
463 )?
464 }
465 LargeList(_) => {
466 let list_array = as_large_list_array(&arr)?;
467 generate_string_array::<i64>(
468 list_array,
469 delimiters,
470 null_string,
471 with_null_string,
472 )?
473 }
474 _ => {
475 let mut arg = String::from("");
476 let mut res: Vec<Option<String>> = Vec::new();
477 assert_eq!(delimiters.len(), 1);
479 let delimiter = delimiters[0].unwrap();
480 let s = compute_array_to_string(
481 &mut arg,
482 Arc::clone(arr),
483 delimiter.to_string(),
484 null_string,
485 with_null_string,
486 )?
487 .clone();
488
489 if !s.is_empty() {
490 let s = s.strip_suffix(delimiter).unwrap().to_string();
491 res.push(Some(s));
492 } else {
493 res.push(Some(s));
494 }
495 StringArray::from(res)
496 }
497 };
498
499 Ok(Arc::new(string_arr))
500}
501
502fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
506 if args.len() < 2 || args.len() > 3 {
507 return exec_err!("string_to_array expects two or three arguments");
508 }
509
510 match args[0].data_type() {
511 Utf8 => {
512 let string_array = args[0].as_string::<T>();
513 let builder = StringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
514 string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(args, string_array, builder)
515 }
516 Utf8View => {
517 let string_array = args[0].as_string_view();
518 let builder = StringViewBuilder::with_capacity(string_array.len());
519 string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(args, string_array, builder)
520 }
521 LargeUtf8 => {
522 let string_array = args[0].as_string::<T>();
523 let builder = LargeStringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
524 string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(args, string_array, builder)
525 }
526 other => exec_err!("unsupported type for first argument to string_to_array function as {other:?}")
527 }
528}
529
530fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>(
531 args: &'a [ArrayRef],
532 string_array: StringArrType,
533 string_builder: StringBuilderType,
534) -> Result<ArrayRef>
535where
536 StringArrType: StringArrayType<'a>,
537 StringBuilderType: StringArrayBuilderType,
538{
539 match args[1].data_type() {
540 Utf8 => {
541 let delimiter_array = args[1].as_string::<i32>();
542 if args.len() == 2 {
543 string_to_array_impl::<
544 StringArrType,
545 &GenericStringArray<i32>,
546 &StringViewArray,
547 StringBuilderType,
548 >(string_array, delimiter_array, None, string_builder)
549 } else {
550 string_to_array_inner_3::<StringArrType,
551 &GenericStringArray<i32>,
552 StringBuilderType>(args, string_array, delimiter_array, string_builder)
553 }
554 }
555 Utf8View => {
556 let delimiter_array = args[1].as_string_view();
557
558 if args.len() == 2 {
559 string_to_array_impl::<
560 StringArrType,
561 &StringViewArray,
562 &StringViewArray,
563 StringBuilderType,
564 >(string_array, delimiter_array, None, string_builder)
565 } else {
566 string_to_array_inner_3::<StringArrType,
567 &StringViewArray,
568 StringBuilderType>(args, string_array, delimiter_array, string_builder)
569 }
570 }
571 LargeUtf8 => {
572 let delimiter_array = args[1].as_string::<i64>();
573 if args.len() == 2 {
574 string_to_array_impl::<
575 StringArrType,
576 &GenericStringArray<i64>,
577 &StringViewArray,
578 StringBuilderType,
579 >(string_array, delimiter_array, None, string_builder)
580 } else {
581 string_to_array_inner_3::<StringArrType,
582 &GenericStringArray<i64>,
583 StringBuilderType>(args, string_array, delimiter_array, string_builder)
584 }
585 }
586 other => exec_err!("unsupported type for second argument to string_to_array function as {other:?}")
587 }
588}
589
590fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>(
591 args: &'a [ArrayRef],
592 string_array: StringArrType,
593 delimiter_array: DelimiterArrType,
594 string_builder: StringBuilderType,
595) -> Result<ArrayRef>
596where
597 StringArrType: StringArrayType<'a>,
598 DelimiterArrType: StringArrayType<'a>,
599 StringBuilderType: StringArrayBuilderType,
600{
601 match args[2].data_type() {
602 Utf8 => {
603 let null_type_array = Some(args[2].as_string::<i32>());
604 string_to_array_impl::<
605 StringArrType,
606 DelimiterArrType,
607 &GenericStringArray<i32>,
608 StringBuilderType,
609 >(
610 string_array,
611 delimiter_array,
612 null_type_array,
613 string_builder,
614 )
615 }
616 Utf8View => {
617 let null_type_array = Some(args[2].as_string_view());
618 string_to_array_impl::<
619 StringArrType,
620 DelimiterArrType,
621 &StringViewArray,
622 StringBuilderType,
623 >(
624 string_array,
625 delimiter_array,
626 null_type_array,
627 string_builder,
628 )
629 }
630 LargeUtf8 => {
631 let null_type_array = Some(args[2].as_string::<i64>());
632 string_to_array_impl::<
633 StringArrType,
634 DelimiterArrType,
635 &GenericStringArray<i64>,
636 StringBuilderType,
637 >(
638 string_array,
639 delimiter_array,
640 null_type_array,
641 string_builder,
642 )
643 }
644 other => {
645 exec_err!("unsupported type for string_to_array function as {other:?}")
646 }
647 }
648}
649
650fn string_to_array_impl<
651 'a,
652 StringArrType,
653 DelimiterArrType,
654 NullValueArrType,
655 StringBuilderType,
656>(
657 string_array: StringArrType,
658 delimiter_array: DelimiterArrType,
659 null_value_array: Option<NullValueArrType>,
660 string_builder: StringBuilderType,
661) -> Result<ArrayRef>
662where
663 StringArrType: StringArrayType<'a>,
664 DelimiterArrType: StringArrayType<'a>,
665 NullValueArrType: StringArrayType<'a>,
666 StringBuilderType: StringArrayBuilderType,
667{
668 let mut list_builder = ListBuilder::new(string_builder);
669
670 match null_value_array {
671 None => {
672 string_array.iter().zip(delimiter_array.iter()).for_each(
673 |(string, delimiter)| {
674 match (string, delimiter) {
675 (Some(string), Some("")) => {
676 list_builder.values().append_value(string);
677 list_builder.append(true);
678 }
679 (Some(string), Some(delimiter)) => {
680 string.split(delimiter).for_each(|s| {
681 list_builder.values().append_value(s);
682 });
683 list_builder.append(true);
684 }
685 (Some(string), None) => {
686 string.chars().map(|c| c.to_string()).for_each(|c| {
687 list_builder.values().append_value(c.as_str());
688 });
689 list_builder.append(true);
690 }
691 _ => list_builder.append(false), }
693 },
694 )
695 }
696 Some(null_value_array) => string_array
697 .iter()
698 .zip(delimiter_array.iter())
699 .zip(null_value_array.iter())
700 .for_each(|((string, delimiter), null_value)| {
701 match (string, delimiter) {
702 (Some(string), Some("")) => {
703 if Some(string) == null_value {
704 list_builder.values().append_null();
705 } else {
706 list_builder.values().append_value(string);
707 }
708 list_builder.append(true);
709 }
710 (Some(string), Some(delimiter)) => {
711 string.split(delimiter).for_each(|s| {
712 if Some(s) == null_value {
713 list_builder.values().append_null();
714 } else {
715 list_builder.values().append_value(s);
716 }
717 });
718 list_builder.append(true);
719 }
720 (Some(string), None) => {
721 string.chars().map(|c| c.to_string()).for_each(|c| {
722 if Some(c.as_str()) == null_value {
723 list_builder.values().append_null();
724 } else {
725 list_builder.values().append_value(c.as_str());
726 }
727 });
728 list_builder.append(true);
729 }
730 _ => list_builder.append(false), }
732 }),
733 };
734
735 let list_array = list_builder.finish();
736 Ok(Arc::new(list_array) as ArrayRef)
737}
738
739trait StringArrayBuilderType: ArrayBuilder {
740 fn append_value(&mut self, val: &str);
741
742 fn append_null(&mut self);
743}
744
745impl StringArrayBuilderType for StringBuilder {
746 fn append_value(&mut self, val: &str) {
747 StringBuilder::append_value(self, val);
748 }
749
750 fn append_null(&mut self) {
751 StringBuilder::append_null(self);
752 }
753}
754
755impl StringArrayBuilderType for StringViewBuilder {
756 fn append_value(&mut self, val: &str) {
757 StringViewBuilder::append_value(self, val)
758 }
759
760 fn append_null(&mut self) {
761 StringViewBuilder::append_null(self)
762 }
763}
764
765impl StringArrayBuilderType for LargeStringBuilder {
766 fn append_value(&mut self, val: &str) {
767 LargeStringBuilder::append_value(self, val);
768 }
769
770 fn append_null(&mut self) {
771 LargeStringBuilder::append_null(self);
772 }
773}