1use arrow::array::{
21 Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray,
22 Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder,
23 OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array,
24 UInt8Array,
25};
26use arrow::datatypes::{DataType, Field};
27
28use datafusion_common::utils::ListCoercion;
29use datafusion_common::{not_impl_err, DataFusionError, Result};
30
31use std::any::Any;
32
33use crate::utils::make_scalar_function;
34use arrow::array::{
35 builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder},
36 cast::AsArray,
37 GenericStringArray, StringArrayType, StringViewArray,
38};
39use arrow::compute::cast;
40use arrow::datatypes::DataType::{
41 Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View,
42};
43use datafusion_common::cast::{
44 as_fixed_size_list_array, as_large_list_array, as_list_array,
45};
46use datafusion_common::exec_err;
47use datafusion_common::types::logical_string;
48use datafusion_expr::{
49 ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue,
50 Documentation, ScalarUDFImpl, Signature, TypeSignature, TypeSignatureClass,
51 Volatility,
52};
53use datafusion_functions::downcast_arg;
54use datafusion_macros::user_doc;
55use std::sync::Arc;
56
/// Dispatches the caller-supplied `array_function!` macro on the Arrow array
/// type that corresponds to `$DATATYPE`.
///
/// A single-argument macro named `array_function!` must be in scope at the
/// call site; it is invoked with the array type identifier (for example
/// `StringArray` for `DataType::Utf8`).
///
/// * `call_array_function!(dt, false)` covers the string, boolean, float and
///   integer types listed below.
/// * Any other second argument first maps `DataType::List(_)` to `ListArray`
///   and otherwise behaves exactly like the `false` form.
///
/// Every other data type evaluates to a `not_impl_err!` value.
macro_rules! call_array_function {
    ($DATATYPE:expr, false) => {
        match $DATATYPE {
            DataType::Utf8 => array_function!(StringArray),
            DataType::Utf8View => array_function!(StringViewArray),
            DataType::LargeUtf8 => array_function!(LargeStringArray),
            DataType::Boolean => array_function!(BooleanArray),
            DataType::Float32 => array_function!(Float32Array),
            DataType::Float64 => array_function!(Float64Array),
            DataType::Int8 => array_function!(Int8Array),
            DataType::Int16 => array_function!(Int16Array),
            DataType::Int32 => array_function!(Int32Array),
            DataType::Int64 => array_function!(Int64Array),
            DataType::UInt8 => array_function!(UInt8Array),
            DataType::UInt16 => array_function!(UInt16Array),
            DataType::UInt32 => array_function!(UInt32Array),
            DataType::UInt64 => array_function!(UInt64Array),
            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
        }
    };
    ($DATATYPE:expr, $INCLUDE_LIST:expr) => {{
        match $DATATYPE {
            DataType::List(_) => array_function!(ListArray),
            // Everything that is not a list shares the scalar type table above.
            non_list => call_array_function!(non_list, false),
        }
    }};
}
98
/// Appends the text form of every element of `$ARRAY` to `$ARG`, writing
/// `$DELIMITER` after each appended element.
///
/// `$ARRAY` is first downcast to `$ARRAY_TYPE` with `downcast_arg!`. Null
/// elements are skipped unless `$WITH_NULL_STRING` is true, in which case
/// `$NULL_STRING` (also followed by the delimiter) is written instead.
/// The trailing delimiter is not removed here. Evaluates to `Ok($ARG)`.
macro_rules! to_string {
    ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{
        let typed_array = downcast_arg!($ARRAY, $ARRAY_TYPE);
        for element in typed_array {
            if let Some(value) = element {
                $ARG.push_str(&value.to_string());
                $ARG.push_str($DELIMITER);
            } else if $WITH_NULL_STRING {
                $ARG.push_str($NULL_STRING);
                $ARG.push_str($DELIMITER);
            }
        }
        Ok($ARG)
    }};
}
119
120make_udf_expr_and_func!(
122 ArrayToString,
123 array_to_string,
124 array delimiter, "converts each element to its text representation.", array_to_string_udf );
128
129#[user_doc(
130 doc_section(label = "Array Functions"),
131 description = "Converts each element to its text representation.",
132 syntax_example = "array_to_string(array, delimiter[, null_string])",
133 sql_example = r#"```sql
134> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ',');
135+----------------------------------------------------+
136| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) |
137+----------------------------------------------------+
138| 1,2,3,4,5,6,7,8 |
139+----------------------------------------------------+
140```"#,
141 argument(
142 name = "array",
143 description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
144 ),
145 argument(name = "delimiter", description = "Array element separator."),
146 argument(
147 name = "null_string",
148 description = "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior."
149 )
150)]
151#[derive(Debug, PartialEq, Eq, Hash)]
152pub struct ArrayToString {
153 signature: Signature,
154 aliases: Vec<String>,
155}
156
157impl Default for ArrayToString {
158 fn default() -> Self {
159 Self::new()
160 }
161}
162
163impl ArrayToString {
164 pub fn new() -> Self {
165 Self {
166 signature: Signature::one_of(
167 vec![
168 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
169 arguments: vec![
170 ArrayFunctionArgument::Array,
171 ArrayFunctionArgument::String,
172 ArrayFunctionArgument::String,
173 ],
174 array_coercion: Some(ListCoercion::FixedSizedListToList),
175 }),
176 TypeSignature::ArraySignature(ArrayFunctionSignature::Array {
177 arguments: vec![
178 ArrayFunctionArgument::Array,
179 ArrayFunctionArgument::String,
180 ],
181 array_coercion: Some(ListCoercion::FixedSizedListToList),
182 }),
183 ],
184 Volatility::Immutable,
185 ),
186 aliases: vec![
187 String::from("list_to_string"),
188 String::from("array_join"),
189 String::from("list_join"),
190 ],
191 }
192 }
193}
194
195impl ScalarUDFImpl for ArrayToString {
196 fn as_any(&self) -> &dyn Any {
197 self
198 }
199
200 fn name(&self) -> &str {
201 "array_to_string"
202 }
203
204 fn signature(&self) -> &Signature {
205 &self.signature
206 }
207
208 fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
209 Ok(Utf8)
210 }
211
212 fn invoke_with_args(
213 &self,
214 args: datafusion_expr::ScalarFunctionArgs,
215 ) -> Result<ColumnarValue> {
216 make_scalar_function(array_to_string_inner)(&args.args)
217 }
218
219 fn aliases(&self) -> &[String] {
220 &self.aliases
221 }
222
223 fn documentation(&self) -> Option<&Documentation> {
224 self.doc()
225 }
226}
227
228make_udf_expr_and_func!(
229 StringToArray,
230 string_to_array,
231 string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", string_to_array_udf );
235
236#[user_doc(
237 doc_section(label = "Array Functions"),
238 description = "Splits a string into an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.",
239 syntax_example = "string_to_array(str, delimiter[, null_str])",
240 sql_example = r#"```sql
241> select string_to_array('abc##def', '##');
242+-----------------------------------+
243| string_to_array(Utf8('abc##def')) |
244+-----------------------------------+
245| ['abc', 'def'] |
246+-----------------------------------+
247> select string_to_array('abc def', ' ', 'def');
248+---------------------------------------------+
249| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) |
250+---------------------------------------------+
251| ['abc', NULL] |
252+---------------------------------------------+
253```"#,
254 argument(name = "str", description = "String expression to split."),
255 argument(name = "delimiter", description = "Delimiter string to split on."),
256 argument(
257 name = "null_str",
258 description = "Substring values to be replaced with `NULL`."
259 )
260)]
261#[derive(Debug, PartialEq, Eq, Hash)]
262pub(super) struct StringToArray {
263 signature: Signature,
264 aliases: Vec<String>,
265}
266
267impl StringToArray {
268 pub fn new() -> Self {
269 Self {
270 signature: Signature::one_of(
271 vec![
272 TypeSignature::Coercible(vec![
273 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
274 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
275 ]),
276 TypeSignature::Coercible(vec![
277 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
278 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
279 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
280 ]),
281 ],
282 Volatility::Immutable,
283 ),
284 aliases: vec![String::from("string_to_list")],
285 }
286 }
287}
288
289impl ScalarUDFImpl for StringToArray {
290 fn as_any(&self) -> &dyn Any {
291 self
292 }
293
294 fn name(&self) -> &str {
295 "string_to_array"
296 }
297
298 fn signature(&self) -> &Signature {
299 &self.signature
300 }
301
302 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
303 Ok(List(Arc::new(Field::new_list_field(
304 arg_types[0].clone(),
305 true,
306 ))))
307 }
308
309 fn invoke_with_args(
310 &self,
311 args: datafusion_expr::ScalarFunctionArgs,
312 ) -> Result<ColumnarValue> {
313 let args = &args.args;
314 match args[0].data_type() {
315 Utf8 | Utf8View => make_scalar_function(string_to_array_inner::<i32>)(args),
316 LargeUtf8 => make_scalar_function(string_to_array_inner::<i64>)(args),
317 other => {
318 exec_err!("unsupported type for string_to_array function as {other:?}")
319 }
320 }
321 }
322
323 fn aliases(&self) -> &[String] {
324 &self.aliases
325 }
326
327 fn documentation(&self) -> Option<&Documentation> {
328 self.doc()
329 }
330}
331
332pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
334 if args.len() < 2 || args.len() > 3 {
335 return exec_err!("array_to_string expects two or three arguments");
336 }
337
338 let arr = &args[0];
339
340 let delimiters: Vec<Option<&str>> = match args[1].data_type() {
341 Utf8 => args[1].as_string::<i32>().iter().collect(),
342 Utf8View => args[1].as_string_view().iter().collect(),
343 LargeUtf8 => args[1].as_string::<i64>().iter().collect(),
344 other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
345 };
346
347 let mut null_string = String::from("");
348 let mut with_null_string = false;
349 if args.len() == 3 {
350 null_string = match args[2].data_type() {
351 Utf8 => args[2].as_string::<i32>().value(0).to_string(),
352 Utf8View => args[2].as_string_view().value(0).to_string(),
353 LargeUtf8 => args[2].as_string::<i64>().value(0).to_string(),
354 other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
355 };
356 with_null_string = true;
357 }
358
359 fn compute_array_to_string(
362 arg: &mut String,
363 arr: ArrayRef,
364 delimiter: String,
365 null_string: String,
366 with_null_string: bool,
367 ) -> Result<&mut String> {
368 match arr.data_type() {
369 List(..) => {
370 let list_array = as_list_array(&arr)?;
371 for i in 0..list_array.len() {
372 if !list_array.is_null(i) {
373 compute_array_to_string(
374 arg,
375 list_array.value(i),
376 delimiter.clone(),
377 null_string.clone(),
378 with_null_string,
379 )?;
380 } else if with_null_string {
381 arg.push_str(&null_string);
382 arg.push_str(&delimiter);
383 }
384 }
385
386 Ok(arg)
387 }
388 FixedSizeList(..) => {
389 let list_array = as_fixed_size_list_array(&arr)?;
390
391 for i in 0..list_array.len() {
392 if !list_array.is_null(i) {
393 compute_array_to_string(
394 arg,
395 list_array.value(i),
396 delimiter.clone(),
397 null_string.clone(),
398 with_null_string,
399 )?;
400 } else if with_null_string {
401 arg.push_str(&null_string);
402 arg.push_str(&delimiter);
403 }
404 }
405
406 Ok(arg)
407 }
408 LargeList(..) => {
409 let list_array = as_large_list_array(&arr)?;
410 for i in 0..list_array.len() {
411 if !list_array.is_null(i) {
412 compute_array_to_string(
413 arg,
414 list_array.value(i),
415 delimiter.clone(),
416 null_string.clone(),
417 with_null_string,
418 )?;
419 } else if with_null_string {
420 arg.push_str(&null_string);
421 arg.push_str(&delimiter);
422 }
423 }
424
425 Ok(arg)
426 }
427 Dictionary(_key_type, value_type) => {
428 let values = cast(&arr, value_type.as_ref()).map_err(|e| {
431 DataFusionError::from(e).context(
432 "Casting dictionary to values in compute_array_to_string",
433 )
434 })?;
435 compute_array_to_string(
436 arg,
437 values,
438 delimiter,
439 null_string,
440 with_null_string,
441 )
442 }
443 Null => Ok(arg),
444 data_type => {
445 macro_rules! array_function {
446 ($ARRAY_TYPE:ident) => {
447 to_string!(
448 arg,
449 arr,
450 &delimiter,
451 &null_string,
452 with_null_string,
453 $ARRAY_TYPE
454 )
455 };
456 }
457 call_array_function!(data_type, false)
458 }
459 }
460 }
461
462 fn generate_string_array<O: OffsetSizeTrait>(
463 list_arr: &GenericListArray<O>,
464 delimiters: Vec<Option<&str>>,
465 null_string: String,
466 with_null_string: bool,
467 ) -> Result<StringArray> {
468 let mut res: Vec<Option<String>> = Vec::new();
469 for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) {
470 if let (Some(arr), Some(delimiter)) = (arr, delimiter) {
471 let mut arg = String::from("");
472 let s = compute_array_to_string(
473 &mut arg,
474 arr,
475 delimiter.to_string(),
476 null_string.clone(),
477 with_null_string,
478 )?
479 .clone();
480
481 if let Some(s) = s.strip_suffix(delimiter) {
482 res.push(Some(s.to_string()));
483 } else {
484 res.push(Some(s));
485 }
486 } else {
487 res.push(None);
488 }
489 }
490
491 Ok(StringArray::from(res))
492 }
493
494 let string_arr = match arr.data_type() {
495 List(_) => {
496 let list_array = as_list_array(&arr)?;
497 generate_string_array::<i32>(
498 list_array,
499 delimiters,
500 null_string,
501 with_null_string,
502 )?
503 }
504 LargeList(_) => {
505 let list_array = as_large_list_array(&arr)?;
506 generate_string_array::<i64>(
507 list_array,
508 delimiters,
509 null_string,
510 with_null_string,
511 )?
512 }
513 _ => return exec_err!("array_to_string expects list as first argument"),
515 };
516
517 Ok(Arc::new(string_arr))
518}
519
520fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
524 if args.len() < 2 || args.len() > 3 {
525 return exec_err!("string_to_array expects two or three arguments");
526 }
527
528 match args[0].data_type() {
529 Utf8 => {
530 let string_array = args[0].as_string::<T>();
531 let builder = StringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
532 string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(args, string_array, builder)
533 }
534 Utf8View => {
535 let string_array = args[0].as_string_view();
536 let builder = StringViewBuilder::with_capacity(string_array.len());
537 string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(args, string_array, builder)
538 }
539 LargeUtf8 => {
540 let string_array = args[0].as_string::<T>();
541 let builder = LargeStringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
542 string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(args, string_array, builder)
543 }
544 other => exec_err!("unsupported type for first argument to string_to_array function as {other:?}")
545 }
546}
547
548fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>(
549 args: &'a [ArrayRef],
550 string_array: StringArrType,
551 string_builder: StringBuilderType,
552) -> Result<ArrayRef>
553where
554 StringArrType: StringArrayType<'a>,
555 StringBuilderType: StringArrayBuilderType,
556{
557 match args[1].data_type() {
558 Utf8 => {
559 let delimiter_array = args[1].as_string::<i32>();
560 if args.len() == 2 {
561 string_to_array_impl::<
562 StringArrType,
563 &GenericStringArray<i32>,
564 &StringViewArray,
565 StringBuilderType,
566 >(string_array, delimiter_array, None, string_builder)
567 } else {
568 string_to_array_inner_3::<StringArrType,
569 &GenericStringArray<i32>,
570 StringBuilderType>(args, string_array, delimiter_array, string_builder)
571 }
572 }
573 Utf8View => {
574 let delimiter_array = args[1].as_string_view();
575
576 if args.len() == 2 {
577 string_to_array_impl::<
578 StringArrType,
579 &StringViewArray,
580 &StringViewArray,
581 StringBuilderType,
582 >(string_array, delimiter_array, None, string_builder)
583 } else {
584 string_to_array_inner_3::<StringArrType,
585 &StringViewArray,
586 StringBuilderType>(args, string_array, delimiter_array, string_builder)
587 }
588 }
589 LargeUtf8 => {
590 let delimiter_array = args[1].as_string::<i64>();
591 if args.len() == 2 {
592 string_to_array_impl::<
593 StringArrType,
594 &GenericStringArray<i64>,
595 &StringViewArray,
596 StringBuilderType,
597 >(string_array, delimiter_array, None, string_builder)
598 } else {
599 string_to_array_inner_3::<StringArrType,
600 &GenericStringArray<i64>,
601 StringBuilderType>(args, string_array, delimiter_array, string_builder)
602 }
603 }
604 other => exec_err!("unsupported type for second argument to string_to_array function as {other:?}")
605 }
606}
607
608fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>(
609 args: &'a [ArrayRef],
610 string_array: StringArrType,
611 delimiter_array: DelimiterArrType,
612 string_builder: StringBuilderType,
613) -> Result<ArrayRef>
614where
615 StringArrType: StringArrayType<'a>,
616 DelimiterArrType: StringArrayType<'a>,
617 StringBuilderType: StringArrayBuilderType,
618{
619 match args[2].data_type() {
620 Utf8 => {
621 let null_type_array = Some(args[2].as_string::<i32>());
622 string_to_array_impl::<
623 StringArrType,
624 DelimiterArrType,
625 &GenericStringArray<i32>,
626 StringBuilderType,
627 >(
628 string_array,
629 delimiter_array,
630 null_type_array,
631 string_builder,
632 )
633 }
634 Utf8View => {
635 let null_type_array = Some(args[2].as_string_view());
636 string_to_array_impl::<
637 StringArrType,
638 DelimiterArrType,
639 &StringViewArray,
640 StringBuilderType,
641 >(
642 string_array,
643 delimiter_array,
644 null_type_array,
645 string_builder,
646 )
647 }
648 LargeUtf8 => {
649 let null_type_array = Some(args[2].as_string::<i64>());
650 string_to_array_impl::<
651 StringArrType,
652 DelimiterArrType,
653 &GenericStringArray<i64>,
654 StringBuilderType,
655 >(
656 string_array,
657 delimiter_array,
658 null_type_array,
659 string_builder,
660 )
661 }
662 other => {
663 exec_err!("unsupported type for string_to_array function as {other:?}")
664 }
665 }
666}
667
668fn string_to_array_impl<
669 'a,
670 StringArrType,
671 DelimiterArrType,
672 NullValueArrType,
673 StringBuilderType,
674>(
675 string_array: StringArrType,
676 delimiter_array: DelimiterArrType,
677 null_value_array: Option<NullValueArrType>,
678 string_builder: StringBuilderType,
679) -> Result<ArrayRef>
680where
681 StringArrType: StringArrayType<'a>,
682 DelimiterArrType: StringArrayType<'a>,
683 NullValueArrType: StringArrayType<'a>,
684 StringBuilderType: StringArrayBuilderType,
685{
686 let mut list_builder = ListBuilder::new(string_builder);
687
688 match null_value_array {
689 None => {
690 string_array.iter().zip(delimiter_array.iter()).for_each(
691 |(string, delimiter)| {
692 match (string, delimiter) {
693 (Some(string), Some("")) => {
694 list_builder.values().append_value(string);
695 list_builder.append(true);
696 }
697 (Some(string), Some(delimiter)) => {
698 string.split(delimiter).for_each(|s| {
699 list_builder.values().append_value(s);
700 });
701 list_builder.append(true);
702 }
703 (Some(string), None) => {
704 string.chars().map(|c| c.to_string()).for_each(|c| {
705 list_builder.values().append_value(c.as_str());
706 });
707 list_builder.append(true);
708 }
709 _ => list_builder.append(false), }
711 },
712 )
713 }
714 Some(null_value_array) => string_array
715 .iter()
716 .zip(delimiter_array.iter())
717 .zip(null_value_array.iter())
718 .for_each(|((string, delimiter), null_value)| {
719 match (string, delimiter) {
720 (Some(string), Some("")) => {
721 if Some(string) == null_value {
722 list_builder.values().append_null();
723 } else {
724 list_builder.values().append_value(string);
725 }
726 list_builder.append(true);
727 }
728 (Some(string), Some(delimiter)) => {
729 string.split(delimiter).for_each(|s| {
730 if Some(s) == null_value {
731 list_builder.values().append_null();
732 } else {
733 list_builder.values().append_value(s);
734 }
735 });
736 list_builder.append(true);
737 }
738 (Some(string), None) => {
739 string.chars().map(|c| c.to_string()).for_each(|c| {
740 if Some(c.as_str()) == null_value {
741 list_builder.values().append_null();
742 } else {
743 list_builder.values().append_value(c.as_str());
744 }
745 });
746 list_builder.append(true);
747 }
748 _ => list_builder.append(false), }
750 }),
751 };
752
753 let list_array = list_builder.finish();
754 Ok(Arc::new(list_array) as ArrayRef)
755}
756
757trait StringArrayBuilderType: ArrayBuilder {
758 fn append_value(&mut self, val: &str);
759
760 fn append_null(&mut self);
761}
762
763impl StringArrayBuilderType for StringBuilder {
764 fn append_value(&mut self, val: &str) {
765 StringBuilder::append_value(self, val);
766 }
767
768 fn append_null(&mut self) {
769 StringBuilder::append_null(self);
770 }
771}
772
773impl StringArrayBuilderType for StringViewBuilder {
774 fn append_value(&mut self, val: &str) {
775 StringViewBuilder::append_value(self, val)
776 }
777
778 fn append_null(&mut self) {
779 StringViewBuilder::append_null(self)
780 }
781}
782
783impl StringArrayBuilderType for LargeStringBuilder {
784 fn append_value(&mut self, val: &str) {
785 LargeStringBuilder::append_value(self, val);
786 }
787
788 fn append_null(&mut self) {
789 LargeStringBuilder::append_null(self);
790 }
791}