1use arrow::array::{
21 Array, ArrayRef, BooleanArray, Float32Array, Float64Array, GenericListArray,
22 Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, ListBuilder,
23 OffsetSizeTrait, StringArray, StringBuilder, UInt16Array, UInt32Array, UInt64Array,
24 UInt8Array,
25};
26use arrow::datatypes::{DataType, Field};
27use datafusion_expr::TypeSignature;
28
29use datafusion_common::{
30 internal_datafusion_err, not_impl_err, plan_err, DataFusionError, Result,
31};
32
33use std::any::Any;
34
35use crate::utils::make_scalar_function;
36use arrow::array::{
37 builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder},
38 cast::AsArray,
39 GenericStringArray, StringArrayType, StringViewArray,
40};
41use arrow::compute::cast;
42use arrow::datatypes::DataType::{
43 Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8, Utf8View,
44};
45use datafusion_common::cast::{as_large_list_array, as_list_array};
46use datafusion_common::exec_err;
47use datafusion_expr::{
48 ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
49};
50use datafusion_functions::{downcast_arg, downcast_named_arg};
51use datafusion_macros::user_doc;
52use std::sync::Arc;
53
/// Expands `array_function!` with the concrete Arrow array type that matches
/// the given `DataType` expression.
///
/// An `array_function!` macro accepting a single array-type identifier must be
/// in scope at the call site.
///
/// * `call_array_function!(data_type, false)` handles string, boolean, float
///   and integer element types.
/// * `call_array_function!(data_type, <expr>)` additionally maps
///   `DataType::List` to `ListArray`.
///
/// Every other data type evaluates to the error built by `not_impl_err!`.
macro_rules! call_array_function {
    ($data_type:expr, false) => {
        match $data_type {
            // Boolean
            DataType::Boolean => array_function!(BooleanArray),
            // Signed integers
            DataType::Int8 => array_function!(Int8Array),
            DataType::Int16 => array_function!(Int16Array),
            DataType::Int32 => array_function!(Int32Array),
            DataType::Int64 => array_function!(Int64Array),
            // Unsigned integers
            DataType::UInt8 => array_function!(UInt8Array),
            DataType::UInt16 => array_function!(UInt16Array),
            DataType::UInt32 => array_function!(UInt32Array),
            DataType::UInt64 => array_function!(UInt64Array),
            // Floating point
            DataType::Float32 => array_function!(Float32Array),
            DataType::Float64 => array_function!(Float64Array),
            // Strings
            DataType::Utf8 => array_function!(StringArray),
            DataType::Utf8View => array_function!(StringViewArray),
            DataType::LargeUtf8 => array_function!(LargeStringArray),
            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
        }
    };
    ($data_type:expr, $include_list:expr) => {{
        match $data_type {
            // Nested lists are only dispatched by this form of the macro.
            DataType::List(_) => array_function!(ListArray),
            // Boolean
            DataType::Boolean => array_function!(BooleanArray),
            // Signed integers
            DataType::Int8 => array_function!(Int8Array),
            DataType::Int16 => array_function!(Int16Array),
            DataType::Int32 => array_function!(Int32Array),
            DataType::Int64 => array_function!(Int64Array),
            // Unsigned integers
            DataType::UInt8 => array_function!(UInt8Array),
            DataType::UInt16 => array_function!(UInt16Array),
            DataType::UInt32 => array_function!(UInt32Array),
            DataType::UInt64 => array_function!(UInt64Array),
            // Floating point
            DataType::Float32 => array_function!(Float32Array),
            DataType::Float64 => array_function!(Float64Array),
            // Strings
            DataType::Utf8 => array_function!(StringArray),
            DataType::Utf8View => array_function!(StringViewArray),
            DataType::LargeUtf8 => array_function!(LargeStringArray),
            dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
        }
    }};
}
95
/// Appends the textual form of every element of `$ARRAY` (downcast to
/// `$ARRAY_TYPE`) to the string `$ARG`, writing `$DELIMITER` after each
/// appended element.
///
/// Null elements are skipped unless `$WITH_NULL_STRING` is true, in which case
/// `$NULL_STRING` followed by `$DELIMITER` is appended for them. The macro
/// evaluates to `Ok($ARG)`; a failed downcast returns early from the enclosing
/// function through `downcast_arg!`.
macro_rules! to_string {
    ($ARG:expr, $ARRAY:expr, $DELIMITER:expr, $NULL_STRING:expr, $WITH_NULL_STRING:expr, $ARRAY_TYPE:ident) => {{
        let typed_array = downcast_arg!($ARRAY, $ARRAY_TYPE);
        for element in typed_array {
            if let Some(value) = element {
                $ARG.push_str(&value.to_string());
                $ARG.push_str($DELIMITER);
            } else if $WITH_NULL_STRING {
                $ARG.push_str($NULL_STRING);
                $ARG.push_str($DELIMITER);
            }
        }
        Ok($ARG)
    }};
}
116
// Wires `ArrayToString` into the crate through `make_udf_expr_and_func!`.
// NOTE(review): the macro is defined elsewhere; judging by the argument names
// the operands are, in order: the UDF struct, the expression-builder function
// name, that function's argument names, its doc string, and the name of the
// function returning the UDF instance — confirm against the macro definition.
make_udf_expr_and_func!(
    ArrayToString,
    array_to_string,
    array delimiter, "converts each element to its text representation.", array_to_string_udf );
125
/// Scalar UDF implementing `array_to_string` (aliases: `list_to_string`,
/// `array_join`, `list_join`).
///
/// The user-facing documentation is supplied by the `user_doc` attribute below.
#[user_doc(
    doc_section(label = "Array Functions"),
    description = "Converts each element to its text representation.",
    syntax_example = "array_to_string(array, delimiter[, null_string])",
    sql_example = r#"```sql
> select array_to_string([[1, 2, 3, 4], [5, 6, 7, 8]], ',');
+----------------------------------------------------+
| array_to_string(List([1,2,3,4,5,6,7,8]),Utf8(",")) |
+----------------------------------------------------+
| 1,2,3,4,5,6,7,8 |
+----------------------------------------------------+
```"#,
    argument(
        name = "array",
        description = "Array expression. Can be a constant, column, or function, and any combination of array operators."
    ),
    argument(name = "delimiter", description = "Array element separator."),
    argument(
        name = "null_string",
        description = "Optional. String to replace null values in the array. If not provided, nulls will be handled by default behavior."
    )
)]
#[derive(Debug)]
pub struct ArrayToString {
    /// Argument signature reported through `ScalarUDFImpl::signature`.
    signature: Signature,
    /// Alternative function names reported through `ScalarUDFImpl::aliases`.
    aliases: Vec<String>,
}
153
154impl Default for ArrayToString {
155 fn default() -> Self {
156 Self::new()
157 }
158}
159
160impl ArrayToString {
161 pub fn new() -> Self {
162 Self {
163 signature: Signature::variadic_any(Volatility::Immutable),
164 aliases: vec![
165 String::from("list_to_string"),
166 String::from("array_join"),
167 String::from("list_join"),
168 ],
169 }
170 }
171}
172
173impl ScalarUDFImpl for ArrayToString {
174 fn as_any(&self) -> &dyn Any {
175 self
176 }
177
178 fn name(&self) -> &str {
179 "array_to_string"
180 }
181
182 fn signature(&self) -> &Signature {
183 &self.signature
184 }
185
186 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
187 Ok(match arg_types[0] {
188 List(_) | LargeList(_) | FixedSizeList(_, _) => Utf8,
189 _ => {
190 return plan_err!("The array_to_string function can only accept List/LargeList/FixedSizeList.");
191 }
192 })
193 }
194
195 fn invoke_with_args(
196 &self,
197 args: datafusion_expr::ScalarFunctionArgs,
198 ) -> Result<ColumnarValue> {
199 make_scalar_function(array_to_string_inner)(&args.args)
200 }
201
202 fn aliases(&self) -> &[String] {
203 &self.aliases
204 }
205
206 fn documentation(&self) -> Option<&Documentation> {
207 self.doc()
208 }
209}
210
// Wires `StringToArray` into the crate through `make_udf_expr_and_func!`.
// NOTE(review): the macro is defined elsewhere; judging by the argument names
// the operands are, in order: the UDF struct, the expression-builder function
// name, that function's argument names, its doc string, and the name of the
// function returning the UDF instance — confirm against the macro definition.
make_udf_expr_and_func!(
    StringToArray,
    string_to_array,
    string delimiter null_string, "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", string_to_array_udf );
218
/// Scalar UDF implementing `string_to_array` (alias: `string_to_list`).
///
/// The user-facing documentation is supplied by the `user_doc` attribute below.
#[user_doc(
    doc_section(label = "Array Functions"),
    description = "Splits a string into an array of substrings based on a delimiter. Any substrings matching the optional `null_str` argument are replaced with NULL.",
    syntax_example = "string_to_array(str, delimiter[, null_str])",
    sql_example = r#"```sql
> select string_to_array('abc##def', '##');
+-----------------------------------+
| string_to_array(Utf8('abc##def')) |
+-----------------------------------+
| ['abc', 'def'] |
+-----------------------------------+
> select string_to_array('abc def', ' ', 'def');
+---------------------------------------------+
| string_to_array(Utf8('abc def'), Utf8(' '), Utf8('def')) |
+---------------------------------------------+
| ['abc', NULL] |
+---------------------------------------------+
```"#,
    argument(name = "str", description = "String expression to split."),
    argument(name = "delimiter", description = "Delimiter string to split on."),
    argument(
        name = "null_str",
        description = "Substring values to be replaced with `NULL`."
    )
)]
#[derive(Debug)]
pub(super) struct StringToArray {
    /// Argument signature reported through `ScalarUDFImpl::signature`.
    signature: Signature,
    /// Alternative function names reported through `ScalarUDFImpl::aliases`.
    aliases: Vec<String>,
}
249
250impl StringToArray {
251 pub fn new() -> Self {
252 Self {
253 signature: Signature::one_of(
254 vec![TypeSignature::String(2), TypeSignature::String(3)],
255 Volatility::Immutable,
256 ),
257 aliases: vec![String::from("string_to_list")],
258 }
259 }
260}
261
262impl ScalarUDFImpl for StringToArray {
263 fn as_any(&self) -> &dyn Any {
264 self
265 }
266
267 fn name(&self) -> &str {
268 "string_to_array"
269 }
270
271 fn signature(&self) -> &Signature {
272 &self.signature
273 }
274
275 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
276 Ok(match arg_types[0] {
277 Utf8 | Utf8View | LargeUtf8 => {
278 List(Arc::new(Field::new_list_field(arg_types[0].clone(), true)))
279 }
280 _ => {
281 return plan_err!(
282 "The string_to_array function can only accept Utf8, Utf8View or LargeUtf8."
283 );
284 }
285 })
286 }
287
288 fn invoke_with_args(
289 &self,
290 args: datafusion_expr::ScalarFunctionArgs,
291 ) -> Result<ColumnarValue> {
292 let args = &args.args;
293 match args[0].data_type() {
294 Utf8 | Utf8View => make_scalar_function(string_to_array_inner::<i32>)(args),
295 LargeUtf8 => make_scalar_function(string_to_array_inner::<i64>)(args),
296 other => {
297 exec_err!("unsupported type for string_to_array function as {other:?}")
298 }
299 }
300 }
301
302 fn aliases(&self) -> &[String] {
303 &self.aliases
304 }
305
306 fn documentation(&self) -> Option<&Documentation> {
307 self.doc()
308 }
309}
310
311pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
313 if args.len() < 2 || args.len() > 3 {
314 return exec_err!("array_to_string expects two or three arguments");
315 }
316
317 let arr = &args[0];
318
319 let delimiters: Vec<Option<&str>> = match args[1].data_type() {
320 Utf8 => args[1].as_string::<i32>().iter().collect(),
321 Utf8View => args[1].as_string_view().iter().collect(),
322 LargeUtf8 => args[1].as_string::<i64>().iter().collect(),
323 other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
324 };
325
326 let mut null_string = String::from("");
327 let mut with_null_string = false;
328 if args.len() == 3 {
329 null_string = match args[2].data_type() {
330 Utf8 => args[2].as_string::<i32>().value(0).to_string(),
331 Utf8View => args[2].as_string_view().value(0).to_string(),
332 LargeUtf8 => args[2].as_string::<i64>().value(0).to_string(),
333 other => return exec_err!("unsupported type for second argument to array_to_string function as {other:?}")
334 };
335 with_null_string = true;
336 }
337
338 fn compute_array_to_string(
341 arg: &mut String,
342 arr: ArrayRef,
343 delimiter: String,
344 null_string: String,
345 with_null_string: bool,
346 ) -> Result<&mut String> {
347 match arr.data_type() {
348 List(..) => {
349 let list_array = as_list_array(&arr)?;
350 for i in 0..list_array.len() {
351 compute_array_to_string(
352 arg,
353 list_array.value(i),
354 delimiter.clone(),
355 null_string.clone(),
356 with_null_string,
357 )?;
358 }
359
360 Ok(arg)
361 }
362 LargeList(..) => {
363 let list_array = as_large_list_array(&arr)?;
364 for i in 0..list_array.len() {
365 compute_array_to_string(
366 arg,
367 list_array.value(i),
368 delimiter.clone(),
369 null_string.clone(),
370 with_null_string,
371 )?;
372 }
373
374 Ok(arg)
375 }
376 Dictionary(_key_type, value_type) => {
377 let values = cast(&arr, value_type.as_ref()).map_err(|e| {
380 DataFusionError::from(e).context(
381 "Casting dictionary to values in compute_array_to_string",
382 )
383 })?;
384 compute_array_to_string(
385 arg,
386 values,
387 delimiter,
388 null_string,
389 with_null_string,
390 )
391 }
392 Null => Ok(arg),
393 data_type => {
394 macro_rules! array_function {
395 ($ARRAY_TYPE:ident) => {
396 to_string!(
397 arg,
398 arr,
399 &delimiter,
400 &null_string,
401 with_null_string,
402 $ARRAY_TYPE
403 )
404 };
405 }
406 call_array_function!(data_type, false)
407 }
408 }
409 }
410
411 fn generate_string_array<O: OffsetSizeTrait>(
412 list_arr: &GenericListArray<O>,
413 delimiters: Vec<Option<&str>>,
414 null_string: String,
415 with_null_string: bool,
416 ) -> Result<StringArray> {
417 let mut res: Vec<Option<String>> = Vec::new();
418 for (arr, &delimiter) in list_arr.iter().zip(delimiters.iter()) {
419 if let (Some(arr), Some(delimiter)) = (arr, delimiter) {
420 let mut arg = String::from("");
421 let s = compute_array_to_string(
422 &mut arg,
423 arr,
424 delimiter.to_string(),
425 null_string.clone(),
426 with_null_string,
427 )?
428 .clone();
429
430 if let Some(s) = s.strip_suffix(delimiter) {
431 res.push(Some(s.to_string()));
432 } else {
433 res.push(Some(s));
434 }
435 } else {
436 res.push(None);
437 }
438 }
439
440 Ok(StringArray::from(res))
441 }
442
443 let arr_type = arr.data_type();
444 let string_arr = match arr_type {
445 List(_) | FixedSizeList(_, _) => {
446 let list_array = as_list_array(&arr)?;
447 generate_string_array::<i32>(
448 list_array,
449 delimiters,
450 null_string,
451 with_null_string,
452 )?
453 }
454 LargeList(_) => {
455 let list_array = as_large_list_array(&arr)?;
456 generate_string_array::<i64>(
457 list_array,
458 delimiters,
459 null_string,
460 with_null_string,
461 )?
462 }
463 _ => {
464 let mut arg = String::from("");
465 let mut res: Vec<Option<String>> = Vec::new();
466 assert_eq!(delimiters.len(), 1);
468 let delimiter = delimiters[0].unwrap();
469 let s = compute_array_to_string(
470 &mut arg,
471 Arc::clone(arr),
472 delimiter.to_string(),
473 null_string,
474 with_null_string,
475 )?
476 .clone();
477
478 if !s.is_empty() {
479 let s = s.strip_suffix(delimiter).unwrap().to_string();
480 res.push(Some(s));
481 } else {
482 res.push(Some(s));
483 }
484 StringArray::from(res)
485 }
486 };
487
488 Ok(Arc::new(string_arr))
489}
490
491fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
495 if args.len() < 2 || args.len() > 3 {
496 return exec_err!("string_to_array expects two or three arguments");
497 }
498
499 match args[0].data_type() {
500 Utf8 => {
501 let string_array = args[0].as_string::<T>();
502 let builder = StringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
503 string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(args, string_array, builder)
504 }
505 Utf8View => {
506 let string_array = args[0].as_string_view();
507 let builder = StringViewBuilder::with_capacity(string_array.len());
508 string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(args, string_array, builder)
509 }
510 LargeUtf8 => {
511 let string_array = args[0].as_string::<T>();
512 let builder = LargeStringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size());
513 string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(args, string_array, builder)
514 }
515 other => exec_err!("unsupported type for first argument to string_to_array function as {other:?}")
516 }
517}
518
519fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>(
520 args: &'a [ArrayRef],
521 string_array: StringArrType,
522 string_builder: StringBuilderType,
523) -> Result<ArrayRef>
524where
525 StringArrType: StringArrayType<'a>,
526 StringBuilderType: StringArrayBuilderType,
527{
528 match args[1].data_type() {
529 Utf8 => {
530 let delimiter_array = args[1].as_string::<i32>();
531 if args.len() == 2 {
532 string_to_array_impl::<
533 StringArrType,
534 &GenericStringArray<i32>,
535 &StringViewArray,
536 StringBuilderType,
537 >(string_array, delimiter_array, None, string_builder)
538 } else {
539 string_to_array_inner_3::<StringArrType,
540 &GenericStringArray<i32>,
541 StringBuilderType>(args, string_array, delimiter_array, string_builder)
542 }
543 }
544 Utf8View => {
545 let delimiter_array = args[1].as_string_view();
546
547 if args.len() == 2 {
548 string_to_array_impl::<
549 StringArrType,
550 &StringViewArray,
551 &StringViewArray,
552 StringBuilderType,
553 >(string_array, delimiter_array, None, string_builder)
554 } else {
555 string_to_array_inner_3::<StringArrType,
556 &StringViewArray,
557 StringBuilderType>(args, string_array, delimiter_array, string_builder)
558 }
559 }
560 LargeUtf8 => {
561 let delimiter_array = args[1].as_string::<i64>();
562 if args.len() == 2 {
563 string_to_array_impl::<
564 StringArrType,
565 &GenericStringArray<i64>,
566 &StringViewArray,
567 StringBuilderType,
568 >(string_array, delimiter_array, None, string_builder)
569 } else {
570 string_to_array_inner_3::<StringArrType,
571 &GenericStringArray<i64>,
572 StringBuilderType>(args, string_array, delimiter_array, string_builder)
573 }
574 }
575 other => exec_err!("unsupported type for second argument to string_to_array function as {other:?}")
576 }
577}
578
579fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>(
580 args: &'a [ArrayRef],
581 string_array: StringArrType,
582 delimiter_array: DelimiterArrType,
583 string_builder: StringBuilderType,
584) -> Result<ArrayRef>
585where
586 StringArrType: StringArrayType<'a>,
587 DelimiterArrType: StringArrayType<'a>,
588 StringBuilderType: StringArrayBuilderType,
589{
590 match args[2].data_type() {
591 Utf8 => {
592 let null_type_array = Some(args[2].as_string::<i32>());
593 string_to_array_impl::<
594 StringArrType,
595 DelimiterArrType,
596 &GenericStringArray<i32>,
597 StringBuilderType,
598 >(
599 string_array,
600 delimiter_array,
601 null_type_array,
602 string_builder,
603 )
604 }
605 Utf8View => {
606 let null_type_array = Some(args[2].as_string_view());
607 string_to_array_impl::<
608 StringArrType,
609 DelimiterArrType,
610 &StringViewArray,
611 StringBuilderType,
612 >(
613 string_array,
614 delimiter_array,
615 null_type_array,
616 string_builder,
617 )
618 }
619 LargeUtf8 => {
620 let null_type_array = Some(args[2].as_string::<i64>());
621 string_to_array_impl::<
622 StringArrType,
623 DelimiterArrType,
624 &GenericStringArray<i64>,
625 StringBuilderType,
626 >(
627 string_array,
628 delimiter_array,
629 null_type_array,
630 string_builder,
631 )
632 }
633 other => {
634 exec_err!("unsupported type for string_to_array function as {other:?}")
635 }
636 }
637}
638
639fn string_to_array_impl<
640 'a,
641 StringArrType,
642 DelimiterArrType,
643 NullValueArrType,
644 StringBuilderType,
645>(
646 string_array: StringArrType,
647 delimiter_array: DelimiterArrType,
648 null_value_array: Option<NullValueArrType>,
649 string_builder: StringBuilderType,
650) -> Result<ArrayRef>
651where
652 StringArrType: StringArrayType<'a>,
653 DelimiterArrType: StringArrayType<'a>,
654 NullValueArrType: StringArrayType<'a>,
655 StringBuilderType: StringArrayBuilderType,
656{
657 let mut list_builder = ListBuilder::new(string_builder);
658
659 match null_value_array {
660 None => {
661 string_array.iter().zip(delimiter_array.iter()).for_each(
662 |(string, delimiter)| {
663 match (string, delimiter) {
664 (Some(string), Some("")) => {
665 list_builder.values().append_value(string);
666 list_builder.append(true);
667 }
668 (Some(string), Some(delimiter)) => {
669 string.split(delimiter).for_each(|s| {
670 list_builder.values().append_value(s);
671 });
672 list_builder.append(true);
673 }
674 (Some(string), None) => {
675 string.chars().map(|c| c.to_string()).for_each(|c| {
676 list_builder.values().append_value(c.as_str());
677 });
678 list_builder.append(true);
679 }
680 _ => list_builder.append(false), }
682 },
683 )
684 }
685 Some(null_value_array) => string_array
686 .iter()
687 .zip(delimiter_array.iter())
688 .zip(null_value_array.iter())
689 .for_each(|((string, delimiter), null_value)| {
690 match (string, delimiter) {
691 (Some(string), Some("")) => {
692 if Some(string) == null_value {
693 list_builder.values().append_null();
694 } else {
695 list_builder.values().append_value(string);
696 }
697 list_builder.append(true);
698 }
699 (Some(string), Some(delimiter)) => {
700 string.split(delimiter).for_each(|s| {
701 if Some(s) == null_value {
702 list_builder.values().append_null();
703 } else {
704 list_builder.values().append_value(s);
705 }
706 });
707 list_builder.append(true);
708 }
709 (Some(string), None) => {
710 string.chars().map(|c| c.to_string()).for_each(|c| {
711 if Some(c.as_str()) == null_value {
712 list_builder.values().append_null();
713 } else {
714 list_builder.values().append_value(c.as_str());
715 }
716 });
717 list_builder.append(true);
718 }
719 _ => list_builder.append(false), }
721 }),
722 };
723
724 let list_array = list_builder.finish();
725 Ok(Arc::new(list_array) as ArrayRef)
726}
727
/// Common append interface over the string builders used for the list values
/// in `string_to_array`, so the split logic can be generic over the builder.
trait StringArrayBuilderType: ArrayBuilder {
    /// Appends the non-null string `val`.
    fn append_value(&mut self, val: &str);

    /// Appends a null entry.
    fn append_null(&mut self);
}
733
/// `StringArrayBuilderType` for `StringBuilder`; both methods delegate to the
/// builder's own methods through an explicit `StringBuilder::` path.
impl StringArrayBuilderType for StringBuilder {
    /// Delegates to `StringBuilder::append_value`.
    fn append_value(&mut self, val: &str) {
        StringBuilder::append_value(self, val);
    }

    /// Delegates to `StringBuilder::append_null`.
    fn append_null(&mut self) {
        StringBuilder::append_null(self);
    }
}
743
/// `StringArrayBuilderType` for `StringViewBuilder`; both methods delegate to
/// the builder's own methods through an explicit `StringViewBuilder::` path.
impl StringArrayBuilderType for StringViewBuilder {
    /// Delegates to `StringViewBuilder::append_value`.
    fn append_value(&mut self, val: &str) {
        StringViewBuilder::append_value(self, val)
    }

    /// Delegates to `StringViewBuilder::append_null`.
    fn append_null(&mut self) {
        StringViewBuilder::append_null(self)
    }
}
753
/// `StringArrayBuilderType` for `LargeStringBuilder`; both methods delegate to
/// the builder's own methods through an explicit `LargeStringBuilder::` path.
impl StringArrayBuilderType for LargeStringBuilder {
    /// Delegates to `LargeStringBuilder::append_value`.
    fn append_value(&mut self, val: &str) {
        LargeStringBuilder::append_value(self, val);
    }

    /// Delegates to `LargeStringBuilder::append_null`.
    fn append_null(&mut self) {
        LargeStringBuilder::append_null(self);
    }
}