1use std::fmt::Write;
19use std::sync::Arc;
20
21use DataType::{LargeUtf8, Utf8, Utf8View};
22use arrow::array::{
23 Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
24 OffsetSizeTrait, StringArrayType, StringViewArray,
25};
26use arrow::datatypes::DataType;
27
28use crate::utils::{make_scalar_function, utf8_to_str_type};
29use datafusion_common::cast::as_int64_array;
30use datafusion_common::{Result, exec_err};
31use datafusion_expr::TypeSignature::Exact;
32use datafusion_expr::{
33 ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
34 Volatility,
35};
36use datafusion_macros::user_doc;
37
38#[user_doc(
39 doc_section(label = "String Functions"),
40 description = "Pads the left side of a string with another string to a specified string length.",
41 syntax_example = "lpad(str, n[, padding_str])",
42 sql_example = r#"```sql
43> select lpad('Dolly', 10, 'hello');
44+---------------------------------------------+
45| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) |
46+---------------------------------------------+
47| helloDolly |
48+---------------------------------------------+
49```"#,
50 standard_argument(name = "str", prefix = "String"),
51 argument(
52 name = "n",
53 description = "String length to pad to. If the input string is longer than this length, it is truncated (on the right)."
54 ),
55 argument(
56 name = "padding_str",
57 description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
58 ),
59 related_udf(name = "rpad")
60)]
61#[derive(Debug, PartialEq, Eq, Hash)]
62pub struct LPadFunc {
63 signature: Signature,
64}
65
66impl Default for LPadFunc {
67 fn default() -> Self {
68 Self::new()
69 }
70}
71
72impl LPadFunc {
73 pub fn new() -> Self {
74 use DataType::*;
75 Self {
76 signature: Signature::one_of(
77 vec![
78 Exact(vec![Utf8View, Int64]),
79 Exact(vec![Utf8View, Int64, Utf8View]),
80 Exact(vec![Utf8View, Int64, Utf8]),
81 Exact(vec![Utf8View, Int64, LargeUtf8]),
82 Exact(vec![Utf8, Int64]),
83 Exact(vec![Utf8, Int64, Utf8View]),
84 Exact(vec![Utf8, Int64, Utf8]),
85 Exact(vec![Utf8, Int64, LargeUtf8]),
86 Exact(vec![LargeUtf8, Int64]),
87 Exact(vec![LargeUtf8, Int64, Utf8View]),
88 Exact(vec![LargeUtf8, Int64, Utf8]),
89 Exact(vec![LargeUtf8, Int64, LargeUtf8]),
90 ],
91 Volatility::Immutable,
92 ),
93 }
94 }
95}
96
97impl ScalarUDFImpl for LPadFunc {
98 fn name(&self) -> &str {
99 "lpad"
100 }
101
102 fn signature(&self) -> &Signature {
103 &self.signature
104 }
105
106 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
107 utf8_to_str_type(&arg_types[0], "lpad")
108 }
109
110 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
111 let ScalarFunctionArgs {
112 args, number_rows, ..
113 } = args;
114
115 const MAX_SCALAR_TARGET_LEN: usize = 16384;
116
117 if let Some(target_len) = try_as_scalar_i64(&args[1]) {
120 let target_len: usize = match usize::try_from(target_len) {
121 Ok(n) if n <= i32::MAX as usize => n,
122 Ok(n) => {
123 return exec_err!(
124 "lpad requested length {n} too large, maximum allowed length is {}",
125 i32::MAX
126 );
127 }
128 Err(_) => 0, };
130
131 let fill_str = if args.len() == 3 {
132 try_as_scalar_str(&args[2])
133 } else {
134 Some(" ")
135 };
136
137 if target_len <= MAX_SCALAR_TARGET_LEN
141 && let Some(fill) = fill_str
142 {
143 let string_array = args[0].to_array_of_size(number_rows)?;
144 let result = match string_array.data_type() {
145 Utf8View => lpad_scalar_args::<_, i32>(
146 string_array.as_string_view(),
147 target_len,
148 fill,
149 ),
150 Utf8 => lpad_scalar_args::<_, i32>(
151 string_array.as_string::<i32>(),
152 target_len,
153 fill,
154 ),
155 LargeUtf8 => lpad_scalar_args::<_, i64>(
156 string_array.as_string::<i64>(),
157 target_len,
158 fill,
159 ),
160 other => {
161 exec_err!("Unsupported data type {other:?} for function lpad")
162 }
163 }?;
164 return Ok(ColumnarValue::Array(result));
165 }
166 }
167
168 match args[0].data_type() {
169 Utf8 | Utf8View => make_scalar_function(lpad::<i32>, vec![])(&args),
170 LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(&args),
171 other => exec_err!("Unsupported data type {other:?} for function lpad"),
172 }
173 }
174
175 fn documentation(&self) -> Option<&Documentation> {
176 self.doc()
177 }
178}
179
180use super::common::{
181 StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str,
182};
183
184fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
186 string_array: V,
187 target_len: usize,
188 fill: &str,
189) -> Result<ArrayRef> {
190 if string_array.is_ascii() && fill.is_ascii() {
191 lpad_scalar_ascii::<V, T>(string_array, target_len, fill)
192 } else {
193 lpad_scalar_unicode::<V, T>(string_array, target_len, fill)
194 }
195}
196
197fn lpad_scalar_ascii<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
198 string_array: V,
199 target_len: usize,
200 fill: &str,
201) -> Result<ArrayRef> {
202 let padding_buf = if !fill.is_empty() {
205 let mut buf = String::with_capacity(target_len);
206 while buf.len() < target_len {
207 let remaining = target_len - buf.len();
208 if remaining >= fill.len() {
209 buf.push_str(fill);
210 } else {
211 buf.push_str(&fill[..remaining]);
212 }
213 }
214 buf
215 } else {
216 String::new()
217 };
218
219 let data_capacity = string_array.len().saturating_mul(target_len);
221 let mut builder =
222 GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
223
224 for maybe_string in string_array.iter() {
225 match maybe_string {
226 Some(string) => {
227 let str_len = string.len();
228 if target_len <= str_len {
229 builder.append_value(&string[..target_len]);
230 } else if fill.is_empty() {
231 builder.append_value(string);
232 } else {
233 let pad_needed = target_len - str_len;
234 builder.write_str(&padding_buf[..pad_needed])?;
235 builder.append_value(string);
236 }
237 }
238 None => builder.append_null(),
239 }
240 }
241
242 Ok(Arc::new(builder.finish()) as ArrayRef)
243}
244
245fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
246 string_array: V,
247 target_len: usize,
248 fill: &str,
249) -> Result<ArrayRef> {
250 let fill_chars: Vec<char> = fill.chars().collect();
251
252 let (padding_buf, char_byte_offsets) = if !fill_chars.is_empty() {
257 let mut buf = String::new();
258 let mut offsets = Vec::with_capacity(target_len + 1);
259 offsets.push(0usize);
260 for i in 0..target_len {
261 buf.push(fill_chars[i % fill_chars.len()]);
262 offsets.push(buf.len());
263 }
264 (buf, offsets)
265 } else {
266 (String::new(), vec![0])
267 };
268
269 let data_capacity = string_array.len().saturating_mul(target_len * 4);
272 let mut builder =
273 GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
274
275 for maybe_string in string_array.iter() {
276 match maybe_string {
277 Some(string) => match char_count_or_boundary(string, target_len) {
278 StringCharLen::ByteOffset(offset) => {
279 builder.append_value(&string[..offset]);
280 }
281 StringCharLen::CharCount(char_count) => {
282 if !fill_chars.is_empty() {
283 let pad_chars = target_len - char_count;
284 let pad_bytes = char_byte_offsets[pad_chars];
285 builder.write_str(&padding_buf[..pad_bytes])?;
286 }
287 builder.append_value(string);
288 }
289 },
290 None => builder.append_null(),
291 }
292 }
293
294 Ok(Arc::new(builder.finish()) as ArrayRef)
295}
296
297fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
301 if args.len() <= 1 || args.len() > 3 {
302 return exec_err!(
303 "lpad was called with {} arguments. It requires at least 2 and at most 3.",
304 args.len()
305 );
306 }
307
308 let length_array = as_int64_array(&args[1])?;
309
310 match (args.len(), args[0].data_type()) {
311 (2, Utf8View) => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, T>(
312 &args[0].as_string_view(),
313 length_array,
314 None,
315 ),
316 (2, Utf8 | LargeUtf8) => lpad_impl::<
317 &GenericStringArray<T>,
318 &GenericStringArray<T>,
319 T,
320 >(&args[0].as_string::<T>(), length_array, None),
321 (3, Utf8View) => lpad_with_replace::<&StringViewArray, T>(
322 &args[0].as_string_view(),
323 length_array,
324 &args[2],
325 ),
326 (3, Utf8 | LargeUtf8) => lpad_with_replace::<&GenericStringArray<T>, T>(
327 &args[0].as_string::<T>(),
328 length_array,
329 &args[2],
330 ),
331 (len, dt) => unreachable!("lpad: unexpected arg count ({len}) or type ({dt})"),
332 }
333}
334
335fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
336 string_array: &V,
337 length_array: &Int64Array,
338 fill_array: &'a ArrayRef,
339) -> Result<ArrayRef>
340where
341 V: StringArrayType<'a>,
342{
343 match fill_array.data_type() {
344 Utf8View => lpad_impl::<V, &StringViewArray, T>(
345 string_array,
346 length_array,
347 Some(fill_array.as_string_view()),
348 ),
349 LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
350 string_array,
351 length_array,
352 Some(fill_array.as_string::<i64>()),
353 ),
354 Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
355 string_array,
356 length_array,
357 Some(fill_array.as_string::<i32>()),
358 ),
359 other => {
360 exec_err!("Unsupported data type {other:?} for function lpad")
361 }
362 }
363}
364
365fn lpad_impl<'a, V, V2, T>(
366 string_array: &V,
367 length_array: &Int64Array,
368 fill_array: Option<V2>,
369) -> Result<ArrayRef>
370where
371 V: StringArrayType<'a>,
372 V2: StringArrayType<'a>,
373 T: OffsetSizeTrait,
374{
375 let array = if let Some(fill_array) = fill_array {
376 let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
377 let mut fill_chars_buf = Vec::new();
378
379 for ((string, target_len), fill) in string_array
380 .iter()
381 .zip(length_array.iter())
382 .zip(fill_array.iter())
383 {
384 if let (Some(string), Some(target_len), Some(fill)) =
385 (string, target_len, fill)
386 {
387 if target_len > i32::MAX as i64 {
388 return exec_err!(
389 "lpad requested length {target_len} too large, maximum allowed length is {}",
390 i32::MAX
391 );
392 }
393
394 let target_len = if target_len < 0 {
395 0
396 } else {
397 target_len as usize
398 };
399 if target_len == 0 {
400 builder.append_value("");
401 continue;
402 }
403
404 if string.is_ascii() && fill.is_ascii() {
405 let str_len = string.len();
407 if target_len < str_len {
408 builder.append_value(&string[..target_len]);
409 } else if fill.is_empty() {
410 builder.append_value(string);
411 } else {
412 let pad_len = target_len - str_len;
413 let fill_len = fill.len();
414 let full_reps = pad_len / fill_len;
415 let remainder = pad_len % fill_len;
416 for _ in 0..full_reps {
417 builder.write_str(fill)?;
418 }
419 if remainder > 0 {
420 builder.write_str(&fill[..remainder])?;
421 }
422 builder.append_value(string);
423 }
424 } else {
425 fill_chars_buf.clear();
426 fill_chars_buf.extend(fill.chars());
427
428 match char_count_or_boundary(string, target_len) {
429 StringCharLen::ByteOffset(offset) => {
430 builder.append_value(&string[..offset]);
431 }
432 StringCharLen::CharCount(char_count) => {
433 if !fill_chars_buf.is_empty() {
434 for l in 0..target_len - char_count {
435 let c = *fill_chars_buf
436 .get(l % fill_chars_buf.len())
437 .unwrap();
438 builder.write_char(c)?;
439 }
440 }
441 builder.append_value(string);
442 }
443 }
444 }
445 } else {
446 builder.append_null();
447 }
448 }
449
450 builder.finish()
451 } else {
452 let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
453
454 for (string, target_len) in string_array.iter().zip(length_array.iter()) {
455 if let (Some(string), Some(target_len)) = (string, target_len) {
456 if target_len > i32::MAX as i64 {
457 return exec_err!(
458 "lpad requested length {target_len} too large, maximum allowed length is {}",
459 i32::MAX
460 );
461 }
462
463 let target_len = if target_len < 0 {
464 0
465 } else {
466 target_len as usize
467 };
468 if target_len == 0 {
469 builder.append_value("");
470 continue;
471 }
472
473 if string.is_ascii() {
474 let str_len = string.len();
476 if target_len < str_len {
477 builder.append_value(&string[..target_len]);
478 } else {
479 for _ in 0..(target_len - str_len) {
480 builder.write_str(" ")?;
481 }
482 builder.append_value(string);
483 }
484 } else {
485 match char_count_or_boundary(string, target_len) {
486 StringCharLen::ByteOffset(offset) => {
487 builder.append_value(&string[..offset]);
488 }
489 StringCharLen::CharCount(char_count) => {
490 for _ in 0..(target_len - char_count) {
491 builder.write_str(" ")?;
492 }
493 builder.append_value(string);
494 }
495 }
496 }
497 } else {
498 builder.append_null();
499 }
500 }
501
502 builder.finish()
503 };
504
505 Ok(Arc::new(array) as ArrayRef)
506}
507
#[cfg(test)]
mod tests {
    use crate::unicode::lpad::LPadFunc;
    use crate::utils::test::test_function;

    use arrow::array::{Array, LargeStringArray, StringArray};
    use arrow::datatypes::DataType::{LargeUtf8, Utf8};

    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    /// Runs one `lpad` case against every supported combination of string
    /// types, with all arguments passed as scalars.
    ///
    /// * 3-argument form `(input, length, expected)`: no padding argument.
    /// * 4-argument form `(input, length, replace, expected)`: each input
    ///   string type is combined with each padding string type.
    ///
    /// `Utf8` and `Utf8View` inputs are expected to produce a `Utf8`
    /// `StringArray`; `LargeUtf8` inputs a `LargeUtf8` `LargeStringArray`.
    macro_rules! test_lpad {
        ($INPUT:expr, $LENGTH:expr, $EXPECTED:expr) => {
            // utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );

            // largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );

            // utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
        };

        ($INPUT:expr, $LENGTH:expr, $REPLACE:expr, $EXPECTED:expr) => {
            // utf8, utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8, largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8, utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );

            // largeutf8, utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );
            // largeutf8, largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );
            // largeutf8, utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );

            // utf8view, utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8view, largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8view, utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
        };
    }

    /// Covers default-space padding, explicit padding, truncation, empty
    /// padding, null propagation and non-ASCII inputs.
    #[test]
    fn test_functions() -> Result<()> {
        // Non-ASCII input is measured in characters: 4 chars + 1 space.
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(5i64)),
            Ok(Some(" josé"))
        );
        // Two characters padded to five requires three leading spaces.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Ok(Some("   hi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(0i64)),
            Ok(Some(""))
        );
        test_lpad!(Some("hi".into()), ScalarValue::Int64(None), Ok(None));
        test_lpad!(None, ScalarValue::Int64(Some(5i64)), Ok(None));
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some("xy".into()),
            Ok(Some("xyxhi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(21i64)),
            Some("abcdef".into()),
            Ok(Some("abcdefabcdefabcdefahi"))
        );
        // An explicit space fill behaves like the default: three spaces.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some(" ".into()),
            Ok(Some("   hi"))
        );
        // An empty fill leaves the string unchanged.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some("".into()),
            Ok(Some("hi"))
        );
        test_lpad!(
            None,
            ScalarValue::Int64(Some(5i64)),
            Some("xy".into()),
            Ok(None)
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(None),
            Some("xy".into()),
            Ok(None)
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            None,
            Ok(None)
        );
        // Longer than the target: truncated on the right.
        test_lpad!(
            Some("hello".into()),
            ScalarValue::Int64(Some(2i64)),
            Ok(Some("he"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(6i64)),
            Some("xy".into()),
            Ok(Some("xyxyhi"))
        );
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(10i64)),
            Some("xy".into()),
            Ok(Some("xyxyxyjosé"))
        );
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(10i64)),
            Some("éñ".into()),
            Ok(Some("éñéñéñjosé"))
        );

        #[cfg(not(feature = "unicode_expressions"))]
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(5i64)),
            internal_err!(
                "function lpad requires compilation with feature flag: unicode_expressions."
            )
        );

        Ok(())
    }
}