1use std::any::Any;
19use std::fmt::Write;
20use std::sync::Arc;
21
22use DataType::{LargeUtf8, Utf8, Utf8View};
23use arrow::array::{
24 Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
25 OffsetSizeTrait, StringArrayType, StringViewArray,
26};
27use arrow::datatypes::DataType;
28use unicode_segmentation::UnicodeSegmentation;
29
30use crate::utils::{make_scalar_function, utf8_to_str_type};
31use datafusion_common::cast::as_int64_array;
32use datafusion_common::{Result, exec_err};
33use datafusion_expr::TypeSignature::Exact;
34use datafusion_expr::{
35 ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
36};
37use datafusion_macros::user_doc;
38
39#[user_doc(
40 doc_section(label = "String Functions"),
41 description = "Pads the left side of a string with another string to a specified string length.",
42 syntax_example = "lpad(str, n[, padding_str])",
43 sql_example = r#"```sql
44> select lpad('Dolly', 10, 'hello');
45+---------------------------------------------+
46| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) |
47+---------------------------------------------+
48| helloDolly |
49+---------------------------------------------+
50```"#,
51 standard_argument(name = "str", prefix = "String"),
52 argument(name = "n", description = "String length to pad to."),
53 argument(
54 name = "padding_str",
55 description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
56 ),
57 related_udf(name = "rpad")
58)]
59#[derive(Debug, PartialEq, Eq, Hash)]
60pub struct LPadFunc {
61 signature: Signature,
62}
63
64impl Default for LPadFunc {
65 fn default() -> Self {
66 Self::new()
67 }
68}
69
70impl LPadFunc {
71 pub fn new() -> Self {
72 use DataType::*;
73 Self {
74 signature: Signature::one_of(
75 vec![
76 Exact(vec![Utf8View, Int64]),
77 Exact(vec![Utf8View, Int64, Utf8View]),
78 Exact(vec![Utf8View, Int64, Utf8]),
79 Exact(vec![Utf8View, Int64, LargeUtf8]),
80 Exact(vec![Utf8, Int64]),
81 Exact(vec![Utf8, Int64, Utf8View]),
82 Exact(vec![Utf8, Int64, Utf8]),
83 Exact(vec![Utf8, Int64, LargeUtf8]),
84 Exact(vec![LargeUtf8, Int64]),
85 Exact(vec![LargeUtf8, Int64, Utf8View]),
86 Exact(vec![LargeUtf8, Int64, Utf8]),
87 Exact(vec![LargeUtf8, Int64, LargeUtf8]),
88 ],
89 Volatility::Immutable,
90 ),
91 }
92 }
93}
94
95impl ScalarUDFImpl for LPadFunc {
96 fn as_any(&self) -> &dyn Any {
97 self
98 }
99
100 fn name(&self) -> &str {
101 "lpad"
102 }
103
104 fn signature(&self) -> &Signature {
105 &self.signature
106 }
107
108 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
109 utf8_to_str_type(&arg_types[0], "lpad")
110 }
111
112 fn invoke_with_args(
113 &self,
114 args: datafusion_expr::ScalarFunctionArgs,
115 ) -> Result<ColumnarValue> {
116 let args = &args.args;
117 match args[0].data_type() {
118 Utf8 | Utf8View => make_scalar_function(lpad::<i32>, vec![])(args),
119 LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args),
120 other => exec_err!("Unsupported data type {other:?} for function lpad"),
121 }
122 }
123
124 fn documentation(&self) -> Option<&Documentation> {
125 self.doc()
126 }
127}
128
129fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
133 if args.len() <= 1 || args.len() > 3 {
134 return exec_err!(
135 "lpad was called with {} arguments. It requires at least 2 and at most 3.",
136 args.len()
137 );
138 }
139
140 let length_array = as_int64_array(&args[1])?;
141
142 match (args.len(), args[0].data_type()) {
143 (2, Utf8View) => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, T>(
144 &args[0].as_string_view(),
145 length_array,
146 None,
147 ),
148 (2, Utf8 | LargeUtf8) => lpad_impl::<
149 &GenericStringArray<T>,
150 &GenericStringArray<T>,
151 T,
152 >(&args[0].as_string::<T>(), length_array, None),
153 (3, Utf8View) => lpad_with_replace::<&StringViewArray, T>(
154 &args[0].as_string_view(),
155 length_array,
156 &args[2],
157 ),
158 (3, Utf8 | LargeUtf8) => lpad_with_replace::<&GenericStringArray<T>, T>(
159 &args[0].as_string::<T>(),
160 length_array,
161 &args[2],
162 ),
163 (_, _) => unreachable!("lpad"),
164 }
165}
166
167fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
168 string_array: &V,
169 length_array: &Int64Array,
170 fill_array: &'a ArrayRef,
171) -> Result<ArrayRef>
172where
173 V: StringArrayType<'a>,
174{
175 match fill_array.data_type() {
176 Utf8View => lpad_impl::<V, &StringViewArray, T>(
177 string_array,
178 length_array,
179 Some(fill_array.as_string_view()),
180 ),
181 LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
182 string_array,
183 length_array,
184 Some(fill_array.as_string::<i64>()),
185 ),
186 Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
187 string_array,
188 length_array,
189 Some(fill_array.as_string::<i32>()),
190 ),
191 other => {
192 exec_err!("Unsupported data type {other:?} for function lpad")
193 }
194 }
195}
196
197fn lpad_impl<'a, V, V2, T>(
198 string_array: &V,
199 length_array: &Int64Array,
200 fill_array: Option<V2>,
201) -> Result<ArrayRef>
202where
203 V: StringArrayType<'a>,
204 V2: StringArrayType<'a>,
205 T: OffsetSizeTrait,
206{
207 let array = if let Some(fill_array) = fill_array {
208 let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
209 let mut graphemes_buf = Vec::new();
210 let mut fill_chars_buf = Vec::new();
211
212 for ((string, length), fill) in string_array
213 .iter()
214 .zip(length_array.iter())
215 .zip(fill_array.iter())
216 {
217 if let (Some(string), Some(length), Some(fill)) = (string, length, fill) {
218 if length > i32::MAX as i64 {
219 return exec_err!("lpad requested length {length} too large");
220 }
221
222 let length = if length < 0 { 0 } else { length as usize };
223 if length == 0 {
224 builder.append_value("");
225 continue;
226 }
227
228 graphemes_buf.clear();
230 graphemes_buf.extend(string.graphemes(true));
231
232 fill_chars_buf.clear();
233 fill_chars_buf.extend(fill.chars());
234
235 if length < graphemes_buf.len() {
236 builder.append_value(graphemes_buf[..length].concat());
237 } else if fill_chars_buf.is_empty() {
238 builder.append_value(string);
239 } else {
240 for l in 0..length - graphemes_buf.len() {
241 let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
242 builder.write_char(c)?;
243 }
244 builder.write_str(string)?;
245 builder.append_value("");
246 }
247 } else {
248 builder.append_null();
249 }
250 }
251
252 builder.finish()
253 } else {
254 let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
255 let mut graphemes_buf = Vec::new();
256
257 for (string, length) in string_array.iter().zip(length_array.iter()) {
258 if let (Some(string), Some(length)) = (string, length) {
259 if length > i32::MAX as i64 {
260 return exec_err!("lpad requested length {length} too large");
261 }
262
263 let length = if length < 0 { 0 } else { length as usize };
264 if length == 0 {
265 builder.append_value("");
266 continue;
267 }
268
269 graphemes_buf.clear();
271 graphemes_buf.extend(string.graphemes(true));
272
273 if length < graphemes_buf.len() {
274 builder.append_value(graphemes_buf[..length].concat());
275 } else {
276 builder
277 .write_str(" ".repeat(length - graphemes_buf.len()).as_str())?;
278 builder.write_str(string)?;
279 builder.append_value("");
280 }
281 } else {
282 builder.append_null();
283 }
284 }
285
286 builder.finish()
287 };
288
289 Ok(Arc::new(array) as ArrayRef)
290}
291
#[cfg(test)]
mod tests {
    use crate::unicode::lpad::LPadFunc;
    use crate::utils::test::test_function;

    use arrow::array::{Array, LargeStringArray, StringArray};
    use arrow::datatypes::DataType::{LargeUtf8, Utf8};

    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    /// Runs one `lpad` case against every supported combination of string
    /// types.
    ///
    /// * 3-argument form `(input, length, expected)`: no fill string; the
    ///   input is tried as `Utf8`, `LargeUtf8` and `Utf8View`.
    /// * 4-argument form `(input, length, fill, expected)`: every pairing of
    ///   the three input types with the three fill types.
    ///
    /// `Utf8View` inputs are checked against a `Utf8` result, like `Utf8`
    /// inputs.
    macro_rules! test_lpad {
        ($INPUT:expr, $LENGTH:expr, $EXPECTED:expr) => {
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );

            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );

            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
        };

        ($INPUT:expr, $LENGTH:expr, $REPLACE:expr, $EXPECTED:expr) => {
            // Utf8 input with each fill type.
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );

            // LargeUtf8 input with each fill type.
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );

            // Utf8View input with each fill type.
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
        };
    }

    #[test]
    fn test_functions() -> Result<()> {
        // Default (space) padding; "josé" is four grapheme clusters.
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(5i64)),
            Ok(Some(" josé"))
        );
        // Padding "hi" to five characters needs three leading spaces.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Ok(Some("   hi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(0i64)),
            Ok(Some(""))
        );
        // Any null input yields a null result.
        test_lpad!(Some("hi".into()), ScalarValue::Int64(None), Ok(None));
        test_lpad!(None, ScalarValue::Int64(Some(5i64)), Ok(None));
        // Explicit fill strings repeat (and are cut) to fill the gap.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some("xy".into()),
            Ok(Some("xyxhi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(21i64)),
            Some("abcdef".into()),
            Ok(Some("abcdefabcdefabcdefahi"))
        );
        // An explicit single-space fill matches the default padding.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some(" ".into()),
            Ok(Some("   hi"))
        );
        // An empty fill string leaves the input unchanged.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some("".into()),
            Ok(Some("hi"))
        );
        test_lpad!(
            None,
            ScalarValue::Int64(Some(5i64)),
            Some("xy".into()),
            Ok(None)
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(None),
            Some("xy".into()),
            Ok(None)
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            None,
            Ok(None)
        );
        // Non-ASCII input and fill strings.
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(10i64)),
            Some("xy".into()),
            Ok(Some("xyxyxyjosé"))
        );
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(10i64)),
            Some("éñ".into()),
            Ok(Some("éñéñéñjosé"))
        );

        #[cfg(not(feature = "unicode_expressions"))]
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(5i64)),
            internal_err!(
                "function lpad requires compilation with feature flag: unicode_expressions."
            )
        );

        Ok(())
    }
}