datafusion_functions/unicode/
lpad.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::any::Any;
19use std::fmt::Write;
20use std::sync::Arc;
21
22use DataType::{LargeUtf8, Utf8, Utf8View};
23use arrow::array::{
24    Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
25    OffsetSizeTrait, StringArrayType, StringViewArray,
26};
27use arrow::datatypes::DataType;
28use unicode_segmentation::UnicodeSegmentation;
29
30use crate::utils::{make_scalar_function, utf8_to_str_type};
31use datafusion_common::cast::as_int64_array;
32use datafusion_common::{Result, exec_err};
33use datafusion_expr::TypeSignature::Exact;
34use datafusion_expr::{
35    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
36};
37use datafusion_macros::user_doc;
38
39#[user_doc(
40    doc_section(label = "String Functions"),
41    description = "Pads the left side of a string with another string to a specified string length.",
42    syntax_example = "lpad(str, n[, padding_str])",
43    sql_example = r#"```sql
44> select lpad('Dolly', 10, 'hello');
45+---------------------------------------------+
46| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) |
47+---------------------------------------------+
48| helloDolly                                  |
49+---------------------------------------------+
50```"#,
51    standard_argument(name = "str", prefix = "String"),
52    argument(name = "n", description = "String length to pad to."),
53    argument(
54        name = "padding_str",
55        description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
56    ),
57    related_udf(name = "rpad")
58)]
59#[derive(Debug, PartialEq, Eq, Hash)]
60pub struct LPadFunc {
61    signature: Signature,
62}
63
64impl Default for LPadFunc {
65    fn default() -> Self {
66        Self::new()
67    }
68}
69
70impl LPadFunc {
71    pub fn new() -> Self {
72        use DataType::*;
73        Self {
74            signature: Signature::one_of(
75                vec![
76                    Exact(vec![Utf8View, Int64]),
77                    Exact(vec![Utf8View, Int64, Utf8View]),
78                    Exact(vec![Utf8View, Int64, Utf8]),
79                    Exact(vec![Utf8View, Int64, LargeUtf8]),
80                    Exact(vec![Utf8, Int64]),
81                    Exact(vec![Utf8, Int64, Utf8View]),
82                    Exact(vec![Utf8, Int64, Utf8]),
83                    Exact(vec![Utf8, Int64, LargeUtf8]),
84                    Exact(vec![LargeUtf8, Int64]),
85                    Exact(vec![LargeUtf8, Int64, Utf8View]),
86                    Exact(vec![LargeUtf8, Int64, Utf8]),
87                    Exact(vec![LargeUtf8, Int64, LargeUtf8]),
88                ],
89                Volatility::Immutable,
90            ),
91        }
92    }
93}
94
95impl ScalarUDFImpl for LPadFunc {
96    fn as_any(&self) -> &dyn Any {
97        self
98    }
99
100    fn name(&self) -> &str {
101        "lpad"
102    }
103
104    fn signature(&self) -> &Signature {
105        &self.signature
106    }
107
108    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
109        utf8_to_str_type(&arg_types[0], "lpad")
110    }
111
112    fn invoke_with_args(
113        &self,
114        args: datafusion_expr::ScalarFunctionArgs,
115    ) -> Result<ColumnarValue> {
116        let args = &args.args;
117        match args[0].data_type() {
118            Utf8 | Utf8View => make_scalar_function(lpad::<i32>, vec![])(args),
119            LargeUtf8 => make_scalar_function(lpad::<i64>, vec![])(args),
120            other => exec_err!("Unsupported data type {other:?} for function lpad"),
121        }
122    }
123
124    fn documentation(&self) -> Option<&Documentation> {
125        self.doc()
126    }
127}
128
129/// Extends the string to length 'length' by prepending the characters fill (a space by default).
130/// If the string is already longer than length then it is truncated (on the right).
131/// lpad('hi', 5, 'xy') = 'xyxhi'
132fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
133    if args.len() <= 1 || args.len() > 3 {
134        return exec_err!(
135            "lpad was called with {} arguments. It requires at least 2 and at most 3.",
136            args.len()
137        );
138    }
139
140    let length_array = as_int64_array(&args[1])?;
141
142    match (args.len(), args[0].data_type()) {
143        (2, Utf8View) => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, T>(
144            &args[0].as_string_view(),
145            length_array,
146            None,
147        ),
148        (2, Utf8 | LargeUtf8) => lpad_impl::<
149            &GenericStringArray<T>,
150            &GenericStringArray<T>,
151            T,
152        >(&args[0].as_string::<T>(), length_array, None),
153        (3, Utf8View) => lpad_with_replace::<&StringViewArray, T>(
154            &args[0].as_string_view(),
155            length_array,
156            &args[2],
157        ),
158        (3, Utf8 | LargeUtf8) => lpad_with_replace::<&GenericStringArray<T>, T>(
159            &args[0].as_string::<T>(),
160            length_array,
161            &args[2],
162        ),
163        (_, _) => unreachable!("lpad"),
164    }
165}
166
167fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
168    string_array: &V,
169    length_array: &Int64Array,
170    fill_array: &'a ArrayRef,
171) -> Result<ArrayRef>
172where
173    V: StringArrayType<'a>,
174{
175    match fill_array.data_type() {
176        Utf8View => lpad_impl::<V, &StringViewArray, T>(
177            string_array,
178            length_array,
179            Some(fill_array.as_string_view()),
180        ),
181        LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
182            string_array,
183            length_array,
184            Some(fill_array.as_string::<i64>()),
185        ),
186        Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
187            string_array,
188            length_array,
189            Some(fill_array.as_string::<i32>()),
190        ),
191        other => {
192            exec_err!("Unsupported data type {other:?} for function lpad")
193        }
194    }
195}
196
197fn lpad_impl<'a, V, V2, T>(
198    string_array: &V,
199    length_array: &Int64Array,
200    fill_array: Option<V2>,
201) -> Result<ArrayRef>
202where
203    V: StringArrayType<'a>,
204    V2: StringArrayType<'a>,
205    T: OffsetSizeTrait,
206{
207    let array = if let Some(fill_array) = fill_array {
208        let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
209        let mut graphemes_buf = Vec::new();
210        let mut fill_chars_buf = Vec::new();
211
212        for ((string, length), fill) in string_array
213            .iter()
214            .zip(length_array.iter())
215            .zip(fill_array.iter())
216        {
217            if let (Some(string), Some(length), Some(fill)) = (string, length, fill) {
218                if length > i32::MAX as i64 {
219                    return exec_err!("lpad requested length {length} too large");
220                }
221
222                let length = if length < 0 { 0 } else { length as usize };
223                if length == 0 {
224                    builder.append_value("");
225                    continue;
226                }
227
228                // Reuse buffers by clearing and refilling
229                graphemes_buf.clear();
230                graphemes_buf.extend(string.graphemes(true));
231
232                fill_chars_buf.clear();
233                fill_chars_buf.extend(fill.chars());
234
235                if length < graphemes_buf.len() {
236                    builder.append_value(graphemes_buf[..length].concat());
237                } else if fill_chars_buf.is_empty() {
238                    builder.append_value(string);
239                } else {
240                    for l in 0..length - graphemes_buf.len() {
241                        let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
242                        builder.write_char(c)?;
243                    }
244                    builder.write_str(string)?;
245                    builder.append_value("");
246                }
247            } else {
248                builder.append_null();
249            }
250        }
251
252        builder.finish()
253    } else {
254        let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
255        let mut graphemes_buf = Vec::new();
256
257        for (string, length) in string_array.iter().zip(length_array.iter()) {
258            if let (Some(string), Some(length)) = (string, length) {
259                if length > i32::MAX as i64 {
260                    return exec_err!("lpad requested length {length} too large");
261                }
262
263                let length = if length < 0 { 0 } else { length as usize };
264                if length == 0 {
265                    builder.append_value("");
266                    continue;
267                }
268
269                // Reuse buffer by clearing and refilling
270                graphemes_buf.clear();
271                graphemes_buf.extend(string.graphemes(true));
272
273                if length < graphemes_buf.len() {
274                    builder.append_value(graphemes_buf[..length].concat());
275                } else {
276                    builder
277                        .write_str(" ".repeat(length - graphemes_buf.len()).as_str())?;
278                    builder.write_str(string)?;
279                    builder.append_value("");
280                }
281            } else {
282                builder.append_null();
283            }
284        }
285
286        builder.finish()
287    };
288
289    Ok(Arc::new(array) as ArrayRef)
290}
291
#[cfg(test)]
mod tests {
    use crate::unicode::lpad::LPadFunc;
    use crate::utils::test::test_function;

    use arrow::array::{Array, LargeStringArray, StringArray};
    use arrow::datatypes::DataType::{LargeUtf8, Utf8};

    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    /// Runs one `lpad` case against every supported argument type combination.
    ///
    /// - `test_lpad!(input, length, expected)` checks the two-argument form
    ///   for `Utf8`, `LargeUtf8` and `Utf8View` inputs.
    /// - `test_lpad!(input, length, fill, expected)` checks the three-argument
    ///   form for all nine input/fill type pairs.
    ///
    /// `Utf8` and `Utf8View` inputs are expected to produce `Utf8` output;
    /// `LargeUtf8` inputs are expected to produce `LargeUtf8` output.
    macro_rules! test_lpad {
        ($INPUT:expr, $LENGTH:expr, $EXPECTED:expr) => {
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );

            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );

            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH)
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
        };

        ($INPUT:expr, $LENGTH:expr, $REPLACE:expr, $EXPECTED:expr) => {
            // utf8, utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8, largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8, utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );

            // largeutf8, utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );
            // largeutf8, largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );
            // largeutf8, utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                LargeUtf8,
                LargeStringArray
            );

            // utf8view, utf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8view, largeutf8
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::LargeUtf8($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
            // utf8view, utf8view
            test_function!(
                LPadFunc::new(),
                vec![
                    ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT)),
                    ColumnarValue::Scalar($LENGTH),
                    ColumnarValue::Scalar(ScalarValue::Utf8View($REPLACE))
                ],
                $EXPECTED,
                &str,
                Utf8,
                StringArray
            );
        };
    }

    #[test]
    fn test_functions() -> Result<()> {
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(5i64)),
            Ok(Some(" josé"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Ok(Some("   hi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(0i64)),
            Ok(Some(""))
        );
        test_lpad!(Some("hi".into()), ScalarValue::Int64(None), Ok(None));
        test_lpad!(None, ScalarValue::Int64(Some(5i64)), Ok(None));
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some("xy".into()),
            Ok(Some("xyxhi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(21i64)),
            Some("abcdef".into()),
            Ok(Some("abcdefabcdefabcdefahi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some(" ".into()),
            Ok(Some("   hi"))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            Some("".into()),
            Ok(Some("hi"))
        );
        test_lpad!(
            None,
            ScalarValue::Int64(Some(5i64)),
            Some("xy".into()),
            Ok(None)
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(None),
            Some("xy".into()),
            Ok(None)
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(5i64)),
            None,
            Ok(None)
        );
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(10i64)),
            Some("xy".into()),
            Ok(Some("xyxyxyjosé"))
        );
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(10i64)),
            Some("éñ".into()),
            Ok(Some("éñéñéñjosé"))
        );

        // A requested length shorter than the input truncates on the right,
        // with or without a fill string.
        test_lpad!(
            Some("hello".into()),
            ScalarValue::Int64(Some(2i64)),
            Ok(Some("he"))
        );
        test_lpad!(
            Some("hello".into()),
            ScalarValue::Int64(Some(2i64)),
            Some("xy".into()),
            Ok(Some("he"))
        );
        // Truncation counts characters rather than bytes: both kept
        // characters are multi-byte in UTF-8.
        test_lpad!(
            Some("éñjos".into()),
            ScalarValue::Int64(Some(2i64)),
            Ok(Some("éñ"))
        );
        test_lpad!(
            Some("éñjos".into()),
            ScalarValue::Int64(Some(2i64)),
            Some("xy".into()),
            Ok(Some("éñ"))
        );
        // A requested length equal to the input length returns the input unchanged.
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(4i64)),
            Some("xy".into()),
            Ok(Some("josé"))
        );
        // Negative lengths behave like zero and yield an empty string.
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(-1i64)),
            Ok(Some(""))
        );
        test_lpad!(
            Some("hi".into()),
            ScalarValue::Int64(Some(-1i64)),
            Some("xy".into()),
            Ok(Some(""))
        );

        // NOTE(review): `internal_err!` is not imported in this module, so this
        // case cannot compile if the cfg below is ever active - confirm whether
        // it is still needed.
        #[cfg(not(feature = "unicode_expressions"))]
        test_lpad!(
            Some("josé".into()),
            ScalarValue::Int64(Some(5i64)),
            internal_err!(
                "function lpad requires compilation with feature flag: unicode_expressions."
            )
        );

        Ok(())
    }
}