Skip to main content

datafusion_functions/unicode/
substr.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::sync::Arc;
19
20use crate::strings::{StringViewArrayBuilder, append_view};
21use crate::utils::make_scalar_function;
22use arrow::array::{
23    Array, ArrayRef, AsArray, GenericStringArray, Int64Array, OffsetSizeTrait,
24    StringArrayType, StringViewArray, make_view,
25};
26use arrow::buffer::{NullBuffer, ScalarBuffer};
27use arrow::datatypes::DataType;
28use datafusion_common::cast::as_int64_array;
29use datafusion_common::types::{
30    NativeType, logical_int32, logical_int64, logical_string,
31};
32use datafusion_common::{Result, exec_err};
33use datafusion_expr::{
34    Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
35    TypeSignature, TypeSignatureClass, Volatility,
36};
37use datafusion_macros::user_doc;
38
// User-facing documentation metadata for `substr`: section, description,
// syntax, a SQL example and per-argument notes.
#[user_doc(
    doc_section(label = "String Functions"),
    description = "Extracts a substring of a specified number of characters from a specific starting position in a string.",
    syntax_example = "substr(str, start_pos[, length])",
    alternative_syntax = "substring(str from start_pos for length)",
    sql_example = r#"```sql
> select substr('datafusion', 5, 3);
+----------------------------------------------+
| substr(Utf8("datafusion"),Int64(5),Int64(3)) |
+----------------------------------------------+
| fus                                          |
+----------------------------------------------+
```"#,
    standard_argument(name = "str", prefix = "String"),
    argument(
        name = "start_pos",
        description = "Character position to start the substring at. The first character in the string has a position of 1. If the start position is less than 1, it is treated as if it is before the start of the string and the (absolute) number of characters before position 1 is subtracted from `length` (if given). For example, `substr('abc', -3, 6)` returns `'ab'`."
    ),
    argument(
        name = "length",
        description = "Number of characters to extract. If not specified, returns the rest of the string after the start position."
    )
)]
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SubstrFunc {
    // Accepted argument shapes: `(str, start_pos)` and `(str, start_pos, length)`.
    signature: Signature,
    // Alternative names for the function; `new()` registers `substring`.
    aliases: Vec<String>,
}
67
68impl Default for SubstrFunc {
69    fn default() -> Self {
70        Self::new()
71    }
72}
73
74impl SubstrFunc {
75    pub fn new() -> Self {
76        let string = Coercion::new_exact(TypeSignatureClass::Native(logical_string()));
77        let int64 = Coercion::new_implicit(
78            TypeSignatureClass::Native(logical_int64()),
79            vec![TypeSignatureClass::Native(logical_int32())],
80            NativeType::Int64,
81        );
82        Self {
83            signature: Signature::one_of(
84                vec![
85                    TypeSignature::Coercible(vec![string.clone(), int64.clone()]),
86                    TypeSignature::Coercible(vec![
87                        string.clone(),
88                        int64.clone(),
89                        int64.clone(),
90                    ]),
91                ],
92                Volatility::Immutable,
93            )
94            .with_parameter_names(vec![
95                "str".to_string(),
96                "start_pos".to_string(),
97                "length".to_string(),
98            ])
99            .expect("valid parameter names"),
100            aliases: vec![String::from("substring")],
101        }
102    }
103}
104
105impl ScalarUDFImpl for SubstrFunc {
106    fn name(&self) -> &str {
107        "substr"
108    }
109
110    fn signature(&self) -> &Signature {
111        &self.signature
112    }
113
114    // `SubstrFunc` always generates `Utf8View` output for its efficiency.
115    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
116        Ok(DataType::Utf8View)
117    }
118
119    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
120        make_scalar_function(substr, vec![])(&args.args)
121    }
122
123    fn aliases(&self) -> &[String] {
124        &self.aliases
125    }
126
127    fn documentation(&self) -> Option<&Documentation> {
128        self.doc()
129    }
130}
131
132/// Dispatches `substr` to the appropriate string array implementation.
133fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
134    match args[0].data_type() {
135        DataType::Utf8 => {
136            let string_array = args[0].as_string::<i32>();
137            generic_string_substr(string_array, &args[1..])
138        }
139        DataType::LargeUtf8 => {
140            let string_array = args[0].as_string::<i64>();
141            generic_string_substr(string_array, &args[1..])
142        }
143        DataType::Utf8View => {
144            let string_array = args[0].as_string_view();
145            string_view_substr(string_array, &args[1..])
146        }
147        other => exec_err!(
148            "Unsupported data type {other:?} for function substr,\
149            expected Utf8View, Utf8 or LargeUtf8."
150        ),
151    }
152}
153
154/// Convert the given `start` and `count` to valid byte indices within `input` string.
155///
156/// Input `start` and `count` are equivalent to PostgreSQL's `substr(s, start, count)`.
157/// `start` is 1-based; if `count` is not provided, returns indices to the end of the string.
158/// Input indices are character-based, and return values are byte indices.
159/// The input bounds can be outside string bounds; this function will return
160/// the intersection between input bounds and valid string bounds.
161/// `is_input_ascii_only` is used to optimize this function if `input` is ASCII-only.
162///
163/// # Example
164/// ```text
165/// 'Hi🌏' in-mem (`[]` for one char, `x` for one byte): [x][x][xxxx]
166/// get_true_start_end('Hi🌏', 1, None) -> Ok((0, 6))
167/// get_true_start_end('Hi🌏', 1, Some(1)) -> Ok((0, 1))
168/// get_true_start_end('Hi🌏', -10, Some(2)) -> Ok((0, 0))
169/// ```
170pub fn get_true_start_end(
171    input: &str,
172    start: i64,
173    count: Option<i64>,
174    is_input_ascii_only: bool,
175) -> Result<(usize, usize)> {
176    if let Some(count) = count
177        && count < 0
178    {
179        return exec_err!("negative count not allowed: {count}");
180    }
181
182    // The caller-provided `start` is 1-indexed.
183    let Some(start) = start.checked_sub(1) else {
184        return exec_err!("start position overflow: {start}");
185    };
186
187    let end = match count {
188        Some(count) => start.saturating_add(count),
189        None => input.len() as i64,
190    };
191
192    let start = start.clamp(0, input.len() as i64) as usize;
193    let end = end.clamp(0, input.len() as i64) as usize;
194
195    // If input is ASCII-only, byte-based indices equal char-based indices
196    if is_input_ascii_only {
197        return Ok((start, end));
198    }
199
200    // Otherwise, calculate byte indices from char indices.  We initialize both
201    // `byte_start` and `byte_end` to the string length to handle cases where
202    // the requested 'start' or 'end' positions are at or beyond the end of the
203    // string (resulting in an empty substring).
204    let mut byte_start = input.len();
205    let mut byte_end = input.len();
206
207    for (char_idx, (byte_idx, _)) in input.char_indices().enumerate() {
208        if char_idx == start {
209            byte_start = byte_idx;
210            // If no length is specified, we only need the start offset.
211            if count.is_none() {
212                break;
213            }
214        }
215        if char_idx == end {
216            byte_end = byte_idx;
217            break;
218        }
219    }
220
221    Ok((byte_start, byte_end))
222}
223
224// String characters are variable length encoded in UTF-8, `substr()` function's
225// arguments are character-based, converting them into byte-based indices
226// requires expensive decoding.
227// However, checking if a string is ASCII-only is relatively cheap.
228// If strings are ASCII only, use byte-based indices instead.
229//
230// A common pattern to call `substr()` is taking a small prefix of a long
231// string, such as `substr(long_str_with_1k_chars, 1, 32)`.
232// In such case the overhead of ASCII-validation may not be worth it, so
233// skip the validation for short prefix for now.
234pub fn enable_ascii_fast_path<'a, V: StringArrayType<'a>>(
235    string_array: &V,
236    start: &Int64Array,
237    count: Option<&Int64Array>,
238) -> bool {
239    let is_short_prefix = match count {
240        Some(count) => {
241            let short_prefix_threshold = 32.0;
242            let n_sample = 10;
243
244            // HACK: can be simplified if function has specialized
245            // implementation for `ScalarValue` (implement without `make_scalar_function()`)
246            let total_prefix_len = start
247                .iter()
248                .zip(count.iter())
249                .take(n_sample)
250                .map(|(start, count)| {
251                    let start = start.unwrap_or(0);
252                    let count = count.unwrap_or(0);
253                    // To get substring, need to decode from 0 to start+count instead of start to start+count
254                    start.saturating_add(count)
255                })
256                .fold(0i64, |acc, val| acc.saturating_add(val));
257
258            (total_prefix_len as f64 / n_sample as f64) <= short_prefix_threshold
259        }
260        None => false,
261    };
262
263    if is_short_prefix {
264        // Skip ASCII validation for short prefix
265        false
266    } else {
267        string_array.is_ascii()
268    }
269}
270
271fn string_view_substr(
272    string_view_array: &StringViewArray,
273    args: &[ArrayRef],
274) -> Result<ArrayRef> {
275    let start_array = as_int64_array(&args[0])?;
276    let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?;
277
278    let is_ascii =
279        enable_ascii_fast_path(&string_view_array, start_array, count_array_opt);
280
281    // Combine null bitmaps from all inputs in bulk.
282    let nulls = NullBuffer::union_many([
283        string_view_array.nulls(),
284        start_array.nulls(),
285        count_array_opt.and_then(|a| a.nulls()),
286    ]);
287
288    let mut views_buf = Vec::with_capacity(string_view_array.len());
289
290    for (i, raw_view) in string_view_array.views().iter().enumerate() {
291        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
292            views_buf.push(0);
293            continue;
294        }
295
296        let string = string_view_array.value(i);
297        let start = start_array.value(i);
298        let count = count_array_opt.map(|a| a.value(i));
299
300        let (byte_start, byte_end) = get_true_start_end(string, start, count, is_ascii)?;
301        let substr = &string[byte_start..byte_end];
302
303        append_view(&mut views_buf, raw_view, substr, byte_start as u32);
304    }
305
306    let views_buf = ScalarBuffer::from(views_buf);
307
308    // Safety:
309    // (1) The blocks of the given views are all provided
310    // (2) Each of the range `view.offset+start..end` of view in views_buf is within
311    // the bounds of each of the blocks
312    unsafe {
313        let array = StringViewArray::new_unchecked(
314            views_buf,
315            string_view_array.data_buffers().to_vec(),
316            nulls,
317        );
318        Ok(Arc::new(array) as ArrayRef)
319    }
320}
321
322fn values_fit_in_i32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>) -> bool {
323    // The Arrow spec defines StringView offset fields as signed 32-bit
324    // integers, so the maximum representable offset is i32::MAX.
325    string_array
326        .offsets()
327        .last()
328        .map(|offset| offset.as_usize() <= i32::MAX as usize)
329        .unwrap_or(true)
330}
331
332#[inline]
333fn append_view_from_buffer(
334    views_buf: &mut Vec<u128>,
335    substr: &str,
336    byte_offset: usize,
337) -> bool {
338    let byte_offset =
339        u32::try_from(byte_offset).expect("validated string buffer offset fits in i32");
340    let view = make_view(substr.as_bytes(), 0, byte_offset);
341    views_buf.push(view);
342    substr.len() > 12
343}
344
345#[expect(clippy::needless_range_loop)]
346fn generic_string_substr<T: OffsetSizeTrait>(
347    string_array: &GenericStringArray<T>,
348    args: &[ArrayRef],
349) -> Result<ArrayRef> {
350    // We'd like to return a StringViewArray that points into the input string
351    // array's values buffer. Since the Arrow spec defines StringView offsets
352    // as i32, we can't use this approach when the values buffer is >2GB, so
353    // fallback to copying.
354    if !values_fit_in_i32(string_array) {
355        return generic_string_substr_copy(string_array, args);
356    }
357
358    let start_array = as_int64_array(&args[0])?;
359    let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?;
360
361    let is_ascii = enable_ascii_fast_path(&string_array, start_array, count_array_opt);
362    let offsets = string_array.value_offsets();
363    let mut views_buf = Vec::with_capacity(string_array.len());
364    let mut has_out_of_line = false;
365
366    // Combine null bitmaps from all inputs in bulk.
367    let nulls = NullBuffer::union_many([
368        string_array.nulls(),
369        start_array.nulls(),
370        count_array_opt.and_then(|a| a.nulls()),
371    ]);
372
373    for i in 0..string_array.len() {
374        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
375            views_buf.push(0);
376            continue;
377        }
378
379        let string = string_array.value(i);
380        let source_offset = offsets[i].as_usize();
381        let start = start_array.value(i);
382        let count = count_array_opt.map(|a| a.value(i));
383
384        let (byte_start, byte_end) = get_true_start_end(string, start, count, is_ascii)?;
385        has_out_of_line |= append_view_from_buffer(
386            &mut views_buf,
387            &string[byte_start..byte_end],
388            source_offset + byte_start,
389        );
390    }
391
392    let views_buf = ScalarBuffer::from(views_buf);
393
394    // If all result strings are stored inline, we don't need to retain the
395    // input string array.
396    let data_buffers = if has_out_of_line {
397        vec![string_array.values().clone()]
398    } else {
399        vec![]
400    };
401
402    // Safety:
403    // (1) The blocks of the given views are all provided
404    // (2) Each referenced range in the source values buffer is within bounds
405    unsafe {
406        let array = StringViewArray::new_unchecked(views_buf, data_buffers, nulls);
407        Ok(Arc::new(array) as ArrayRef)
408    }
409}
410
411// Fallback for `generic_string_substr` if we can't use zerocopy because the
412// input string array is too large.
413fn generic_string_substr_copy<T: OffsetSizeTrait>(
414    string_array: &GenericStringArray<T>,
415    args: &[ArrayRef],
416) -> Result<ArrayRef> {
417    let start_array = as_int64_array(&args[0])?;
418    let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?;
419
420    let is_ascii = enable_ascii_fast_path(&string_array, start_array, count_array_opt);
421
422    // Combine null bitmaps from all inputs in bulk.
423    let nulls = NullBuffer::union_many([
424        string_array.nulls(),
425        start_array.nulls(),
426        count_array_opt.and_then(|a| a.nulls()),
427    ]);
428
429    let len = string_array.len();
430    let mut result_builder = StringViewArrayBuilder::with_capacity(len);
431
432    for i in 0..len {
433        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
434            result_builder.append_placeholder();
435            continue;
436        }
437
438        let string = string_array.value(i);
439        let start = start_array.value(i);
440        let count = count_array_opt.map(|a| a.value(i));
441
442        let (byte_start, byte_end) = get_true_start_end(string, start, count, is_ascii)?;
443        result_builder.append_value(&string[byte_start..byte_end]);
444    }
445
446    Ok(Arc::new(result_builder.finish(nulls)?) as ArrayRef)
447}
448
// Unit tests for `substr`: scalar invocations through `SubstrFunc` and a direct
// call to the array kernel with a sliced input.
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::{
        Array, ArrayRef, AsArray, Int64Array, StringArray, StringViewArray,
    };
    use arrow::datatypes::DataType::Utf8View;

    use datafusion_common::{Result, ScalarValue, exec_err};
    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

    use crate::unicode::substr::SubstrFunc;
    use crate::utils::test::test_function;

    // Scalar-argument cases. Each `test_function!` call supplies the function,
    // its arguments, the expected result, and the expected value type, data
    // type and array type.
    #[test]
    fn test_functions() -> Result<()> {
        // --- `Utf8View` input ---

        // Null string -> null result.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(None)),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            Ok(None),
            &str,
            Utf8View,
            StringViewArray
        );
        // Start position 0 (before the first character) returns the whole string.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
                    "alphabet"
                )))),
                ColumnarValue::Scalar(ScalarValue::from(0i64)),
            ],
            Ok(Some("alphabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Non-ASCII input longer than 12 bytes, with an explicit length.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
                    "this és longer than 12B"
                )))),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
            ],
            Ok(Some(" é")),
            &str,
            Utf8View,
            StringViewArray
        );
        // ASCII input longer than 12 bytes, no length.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
                    "this is longer than 12B"
                )))),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
            ],
            Ok(Some(" is longer than 12B")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Positions are counted in characters, not bytes.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
                    "joséésoj"
                )))),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
            ],
            Ok(Some("ésoj")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
                    "alphabet"
                )))),
                ColumnarValue::Scalar(ScalarValue::from(3i64)),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
            ],
            Ok(Some("ph")),
            &str,
            Utf8View,
            StringViewArray
        );
        // A length running past the end of the string is truncated.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(
                    "alphabet"
                )))),
                ColumnarValue::Scalar(ScalarValue::from(3i64)),
                ColumnarValue::Scalar(ScalarValue::from(20i64)),
            ],
            Ok(Some("phabet")),
            &str,
            Utf8View,
            StringViewArray
        );

        // --- `Utf8` input (`ScalarValue::from(&str)`), start position only ---

        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(0i64)),
            ],
            Ok(Some("alphabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
            ],
            Ok(Some("ésoj")),
            &str,
            Utf8View,
            StringViewArray
        );
        // A negative start without a length returns the whole string.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
            ],
            Ok(Some("joséésoj")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            Ok(Some("alphabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
            ],
            Ok(Some("lphabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(3i64)),
            ],
            Ok(Some("phabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(-3i64)),
            ],
            Ok(Some("alphabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        // A start past the end of the string yields an empty string.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(30i64)),
            ],
            Ok(Some("")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Null start -> null result.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::Int64(None)),
            ],
            Ok(None),
            &str,
            Utf8View,
            StringViewArray
        );

        // --- `Utf8` input with start position and length ---

        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(3i64)),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
            ],
            Ok(Some("ph")),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(3i64)),
                ColumnarValue::Scalar(ScalarValue::from(20i64)),
            ],
            Ok(Some("phabet")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Start 0 with length 5: the window covers positions 0..=4, and position
        // 0 lies before the string, so only 4 characters are returned.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(0i64)),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
            ],
            Ok(Some("alph")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Exclusive end position is 5 (-5 + 10), so positions 1..=4 are returned.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
                ColumnarValue::Scalar(ScalarValue::from(10i64)),
            ],
            Ok(Some("alph")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Exclusive end position is -1 (-5 + 4): the window ends before the
        // string starts, so the result is empty.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
                ColumnarValue::Scalar(ScalarValue::from(4i64)),
            ],
            Ok(Some("")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Exclusive end position is 0 (-5 + 5): still before the first
        // character, so the result is empty.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(-5i64)),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
            ],
            Ok(Some("")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Null start or null length -> null result.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::Int64(None)),
                ColumnarValue::Scalar(ScalarValue::from(20i64)),
            ],
            Ok(None),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(3i64)),
                ColumnarValue::Scalar(ScalarValue::Int64(None)),
            ],
            Ok(None),
            &str,
            Utf8View,
            StringViewArray
        );
        // A negative length is rejected.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
                ColumnarValue::Scalar(ScalarValue::from(-1i64)),
            ],
            exec_err!("negative count not allowed: -1"),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
                ColumnarValue::Scalar(ScalarValue::from(5i64)),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
            ],
            Ok(Some("és")),
            &str,
            Utf8View,
            StringViewArray
        );
        // NOTE(review): only compiled without the `unicode_expressions` feature.
        // It uses `internal_err!`, which this module does not import, and passes
        // `&[..]` where every other case passes `vec![..]` — confirm it still
        // builds in that configuration.
        #[cfg(not(feature = "unicode_expressions"))]
        test_function!(
            SubstrFunc::new(),
            &[
                ColumnarValue::Scalar(ScalarValue::from("alphabet")),
                ColumnarValue::Scalar(ScalarValue::from(0i64)),
            ],
            internal_err!(
                "function substr requires compilation with feature flag: unicode_expressions."
            ),
            &str,
            Utf8View,
            StringViewArray
        );

        // --- Extreme `i64` arguments ---

        // `i64::MIN - 1` overflows when converting to a 0-based start.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("abc")),
                ColumnarValue::Scalar(ScalarValue::from(i64::MIN)),
            ],
            exec_err!("start position overflow: -9223372036854775808"),
            &str,
            Utf8View,
            StringViewArray
        );
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("overflow")),
                ColumnarValue::Scalar(ScalarValue::from(i64::MIN)),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            exec_err!("start position overflow: -9223372036854775808"),
            &str,
            Utf8View,
            StringViewArray
        );
        // `start + count` saturates instead of overflowing.
        test_function!(
            SubstrFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("large count")),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
                ColumnarValue::Scalar(ScalarValue::from(i64::MAX)),
            ],
            Ok(Some("arge count")),
            &str,
            Utf8View,
            StringViewArray
        );

        Ok(())
    }

    // Calls the array kernel directly with a sliced `StringArray`, so the
    // slice's offsets do not start at the beginning of the values buffer.
    #[test]
    fn test_sliced_string_array_array_args() -> Result<()> {
        // Use strings longer than 12 bytes so the result views are out-of-line.
        let string_array = Arc::new(StringArray::from(vec![
            "skipped_prefix_value",
            "alphabet_long_string",
            "joséésojanother_long",
        ])) as ArrayRef;
        // Drop the first element; the remaining two rows keep the shared buffer.
        let string_array = string_array.slice(1, 2);
        let start_array = Arc::new(Int64Array::from(vec![3, 5])) as ArrayRef;
        let count_array = Arc::new(Int64Array::from(vec![15, 14])) as ArrayRef;

        let result = super::substr(&[string_array, start_array, count_array])?;
        let result = result.as_string_view();

        assert_eq!(result.value(0), "phabet_long_str");
        assert_eq!(result.value(1), "ésojanother_lo");

        Ok(())
    }
}