Skip to main content

datafusion_functions/unicode/
substrindex.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::sync::Arc;
19
20use arrow::array::{
21    Array, ArrayRef, AsArray, ByteView, GenericStringArray, OffsetSizeTrait,
22    PrimitiveArray, StringArrayType, StringViewArray, make_view, new_null_array,
23};
24use arrow::buffer::ScalarBuffer;
25use arrow::datatypes::{DataType, Int64Type};
26use arrow_buffer::NullBuffer;
27
28use crate::strings::GenericStringArrayBuilder;
29use crate::utils::make_scalar_function;
30use datafusion_common::{
31    Result, ScalarValue, exec_datafusion_err, exec_err, utils::take_function_args,
32};
33use datafusion_expr::TypeSignature::Exact;
34use datafusion_expr::{
35    ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
36    Volatility,
37};
38use datafusion_macros::user_doc;
39use memchr::{memchr_iter, memmem, memrchr_iter};
40
// Metadata passed to the `user_doc` macro: documentation section, description,
// syntax, a SQL example and the three arguments (`str`, `delim`, `count`).
#[user_doc(
    doc_section(label = "String Functions"),
    description = r#"Returns the substring from str before count occurrences of the delimiter delim.
If count is positive, everything to the left of the final delimiter (counting from the left) is returned.
If count is negative, everything to the right of the final delimiter (counting from the right) is returned."#,
    syntax_example = "substr_index(str, delim, count)",
    sql_example = r#"```sql
> select substr_index('www.apache.org', '.', 1);
+---------------------------------------------------------+
| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) |
+---------------------------------------------------------+
| www                                                     |
+---------------------------------------------------------+
> select substr_index('www.apache.org', '.', -1);
+----------------------------------------------------------+
| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) |
+----------------------------------------------------------+
| org                                                      |
+----------------------------------------------------------+
```"#,
    standard_argument(name = "str", prefix = "String"),
    argument(
        name = "delim",
        description = "The string to find in str to split str."
    ),
    argument(
        name = "count",
        description = "The number of times to search for the delimiter. Can be either a positive or negative number."
    )
)]
// Scalar UDF state for `substr_index`; the function is also reachable through
// the alias stored in `aliases`.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SubstrIndexFunc {
    // Accepted argument type combinations together with the volatility.
    signature: Signature,
    // Alternative names under which the function is exposed.
    aliases: Vec<String>,
}
76
77impl Default for SubstrIndexFunc {
78    fn default() -> Self {
79        Self::new()
80    }
81}
82
83impl SubstrIndexFunc {
84    pub fn new() -> Self {
85        use DataType::*;
86        Self {
87            signature: Signature::one_of(
88                vec![
89                    Exact(vec![Utf8View, Utf8View, Int64]),
90                    Exact(vec![Utf8, Utf8, Int64]),
91                    Exact(vec![LargeUtf8, LargeUtf8, Int64]),
92                ],
93                Volatility::Immutable,
94            ),
95            aliases: vec![String::from("substring_index")],
96        }
97    }
98}
99
100impl ScalarUDFImpl for SubstrIndexFunc {
101    fn name(&self) -> &str {
102        "substr_index"
103    }
104
105    fn signature(&self) -> &Signature {
106        &self.signature
107    }
108
109    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
110        Ok(arg_types[0].clone())
111    }
112
113    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
114        let ScalarFunctionArgs { args, .. } = args;
115
116        if let (
117            ColumnarValue::Array(string_array),
118            ColumnarValue::Scalar(delim_scalar),
119            ColumnarValue::Scalar(count_scalar),
120        ) = (&args[0], &args[1], &args[2])
121        {
122            return substr_index_scalar(string_array, delim_scalar, count_scalar);
123        }
124
125        make_scalar_function(substr_index, vec![])(&args)
126    }
127
128    fn aliases(&self) -> &[String] {
129        &self.aliases
130    }
131
132    fn documentation(&self) -> Option<&Documentation> {
133        self.doc()
134    }
135}
136
137/// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned.
138/// SUBSTRING_INDEX('www.apache.org', '.', 1) = www
139/// SUBSTRING_INDEX('www.apache.org', '.', 2) = www.apache
140/// SUBSTRING_INDEX('www.apache.org', '.', -2) = apache.org
141/// SUBSTRING_INDEX('www.apache.org', '.', -1) = org
142fn substr_index(args: &[ArrayRef]) -> Result<ArrayRef> {
143    let [str, delim, count] = take_function_args("substr_index", args)?;
144
145    match str.data_type() {
146        DataType::Utf8 => {
147            let string_array = str.as_string::<i32>();
148            let delimiter_array = delim.as_string::<i32>();
149            let count_array: &PrimitiveArray<Int64Type> = count.as_primitive();
150            substr_index_general(
151                string_array,
152                delimiter_array,
153                count_array,
154                GenericStringArrayBuilder::<i32>::with_capacity(
155                    string_array.len(),
156                    visible_string_bytes(string_array),
157                ),
158            )
159        }
160        DataType::LargeUtf8 => {
161            let string_array = str.as_string::<i64>();
162            let delimiter_array = delim.as_string::<i64>();
163            let count_array: &PrimitiveArray<Int64Type> = count.as_primitive();
164            substr_index_general(
165                string_array,
166                delimiter_array,
167                count_array,
168                GenericStringArrayBuilder::<i64>::with_capacity(
169                    string_array.len(),
170                    visible_string_bytes(string_array),
171                ),
172            )
173        }
174        DataType::Utf8View => {
175            let string_array = str.as_string_view();
176            let delimiter_array = delim.as_string_view();
177            let count_array: &PrimitiveArray<Int64Type> = count.as_primitive();
178            substr_index_view(string_array, delimiter_array, count_array)
179        }
180        other => {
181            exec_err!("Unsupported data type {other:?} for function substr_index")
182        }
183    }
184}
185
186fn substr_index_scalar(
187    string_array: &ArrayRef,
188    delim_scalar: &ScalarValue,
189    count_scalar: &ScalarValue,
190) -> Result<ColumnarValue> {
191    if string_array.is_empty() {
192        return Ok(ColumnarValue::Array(new_null_array(
193            string_array.data_type(),
194            0,
195        )));
196    }
197
198    let delimiter = delim_scalar.try_as_str().ok_or_else(|| {
199        exec_datafusion_err!(
200            "Unsupported delimiter type {:?} for substr_index",
201            delim_scalar.data_type()
202        )
203    })?;
204
205    let count = match count_scalar {
206        ScalarValue::Int64(v) => *v,
207        other => {
208            return exec_err!(
209                "Unsupported count type {:?} for substr_index",
210                other.data_type()
211            );
212        }
213    };
214
215    let (Some(delimiter), Some(count)) = (delimiter, count) else {
216        return Ok(ColumnarValue::Array(new_null_array(
217            string_array.data_type(),
218            string_array.len(),
219        )));
220    };
221
222    let result = match string_array.data_type() {
223        DataType::Utf8View => {
224            substr_index_scalar_view(string_array.as_string_view(), delimiter, count)
225        }
226        DataType::Utf8 => {
227            let arr = string_array.as_string::<i32>();
228            substr_index_scalar_impl(
229                arr,
230                delimiter,
231                count,
232                GenericStringArrayBuilder::<i32>::with_capacity(
233                    arr.len(),
234                    visible_string_bytes(arr),
235                ),
236            )
237        }
238        DataType::LargeUtf8 => {
239            let arr = string_array.as_string::<i64>();
240            substr_index_scalar_impl(
241                arr,
242                delimiter,
243                count,
244                GenericStringArrayBuilder::<i64>::with_capacity(
245                    arr.len(),
246                    visible_string_bytes(arr),
247                ),
248            )
249        }
250        other => exec_err!("Unsupported string type {other:?} for substr_index"),
251    }?;
252
253    Ok(ColumnarValue::Array(result))
254}
255
256#[inline]
257fn visible_string_bytes<T: OffsetSizeTrait>(
258    string_array: &GenericStringArray<T>,
259) -> usize {
260    let offsets = string_array.value_offsets();
261    offsets[offsets.len() - 1].as_usize() - offsets[0].as_usize()
262}
263
264fn substr_index_general<'a, S, O>(
265    string_array: S,
266    delimiter_array: S,
267    count_array: &PrimitiveArray<Int64Type>,
268    mut builder: GenericStringArrayBuilder<O>,
269) -> Result<ArrayRef>
270where
271    S: StringArrayType<'a> + Copy,
272    O: OffsetSizeTrait,
273{
274    let num_rows = string_array.len();
275    // Output is null if and only if any input is null.
276    let nulls = NullBuffer::union_many([
277        string_array.nulls(),
278        delimiter_array.nulls(),
279        count_array.nulls(),
280    ]);
281
282    for i in 0..num_rows {
283        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
284            builder.append_placeholder();
285            continue;
286        }
287        // SAFETY: `i < num_rows` and the union of input nulls is valid at i,
288        // so each input is also valid at i.
289        let string = unsafe { string_array.value_unchecked(i) };
290        let delimiter = unsafe { delimiter_array.value_unchecked(i) };
291        let n = unsafe { count_array.value_unchecked(i) };
292        builder.append_value(substr_index_slice(string, delimiter, n));
293    }
294
295    Ok(Arc::new(builder.finish(nulls)?) as ArrayRef)
296}
297
298fn substr_index_view(
299    string_array: &StringViewArray,
300    delimiter_array: &StringViewArray,
301    count_array: &PrimitiveArray<Int64Type>,
302) -> Result<ArrayRef> {
303    let nulls = NullBuffer::union_many([
304        string_array.nulls(),
305        delimiter_array.nulls(),
306        count_array.nulls(),
307    ]);
308    let views = string_array.views();
309    let mut views_buf = Vec::with_capacity(string_array.len());
310    let mut has_out_of_line = false;
311
312    for (i, raw_view) in views.iter().enumerate() {
313        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
314            views_buf.push(0);
315            continue;
316        }
317
318        let string = string_array.value(i);
319        let delimiter = delimiter_array.value(i);
320        let count = count_array.value(i);
321        let substr = substr_index_slice(string, delimiter, count);
322        has_out_of_line |= append_substr_view(&mut views_buf, raw_view, string, substr);
323    }
324
325    let data_buffers = if has_out_of_line {
326        string_array.data_buffers().to_vec()
327    } else {
328        vec![]
329    };
330
331    // Safety: each appended view is either:
332    // (1) a copied null sentinel,
333    // (2) the original valid input view, or
334    // (3) built by `append_view` for a contiguous substring of the input row.
335    unsafe {
336        Ok(Arc::new(StringViewArray::new_unchecked(
337            ScalarBuffer::from(views_buf),
338            data_buffers,
339            nulls,
340        )) as ArrayRef)
341    }
342}
343
344fn substr_index_scalar_impl<'a, S, O>(
345    string_array: S,
346    delimiter: &str,
347    count: i64,
348    builder: GenericStringArrayBuilder<O>,
349) -> Result<ArrayRef>
350where
351    S: StringArrayType<'a> + Copy,
352    O: OffsetSizeTrait,
353{
354    if count == 0 || delimiter.is_empty() {
355        return map_strings(string_array, builder, |string| &string[..0]);
356    }
357
358    if delimiter.len() == 1 {
359        let delimiter_byte = delimiter.as_bytes()[0];
360        return map_strings(string_array, builder, |string| {
361            substr_index_single_byte(string, delimiter_byte, count)
362        });
363    }
364
365    let occurrence_idx = usize::try_from(count.unsigned_abs()).unwrap_or(usize::MAX) - 1;
366    if count > 0 {
367        let finder = memmem::Finder::new(delimiter.as_bytes());
368        map_strings(string_array, builder, |string| {
369            substr_index_slice_finder(string, &finder, delimiter.len(), occurrence_idx)
370        })
371    } else {
372        let finder_rev = memmem::FinderRev::new(delimiter.as_bytes());
373        map_strings(string_array, builder, |string| {
374            substr_index_rslice_finder(
375                string,
376                &finder_rev,
377                delimiter.len(),
378                occurrence_idx,
379            )
380        })
381    }
382}
383
384fn substr_index_scalar_view(
385    string_array: &StringViewArray,
386    delimiter: &str,
387    count: i64,
388) -> Result<ArrayRef> {
389    let views = string_array.views();
390    let mut views_buf = Vec::with_capacity(string_array.len());
391    let mut has_out_of_line = false;
392
393    if count == 0 || delimiter.is_empty() {
394        let empty_view = make_view(b"", 0, 0);
395        for i in 0..string_array.len() {
396            if string_array.is_null(i) {
397                views_buf.push(0);
398            } else {
399                views_buf.push(empty_view);
400            }
401        }
402    } else if delimiter.len() == 1 {
403        let delimiter_byte = delimiter.as_bytes()[0];
404        for (i, raw_view) in views.iter().enumerate() {
405            if string_array.is_null(i) {
406                views_buf.push(0);
407                continue;
408            }
409
410            let string = string_array.value(i);
411            let substr = substr_index_single_byte(string, delimiter_byte, count);
412            has_out_of_line |=
413                append_substr_view(&mut views_buf, raw_view, string, substr);
414        }
415    } else {
416        let occurrence_idx =
417            usize::try_from(count.unsigned_abs()).unwrap_or(usize::MAX) - 1;
418        if count > 0 {
419            let finder = memmem::Finder::new(delimiter.as_bytes());
420            for (i, raw_view) in views.iter().enumerate() {
421                if string_array.is_null(i) {
422                    views_buf.push(0);
423                    continue;
424                }
425
426                let string = string_array.value(i);
427                let substr = substr_index_slice_finder(
428                    string,
429                    &finder,
430                    delimiter.len(),
431                    occurrence_idx,
432                );
433                has_out_of_line |=
434                    append_substr_view(&mut views_buf, raw_view, string, substr);
435            }
436        } else {
437            let finder_rev = memmem::FinderRev::new(delimiter.as_bytes());
438            for (i, raw_view) in views.iter().enumerate() {
439                if string_array.is_null(i) {
440                    views_buf.push(0);
441                    continue;
442                }
443
444                let string = string_array.value(i);
445                let substr = substr_index_rslice_finder(
446                    string,
447                    &finder_rev,
448                    delimiter.len(),
449                    occurrence_idx,
450                );
451                has_out_of_line |=
452                    append_substr_view(&mut views_buf, raw_view, string, substr);
453            }
454        }
455    }
456
457    let data_buffers = if has_out_of_line {
458        string_array.data_buffers().to_vec()
459    } else {
460        vec![]
461    };
462
463    // Safety: each appended view is either:
464    // (1) a copied null sentinel,
465    // (2) the original valid input view,
466    // (3) an inline empty string view, or
467    // (4) built by `append_view` for a contiguous substring of the input row.
468    unsafe {
469        Ok(Arc::new(StringViewArray::new_unchecked(
470            ScalarBuffer::from(views_buf),
471            data_buffers,
472            string_array.nulls().cloned(),
473        )) as ArrayRef)
474    }
475}
476
477fn map_strings<'a, S, O, F>(
478    string_array: S,
479    mut builder: GenericStringArrayBuilder<O>,
480    f: F,
481) -> Result<ArrayRef>
482where
483    S: StringArrayType<'a> + Copy,
484    O: OffsetSizeTrait,
485    F: Fn(&'a str) -> &'a str,
486{
487    let nulls = string_array.nulls().cloned();
488    for i in 0..string_array.len() {
489        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
490            builder.append_placeholder();
491            continue;
492        }
493        // SAFETY: `i < string_array.len()` and `nulls` is valid at i, so the
494        // input is also valid at i.
495        let s = unsafe { string_array.value_unchecked(i) };
496        builder.append_value(f(s));
497    }
498    Ok(Arc::new(builder.finish(nulls)?) as ArrayRef)
499}
500
501#[inline]
502fn substr_index_slice<'a>(string: &'a str, delimiter: &str, count: i64) -> &'a str {
503    if count == 0 || string.is_empty() || delimiter.is_empty() {
504        return &string[..0];
505    }
506
507    if delimiter.len() == 1 {
508        return substr_index_single_byte(string, delimiter.as_bytes()[0], count);
509    }
510
511    let occurrences = usize::try_from(count.unsigned_abs()).unwrap_or(usize::MAX);
512    if count > 0 {
513        string
514            .match_indices(delimiter)
515            .nth(occurrences - 1)
516            .map(|(idx, _)| &string[..idx])
517            .unwrap_or(string)
518    } else {
519        string
520            .rmatch_indices(delimiter)
521            .nth(occurrences - 1)
522            .map(|(idx, _)| &string[idx + delimiter.len()..])
523            .unwrap_or(string)
524    }
525}
526
527#[inline]
528fn substr_index_single_byte(string: &str, delimiter: u8, count: i64) -> &str {
529    let occurrences = usize::try_from(count.unsigned_abs()).unwrap_or(usize::MAX);
530    let idx = if count > 0 {
531        memchr_iter(delimiter, string.as_bytes()).nth(occurrences - 1)
532    } else {
533        memrchr_iter(delimiter, string.as_bytes())
534            .nth(occurrences - 1)
535            .map(|idx| idx + 1)
536    };
537
538    match idx {
539        Some(idx) if count > 0 => &string[..idx],
540        Some(idx) => &string[idx..],
541        None => string,
542    }
543}
544
545#[inline]
546fn substr_index_slice_finder<'a>(
547    string: &'a str,
548    finder: &memmem::Finder,
549    delimiter_len: usize,
550    occurrence_idx: usize,
551) -> &'a str {
552    let bytes = string.as_bytes();
553    let mut start = 0;
554    for _ in 0..occurrence_idx {
555        match finder.find(&bytes[start..]) {
556            Some(pos) => start += pos + delimiter_len,
557            None => return string,
558        }
559    }
560
561    match finder.find(&bytes[start..]) {
562        Some(pos) => &string[..start + pos],
563        None => string,
564    }
565}
566
567#[inline]
568fn substr_index_rslice_finder<'a>(
569    string: &'a str,
570    finder: &memmem::FinderRev,
571    delimiter_len: usize,
572    occurrence_idx: usize,
573) -> &'a str {
574    let bytes = string.as_bytes();
575    let mut end = bytes.len();
576    for _ in 0..occurrence_idx {
577        match finder.rfind(&bytes[..end]) {
578            Some(pos) => end = pos,
579            None => return string,
580        }
581    }
582
583    match finder.rfind(&bytes[..end]) {
584        Some(pos) => &string[pos + delimiter_len..],
585        None => string,
586    }
587}
588
589#[inline]
590fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 {
591    if substr.len() > 12 {
592        let view = ByteView::from(*original_view);
593        make_view(
594            substr.as_bytes(),
595            view.buffer_index,
596            view.offset + start_offset,
597        )
598    } else {
599        make_view(substr.as_bytes(), 0, 0)
600    }
601}
602
603#[inline]
604fn append_substr_view(
605    views_buf: &mut Vec<u128>,
606    raw_view: &u128,
607    string: &str,
608    substr: &str,
609) -> bool {
610    if substr.len() == string.len() {
611        views_buf.push(*raw_view);
612        return substr.len() > 12;
613    }
614
615    if substr.is_empty() {
616        views_buf.push(make_view(b"", 0, 0));
617        return false;
618    }
619
620    let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize;
621    let start_offset =
622        u32::try_from(start_offset).expect("string view offsets fit in u32");
623    views_buf.push(substr_view(raw_view, substr, start_offset));
624    substr.len() > 12
625}
626
// Unit tests for `substr_index`: scalar invocations through `test_function!`,
// the array + scalar fast path, and the `Utf8View` kernels called directly.
#[cfg(test)]
mod tests {
    use arrow::array::{
        Array, ArrayRef, AsArray, Int64Array, StringArray, StringViewArray,
    };
    use arrow::datatypes::DataType::{Utf8, Utf8View};
    use arrow::datatypes::{DataType, Field};

    use datafusion_common::config::ConfigOptions;
    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
    use std::sync::Arc;

    use crate::unicode::substrindex::SubstrIndexFunc;
    use crate::utils::test::test_function;

    // All-scalar invocations covering positive, negative and zero counts,
    // empty string, empty delimiter, and `Utf8View` inputs.
    #[test]
    fn test_functions() -> Result<()> {
        // count = 1: text before the first delimiter.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("www.apache.org")),
                ColumnarValue::Scalar(ScalarValue::from(".")),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            Ok(Some("www")),
            &str,
            Utf8,
            StringArray
        );
        // count = 2: text before the second delimiter.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("www.apache.org")),
                ColumnarValue::Scalar(ScalarValue::from(".")),
                ColumnarValue::Scalar(ScalarValue::from(2i64)),
            ],
            Ok(Some("www.apache")),
            &str,
            Utf8,
            StringArray
        );
        // count = -2: text after the second delimiter from the right.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("www.apache.org")),
                ColumnarValue::Scalar(ScalarValue::from(".")),
                ColumnarValue::Scalar(ScalarValue::from(-2i64)),
            ],
            Ok(Some("apache.org")),
            &str,
            Utf8,
            StringArray
        );
        // count = -1: text after the last delimiter.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("www.apache.org")),
                ColumnarValue::Scalar(ScalarValue::from(".")),
                ColumnarValue::Scalar(ScalarValue::from(-1i64)),
            ],
            Ok(Some("org")),
            &str,
            Utf8,
            StringArray
        );
        // count = 0: empty result.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("www.apache.org")),
                ColumnarValue::Scalar(ScalarValue::from(".")),
                ColumnarValue::Scalar(ScalarValue::from(0i64)),
            ],
            Ok(Some("")),
            &str,
            Utf8,
            StringArray
        );
        // Empty input string: empty result.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("")),
                ColumnarValue::Scalar(ScalarValue::from(".")),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            Ok(Some("")),
            &str,
            Utf8,
            StringArray
        );
        // Empty delimiter: empty result.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::from("www.apache.org")),
                ColumnarValue::Scalar(ScalarValue::from("")),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            Ok(Some("")),
            &str,
            Utf8,
            StringArray
        );
        // Utf8View input whose result is longer than 12 bytes.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
                    "verylongprefix.segment.tail".into(),
                ))),
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into()))),
                ColumnarValue::Scalar(ScalarValue::from(1i64)),
            ],
            Ok(Some("verylongprefix")),
            &str,
            Utf8View,
            StringViewArray
        );
        // Utf8View input with a negative count and a short result.
        test_function!(
            SubstrIndexFunc::new(),
            vec![
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
                    "www.apache.org".into(),
                ))),
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into()))),
                ColumnarValue::Scalar(ScalarValue::from(-1i64)),
            ],
            Ok(Some("org")),
            &str,
            Utf8View,
            StringViewArray
        );
        Ok(())
    }

    // Array of strings with scalar delimiter and count, invoked through
    // `invoke_with_args`; includes a null row.
    #[test]
    fn test_substr_index_utf8view_scalar_fast_path() -> Result<()> {
        let input = Arc::new(StringViewArray::from(vec![
            Some("alpha.beta.gamma"),
            Some("short.val"),
            None,
        ])) as ArrayRef;

        let arg_fields = vec![
            Field::new("a", Utf8View, true).into(),
            Field::new("b", Utf8View, true).into(),
            Field::new("c", DataType::Int64, true).into(),
        ];

        let args = ScalarFunctionArgs {
            number_rows: input.len(),
            args: vec![
                ColumnarValue::Array(Arc::clone(&input)),
                ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into()))),
                ColumnarValue::Scalar(ScalarValue::Int64(Some(1))),
            ],
            arg_fields,
            return_field: Field::new("f", Utf8View, true).into(),
            config_options: Arc::new(ConfigOptions::default()),
        };

        let result = match SubstrIndexFunc::new().invoke_with_args(args)? {
            ColumnarValue::Array(result) => result,
            other => panic!("expected array result, got {other:?}"),
        };
        let result = result.as_string_view();

        assert_eq!(result.len(), 3);
        assert_eq!(result.value(0), "alpha");
        assert_eq!(result.value(1), "short");
        assert!(result.is_null(2));

        Ok(())
    }

    // Calls the all-array `Utf8View` kernel on sliced inputs (first row
    // dropped), mixing results longer and shorter than 12 bytes and a null.
    #[test]
    fn test_substr_index_utf8view_array_sliced() -> Result<()> {
        use super::substr_index_view;

        let strings: StringViewArray = vec![
            Some("skip_this.value"),
            Some("this_is_a_long_prefix.suffix"),
            Some("short.val"),
            Some("another_long_result.rest"),
            None,
        ]
        .into_iter()
        .collect();
        let delimiters: StringViewArray =
            vec![Some("."), Some("."), Some("."), Some("."), Some(".")]
                .into_iter()
                .collect();
        let counts = Int64Array::from(vec![1, 1, -1, 1, 1]);

        let sliced_strings = strings.slice(1, 4);
        let sliced_delimiters = delimiters.slice(1, 4);
        let sliced_counts = counts.slice(1, 4);

        let result =
            substr_index_view(&sliced_strings, &sliced_delimiters, &sliced_counts)?;
        let result = result.as_string_view();

        assert_eq!(result.len(), 4);
        assert_eq!(result.value(0), "this_is_a_long_prefix");
        assert_eq!(result.value(1), "val");
        assert_eq!(result.value(2), "another_long_result");
        assert!(result.is_null(3));

        Ok(())
    }

    // When the delimiter does not occur, the scalar `Utf8View` kernel must
    // emit the input views unchanged.
    #[test]
    fn test_substr_index_utf8view_scalar_reuses_original_view_when_unchanged()
    -> Result<()> {
        use super::substr_index_scalar_view;

        let strings: StringViewArray = vec![
            Some("very_long_value_without_separator"),
            Some("short"),
            None,
        ]
        .into_iter()
        .collect();

        let result = substr_index_scalar_view(&strings, ".", 1)?;
        let result = result.as_string_view();

        assert_eq!(result.len(), 3);
        assert_eq!(result.value(0), "very_long_value_without_separator");
        assert_eq!(result.value(1), "short");
        assert_eq!(result.views()[0], strings.views()[0]);
        assert_eq!(result.views()[1], strings.views()[1]);
        assert!(result.is_null(2));

        Ok(())
    }
}
861}