Skip to main content

datafusion_functions/unicode/
common.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Common utilities for implementing unicode functions
19
20use arrow::array::{
21    Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array,
22    OffsetSizeTrait, StringViewArray, make_view,
23};
24use arrow::datatypes::DataType;
25use arrow_buffer::{NullBuffer, ScalarBuffer};
26use datafusion_common::cast::{
27    as_generic_string_array, as_int64_array, as_string_view_array,
28};
29use datafusion_common::exec_err;
30use std::cmp::Ordering;
31use std::ops::Range;
32use std::sync::Arc;
33
34/// A trait for `left` and `right` byte slicing operations
35pub(crate) trait LeftRightSlicer {
36    fn slice(string: &str, n: i64) -> Range<usize>;
37}
38
39pub(crate) struct LeftSlicer {}
40
41impl LeftRightSlicer for LeftSlicer {
42    fn slice(string: &str, n: i64) -> Range<usize> {
43        0..left_right_byte_length(string, n)
44    }
45}
46
47pub(crate) struct RightSlicer {}
48
49impl LeftRightSlicer for RightSlicer {
50    fn slice(string: &str, n: i64) -> Range<usize> {
51        if n == 0 {
52            // Return nothing for `n=0`
53            0..0
54        } else if n == i64::MIN {
55            // Special case for i64::MIN overflow
56            0..0
57        } else {
58            left_right_byte_length(string, -n)..string.len()
59        }
60    }
61}
62
63/// Calculate the byte length of the substring of `n` chars from string `string`
64#[inline]
65fn left_right_byte_length(string: &str, n: i64) -> usize {
66    match n.cmp(&0) {
67        Ordering::Less => string
68            .char_indices()
69            .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize)
70            .map(|(index, _)| index)
71            .unwrap_or(0),
72        Ordering::Equal => 0,
73        Ordering::Greater => string
74            .char_indices()
75            .nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
76            .map(|(index, _)| index)
77            .unwrap_or(string.len()),
78    }
79}
80
81/// General implementation for `left` and `right` functions
82pub(crate) fn general_left_right<F: LeftRightSlicer>(
83    args: &[ArrayRef],
84) -> datafusion_common::Result<ArrayRef> {
85    let n_array = as_int64_array(&args[1])?;
86
87    match args[0].data_type() {
88        DataType::Utf8 => {
89            let string_array = as_generic_string_array::<i32>(&args[0])?;
90            general_left_right_array::<i32, _, F>(string_array, n_array)
91        }
92        DataType::LargeUtf8 => {
93            let string_array = as_generic_string_array::<i64>(&args[0])?;
94            general_left_right_array::<i64, _, F>(string_array, n_array)
95        }
96        DataType::Utf8View => {
97            let string_view_array = as_string_view_array(&args[0])?;
98            general_left_right_view::<F>(string_view_array, n_array)
99        }
100        _ => exec_err!("Not supported"),
101    }
102}
103
104/// `general_left_right` implementation for strings
105fn general_left_right_array<
106    'a,
107    T: OffsetSizeTrait,
108    V: ArrayAccessor<Item = &'a str>,
109    F: LeftRightSlicer,
110>(
111    string_array: V,
112    n_array: &Int64Array,
113) -> datafusion_common::Result<ArrayRef> {
114    let iter = ArrayIter::new(string_array);
115    let result = iter
116        .zip(n_array.iter())
117        .map(|(string, n)| match (string, n) {
118            (Some(string), Some(n)) => {
119                let range = F::slice(string, n);
120                // Extract a given range from a byte-indexed slice
121                Some(&string[range])
122            }
123            _ => None,
124        })
125        .collect::<GenericStringArray<T>>();
126
127    Ok(Arc::new(result) as ArrayRef)
128}
129
130/// `general_left_right` implementation for StringViewArray
131fn general_left_right_view<F: LeftRightSlicer>(
132    string_view_array: &StringViewArray,
133    n_array: &Int64Array,
134) -> datafusion_common::Result<ArrayRef> {
135    let len = n_array.len();
136
137    let views = string_view_array.views();
138    // Every string in StringViewArray has one corresponding view in `views`
139    debug_assert!(views.len() == string_view_array.len());
140
141    // Compose null buffer at once
142    let string_nulls = string_view_array.nulls();
143    let n_nulls = n_array.nulls();
144    let new_nulls = NullBuffer::union(string_nulls, n_nulls);
145
146    let new_views = (0..len)
147        .map(|idx| {
148            let view = views[idx];
149
150            let is_valid = match &new_nulls {
151                Some(nulls_buf) => nulls_buf.is_valid(idx),
152                None => true,
153            };
154
155            if is_valid {
156                let string: &str = string_view_array.value(idx);
157                let n = n_array.value(idx);
158
159                // Input string comes from StringViewArray, so it should fit in 32-bit length
160                let range = F::slice(string, n);
161                let result_bytes = &string.as_bytes()[range.clone()];
162
163                let byte_view = ByteView::from(view);
164                // New offset starts at 0 for left, and at `range.start` for right,
165                // which is encoded in the given range
166                let new_offset = byte_view.offset + (range.start as u32);
167                // Reuse buffer
168                make_view(result_bytes, byte_view.buffer_index, new_offset)
169            } else {
170                // For nulls, keep the original view
171                view
172            }
173        })
174        .collect::<Vec<u128>>();
175
176    // Buffers are unchanged
177    let result = StringViewArray::try_new(
178        ScalarBuffer::from(new_views),
179        Vec::from(string_view_array.data_buffers()),
180        new_nulls,
181    )?;
182    Ok(Arc::new(result) as ArrayRef)
183}