Skip to main content

datafusion_functions/unicode/
common.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Common utilities for implementing unicode functions
19
20use arrow::array::{
21    Array, ArrayRef, ByteView, GenericStringArray, Int64Array, OffsetSizeTrait,
22    StringViewArray, make_view,
23};
24use arrow::datatypes::DataType;
25use arrow_buffer::{NullBuffer, ScalarBuffer};
26use datafusion_common::Result;
27use datafusion_common::ScalarValue;
28use datafusion_common::cast::{
29    as_generic_string_array, as_int64_array, as_string_view_array,
30};
31use datafusion_common::exec_err;
32use datafusion_expr::ColumnarValue;
33use std::cmp::Ordering;
34use std::ops::Range;
35use std::sync::Arc;
36
37/// If `cv` is a non-null scalar string, return its value.
38pub(crate) fn try_as_scalar_str(cv: &ColumnarValue) -> Option<&str> {
39    match cv {
40        ColumnarValue::Scalar(s) => s.try_as_str().flatten(),
41        _ => None,
42    }
43}
44
45/// If `cv` is a non-null scalar Int64, return its value.
46pub(crate) fn try_as_scalar_i64(cv: &ColumnarValue) -> Option<i64> {
47    match cv {
48        ColumnarValue::Scalar(ScalarValue::Int64(v)) => *v,
49        _ => None,
50    }
51}
52
53/// A trait for `left` and `right` byte slicing operations
54pub(crate) trait LeftRightSlicer {
55    fn slice(string: &str, n: i64) -> Range<usize>;
56}
57
58pub(crate) struct LeftSlicer {}
59
60impl LeftRightSlicer for LeftSlicer {
61    fn slice(string: &str, n: i64) -> Range<usize> {
62        0..left_right_byte_length(string, n)
63    }
64}
65
66pub(crate) struct RightSlicer {}
67
68impl LeftRightSlicer for RightSlicer {
69    fn slice(string: &str, n: i64) -> Range<usize> {
70        if n == 0 {
71            // Return nothing for `n=0`
72            0..0
73        } else if n == i64::MIN {
74            // Special case for i64::MIN overflow
75            0..0
76        } else {
77            left_right_byte_length(string, -n)..string.len()
78        }
79    }
80}
81
82/// Returns the byte offset of the `n`th codepoint in `string`,
83/// or `string.len()` if the string has fewer than `n` codepoints.
84#[inline]
85pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
86    string
87        .char_indices()
88        .nth(n)
89        .map_or(string.len(), |(i, _)| i)
90}
91
92/// If `string` has more than `n` codepoints, returns the byte offset of
93/// the `n`-th codepoint boundary. Otherwise returns the total codepoint count.
94#[inline]
95pub(crate) fn char_count_or_boundary(string: &str, n: usize) -> StringCharLen {
96    let mut count = 0;
97    for (byte_idx, _) in string.char_indices() {
98        if count == n {
99            return StringCharLen::ByteOffset(byte_idx);
100        }
101        count += 1;
102    }
103    StringCharLen::CharCount(count)
104}
105
106/// Result of [`char_count_or_boundary`].
107pub(crate) enum StringCharLen {
108    /// The string has more than `n` codepoints; contains the byte offset
109    /// at the `n`-th codepoint boundary.
110    ByteOffset(usize),
111    /// The string has `n` or fewer codepoints; contains the exact count.
112    CharCount(usize),
113}
114
115/// Calculate the byte length of the substring of `n` chars from string `string`
116#[inline]
117fn left_right_byte_length(string: &str, n: i64) -> usize {
118    match n.cmp(&0) {
119        Ordering::Less => string
120            .char_indices()
121            .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize)
122            .map(|(index, _)| index)
123            .unwrap_or(0),
124        Ordering::Equal => 0,
125        Ordering::Greater => {
126            byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize)
127        }
128    }
129}
130
131/// General implementation for `left` and `right` functions
132pub(crate) fn general_left_right<F: LeftRightSlicer>(
133    args: &[ArrayRef],
134) -> Result<ArrayRef> {
135    let n_array = as_int64_array(&args[1])?;
136
137    match args[0].data_type() {
138        DataType::Utf8 => {
139            let string_array = as_generic_string_array::<i32>(&args[0])?;
140            general_left_right_array::<i32, F>(string_array, n_array)
141        }
142        DataType::LargeUtf8 => {
143            let string_array = as_generic_string_array::<i64>(&args[0])?;
144            general_left_right_array::<i64, F>(string_array, n_array)
145        }
146        DataType::Utf8View => {
147            let string_view_array = as_string_view_array(&args[0])?;
148            general_left_right_view::<F>(string_view_array, n_array)
149        }
150        _ => exec_err!("Not supported"),
151    }
152}
153
154/// Returns true if all offsets in the array fit in i32, meaning the values
155/// buffer can be referenced by StringView's offset field.
156fn values_fit_in_i32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>) -> bool {
157    string_array
158        .offsets()
159        .last()
160        .map(|offset| offset.as_usize() <= i32::MAX as usize)
161        .unwrap_or(true)
162}
163
164/// `left`/`right` for Utf8/LargeUtf8 input.
165///
166/// When offsets fit in i32, produces a zero-copy `StringViewArray` with views
167/// pointing into the input values buffer. Otherwise falls back to building a
168/// `StringViewArray` by copying.
169fn general_left_right_array<T: OffsetSizeTrait, F: LeftRightSlicer>(
170    string_array: &GenericStringArray<T>,
171    n_array: &Int64Array,
172) -> Result<ArrayRef> {
173    if !values_fit_in_i32(string_array) {
174        let result = string_array
175            .iter()
176            .zip(n_array.iter())
177            .map(|(string, n)| match (string, n) {
178                (Some(string), Some(n)) => Some(&string[F::slice(string, n)]),
179                _ => None,
180            })
181            .collect::<StringViewArray>();
182        return Ok(Arc::new(result) as ArrayRef);
183    }
184
185    let len = string_array.len();
186    let offsets = string_array.value_offsets();
187    let nulls = NullBuffer::union(string_array.nulls(), n_array.nulls());
188
189    let mut views_buf = Vec::with_capacity(len);
190    let mut has_out_of_line = false;
191
192    for (i, offset) in offsets.iter().enumerate().take(len) {
193        if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
194            views_buf.push(0);
195            continue;
196        }
197
198        // SAFETY: we just checked validity above
199        let string = unsafe { string_array.value_unchecked(i) };
200        let n = n_array.value(i);
201        let range = F::slice(string, n);
202        let result_bytes = &string.as_bytes()[range.clone()];
203        if result_bytes.len() > 12 {
204            has_out_of_line = true;
205        }
206
207        let buf_offset = offset.as_usize() as u32 + range.start as u32;
208        views_buf.push(make_view(result_bytes, 0, buf_offset));
209    }
210
211    let views = ScalarBuffer::from(views_buf);
212    let data_buffers = if has_out_of_line {
213        vec![string_array.values().clone()]
214    } else {
215        vec![]
216    };
217
218    // SAFETY:
219    // - Each view is produced by `make_view` with correct bytes and offset
220    // - Out-of-line views reference buffer index 0, which is the original
221    //   values buffer included in data_buffers when has_out_of_line is true
222    // - values_fit_in_i32 guarantees all offsets fit in i32
223    unsafe {
224        let array = StringViewArray::new_unchecked(views, data_buffers, nulls);
225        Ok(Arc::new(array) as ArrayRef)
226    }
227}
228
229/// `general_left_right` for StringViewArray input.
230fn general_left_right_view<F: LeftRightSlicer>(
231    string_view_array: &StringViewArray,
232    n_array: &Int64Array,
233) -> Result<ArrayRef> {
234    let views = string_view_array.views();
235    let new_nulls = NullBuffer::union(string_view_array.nulls(), n_array.nulls());
236    let len = n_array.len();
237    let mut has_out_of_line = false;
238
239    let new_views = (0..len)
240        .map(|idx| {
241            if new_nulls.as_ref().is_some_and(|n| n.is_null(idx)) {
242                return 0;
243            }
244
245            // SAFETY: we just checked validity above
246            let string: &str = unsafe { string_view_array.value_unchecked(idx) };
247            let n = n_array.value(idx);
248
249            let range = F::slice(string, n);
250            let result_bytes = &string.as_bytes()[range.clone()];
251            if result_bytes.len() > 12 {
252                has_out_of_line = true;
253            }
254
255            let byte_view = ByteView::from(views[idx]);
256            let new_offset = byte_view.offset + (range.start as u32);
257            make_view(result_bytes, byte_view.buffer_index, new_offset)
258        })
259        .collect::<Vec<u128>>();
260
261    let views = ScalarBuffer::from(new_views);
262    let data_buffers = if has_out_of_line {
263        string_view_array.data_buffers().to_vec()
264    } else {
265        vec![]
266    };
267
268    // SAFETY:
269    // - Each view is produced by `make_view` with correct bytes and offset
270    // - Out-of-line views reuse the original buffer index and adjusted offset
271    unsafe {
272        let array = StringViewArray::new_unchecked(views, data_buffers, new_nulls);
273        Ok(Arc::new(array) as ArrayRef)
274    }
275}