datafusion_functions/unicode/
common.rs1use arrow::array::{
21 Array, ArrayRef, ByteView, GenericStringArray, Int64Array, OffsetSizeTrait,
22 StringViewArray, make_view,
23};
24use arrow::datatypes::DataType;
25use arrow_buffer::{NullBuffer, ScalarBuffer};
26use datafusion_common::Result;
27use datafusion_common::ScalarValue;
28use datafusion_common::cast::{
29 as_generic_string_array, as_int64_array, as_string_view_array,
30};
31use datafusion_common::exec_err;
32use datafusion_expr::ColumnarValue;
33use std::cmp::Ordering;
34use std::ops::Range;
35use std::sync::Arc;
36
37pub(crate) fn try_as_scalar_str(cv: &ColumnarValue) -> Option<&str> {
39 match cv {
40 ColumnarValue::Scalar(s) => s.try_as_str().flatten(),
41 _ => None,
42 }
43}
44
45pub(crate) fn try_as_scalar_i64(cv: &ColumnarValue) -> Option<i64> {
47 match cv {
48 ColumnarValue::Scalar(ScalarValue::Int64(v)) => *v,
49 _ => None,
50 }
51}
52
53pub(crate) trait LeftRightSlicer {
55 fn slice(string: &str, n: i64) -> Range<usize>;
56}
57
58pub(crate) struct LeftSlicer {}
59
60impl LeftRightSlicer for LeftSlicer {
61 fn slice(string: &str, n: i64) -> Range<usize> {
62 0..left_right_byte_length(string, n)
63 }
64}
65
66pub(crate) struct RightSlicer {}
67
68impl LeftRightSlicer for RightSlicer {
69 fn slice(string: &str, n: i64) -> Range<usize> {
70 if n == 0 {
71 0..0
73 } else if n == i64::MIN {
74 0..0
76 } else {
77 left_right_byte_length(string, -n)..string.len()
78 }
79 }
80}
81
82#[inline]
85pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
86 string
87 .char_indices()
88 .nth(n)
89 .map_or(string.len(), |(i, _)| i)
90}
91
92#[inline]
95pub(crate) fn char_count_or_boundary(string: &str, n: usize) -> StringCharLen {
96 let mut count = 0;
97 for (byte_idx, _) in string.char_indices() {
98 if count == n {
99 return StringCharLen::ByteOffset(byte_idx);
100 }
101 count += 1;
102 }
103 StringCharLen::CharCount(count)
104}
105
106pub(crate) enum StringCharLen {
108 ByteOffset(usize),
111 CharCount(usize),
113}
114
115#[inline]
117fn left_right_byte_length(string: &str, n: i64) -> usize {
118 match n.cmp(&0) {
119 Ordering::Less => string
120 .char_indices()
121 .nth_back((n.unsigned_abs().min(usize::MAX as u64) - 1) as usize)
122 .map(|(index, _)| index)
123 .unwrap_or(0),
124 Ordering::Equal => 0,
125 Ordering::Greater => {
126 byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize)
127 }
128 }
129}
130
131pub(crate) fn general_left_right<F: LeftRightSlicer>(
133 args: &[ArrayRef],
134) -> Result<ArrayRef> {
135 let n_array = as_int64_array(&args[1])?;
136
137 match args[0].data_type() {
138 DataType::Utf8 => {
139 let string_array = as_generic_string_array::<i32>(&args[0])?;
140 general_left_right_array::<i32, F>(string_array, n_array)
141 }
142 DataType::LargeUtf8 => {
143 let string_array = as_generic_string_array::<i64>(&args[0])?;
144 general_left_right_array::<i64, F>(string_array, n_array)
145 }
146 DataType::Utf8View => {
147 let string_view_array = as_string_view_array(&args[0])?;
148 general_left_right_view::<F>(string_view_array, n_array)
149 }
150 _ => exec_err!("Not supported"),
151 }
152}
153
154fn values_fit_in_i32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>) -> bool {
157 string_array
158 .offsets()
159 .last()
160 .map(|offset| offset.as_usize() <= i32::MAX as usize)
161 .unwrap_or(true)
162}
163
164fn general_left_right_array<T: OffsetSizeTrait, F: LeftRightSlicer>(
170 string_array: &GenericStringArray<T>,
171 n_array: &Int64Array,
172) -> Result<ArrayRef> {
173 if !values_fit_in_i32(string_array) {
174 let result = string_array
175 .iter()
176 .zip(n_array.iter())
177 .map(|(string, n)| match (string, n) {
178 (Some(string), Some(n)) => Some(&string[F::slice(string, n)]),
179 _ => None,
180 })
181 .collect::<StringViewArray>();
182 return Ok(Arc::new(result) as ArrayRef);
183 }
184
185 let len = string_array.len();
186 let offsets = string_array.value_offsets();
187 let nulls = NullBuffer::union(string_array.nulls(), n_array.nulls());
188
189 let mut views_buf = Vec::with_capacity(len);
190 let mut has_out_of_line = false;
191
192 for (i, offset) in offsets.iter().enumerate().take(len) {
193 if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
194 views_buf.push(0);
195 continue;
196 }
197
198 let string = unsafe { string_array.value_unchecked(i) };
200 let n = n_array.value(i);
201 let range = F::slice(string, n);
202 let result_bytes = &string.as_bytes()[range.clone()];
203 if result_bytes.len() > 12 {
204 has_out_of_line = true;
205 }
206
207 let buf_offset = offset.as_usize() as u32 + range.start as u32;
208 views_buf.push(make_view(result_bytes, 0, buf_offset));
209 }
210
211 let views = ScalarBuffer::from(views_buf);
212 let data_buffers = if has_out_of_line {
213 vec![string_array.values().clone()]
214 } else {
215 vec![]
216 };
217
218 unsafe {
224 let array = StringViewArray::new_unchecked(views, data_buffers, nulls);
225 Ok(Arc::new(array) as ArrayRef)
226 }
227}
228
229fn general_left_right_view<F: LeftRightSlicer>(
231 string_view_array: &StringViewArray,
232 n_array: &Int64Array,
233) -> Result<ArrayRef> {
234 let views = string_view_array.views();
235 let new_nulls = NullBuffer::union(string_view_array.nulls(), n_array.nulls());
236 let len = n_array.len();
237 let mut has_out_of_line = false;
238
239 let new_views = (0..len)
240 .map(|idx| {
241 if new_nulls.as_ref().is_some_and(|n| n.is_null(idx)) {
242 return 0;
243 }
244
245 let string: &str = unsafe { string_view_array.value_unchecked(idx) };
247 let n = n_array.value(idx);
248
249 let range = F::slice(string, n);
250 let result_bytes = &string.as_bytes()[range.clone()];
251 if result_bytes.len() > 12 {
252 has_out_of_line = true;
253 }
254
255 let byte_view = ByteView::from(views[idx]);
256 let new_offset = byte_view.offset + (range.start as u32);
257 make_view(result_bytes, byte_view.buffer_index, new_offset)
258 })
259 .collect::<Vec<u128>>();
260
261 let views = ScalarBuffer::from(new_views);
262 let data_buffers = if has_out_of_line {
263 string_view_array.data_buffers().to_vec()
264 } else {
265 vec![]
266 };
267
268 unsafe {
272 let array = StringViewArray::new_unchecked(views, data_buffers, new_nulls);
273 Ok(Arc::new(array) as ArrayRef)
274 }
275}