1pub mod expr;
21pub mod memory;
22pub mod proxy;
23pub mod string_utils;
24
25use crate::assert_or_internal_err;
26use crate::error::{_exec_datafusion_err, _internal_datafusion_err};
27use crate::{Result, ScalarValue};
28use arrow::array::{
29 Array, ArrayRef, FixedSizeListArray, LargeListArray, ListArray, OffsetSizeTrait,
30 cast::AsArray,
31};
32use arrow::buffer::OffsetBuffer;
33use arrow::compute::{SortColumn, SortOptions, partition};
34use arrow::datatypes::{DataType, Field, SchemaRef};
35#[cfg(feature = "sql")]
36use sqlparser::{ast::Ident, dialect::GenericDialect, parser::Parser};
37use std::borrow::{Borrow, Cow};
38use std::cmp::{Ordering, min};
39use std::collections::HashSet;
40use std::num::NonZero;
41use std::ops::Range;
42use std::sync::Arc;
43use std::thread::available_parallelism;
44
45pub fn project_schema(
72 schema: &SchemaRef,
73 projection: Option<&impl AsRef<[usize]>>,
74) -> Result<SchemaRef> {
75 let schema = match projection {
76 Some(columns) => Arc::new(schema.project(columns.as_ref())?),
77 None => Arc::clone(schema),
78 };
79 Ok(schema)
80}
81
82pub fn extract_row_at_idx_to_buf(
84 columns: &[ArrayRef],
85 idx: usize,
86 buf: &mut Vec<ScalarValue>,
87) -> Result<()> {
88 buf.clear();
89
90 let iter = columns
91 .iter()
92 .map(|arr| ScalarValue::try_from_array(arr, idx));
93 for v in iter.into_iter() {
94 buf.push(v?);
95 }
96
97 Ok(())
98}
99pub fn get_row_at_idx(columns: &[ArrayRef], idx: usize) -> Result<Vec<ScalarValue>> {
101 columns
102 .iter()
103 .map(|arr| ScalarValue::try_from_array(arr, idx))
104 .collect()
105}
106
107pub fn compare_rows(
109 x: &[ScalarValue],
110 y: &[ScalarValue],
111 sort_options: &[SortOptions],
112) -> Result<Ordering> {
113 let zip_it = x.iter().zip(y.iter()).zip(sort_options.iter());
114 for ((lhs, rhs), sort_options) in zip_it {
116 let result = match (lhs.is_null(), rhs.is_null(), sort_options.nulls_first) {
118 (true, false, false) | (false, true, true) => Ordering::Greater,
119 (true, false, true) | (false, true, false) => Ordering::Less,
120 (false, false, _) => {
121 if sort_options.descending {
122 rhs.try_cmp(lhs)?
123 } else {
124 lhs.try_cmp(rhs)?
125 }
126 }
127 (true, true, _) => continue,
128 };
129 if result != Ordering::Equal {
130 return Ok(result);
131 }
132 }
133 Ok(Ordering::Equal)
134}
135
136pub fn bisect<const SIDE: bool>(
141 item_columns: &[ArrayRef],
142 target: &[ScalarValue],
143 sort_options: &[SortOptions],
144) -> Result<usize> {
145 let low: usize = 0;
146 let high: usize = item_columns
147 .first()
148 .ok_or_else(|| _internal_datafusion_err!("Column array shouldn't be empty"))?
149 .len();
150 let compare_fn = |current: &[ScalarValue], target: &[ScalarValue]| {
151 let cmp = compare_rows(current, target, sort_options)?;
152 Ok(if SIDE { cmp.is_lt() } else { cmp.is_le() })
153 };
154 find_bisect_point(item_columns, target, compare_fn, low, high)
155}
156
157pub fn find_bisect_point<F>(
164 item_columns: &[ArrayRef],
165 target: &[ScalarValue],
166 compare_fn: F,
167 mut low: usize,
168 mut high: usize,
169) -> Result<usize>
170where
171 F: Fn(&[ScalarValue], &[ScalarValue]) -> Result<bool>,
172{
173 while low < high {
174 let mid = ((high - low) / 2) + low;
175 let val = get_row_at_idx(item_columns, mid)?;
176 if compare_fn(&val, target)? {
177 low = mid + 1;
178 } else {
179 high = mid;
180 }
181 }
182 Ok(low)
183}
184
185pub fn linear_search<const SIDE: bool>(
190 item_columns: &[ArrayRef],
191 target: &[ScalarValue],
192 sort_options: &[SortOptions],
193) -> Result<usize> {
194 let low: usize = 0;
195 let high: usize = item_columns
196 .first()
197 .ok_or_else(|| _internal_datafusion_err!("Column array shouldn't be empty"))?
198 .len();
199 let compare_fn = |current: &[ScalarValue], target: &[ScalarValue]| {
200 let cmp = compare_rows(current, target, sort_options)?;
201 Ok(if SIDE { cmp.is_lt() } else { cmp.is_le() })
202 };
203 search_in_slice(item_columns, target, compare_fn, low, high)
204}
205
206pub fn search_in_slice<F>(
211 item_columns: &[ArrayRef],
212 target: &[ScalarValue],
213 compare_fn: F,
214 mut low: usize,
215 high: usize,
216) -> Result<usize>
217where
218 F: Fn(&[ScalarValue], &[ScalarValue]) -> Result<bool>,
219{
220 while low < high {
221 let val = get_row_at_idx(item_columns, low)?;
222 if !compare_fn(&val, target)? {
223 break;
224 }
225 low += 1;
226 }
227 Ok(low)
228}
229
230pub fn evaluate_partition_ranges(
235 num_rows: usize,
236 partition_columns: &[SortColumn],
237) -> Result<Vec<Range<usize>>> {
238 Ok(if partition_columns.is_empty() {
239 vec![Range {
240 start: 0,
241 end: num_rows,
242 }]
243 } else {
244 let cols: Vec<_> = partition_columns
245 .iter()
246 .map(|x| Arc::clone(&x.values))
247 .collect();
248 partition(&cols)?.ranges()
249 })
250}
251
/// Wraps `s` in double quotes (doubling any embedded `"`) unless it is
/// already a valid unquoted identifier, in which case it is returned
/// borrowed without allocation.
pub fn quote_identifier(s: &str) -> Cow<'_, str> {
    if !needs_quotes(s) {
        return Cow::Borrowed(s);
    }
    let escaped = s.replace('"', "\"\"");
    Cow::Owned(format!("\"{escaped}\""))
}

/// Returns `true` when `s` must be quoted to round-trip as an identifier:
/// a quote-free identifier must start with an ASCII lowercase letter or `_`
/// and contain only ASCII lowercase letters, digits, and underscores.
/// (The empty string needs no quoting.)
fn needs_quotes(s: &str) -> bool {
    let mut chars = s.chars();
    match chars.next() {
        Some(first) if !(first.is_ascii_lowercase() || first == '_') => true,
        // Remaining characters must all be lowercase, digit, or underscore.
        _ => !chars.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_'),
    }
}
277
/// Parses `s` as a (possibly dot-separated, multi-part) SQL identifier
/// using sqlparser's generic dialect.
///
/// # Errors
/// Returns an error if `s` cannot be tokenized or parsed as an identifier.
#[cfg(feature = "sql")]
pub(crate) fn parse_identifiers(s: &str) -> Result<Vec<Ident>> {
    let dialect = GenericDialect;
    let mut parser = Parser::new(&dialect).try_with_sql(s)?;
    Ok(parser.parse_multipart_identifier()?)
}
285
/// Parses `s` into identifier parts and normalizes them: quoted parts and
/// (when `ignore_case` is set) unquoted parts are kept verbatim; otherwise
/// unquoted parts are lowercased. Parse failures yield an empty vector.
#[cfg(feature = "sql")]
pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec<String> {
    let idents = parse_identifiers(s).unwrap_or_default();
    idents
        .into_iter()
        .map(|ident| {
            if ident.quote_style.is_some() || ignore_case {
                ident.value
            } else {
                ident.value.to_ascii_lowercase()
            }
        })
        .collect()
}
301
302#[cfg(not(feature = "sql"))]
303pub(crate) fn parse_identifiers(s: &str) -> Result<Vec<String>> {
304 let mut result = Vec::new();
305 let mut current = String::new();
306 let mut in_quotes = false;
307
308 for ch in s.chars() {
309 match ch {
310 '"' => {
311 in_quotes = !in_quotes;
312 current.push(ch);
313 }
314 '.' if !in_quotes => {
315 result.push(current.clone());
316 current.clear();
317 }
318 _ => {
319 current.push(ch);
320 }
321 }
322 }
323
324 if !current.is_empty() {
326 result.push(current);
327 }
328
329 Ok(result)
330}
331
332#[cfg(not(feature = "sql"))]
333pub(crate) fn parse_identifiers_normalized(s: &str, ignore_case: bool) -> Vec<String> {
334 parse_identifiers(s)
335 .unwrap_or_default()
336 .into_iter()
337 .map(|id| {
338 let is_double_quoted = if id.len() > 2 {
339 let mut chars = id.chars();
340 chars.next() == Some('"') && chars.last() == Some('"')
341 } else {
342 false
343 };
344 if is_double_quoted {
345 id[1..id.len() - 1].to_string().replace("\"\"", "\"")
346 } else if ignore_case {
347 id
348 } else {
349 id.to_ascii_lowercase()
350 }
351 })
352 .collect::<Vec<_>>()
353}
354
355pub fn get_at_indices<T: Clone, I: Borrow<usize>>(
357 items: &[T],
358 indices: impl IntoIterator<Item = I>,
359) -> Result<Vec<T>> {
360 indices
361 .into_iter()
362 .map(|idx| items.get(*idx.borrow()).cloned())
363 .collect::<Option<Vec<T>>>()
364 .ok_or_else(|| {
365 _exec_datafusion_err!("Expects indices to be in the range of searched vector")
366 })
367}
368
/// Returns the length of the longest prefix of `sequence` equal to
/// `0, 1, 2, …` — i.e. how many leading elements match their own position.
pub fn longest_consecutive_prefix<T: Borrow<usize>>(
    sequence: impl IntoIterator<Item = T>,
) -> usize {
    sequence
        .into_iter()
        .enumerate()
        .take_while(|(position, value)| value.borrow() == position)
        .count()
}
386
/// Builder producing list-typed arrays (`ListArray`, `LargeListArray`,
/// `FixedSizeListArray`) that contain exactly one row whose values are the
/// wrapped array.
#[derive(Debug, Clone)]
pub struct SingleRowListArrayBuilder {
    // The values making up the single list entry.
    arr: ArrayRef,
    // Whether the produced list field is declared nullable (defaults to true).
    nullable: bool,
    // Optional name for the produced field; `None` uses the default
    // list-field name (see `Field::new_list_field`).
    field_name: Option<String>,
}
418
impl SingleRowListArrayBuilder {
    /// Creates a builder wrapping `arr`; defaults to a nullable, unnamed field.
    pub fn new(arr: ArrayRef) -> Self {
        Self {
            arr,
            nullable: true,
            field_name: None,
        }
    }

    /// Sets whether the produced list field is declared nullable.
    pub fn with_nullable(mut self, nullable: bool) -> Self {
        self.nullable = nullable;
        self
    }

    /// Sets (or clears) the name used for the produced list field.
    pub fn with_field_name(mut self, field_name: Option<String>) -> Self {
        self.field_name = field_name;
        self
    }

    /// Copies name and nullability from an existing `Field`.
    pub fn with_field(self, field: &Field) -> Self {
        self.with_field_name(Some(field.name().to_owned()))
            .with_nullable(field.is_nullable())
    }

    /// Builds a `ListArray` whose single row contains all wrapped values.
    pub fn build_list_array(self) -> ListArray {
        let (field, arr) = self.into_field_and_arr();
        // A single offset span covering every element => exactly one row.
        let offsets = OffsetBuffer::from_lengths([arr.len()]);
        ListArray::new(field, offsets, arr, None)
    }

    /// Builds a `ScalarValue::List` wrapping [`Self::build_list_array`].
    pub fn build_list_scalar(self) -> ScalarValue {
        ScalarValue::List(Arc::new(self.build_list_array()))
    }

    /// Builds a `LargeListArray` whose single row contains all wrapped values.
    pub fn build_large_list_array(self) -> LargeListArray {
        let (field, arr) = self.into_field_and_arr();
        let offsets = OffsetBuffer::from_lengths([arr.len()]);
        LargeListArray::new(field, offsets, arr, None)
    }

    /// Builds a `ScalarValue::LargeList` wrapping [`Self::build_large_list_array`].
    pub fn build_large_list_scalar(self) -> ScalarValue {
        ScalarValue::LargeList(Arc::new(self.build_large_list_array()))
    }

    /// Builds a `FixedSizeListArray` with the given element count per row.
    /// NOTE(review): `list_size` is cast with `as i32`; values above i32::MAX
    /// would wrap — presumably callers pass small sizes. Verify at call sites.
    pub fn build_fixed_size_list_array(self, list_size: usize) -> FixedSizeListArray {
        let (field, arr) = self.into_field_and_arr();
        FixedSizeListArray::new(field, list_size as i32, arr, None)
    }

    /// Builds a `ScalarValue::FixedSizeList` wrapping
    /// [`Self::build_fixed_size_list_array`].
    pub fn build_fixed_size_list_scalar(self, list_size: usize) -> ScalarValue {
        ScalarValue::FixedSizeList(Arc::new(self.build_fixed_size_list_array(list_size)))
    }

    // Consumes the builder, returning the element field (named if a name was
    // set, default list field otherwise) together with the wrapped array.
    fn into_field_and_arr(self) -> (Arc<Field>, ArrayRef) {
        let Self {
            arr,
            nullable,
            field_name,
        } = self;
        let data_type = arr.data_type().to_owned();
        let field = match field_name {
            Some(name) => Field::new(name, data_type, nullable),
            None => Field::new_list_field(data_type, nullable),
        };
        (Arc::new(field), arr)
    }
}
497
498pub fn arrays_into_list_array(
521 arr: impl IntoIterator<Item = ArrayRef>,
522) -> Result<ListArray> {
523 let arr = arr.into_iter().collect::<Vec<_>>();
524 assert_or_internal_err!(!arr.is_empty(), "Cannot wrap empty array into list array");
525
526 let lens = arr.iter().map(|x| x.len()).collect::<Vec<_>>();
527 let data_type = arr[0].data_type().to_owned();
529 let values = arr.iter().map(|x| x.as_ref()).collect::<Vec<_>>();
530 Ok(ListArray::new(
531 Arc::new(Field::new_list_field(data_type, true)),
532 OffsetBuffer::from_lengths(lens),
533 arrow::compute::concat(values.as_slice())?,
534 None,
535 ))
536}
537
538pub fn list_to_arrays<O: OffsetSizeTrait>(a: &ArrayRef) -> Vec<ArrayRef> {
540 a.as_list::<O>().iter().flatten().collect::<Vec<_>>()
541}
542
543pub fn fixed_size_list_to_arrays(a: &ArrayRef) -> Vec<ArrayRef> {
545 a.as_fixed_size_list().iter().flatten().collect::<Vec<_>>()
546}
547
548pub fn base_type(data_type: &DataType) -> DataType {
564 match data_type {
565 DataType::List(field)
566 | DataType::LargeList(field)
567 | DataType::FixedSizeList(field, _) => base_type(field.data_type()),
568 _ => data_type.to_owned(),
569 }
570}
571
/// Describes how list-typed values should be coerced during type resolution.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
pub enum ListCoercion {
    /// Coerce `FixedSizeList` inputs to variable-size `List`.
    FixedSizedListToList,
}
578
579pub fn coerced_type_with_base_type_only(
593 data_type: &DataType,
594 base_type: &DataType,
595 array_coercion: Option<&ListCoercion>,
596) -> DataType {
597 match (data_type, array_coercion) {
598 (DataType::List(field), _)
599 | (DataType::FixedSizeList(field, _), Some(ListCoercion::FixedSizedListToList)) =>
600 {
601 let field_type = coerced_type_with_base_type_only(
602 field.data_type(),
603 base_type,
604 array_coercion,
605 );
606
607 DataType::List(Arc::new(Field::new(
608 field.name(),
609 field_type,
610 field.is_nullable(),
611 )))
612 }
613 (DataType::FixedSizeList(field, len), _) => {
614 let field_type = coerced_type_with_base_type_only(
615 field.data_type(),
616 base_type,
617 array_coercion,
618 );
619
620 DataType::FixedSizeList(
621 Arc::new(Field::new(field.name(), field_type, field.is_nullable())),
622 *len,
623 )
624 }
625 (DataType::LargeList(field), _) => {
626 let field_type = coerced_type_with_base_type_only(
627 field.data_type(),
628 base_type,
629 array_coercion,
630 );
631
632 DataType::LargeList(Arc::new(Field::new(
633 field.name(),
634 field_type,
635 field.is_nullable(),
636 )))
637 }
638
639 _ => base_type.clone(),
640 }
641}
642
643pub fn coerced_fixed_size_list_to_list(data_type: &DataType) -> DataType {
645 match data_type {
646 DataType::List(field) | DataType::FixedSizeList(field, _) => {
647 let field_type = coerced_fixed_size_list_to_list(field.data_type());
648
649 DataType::List(Arc::new(Field::new(
650 field.name(),
651 field_type,
652 field.is_nullable(),
653 )))
654 }
655 DataType::LargeList(field) => {
656 let field_type = coerced_fixed_size_list_to_list(field.data_type());
657
658 DataType::LargeList(Arc::new(Field::new(
659 field.name(),
660 field_type,
661 field.is_nullable(),
662 )))
663 }
664
665 _ => data_type.clone(),
666 }
667}
668
669pub fn list_ndims(data_type: &DataType) -> u64 {
671 match data_type {
672 DataType::List(field)
673 | DataType::LargeList(field)
674 | DataType::FixedSizeList(field, _) => 1 + list_ndims(field.data_type()),
675 _ => 0,
676 }
677}
678
/// String-similarity helpers (Levenshtein edit distance).
pub mod datafusion_strsim {
    use std::cmp::min;
    use std::str::Chars;

    /// Adapter allowing a `&str` to be iterated by `char` repeatedly
    /// through a shared reference.
    struct StringWrapper<'a>(&'a str);

    impl<'b> IntoIterator for &StringWrapper<'b> {
        type Item = char;
        type IntoIter = Chars<'b>;

        fn into_iter(self) -> Self::IntoIter {
            self.0.chars()
        }
    }

    /// Levenshtein distance between two re-iterable sequences using `cache`
    /// as the single-row dynamic-programming buffer. The buffer is cleared
    /// and refilled here, so callers can reuse its allocation across calls.
    fn generic_levenshtein_with_buffer<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
        a: &'a Iter1,
        b: &'b Iter2,
        cache: &mut Vec<usize>,
    ) -> usize
    where
        &'a Iter1: IntoIterator<Item = Elem1>,
        &'b Iter2: IntoIterator<Item = Elem2>,
        Elem1: PartialEq<Elem2>,
    {
        let b_len = b.into_iter().count();

        // An empty `a` needs exactly `b_len` insertions.
        if a.into_iter().next().is_none() {
            return b_len;
        }

        // Seed the DP row: transforming "" into b[..=j] costs j + 1.
        cache.clear();
        cache.extend(1..=b_len);

        let mut distance = 0;

        for (i, a_elem) in a.into_iter().enumerate() {
            // Distance from a[..=i] to the empty prefix of `b`.
            distance = i + 1;
            // Diagonal of the DP table: distance from a[..i] to b[..j].
            let mut diagonal = i;

            for (j, b_elem) in b.into_iter().enumerate() {
                let substitution = diagonal + usize::from(a_elem != b_elem);
                diagonal = cache[j];
                // Minimum of deletion, substitution, and insertion.
                distance = min(distance + 1, min(substitution, diagonal + 1));
                cache[j] = distance;
            }
        }

        distance
    }

    /// Levenshtein distance with a locally-allocated DP buffer.
    fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(
        a: &'a Iter1,
        b: &'b Iter2,
    ) -> usize
    where
        &'a Iter1: IntoIterator<Item = Elem1>,
        &'b Iter2: IntoIterator<Item = Elem2>,
        Elem1: PartialEq<Elem2>,
    {
        generic_levenshtein_with_buffer(a, b, &mut Vec::new())
    }

    /// Returns the Levenshtein (edit) distance between `a` and `b`,
    /// counted in `char`s.
    pub fn levenshtein(a: &str, b: &str) -> usize {
        generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
    }

    /// Like [`levenshtein`], but reuses `cache` as the DP scratch buffer to
    /// avoid one allocation per call.
    pub fn levenshtein_with_buffer(a: &str, b: &str, cache: &mut Vec<usize>) -> usize {
        generic_levenshtein_with_buffer(&StringWrapper(a), &StringWrapper(b), cache)
    }

    /// Normalized Levenshtein similarity in `[0.0, 1.0]`; `1.0` means the
    /// strings are identical (two empty strings score `1.0`).
    pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
        if a.is_empty() && b.is_empty() {
            return 1.0;
        }
        let longest = a.chars().count().max(b.chars().count()) as f64;
        1.0 - levenshtein(a, b) as f64 / longest
    }
}
797
/// Merges two index collections into a sorted, de-duplicated vector.
pub fn merge_and_order_indices<T: Borrow<usize>, S: Borrow<usize>>(
    first: impl IntoIterator<Item = T>,
    second: impl IntoIterator<Item = S>,
) -> Vec<usize> {
    // A BTreeSet both de-duplicates and yields its contents in ascending
    // order, so no explicit sort is needed.
    let merged: std::collections::BTreeSet<usize> = first
        .into_iter()
        .map(|e| *e.borrow())
        .chain(second.into_iter().map(|e| *e.borrow()))
        .collect();
    merged.into_iter().collect()
}
814
/// Returns the elements of `first` that do not occur in `second`,
/// preserving the order of `first`.
pub fn set_difference<T: Borrow<usize>, S: Borrow<usize>>(
    first: impl IntoIterator<Item = T>,
    second: impl IntoIterator<Item = S>,
) -> Vec<usize> {
    let excluded: HashSet<usize> = second.into_iter().map(|e| *e.borrow()).collect();
    let mut remaining = Vec::new();
    for item in first {
        let value = *item.borrow();
        if !excluded.contains(&value) {
            remaining.push(value);
        }
    }
    remaining
}
828
829pub fn find_indices<T: PartialEq, S: Borrow<T>>(
832 items: &[T],
833 targets: impl IntoIterator<Item = S>,
834) -> Result<Vec<usize>> {
835 targets
836 .into_iter()
837 .map(|target| items.iter().position(|e| target.borrow().eq(e)))
838 .collect::<Option<_>>()
839 .ok_or_else(|| _exec_datafusion_err!("Target not found"))
840}
841
/// Transposes a row-major matrix (vector of rows) into a vector of columns.
/// The number of output rows equals the length of the first input row.
pub fn transpose<T>(original: Vec<Vec<T>>) -> Vec<Vec<T>> {
    let Some(first) = original.first() else {
        return vec![];
    };
    let mut columns: Vec<Vec<T>> = Vec::new();
    columns.resize_with(first.len(), Vec::new);
    for row in original {
        // Pair each row element with its destination column; extra elements
        // beyond the first row's width are dropped by `zip`.
        for (value, column) in row.into_iter().zip(columns.iter_mut()) {
            column.push(value);
        }
    }
    columns
}
857
/// Combines a parent limit (skip/fetch) applied on top of a child limit
/// into a single equivalent (skip, fetch) pair.
///
/// Skips add up; the child's fetch is first reduced by the parent's skip,
/// and the parent's fetch then caps whatever remains. All arithmetic
/// saturates so large values cannot overflow.
pub fn combine_limit(
    parent_skip: usize,
    parent_fetch: Option<usize>,
    child_skip: usize,
    child_fetch: Option<usize>,
) -> (usize, Option<usize>) {
    let skip = child_skip.saturating_add(parent_skip);

    // Rows still available from the child after the parent skips.
    let child_remaining = child_fetch.map(|fetch| fetch.saturating_sub(parent_skip));
    let fetch = match (parent_fetch, child_remaining) {
        (Some(parent), Some(child)) => Some(min(parent, child)),
        (Some(parent), None) => Some(parent),
        (None, remaining) => remaining,
    };

    (skip, fetch)
}
920
/// Returns the number of logical CPUs available to this process, falling
/// back to 1 when the platform cannot report it.
pub fn get_available_parallelism() -> usize {
    available_parallelism().map(NonZero::get).unwrap_or(1)
}
930
931pub fn take_function_args<const N: usize, T>(
958 function_name: &str,
959 args: impl IntoIterator<Item = T>,
960) -> Result<[T; N]> {
961 let args = args.into_iter().collect::<Vec<_>>();
962 args.try_into().map_err(|v: Vec<T>| {
963 _exec_datafusion_err!(
964 "{} function requires {} {}, got {}",
965 function_name,
966 N,
967 if N == 1 { "argument" } else { "arguments" },
968 v.len()
969 )
970 })
971}
972
973#[cfg(test)]
974mod tests {
975 use super::*;
976 use crate::ScalarValue::Null;
977 use arrow::array::Float64Array;
978
    // Left/right bisect and linear search must agree on multi-column rows
    // with mixed ascending/descending sort options.
    #[test]
    fn test_bisect_linear_left_and_right() -> Result<()> {
        let arrays: Vec<ArrayRef> = vec![
            Arc::new(Float64Array::from(vec![5.0, 7.0, 8.0, 9., 10.])),
            Arc::new(Float64Array::from(vec![2.0, 3.0, 3.0, 4.0, 5.0])),
            Arc::new(Float64Array::from(vec![5.0, 7.0, 8.0, 10., 11.0])),
            Arc::new(Float64Array::from(vec![15.0, 13.0, 8.0, 5., 0.0])),
        ];
        // Target equals row 2 exactly, so left = 2 and right = 3.
        let search_tuple: Vec<ScalarValue> = vec![
            ScalarValue::Float64(Some(8.0)),
            ScalarValue::Float64(Some(3.0)),
            ScalarValue::Float64(Some(8.0)),
            ScalarValue::Float64(Some(8.0)),
        ];
        // Last column is descending; the rest ascending.
        let ords = [
            SortOptions {
                descending: false,
                nulls_first: true,
            },
            SortOptions {
                descending: false,
                nulls_first: true,
            },
            SortOptions {
                descending: false,
                nulls_first: true,
            },
            SortOptions {
                descending: true,
                nulls_first: true,
            },
        ];
        let res = bisect::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 2);
        let res = bisect::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 3);
        let res = linear_search::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 2);
        let res = linear_search::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 3);
        Ok(())
    }
1021
    // Sanity-checks that Vec's lexicographic ordering (including over
    // ScalarValue with nulls) behaves as the search helpers assume.
    #[test]
    fn vector_ord() {
        assert!(vec![1, 0, 0, 0, 0, 0, 0, 1] < vec![1, 0, 0, 0, 0, 0, 0, 2]);
        assert!(vec![1, 0, 0, 0, 0, 0, 1, 1] > vec![1, 0, 0, 0, 0, 0, 0, 2]);
        assert!(
            vec![
                ScalarValue::Int32(Some(2)),
                Null,
                ScalarValue::Int32(Some(0)),
            ] < vec![
                ScalarValue::Int32(Some(2)),
                Null,
                ScalarValue::Int32(Some(1)),
            ]
        );
        assert!(
            vec![
                ScalarValue::Int32(Some(2)),
                ScalarValue::Int32(None),
                ScalarValue::Int32(Some(0)),
            ] < vec![
                ScalarValue::Int32(Some(2)),
                ScalarValue::Int32(None),
                ScalarValue::Int32(Some(1)),
            ]
        );
    }
1049
    // ScalarValue comparison between same-typed values follows numeric order.
    #[test]
    fn ord_same_type() {
        assert!((ScalarValue::Int32(Some(2)) < ScalarValue::Int32(Some(3))));
    }
1054
    // Exercises bisect/linear_search with differing sort directions,
    // single- and multi-column inputs, checking both sides of ties.
    #[test]
    fn test_bisect_linear_left_and_right_diff_sort() -> Result<()> {
        // Descending column, target at the very front: left insert point is 0.
        let arrays: Vec<ArrayRef> =
            vec![Arc::new(Float64Array::from(vec![4.0, 3.0, 2.0, 1.0, 0.0]))];
        let search_tuple: Vec<ScalarValue> = vec![ScalarValue::Float64(Some(4.0))];
        let ords = [SortOptions {
            descending: true,
            nulls_first: true,
        }];
        let res = bisect::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 0);
        let res = linear_search::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 0);

        // Same data, right side: insert point is just past the match.
        let arrays: Vec<ArrayRef> =
            vec![Arc::new(Float64Array::from(vec![4.0, 3.0, 2.0, 1.0, 0.0]))];
        let search_tuple: Vec<ScalarValue> = vec![ScalarValue::Float64(Some(4.0))];
        let ords = [SortOptions {
            descending: true,
            nulls_first: true,
        }];
        let res = bisect::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 1);
        let res = linear_search::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 1);

        // Ascending column, left side of a unique match.
        let arrays: Vec<ArrayRef> =
            vec![Arc::new(Float64Array::from(vec![5.0, 7.0, 8.0, 9., 10.]))];
        let search_tuple: Vec<ScalarValue> = vec![ScalarValue::Float64(Some(7.0))];
        let ords = [SortOptions {
            descending: false,
            nulls_first: true,
        }];
        let res = bisect::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 1);
        let res = linear_search::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 1);

        // Ascending column, right side of the same match.
        let arrays: Vec<ArrayRef> =
            vec![Arc::new(Float64Array::from(vec![5.0, 7.0, 8.0, 9., 10.]))];
        let search_tuple: Vec<ScalarValue> = vec![ScalarValue::Float64(Some(7.0))];
        let ords = [SortOptions {
            descending: false,
            nulls_first: true,
        }];
        let res = bisect::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 2);
        let res = linear_search::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 2);

        // Two columns with opposite directions; the second column breaks the
        // tie between the duplicate 8.0 values in the first.
        let arrays: Vec<ArrayRef> = vec![
            Arc::new(Float64Array::from(vec![5.0, 7.0, 8.0, 8.0, 9., 10.])),
            Arc::new(Float64Array::from(vec![10.0, 9.0, 8.0, 7.5, 7., 6.])),
        ];
        let search_tuple: Vec<ScalarValue> = vec![
            ScalarValue::Float64(Some(8.0)),
            ScalarValue::Float64(Some(8.0)),
        ];
        let ords = [
            SortOptions {
                descending: false,
                nulls_first: true,
            },
            SortOptions {
                descending: true,
                nulls_first: true,
            },
        ];
        let res = bisect::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 3);
        let res = linear_search::<false>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 3);

        let res = bisect::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 2);
        let res = linear_search::<true>(&arrays, &search_tuple, &ords)?;
        assert_eq!(res, 2);
        Ok(())
    }
1138
    // Partition ranges must split on every change across the combined
    // (column1 asc, column2 desc) key: 4 groups over 6 rows here.
    #[test]
    fn test_evaluate_partition_ranges() -> Result<()> {
        let arrays: Vec<ArrayRef> = vec![
            Arc::new(Float64Array::from(vec![1.0, 1.0, 1.0, 2.0, 2.0, 2.0])),
            Arc::new(Float64Array::from(vec![4.0, 4.0, 3.0, 2.0, 1.0, 1.0])),
        ];
        let n_row = arrays[0].len();
        let options: Vec<SortOptions> = vec![
            SortOptions {
                descending: false,
                nulls_first: false,
            },
            SortOptions {
                descending: true,
                nulls_first: false,
            },
        ];
        let sort_columns = arrays
            .into_iter()
            .zip(options)
            .map(|(values, options)| SortColumn {
                values,
                options: Some(options),
            })
            .collect::<Vec<_>>();
        let ranges = evaluate_partition_ranges(n_row, &sort_columns)?;
        assert_eq!(ranges.len(), 4);
        assert_eq!(ranges[0], Range { start: 0, end: 2 });
        assert_eq!(ranges[1], Range { start: 2, end: 3 });
        assert_eq!(ranges[2], Range { start: 3, end: 4 });
        assert_eq!(ranges[3], Range { start: 4, end: 6 });
        Ok(())
    }
1172
    // Round-trips quote_identifier with the sqlparser-based parse_identifiers:
    // a quoted output must parse back to the original value with quote_style set.
    #[cfg(feature = "sql")]
    #[test]
    fn test_quote_identifier() -> Result<()> {
        let cases = vec![
            ("foo", r#"foo"#),
            ("_foo", r#"_foo"#),
            ("foo_bar", r#"foo_bar"#),
            ("foo-bar", r#""foo-bar""#),
            ("foo.bar", r#""foo.bar""#),
            ("Foo", r#""Foo""#),
            ("Foo.Bar", r#""Foo.Bar""#),
            ("test1", r#"test1"#),
            ("1test", r#""1test""#),
        ];

        for (identifier, quoted_identifier) in cases {
            println!("input: \n{identifier}\nquoted_identifier:\n{quoted_identifier}");

            assert_eq!(quote_identifier(identifier), quoted_identifier);

            // Expected quote style is inferred from whether quoting occurred.
            let quote_style = if quoted_identifier.starts_with('"') {
                Some('"')
            } else {
                None
            };

            let expected_parsed = vec![Ident {
                value: identifier.to_string(),
                quote_style,
                span: sqlparser::tokenizer::Span::empty(),
            }];

            assert_eq!(
                parse_identifiers(quoted_identifier).unwrap(),
                expected_parsed
            );
        }

        Ok(())
    }
1217
    // In-range indices select in given order; any out-of-range index errors.
    #[test]
    fn test_get_at_indices() -> Result<()> {
        let in_vec = vec![1, 2, 3, 4, 5, 6, 7];
        assert_eq!(get_at_indices(&in_vec, [0, 2])?, vec![1, 3]);
        assert_eq!(get_at_indices(&in_vec, [4, 2])?, vec![5, 3]);
        assert!(get_at_indices(&in_vec, [7]).is_err());
        Ok(())
    }
1227
    // The prefix length counts leading elements equal to 0, 1, 2, ….
    #[test]
    fn test_longest_consecutive_prefix() {
        assert_eq!(longest_consecutive_prefix([0, 3, 4]), 1);
        assert_eq!(longest_consecutive_prefix([0, 1, 3, 4]), 2);
        assert_eq!(longest_consecutive_prefix([0, 1, 2, 3, 4]), 5);
        assert_eq!(longest_consecutive_prefix([1, 2, 3, 4]), 0);
    }
1235
    // Merged result is sorted and de-duplicated regardless of input order.
    #[test]
    fn test_merge_and_order_indices() {
        assert_eq!(
            merge_and_order_indices([0, 3, 4], [1, 3, 5]),
            vec![0, 1, 3, 4, 5]
        );
        assert_eq!(
            merge_and_order_indices([3, 0, 4], [5, 1, 3]),
            vec![0, 1, 3, 4, 5]
        );
    }
1248
    // Difference keeps first-collection order and drops any shared elements.
    #[test]
    fn test_set_difference() {
        assert_eq!(set_difference([0, 3, 4], [1, 2]), vec![0, 3, 4]);
        assert_eq!(set_difference([0, 3, 4], [1, 2, 4]), vec![0, 3]);
        assert_eq!(set_difference([3, 4, 0], [1, 2, 4]), vec![3, 0]);
        assert_eq!(set_difference([0, 3, 4], [4, 1, 2]), vec![0, 3]);
        assert_eq!(set_difference([3, 4, 0], [4, 1, 2]), vec![3, 0]);
    }
1258
    // Positions are returned in target order; any missing target errors.
    #[test]
    fn test_find_indices() -> Result<()> {
        assert_eq!(find_indices(&[0, 3, 4], [0, 3, 4])?, vec![0, 1, 2]);
        assert_eq!(find_indices(&[0, 3, 4], [0, 4, 3])?, vec![0, 2, 1]);
        assert_eq!(find_indices(&[3, 0, 4], [0, 3])?, vec![1, 0]);
        assert!(find_indices(&[0, 3], [0, 3, 4]).is_err());
        assert!(find_indices(&[0, 3, 4], [0, 2]).is_err());
        Ok(())
    }
1268
    // A 2x3 matrix transposes to 3x2.
    #[test]
    fn test_transpose() -> Result<()> {
        let in_data = vec![vec![1, 2, 3], vec![4, 5, 6]];
        let transposed = transpose(in_data);
        let expected = vec![vec![1, 4], vec![2, 5], vec![3, 6]];
        assert_eq!(expected, transposed);
        Ok(())
    }
1277}