simd_kernels/kernels/
string.rs

1// Copyright Peter Bower 2025. All Rights Reserved.
2// Licensed under Mozilla Public License (MPL) 2.0.
3
4//! # **String Operations Kernels Module** - *High-Performance String Processing and Text Analysis*
5//!
6//! String processing kernels for text manipulation, pattern matching,
7//! and string analysis operations with UTF-8 awareness and null-safe semantics. Essential infrastructure
8//! for text analytics, data cleansing, and string-heavy analytical workloads.
9//!
10//! ## Core Operations
11//! - **String transformations**: Case conversion, trimming, padding, and substring operations
12//! - **Pattern matching**: Regular expression support with compiled pattern caching  
13//! - **String comparison**: Lexicographic ordering with UTF-8 aware collation
14//! - **Text analysis**: Length calculation, character counting, and encoding detection
15//! - **String aggregation**: Concatenation with configurable delimiters and null handling
16//! - **Search operations**: Contains, starts with, ends with predicates with optimised implementations
17
18#[cfg(feature = "fast_hash")]
19use ahash::{AHashMap, AHashSet};
20#[cfg(not(feature = "fast_hash"))]
21use std::collections::{HashMap, HashSet};
22
23use minarrow::{
24    Bitmask, BooleanArray, CategoricalArray, Integer, IntegerArray, MaskedArray, StringArray,
25    Vec64,
26    aliases::{CategoricalAVT, StringAVT},
27};
28#[cfg(feature = "regex")]
29use regex::Regex;
30
31use crate::errors::KernelError;
32use crate::utils::confirm_mask_capacity;
33use std::marker::PhantomData;
34
35/// Helper for predicate kernels: produce optional input masks and a fresh output mask
36#[inline(always)]
37pub fn string_predicate_masks<'a>(
38    lhs_mask: Option<&'a Bitmask>,
39    rhs_mask: Option<&'a Bitmask>,
40    len: usize,
41) -> (Option<&'a Bitmask>, Option<&'a Bitmask>, Bitmask) {
42    let out = Bitmask::new_set_all(len, false);
43    (lhs_mask, rhs_mask, out)
44}
45
46// Concatenation
47
48/// Concatenates corresponding string pairs from two string arrays element-wise.
49///
50/// Performs element-wise concatenation of strings from two `StringArray` sources,
51/// producing a new string array where each result string is the concatenation of
52/// the corresponding left and right input strings.
53///
54/// # Parameters
55/// - `lhs`: Left-hand string array view tuple `(StringArray, offset, length)`
56/// - `rhs`: Right-hand string array view tuple `(StringArray, offset, length)`
57///
58/// # Returns
59/// A new `StringArray<T>` containing concatenated strings with proper null handling.
60///
61/// # Null Handling
62/// - If either input string is null, the result is null
63/// - Output null mask reflects the union of input null positions
64///
65/// # Performance
66/// - Pre-computes total memory requirements to minimise allocations
67/// - Uses unsafe unchecked access for validated indices
68/// - Optimised for bulk string concatenation operations
69pub fn concat_str_str<T: Integer>(lhs: StringAVT<T>, rhs: StringAVT<T>) -> StringArray<T> {
70    let (larr, loff, llen) = lhs;
71    let (rarr, roff, rlen) = rhs;
72    let len = llen.min(rlen);
73
74    let (lmask, rmask, mut out_mask) =
75        string_predicate_masks(larr.null_mask.as_ref(), rarr.null_mask.as_ref(), len);
76    let _ = confirm_mask_capacity(larr.len(), lmask);
77    let _ = confirm_mask_capacity(rarr.len(), rmask);
78
79    // Compute total byte size required
80    let mut total_bytes = 0;
81    for i in 0..len {
82        let valid = lmask.map_or(true, |m| unsafe { m.get_unchecked(loff + i) })
83            && rmask.map_or(true, |m| unsafe { m.get_unchecked(roff + i) });
84        if valid {
85            let l = unsafe { larr.get_str_unchecked(loff + i) };
86            let r = unsafe { rarr.get_str_unchecked(roff + i) };
87            total_bytes += l.len() + r.len();
88        }
89    }
90
91    // Allocate offsets and data buffers
92    let mut offsets = Vec64::<T>::with_capacity(len + 1);
93    unsafe {
94        offsets.set_len(len + 1);
95    }
96    let mut values = Vec64::<u8>::with_capacity(total_bytes);
97
98    // Fill values and offsets
99    offsets[0] = T::zero();
100    let mut cur = 0;
101
102    for i in 0..len {
103        let valid = lmask.map_or(true, |m| unsafe { m.get_unchecked(loff + i) })
104            && rmask.map_or(true, |m| unsafe { m.get_unchecked(roff + i) });
105
106        if valid {
107            let l = unsafe { larr.get_str_unchecked(loff + i).as_bytes() };
108            let r = unsafe { rarr.get_str_unchecked(roff + i).as_bytes() };
109
110            values.extend_from_slice(l);
111            values.extend_from_slice(r);
112            cur += l.len() + r.len();
113
114            unsafe {
115                out_mask.set_unchecked(i, true);
116            }
117        } else {
118            unsafe {
119                out_mask.set_unchecked(i, false);
120            }
121        }
122
123        offsets[i + 1] = T::from_usize(cur);
124    }
125
126    StringArray {
127        offsets: offsets.into(),
128        data: values.into(),
129        null_mask: Some(out_mask),
130    }
131}
132
133/// Concatenates corresponding string pairs from two categorical arrays element-wise.
134///
135/// Performs element-wise concatenation by looking up dictionary values from both
136/// categorical arrays and concatenating the resolved strings. Creates a new string
137/// array with the concatenated results.
138///
139/// # Parameters
140/// - `lhs`: Left-hand categorical array view tuple `(CategoricalArray, offset, length)`
141/// - `rhs`: Right-hand categorical array view tuple `(CategoricalArray, offset, length)`
142///
143/// # Returns
144/// A new `StringArray<T>` containing concatenated dictionary strings.
145///
146/// # Null Handling
147/// - If either categorical value is null, the result is null
148/// - Output null mask reflects the union of input null positions
149///
150/// # Implementation
151/// - Dictionary lookups resolve categorical codes to actual strings
152/// - Memory allocation optimised based on total concatenated length
153pub fn concat_dict_dict<T: Integer>(
154    lhs: CategoricalAVT<T>,
155    rhs: CategoricalAVT<T>,
156) -> Result<CategoricalArray<T>, KernelError> {
157    let (larr, loff, llen) = lhs;
158    let (rarr, roff, rlen) = rhs;
159    let len = llen.min(rlen);
160
161    let (lmask, rmask, mut out_mask) =
162        string_predicate_masks(larr.null_mask.as_ref(), rarr.null_mask.as_ref(), len);
163    let _ = confirm_mask_capacity(larr.data.len(), lmask)?;
164    let _ = confirm_mask_capacity(rarr.data.len(), rmask)?;
165
166    // Use max possible unique count for preallocation. Worst case is all unique.
167    let mut data = Vec64::<T>::with_capacity(len);
168    unsafe {
169        data.set_len(len);
170    }
171
172    let mut unique_values = Vec64::<String>::with_capacity(len);
173    #[cfg(feature = "fast_hash")]
174    let mut seen: AHashMap<String, T> = AHashMap::with_capacity(len);
175    #[cfg(not(feature = "fast_hash"))]
176    let mut seen: HashMap<String, T> = HashMap::with_capacity(len);
177    let mut unique_idx = 0;
178
179    for i in 0..len {
180        let valid = lmask.map_or(true, |m| unsafe { m.get_unchecked(loff + i) })
181            && rmask.map_or(true, |m| unsafe { m.get_unchecked(roff + i) });
182
183        if valid {
184            let l = unsafe { larr.get_str_unchecked(loff + i) };
185            let r = unsafe { rarr.get_str_unchecked(roff + i) };
186            let cat = format!("{l}{r}");
187
188            let idx = match seen.get(&cat) {
189                Some(ix) => *ix,
190                None => {
191                    let ix = T::from_usize(unique_idx);
192                    unique_values.push(cat.clone());
193                    seen.insert(cat, ix);
194                    unique_idx += 1;
195                    ix
196                }
197            };
198
199            unsafe {
200                *data.get_unchecked_mut(i) = idx;
201                out_mask.set_unchecked(i, true);
202            }
203        } else {
204            unsafe {
205                *data.get_unchecked_mut(i) = T::zero();
206                out_mask.set_unchecked(i, false);
207            }
208        }
209    }
210
211    unsafe {
212        unique_values.set_len(unique_idx);
213    }
214
215    Ok(CategoricalArray {
216        data: data.into(),
217        unique_values,
218        null_mask: Some(out_mask),
219    })
220}
221
222/// Concatenates strings from a string array with dictionary values from a categorical array.
223///
224/// Performs element-wise concatenation where left operands come from a string array
225/// and right operands are resolved from a categorical array's dictionary.
226///
227/// # Parameters
228/// - `lhs`: String array view tuple `(StringArray, offset, length)`
229/// - `rhs`: Categorical array view tuple `(CategoricalArray, offset, length)`
230///
231/// # Type Parameters
232/// - `T`: Integer type for string array offsets
233/// - `U`: Integer type for categorical array indices
234///
235/// # Returns
236/// A new `StringArray<T>` containing concatenated string-dictionary pairs.
237///
238/// # Null Handling
239/// Results are null if either input value is null at the corresponding position.
240pub fn concat_str_dict<T: Integer, U: Integer>(
241    lhs: StringAVT<T>,
242    rhs: CategoricalAVT<U>,
243) -> Result<StringArray<T>, KernelError> {
244    let (larr, loff, llen) = lhs;
245    let (rarr, roff, rlen) = rhs;
246    let len = llen.min(rlen);
247
248    let (lmask, rmask, mut out_mask) =
249        string_predicate_masks(larr.null_mask.as_ref(), rarr.null_mask.as_ref(), len);
250    let _ = confirm_mask_capacity(larr.len(), lmask)?;
251    let _ = confirm_mask_capacity(rarr.data.len(), rmask)?;
252
253    // Compute total byte size required
254    let mut total_bytes = 0;
255    for i in 0..len {
256        let valid = lmask.map_or(true, |m| unsafe { m.get_unchecked(loff + i) })
257            && rmask.map_or(true, |m| unsafe { m.get_unchecked(roff + i) });
258        if valid {
259            let a = unsafe { larr.get_str_unchecked(loff + i) };
260            let b = unsafe { rarr.get_str_unchecked(roff + i) };
261            total_bytes += a.len() + b.len();
262        }
263    }
264
265    // Preallocate offsets and values
266    let mut offsets = Vec64::<T>::with_capacity(len + 1);
267    unsafe {
268        offsets.set_len(len + 1);
269    }
270    let mut values = Vec64::<u8>::with_capacity(total_bytes);
271
272    // Fill values and offsets
273    offsets[0] = T::zero();
274    let mut cur = 0;
275
276    for i in 0..len {
277        let valid = lmask.map_or(true, |m| unsafe { m.get_unchecked(loff + i) })
278            && rmask.map_or(true, |m| unsafe { m.get_unchecked(roff + i) });
279
280        if valid {
281            let a = unsafe { larr.get_str_unchecked(loff + i).as_bytes() };
282            let b = unsafe { rarr.get_str_unchecked(roff + i).as_bytes() };
283
284            values.extend_from_slice(a);
285            values.extend_from_slice(b);
286            cur += a.len() + b.len();
287
288            unsafe {
289                out_mask.set_unchecked(i, true);
290            }
291        } else {
292            unsafe {
293                out_mask.set_unchecked(i, false);
294            }
295        }
296
297        offsets[i + 1] = T::from_usize(cur);
298    }
299
300    Ok(StringArray {
301        offsets: offsets.into(),
302        data: values.into(),
303        null_mask: Some(out_mask),
304    })
305}
306
307/// Concatenates dictionary values from a categorical array with strings from a string array.
308///
309/// Performs element-wise concatenation where left operands are resolved from a
310/// categorical array's dictionary and right operands come from a string array.
311///
312/// # Parameters
313/// - `lhs`: Categorical array view tuple `(CategoricalArray, offset, length)`
314/// - `rhs`: String array view tuple `(StringArray, offset, length)`
315///
316/// # Type Parameters
317/// - `T`: Integer type for string array offsets
318/// - `U`: Integer type for categorical array indices
319///
320/// # Returns
321/// A new `StringArray<T>` containing concatenated dictionary-string pairs.
322///
323/// # Null Handling
324/// Results are null if either input value is null at the corresponding position.
325pub fn concat_dict_str<T: Integer, U: Integer>(
326    lhs: CategoricalAVT<U>,
327    rhs: StringAVT<T>,
328) -> Result<StringArray<T>, KernelError> {
329    concat_str_dict(rhs, lhs)
330}
331
// Row loop shared by the predicate kernels that carry nulls through to the output.
//
// Arguments:
// - `$len`: number of rows to evaluate
// - `$lmask` / `$rmask`: optional null masks of the two operands
// - `$out_mask`: output validity mask, written for every row
// - `$lhs` / `$rhs`: `(array, offset)` pairs; `.0` must offer `get_str_unchecked`
// - `$method`: `str` method invoked as `haystack.$method(pattern)`
//
// Evaluates to a `Bitmask` of match results. A row is a match only when both
// operands are valid, the pattern is non-empty, and the method returns true.
// Expands a `?`, so the enclosing function must return a compatible `Result`.
macro_rules! binary_str_pred_loop {
    ($len:expr, $lmask:expr, $rmask:expr, $out_mask:expr, $lhs:expr, $rhs:expr, $method:ident) => {{
        let mut hits = Bitmask::new_set_all($len, false);
        let hay_base = $lhs.1;
        let pat_base = $rhs.1;
        // Both masks have to reach the end of the window (offset + len).
        let _ = confirm_mask_capacity(hay_base + $len, $lmask)?;
        let _ = confirm_mask_capacity(pat_base + $len, $rmask)?;
        for row in 0..$len {
            let hay_idx = hay_base + row;
            let pat_idx = pat_base + row;
            let both_valid = $lmask.map_or(true, |m| unsafe { m.get_unchecked(hay_idx) })
                && $rmask.map_or(true, |m| unsafe { m.get_unchecked(pat_idx) });
            let is_hit = if both_valid {
                let hay = unsafe { $lhs.0.get_str_unchecked(hay_idx) };
                let needle = unsafe { $rhs.0.get_str_unchecked(pat_idx) };
                // An empty pattern is never reported as a match.
                !needle.is_empty() && hay.$method(needle)
            } else {
                false
            };
            unsafe {
                hits.set_unchecked(row, is_hit);
                $out_mask.set_unchecked(row, both_valid);
            }
        }
        hits
    }};
}
358
359// STRING PREDICATES - contains/starts_with/ends_with
360
/// Generates a string-vs-string predicate kernel named `$fn_name` that applies
/// the `str` method `$method` row by row.
macro_rules! str_predicate {
    ($fn_name:ident, $method:ident) => {
        /// Evaluates a string predicate between two string arrays, row by row.
        ///
        /// For each row the left value is the haystack and the right value is
        /// the pattern. The row is `true` only when both values are non-null,
        /// the pattern is non-empty, and the predicate holds.
        ///
        /// # Type Parameters
        ///
        /// * `T` - Integer type for left string array offsets
        /// * `U` - Integer type for right string array offsets
        ///
        /// # Arguments
        ///
        /// * `lhs` - Left string array (data, offset, length)
        /// * `rhs` - Right string array (data, offset, length)
        ///
        /// # Returns
        ///
        /// Boolean array of `min(lhs length, rhs length)` rows with no null
        /// mask; rows where either input is null are reported as `false`.
        pub fn $fn_name<T: Integer, U: Integer>(
            lhs: StringAVT<T>,
            rhs: StringAVT<U>,
        ) -> BooleanArray<()> {
            let (hay_arr, hay_off, hay_len) = lhs;
            let (pat_arr, pat_off, pat_len) = rhs;
            let len = hay_len.min(pat_len);
            let hay_mask = hay_arr.null_mask.as_ref();
            let pat_mask = pat_arr.null_mask.as_ref();
            let mut hits = Bitmask::new_set_all(len, false);

            for row in 0..len {
                let hay_idx = hay_off + row;
                let pat_idx = pat_off + row;

                // Mask reads skip bounds checks.
                let hay_valid = hay_mask.map_or(true, |m| unsafe { m.get_unchecked(hay_idx) });
                let pat_valid = pat_mask.map_or(true, |m| unsafe { m.get_unchecked(pat_idx) });

                if hay_valid && pat_valid {
                    // Locate each value's byte range from its offsets.
                    let hay_start = hay_arr.offsets[hay_idx].to_usize();
                    let hay_end = hay_arr.offsets[hay_idx + 1].to_usize();
                    let pat_start = pat_arr.offsets[pat_idx].to_usize();
                    let pat_end = pat_arr.offsets[pat_idx + 1].to_usize();
                    let hay = unsafe {
                        std::str::from_utf8_unchecked(&hay_arr.data[hay_start..hay_end])
                    };
                    let needle = unsafe {
                        std::str::from_utf8_unchecked(&pat_arr.data[pat_start..pat_end])
                    };
                    // An empty pattern is never reported as a match.
                    if !needle.is_empty() && hay.$method(needle) {
                        unsafe { hits.set_unchecked(row, true) };
                    }
                }
                // Otherwise the bit keeps its initial `false`.
            }

            // Nulls have been folded into `false`, so no null mask is attached.
            BooleanArray {
                data: hits.into(),
                null_mask: None,
                len,
                _phantom: PhantomData,
            }
        }
    };
}
426
/// Generates a string-vs-categorical predicate kernel named `$fn_name` that
/// applies the `str` method `$method` row by row.
macro_rules! str_cat_predicate {
    ($fn_name:ident, $method:ident) => {
        /// Evaluates a string predicate with a string array as haystack and a
        /// categorical array as pattern source.
        ///
        /// A row is `true` only when both values are non-null, the pattern is
        /// non-empty, and the predicate holds.
        ///
        /// # Type Parameters
        ///
        /// * `T` - Integer type for string array offsets
        /// * `U` - Integer type for categorical array indices
        ///
        /// # Arguments
        ///
        /// * `lhs` - String array (data, offset, length)
        /// * `rhs` - Categorical array (data, offset, length)
        ///
        /// # Returns
        ///
        /// Boolean array of `min(lhs length, rhs length)` rows whose null mask
        /// marks a row valid only when both inputs are valid, or the error
        /// raised by the mask capacity check.
        pub fn $fn_name<T: Integer, U: Integer>(
            lhs: StringAVT<T>,
            rhs: CategoricalAVT<U>,
        ) -> Result<BooleanArray<()>, KernelError> {
            let (hay_arr, hay_off, hay_len) = lhs;
            let (pat_arr, pat_off, pat_len) = rhs;
            let len = hay_len.min(pat_len);

            let hay_mask = hay_arr.null_mask.as_ref();
            let pat_mask = pat_arr.null_mask.as_ref();
            let mut validity = Bitmask::new_set_all(len, false);

            let hits = binary_str_pred_loop!(
                len,
                hay_mask,
                pat_mask,
                validity,
                (hay_arr, hay_off),
                (pat_arr, pat_off),
                $method
            );

            Ok(BooleanArray {
                data: hits.into(),
                null_mask: Some(validity),
                len,
                _phantom: PhantomData,
            })
        }
    };
}
478
/// Generates a categorical-vs-categorical predicate kernel named `$fn_name`
/// that applies the `str` method `$method` row by row.
macro_rules! cat_cat_predicate {
    ($fn_name:ident, $method:ident) => {
        /// Evaluates a string predicate between two categorical arrays.
        ///
        /// The left value is the haystack and the right value the pattern. A
        /// row is `true` only when both values are non-null, the pattern is
        /// non-empty, and the predicate holds.
        ///
        /// # Type Parameters
        ///
        /// * `T` - Integer type for categorical array indices
        ///
        /// # Arguments
        ///
        /// * `lhs` - Left categorical array (data, offset, length)
        /// * `rhs` - Right categorical array (data, offset, length)
        ///
        /// # Returns
        ///
        /// Boolean array of `min(lhs length, rhs length)` rows whose null mask
        /// marks a row valid only when both inputs are valid, or the error
        /// raised by the mask capacity check.
        pub fn $fn_name<T: Integer>(
            lhs: CategoricalAVT<T>,
            rhs: CategoricalAVT<T>,
        ) -> Result<BooleanArray<()>, KernelError> {
            let (hay_arr, hay_off, hay_len) = lhs;
            let (pat_arr, pat_off, pat_len) = rhs;
            let len = hay_len.min(pat_len);

            let hay_mask = hay_arr.null_mask.as_ref();
            let pat_mask = pat_arr.null_mask.as_ref();
            let mut validity = Bitmask::new_set_all(len, false);

            let hits = binary_str_pred_loop!(
                len,
                hay_mask,
                pat_mask,
                validity,
                (hay_arr, hay_off),
                (pat_arr, pat_off),
                $method
            );

            Ok(BooleanArray {
                data: hits.into(),
                null_mask: Some(validity),
                len,
                _phantom: PhantomData,
            })
        }
    };
}
529
/// Generates a categorical-vs-string predicate kernel named `$fn_name` that
/// applies the `str` method `$method` row by row.
macro_rules! dict_str_predicate {
    ($fn_name:ident, $method:ident) => {
        /// Evaluates a string predicate with a categorical array as haystack
        /// and a string array as pattern source.
        ///
        /// A row is `true` only when both values are non-null, the pattern is
        /// non-empty, and the predicate holds.
        ///
        /// # Type Parameters
        ///
        /// * `T` - Integer type for categorical array indices
        /// * `U` - Integer type for string array offsets
        ///
        /// # Arguments
        ///
        /// * `lhs` - Categorical array (data, offset, length)
        /// * `rhs` - String array (data, offset, length)
        ///
        /// # Returns
        ///
        /// Boolean array of `min(lhs length, rhs length)` rows whose null mask
        /// marks a row valid only when both inputs are valid, or the error
        /// raised by the mask capacity check.
        pub fn $fn_name<T: Integer, U: Integer>(
            lhs: CategoricalAVT<T>,
            rhs: StringAVT<U>,
        ) -> Result<BooleanArray<()>, KernelError> {
            let (hay_arr, hay_off, hay_len) = lhs;
            let (pat_arr, pat_off, pat_len) = rhs;
            let len = hay_len.min(pat_len);

            let hay_mask = hay_arr.null_mask.as_ref();
            let pat_mask = pat_arr.null_mask.as_ref();
            let mut validity = Bitmask::new_set_all(len, false);
            let _ = confirm_mask_capacity(hay_arr.data.len(), hay_mask)?;
            let _ = confirm_mask_capacity(pat_arr.len(), pat_mask)?;

            let mut hits = Bitmask::new_set_all(len, false);
            for row in 0..len {
                let hay_idx = hay_off + row;
                let pat_idx = pat_off + row;
                let both_valid = hay_mask.map_or(true, |m| unsafe { m.get_unchecked(hay_idx) })
                    && pat_mask.map_or(true, |m| unsafe { m.get_unchecked(pat_idx) });
                let is_hit = if both_valid {
                    let hay = unsafe { hay_arr.get_str_unchecked(hay_idx) };
                    let needle = unsafe { pat_arr.get_str_unchecked(pat_idx) };
                    // An empty pattern is never reported as a match.
                    !needle.is_empty() && hay.$method(needle)
                } else {
                    false
                };
                unsafe {
                    hits.set_unchecked(row, is_hit);
                    validity.set_unchecked(row, both_valid);
                }
            }

            Ok(BooleanArray {
                data: hits.into(),
                null_mask: Some(validity),
                len,
                _phantom: PhantomData,
            })
        }
    };
}
586
587str_predicate!(contains_str_str, contains);
588str_predicate!(starts_with_str_str, starts_with);
589str_predicate!(ends_with_str_str, ends_with);
590str_cat_predicate!(contains_str_dict, contains);
591cat_cat_predicate!(contains_dict_dict, contains);
592str_cat_predicate!(starts_with_str_dict, starts_with);
593cat_cat_predicate!(starts_with_dict_dict, starts_with);
594str_cat_predicate!(ends_with_str_dict, ends_with);
595cat_cat_predicate!(ends_with_dict_dict, ends_with);
596dict_str_predicate!(contains_dict_str, contains);
597dict_str_predicate!(starts_with_dict_str, starts_with);
598dict_str_predicate!(ends_with_dict_str, ends_with);
599
600// Regex match
601
// Row loop shared by the regex kernels.
//
// Arguments:
// - `$len`: number of rows to evaluate
// - `$lmask` / `$rmask`: optional null masks of the subject and pattern operands
// - `$out_mask`: output validity mask, written for every row
// - `$lhs_arr`, `$lhs_off`: subject array and window offset
// - `$rhs_arr`, `$rhs_off`: pattern array and window offset
//
// Evaluates to a `Bitmask` of match results. A row matches only when both
// operands are valid, the pattern is non-empty, and the compiled pattern
// matches the subject. Each distinct pattern text is compiled once per
// expansion and reused for later rows carrying the same text.
//
// Expands `?` and an early `return Err(..)`, so the enclosing function must
// return `Result<_, KernelError>`. The first invalid pattern met on a valid
// row aborts with `KernelError::InvalidArguments`.
#[cfg(feature = "regex")]
macro_rules! regex_match_loop {
    ($len:expr, $lmask:expr, $rmask:expr, $out_mask:expr, $lhs_arr:expr, $lhs_off:expr, $rhs_arr:expr, $rhs_off:expr) => {{
        let mut data = Bitmask::new_set_all($len, false);
        let _ = confirm_mask_capacity($len + $lhs_off, $lmask)?;
        let _ = confirm_mask_capacity($len + $rhs_off, $rmask)?;
        // Compiled patterns keyed by their source text, so repeated patterns
        // are not recompiled on every row.
        let mut compiled: ::std::collections::HashMap<&str, Regex> =
            ::std::collections::HashMap::new();
        for i in 0..$len {
            let valid = $lmask.map_or(true, |m| unsafe { m.get_unchecked($lhs_off + i) })
                && $rmask.map_or(true, |m| unsafe { m.get_unchecked($rhs_off + i) });
            let matched = if valid {
                let s = unsafe { $lhs_arr.get_str_unchecked($lhs_off + i) };
                let pat: &str = unsafe { $rhs_arr.get_str_unchecked($rhs_off + i) };
                if pat.is_empty() {
                    // An empty pattern is never reported as a match.
                    false
                } else {
                    let re = match compiled.entry(pat) {
                        ::std::collections::hash_map::Entry::Occupied(slot) => slot.into_mut(),
                        ::std::collections::hash_map::Entry::Vacant(slot) => {
                            match Regex::new(pat) {
                                Ok(re) => slot.insert(re),
                                Err(_) => {
                                    return Err(KernelError::InvalidArguments(
                                        "Invalid regex string".to_string(),
                                    ));
                                }
                            }
                        }
                    };
                    re.is_match(s)
                }
            } else {
                false
            };
            unsafe { data.set_unchecked(i, matched) };
            unsafe { $out_mask.set_unchecked(i, valid) };
        }
        data
    }};
}
635
/// Matches each string in `lhs` against the regex pattern held in the same row of `rhs`.
///
/// The row-wise work is delegated to the shared `regex_match_loop!` macro. The
/// result covers `min(lhs length, rhs length)` rows.
///
/// # Parameters
/// - `lhs`: Source string array view tuple `(StringArray, offset, length)`
/// - `rhs`: Pattern string array view tuple `(StringArray, offset, length)`
///
/// # Type Parameters
/// - `T`: Integer type for left array offsets
/// - `U`: Integer type for right array offsets
///
/// # Returns
/// `Result<BooleanArray<()>, KernelError>` where `true` indicates a pattern
/// match. The attached null mask marks a row valid only when both inputs are
/// valid; an empty pattern yields `false`.
///
/// # Errors
/// Returns `KernelError` when a pattern on a valid row is not a valid regular
/// expression, or when a null mask fails the capacity check.
///
/// # Feature Gate
/// Requires the `regex` feature to be enabled.
#[cfg(feature = "regex")]
pub fn regex_str_str<'a, T: Integer, U: Integer>(
    lhs: StringAVT<'a, T>,
    rhs: StringAVT<'a, U>,
) -> Result<BooleanArray<()>, KernelError> {
    let (subject_arr, subject_off, subject_len) = lhs;
    let (pattern_arr, pattern_off, pattern_len) = rhs;
    let len = subject_len.min(pattern_len);

    let subject_mask = subject_arr.null_mask.as_ref();
    let pattern_mask = pattern_arr.null_mask.as_ref();
    let mut validity = Bitmask::new_set_all(len, false);

    let hits = regex_match_loop!(
        len,
        subject_mask,
        pattern_mask,
        validity,
        subject_arr,
        subject_off,
        pattern_arr,
        pattern_off
    );

    Ok(BooleanArray {
        data: hits.into(),
        null_mask: Some(validity),
        len,
        _phantom: PhantomData,
    })
}
680
/// Matches each categorical value in `lhs` against the regex pattern held in
/// the same row of the string array `rhs`.
///
/// The row-wise work is delegated to the shared `regex_match_loop!` macro. The
/// result covers `min(lhs length, rhs length)` rows.
///
/// # Parameters
/// - `lhs`: Categorical array view tuple `(CategoricalArray, offset, length)`
/// - `rhs`: Pattern string array view tuple `(StringArray, offset, length)`
///
/// # Type Parameters
/// - `U`: Integer type for categorical array indices
/// - `T`: Integer type for pattern array offsets
///
/// # Returns
/// `Result<BooleanArray<()>, KernelError>` where `true` indicates a pattern
/// match. The attached null mask marks a row valid only when both inputs are
/// valid; an empty pattern yields `false`.
///
/// # Errors
/// Returns `KernelError` when a pattern on a valid row is not a valid regular
/// expression, or when a null mask fails the capacity check.
///
/// # Feature Gate
/// Requires the `regex` feature to be enabled.
#[cfg(feature = "regex")]
pub fn regex_dict_str<'a, U: Integer, T: Integer>(
    lhs: CategoricalAVT<'a, U>,
    rhs: StringAVT<'a, T>,
) -> Result<BooleanArray<()>, KernelError> {
    let (subject_arr, subject_off, subject_len) = lhs;
    let (pattern_arr, pattern_off, pattern_len) = rhs;
    let len = subject_len.min(pattern_len);

    let subject_mask = subject_arr.null_mask.as_ref();
    let pattern_mask = pattern_arr.null_mask.as_ref();
    let mut validity = Bitmask::new_set_all(len, false);

    let hits = regex_match_loop!(
        len,
        subject_mask,
        pattern_mask,
        validity,
        subject_arr,
        subject_off,
        pattern_arr,
        pattern_off
    );

    Ok(BooleanArray {
        data: hits.into(),
        null_mask: Some(validity),
        len,
        _phantom: PhantomData,
    })
}
721
/// Matches each string in `lhs` against the regex pattern resolved from the
/// same row of the categorical array `rhs`.
///
/// The row-wise work is delegated to the shared `regex_match_loop!` macro. The
/// result covers `min(lhs length, rhs length)` rows.
///
/// # Parameters
/// - `lhs`: Source string array view tuple `(StringArray, offset, length)`
/// - `rhs`: Pattern categorical array view tuple `(CategoricalArray, offset, length)`
///
/// # Type Parameters
/// - `T`: Integer type for string array offsets
/// - `U`: Integer type for categorical array indices
///
/// # Returns
/// `Result<BooleanArray<()>, KernelError>` where `true` indicates a pattern
/// match. The attached null mask marks a row valid only when both inputs are
/// valid; an empty pattern yields `false`.
///
/// # Errors
/// Returns `KernelError` when a pattern on a valid row is not a valid regular
/// expression, or when a null mask fails the capacity check.
///
/// # Feature Gate
/// Requires the `regex` feature to be enabled.
#[cfg(feature = "regex")]
pub fn regex_str_dict<'a, T: Integer, U: Integer>(
    lhs: StringAVT<'a, T>,
    rhs: CategoricalAVT<'a, U>,
) -> Result<BooleanArray<()>, KernelError> {
    let (subject_arr, subject_off, subject_len) = lhs;
    let (pattern_arr, pattern_off, pattern_len) = rhs;
    let len = subject_len.min(pattern_len);

    let subject_mask = subject_arr.null_mask.as_ref();
    let pattern_mask = pattern_arr.null_mask.as_ref();
    let mut validity = Bitmask::new_set_all(len, false);

    let hits = regex_match_loop!(
        len,
        subject_mask,
        pattern_mask,
        validity,
        subject_arr,
        subject_off,
        pattern_arr,
        pattern_off
    );

    Ok(BooleanArray {
        data: hits.into(),
        null_mask: Some(validity),
        len,
        _phantom: PhantomData,
    })
}
762
/// Matches each categorical value in `lhs` against the regex pattern resolved
/// from the same row of the categorical array `rhs`.
///
/// The row-wise work is delegated to the shared `regex_match_loop!` macro. The
/// result covers `min(lhs length, rhs length)` rows.
///
/// # Parameters
/// - `lhs`: Source categorical array view tuple `(CategoricalArray, offset, length)`
/// - `rhs`: Pattern categorical array view tuple `(CategoricalArray, offset, length)`
///
/// # Type Parameters
/// - `T`: Integer type for categorical array indices
///
/// # Returns
/// `Result<BooleanArray<()>, KernelError>` where `true` indicates a pattern
/// match. The attached null mask marks a row valid only when both inputs are
/// valid; an empty pattern yields `false`.
///
/// # Errors
/// Returns `KernelError` when a pattern on a valid row is not a valid regular
/// expression, or when a null mask fails the capacity check.
///
/// # Feature Gate
/// Requires the `regex` feature to be enabled.
#[cfg(feature = "regex")]
pub fn regex_dict_dict<'a, T: Integer>(
    lhs: CategoricalAVT<'a, T>,
    rhs: CategoricalAVT<'a, T>,
) -> Result<BooleanArray<()>, KernelError> {
    let (subject_arr, subject_off, subject_len) = lhs;
    let (pattern_arr, pattern_off, pattern_len) = rhs;
    let len = subject_len.min(pattern_len);

    let subject_mask = subject_arr.null_mask.as_ref();
    let pattern_mask = pattern_arr.null_mask.as_ref();
    let mut validity = Bitmask::new_set_all(len, false);

    let hits = regex_match_loop!(
        len,
        subject_mask,
        pattern_mask,
        validity,
        subject_arr,
        subject_off,
        pattern_arr,
        pattern_off
    );

    Ok(BooleanArray {
        data: hits.into(),
        null_mask: Some(validity),
        len,
        _phantom: PhantomData,
    })
}
806
807/// Computes the character length of each string in a `StringArray<T>` slice,
808/// returning an `IntegerArray<T>` with the same length and null semantics.
809///
810/// This applies to a windowed slice (`offset`, `len`) of the input array.
811/// The output null mask mirrors the sliced portion of the input mask.
812///
813/// # Parameters
814/// - `input`: A `(StringArray<T>, offset, len)` tuple defining the slice to operate on.
815///
816/// # Returns
817/// An `IntegerArray<T>` of the same length, with each element representing the character count
818/// of the corresponding (non-null) string value.
819pub fn len_str<'a, T: Integer + Copy>(
820    input: StringAVT<'a, T>,
821) -> Result<IntegerArray<T>, KernelError> {
822    let (array, offset, len) = input;
823    debug_assert!(offset + len <= array.offsets.len() - 1);
824
825    let mask_opt = array.null_mask.as_ref().map(|orig| {
826        let mut m = Bitmask::new_set_all(len, true);
827        for i in 0..len {
828            unsafe {
829                m.set_unchecked(i, orig.get_unchecked(offset + i));
830            }
831        }
832        m
833    });
834
835    let mut data = Vec64::<T>::with_capacity(len);
836    unsafe { data.set_len(len) };
837    for i in 0..len {
838        let valid = mask_opt
839            .as_ref()
840            .map_or(true, |m| unsafe { m.get_unchecked(i) });
841        if valid {
842            let start = array.offsets[offset + i].to_usize();
843            let end = array.offsets[offset + i + 1].to_usize();
844            let s = unsafe { std::str::from_utf8_unchecked(&array.data[start..end]) };
845            data[i] = T::from(s.chars().count()).unwrap();
846        } else {
847            data[i] = T::zero();
848        }
849    }
850
851    Ok(IntegerArray {
852        data: data.into(),
853        null_mask: mask_opt,
854    })
855}
856
857/// Computes the character length of each string in a `CategoricalArray<T>` slice,
858/// returning an `IntegerArray<T>` with the same length and null semantics.
859///
860/// This applies to a windowed slice (`offset`, `len`) of the input categorical array,
861/// using dictionary lookup to resolve each string.
862///
863/// # Parameters
864/// - `input`: A `(CategoricalArray<T>, offset, len)` tuple defining the slice to operate on.
865///
866/// # Returns
867/// An `IntegerArray<T>` of the same length, with each element representing the character count
868/// of the corresponding (non-null) resolved string.
869pub fn len_dict<'a, T: Integer>(
870    input: CategoricalAVT<'a, T>,
871) -> Result<IntegerArray<T>, KernelError> {
872    let (array, offset, len) = input;
873    debug_assert!(offset + len <= array.data.len());
874
875    let mask_opt = array.null_mask.as_ref().map(|orig| {
876        let mut m = Bitmask::new_set_all(len, true);
877        for i in 0..len {
878            unsafe {
879                m.set_unchecked(i, orig.get_unchecked(offset + i));
880            }
881        }
882        m
883    });
884
885    let mut data = Vec64::<T>::with_capacity(len);
886    unsafe { data.set_len(len) };
887    for i in 0..len {
888        let valid = mask_opt
889            .as_ref()
890            .map_or(true, |m| unsafe { m.get_unchecked(i) });
891        data[i] = if valid {
892            T::from(
893                unsafe { array.get_str_unchecked(offset + i) }
894                    .chars()
895                    .count(),
896            )
897            .unwrap()
898        } else {
899            T::zero()
900        };
901    }
902
903    Ok(IntegerArray {
904        data: data.into(),
905        null_mask: mask_opt,
906    })
907}
908
909/// Finds the lexicographically minimum string in a string array window.
910///
911/// Scans through a windowed portion of a string array to determine the minimum
912/// string value according to lexicographic ordering, ignoring null values.
913///
914/// # Parameters
915/// - `window`: String array view tuple `(StringArray, offset, length)` defining the scan window
916///
917/// # Returns
918/// `Option<String>` containing the minimum string, or `None` if all values are null.
919#[inline]
920pub fn min_string_array<T: Integer>(window: StringAVT<T>) -> Option<String> {
921    let (arr, offset, len) = window;
922    let mut min_str: Option<&str> = None;
923    for i in offset..offset + len {
924        if arr
925            .null_mask
926            .as_ref()
927            .map_or(true, |b| unsafe { b.get_unchecked(i) })
928        {
929            let s = unsafe { arr.get_str_unchecked(i) };
930            if min_str.map_or(true, |min| s < min) {
931                min_str = Some(s);
932            }
933        }
934    }
935    min_str.map(str::to_owned)
936}
937
938/// Finds the lexicographically maximum string in a string array window.
939///
940/// Scans through a windowed portion of a string array to determine the maximum
941/// string value according to lexicographic ordering, ignoring null values.
942///
943/// # Parameters
944/// - `window`: String array view tuple `(StringArray, offset, length)` defining the scan window
945///
946/// # Returns
947/// `Option<String>` containing the maximum string, or `None` if all values are null.
948#[inline]
949pub fn max_string_array<T: Integer>(window: StringAVT<T>) -> Option<String> {
950    let (arr, offset, len) = window;
951    let mut max_str: Option<&str> = None;
952    for i in offset..offset + len {
953        if arr
954            .null_mask
955            .as_ref()
956            .map_or(true, |b| unsafe { b.get_unchecked(i) })
957        {
958            let s = unsafe { arr.get_str_unchecked(i) };
959            if max_str.map_or(true, |max| s > max) {
960                max_str = Some(s);
961            }
962        }
963    }
964    max_str.map(str::to_owned)
965}
966
967
968/// Finds the lexicographically minimum dictionary string in a categorical array window.
969///
970/// Scans through a windowed portion of a categorical array, resolves dictionary values,
971/// and determines the minimum string according to lexicographic ordering.
972///
973/// # Parameters
974/// - `window`: Categorical array view tuple `(CategoricalArray, offset, length)` defining the scan window
975///
976/// # Returns
977/// `Option<String>` containing the minimum dictionary string, or `None` if all values are null.
978#[inline]
979pub fn min_categorical_array<T: Integer>(window: CategoricalAVT<T>) -> Option<String> {
980    let (arr, offset, len) = window;
981    let mut min_code: Option<T> = None;
982    for i in offset..offset + len {
983        if arr
984            .null_mask
985            .as_ref()
986            .map_or(true, |b| unsafe { b.get_unchecked(i) })
987        {
988            let code = arr.data[i];
989            if min_code.map_or(true, |min| {
990                let sc = &arr.unique_values[code.to_usize()];
991                let sm = &arr.unique_values[min.to_usize()];
992                sc < sm
993            }) {
994                min_code = Some(code);
995            }
996        }
997    }
998    min_code.map(|code| arr.unique_values[code.to_usize()].clone())
999}
1000
1001
1002
1003/// Finds the lexicographically maximum dictionary string in a categorical array window.
1004///
1005/// Scans through a windowed portion of a categorical array, resolves dictionary values,
1006/// and determines the maximum string according to lexicographic ordering.
1007///
1008/// # Parameters
1009/// - `window`: Categorical array view tuple `(CategoricalArray, offset, length)` defining the scan window
1010///
1011/// # Returns
1012/// `Option<String>` containing the maximum dictionary string, or `None` if all values are null.
1013#[inline]
1014pub fn max_categorical_array<T: Integer>(window: CategoricalAVT<T>) -> Option<String> {
1015    let (arr, offset, len) = window;
1016    let mut max_code: Option<T> = None;
1017    for i in offset..offset + len {
1018        if arr
1019            .null_mask
1020            .as_ref()
1021            .map_or(true, |b| unsafe { b.get_unchecked(i) })
1022        {
1023            let code = arr.data[i];
1024            if max_code.map_or(true, |max| {
1025                let sc = &arr.unique_values[code.to_usize()];
1026                let sm = &arr.unique_values[max.to_usize()];
1027                sc > sm
1028            }) {
1029                max_code = Some(code);
1030            }
1031        }
1032    }
1033    max_code.map(|code| arr.unique_values[code.to_usize()].clone())
1034}
1035
1036/// Counts the number of distinct string values in a string array window.
1037///
1038/// Computes the cardinality of unique strings within a windowed portion of a
1039/// string array, using efficient hash-based deduplication.
1040///
1041/// # Parameters
1042/// - `window`: String array view tuple `(StringArray, offset, length)` defining the count window
1043///
1044/// # Returns
1045/// `usize` representing the count of distinct non-null string values.
1046///
1047/// # Hash Algorithm
1048/// Uses either AHash (if `fast_hash` feature enabled) or standard HashMap for deduplication.
1049#[inline(always)]
1050pub fn count_distinct_string<T: Integer>(window: StringAVT<T>) -> usize {
1051    let (arr, offset, len) = window;
1052    #[cfg(feature = "fast_hash")]
1053    let mut set = AHashSet::with_capacity(len);
1054    #[cfg(not(feature = "fast_hash"))]
1055    let mut set = HashSet::with_capacity(len);
1056    let null_mask = arr.null_mask.as_ref();
1057
1058    for i in offset..offset + len {
1059        let valid = null_mask.map_or(true, |b| unsafe { b.get_unchecked(i) });
1060        if valid {
1061            let s = unsafe { arr.get_str_unchecked(i) };
1062            set.insert(s);
1063            if set.len() == len {
1064                break;
1065            }
1066        }
1067    }
1068    set.len()
1069}
1070
1071#[cfg(test)]
1072mod tests {
1073    use minarrow::{CategoricalArray, StringArray, vec64};
1074
1075    use super::*;
1076
1077    // --- Helper constructors
1078
    /// Test helper: builds a `StringArray<T>` from string literals via
    /// `StringArray::from_slice`.
    fn str_array<T: Integer>(vals: &[&str]) -> StringArray<T> {
        StringArray::<T>::from_slice(vals)
    }
1082
1083    fn dict_array<T: Integer>(vals: &[&str]) -> CategoricalArray<T> {
1084        let owned: Vec<&str> = vals.to_vec();
1085        CategoricalArray::<T>::from_values(owned)
1086    }
1087
    /// Test helper: shorthand for `Bitmask::from_bools`.
    fn bm(bools: &[bool]) -> Bitmask {
        Bitmask::from_bools(bools)
    }
1091
1092    // --- Concat
1093
1094    #[test]
1095    fn test_concat_str_str() {
1096        let a = str_array::<u32>(&["foo", "bar", ""]);
1097        let b = str_array::<u32>(&["baz", "qux", "quux"]);
1098        let out = concat_str_str((&a, 0, a.len()), (&b, 0, b.len()));
1099        assert_eq!(out.get(0), Some("foobaz"));
1100        assert_eq!(out.get(1), Some("barqux"));
1101        assert_eq!(out.get(2), Some("quux"));
1102        assert!(out.null_mask.as_ref().unwrap().all_set());
1103    }
1104
1105    #[test]
1106    fn test_concat_str_str_chunk() {
1107        let a = str_array::<u32>(&["XXX", "foo", "bar", ""]);
1108        let b = str_array::<u32>(&["YYY", "baz", "qux", "quux"]);
1109        // Window is [1..4) for both, i.e., ["foo", "bar", ""]
1110        let out = concat_str_str((&a, 1, 3), (&b, 1, 3));
1111        assert_eq!(out.get(0), Some("foobaz"));
1112        assert_eq!(out.get(1), Some("barqux"));
1113        assert_eq!(out.get(2), Some("quux"));
1114        assert!(out.null_mask.as_ref().unwrap().all_set());
1115    }
1116
1117    #[test]
1118    fn test_concat_dict_dict() {
1119        let a = dict_array::<u32>(&["x", "y"]);
1120        let b = dict_array::<u32>(&["1", "2"]);
1121        let out = concat_dict_dict((&a, 0, a.len()), (&b, 0, b.len())).unwrap();
1122        let s0 = out.get(0).unwrap();
1123        let s1 = out.get(1).unwrap();
1124        assert!(["x1", "y2"].contains(&s0));
1125        assert!(["x1", "y2"].contains(&s1));
1126        assert!(out.null_mask.as_ref().unwrap().all_set());
1127    }
1128
1129    #[test]
1130    fn test_concat_dict_dict_chunk() {
1131        let a = dict_array::<u32>(&["foo", "x", "y", "bar"]);
1132        let b = dict_array::<u32>(&["A", "1", "2", "B"]);
1133        let out = concat_dict_dict((&a, 1, 2), (&b, 1, 2)).unwrap();
1134        let s0 = out.get(0).unwrap();
1135        let s1 = out.get(1).unwrap();
1136        assert!(["x1", "y2"].contains(&s0));
1137        assert!(["x1", "y2"].contains(&s1));
1138        assert!(out.null_mask.as_ref().unwrap().all_set());
1139    }
1140
1141    #[test]
1142    fn test_concat_str_dict() {
1143        let a = str_array::<u32>(&["ab", "cd", ""]);
1144        let b = dict_array::<u32>(&["xy", "zq", ""]);
1145        let out = concat_str_dict((&a, 0, a.len()), (&b, 0, b.len())).unwrap();
1146        assert_eq!(out.get(0), Some("abxy"));
1147        assert_eq!(out.get(1), Some("cdzq"));
1148        assert_eq!(out.get(2), Some(""));
1149        assert!(out.null_mask.as_ref().unwrap().all_set());
1150    }
1151
1152    #[test]
1153    fn test_concat_str_dict_chunk() {
1154        let a = str_array::<u32>(&["dummy", "ab", "cd", ""]);
1155        let b = dict_array::<u32>(&["dummy", "xy", "zq", ""]);
1156        let out = concat_str_dict((&a, 1, 3), (&b, 1, 3)).unwrap();
1157        assert_eq!(out.get(0), Some("abxy"));
1158        assert_eq!(out.get(1), Some("cdzq"));
1159        assert_eq!(out.get(2), Some(""));
1160        assert!(out.null_mask.as_ref().unwrap().all_set());
1161    }
1162
1163    #[test]
1164    fn test_concat_dict_str() {
1165        let a = dict_array::<u32>(&["hi", "ho"]);
1166        let b = str_array::<u32>(&["yo", "no"]);
1167        let out = concat_dict_str((&a, 0, a.len()), (&b, 0, b.len())).unwrap();
1168        assert_eq!(out.get(0), Some("yohi"));
1169        assert_eq!(out.get(1), Some("noho"));
1170        assert!(out.null_mask.as_ref().unwrap().all_set());
1171    }
1172
1173    #[test]
1174    fn test_concat_dict_str_chunk() {
1175        let a = dict_array::<u32>(&["dummy", "hi", "ho", "zzz"]);
1176        let b = str_array::<u32>(&["dummy", "yo", "no", "xxx"]);
1177        let out = concat_dict_str((&a, 1, 2), (&b, 1, 2)).unwrap();
1178        assert_eq!(out.get(0), Some("yohi"));
1179        assert_eq!(out.get(1), Some("noho"));
1180        assert!(out.null_mask.as_ref().unwrap().all_set());
1181    }
1182
1183    // --- String predicates
1184
1185    #[test]
1186    fn test_contains_str_str() {
1187        let s = str_array::<u32>(&["abc", "def", "ghijk"]);
1188        let p = str_array::<u32>(&["b", "x", "jk"]);
1189        let out = contains_str_str((&s, 0, s.len()), (&p, 0, p.len()));
1190        assert_eq!(out.get(0), Some(true));
1191        assert_eq!(out.get(1), Some(false));
1192        assert_eq!(out.get(2), Some(true));
1193    }
1194
1195    #[test]
1196    fn test_contains_str_str_chunk() {
1197        let s = str_array::<u32>(&["dummy", "abc", "def", "ghijk"]);
1198        let p = str_array::<u32>(&["dummy", "b", "x", "jk"]);
1199        let out = contains_str_str((&s, 1, 3), (&p, 1, 3));
1200        assert_eq!(out.get(0), Some(true));
1201        assert_eq!(out.get(1), Some(false));
1202        assert_eq!(out.get(2), Some(true));
1203    }
1204
1205    #[test]
1206    fn test_starts_with_str_str() {
1207        let s = str_array::<u32>(&["apricot", "banana", "apple"]);
1208        let p = str_array::<u32>(&["ap", "ba", "a"]);
1209        let out = starts_with_str_str((&s, 0, s.len()), (&p, 0, p.len()));
1210        assert_eq!(out.get(0), Some(true));
1211        assert_eq!(out.get(1), Some(true));
1212        assert_eq!(out.get(2), Some(true));
1213    }
1214
1215    #[test]
1216    fn test_starts_with_str_str_chunk() {
1217        let s = str_array::<u32>(&["dummy", "apricot", "banana", "apple"]);
1218        let p = str_array::<u32>(&["dummy", "ap", "ba", "a"]);
1219        let out = starts_with_str_str((&s, 1, 3), (&p, 1, 3));
1220        assert_eq!(out.get(0), Some(true));
1221        assert_eq!(out.get(1), Some(true));
1222        assert_eq!(out.get(2), Some(true));
1223    }
1224
1225    #[test]
1226    fn test_ends_with_str_str() {
1227        let s = str_array::<u32>(&["robot", "fast", "last"]);
1228        let p = str_array::<u32>(&["ot", "st", "ast"]);
1229        let out = ends_with_str_str((&s, 0, s.len()), (&p, 0, p.len()));
1230        assert_eq!(out.get(0), Some(true));
1231        assert_eq!(out.get(1), Some(true));
1232        assert_eq!(out.get(2), Some(true));
1233    }
1234
1235    #[test]
1236    fn test_ends_with_str_str_chunk() {
1237        let s = str_array::<u32>(&["dummy", "robot", "fast", "last"]);
1238        let p = str_array::<u32>(&["dummy", "ot", "st", "ast"]);
1239        let out = ends_with_str_str((&s, 1, 3), (&p, 1, 3));
1240        assert_eq!(out.get(0), Some(true));
1241        assert_eq!(out.get(1), Some(true));
1242        assert_eq!(out.get(2), Some(true));
1243    }
1244
1245    #[test]
1246    fn test_contains_str_dict() {
1247        let s = str_array::<u32>(&["abcde", "xyz", "qrstuv"]);
1248        let p = dict_array::<u32>(&["c", "z", "tu"]);
1249        let out = contains_str_dict((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1250        assert_eq!(out.get(0), Some(true));
1251        assert_eq!(out.get(1), Some(true));
1252        assert_eq!(out.get(2), Some(true));
1253    }
1254
1255    #[test]
1256    fn test_contains_str_dict_chunk() {
1257        let s = str_array::<u32>(&["dummy", "abcde", "xyz", "qrstuv"]);
1258        let p = dict_array::<u32>(&["dummy", "c", "z", "tu"]);
1259        let out = contains_str_dict((&s, 1, 3), (&p, 1, 3)).unwrap();
1260        assert_eq!(out.get(0), Some(true));
1261        assert_eq!(out.get(1), Some(true));
1262        assert_eq!(out.get(2), Some(true));
1263    }
1264
1265    #[test]
1266    fn test_contains_dict_dict() {
1267        let s = dict_array::<u32>(&["cdef", "foo", "bar"]);
1268        let p = dict_array::<u32>(&["cd", "oo", "baz"]);
1269        let out = contains_dict_dict((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1270        assert_eq!(out.get(0), Some(true));
1271        assert_eq!(out.get(1), Some(true));
1272        assert_eq!(out.get(2), Some(false));
1273    }
1274
1275    #[test]
1276    fn test_contains_dict_dict_chunk() {
1277        let s = dict_array::<u32>(&["dummy", "cdef", "foo", "bar"]);
1278        let p = dict_array::<u32>(&["dummy", "cd", "oo", "baz"]);
1279        let out = contains_dict_dict((&s, 1, 3), (&p, 1, 3)).unwrap();
1280        assert_eq!(out.get(0), Some(true));
1281        assert_eq!(out.get(1), Some(true));
1282        assert_eq!(out.get(2), Some(false));
1283    }
1284
1285    #[test]
1286    fn test_contains_dict_str() {
1287        let s = dict_array::<u32>(&["hello", "world"]);
1288        let p = str_array::<u32>(&["he", "o"]);
1289        let out = contains_dict_str((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1290        assert_eq!(out.get(0), Some(true));
1291        assert_eq!(out.get(1), Some(true));
1292    }
1293
1294    #[test]
1295    fn test_contains_dict_str_chunk() {
1296        let s = dict_array::<u32>(&["dummy", "hello", "world"]);
1297        let p = str_array::<u32>(&["dummy", "he", "o"]);
1298        let out = contains_dict_str((&s, 1, 2), (&p, 1, 2)).unwrap();
1299        assert_eq!(out.get(0), Some(true));
1300        assert_eq!(out.get(1), Some(true));
1301    }
1302
1303    #[test]
1304    fn test_starts_with_str_dict() {
1305        let s = str_array::<u32>(&["abcdef", "foobar", "quux"]);
1306        let p = dict_array::<u32>(&["ab", "foo", "qu"]);
1307        let out = starts_with_str_dict((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1308        assert_eq!(out.get(0), Some(true));
1309        assert_eq!(out.get(1), Some(true));
1310        assert_eq!(out.get(2), Some(true));
1311    }
1312
1313    #[test]
1314    fn test_starts_with_str_dict_chunk() {
1315        let s = str_array::<u32>(&["dummy", "abcdef", "foobar", "quux"]);
1316        let p = dict_array::<u32>(&["dummy", "ab", "foo", "qu"]);
1317        let out = starts_with_str_dict((&s, 1, 3), (&p, 1, 3)).unwrap();
1318        assert_eq!(out.get(0), Some(true));
1319        assert_eq!(out.get(1), Some(true));
1320        assert_eq!(out.get(2), Some(true));
1321    }
1322
1323    #[test]
1324    fn test_starts_with_dict_dict() {
1325        let s = dict_array::<u32>(&["qwerty", "banana"]);
1326        let p = dict_array::<u32>(&["qw", "ban"]);
1327        let out = starts_with_dict_dict((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1328        assert_eq!(out.get(0), Some(true));
1329        assert_eq!(out.get(1), Some(true));
1330    }
1331
1332    #[test]
1333    fn test_starts_with_dict_dict_chunk() {
1334        let s = dict_array::<u32>(&["dummy", "qwerty", "banana"]);
1335        let p = dict_array::<u32>(&["dummy", "qw", "ban"]);
1336        let out = starts_with_dict_dict((&s, 1, 2), (&p, 1, 2)).unwrap();
1337        assert_eq!(out.get(0), Some(true));
1338        assert_eq!(out.get(1), Some(true));
1339    }
1340
1341    #[test]
1342    fn test_ends_with_str_dict() {
1343        let s = str_array::<u32>(&["poem", "dome", "gnome"]);
1344        let p = dict_array::<u32>(&["em", "me", "ome"]);
1345        let out = ends_with_str_dict((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1346        assert_eq!(out.get(0), Some(true));
1347        assert_eq!(out.get(1), Some(true));
1348        assert_eq!(out.get(2), Some(true));
1349    }
1350
1351    #[test]
1352    fn test_ends_with_str_dict_chunk() {
1353        let s = str_array::<u32>(&["dummy", "poem", "dome", "gnome"]);
1354        let p = dict_array::<u32>(&["dummy", "em", "me", "ome"]);
1355        let out = ends_with_str_dict((&s, 1, 3), (&p, 1, 3)).unwrap();
1356        assert_eq!(out.get(0), Some(true));
1357        assert_eq!(out.get(1), Some(true));
1358        assert_eq!(out.get(2), Some(true));
1359    }
1360
1361    #[test]
1362    fn test_ends_with_dict_dict() {
1363        let s = dict_array::<u32>(&["tablet", "let", "bet"]);
1364        let p = dict_array::<u32>(&["let", "et", "xyz"]);
1365        let out = ends_with_dict_dict((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1366        assert_eq!(out.get(0), Some(true));
1367        assert_eq!(out.get(1), Some(true));
1368        assert_eq!(out.get(2), Some(false));
1369    }
1370
1371    #[test]
1372    fn test_ends_with_dict_dict_chunk() {
1373        let s = dict_array::<u32>(&["dummy", "tablet", "let", "bet"]);
1374        let p = dict_array::<u32>(&["dummy", "let", "et", "xyz"]);
1375        let out = ends_with_dict_dict((&s, 1, 3), (&p, 1, 3)).unwrap();
1376        assert_eq!(out.get(0), Some(true));
1377        assert_eq!(out.get(1), Some(true));
1378        assert_eq!(out.get(2), Some(false));
1379    }
1380
1381    // --- len_str, len_dict
1382
1383    #[test]
1384    fn test_len_str() {
1385        let arr = str_array::<u32>(&["", "a", "abc", "bar"]);
1386        let out = len_str((&arr, 0, arr.len())).unwrap();
1387        assert_eq!(&out.data[..], &[0, 1, 3, 3]);
1388    }
1389
1390    #[test]
1391    fn test_len_str_chunk() {
1392        let arr = str_array::<u32>(&["zzz", "", "a", "abc", "bar"]);
1393        let out = len_str((&arr, 1, 4)).unwrap(); // ["", "a", "abc", "bar"]
1394        assert_eq!(&out.data[..], &[0, 1, 3, 3]);
1395    }
1396
1397    #[test]
1398    fn test_len_dict() {
1399        let arr = dict_array::<u32>(&["", "one", "seven"]);
1400        let out = len_dict((&arr, 0, arr.len())).unwrap();
1401        assert_eq!(&out.data[..], &[0, 3, 5]);
1402    }
1403
1404    #[test]
1405    fn test_len_dict_chunk() {
1406        let arr = dict_array::<u32>(&["q", "", "one", "seven"]);
1407        let out = len_dict((&arr, 1, 3)).unwrap(); // ["", "one", "seven"]
1408        assert_eq!(&out.data[..], &[0, 3, 5]);
1409    }
1410
1411    #[test]
1412    fn test_contains_empty_pattern() {
1413        let s = str_array::<u32>(&["foo", "bar"]);
1414        let p = str_array::<u32>(&["", ""]);
1415        let out = contains_str_str((&s, 0, s.len()), (&p, 0, p.len()));
1416        // always false
1417        assert_eq!(out.get(0), Some(false));
1418        assert_eq!(out.get(1), Some(false));
1419        assert!(out.null_mask.as_ref().is_none());
1420    }
1421
1422    #[test]
1423    fn test_contains_empty_pattern_chunk() {
1424        let s = str_array::<u32>(&["z", "foo", "bar"]);
1425        let p = str_array::<u32>(&["z", "", ""]);
1426        let out = contains_str_str((&s, 1, 2), (&p, 1, 2));
1427        assert_eq!(out.get(0), Some(false));
1428        assert_eq!(out.get(1), Some(false));
1429        assert!(out.null_mask.as_ref().is_none());
1430    }
1431
1432    #[test]
1433    fn test_contains_str_str_nulls_on_pattern() {
1434        let mut s = str_array::<u32>(&["abc", "def"]);
1435        s.null_mask = Some(bm(&[true, true]));
1436        let mut p = str_array::<u32>(&["b", "e"]);
1437        p.null_mask = Some(bm(&[true, false])); // second pattern is null
1438        let out = contains_str_str((&s, 0, s.len()), (&p, 0, p.len()));
1439        assert_eq!(out.get(0), Some(true));
1440        assert_eq!(out.get(1), Some(false));
1441    }
1442
1443    #[test]
1444    fn test_contains_str_str_nulls_on_pattern_chunk() {
1445        let mut s = str_array::<u32>(&["X", "abc", "def"]);
1446        s.null_mask = Some(bm(&[true, true, true]));
1447        let mut p = str_array::<u32>(&["X", "b", "e"]);
1448        p.null_mask = Some(bm(&[true, true, false])); // last pattern is null
1449        let out = contains_str_str((&s, 1, 2), (&p, 1, 2));
1450        assert_eq!(out.get(0), Some(true));
1451        assert_eq!(out.get(1), Some(false));
1452    }
1453
1454    #[cfg(feature = "regex")]
1455    #[test]
1456    fn test_regex_invalid_pattern_returns_err() {
1457        let s = str_array::<u32>(&["abc"]);
1458        let p = str_array::<u32>(&["["]);
1459        let err = regex_str_str((&s, 0, s.len()), (&p, 0, p.len())).unwrap_err();
1460        match err {
1461            KernelError::InvalidArguments(_) => {}
1462            _ => panic!("expected InvalidArguments"),
1463        }
1464    }
1465
1466    #[cfg(feature = "regex")]
1467    #[test]
1468    fn test_regex_invalid_pattern_returns_err_chunk() {
1469        let s = str_array::<u32>(&["foo", "abc"]);
1470        let p = str_array::<u32>(&["bar", "["]);
1471        let err = regex_str_str((&s, 1, 1), (&p, 1, 1)).unwrap_err();
1472        match err {
1473            KernelError::InvalidArguments(_) => {}
1474            _ => panic!("expected InvalidArguments"),
1475        }
1476    }
1477
1478    #[cfg(feature = "regex")]
1479    #[test]
1480    fn test_regex_empty_pattern_always_false() {
1481        let s = str_array::<u32>(&["abc", "def"]);
1482        let p = str_array::<u32>(&["", ""]);
1483        let out = regex_str_str((&s, 0, s.len()), (&p, 0, p.len())).unwrap();
1484        assert_eq!(out.get(0), Some(false));
1485        assert_eq!(out.get(1), Some(false));
1486        assert!(out.null_mask.unwrap().all_set());
1487    }
1488
1489    #[cfg(feature = "regex")]
1490    #[test]
1491    fn test_regex_empty_pattern_always_false_chunk() {
1492        let s = str_array::<u32>(&["z", "abc", "def"]);
1493        let p = str_array::<u32>(&["z", "", ""]);
1494        let out = regex_str_str((&s, 1, 2), (&p, 1, 2)).unwrap();
1495        assert_eq!(out.get(0), Some(false));
1496        assert_eq!(out.get(1), Some(false));
1497        assert!(out.null_mask.unwrap().all_set());
1498    }
1499
1500    #[test]
1501    fn test_len_str_with_nulls() {
1502        let mut arr = str_array::<u32>(&["foo", "", "bar"]);
1503        arr.null_mask = Some(bm(&[true, false, true]));
1504        let len_arr = len_str((&arr, 0, arr.len())).unwrap();
1505        assert_eq!(len_arr.data.as_slice(), &[3, 0, 3]);
1506        assert_eq!(
1507            len_arr.null_mask.unwrap().as_slice(),
1508            bm(&[true, false, true]).as_slice()
1509        );
1510    }
1511
1512    #[test]
1513    fn test_len_str_with_nulls_chunk() {
1514        let mut arr = str_array::<u32>(&["x", "foo", "", "bar"]);
1515        arr.null_mask = Some(bm(&[true, true, false, true]));
1516        let len_arr = len_str((&arr, 1, 3)).unwrap();
1517        assert_eq!(len_arr.data.as_slice(), &[3, 0, 3]);
1518        assert_eq!(
1519            len_arr.null_mask.unwrap().as_slice(),
1520            bm(&[true, false, true]).as_slice()
1521        );
1522    }
1523
1524    #[test]
1525    fn test_len_dict_with_nulls() {
1526        let mut arr = dict_array::<u32>(&["x", "yy", "zzz"]);
1527        arr.null_mask = Some(bm(&[false, true, true]));
1528        let len_arr = len_dict((&arr, 0, arr.len())).unwrap();
1529        assert_eq!(len_arr.data.as_slice(), &[0, 2, 3]);
1530        assert_eq!(
1531            len_arr.null_mask.unwrap().as_slice(),
1532            bm(&[false, true, true]).as_slice()
1533        );
1534    }
1535
1536    #[test]
1537    fn test_len_dict_with_nulls_chunk() {
1538        let mut arr = dict_array::<u32>(&["z", "x", "yy", "zzz"]);
1539        arr.null_mask = Some(bm(&[true, false, true, true]));
1540        let len_arr = len_dict((&arr, 1, 3)).unwrap();
1541        assert_eq!(len_arr.data.as_slice(), &[0, 2, 3]);
1542        assert_eq!(
1543            len_arr.null_mask.unwrap().as_slice(),
1544            bm(&[false, true, true]).as_slice()
1545        );
1546    }
1547
1548    fn bitmask_from_vec(v: &[bool]) -> Bitmask {
1549        let mut bm = Bitmask::with_capacity(v.len());
1550        for (i, &b) in v.iter().enumerate() {
1551            bm.set(i, b);
1552        }
1553        bm
1554    }
1555
1556    #[test]
1557    fn test_min_string_array_all_valid() {
1558        let arr = StringArray::<u32>::from_slice(&["zulu", "alpha", "echo", "bravo"]);
1559        let view = (&arr, 0, arr.len());
1560        let result = min_string_array::<u32>(view);
1561        assert_eq!(result, Some("alpha".to_string()));
1562    }
1563
1564    #[test]
1565    fn test_min_string_array_with_nulls() {
1566        let mut arr = StringArray::<u32>::from_slice(&["zulu", "alpha", "echo", "bravo"]);
1567        arr.null_mask = Some(bitmask_from_vec(&[false, true, true, true]));
1568        let view = (&arr, 0, arr.len());
1569        let result = min_string_array::<u32>(view);
1570        assert_eq!(result, Some("alpha".to_string()));
1571    }
1572
1573    #[test]
1574    fn test_min_string_array_all_null() {
1575        let arr = StringArray::<u32>::from_slice(&["zulu", "alpha", "echo", "bravo"]);
1576        let mut null_mask = Bitmask::with_capacity(arr.len());
1577        for i in 0..arr.len() {
1578            null_mask.set(i, false);
1579        }
1580        let arr = StringArray::<u32> {
1581            null_mask: Some(null_mask),
1582            ..arr
1583        };
1584        let view = (&arr, 0, arr.len());
1585        let result = min_string_array::<u32>(view);
1586        assert_eq!(result, None);
1587    }
1588
1589    #[test]
1590    fn test_max_string_array_all_valid() {
1591        let arr = StringArray::<u32>::from_slice(&["zulu", "alpha", "echo", "bravo"]);
1592        let view = (&arr, 0, arr.len());
1593        let result = max_string_array::<u32>(view);
1594        assert_eq!(result, Some("zulu".to_string()));
1595    }
1596
1597    #[test]
1598    fn test_max_string_array_with_nulls() {
1599        let mut arr = StringArray::<u32>::from_slice(&["zulu", "alpha", "echo", "bravo"]);
1600        arr.null_mask = Some(bitmask_from_vec(&[true, false, true, false]));
1601        let view = (&arr, 0, arr.len());
1602        let result = max_string_array::<u32>(view);
1603        assert_eq!(result, Some("zulu".to_string()));
1604    }
1605
1606    #[test]
1607    fn test_max_string_array_all_null() {
1608        let arr = StringArray::<u32>::from_slice(&["zulu", "alpha", "echo", "bravo"]);
1609        let mut null_mask = Bitmask::with_capacity(arr.len());
1610        for i in 0..arr.len() {
1611            null_mask.set(i, false);
1612        }
1613        let arr = StringArray::<u32> {
1614            null_mask: Some(null_mask),
1615            ..arr
1616        };
1617        let view = (&arr, 0, arr.len());
1618        let result = max_string_array::<u32>(view);
1619        assert_eq!(result, None);
1620    }
1621
1622    #[test]
1623    fn test_min_categorical_array() {
1624        let uniques = vec64![
1625            "dog".to_string(),
1626            "zebra".to_string(),
1627            "ant".to_string(),
1628            "bee".to_string()
1629        ];
1630        let indices = vec64![1u32, 0, 3, 2]; // "zebra", "dog", "bee", "ant"
1631        let cat = CategoricalArray {
1632            data: indices.clone().into(),
1633            unique_values: uniques.clone().into(),
1634            null_mask: None,
1635        };
1636        let result = min_categorical_array((&cat, 0, indices.len()));
1637        assert_eq!(result, Some("ant".to_string()));
1638    }
1639
1640    #[test]
1641    fn test_max_categorical_array() {
1642        let uniques = vec64![
1643            "dog".to_string(),
1644            "zebra".to_string(),
1645            "ant".to_string(),
1646            "bee".to_string()
1647        ];
1648        let indices = vec64![2u32, 0, 1, 3]; // "ant", "dog", "zebra", "bee"
1649        let cat = CategoricalArray {
1650            data: indices.clone().into(),
1651            unique_values: uniques.clone().into(),
1652            null_mask: None,
1653        };
1654        let result = max_categorical_array((&cat, 0, indices.len()));
1655        assert_eq!(result, Some("zebra".to_string()));
1656    }
1657
1658    #[test]
1659    fn test_min_categorical_array_with_nulls() {
1660        let uniques = vec64!["dog".to_string(), "zebra".to_string(), "ant".to_string()];
1661        let indices = vec64![1u32, 2, 0];
1662        let mut null_mask = Bitmask::with_capacity(indices.len());
1663        null_mask.set(0, true);
1664        null_mask.set(1, false);
1665        null_mask.set(2, true);
1666        let cat = CategoricalArray {
1667            data: indices.clone().into(),
1668            unique_values: uniques.clone().into(),
1669            null_mask: Some(null_mask),
1670        };
1671        let result = min_categorical_array((&cat, 0, indices.len()));
1672        assert_eq!(result, Some("dog".to_string())); // Only positions 0 and 2 valid: "zebra", "dog" → "dog" is smaller
1673    }
1674
1675    #[test]
1676    fn test_max_categorical_array_with_nulls() {
1677        let uniques = vec64!["dog".to_string(), "zebra".to_string(), "ant".to_string()];
1678        let indices = vec64![1u32, 2, 0];
1679        let mut null_mask = Bitmask::with_capacity(indices.len());
1680        null_mask.set(0, true);
1681        null_mask.set(1, false);
1682        null_mask.set(2, true);
1683        let cat = CategoricalArray {
1684            data: indices.clone().into(),
1685            unique_values: uniques.clone().into(),
1686            null_mask: Some(null_mask),
1687        };
1688        let result = max_categorical_array((&cat, 0, indices.len()));
1689        assert_eq!(result, Some("zebra".to_string())); // Only positions 0 and 2 valid: "zebra", "dog" → "zebra" is larger
1690    }
1691}