Skip to main content

minarrow/structs/variants/
categorical.rs

1// Copyright 2025 Peter Garfield Bower
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! # **CategoricalArray Module** - *Mid-Level, Inner Typed Categorical Array*
16//!
17//! CategoricalArray uses dictionary-encoded strings where each row stores a
18//! small integer “code” that references a per-column dictionary of unique strings.
19//! This saves memory and accelerates comparisons/joins when many values repeat.
20//!
21//! ## Interop
22//! - Arrow-compatible dictionary layout (`indices` + string `dictionary`), and
23//!   round-trips over the Arrow C Data Interface to/from the `Dictionary` array type.
24//! - Index width is the generic `T` (e.g., `u8/u16/u32/u64`) and corresponds to
25//!   Arrow’s `CategoricalIndexType`.
26//!
27//! ## Features
28//! - Optional `null_mask`: bit-packed, where `1 = valid`, `0 = null`
29//! - Builders from raw values (`from_values`, `from_vec64`) and from raw parts.
30//! - Iterators over indices and over resolved strings (nullable and non-nullable).
31//! - Convert to a dense `StringArray` via `to_string_array()` when needed.
32//! - Parallel helpers behind `parallel_proc` feature.
33//!
34//! ## When to use
35//! Use for arrays with repeated strings to reduce memory and speed up operations.
36
37use std::collections::HashMap;
38use std::fmt::{Debug, Display, Formatter};
39use std::slice::{Iter, IterMut};
40
41#[cfg(feature = "parallel_proc")]
42use rayon::iter::ParallelIterator;
43
44use crate::aliases::CategoricalAVT;
45use crate::enums::error::MinarrowError;
46use crate::enums::shape_dim::ShapeDim;
47#[cfg(feature = "shared_dict")]
48use crate::structs::dictionary::Dictionary;
49use crate::traits::concatenate::Concatenate;
50use crate::traits::shape::Shape;
51use crate::traits::type_unions::Integer;
52use crate::utils::validate_null_mask_len;
53use crate::{
54    Bitmask, Buffer, Length, MaskedArray, Offset, StringArray, impl_arc_masked_array,
55    impl_array_ref_deref,
56};
57use ::vec64::{Vec64, Vec64Alloc};
58
59/// Without `shared_dict`, returns the existing code for `value` if it
60/// is already in `unique_values`, else pushes it and returns the new
61/// code. Linear scan. Panics if the new cardinality would exceed the
62/// capacity of `T`.
63#[cfg(not(feature = "shared_dict"))]
64#[inline]
65fn add_category<T: Integer>(unique_values: &mut Vec64<String>, value: &str) -> T {
66    if let Some(pos) = unique_values.iter().position(|s| s.as_str() == value) {
67        return T::from_usize(pos);
68    }
69    let i = unique_values.len();
70    let c = T::try_from(i).ok().unwrap_or_else(|| {
71        panic!(
72            "Categorical cardinality exceeded the capacity of the index \
73             type {}. Consider a wider index width.",
74            std::any::type_name::<T>()
75        )
76    });
77    unique_values.push(value.to_owned());
78    c
79}
80
81/// # CategoricalArray
82///
83/// Categorical array with unique string instances mapped to indices.
84///
85/// ## Role
86/// - Many will prefer the higher level `Array` type, which dispatches to this when
87/// necessary.
88/// - Can be used as a standalone text array or as the text arm of `TextArray` / `Array`.
89///
90/// ## Description
91/// Compatible with the `Arrow Dictionary` memory layout, where each value is
92/// represented as an index into a dictionary of unique strings, and materialises
93/// into the format over FFI.
94///
95/// ### Fields:
96/// - `data`: indices buffer referencing entries in `unique_values`.
97/// - `unique_values`: dictionary of unique string values.
98/// - `null_mask`: optional bit-packed validity bitmap (1=valid, 0=null).
99///
100/// ## Purpose
101/// Consider this when you have a common set of unique string values, and want to
102/// save space and increase speed by storing the string values only once
103/// *(in the `unique_values` Vec)*, and then only the integers that map to them
104/// in the `data` field.
105///
106/// ## Example
107/// ```rust
108/// use minarrow::{CategoricalArray, MaskedArray};
109///
110/// let arr = CategoricalArray::<u8>::from_values(vec!["apple", "banana", "apple", "cherry"]);
111/// assert_eq!(arr.len(), 4);
112///
113/// // Indices into the unique_values dictionary
114/// assert_eq!(arr.indices(), &[0u8, 1, 0, 2]);
115///
116/// // Dictionary of unique values
117/// assert_eq!(arr.unique_values(), &["apple".to_string(), "banana".to_string(), "cherry".to_string()]);
118///
119/// // Resolved value lookups
120/// assert_eq!(arr.get_str(0), Some("apple"));
121/// assert_eq!(arr.get_str(1), Some("banana"));
122/// assert_eq!(arr.get_str(2), Some("apple"));
123/// assert_eq!(arr.get_str(3), Some("cherry"));
124/// ```
#[repr(C, align(64))]
#[derive(PartialEq, Clone, Debug, Default)]
pub struct CategoricalArray<T: Integer> {
    /// Indices buffer: one code per row, each referencing an entry in the
    /// dictionary.
    pub data: Buffer<T>,
    /// Dictionary values, i.e., the unique strings indexed by `data`.
    /// Present only when the `shared_dict` feature is off.
    #[cfg(not(feature = "shared_dict"))]
    pub unique_values: Vec64<String>,
    /// When the `shared_dict` feature is on, a shared dictionary
    /// reference is used to ensure that categories remain aligned across
    /// related categorical arrays. Replaces `unique_values` in that build.
    #[cfg(feature = "shared_dict")]
    pub dictionary: Dictionary<T>,
    /// Optional null mask (bit-packed; 1=valid, 0=null). `None` means
    /// every row is treated as valid.
    pub null_mask: Option<Bitmask>,
}
141
142impl<T: Integer> CategoricalArray<T> {
143    /// Constructs a new CategoricalArray
144    #[inline]
145    pub fn new(
146        data: impl Into<Buffer<T>>,
147        unique_values: Vec64<String>,
148        null_mask: Option<Bitmask>,
149    ) -> Self {
150        let data: Buffer<T> = data.into();
151
152        validate_null_mask_len(data.len(), &null_mask);
153        // Per the Arrow spec, values at null positions are unspecified, so we
154        // skip them here. Pandas, for instance, writes a sentinel (-1, which
155        // wraps to 255 for u8 indices) into null slots when exporting a
156        // Categorical over the C Data Interface.
157        for (i, code) in data.iter().enumerate() {
158            let is_valid = null_mask.as_ref().map_or(true, |m| m.get(i));
159            if !is_valid {
160                continue;
161            }
162            let idx = code
163                .to_usize()
164                .unwrap_or_else(|| panic!("Failed to convert code to usize at position {}", i));
165            assert!(
166                idx < unique_values.len(),
167                "Index {} out of bounds for unique_values (len = {}) at position {}",
168                idx,
169                unique_values.len(),
170                i
171            );
172        }
173
174        Self {
175            data,
176            #[cfg(not(feature = "shared_dict"))]
177            unique_values,
178            #[cfg(feature = "shared_dict")]
179            dictionary: Dictionary::from(unique_values),
180            null_mask,
181        }
182    }
183
184    /// Constructs a `CategoricalArray` that joins an existing dictionary's
185    /// sharing group. The provided `Dictionary` is cloned (Arc bump), so
186    /// the resulting array's codes are mutually meaningful with every
187    /// other array sharing that dictionary. Used by streaming batch
188    /// consolidation and by FFI imports that have already deduplicated
189    /// dictionaries upstream.
190    #[cfg(feature = "shared_dict")]
191    #[inline]
192    pub fn new_existing_dict(
193        data: impl Into<Buffer<T>>,
194        dictionary: Dictionary<T>,
195        null_mask: Option<Bitmask>,
196    ) -> Self {
197        let data: Buffer<T> = data.into();
198        validate_null_mask_len(data.len(), &null_mask);
199        Self {
200            data,
201            dictionary,
202            null_mask,
203        }
204    }
205
206    /// Construct an empty categorical with reserved capacity for `cap` indices.
207    /// Pass `unique_values` to pre-populate the dictionary, or `None` for an
208    /// empty one.
209    #[inline]
210    pub fn with_capacity(
211        cap: usize,
212        unique_values: Option<Vec64<String>>,
213        null_mask: bool,
214    ) -> Self {
215        Self {
216            data: Vec64::with_capacity(cap).into(),
217            #[cfg(not(feature = "shared_dict"))]
218            unique_values: unique_values.unwrap_or_default(),
219            #[cfg(feature = "shared_dict")]
220            dictionary: unique_values.map(Dictionary::from).unwrap_or_default(),
221            null_mask: if null_mask {
222                // All-valid (1) default - reserved validity slots default to
223                // valid under Arrow's 1=valid, 0=null convention.
224                Some(Bitmask::new_set_all(cap, true))
225            } else {
226                None
227            },
228        }
229    }
230
231    /// Build a categorical column from raw string values, auto-deriving the dictionary.
232    #[inline]
233    pub fn from_vec64(values: Vec64<&str>, null_mask: Option<Bitmask>) -> Self {
234        validate_null_mask_len(values.len(), &null_mask);
235
236        let len = values.len();
237        let mut codes = Vec64::with_capacity(len);
238        let mut unique_values: Vec64<String> = Vec64::new();
239        let mut dict = HashMap::new();
240
241        for (i, s) in values.into_iter().enumerate() {
242            // nulls get the default code, but do not participate in the dictionary
243            let is_valid = null_mask.as_ref().map_or(true, |m| m.get(i));
244            if !is_valid {
245                codes.push(T::default());
246                continue;
247            }
248
249            if let Some(&code) = dict.get(&s) {
250                codes.push(code);
251            } else {
252                let idx = unique_values.len();
253                let code = T::try_from(idx).ok().unwrap_or_else(|| {
254                    panic!(
255                        "Unique category count ({}) exceeds capacity of index type {}",
256                        idx + 1,
257                        std::any::type_name::<T>()
258                    )
259                });
260                unique_values.push(s.to_string());
261                dict.insert(s, code);
262                codes.push(code);
263            }
264        }
265
266        Self {
267            data: codes.into(),
268            #[cfg(not(feature = "shared_dict"))]
269            unique_values,
270            #[cfg(feature = "shared_dict")]
271            dictionary: Dictionary::from(unique_values),
272            null_mask,
273        }
274    }
275
276    /// Vec wrapper
277    #[inline]
278    pub fn from_vec(values: Vec<&str>, null_mask: Option<Bitmask>) -> Self {
279        Self::from_vec64(values.into(), null_mask)
280    }
281
282    /// Constructs a new CategoricalArray without validation. The caller must ensure consistency.
283    #[inline]
284    pub fn new_unchecked(
285        data: Vec64<T>,
286        unique_values: Vec64<String>,
287        null_mask: Option<Bitmask>,
288    ) -> Self {
289        Self {
290            data: data.into(),
291            #[cfg(not(feature = "shared_dict"))]
292            unique_values,
293            #[cfg(feature = "shared_dict")]
294            dictionary: Dictionary::from(unique_values),
295            null_mask,
296        }
297    }
298
299    /// Constructs a dense DictionaryArray from index and value slices (no nulls).
300    #[inline]
301    pub fn from_slices(indices: &[T], unique_values: &[String]) -> Self {
302        assert!(
303            indices.iter().all(|&idx| {
304                let i = idx.to_usize();
305                i < unique_values.len()
306            }),
307            "All indices must be valid for unique_values"
308        );
309        let dict_values: Vec64<String> = Vec64(unique_values.to_vec_in(Vec64Alloc::default()));
310        Self {
311            data: Vec64(indices.to_vec_in(Vec64Alloc::default())).into(),
312            #[cfg(not(feature = "shared_dict"))]
313            unique_values: dict_values,
314            #[cfg(feature = "shared_dict")]
315            dictionary: Dictionary::from(dict_values),
316            null_mask: None,
317        }
318    }
319
320    /// Returns the current dictionary values as a slice.
321    ///
322    /// Under `shared_dict` the dictionary may be updated concurrently
323    /// by other clones in the sharing group; this method returns the
324    /// published prefix at the moment of the call.
325    #[inline]
326    pub fn unique_values(&self) -> &[String] {
327        #[cfg(not(feature = "shared_dict"))]
328        {
329            &self.unique_values
330        }
331        #[cfg(feature = "shared_dict")]
332        {
333            self.dictionary.values()
334        }
335    }
336
    /// Returns the dictionary indices (one code per row) as a slice.
    ///
    /// Remember, the indices are the data,
    /// because the values are the unique Strings,
    /// in contrast to what a dictionary usually refers to.
    ///
    /// The slice covers every row, including rows marked null in the mask.
    #[inline]
    pub fn indices(&self) -> &[T] {
        &self.data
    }
346
    /// Returns an iterator over the dictionary indices in the backing
    /// buffer, one per row. The null mask is not consulted.
    pub fn indices_iter(&self) -> Iter<'_, T> {
        self.data.iter()
    }
351
    /// Returns an iterator over the dictionary values (the unique strings),
    /// in dictionary order - not one item per row.
    pub fn values_iter(&self) -> Iter<'_, String> {
        self.unique_values().iter()
    }
356
    /// Returns a mutable iterator over the indices buffer.
    ///
    /// Codes written through this iterator are not validated against the
    /// dictionary here.
    pub fn indices_iter_mut(&mut self) -> IterMut<'_, T> {
        self.data.iter_mut()
    }
361
362    /// Returns a mutable iterator over dictionary values.
363    ///
364    /// Mutating an existing entry replaces the string that every code
365    /// assigned at that position decodes to; codes against the old value
366    /// no longer mean what they previously meant. For adding new values,
367    /// `push_str` is the append-only path.
368    ///
369    /// Under `shared_dict` this categorical's dictionary is detached
370    /// from its sharing group first, so sibling chunks and any parent
371    /// `SuperArray` / `SuperTable` manager keep pointing at the
372    /// original dictionary.
373    pub fn values_iter_mut(&mut self) -> IterMut<'_, String> {
374        #[cfg(not(feature = "shared_dict"))]
375        {
376            self.unique_values.iter_mut()
377        }
378        #[cfg(feature = "shared_dict")]
379        {
380            self.dictionary.detach_to_owned();
381            self.dictionary
382                .try_values_iter_mut()
383                .expect("detach_to_owned just left this Arc unique")
384        }
385    }
386
387    /// Extend with an iterator of &str.
388    pub fn extend<'a, I: Iterator<Item = &'a str>>(&mut self, iter: I) {
389        for s in iter {
390            self.push(s.to_owned());
391        }
392    }
393
394    /// Append string, adding to dictionary if new. Returns dictionary index used.
395    #[inline]
396    pub fn push_str(&mut self, value: &str) -> T {
397        #[cfg(not(feature = "shared_dict"))]
398        let code: T = add_category(&mut self.unique_values, value);
399        #[cfg(feature = "shared_dict")]
400        let code: T = self.dictionary.add_cat(value).expect(
401            "Dictionary category interning failed: cardinality exceeded capacity \
402             of the categorical integer. Consider a CategoricalArray<T> with a \
403             greater `T` capacity.",
404        );
405        self.data.push(code);
406        let row = self.len() - 1;
407        if let Some(mask) = &mut self.null_mask {
408            mask.set(row, true);
409        }
410        code
411    }
412
413    /// Appends a string without bounds checks, adding to the dictionary if new.
414    ///
415    /// # Safety
416    /// - The caller must ensure `self.data` has sufficient capacity (i.e., already resized).
417    /// - `self.null_mask`, if present, must also have space for this index.
418    /// - This method assumes exclusive mutable access and no concurrent modification.
419    #[inline(always)]
420    pub unsafe fn push_str_unchecked(&mut self, value: &str) {
421        let idx = self.data.len();
422        unsafe { self.set_str_unchecked(idx, value) };
423    }
424
425    /// Retrieves the value at the given index, or None if null.
426    #[inline]
427    pub fn get_str(&self, idx: usize) -> Option<&str> {
428        if self.is_null(idx) {
429            return None;
430        }
431        let dict_idx = self.data[idx].to_usize();
432        Some(&self.unique_values()[dict_idx])
433    }
434
435    /// Like `get`, but skips bounds checks.
436    #[inline(always)]
437    pub unsafe fn get_str_unchecked(&self, idx: usize) -> &str {
438        if let Some(mask) = &self.null_mask {
439            if !unsafe { mask.get_unchecked(idx) } {
440                return "";
441            }
442        }
443        let dict_idx = unsafe { self.data.get_unchecked(idx).to_usize().unwrap() };
444        unsafe { self.unique_values().get_unchecked(dict_idx) }
445    }
446
447    /// Sets the value at `idx`. Marks as valid.
448    #[inline]
449    pub fn set_str(&mut self, idx: usize, value: &str) {
450        assert!(idx < self.data.len(), "index out of bounds");
451
452        #[cfg(not(feature = "shared_dict"))]
453        let code: T = add_category(&mut self.unique_values, value);
454        #[cfg(feature = "shared_dict")]
455        let code: T = self.dictionary.add_cat(value).expect(
456            "Dictionary category interning failed: cardinality exceeded capacity \
457             of the categorical integer. Consider a CategoricalArray<T> with a \
458             greater `T` capacity.",
459        );
460
461        self.data[idx] = code;
462
463        if let Some(mask) = &mut self.null_mask {
464            mask.set(idx, true);
465        } else {
466            let mut m = Bitmask::new_set_all(self.data.len(), false);
467            m.set(idx, true);
468            self.null_mask = Some(m);
469        }
470    }
471
472    /// Like `set`, but skips all bounds checks.
473    #[inline(always)]
474    pub unsafe fn set_str_unchecked(&mut self, idx: usize, value: &str) {
475        #[cfg(not(feature = "shared_dict"))]
476        let code: T = add_category(&mut self.unique_values, value);
477        #[cfg(feature = "shared_dict")]
478        let code: T = self.dictionary.add_cat(value).expect(
479            "Dictionary category interning failed: cardinality exceeded capacity \
480             of the categorical integer. Consider a CategoricalArray<T> with a \
481             greater `T` capacity.",
482        );
483        let data = self.data.as_mut_slice();
484        data[idx] = code;
485        if let Some(mask) = &mut self.null_mask {
486            mask.set(idx, true);
487        } else {
488            let mut m = Bitmask::new_set_all(self.len(), false);
489            m.set(idx, true);
490            self.null_mask = Some(m);
491        }
492    }
493
494    /// Returns an iterator of &str (nulls yielded as empty string).
495    #[inline]
496    pub fn iter_str(&self) -> impl Iterator<Item = &str> + '_ {
497        self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
498            if self.is_null(idx) {
499                ""
500            } else {
501                &self.unique_values()[dict_idx.to_usize()]
502            }
503        })
504    }
505
506    /// Returns an iterator of Option<&str>, None if value is null.
507    #[inline]
508    pub fn iter_str_opt(&self) -> impl Iterator<Item = Option<&str>> + '_ {
509        self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
510            if self.is_null(idx) {
511                None
512            } else {
513                Some(self.unique_values()[dict_idx.to_usize()].as_str())
514            }
515        })
516    }
517
518    /// Returns an iterator of `&str` values (nulls yield `""`) for a specified range.
519    #[inline]
520    pub fn iter_str_range(&self, offset: usize, len: usize) -> impl Iterator<Item = &str> + '_ {
521        self.data[offset..offset + len]
522            .iter()
523            .enumerate()
524            .map(move |(i, &dict_idx)| {
525                let idx = offset + i;
526                if self.is_null(idx) {
527                    ""
528                } else {
529                    &self.unique_values()[dict_idx.to_usize()]
530                }
531            })
532    }
533
534    /// Returns an iterator of `Option<&str>` values for a specified range.
535    #[inline]
536    pub fn iter_str_opt_range(
537        &self,
538        offset: usize,
539        len: usize,
540    ) -> impl Iterator<Item = Option<&str>> + '_ {
541        self.data[offset..offset + len]
542            .iter()
543            .enumerate()
544            .map(move |(i, &dict_idx)| {
545                let idx = offset + i;
546                if self.is_null(idx) {
547                    None
548                } else {
549                    Some(self.unique_values()[dict_idx.to_usize()].as_str())
550                }
551            })
552    }
553
554    /// Build from an iterator of &str in one pass.
555    pub fn from_values<'a, I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
556        use std::collections::HashMap;
557        let mut dict = Vec64::<String>::new();
558        let mut map = HashMap::<&str, usize>::new();
559        let mut idx_buf = Vec64::<T>::new();
560
561        for s in iter {
562            let pos = *map.entry(s).or_insert_with(|| {
563                let i = dict.len();
564                dict.push(s.to_owned());
565                i
566            });
567            idx_buf.push(<T>::from_usize(pos));
568        }
569
570        Self {
571            data: idx_buf.into(),
572            #[cfg(not(feature = "shared_dict"))]
573            unique_values: dict,
574            #[cfg(feature = "shared_dict")]
575            dictionary: Dictionary::from(dict),
576            null_mask: None,
577        }
578    }
579
580    /// Create from raw buffers (indices & dictionary) without copying.
581    #[inline]
582    pub fn from_parts(
583        indices: Vec64<T>,
584        unique_values: Vec64<String>,
585        null_mask: Option<Bitmask>,
586    ) -> Self {
587        Self {
588            data: indices.into(),
589            #[cfg(not(feature = "shared_dict"))]
590            unique_values,
591            #[cfg(feature = "shared_dict")]
592            dictionary: Dictionary::from(unique_values),
593            null_mask,
594        }
595    }
596
597    /// Materialise the categorical as a dense StringArray<T>.
598    #[inline]
599    pub fn to_string_array(&self) -> StringArray<T> {
600        let len = self.data.len();
601        let mut offsets = Vec64::with_capacity(len + 1);
602        let mut data = Vec64::<u8>::new();
603        offsets.push(T::zero());
604
605        for i in 0..len {
606            if self.is_null(i) {
607                offsets.push(T::from(data.len()).unwrap());
608            } else {
609                let dict_idx = self.data[i].to_usize();
610                let s = &self.unique_values()[dict_idx];
611                data.extend_from_slice(s.as_bytes());
612                offsets.push(T::from(data.len()).unwrap());
613            }
614        }
615
616        StringArray {
617            offsets: offsets.into(),
618            data: data.into(),
619            null_mask: self.null_mask.clone(),
620        }
621    }
622}
623
624impl<T: Integer> MaskedArray for CategoricalArray<T> {
625    type T = T;
626
627    type Container = Buffer<T>;
628
629    type LogicalType = String;
630
631    type CopyType<'a> = &'a str where Self: 'a;
632
633    /// Removes the rows in `[start, end)`, shifting later rows left.
634    /// The dictionary is unchanged: entries left unreferenced remain valid.
635    ///
636    /// # Panics
637    /// Panics if `start > end` or `end > len`.
638    fn delete_range(&mut self, start: usize, end: usize) {
639        self.data.delete_range(start, end);
640        if let Some(mask) = &mut self.null_mask {
641            mask.delete_range(start, end);
642        }
643    }
644
    /// Returns the number of rows, i.e. the length of the indices buffer
    /// (not the number of dictionary entries).
    #[inline]
    fn len(&self) -> usize {
        self.data.len()
    }
649
    /// Returns a reference to the indices buffer (the per-row codes).
    fn data(&self) -> &Self::Container {
        &self.data
    }
653
    /// Returns a mutable reference to the indices buffer (the per-row
    /// codes). Changes made through it are not validated here.
    fn data_mut(&mut self) -> &mut Self::Container {
        &mut self.data
    }
657
658    /// Retrieves the value at the given index, or `None` if null.
659    ///
660    /// The returned `&str` borrows from `self`, tied to the lifetime of `&self`
661    /// via the trait's GAT `CopyType<'a>`.
662    ///
663    /// # Panics
664    /// Panics if `idx >= self.len()` or if `data[idx]` is an invalid index into `unique_values`.
665    #[inline]
666    fn get(&self, idx: usize) -> Option<&str> {
667        if self.is_null(idx) {
668            return None;
669        }
670
671        let dict_idx = self.data[idx].to_usize();
672        Some(&self.unique_values()[dict_idx])
673    }
674
675    /// Sets the value at `idx`. Marks as valid.
676    ///
677    /// Prefer `set_str` when you have a `&str` to avoid the `String` allocation.
678    #[inline]
679    fn set(&mut self, idx: usize, value: Self::LogicalType) {
680        self.set_str(idx, &value)
681    }
682
683    /// Like `get`, but skips bounds checks on both the data and dictionary index.
684    ///
685    /// # Safety
686    /// Caller must ensure:
687    /// - `idx` is within bounds of `self.data`
688    /// - `self.data[idx]` yields a valid index into `self.unique_values`
689    #[inline]
690    unsafe fn get_unchecked(&self, idx: usize) -> Option<&str> {
691        if let Some(mask) = &self.null_mask {
692            if !mask.get(idx) {
693                return None;
694            }
695        }
696
697        let dict_idx = unsafe { self.data.get_unchecked(idx).to_usize().unwrap() };
698        Some(unsafe { self.unique_values().get_unchecked(dict_idx).as_str() })
699    }
700
701    /// Like `set`, but skips all bounds checks.
702    ///
703    /// Prefer `set_str_unchecked` when you have a `&str` to avoid the `String` allocation.
704    #[inline]
705    unsafe fn set_unchecked(&mut self, idx: usize, value: Self::LogicalType) {
706        #[cfg(not(feature = "shared_dict"))]
707        let code: T = add_category(&mut self.unique_values, &value);
708        #[cfg(feature = "shared_dict")]
709        let code: T = self.dictionary.add_cat(&value).expect(
710            "Dictionary category interning failed: cardinality exceeded capacity \
711             of the categorical integer. Consider a CategoricalArray<T> with a \
712             greater `T` capacity.",
713        );
714        let data = self.data.as_mut_slice();
715        data[idx] = code;
716        if let Some(mask) = &mut self.null_mask {
717            mask.set(idx, true);
718        } else {
719            let mut m = Bitmask::new_set_all(self.len(), false);
720            m.set(idx, true);
721            self.null_mask = Some(m);
722        }
723    }
724
725    /// Returns an iterator of `&str` values borrowed from `self`.
726    ///
727    /// Nulls are represented as an empty string `""`.
728    #[inline]
729    fn iter(&self) -> impl Iterator<Item = &str> + '_ {
730        self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
731            if self.is_null(idx) {
732                ""
733            } else {
734                self.unique_values()[dict_idx.to_usize()].as_str()
735            }
736        })
737    }
738
739    /// Returns an iterator over `Option<&str>`, yielding `None` for nulls.
740    ///
741    /// The returned references borrow from `self`.
742    #[inline]
743    fn iter_opt(&self) -> impl Iterator<Item = Option<&str>> + '_ {
744        self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
745            if self.is_null(idx) {
746                None
747            } else {
748                Some(self.unique_values()[dict_idx.to_usize()].as_str())
749            }
750        })
751    }
752
753    /// Returns an iterator of `&str` values for a specified range.
754    /// Nulls yield `""`.
755    #[inline]
756    fn iter_range(&self, offset: usize, len: usize) -> impl Iterator<Item = &str> + '_ {
757        self.data[offset..offset + len]
758            .iter()
759            .enumerate()
760            .map(move |(i, &dict_idx)| {
761                let idx = offset + i;
762                if self.is_null(idx) {
763                    ""
764                } else {
765                    self.unique_values()[dict_idx.to_usize()].as_str()
766                }
767            })
768    }
769
770    /// Returns an iterator over `Option<&str>` values for a specified range.
771    #[inline]
772    fn iter_opt_range(
773        &self,
774        offset: usize,
775        len: usize,
776    ) -> impl Iterator<Item = Option<&str>> + '_ {
777        self.data[offset..offset + len]
778            .iter()
779            .enumerate()
780            .map(move |(i, &dict_idx)| {
781                let idx = offset + i;
782                if self.is_null(idx) {
783                    None
784                } else {
785                    Some(self.unique_values()[dict_idx.to_usize()].as_str())
786                }
787            })
788    }
789
790    /// Append string, adding to dictionary if new.
791    ///
792    /// Prefer `push_str` when you have a `&str` to avoid the `String` allocation;
793    /// it also returns the assigned dictionary code.
794    #[inline]
795    fn push(&mut self, value: Self::LogicalType) {
796        self.push_str(&value);
797    }
798
799    /// Append string, adding to dictionary if new, without bounds checking.
800    ///
801    /// Prefer `push_str_unchecked` when you have a `&str` to avoid the `String` allocation.
802    ///
803    /// # Safety
804    /// - The caller must ensure `self.data` has sufficient capacity (i.e., already resized).
805    /// - `self.null_mask`, if present, must also have space for this index.
806    /// - This method assumes exclusive mutable access and no concurrent modification.
807    #[inline]
808    unsafe fn push_unchecked(&mut self, value: Self::LogicalType) {
809        self.push_str(&value);
810    }
811
812    /// Returns a logical slice of the categorical array [offset, offset+len)
813    /// as a new `CategoricalArray` object.
814    ///
815    /// For a non-copy slice view, use `slice` from the parent Array object
816    fn slice_clone(&self, offset: usize, len: usize) -> Self {
817        assert!(
818            offset + len <= self.data.len(),
819            "slice window out of bounds"
820        );
821
822        let data = self.data[offset..offset + len].to_vec_in(Vec64Alloc::default());
823        let null_mask = self
824            .null_mask
825            .as_ref()
826            .map(|nm| nm.slice_clone(offset, len));
827        Self {
828            data: Vec64(data).into(),
829            #[cfg(not(feature = "shared_dict"))]
830            unique_values: self.unique_values.clone(),
831            #[cfg(feature = "shared_dict")]
832            dictionary: self.dictionary.clone(),
833            null_mask,
834        }
835    }
836
837    /// Borrows a `CategoricalArray` with its window parameters
838    /// to a `CategoricalArrayView<'a>` alias. Like a slice, but
839    /// retains access to the `&CategoricalArray`.
840    ///
841    /// `Offset` and `Length` are `usize` aliases.
842    #[inline(always)]
843    fn tuple_ref<'a>(&'a self, offset: Offset, len: Length) -> CategoricalAVT<'a, T> {
844        (&self, offset, len)
845    }
846
847    /// Returns the total number of nulls.
848    fn null_count(&self) -> usize {
849        self.null_mask
850            .as_ref()
851            .map(|m| m.count_zeros())
852            .unwrap_or(0)
853    }
854
855    /// Resizes the data in-place so that `len` is equal to `new_len`.
856    fn resize(&mut self, n: usize, value: Self::LogicalType) {
857        let current_len = self.len();
858
859        #[cfg(not(feature = "shared_dict"))]
860        let encoded: T = add_category(&mut self.unique_values, &value);
861        #[cfg(feature = "shared_dict")]
862        let encoded: T = self.dictionary.add_cat(&value).expect(
863            "Dictionary category interning failed: cardinality exceeded capacity \
864             of the categorical integer. Consider a CategoricalArray<T> with a \
865             greater `T` capacity.",
866        );
867
868        if n > current_len {
869            self.data.reserve(n - current_len);
870            for _ in current_len..n {
871                self.data.push(encoded);
872            }
873        } else if n < current_len {
874            self.data.truncate(n);
875        }
876    }
877
878    /// Returns a reference to the null bitmask
879    fn null_mask(&self) -> Option<&Bitmask> {
880        self.null_mask.as_ref()
881    }
882
883    /// Returns a mutable reference to the null bitmask
884    fn null_mask_mut(&mut self) -> Option<&mut Bitmask> {
885        self.null_mask.as_mut()
886    }
887
888    /// Sets the bitmask from a supplied one or `None`
889    fn set_null_mask(&mut self, mask: Option<Bitmask>) {
890        self.null_mask = mask
891    }
892
893    /// Appends all values (and null mask if present) from `other` to `self`.
894    fn append_array(&mut self, other: &Self) {
895        let orig_len = self.len();
896        let other_len = other.len();
897        if other_len == 0 { return; }
898
899        self.data_mut().extend_from_slice(other.data());
900
901        match (self.null_mask_mut(), other.null_mask()) {
902            (Some(self_mask), Some(other_mask)) => {
903                self_mask.extend_from_bitmask(other_mask);
904            }
905            (Some(self_mask), None) => {
906                self_mask.resize(orig_len + other_len, true);
907            }
908            (None, Some(other_mask)) => {
909                let mut mask = Bitmask::new_set_all(orig_len, true);
910                mask.extend_from_bitmask(other_mask);
911                self.set_null_mask(Some(mask));
912            }
913            (None, None) => {}
914        }
915    }
916
917    fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), MinarrowError> {
918        if len == 0 { return Ok(()); }
919        if offset + len > other.len() {
920            return Err(MinarrowError::IndexError(
921                format!("append_range: offset {} + len {} exceeds source length {}", offset, len, other.len())
922            ));
923        }
924        let orig_len = self.len();
925
926        self.data_mut().extend_from_slice(&other.data()[offset..offset + len]);
927
928        match (self.null_mask_mut(), other.null_mask()) {
929            (Some(self_mask), Some(other_mask)) => {
930                self_mask.extend_from_bitmask_range(other_mask, offset, len);
931            }
932            (Some(self_mask), None) => {
933                self_mask.resize(orig_len + len, true);
934            }
935            (None, Some(other_mask)) => {
936                let mut mask = Bitmask::new_set_all(orig_len, true);
937                mask.extend_from_bitmask_range(other_mask, offset, len);
938                self.set_null_mask(Some(mask));
939            }
940            (None, None) => {}
941        }
942        Ok(())
943    }
944
945    /// Inserts all values from `other` into `self` at the specified index.
946    ///
947    /// This is an O(n) operation for CategoricalArray.
948    fn insert_rows(&mut self, index: usize, other: &Self) -> Result<(), MinarrowError> {
949        use crate::enums::error::MinarrowError;
950
951        let orig_len = self.len();
952        let other_len = other.len();
953
954        if index > orig_len {
955            return Err(MinarrowError::IndexError(format!(
956                "Index {} out of bounds for array of length {}",
957                index, orig_len
958            )));
959        }
960
961        if other_len == 0 {
962            return Ok(());
963        }
964
965        // Map each of `other`'s dictionary codes to the code that the
966        // same string will have in `self`. Existing strings are looked
967        // up; novel strings are added.
968        #[cfg(not(feature = "shared_dict"))]
969        let index_map: Vec<T> = {
970            let mut m = Vec::with_capacity(other.unique_values.len());
971            for other_value in other.unique_values.iter() {
972                m.push(add_category(&mut self.unique_values, other_value));
973            }
974            m
975        };
976        #[cfg(feature = "shared_dict")]
977        let index_map: Vec<T> = {
978            let mut m = Vec::with_capacity(other.dictionary.len());
979            for other_value in other.dictionary.values().iter() {
980                let code = match self.dictionary.lookup(other_value) {
981                    Some(code) => code,
982                    None => self.dictionary.add_cat(other_value)?,
983                };
984                m.push(code);
985            }
986            m
987        };
988
989        // Insert and remap other's data
990        let new_len = orig_len + other_len;
991        self.data.resize(new_len, T::from_usize(0));
992
993        // Shift existing elements using unchecked operations
994        for i in (index..orig_len).rev() {
995            unsafe {
996                let val = *self.data.as_ref().get_unchecked(i);
997                *self.data.as_mut().get_unchecked_mut(i + other_len) = val;
998            }
999        }
1000
1001        // Copy and remap other's data
1002        for i in 0..other_len {
1003            unsafe {
1004                let other_idx = *other.data.as_ref().get_unchecked(i);
1005                let remapped_idx = *index_map.get_unchecked(other_idx.to_usize());
1006                *self.data.as_mut().get_unchecked_mut(index + i) = remapped_idx;
1007            }
1008        }
1009
1010        // Handle null masks with unchecked operations
1011        match (self.null_mask.as_mut(), other.null_mask.as_ref()) {
1012            (Some(self_mask), Some(other_mask)) => {
1013                let mut new_mask = Bitmask::new_set_all(new_len, true);
1014                for i in 0..index {
1015                    unsafe {
1016                        new_mask.set_unchecked(i, self_mask.get_unchecked(i));
1017                    }
1018                }
1019                for i in 0..other_len {
1020                    unsafe {
1021                        new_mask.set_unchecked(index + i, other_mask.get_unchecked(i));
1022                    }
1023                }
1024                for i in index..orig_len {
1025                    unsafe {
1026                        new_mask.set_unchecked(other_len + i, self_mask.get_unchecked(i));
1027                    }
1028                }
1029                *self_mask = new_mask;
1030            }
1031            (Some(self_mask), None) => {
1032                let mut new_mask = Bitmask::new_set_all(new_len, true);
1033                for i in 0..index {
1034                    unsafe {
1035                        new_mask.set_unchecked(i, self_mask.get_unchecked(i));
1036                    }
1037                }
1038                for i in index..orig_len {
1039                    unsafe {
1040                        new_mask.set_unchecked(other_len + i, self_mask.get_unchecked(i));
1041                    }
1042                }
1043                *self_mask = new_mask;
1044            }
1045            (None, Some(other_mask)) => {
1046                let mut new_mask = Bitmask::new_set_all(new_len, true);
1047                for i in 0..other_len {
1048                    unsafe {
1049                        new_mask.set_unchecked(index + i, other_mask.get_unchecked(i));
1050                    }
1051                }
1052                self.null_mask = Some(new_mask);
1053            }
1054            (None, None) => {}
1055        }
1056
1057        Ok(())
1058    }
1059
1060    /// Splits the CategoricalArray at the specified index, consuming self and returning two arrays.
1061    fn split(mut self, index: usize) -> Result<(Self, Self), MinarrowError> {
1062        use crate::enums::error::MinarrowError;
1063
1064        if index == 0 || index >= self.len() {
1065            return Err(MinarrowError::IndexError(format!(
1066                "Split index {} out of valid range (0, {})",
1067                index,
1068                self.len()
1069            )));
1070        }
1071
1072        // Split the data buffer
1073        let after_data = self.data.split_off(index);
1074
1075        // Split null mask
1076        let after_mask = self.null_mask.as_mut().map(|mask| mask.split_off(index));
1077
1078        // Both arrays share the same dictionary handle (cheap clone:
1079        // a `Vec64` clone under no `shared_dict`; an Arc bump under it).
1080        let after = CategoricalArray {
1081            data: after_data,
1082            #[cfg(not(feature = "shared_dict"))]
1083            unique_values: self.unique_values.clone(),
1084            #[cfg(feature = "shared_dict")]
1085            dictionary: self.dictionary.clone(),
1086            null_mask: after_mask,
1087        };
1088
1089        Ok((self, after))
1090    }
1091
1092    /// Extends the categorical array from an iterator with pre-allocated capacity.
1093    /// Reserves capacity in the underlying index buffer to avoid reallocations
1094    /// during bulk insertion. Dictionary is expanded as new unique values are encountered.
1095    fn extend_from_iter_with_capacity<I>(&mut self, iter: I, additional_capacity: usize)
1096    where
1097        I: Iterator<Item = Self::LogicalType>,
1098    {
1099        self.data.reserve(additional_capacity);
1100        let values: Vec<Self::LogicalType> = iter.collect();
1101        let start_len = self.data.len();
1102        // Extend the length to accommodate new elements
1103        self.data.resize(start_len + values.len(), T::from_usize(0));
1104        // Extend null mask if it exists
1105        if let Some(mask) = &mut self.null_mask {
1106            mask.resize(start_len + values.len(), true);
1107        }
1108        for (i, value) in values.iter().enumerate() {
1109            let owned = value.to_string();
1110            #[cfg(not(feature = "shared_dict"))]
1111            let code: T = add_category(&mut self.unique_values, &owned);
1112            #[cfg(feature = "shared_dict")]
1113            let code: T = self.dictionary.add_cat(&owned).expect(
1114                "Dictionary category interning failed: cardinality exceeded capacity \
1115                 of the categorical integer. Consider a CategoricalArray<T> with a \
1116                 greater `T` capacity.",
1117            );
1118            {
1119                let data = self.data.as_mut_slice();
1120                data[start_len + i] = code;
1121            }
1122            if let Some(mask) = &mut self.null_mask {
1123                unsafe { mask.set_unchecked(start_len + i, true) };
1124            }
1125        }
1126    }
1127
1128    /// Extends the categorical array from a slice of string values.
1129    /// Pre-allocates capacity for the index buffer and efficiently processes
1130    /// each string through the internal dictionary for optimal categorical encoding.
1131    fn extend_from_slice(&mut self, slice: &[Self::LogicalType]) {
1132        let start_len = self.data.len();
1133        self.data.reserve(slice.len());
1134        // Extend the length to accommodate new elements
1135        self.data.resize(start_len + slice.len(), T::from_usize(0));
1136        // Extend null mask if it exists
1137        if let Some(mask) = &mut self.null_mask {
1138            mask.resize(start_len + slice.len(), true);
1139        }
1140        for (i, value) in slice.iter().enumerate() {
1141            let owned = value.to_string();
1142            #[cfg(not(feature = "shared_dict"))]
1143            let code: T = add_category(&mut self.unique_values, &owned);
1144            #[cfg(feature = "shared_dict")]
1145            let code: T = self.dictionary.add_cat(&owned).expect(
1146                "Dictionary category interning failed: cardinality exceeded capacity \
1147                 of the categorical integer. Consider a CategoricalArray<T> with a \
1148                 greater `T` capacity.",
1149            );
1150            {
1151                let data = self.data.as_mut_slice();
1152                data[start_len + i] = code;
1153            }
1154            if let Some(mask) = &mut self.null_mask {
1155                unsafe { mask.set_unchecked(start_len + i, true) };
1156            }
1157        }
1158    }
1159
1160    /// Creates a new categorical array filled with the specified string repeated `count` times.
1161    /// The dictionary will contain only one unique value, making this highly memory-efficient
1162    /// for repeated categorical values.
1163    fn fill(value: Self::LogicalType, count: usize) -> Self {
1164        let mut array = CategoricalArray::<T>::from_vec64(crate::Vec64::with_capacity(count), None);
1165        // Extend the length to accommodate new elements
1166        array.data.resize(count, T::from_usize(0));
1167        // Fresh array; dictionary holds one entry once we intern.
1168        let owned_value = value.to_string();
1169        #[cfg(not(feature = "shared_dict"))]
1170        let dict_index: T = add_category(&mut array.unique_values, &owned_value);
1171        #[cfg(feature = "shared_dict")]
1172        let dict_index: T = array.dictionary.add_cat(&owned_value).expect(
1173            "Dictionary category interning failed: cardinality exceeded capacity \
1174             of the categorical integer. Consider a CategoricalArray<T> with a \
1175             greater `T` capacity.",
1176        );
1177        // Now use unchecked operations since we have proper length
1178        for i in 0..count {
1179            {
1180                let data = array.data.as_mut_slice();
1181                data[i] = dict_index;
1182            }
1183        }
1184        array
1185    }
1186}
1187
#[cfg(feature = "parallel_proc")]
impl<T: Integer + Send + Sync> CategoricalArray<T> {
    /// Parallel iterator over the raw dictionary codes (`&T`), one per row.
    ///
    /// Codes are yielded exactly as stored: the null mask is not consulted
    /// and nothing is resolved to a string. Use `par_iter_range` or
    /// `par_iter_opt` for resolved strings.
    #[inline]
    pub fn par_iter(&self) -> rayon::slice::Iter<'_, T> {
        self.data.par_iter()
    }

    /// Parallel mutable iterator over the raw dictionary codes (`&mut T`).
    ///
    /// Writing a code does not touch the dictionary or the null mask; the
    /// caller is responsible for only storing codes the dictionary contains.
    #[inline]
    pub fn par_iter_mut(&mut self) -> rayon::slice::IterMut<'_, T> {
        self.data.par_iter_mut()
    }

    /// Parallel iterator over Option<&str> (None if null).
    ///
    /// Covers the whole array; equivalent to `par_iter_range_opt(0, len)`.
    #[inline]
    pub fn par_iter_opt(&self) -> impl ParallelIterator<Item = Option<&str>> + '_ {
        self.par_iter_range_opt(0, self.len())
    }

    /// `[start,end)` -> `&str` (null ⇒ `""`)
    ///
    /// The range is verified with a `debug_assert!`; in release builds an
    /// out-of-range row panics on the indexing itself.
    #[inline]
    pub fn par_iter_range(
        &self,
        start: usize,
        end: usize,
    ) -> impl ParallelIterator<Item = &str> + '_ {
        use rayon::prelude::*;
        let null_mask = self.null_mask.as_ref();
        let dict = self.unique_values();
        let idx_buf = &self.data;
        debug_assert!(start <= end && end <= idx_buf.len());
        (start..end).into_par_iter().map(move |i| {
            // The mask is checked first so the code of a null row is never
            // used to index the dictionary.
            if null_mask.map(|m| !m.get(i)).unwrap_or(false) {
                ""
            } else {
                dict[idx_buf[i].to_usize()].as_str()
            }
        })
    }

    /// `[start,end)` -> `Option<&str>` (null ⇒ `None`)
    ///
    /// The range is verified with a `debug_assert!`; in release builds an
    /// out-of-range row panics on the indexing itself.
    #[inline]
    pub fn par_iter_range_opt(
        &self,
        start: usize,
        end: usize,
    ) -> impl ParallelIterator<Item = Option<&str>> + '_ {
        use rayon::prelude::*;
        let null_mask = self.null_mask.as_ref();
        let dict = self.unique_values();
        let idx_buf = &self.data;
        debug_assert!(start <= end && end <= idx_buf.len());
        (start..end).into_par_iter().map(move |i| {
            // The mask is checked first so the code of a null row is never
            // used to index the dictionary.
            if null_mask.map(|m| !m.get(i)).unwrap_or(false) {
                None
            } else {
                Some(dict[idx_buf[i].to_usize()].as_str())
            }
        })
    }

    /// `[start,end)` -> `&str` (null ⇒ `""`) - no bounds checks
    ///
    /// # Caller contract
    /// Although this function is not marked `unsafe`, it performs unchecked
    /// reads. The caller must guarantee that `start <= end <= self.len()`,
    /// that the null mask (if any) covers every row in the range, and that
    /// every non-null row holds a code inside the dictionary. Violations are
    /// caught by a `debug_assert!` on the range in debug builds only.
    #[inline]
    pub fn par_iter_range_unchecked(
        &self,
        start: usize,
        end: usize,
    ) -> impl rayon::prelude::ParallelIterator<Item = &str> + '_ {
        use rayon::prelude::*;
        let null_mask = self.null_mask.as_ref();
        let dict = self.unique_values();
        let idx_buf = &self.data;
        debug_assert!(start <= end && end <= idx_buf.len());
        (start..end).into_par_iter().map(move |i| {
            if let Some(mask) = null_mask {
                // SAFETY: caller guarantees the mask covers row `i`.
                if !unsafe { mask.get_unchecked(i) } {
                    return "";
                }
            }
            // SAFETY: caller guarantees `i < self.len()`.
            let idx = unsafe { *idx_buf.get_unchecked(i) }.to_usize();
            // SAFETY: caller guarantees non-null rows hold in-dictionary codes.
            unsafe { dict.get_unchecked(idx).as_str() }
        })
    }

    /// `[start,end)` -> `Option<&str>` -  no bounds checks
    ///
    /// # Caller contract
    /// Although this function is not marked `unsafe`, it performs unchecked
    /// reads. The caller must guarantee that `start <= end <= self.len()`,
    /// that the null mask (if any) covers every row in the range, and that
    /// every non-null row holds a code inside the dictionary. Violations are
    /// caught by a `debug_assert!` on the range in debug builds only.
    #[inline]
    pub fn par_iter_range_opt_unchecked(
        &self,
        start: usize,
        end: usize,
    ) -> impl rayon::prelude::ParallelIterator<Item = Option<&str>> + '_ {
        use rayon::prelude::*;
        let null_mask = self.null_mask.as_ref();
        let dict = self.unique_values();
        let idx_buf = &self.data;
        debug_assert!(start <= end && end <= idx_buf.len());
        (start..end).into_par_iter().map(move |i| {
            if let Some(mask) = null_mask {
                // SAFETY: caller guarantees the mask covers row `i`.
                if !unsafe { mask.get_unchecked(i) } {
                    return None;
                }
            }
            // SAFETY: caller guarantees `i < self.len()`.
            let idx = unsafe { *idx_buf.get_unchecked(i) }.to_usize();
            // SAFETY: caller guarantees non-null rows hold in-dictionary codes.
            Some(unsafe { dict.get_unchecked(idx).as_str() })
        })
    }
}
1294
#[cfg(feature = "chunked")]
impl<'a, T: Integer> crate::traits::consolidate::Consolidate
    for Vec<crate::aliases::CategoricalAVT<'a, T>>
{
    type Output = CategoricalArray<T>;

    /// Merges a list of `(array, offset, len)` view tuples into a single
    /// contiguous `CategoricalArray<T>`.
    ///
    /// Two strategies are used:
    /// - Under `shared_dict`, when every view's dictionary `shares_with` the
    ///   first one, the code windows are concatenated directly and the
    ///   result is bound to a clone of that same dictionary handle.
    /// - Otherwise each view is `slice_clone`d and the pieces are folded
    ///   together with `Concatenate::concat`, which reconciles the
    ///   dictionaries.
    ///
    /// # Panics
    /// Panics when called on an empty vector, or when a `concat` step fails.
    fn consolidate(self) -> CategoricalArray<T> {
        use crate::traits::masked_array::MaskedArray;

        assert!(!self.is_empty(), "consolidate() called on empty Vec<CategoricalAVT>");

        // Direct-copy path, only compiled in with `shared_dict`.
        #[cfg(feature = "shared_dict")]
        {
            use crate::structs::bitmask::Bitmask;
            use crate::traits::consolidate::extend_null_mask;

            let anchor_dict = &self[0].0.dictionary;
            let single_dictionary = self
                .iter()
                .all(|(arr, _, _)| arr.dictionary.shares_with(anchor_dict));

            if single_dictionary {
                let rows_total: usize = self.iter().map(|(_, _, len)| *len).sum();
                let any_mask = self.iter().any(|(arr, _, _)| arr.null_mask.is_some());

                let mut merged_codes: Vec64<T> = Vec64::with_capacity(rows_total);
                // A mask is only materialised when some chunk carries one.
                let mut merged_mask: Option<Bitmask> = any_mask.then(Bitmask::default);
                let mut written = 0;

                for (arr, offset, len) in &self {
                    let (start, count) = (*offset, *len);
                    let window: &[T] = &arr.data[start..start + count];
                    merged_codes.extend_from_slice(window);
                    extend_null_mask(&mut merged_mask, written, arr.null_mask(), start, count);
                    written += count;
                }

                // Cloning the handle keeps the result on the same dictionary
                // as its source chunks.
                return CategoricalArray::<T>::new_existing_dict(
                    merged_codes,
                    anchor_dict.clone(),
                    merged_mask,
                );
            }
        }

        // General path: materialise each window and let `concat` reconcile
        // the dictionaries pairwise, left to right.
        let mut views = self.into_iter();
        let (head, head_off, head_len) = views.next().expect("non-empty");
        views.fold(
            head.slice_clone(head_off, head_len),
            |merged, (arr, off, len)| {
                merged
                    .concat(arr.slice_clone(off, len))
                    .expect("Failed to concatenate CategoricalArray")
            },
        )
    }
}
1379
1380impl<T: Integer> Shape for CategoricalArray<T> {
1381    fn shape(&self) -> ShapeDim {
1382        ShapeDim::Rank1(self.len())
1383    }
1384}
1385
1386impl<T: Integer> Concatenate for CategoricalArray<T> {
1387    /// Concatenates `other` onto `self` with three dictionary-handling paths:
1388    ///
1389    /// 1. **Shared Arc** (`Arc::ptr_eq`): both batches already point at the
1390    ///    same dictionary. Codes are mutually meaningful, so this is a pure
1391    ///    buffer concat with no dictionary work.
1392    /// 2. **Prefix**: one dictionary is a prefix of the other. Codes from
1393    ///    the shorter side decode identically against the longer side, so
1394    ///    the result adopts the longer Arc and the data buffer is appended
1395    ///    without remapping.
1396    /// 3. **Divergent**: both dictionaries grew independently. Append the
1397    ///    missing entries into `self`'s dictionary via `intern` (O(1) per
1398    ///    string) and remap `other`'s codes into the combined space.
1399    fn concat(
1400        mut self,
1401        other: Self,
1402    ) -> core::result::Result<Self, crate::enums::error::MinarrowError> {
1403        let orig_len = self.len();
1404        let other_len = other.len();
1405
1406        if other_len == 0 {
1407            return Ok(self);
1408        }
1409
1410        #[cfg(feature = "shared_dict")]
1411        {
1412            let share = self.dictionary.shares_with(&other.dictionary);
1413            if share {
1414                // Same dictionary instance: pure buffer concat.
1415                self.data.extend_from_slice(other.data.as_ref());
1416            } else if other.dictionary.values().len() <= self.dictionary.values().len()
1417                && other.dictionary.is_prefix_of(&self.dictionary)
1418            {
1419                // `other`'s codes are already valid against the longer `self` dictionary.
1420                self.data.extend_from_slice(other.data.as_ref());
1421            } else if self.dictionary.is_prefix_of(&other.dictionary) {
1422                // `self`'s codes are valid against the longer `other` dictionary.
1423                // Adopt `other`'s dictionary and append `other`'s data verbatim.
1424                self.dictionary = other.dictionary.clone();
1425                self.data.extend_from_slice(other.data.as_ref());
1426            } else {
1427                // Divergent: bring missing entries from other into self's
1428                // dictionary, then remap other's codes through the union.
1429                let n_other_codes = other.dictionary.values().len();
1430                let mut remap: Vec<T> = Vec::with_capacity(n_other_codes);
1431                for other_value in other.dictionary.values().iter() {
1432                    let code = self.dictionary.add_cat(other_value)?;
1433                    remap.push(code);
1434                }
1435                for &other_code in other.data.iter() {
1436                    let mapped = remap[other_code.to_usize()];
1437                    self.data.push(mapped);
1438                }
1439            }
1440        }
1441        #[cfg(not(feature = "shared_dict"))]
1442        {
1443            // Without `shared_dict` each categorical owns its dictionary
1444            // outright; merge by interning every entry of `other`'s
1445            // dictionary into `self`'s, then remap `other`'s codes.
1446            let mut remap: Vec<T> = Vec::with_capacity(other.unique_values.len());
1447            for other_value in other.unique_values.iter() {
1448                remap.push(add_category(&mut self.unique_values, other_value));
1449            }
1450            for &other_code in other.data.iter() {
1451                let mapped = remap[other_code.to_usize()];
1452                self.data.push(mapped);
1453            }
1454        }
1455
1456        // Merge null masks
1457        match (self.null_mask_mut(), other.null_mask()) {
1458            (Some(self_mask), Some(other_mask)) => {
1459                self_mask.extend_from_bitmask(other_mask);
1460            }
1461            (Some(self_mask), None) => {
1462                self_mask.resize(orig_len + other_len, true);
1463            }
1464            (None, Some(other_mask)) => {
1465                let mut mask = Bitmask::new_set_all(orig_len + other_len, true);
1466                for i in 0..other_len {
1467                    mask.set(orig_len + i, other_mask.get(i));
1468                }
1469                self.set_null_mask(Some(mask));
1470            }
1471            (None, None) => {
1472                // No mask in either: nothing to do.
1473            }
1474        }
1475
1476        Ok(self)
1477    }
1478}
1479
// NOTE(review): presumably generates the Arc-wrapped `MaskedArray` plumbing
// for `CategoricalArray<T>`; the macro is defined elsewhere, so confirm the
// generated items against its definition. The arguments declare that rows are
// logically `String`, are read back as `&'a str`, and that codes of type `T`
// live in a `Buffer<T>`.
impl_arc_masked_array!(
    Inner = CategoricalArray<T>,
    T = T,
    Container = Buffer<T>,
    LogicalType = String,
    CopyType = &'a str,
    BufferT = T,
    Variant = TextArray,
    Bound = Integer,
);

// NOTE(review): presumably provides reference/deref conversions for
// `CategoricalArray<T>` where `T: Integer` — confirm against the macro
// definition.
impl_array_ref_deref!(CategoricalArray<T>: Integer);
1492
1493impl<T> Display for CategoricalArray<T>
1494where
1495    T: Integer + std::fmt::Debug,
1496{
1497    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1498        let len = self.len();
1499        let null_count = self.null_count();
1500        let dict_size = self.unique_values().len();
1501
1502        writeln!(
1503            f,
1504            "CategoricalArray [{} values]s] (dtype: categorical[str], nulls: {}, dictionary size: {})",
1505            len, null_count, dict_size
1506        )?;
1507
1508        const MAX_PREVIEW: usize = 25;
1509        write!(f, "[")?;
1510        for i in 0..usize::min(len, MAX_PREVIEW) {
1511            if i > 0 {
1512                write!(f, ", ")?;
1513            }
1514            match self.get(i) {
1515                Some(s) => write!(f, "\"{}\"", s)?,
1516                None => write!(f, "null")?,
1517            }
1518        }
1519        if len > MAX_PREVIEW {
1520            write!(f, ", … ({} total)", len)?;
1521        }
1522        write!(f, "]")
1523    }
1524}
1525
1526#[cfg(test)]
1527mod tests {
1528
1529    use super::*;
1530    use crate::traits::masked_array::MaskedArray;
1531    use crate::vec64;
1532
1533    fn bm(bits: &[bool]) -> Bitmask {
1534        let mut m = Bitmask::new_set_all(bits.len(), false);
1535        for (i, &b) in bits.iter().enumerate() {
1536            m.set(i, b);
1537        }
1538        m
1539    }
1540
1541    #[test]
1542    fn empty_new() {
1543        let arr = CategoricalArray::<u8>::default();
1544        assert!(arr.is_empty());
1545        assert!(arr.unique_values().is_empty());
1546    }
1547
1548    #[test]
1549    fn test_new_and_with_capacity() {
1550        let mut arr = CategoricalArray::<u32>::with_capacity(8, None, true);
1551        assert_eq!(arr.len(), 0);
1552        assert!(arr.data.capacity() >= 8);
1553        assert!(arr.null_mask.is_some());
1554
1555        // Reserved null-mask slots must default to valid (1).
1556        assert_eq!(arr.null_count(), 0);
1557
1558        arr.push_str("alpha");
1559        arr.push_str("beta");
1560        assert_eq!(arr.null_count(), 0);
1561
1562        arr.push_null();
1563        assert_eq!(arr.null_count(), 1);
1564    }
1565
1566    #[test]
1567    fn push_and_get() {
1568        let mut arr = CategoricalArray::<u8>::default();
1569        let i1 = arr.push_str("hello");
1570        let i2 = arr.push_str("world");
1571        let i3 = arr.push_str("hello");
1572        assert_eq!(i1, 0);
1573        assert_eq!(i2, 1);
1574        assert_eq!(i3, 0);
1575        assert_eq!(arr.indices(), &[0u8, 1, 0]);
1576        assert_eq!(arr.unique_values(), &["hello", "world".into()]);
1577        assert_eq!(arr.get(1), Some("world"));
1578    }
1579
1580    #[test]
1581    fn null_handling() {
1582        let mut arr = CategoricalArray::<u16>::default();
1583        arr.push_str("a");
1584        arr.push_null();
1585        arr.push_str("b");
1586        assert_eq!(arr.len(), 3);
1587        assert_eq!(arr.get(0), Some("a"));
1588        assert_eq!(arr.get(1), None);
1589        assert!(arr.is_null(1));
1590        assert_eq!(arr.get(2), Some("b"));
1591    }
1592
1593    #[test]
1594    fn new_tolerates_out_of_range_indices_at_null_positions() {
1595        // Mirrors pandas' Arrow export of a Categorical with NA: the indices
1596        // buffer holds -1 at null slots, which becomes 255 when the index
1597        // type is u8. The null mask correctly marks the slot invalid.
1598        let data: Vec64<u8> = vec64![0, 1, 255, 0];
1599        let unique_values: Vec64<String> =
1600            vec64!["Yes".to_string(), "No".to_string()];
1601        let mask = bm(&[true, true, false, true]);
1602
1603        let arr = CategoricalArray::<u8>::new(data, unique_values, Some(mask));
1604
1605        assert_eq!(arr.len(), 4);
1606        assert_eq!(arr.get_str(0), Some("Yes"));
1607        assert_eq!(arr.get_str(1), Some("No"));
1608        assert_eq!(arr.get_str(2), None);
1609        assert_eq!(arr.get_str(3), Some("Yes"));
1610    }
1611
1612    #[test]
1613    #[should_panic(expected = "Index 255 out of bounds")]
1614    fn new_still_rejects_out_of_range_indices_at_valid_positions() {
1615        // Same shape as above, but the offending slot is marked valid -
1616        // construction must still fail loudly.
1617        let data: Vec64<u8> = vec64![0, 1, 255, 0];
1618        let unique_values: Vec64<String> =
1619            vec64!["Yes".to_string(), "No".to_string()];
1620        let mask = bm(&[true, true, true, true]);
1621
1622        let _ = CategoricalArray::<u8>::new(data, unique_values, Some(mask));
1623    }
1624
1625    #[test]
1626    fn set_overwrite_and_new() {
1627        let mut arr = CategoricalArray::<u32>::default();
1628        arr.push_str("x");
1629        arr.push_str("y");
1630        arr.set_str(1, "x");
1631        assert_eq!(arr.get(1), Some("x"));
1632        arr.set_str(0, "zebra");
1633        assert!(arr.unique_values().contains(&"zebra".to_string()));
1634        assert_eq!(arr.get(0), Some("zebra"));
1635    }
1636
1637    #[test]
1638    fn extend_and_builder() {
1639        let mut arr = CategoricalArray::<u8>::default();
1640        arr.extend(["a", "b", "a", "c"].iter().copied());
1641        assert_eq!(arr.len(), 4);
1642        assert_eq!(arr.get(2), Some("a"));
1643
1644        let built = CategoricalArray::<u8>::from_values(vec!["k", "l", "k"]);
1645        assert_eq!(built.indices(), &[0u8, 1, 0]);
1646        assert_eq!(built.get(1), Some("l"));
1647    }
1648
1649    #[test]
1650    fn set_null_after_push() {
1651        let mut arr = CategoricalArray::<u8>::default();
1652        arr.push_str("one");
1653        arr.push_str("two");
1654        arr.set_null(1);
1655        assert!(arr.is_null(1));
1656        assert_eq!(arr.get(1), None);
1657    }
1658
1659    #[test]
1660    fn test_categorical_iter() {
1661        let arr =
1662            CategoricalArray::from_slices(&[0u32, 1, 2], &["a".into(), "b".into(), "c".into()]);
1663        let vals: Vec<_> = arr.iter().collect();
1664        assert_eq!(vals, vec!["a", "b", "c"]);
1665        let opt: Vec<_> = arr.iter_str_opt().collect();
1666        assert_eq!(opt, vec![Some("a"), Some("b"), Some("c")]);
1667    }
1668
1669    #[test]
1670    fn test_categorical_array_slice() {
1671        let arr = CategoricalArray::<u8>::new(
1672            vec64![2u8, 1, 0],
1673            vec64!["green".to_string(), "blue".to_string(), "red".to_string()],
1674            Some(Bitmask::from_bools(&[false, true, true])),
1675        );
1676        let sliced = arr.slice_clone(0, 3);
1677        assert_eq!(
1678            sliced.iter_str_opt().collect::<Vec<_>>(),
1679            vec![None, Some("blue"), Some("green")]
1680        );
1681    }
1682
1683    #[test]
1684    fn test_categorical_set_and_get() {
1685        let mut arr = CategoricalArray::<u32>::from_values(["a", "b", "c"].iter().cloned());
1686        // initial null mask none => all valid
1687        assert!(arr.null_mask.is_none());
1688
1689        // set index 1 to "d" (new entry)
1690        arr.set_str(1, "d");
1691        assert_eq!(arr.get(1), Some("d"));
1692        // dictionary should have "d" appended
1693        assert_eq!(arr.unique_values().len(), 4);
1694        assert!(arr.unique_values().contains(&"d".to_string()));
1695
1696        // set index 2 to existing "a"
1697        arr.set_str(2, "a");
1698        assert_eq!(arr.get(2), Some("a"));
1699        // dictionary length unchanged
1700        assert_eq!(arr.unique_values().len(), 4);
1701    }
1702
1703    #[test]
1704    fn test_categorical_set_unchecked_and_null_mask() {
1705        let mut arr = CategoricalArray::<u32>::from_values(["x", "y", "z"].iter().cloned());
1706        arr.null_mask = Some(bm(&[true, false, true]));
1707
1708        // unsafe unchecked set index 1 to "w"
1709        unsafe { arr.set_str_unchecked(1, "w") };
1710        // now index 1 should be "w"
1711        assert_eq!(arr.get(1), Some("w"));
1712        // null mask at 1 now true
1713        let mask = arr.null_mask.as_ref().unwrap();
1714        assert!(mask.get(1));
1715        // dictionary should contain "w"
1716        assert!(arr.unique_values().contains(&"w".to_string()));
1717    }
1718
1719    #[test]
1720    #[should_panic(expected = "index out of bounds")]
1721    fn test_categorical_set_oob() {
1722        let mut arr = CategoricalArray::<u32>::from_values(["foo"].iter().cloned());
1723        // this should panic
1724        arr.set_str(5, "bar");
1725    }
1726
1727    #[test]
1728    fn test_to_string_array() {
1729        let unique = vec64!["foo".to_string(), "bar".to_string()];
1730        let data = vec64![0u32, 0u32, 1u32];
1731        let mut mask = Bitmask::new_set_all(3, true);
1732        mask.set(1, false); // second entry is null
1733
1734        let cat = CategoricalArray {
1735            data: data.into(),
1736            #[cfg(not(feature = "shared_dict"))]
1737            unique_values: unique,
1738            #[cfg(feature = "shared_dict")]
1739            dictionary: Dictionary::from(unique),
1740            null_mask: Some(mask),
1741        };
1742
1743        let str_arr = cat.to_string_array();
1744
1745        assert_eq!(str_arr.get(0), Some("foo"));
1746        assert_eq!(str_arr.get(1), None);
1747        assert_eq!(str_arr.get(2), Some("bar"));
1748
1749        assert_eq!(str_arr.offsets, vec64![0u32, 3, 3, 6]);
1750        assert_eq!(str_arr.data, Vec64::from_slice(b"foobar"));
1751        assert_eq!(str_arr.null_mask.unwrap().count_zeros(), 1);
1752    }
1753
1754    #[test]
1755    fn test_iterators_yield_correct_values() {
1756        let mut arr = CategoricalArray::<u8>::default();
1757        arr.push_str("cat");
1758        arr.push_str("dog");
1759        arr.push_str("bird");
1760
1761        let mut it = arr.indices_iter();
1762        assert_eq!(it.next(), Some(&0u8));
1763        assert_eq!(it.next(), Some(&1u8));
1764
1765        let mut it = arr.values_iter();
1766        assert!(it.any(|s| s == "cat"));
1767        assert!(it.any(|s| s == "dog"));
1768
1769        let mut it_mut = arr.indices_iter_mut();
1770        if let Some(v) = it_mut.next() {
1771            *v = 2;
1772        }
1773        assert_eq!(arr.get(0), Some("bird"));
1774    }
1775
1776    #[test]
1777    fn test_resize_expands_and_truncates() {
1778        let mut arr = CategoricalArray::<u8>::default();
1779        arr.push_str("one");
1780        arr.push_str("two");
1781
1782        arr.resize(5, "two".to_string());
1783        assert_eq!(arr.len(), 5);
1784        assert_eq!(arr.get(4), Some("two"));
1785
1786        arr.resize(2, "ignored".to_string());
1787        assert_eq!(arr.len(), 2);
1788    }
1789
1790    #[test]
1791    fn test_from_parts_exact_match() {
1792        let data = vec64![0u8, 1u8];
1793        let dict = vec64!["alpha".to_string(), "beta".to_string()];
1794        let mask = Some(Bitmask::from_bools(&[true, false]));
1795        let arr = CategoricalArray::from_parts(data, dict, mask.clone());
1796
1797        assert_eq!(arr.get(0), Some("alpha"));
1798        assert_eq!(arr.get(1), None);
1799        assert_eq!(arr.null_mask(), mask.as_ref());
1800    }
1801
1802    #[test]
1803    fn test_batch_extend_from_iter_with_capacity() {
1804        let mut arr = CategoricalArray::<u32>::default();
1805        let data = vec![
1806            "cat".to_string(),
1807            "dog".to_string(),
1808            "cat".to_string(),
1809            "bird".to_string(),
1810        ];
1811
1812        arr.extend_from_iter_with_capacity(data.into_iter(), 4);
1813
1814        assert_eq!(arr.len(), 4);
1815        assert_eq!(arr.get(0), Some("cat"));
1816        assert_eq!(arr.get(1), Some("dog"));
1817        assert_eq!(arr.get(2), Some("cat"));
1818        assert_eq!(arr.get(3), Some("bird"));
1819
1820        // Dictionary should have 3 unique values
1821        assert_eq!(arr.unique_values().len(), 3);
1822    }
1823
1824    #[test]
1825    fn test_batch_extend_from_slice_dictionary_growth() {
1826        let mut arr = CategoricalArray::<u32>::default();
1827        arr.push("initial".to_string());
1828
1829        let data = &[
1830            "apple".to_string(),
1831            "banana".to_string(),
1832            "apple".to_string(),
1833        ];
1834        arr.extend_from_slice(data);
1835
1836        assert_eq!(arr.len(), 4);
1837        assert_eq!(arr.get(0), Some("initial"));
1838        assert_eq!(arr.get(1), Some("apple"));
1839        assert_eq!(arr.get(2), Some("banana"));
1840        assert_eq!(arr.get(3), Some("apple"));
1841
1842        // Dictionary: initial, apple, banana
1843        assert_eq!(arr.unique_values().len(), 3);
1844    }
1845
1846    #[test]
1847    fn test_batch_fill_single_category() {
1848        let arr = CategoricalArray::<u32>::fill("repeated".to_string(), 100);
1849
1850        assert_eq!(arr.len(), 100);
1851        assert_eq!(arr.null_count(), 0);
1852
1853        // All values should be the same category
1854        for i in 0..100 {
1855            assert_eq!(arr.get(i), Some("repeated"));
1856        }
1857
1858        // Dictionary should contain only one unique value
1859        assert_eq!(arr.unique_values().len(), 1);
1860        assert_eq!(arr.unique_values()[0], "repeated");
1861
1862        // All indices should point to the same dictionary entry (0)
1863        for i in 0..100 {
1864            assert_eq!(arr.data[i], 0u32);
1865        }
1866    }
1867
1868    #[test]
1869    fn test_batch_operations_with_nulls() {
1870        let mut arr = CategoricalArray::<u32>::default();
1871        arr.push("first".to_string());
1872        arr.push_null();
1873
1874        let data = &["second".to_string(), "first".to_string()];
1875        arr.extend_from_slice(data);
1876
1877        assert_eq!(arr.len(), 4);
1878        assert_eq!(arr.get(0), Some("first"));
1879        assert_eq!(arr.get(1), None);
1880        assert_eq!(arr.get(2), Some("second"));
1881        assert_eq!(arr.get(3), Some("first"));
1882        assert!(arr.null_count() >= 1); // At least the initial null
1883
1884        // Dictionary: first, second
1885        assert!(arr.unique_values().len() >= 2); // At least first and second
1886    }
1887
1888    #[test]
1889    fn test_batch_operations_preserve_categorical_efficiency() {
1890        let mut arr = CategoricalArray::<u32>::default();
1891
1892        // Create data with many repeated categories
1893        let categories = ["A", "B", "C"];
1894        let mut data = Vec::new();
1895        for _ in 0..100 {
1896            for cat in &categories {
1897                data.push(cat.to_string());
1898            }
1899        }
1900
1901        arr.extend_from_slice(&data);
1902
1903        assert_eq!(arr.len(), 300);
1904        assert_eq!(arr.unique_values().len(), 3); // Only 3 unique despite 300 entries
1905
1906        // Verify all categories are represented correctly
1907        for i in 0..300 {
1908            let expected = categories[i % 3];
1909            assert_eq!(arr.get(i), Some(expected));
1910        }
1911    }
1912
1913    #[test]
1914    fn test_categorical_array_concat() {
1915        let arr1 = CategoricalArray::<u32>::from_values(["apple", "banana", "apple"]);
1916        let arr2 = CategoricalArray::<u32>::from_values(["cherry", "apple"]);
1917
1918        let result = arr1.concat(arr2).unwrap();
1919
1920        assert_eq!(result.len(), 5);
1921        assert_eq!(result.get_str(0), Some("apple"));
1922        assert_eq!(result.get_str(1), Some("banana"));
1923        assert_eq!(result.get_str(2), Some("apple"));
1924        assert_eq!(result.get_str(3), Some("cherry"));
1925        assert_eq!(result.get_str(4), Some("apple"));
1926
1927        // Dictionary should be merged: apple, banana, cherry
1928        assert_eq!(result.unique_values().len(), 3);
1929        assert!(result.unique_values().contains(&"apple".to_string()));
1930        assert!(result.unique_values().contains(&"banana".to_string()));
1931        assert!(result.unique_values().contains(&"cherry".to_string()));
1932    }
1933
1934    #[test]
1935    fn test_categorical_array_concat_with_nulls() {
1936        let mut arr1 = CategoricalArray::<u32>::default();
1937        arr1.push_str("red");
1938        arr1.push_null();
1939        arr1.push_str("blue");
1940
1941        let mut arr2 = CategoricalArray::<u32>::default();
1942        arr2.push_str("green");
1943        arr2.push_null();
1944
1945        let result = arr1.concat(arr2).unwrap();
1946
1947        assert_eq!(result.len(), 5);
1948        assert_eq!(result.get_str(0), Some("red"));
1949        assert_eq!(result.get_str(1), None);
1950        assert_eq!(result.get_str(2), Some("blue"));
1951        assert_eq!(result.get_str(3), Some("green"));
1952        assert_eq!(result.get_str(4), None);
1953        assert_eq!(result.null_count(), 2);
1954    }
1955
1956    #[test]
1957    fn test_categorical_array_concat_disjoint_dictionaries() {
1958        // First array with dictionary: [red, blue, green]
1959        let arr1 = CategoricalArray::<u32>::from_values(["red", "blue", "green", "red", "blue"]);
1960
1961        // Second array with completely different dictionary: [alpha, beta, gamma]
1962        let arr2 = CategoricalArray::<u32>::from_values(["alpha", "beta", "gamma", "alpha"]);
1963
1964        // Verify initial state
1965        assert_eq!(arr1.unique_values().len(), 3); // red, blue, green
1966        assert_eq!(arr2.unique_values().len(), 3); // alpha, beta, gamma
1967
1968        // Verify arr1 indices point to correct values
1969        assert_eq!(arr1.get_str(0), Some("red"));
1970        assert_eq!(arr1.get_str(1), Some("blue"));
1971        assert_eq!(arr1.get_str(2), Some("green"));
1972        assert_eq!(arr1.get_str(3), Some("red"));
1973        assert_eq!(arr1.get_str(4), Some("blue"));
1974
1975        // Verify arr2 indices point to correct values
1976        assert_eq!(arr2.get_str(0), Some("alpha"));
1977        assert_eq!(arr2.get_str(1), Some("beta"));
1978        assert_eq!(arr2.get_str(2), Some("gamma"));
1979        assert_eq!(arr2.get_str(3), Some("alpha"));
1980
1981        let result = arr1.concat(arr2).unwrap();
1982
1983        // After concatenation, dictionary should have all 6 unique values
1984        assert_eq!(result.unique_values().len(), 6);
1985        assert!(result.unique_values().contains(&"red".to_string()));
1986        assert!(result.unique_values().contains(&"blue".to_string()));
1987        assert!(result.unique_values().contains(&"green".to_string()));
1988        assert!(result.unique_values().contains(&"alpha".to_string()));
1989        assert!(result.unique_values().contains(&"beta".to_string()));
1990        assert!(result.unique_values().contains(&"gamma".to_string()));
1991
1992        // Verify all values are correctly accessible after remapping
1993        assert_eq!(result.len(), 9);
1994
1995        // Original arr1 values should be unchanged
1996        assert_eq!(result.get_str(0), Some("red"));
1997        assert_eq!(result.get_str(1), Some("blue"));
1998        assert_eq!(result.get_str(2), Some("green"));
1999        assert_eq!(result.get_str(3), Some("red"));
2000        assert_eq!(result.get_str(4), Some("blue"));
2001
2002        // arr2 values should be correctly remapped
2003        assert_eq!(result.get_str(5), Some("alpha"));
2004        assert_eq!(result.get_str(6), Some("beta"));
2005        assert_eq!(result.get_str(7), Some("gamma"));
2006        assert_eq!(result.get_str(8), Some("alpha"));
2007    }
2008}
2009
#[cfg(test)]
#[cfg(feature = "parallel_proc")]
mod parallel_tests {
    use super::*;
    use crate::vec64;

    /// `par_iter` yields one item per row and `par_iter_opt` yields only
    /// `Some` when the array has no nulls.
    #[test]
    fn test_categorical_par_iter() {
        let arr =
            CategoricalArray::from_slices(&[0u32, 1, 2], &["a".into(), "b".into(), "c".into()]);

        let resolved: Vec<_> = arr.par_iter().collect();
        assert_eq!(resolved.len(), 3);

        let optional: Vec<_> = arr.par_iter_opt().collect();
        assert!(optional.iter().all(Option::is_some));
    }

    /// `par_iter_opt` reports a pushed null as `None` and keeps row order.
    #[test]
    fn test_categoricalarray_par_iter_opt() {
        let mut arr = CategoricalArray::<u32>::default();
        arr.push_str("alpha");
        arr.push_str("beta");
        arr.push_null();
        arr.push_str("gamma");

        let collected: Vec<_> = arr.par_iter_opt().collect();

        assert_eq!(
            collected,
            vec![Some("alpha"), Some("beta"), None, Some("gamma")]
        );
    }

    /// `par_iter_range_unchecked(1, 4)` resolves rows 1 through 3 only.
    #[test]
    fn test_categoricalarray_par_iter_range_unchecked() {
        let names = vec64!["one".to_string(), "two".to_string(), "three".to_string()];
        let arr = CategoricalArray::<u32>::from_parts(vec64![0, 2, 1, 0, 2], names, None);

        // Rows 1..4 hold codes 2, 1, 0.
        let window: Vec<&str> = arr.par_iter_range_unchecked(1, 4).collect();

        assert_eq!(window, vec!["three", "two", "one"]);
    }

    /// `par_iter_range_opt_unchecked` applies the null mask row by row.
    #[test]
    fn test_categoricalarray_par_iter_range_opt_unchecked() {
        let letters = vec64!["x".to_string(), "y".to_string(), "z".to_string()];
        let mut arr = CategoricalArray::<u32>::from_parts(vec64![1, 0, 2, 1, 0], letters, None);
        // Attach the mask after construction: rows 1 and 3 are null.
        arr.null_mask = Some(Bitmask::from_bools(&[true, false, true, false, true]));

        let window: Vec<Option<&str>> = arr.par_iter_range_opt_unchecked(0, 5).collect();

        let expected = vec![
            Some("y"), // row 0: valid, code 1
            None,      // row 1: null
            Some("z"), // row 2: valid, code 2
            None,      // row 3: null
            Some("x"), // row 4: valid, code 0
        ];
        assert_eq!(window, expected);
    }
}