1use std::collections::HashMap;
38use std::fmt::{Debug, Display, Formatter};
39use std::slice::{Iter, IterMut};
40
41#[cfg(feature = "parallel_proc")]
42use rayon::iter::ParallelIterator;
43
44use crate::aliases::CategoricalAVT;
45use crate::enums::error::MinarrowError;
46use crate::enums::shape_dim::ShapeDim;
47#[cfg(feature = "shared_dict")]
48use crate::structs::dictionary::Dictionary;
49use crate::traits::concatenate::Concatenate;
50use crate::traits::shape::Shape;
51use crate::traits::type_unions::Integer;
52use crate::utils::validate_null_mask_len;
53use crate::{
54 Bitmask, Buffer, Length, MaskedArray, Offset, StringArray, impl_arc_masked_array,
55 impl_array_ref_deref,
56};
57use ::vec64::{Vec64, Vec64Alloc};
58
59#[cfg(not(feature = "shared_dict"))]
64#[inline]
65fn add_category<T: Integer>(unique_values: &mut Vec64<String>, value: &str) -> T {
66 if let Some(pos) = unique_values.iter().position(|s| s.as_str() == value) {
67 return T::from_usize(pos);
68 }
69 let i = unique_values.len();
70 let c = T::try_from(i).ok().unwrap_or_else(|| {
71 panic!(
72 "Categorical cardinality exceeded the capacity of the index \
73 type {}. Consider a wider index width.",
74 std::any::type_name::<T>()
75 )
76 });
77 unique_values.push(value.to_owned());
78 c
79}
80
/// Dictionary-encoded string array.
///
/// Each row stores an integer code of type `T`; the code is an index into the
/// list of distinct strings (the dictionary). The dictionary is held inline as
/// `unique_values`, or as a `Dictionary<T>` when the `shared_dict` feature is
/// enabled.
#[repr(C, align(64))]
#[derive(PartialEq, Clone, Debug, Default)]
pub struct CategoricalArray<T: Integer> {
    /// Per-row codes indexing into the dictionary.
    pub data: Buffer<T>,
    /// Distinct strings, addressed by the codes in `data`.
    #[cfg(not(feature = "shared_dict"))]
    pub unique_values: Vec64<String>,
    /// Dictionary of distinct strings, addressed by the codes in `data`.
    #[cfg(feature = "shared_dict")]
    pub dictionary: Dictionary<T>,
    /// Optional validity bitmap: a set bit marks a valid row, a cleared bit a
    /// null row. `None` means every row is valid.
    pub null_mask: Option<Bitmask>,
}
141
142impl<T: Integer> CategoricalArray<T> {
143 #[inline]
145 pub fn new(
146 data: impl Into<Buffer<T>>,
147 unique_values: Vec64<String>,
148 null_mask: Option<Bitmask>,
149 ) -> Self {
150 let data: Buffer<T> = data.into();
151
152 validate_null_mask_len(data.len(), &null_mask);
153 for (i, code) in data.iter().enumerate() {
158 let is_valid = null_mask.as_ref().map_or(true, |m| m.get(i));
159 if !is_valid {
160 continue;
161 }
162 let idx = code
163 .to_usize()
164 .unwrap_or_else(|| panic!("Failed to convert code to usize at position {}", i));
165 assert!(
166 idx < unique_values.len(),
167 "Index {} out of bounds for unique_values (len = {}) at position {}",
168 idx,
169 unique_values.len(),
170 i
171 );
172 }
173
174 Self {
175 data,
176 #[cfg(not(feature = "shared_dict"))]
177 unique_values,
178 #[cfg(feature = "shared_dict")]
179 dictionary: Dictionary::from(unique_values),
180 null_mask,
181 }
182 }
183
184 #[cfg(feature = "shared_dict")]
191 #[inline]
192 pub fn new_existing_dict(
193 data: impl Into<Buffer<T>>,
194 dictionary: Dictionary<T>,
195 null_mask: Option<Bitmask>,
196 ) -> Self {
197 let data: Buffer<T> = data.into();
198 validate_null_mask_len(data.len(), &null_mask);
199 Self {
200 data,
201 dictionary,
202 null_mask,
203 }
204 }
205
206 #[inline]
210 pub fn with_capacity(
211 cap: usize,
212 unique_values: Option<Vec64<String>>,
213 null_mask: bool,
214 ) -> Self {
215 Self {
216 data: Vec64::with_capacity(cap).into(),
217 #[cfg(not(feature = "shared_dict"))]
218 unique_values: unique_values.unwrap_or_default(),
219 #[cfg(feature = "shared_dict")]
220 dictionary: unique_values.map(Dictionary::from).unwrap_or_default(),
221 null_mask: if null_mask {
222 Some(Bitmask::new_set_all(cap, true))
225 } else {
226 None
227 },
228 }
229 }
230
231 #[inline]
233 pub fn from_vec64(values: Vec64<&str>, null_mask: Option<Bitmask>) -> Self {
234 validate_null_mask_len(values.len(), &null_mask);
235
236 let len = values.len();
237 let mut codes = Vec64::with_capacity(len);
238 let mut unique_values: Vec64<String> = Vec64::new();
239 let mut dict = HashMap::new();
240
241 for (i, s) in values.into_iter().enumerate() {
242 let is_valid = null_mask.as_ref().map_or(true, |m| m.get(i));
244 if !is_valid {
245 codes.push(T::default());
246 continue;
247 }
248
249 if let Some(&code) = dict.get(&s) {
250 codes.push(code);
251 } else {
252 let idx = unique_values.len();
253 let code = T::try_from(idx).ok().unwrap_or_else(|| {
254 panic!(
255 "Unique category count ({}) exceeds capacity of index type {}",
256 idx + 1,
257 std::any::type_name::<T>()
258 )
259 });
260 unique_values.push(s.to_string());
261 dict.insert(s, code);
262 codes.push(code);
263 }
264 }
265
266 Self {
267 data: codes.into(),
268 #[cfg(not(feature = "shared_dict"))]
269 unique_values,
270 #[cfg(feature = "shared_dict")]
271 dictionary: Dictionary::from(unique_values),
272 null_mask,
273 }
274 }
275
276 #[inline]
278 pub fn from_vec(values: Vec<&str>, null_mask: Option<Bitmask>) -> Self {
279 Self::from_vec64(values.into(), null_mask)
280 }
281
282 #[inline]
284 pub fn new_unchecked(
285 data: Vec64<T>,
286 unique_values: Vec64<String>,
287 null_mask: Option<Bitmask>,
288 ) -> Self {
289 Self {
290 data: data.into(),
291 #[cfg(not(feature = "shared_dict"))]
292 unique_values,
293 #[cfg(feature = "shared_dict")]
294 dictionary: Dictionary::from(unique_values),
295 null_mask,
296 }
297 }
298
299 #[inline]
301 pub fn from_slices(indices: &[T], unique_values: &[String]) -> Self {
302 assert!(
303 indices.iter().all(|&idx| {
304 let i = idx.to_usize();
305 i < unique_values.len()
306 }),
307 "All indices must be valid for unique_values"
308 );
309 let dict_values: Vec64<String> = Vec64(unique_values.to_vec_in(Vec64Alloc::default()));
310 Self {
311 data: Vec64(indices.to_vec_in(Vec64Alloc::default())).into(),
312 #[cfg(not(feature = "shared_dict"))]
313 unique_values: dict_values,
314 #[cfg(feature = "shared_dict")]
315 dictionary: Dictionary::from(dict_values),
316 null_mask: None,
317 }
318 }
319
    /// Returns the dictionary strings that the codes in `data` index into.
    ///
    /// Reads the inline `unique_values` field, or `dictionary.values()` when
    /// the `shared_dict` feature is enabled.
    #[inline]
    pub fn unique_values(&self) -> &[String] {
        #[cfg(not(feature = "shared_dict"))]
        {
            &self.unique_values
        }
        #[cfg(feature = "shared_dict")]
        {
            self.dictionary.values()
        }
    }
336
    /// Returns the per-row dictionary codes as a slice.
    #[inline]
    pub fn indices(&self) -> &[T] {
        &self.data
    }
346
347 pub fn indices_iter(&self) -> Iter<'_, T> {
349 self.data.iter()
350 }
351
352 pub fn values_iter(&self) -> Iter<'_, String> {
354 self.unique_values().iter()
355 }
356
    /// Mutably iterates over the per-row dictionary codes.
    ///
    /// Codes written through this iterator are not validated against the
    /// dictionary.
    pub fn indices_iter_mut(&mut self) -> IterMut<'_, T> {
        self.data.iter_mut()
    }
361
    /// Mutably iterates over the dictionary strings.
    ///
    /// With the `shared_dict` feature the dictionary is first passed through
    /// `detach_to_owned()` — presumably so that edits cannot leak into other
    /// arrays sharing the same dictionary (verify against `Dictionary`).
    pub fn values_iter_mut(&mut self) -> IterMut<'_, String> {
        #[cfg(not(feature = "shared_dict"))]
        {
            self.unique_values.iter_mut()
        }
        #[cfg(feature = "shared_dict")]
        {
            self.dictionary.detach_to_owned();
            self.dictionary
                .try_values_iter_mut()
                .expect("detach_to_owned just left this Arc unique")
        }
    }
386
387 pub fn extend<'a, I: Iterator<Item = &'a str>>(&mut self, iter: I) {
389 for s in iter {
390 self.push(s.to_owned());
391 }
392 }
393
394 #[inline]
396 pub fn push_str(&mut self, value: &str) -> T {
397 #[cfg(not(feature = "shared_dict"))]
398 let code: T = add_category(&mut self.unique_values, value);
399 #[cfg(feature = "shared_dict")]
400 let code: T = self.dictionary.add_cat(value).expect(
401 "Dictionary category interning failed: cardinality exceeded capacity \
402 of the categorical integer. Consider a CategoricalArray<T> with a \
403 greater `T` capacity.",
404 );
405 self.data.push(code);
406 let row = self.len() - 1;
407 if let Some(mask) = &mut self.null_mask {
408 mask.set(row, true);
409 }
410 code
411 }
412
413 #[inline(always)]
420 pub unsafe fn push_str_unchecked(&mut self, value: &str) {
421 let idx = self.data.len();
422 unsafe { self.set_str_unchecked(idx, value) };
423 }
424
425 #[inline]
427 pub fn get_str(&self, idx: usize) -> Option<&str> {
428 if self.is_null(idx) {
429 return None;
430 }
431 let dict_idx = self.data[idx].to_usize();
432 Some(&self.unique_values()[dict_idx])
433 }
434
435 #[inline(always)]
437 pub unsafe fn get_str_unchecked(&self, idx: usize) -> &str {
438 if let Some(mask) = &self.null_mask {
439 if !unsafe { mask.get_unchecked(idx) } {
440 return "";
441 }
442 }
443 let dict_idx = unsafe { self.data.get_unchecked(idx).to_usize().unwrap() };
444 unsafe { self.unique_values().get_unchecked(dict_idx) }
445 }
446
447 #[inline]
449 pub fn set_str(&mut self, idx: usize, value: &str) {
450 assert!(idx < self.data.len(), "index out of bounds");
451
452 #[cfg(not(feature = "shared_dict"))]
453 let code: T = add_category(&mut self.unique_values, value);
454 #[cfg(feature = "shared_dict")]
455 let code: T = self.dictionary.add_cat(value).expect(
456 "Dictionary category interning failed: cardinality exceeded capacity \
457 of the categorical integer. Consider a CategoricalArray<T> with a \
458 greater `T` capacity.",
459 );
460
461 self.data[idx] = code;
462
463 if let Some(mask) = &mut self.null_mask {
464 mask.set(idx, true);
465 } else {
466 let mut m = Bitmask::new_set_all(self.data.len(), false);
467 m.set(idx, true);
468 self.null_mask = Some(m);
469 }
470 }
471
472 #[inline(always)]
474 pub unsafe fn set_str_unchecked(&mut self, idx: usize, value: &str) {
475 #[cfg(not(feature = "shared_dict"))]
476 let code: T = add_category(&mut self.unique_values, value);
477 #[cfg(feature = "shared_dict")]
478 let code: T = self.dictionary.add_cat(value).expect(
479 "Dictionary category interning failed: cardinality exceeded capacity \
480 of the categorical integer. Consider a CategoricalArray<T> with a \
481 greater `T` capacity.",
482 );
483 let data = self.data.as_mut_slice();
484 data[idx] = code;
485 if let Some(mask) = &mut self.null_mask {
486 mask.set(idx, true);
487 } else {
488 let mut m = Bitmask::new_set_all(self.len(), false);
489 m.set(idx, true);
490 self.null_mask = Some(m);
491 }
492 }
493
494 #[inline]
496 pub fn iter_str(&self) -> impl Iterator<Item = &str> + '_ {
497 self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
498 if self.is_null(idx) {
499 ""
500 } else {
501 &self.unique_values()[dict_idx.to_usize()]
502 }
503 })
504 }
505
506 #[inline]
508 pub fn iter_str_opt(&self) -> impl Iterator<Item = Option<&str>> + '_ {
509 self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
510 if self.is_null(idx) {
511 None
512 } else {
513 Some(self.unique_values()[dict_idx.to_usize()].as_str())
514 }
515 })
516 }
517
518 #[inline]
520 pub fn iter_str_range(&self, offset: usize, len: usize) -> impl Iterator<Item = &str> + '_ {
521 self.data[offset..offset + len]
522 .iter()
523 .enumerate()
524 .map(move |(i, &dict_idx)| {
525 let idx = offset + i;
526 if self.is_null(idx) {
527 ""
528 } else {
529 &self.unique_values()[dict_idx.to_usize()]
530 }
531 })
532 }
533
534 #[inline]
536 pub fn iter_str_opt_range(
537 &self,
538 offset: usize,
539 len: usize,
540 ) -> impl Iterator<Item = Option<&str>> + '_ {
541 self.data[offset..offset + len]
542 .iter()
543 .enumerate()
544 .map(move |(i, &dict_idx)| {
545 let idx = offset + i;
546 if self.is_null(idx) {
547 None
548 } else {
549 Some(self.unique_values()[dict_idx.to_usize()].as_str())
550 }
551 })
552 }
553
554 pub fn from_values<'a, I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
556 use std::collections::HashMap;
557 let mut dict = Vec64::<String>::new();
558 let mut map = HashMap::<&str, usize>::new();
559 let mut idx_buf = Vec64::<T>::new();
560
561 for s in iter {
562 let pos = *map.entry(s).or_insert_with(|| {
563 let i = dict.len();
564 dict.push(s.to_owned());
565 i
566 });
567 idx_buf.push(<T>::from_usize(pos));
568 }
569
570 Self {
571 data: idx_buf.into(),
572 #[cfg(not(feature = "shared_dict"))]
573 unique_values: dict,
574 #[cfg(feature = "shared_dict")]
575 dictionary: Dictionary::from(dict),
576 null_mask: None,
577 }
578 }
579
580 #[inline]
582 pub fn from_parts(
583 indices: Vec64<T>,
584 unique_values: Vec64<String>,
585 null_mask: Option<Bitmask>,
586 ) -> Self {
587 Self {
588 data: indices.into(),
589 #[cfg(not(feature = "shared_dict"))]
590 unique_values,
591 #[cfg(feature = "shared_dict")]
592 dictionary: Dictionary::from(unique_values),
593 null_mask,
594 }
595 }
596
597 #[inline]
599 pub fn to_string_array(&self) -> StringArray<T> {
600 let len = self.data.len();
601 let mut offsets = Vec64::with_capacity(len + 1);
602 let mut data = Vec64::<u8>::new();
603 offsets.push(T::zero());
604
605 for i in 0..len {
606 if self.is_null(i) {
607 offsets.push(T::from(data.len()).unwrap());
608 } else {
609 let dict_idx = self.data[i].to_usize();
610 let s = &self.unique_values()[dict_idx];
611 data.extend_from_slice(s.as_bytes());
612 offsets.push(T::from(data.len()).unwrap());
613 }
614 }
615
616 StringArray {
617 offsets: offsets.into(),
618 data: data.into(),
619 null_mask: self.null_mask.clone(),
620 }
621 }
622}
623
624impl<T: Integer> MaskedArray for CategoricalArray<T> {
625 type T = T;
626
627 type Container = Buffer<T>;
628
629 type LogicalType = String;
630
631 type CopyType<'a> = &'a str where Self: 'a;
632
633 fn delete_range(&mut self, start: usize, end: usize) {
639 self.data.delete_range(start, end);
640 if let Some(mask) = &mut self.null_mask {
641 mask.delete_range(start, end);
642 }
643 }
644
    /// Number of rows, i.e. the number of stored codes.
    #[inline]
    fn len(&self) -> usize {
        self.data.len()
    }
649
    /// Borrows the code buffer.
    fn data(&self) -> &Self::Container {
        &self.data
    }
653
    /// Mutably borrows the code buffer. Codes written through it are not
    /// validated against the dictionary.
    fn data_mut(&mut self) -> &mut Self::Container {
        &mut self.data
    }
657
658 #[inline]
666 fn get(&self, idx: usize) -> Option<&str> {
667 if self.is_null(idx) {
668 return None;
669 }
670
671 let dict_idx = self.data[idx].to_usize();
672 Some(&self.unique_values()[dict_idx])
673 }
674
675 #[inline]
679 fn set(&mut self, idx: usize, value: Self::LogicalType) {
680 self.set_str(idx, &value)
681 }
682
683 #[inline]
690 unsafe fn get_unchecked(&self, idx: usize) -> Option<&str> {
691 if let Some(mask) = &self.null_mask {
692 if !mask.get(idx) {
693 return None;
694 }
695 }
696
697 let dict_idx = unsafe { self.data.get_unchecked(idx).to_usize().unwrap() };
698 Some(unsafe { self.unique_values().get_unchecked(dict_idx).as_str() })
699 }
700
701 #[inline]
705 unsafe fn set_unchecked(&mut self, idx: usize, value: Self::LogicalType) {
706 #[cfg(not(feature = "shared_dict"))]
707 let code: T = add_category(&mut self.unique_values, &value);
708 #[cfg(feature = "shared_dict")]
709 let code: T = self.dictionary.add_cat(&value).expect(
710 "Dictionary category interning failed: cardinality exceeded capacity \
711 of the categorical integer. Consider a CategoricalArray<T> with a \
712 greater `T` capacity.",
713 );
714 let data = self.data.as_mut_slice();
715 data[idx] = code;
716 if let Some(mask) = &mut self.null_mask {
717 mask.set(idx, true);
718 } else {
719 let mut m = Bitmask::new_set_all(self.len(), false);
720 m.set(idx, true);
721 self.null_mask = Some(m);
722 }
723 }
724
725 #[inline]
729 fn iter(&self) -> impl Iterator<Item = &str> + '_ {
730 self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
731 if self.is_null(idx) {
732 ""
733 } else {
734 self.unique_values()[dict_idx.to_usize()].as_str()
735 }
736 })
737 }
738
739 #[inline]
743 fn iter_opt(&self) -> impl Iterator<Item = Option<&str>> + '_ {
744 self.data.iter().enumerate().map(move |(idx, &dict_idx)| {
745 if self.is_null(idx) {
746 None
747 } else {
748 Some(self.unique_values()[dict_idx.to_usize()].as_str())
749 }
750 })
751 }
752
753 #[inline]
756 fn iter_range(&self, offset: usize, len: usize) -> impl Iterator<Item = &str> + '_ {
757 self.data[offset..offset + len]
758 .iter()
759 .enumerate()
760 .map(move |(i, &dict_idx)| {
761 let idx = offset + i;
762 if self.is_null(idx) {
763 ""
764 } else {
765 self.unique_values()[dict_idx.to_usize()].as_str()
766 }
767 })
768 }
769
770 #[inline]
772 fn iter_opt_range(
773 &self,
774 offset: usize,
775 len: usize,
776 ) -> impl Iterator<Item = Option<&str>> + '_ {
777 self.data[offset..offset + len]
778 .iter()
779 .enumerate()
780 .map(move |(i, &dict_idx)| {
781 let idx = offset + i;
782 if self.is_null(idx) {
783 None
784 } else {
785 Some(self.unique_values()[dict_idx.to_usize()].as_str())
786 }
787 })
788 }
789
790 #[inline]
795 fn push(&mut self, value: Self::LogicalType) {
796 self.push_str(&value);
797 }
798
799 #[inline]
808 unsafe fn push_unchecked(&mut self, value: Self::LogicalType) {
809 self.push_str(&value);
810 }
811
812 fn slice_clone(&self, offset: usize, len: usize) -> Self {
817 assert!(
818 offset + len <= self.data.len(),
819 "slice window out of bounds"
820 );
821
822 let data = self.data[offset..offset + len].to_vec_in(Vec64Alloc::default());
823 let null_mask = self
824 .null_mask
825 .as_ref()
826 .map(|nm| nm.slice_clone(offset, len));
827 Self {
828 data: Vec64(data).into(),
829 #[cfg(not(feature = "shared_dict"))]
830 unique_values: self.unique_values.clone(),
831 #[cfg(feature = "shared_dict")]
832 dictionary: self.dictionary.clone(),
833 null_mask,
834 }
835 }
836
837 #[inline(always)]
843 fn tuple_ref<'a>(&'a self, offset: Offset, len: Length) -> CategoricalAVT<'a, T> {
844 (&self, offset, len)
845 }
846
847 fn null_count(&self) -> usize {
849 self.null_mask
850 .as_ref()
851 .map(|m| m.count_zeros())
852 .unwrap_or(0)
853 }
854
855 fn resize(&mut self, n: usize, value: Self::LogicalType) {
857 let current_len = self.len();
858
859 #[cfg(not(feature = "shared_dict"))]
860 let encoded: T = add_category(&mut self.unique_values, &value);
861 #[cfg(feature = "shared_dict")]
862 let encoded: T = self.dictionary.add_cat(&value).expect(
863 "Dictionary category interning failed: cardinality exceeded capacity \
864 of the categorical integer. Consider a CategoricalArray<T> with a \
865 greater `T` capacity.",
866 );
867
868 if n > current_len {
869 self.data.reserve(n - current_len);
870 for _ in current_len..n {
871 self.data.push(encoded);
872 }
873 } else if n < current_len {
874 self.data.truncate(n);
875 }
876 }
877
    /// Borrows the null mask, if the array has one.
    fn null_mask(&self) -> Option<&Bitmask> {
        self.null_mask.as_ref()
    }
882
    /// Mutably borrows the null mask, if the array has one.
    fn null_mask_mut(&mut self) -> Option<&mut Bitmask> {
        self.null_mask.as_mut()
    }
887
888 fn set_null_mask(&mut self, mask: Option<Bitmask>) {
890 self.null_mask = mask
891 }
892
893 fn append_array(&mut self, other: &Self) {
895 let orig_len = self.len();
896 let other_len = other.len();
897 if other_len == 0 { return; }
898
899 self.data_mut().extend_from_slice(other.data());
900
901 match (self.null_mask_mut(), other.null_mask()) {
902 (Some(self_mask), Some(other_mask)) => {
903 self_mask.extend_from_bitmask(other_mask);
904 }
905 (Some(self_mask), None) => {
906 self_mask.resize(orig_len + other_len, true);
907 }
908 (None, Some(other_mask)) => {
909 let mut mask = Bitmask::new_set_all(orig_len, true);
910 mask.extend_from_bitmask(other_mask);
911 self.set_null_mask(Some(mask));
912 }
913 (None, None) => {}
914 }
915 }
916
917 fn append_range(&mut self, other: &Self, offset: usize, len: usize) -> Result<(), MinarrowError> {
918 if len == 0 { return Ok(()); }
919 if offset + len > other.len() {
920 return Err(MinarrowError::IndexError(
921 format!("append_range: offset {} + len {} exceeds source length {}", offset, len, other.len())
922 ));
923 }
924 let orig_len = self.len();
925
926 self.data_mut().extend_from_slice(&other.data()[offset..offset + len]);
927
928 match (self.null_mask_mut(), other.null_mask()) {
929 (Some(self_mask), Some(other_mask)) => {
930 self_mask.extend_from_bitmask_range(other_mask, offset, len);
931 }
932 (Some(self_mask), None) => {
933 self_mask.resize(orig_len + len, true);
934 }
935 (None, Some(other_mask)) => {
936 let mut mask = Bitmask::new_set_all(orig_len, true);
937 mask.extend_from_bitmask_range(other_mask, offset, len);
938 self.set_null_mask(Some(mask));
939 }
940 (None, None) => {}
941 }
942 Ok(())
943 }
944
945 fn insert_rows(&mut self, index: usize, other: &Self) -> Result<(), MinarrowError> {
949 use crate::enums::error::MinarrowError;
950
951 let orig_len = self.len();
952 let other_len = other.len();
953
954 if index > orig_len {
955 return Err(MinarrowError::IndexError(format!(
956 "Index {} out of bounds for array of length {}",
957 index, orig_len
958 )));
959 }
960
961 if other_len == 0 {
962 return Ok(());
963 }
964
965 #[cfg(not(feature = "shared_dict"))]
969 let index_map: Vec<T> = {
970 let mut m = Vec::with_capacity(other.unique_values.len());
971 for other_value in other.unique_values.iter() {
972 m.push(add_category(&mut self.unique_values, other_value));
973 }
974 m
975 };
976 #[cfg(feature = "shared_dict")]
977 let index_map: Vec<T> = {
978 let mut m = Vec::with_capacity(other.dictionary.len());
979 for other_value in other.dictionary.values().iter() {
980 let code = match self.dictionary.lookup(other_value) {
981 Some(code) => code,
982 None => self.dictionary.add_cat(other_value)?,
983 };
984 m.push(code);
985 }
986 m
987 };
988
989 let new_len = orig_len + other_len;
991 self.data.resize(new_len, T::from_usize(0));
992
993 for i in (index..orig_len).rev() {
995 unsafe {
996 let val = *self.data.as_ref().get_unchecked(i);
997 *self.data.as_mut().get_unchecked_mut(i + other_len) = val;
998 }
999 }
1000
1001 for i in 0..other_len {
1003 unsafe {
1004 let other_idx = *other.data.as_ref().get_unchecked(i);
1005 let remapped_idx = *index_map.get_unchecked(other_idx.to_usize());
1006 *self.data.as_mut().get_unchecked_mut(index + i) = remapped_idx;
1007 }
1008 }
1009
1010 match (self.null_mask.as_mut(), other.null_mask.as_ref()) {
1012 (Some(self_mask), Some(other_mask)) => {
1013 let mut new_mask = Bitmask::new_set_all(new_len, true);
1014 for i in 0..index {
1015 unsafe {
1016 new_mask.set_unchecked(i, self_mask.get_unchecked(i));
1017 }
1018 }
1019 for i in 0..other_len {
1020 unsafe {
1021 new_mask.set_unchecked(index + i, other_mask.get_unchecked(i));
1022 }
1023 }
1024 for i in index..orig_len {
1025 unsafe {
1026 new_mask.set_unchecked(other_len + i, self_mask.get_unchecked(i));
1027 }
1028 }
1029 *self_mask = new_mask;
1030 }
1031 (Some(self_mask), None) => {
1032 let mut new_mask = Bitmask::new_set_all(new_len, true);
1033 for i in 0..index {
1034 unsafe {
1035 new_mask.set_unchecked(i, self_mask.get_unchecked(i));
1036 }
1037 }
1038 for i in index..orig_len {
1039 unsafe {
1040 new_mask.set_unchecked(other_len + i, self_mask.get_unchecked(i));
1041 }
1042 }
1043 *self_mask = new_mask;
1044 }
1045 (None, Some(other_mask)) => {
1046 let mut new_mask = Bitmask::new_set_all(new_len, true);
1047 for i in 0..other_len {
1048 unsafe {
1049 new_mask.set_unchecked(index + i, other_mask.get_unchecked(i));
1050 }
1051 }
1052 self.null_mask = Some(new_mask);
1053 }
1054 (None, None) => {}
1055 }
1056
1057 Ok(())
1058 }
1059
1060 fn split(mut self, index: usize) -> Result<(Self, Self), MinarrowError> {
1062 use crate::enums::error::MinarrowError;
1063
1064 if index == 0 || index >= self.len() {
1065 return Err(MinarrowError::IndexError(format!(
1066 "Split index {} out of valid range (0, {})",
1067 index,
1068 self.len()
1069 )));
1070 }
1071
1072 let after_data = self.data.split_off(index);
1074
1075 let after_mask = self.null_mask.as_mut().map(|mask| mask.split_off(index));
1077
1078 let after = CategoricalArray {
1081 data: after_data,
1082 #[cfg(not(feature = "shared_dict"))]
1083 unique_values: self.unique_values.clone(),
1084 #[cfg(feature = "shared_dict")]
1085 dictionary: self.dictionary.clone(),
1086 null_mask: after_mask,
1087 };
1088
1089 Ok((self, after))
1090 }
1091
1092 fn extend_from_iter_with_capacity<I>(&mut self, iter: I, additional_capacity: usize)
1096 where
1097 I: Iterator<Item = Self::LogicalType>,
1098 {
1099 self.data.reserve(additional_capacity);
1100 let values: Vec<Self::LogicalType> = iter.collect();
1101 let start_len = self.data.len();
1102 self.data.resize(start_len + values.len(), T::from_usize(0));
1104 if let Some(mask) = &mut self.null_mask {
1106 mask.resize(start_len + values.len(), true);
1107 }
1108 for (i, value) in values.iter().enumerate() {
1109 let owned = value.to_string();
1110 #[cfg(not(feature = "shared_dict"))]
1111 let code: T = add_category(&mut self.unique_values, &owned);
1112 #[cfg(feature = "shared_dict")]
1113 let code: T = self.dictionary.add_cat(&owned).expect(
1114 "Dictionary category interning failed: cardinality exceeded capacity \
1115 of the categorical integer. Consider a CategoricalArray<T> with a \
1116 greater `T` capacity.",
1117 );
1118 {
1119 let data = self.data.as_mut_slice();
1120 data[start_len + i] = code;
1121 }
1122 if let Some(mask) = &mut self.null_mask {
1123 unsafe { mask.set_unchecked(start_len + i, true) };
1124 }
1125 }
1126 }
1127
1128 fn extend_from_slice(&mut self, slice: &[Self::LogicalType]) {
1132 let start_len = self.data.len();
1133 self.data.reserve(slice.len());
1134 self.data.resize(start_len + slice.len(), T::from_usize(0));
1136 if let Some(mask) = &mut self.null_mask {
1138 mask.resize(start_len + slice.len(), true);
1139 }
1140 for (i, value) in slice.iter().enumerate() {
1141 let owned = value.to_string();
1142 #[cfg(not(feature = "shared_dict"))]
1143 let code: T = add_category(&mut self.unique_values, &owned);
1144 #[cfg(feature = "shared_dict")]
1145 let code: T = self.dictionary.add_cat(&owned).expect(
1146 "Dictionary category interning failed: cardinality exceeded capacity \
1147 of the categorical integer. Consider a CategoricalArray<T> with a \
1148 greater `T` capacity.",
1149 );
1150 {
1151 let data = self.data.as_mut_slice();
1152 data[start_len + i] = code;
1153 }
1154 if let Some(mask) = &mut self.null_mask {
1155 unsafe { mask.set_unchecked(start_len + i, true) };
1156 }
1157 }
1158 }
1159
1160 fn fill(value: Self::LogicalType, count: usize) -> Self {
1164 let mut array = CategoricalArray::<T>::from_vec64(crate::Vec64::with_capacity(count), None);
1165 array.data.resize(count, T::from_usize(0));
1167 let owned_value = value.to_string();
1169 #[cfg(not(feature = "shared_dict"))]
1170 let dict_index: T = add_category(&mut array.unique_values, &owned_value);
1171 #[cfg(feature = "shared_dict")]
1172 let dict_index: T = array.dictionary.add_cat(&owned_value).expect(
1173 "Dictionary category interning failed: cardinality exceeded capacity \
1174 of the categorical integer. Consider a CategoricalArray<T> with a \
1175 greater `T` capacity.",
1176 );
1177 for i in 0..count {
1179 {
1180 let data = array.data.as_mut_slice();
1181 data[i] = dict_index;
1182 }
1183 }
1184 array
1185 }
1186}
1187
1188#[cfg(feature = "parallel_proc")]
1189impl<T: Integer + Send + Sync> CategoricalArray<T> {
    /// Parallel iterator over the raw dictionary codes (not the strings).
    #[inline]
    pub fn par_iter(&self) -> rayon::slice::Iter<'_, T> {
        self.data.par_iter()
    }
1195
    /// Parallel mutable iterator over the raw dictionary codes. Codes written
    /// through it are not validated against the dictionary.
    #[inline]
    pub fn par_iter_mut(&mut self) -> rayon::slice::IterMut<'_, T> {
        self.data.par_iter_mut()
    }
1201
    /// Parallel iterator over every row as `Option<&str>` (`None` for nulls);
    /// covers the full range `0..len`.
    #[inline]
    pub fn par_iter_opt(&self) -> impl ParallelIterator<Item = Option<&str>> + '_ {
        self.par_iter_range_opt(0, self.len())
    }
1207
1208 #[inline]
1210 pub fn par_iter_range(
1211 &self,
1212 start: usize,
1213 end: usize,
1214 ) -> impl ParallelIterator<Item = &str> + '_ {
1215 use rayon::prelude::*;
1216 let null_mask = self.null_mask.as_ref();
1217 let dict = self.unique_values();
1218 let idx_buf = &self.data;
1219 debug_assert!(start <= end && end <= idx_buf.len());
1220 (start..end).into_par_iter().map(move |i| {
1221 if null_mask.map(|m| !m.get(i)).unwrap_or(false) {
1222 ""
1223 } else {
1224 &dict[idx_buf[i].to_usize()]
1225 }
1226 })
1227 }
1228
1229 #[inline]
1231 pub fn par_iter_range_opt(
1232 &self,
1233 start: usize,
1234 end: usize,
1235 ) -> impl ParallelIterator<Item = Option<&str>> + '_ {
1236 use rayon::prelude::*;
1237 let null_mask = self.null_mask.as_ref();
1238 let dict = self.unique_values();
1239 let idx_buf = &self.data;
1240 debug_assert!(start <= end && end <= idx_buf.len());
1241 (start..end).into_par_iter().map(move |i| {
1242 if null_mask.map(|m| !m.get(i)).unwrap_or(false) {
1243 None
1244 } else {
1245 Some(dict[idx_buf[i].to_usize()].as_str())
1246 }
1247 })
1248 }
1249
1250 #[inline]
1252 pub fn par_iter_range_unchecked(
1253 &self,
1254 start: usize,
1255 end: usize,
1256 ) -> impl rayon::prelude::ParallelIterator<Item = &str> + '_ {
1257 use rayon::prelude::*;
1258 let null_mask = self.null_mask.as_ref();
1259 let dict = self.unique_values();
1260 let idx_buf = &self.data;
1261 (start..end).into_par_iter().map(move |i| {
1262 if let Some(mask) = null_mask {
1263 if !unsafe { mask.get_unchecked(i) } {
1264 return "";
1265 }
1266 }
1267 let idx = unsafe { *idx_buf.get_unchecked(i) }.to_usize();
1268 unsafe { dict.get_unchecked(idx).as_str() }
1269 })
1270 }
1271
1272 #[inline]
1274 pub fn par_iter_range_opt_unchecked(
1275 &self,
1276 start: usize,
1277 end: usize,
1278 ) -> impl rayon::prelude::ParallelIterator<Item = Option<&str>> + '_ {
1279 use rayon::prelude::*;
1280 let null_mask = self.null_mask.as_ref();
1281 let dict = self.unique_values();
1282 let idx_buf = &self.data;
1283 (start..end).into_par_iter().map(move |i| {
1284 if let Some(mask) = null_mask {
1285 if !unsafe { mask.get_unchecked(i) } {
1286 return None;
1287 }
1288 }
1289 let idx = unsafe { *idx_buf.get_unchecked(i) }.to_usize();
1290 Some(unsafe { dict.get_unchecked(idx).as_str() })
1291 })
1292 }
1293}
1294
/// Consolidation of a list of categorical view tuples `(array, offset, len)`
/// into one owned `CategoricalArray`.
#[cfg(feature = "chunked")]
impl<'a, T: Integer> crate::traits::consolidate::Consolidate
    for Vec<crate::aliases::CategoricalAVT<'a, T>>
{
    type Output = CategoricalArray<T>;

    /// Materialises all view windows, in order, into a single array.
    ///
    /// With the `shared_dict` feature, when every chunk's dictionary
    /// `shares_with` the first one, the code windows are copied verbatim and
    /// the first dictionary handle is cloned into the result. In every other
    /// case each window is `slice_clone`d and the pieces are joined with
    /// `concat`.
    ///
    /// # Panics
    /// Panics if the vector is empty, or if `concat` returns an error.
    fn consolidate(self) -> CategoricalArray<T> {
        use crate::traits::masked_array::MaskedArray;

        assert!(!self.is_empty(), "consolidate() called on empty Vec<CategoricalAVT>");

        // Fast path: all chunks use one dictionary, so codes need no remapping.
        #[cfg(feature = "shared_dict")]
        {
            use crate::structs::bitmask::Bitmask;
            use crate::traits::consolidate::extend_null_mask;

            let first_dict = &self[0].0.dictionary;
            let all_same_dict = self
                .iter()
                .all(|(arr, _, _)| arr.dictionary.shares_with(first_dict));

            if all_same_dict {
                let total_len: usize = self.iter().map(|(_, _, len)| *len).sum();
                // A result mask is only needed if at least one chunk has one.
                let has_nulls = self.iter().any(|(arr, _, _)| arr.null_mask.is_some());

                let mut result_data: Vec64<T> = Vec64::with_capacity(total_len);
                let mut result_mask: Option<Bitmask> = if has_nulls {
                    Some(Bitmask::default())
                } else {
                    None
                };
                // Rows written so far; passed to `extend_null_mask` with each window.
                let mut current_len = 0;

                for (arr, offset, len) in &self {
                    let data: &[T] = &arr.data[*offset..*offset + *len];
                    result_data.extend_from_slice(data);
                    extend_null_mask(
                        &mut result_mask,
                        current_len,
                        arr.null_mask(),
                        *offset,
                        *len,
                    );
                    current_len += *len;
                }

                let dict_handle = first_dict.clone();
                return CategoricalArray::<T>::new_existing_dict(
                    result_data,
                    dict_handle,
                    result_mask,
                );
            }
        }

        // General path: copy each window and let `concat` merge dictionaries.
        let mut iter = self.into_iter();
        let (first_arr, first_off, first_len) = iter.next().expect("non-empty");
        let mut result = first_arr.slice_clone(first_off, first_len);
        for (arr, off, len) in iter {
            let chunk = arr.slice_clone(off, len);
            result = result
                .concat(chunk)
                .expect("Failed to concatenate CategoricalArray");
        }
        result
    }
}
1379
1380impl<T: Integer> Shape for CategoricalArray<T> {
1381 fn shape(&self) -> ShapeDim {
1382 ShapeDim::Rank1(self.len())
1383 }
1384}
1385
1386impl<T: Integer> Concatenate for CategoricalArray<T> {
1387 fn concat(
1400 mut self,
1401 other: Self,
1402 ) -> core::result::Result<Self, crate::enums::error::MinarrowError> {
1403 let orig_len = self.len();
1404 let other_len = other.len();
1405
1406 if other_len == 0 {
1407 return Ok(self);
1408 }
1409
1410 #[cfg(feature = "shared_dict")]
1411 {
1412 let share = self.dictionary.shares_with(&other.dictionary);
1413 if share {
1414 self.data.extend_from_slice(other.data.as_ref());
1416 } else if other.dictionary.values().len() <= self.dictionary.values().len()
1417 && other.dictionary.is_prefix_of(&self.dictionary)
1418 {
1419 self.data.extend_from_slice(other.data.as_ref());
1421 } else if self.dictionary.is_prefix_of(&other.dictionary) {
1422 self.dictionary = other.dictionary.clone();
1425 self.data.extend_from_slice(other.data.as_ref());
1426 } else {
1427 let n_other_codes = other.dictionary.values().len();
1430 let mut remap: Vec<T> = Vec::with_capacity(n_other_codes);
1431 for other_value in other.dictionary.values().iter() {
1432 let code = self.dictionary.add_cat(other_value)?;
1433 remap.push(code);
1434 }
1435 for &other_code in other.data.iter() {
1436 let mapped = remap[other_code.to_usize()];
1437 self.data.push(mapped);
1438 }
1439 }
1440 }
1441 #[cfg(not(feature = "shared_dict"))]
1442 {
1443 let mut remap: Vec<T> = Vec::with_capacity(other.unique_values.len());
1447 for other_value in other.unique_values.iter() {
1448 remap.push(add_category(&mut self.unique_values, other_value));
1449 }
1450 for &other_code in other.data.iter() {
1451 let mapped = remap[other_code.to_usize()];
1452 self.data.push(mapped);
1453 }
1454 }
1455
1456 match (self.null_mask_mut(), other.null_mask()) {
1458 (Some(self_mask), Some(other_mask)) => {
1459 self_mask.extend_from_bitmask(other_mask);
1460 }
1461 (Some(self_mask), None) => {
1462 self_mask.resize(orig_len + other_len, true);
1463 }
1464 (None, Some(other_mask)) => {
1465 let mut mask = Bitmask::new_set_all(orig_len + other_len, true);
1466 for i in 0..other_len {
1467 mask.set(orig_len + i, other_mask.get(i));
1468 }
1469 self.set_null_mask(Some(mask));
1470 }
1471 (None, None) => {
1472 }
1474 }
1475
1476 Ok(self)
1477 }
1478}
1479
// Expands the crate's `impl_arc_masked_array!` macro for `CategoricalArray<T>`:
// the backing container is `Buffer<T>`, the logical element type is `String`,
// elements are handed out as `&'a str`, and `T` is bounded by `Integer`.
// NOTE(review): the macro body is not visible here; presumably it generates the
// `MaskedArray` plumbing for this type under the `TextArray` variant. Confirm
// against the macro definition.
impl_arc_masked_array!(
    Inner = CategoricalArray<T>,
    T = T,
    Container = Buffer<T>,
    LogicalType = String,
    CopyType = &'a str,
    BufferT = T,
    Variant = TextArray,
    Bound = Integer,
);

// Expands `impl_array_ref_deref!` for `CategoricalArray<T>` with `T: Integer`.
// NOTE(review): expansion not visible here; check the macro for what it adds.
impl_array_ref_deref!(CategoricalArray<T>: Integer);
1492
1493impl<T> Display for CategoricalArray<T>
1494where
1495 T: Integer + std::fmt::Debug,
1496{
1497 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
1498 let len = self.len();
1499 let null_count = self.null_count();
1500 let dict_size = self.unique_values().len();
1501
1502 writeln!(
1503 f,
1504 "CategoricalArray [{} values]s] (dtype: categorical[str], nulls: {}, dictionary size: {})",
1505 len, null_count, dict_size
1506 )?;
1507
1508 const MAX_PREVIEW: usize = 25;
1509 write!(f, "[")?;
1510 for i in 0..usize::min(len, MAX_PREVIEW) {
1511 if i > 0 {
1512 write!(f, ", ")?;
1513 }
1514 match self.get(i) {
1515 Some(s) => write!(f, "\"{}\"", s)?,
1516 None => write!(f, "null")?,
1517 }
1518 }
1519 if len > MAX_PREVIEW {
1520 write!(f, ", … ({} total)", len)?;
1521 }
1522 write!(f, "]")
1523 }
1524}
1525
1526#[cfg(test)]
1527mod tests {
1528
1529 use super::*;
1530 use crate::traits::masked_array::MaskedArray;
1531 use crate::vec64;
1532
1533 fn bm(bits: &[bool]) -> Bitmask {
1534 let mut m = Bitmask::new_set_all(bits.len(), false);
1535 for (i, &b) in bits.iter().enumerate() {
1536 m.set(i, b);
1537 }
1538 m
1539 }
1540
1541 #[test]
1542 fn empty_new() {
1543 let arr = CategoricalArray::<u8>::default();
1544 assert!(arr.is_empty());
1545 assert!(arr.unique_values().is_empty());
1546 }
1547
1548 #[test]
1549 fn test_new_and_with_capacity() {
1550 let mut arr = CategoricalArray::<u32>::with_capacity(8, None, true);
1551 assert_eq!(arr.len(), 0);
1552 assert!(arr.data.capacity() >= 8);
1553 assert!(arr.null_mask.is_some());
1554
1555 assert_eq!(arr.null_count(), 0);
1557
1558 arr.push_str("alpha");
1559 arr.push_str("beta");
1560 assert_eq!(arr.null_count(), 0);
1561
1562 arr.push_null();
1563 assert_eq!(arr.null_count(), 1);
1564 }
1565
1566 #[test]
1567 fn push_and_get() {
1568 let mut arr = CategoricalArray::<u8>::default();
1569 let i1 = arr.push_str("hello");
1570 let i2 = arr.push_str("world");
1571 let i3 = arr.push_str("hello");
1572 assert_eq!(i1, 0);
1573 assert_eq!(i2, 1);
1574 assert_eq!(i3, 0);
1575 assert_eq!(arr.indices(), &[0u8, 1, 0]);
1576 assert_eq!(arr.unique_values(), &["hello", "world".into()]);
1577 assert_eq!(arr.get(1), Some("world"));
1578 }
1579
1580 #[test]
1581 fn null_handling() {
1582 let mut arr = CategoricalArray::<u16>::default();
1583 arr.push_str("a");
1584 arr.push_null();
1585 arr.push_str("b");
1586 assert_eq!(arr.len(), 3);
1587 assert_eq!(arr.get(0), Some("a"));
1588 assert_eq!(arr.get(1), None);
1589 assert!(arr.is_null(1));
1590 assert_eq!(arr.get(2), Some("b"));
1591 }
1592
1593 #[test]
1594 fn new_tolerates_out_of_range_indices_at_null_positions() {
1595 let data: Vec64<u8> = vec64![0, 1, 255, 0];
1599 let unique_values: Vec64<String> =
1600 vec64!["Yes".to_string(), "No".to_string()];
1601 let mask = bm(&[true, true, false, true]);
1602
1603 let arr = CategoricalArray::<u8>::new(data, unique_values, Some(mask));
1604
1605 assert_eq!(arr.len(), 4);
1606 assert_eq!(arr.get_str(0), Some("Yes"));
1607 assert_eq!(arr.get_str(1), Some("No"));
1608 assert_eq!(arr.get_str(2), None);
1609 assert_eq!(arr.get_str(3), Some("Yes"));
1610 }
1611
1612 #[test]
1613 #[should_panic(expected = "Index 255 out of bounds")]
1614 fn new_still_rejects_out_of_range_indices_at_valid_positions() {
1615 let data: Vec64<u8> = vec64![0, 1, 255, 0];
1618 let unique_values: Vec64<String> =
1619 vec64!["Yes".to_string(), "No".to_string()];
1620 let mask = bm(&[true, true, true, true]);
1621
1622 let _ = CategoricalArray::<u8>::new(data, unique_values, Some(mask));
1623 }
1624
1625 #[test]
1626 fn set_overwrite_and_new() {
1627 let mut arr = CategoricalArray::<u32>::default();
1628 arr.push_str("x");
1629 arr.push_str("y");
1630 arr.set_str(1, "x");
1631 assert_eq!(arr.get(1), Some("x"));
1632 arr.set_str(0, "zebra");
1633 assert!(arr.unique_values().contains(&"zebra".to_string()));
1634 assert_eq!(arr.get(0), Some("zebra"));
1635 }
1636
1637 #[test]
1638 fn extend_and_builder() {
1639 let mut arr = CategoricalArray::<u8>::default();
1640 arr.extend(["a", "b", "a", "c"].iter().copied());
1641 assert_eq!(arr.len(), 4);
1642 assert_eq!(arr.get(2), Some("a"));
1643
1644 let built = CategoricalArray::<u8>::from_values(vec!["k", "l", "k"]);
1645 assert_eq!(built.indices(), &[0u8, 1, 0]);
1646 assert_eq!(built.get(1), Some("l"));
1647 }
1648
1649 #[test]
1650 fn set_null_after_push() {
1651 let mut arr = CategoricalArray::<u8>::default();
1652 arr.push_str("one");
1653 arr.push_str("two");
1654 arr.set_null(1);
1655 assert!(arr.is_null(1));
1656 assert_eq!(arr.get(1), None);
1657 }
1658
1659 #[test]
1660 fn test_categorical_iter() {
1661 let arr =
1662 CategoricalArray::from_slices(&[0u32, 1, 2], &["a".into(), "b".into(), "c".into()]);
1663 let vals: Vec<_> = arr.iter().collect();
1664 assert_eq!(vals, vec!["a", "b", "c"]);
1665 let opt: Vec<_> = arr.iter_str_opt().collect();
1666 assert_eq!(opt, vec![Some("a"), Some("b"), Some("c")]);
1667 }
1668
1669 #[test]
1670 fn test_categorical_array_slice() {
1671 let arr = CategoricalArray::<u8>::new(
1672 vec64![2u8, 1, 0],
1673 vec64!["green".to_string(), "blue".to_string(), "red".to_string()],
1674 Some(Bitmask::from_bools(&[false, true, true])),
1675 );
1676 let sliced = arr.slice_clone(0, 3);
1677 assert_eq!(
1678 sliced.iter_str_opt().collect::<Vec<_>>(),
1679 vec![None, Some("blue"), Some("green")]
1680 );
1681 }
1682
1683 #[test]
1684 fn test_categorical_set_and_get() {
1685 let mut arr = CategoricalArray::<u32>::from_values(["a", "b", "c"].iter().cloned());
1686 assert!(arr.null_mask.is_none());
1688
1689 arr.set_str(1, "d");
1691 assert_eq!(arr.get(1), Some("d"));
1692 assert_eq!(arr.unique_values().len(), 4);
1694 assert!(arr.unique_values().contains(&"d".to_string()));
1695
1696 arr.set_str(2, "a");
1698 assert_eq!(arr.get(2), Some("a"));
1699 assert_eq!(arr.unique_values().len(), 4);
1701 }
1702
1703 #[test]
1704 fn test_categorical_set_unchecked_and_null_mask() {
1705 let mut arr = CategoricalArray::<u32>::from_values(["x", "y", "z"].iter().cloned());
1706 arr.null_mask = Some(bm(&[true, false, true]));
1707
1708 unsafe { arr.set_str_unchecked(1, "w") };
1710 assert_eq!(arr.get(1), Some("w"));
1712 let mask = arr.null_mask.as_ref().unwrap();
1714 assert!(mask.get(1));
1715 assert!(arr.unique_values().contains(&"w".to_string()));
1717 }
1718
1719 #[test]
1720 #[should_panic(expected = "index out of bounds")]
1721 fn test_categorical_set_oob() {
1722 let mut arr = CategoricalArray::<u32>::from_values(["foo"].iter().cloned());
1723 arr.set_str(5, "bar");
1725 }
1726
1727 #[test]
1728 fn test_to_string_array() {
1729 let unique = vec64!["foo".to_string(), "bar".to_string()];
1730 let data = vec64![0u32, 0u32, 1u32];
1731 let mut mask = Bitmask::new_set_all(3, true);
1732 mask.set(1, false); let cat = CategoricalArray {
1735 data: data.into(),
1736 #[cfg(not(feature = "shared_dict"))]
1737 unique_values: unique,
1738 #[cfg(feature = "shared_dict")]
1739 dictionary: Dictionary::from(unique),
1740 null_mask: Some(mask),
1741 };
1742
1743 let str_arr = cat.to_string_array();
1744
1745 assert_eq!(str_arr.get(0), Some("foo"));
1746 assert_eq!(str_arr.get(1), None);
1747 assert_eq!(str_arr.get(2), Some("bar"));
1748
1749 assert_eq!(str_arr.offsets, vec64![0u32, 3, 3, 6]);
1750 assert_eq!(str_arr.data, Vec64::from_slice(b"foobar"));
1751 assert_eq!(str_arr.null_mask.unwrap().count_zeros(), 1);
1752 }
1753
1754 #[test]
1755 fn test_iterators_yield_correct_values() {
1756 let mut arr = CategoricalArray::<u8>::default();
1757 arr.push_str("cat");
1758 arr.push_str("dog");
1759 arr.push_str("bird");
1760
1761 let mut it = arr.indices_iter();
1762 assert_eq!(it.next(), Some(&0u8));
1763 assert_eq!(it.next(), Some(&1u8));
1764
1765 let mut it = arr.values_iter();
1766 assert!(it.any(|s| s == "cat"));
1767 assert!(it.any(|s| s == "dog"));
1768
1769 let mut it_mut = arr.indices_iter_mut();
1770 if let Some(v) = it_mut.next() {
1771 *v = 2;
1772 }
1773 assert_eq!(arr.get(0), Some("bird"));
1774 }
1775
1776 #[test]
1777 fn test_resize_expands_and_truncates() {
1778 let mut arr = CategoricalArray::<u8>::default();
1779 arr.push_str("one");
1780 arr.push_str("two");
1781
1782 arr.resize(5, "two".to_string());
1783 assert_eq!(arr.len(), 5);
1784 assert_eq!(arr.get(4), Some("two"));
1785
1786 arr.resize(2, "ignored".to_string());
1787 assert_eq!(arr.len(), 2);
1788 }
1789
1790 #[test]
1791 fn test_from_parts_exact_match() {
1792 let data = vec64![0u8, 1u8];
1793 let dict = vec64!["alpha".to_string(), "beta".to_string()];
1794 let mask = Some(Bitmask::from_bools(&[true, false]));
1795 let arr = CategoricalArray::from_parts(data, dict, mask.clone());
1796
1797 assert_eq!(arr.get(0), Some("alpha"));
1798 assert_eq!(arr.get(1), None);
1799 assert_eq!(arr.null_mask(), mask.as_ref());
1800 }
1801
1802 #[test]
1803 fn test_batch_extend_from_iter_with_capacity() {
1804 let mut arr = CategoricalArray::<u32>::default();
1805 let data = vec![
1806 "cat".to_string(),
1807 "dog".to_string(),
1808 "cat".to_string(),
1809 "bird".to_string(),
1810 ];
1811
1812 arr.extend_from_iter_with_capacity(data.into_iter(), 4);
1813
1814 assert_eq!(arr.len(), 4);
1815 assert_eq!(arr.get(0), Some("cat"));
1816 assert_eq!(arr.get(1), Some("dog"));
1817 assert_eq!(arr.get(2), Some("cat"));
1818 assert_eq!(arr.get(3), Some("bird"));
1819
1820 assert_eq!(arr.unique_values().len(), 3);
1822 }
1823
1824 #[test]
1825 fn test_batch_extend_from_slice_dictionary_growth() {
1826 let mut arr = CategoricalArray::<u32>::default();
1827 arr.push("initial".to_string());
1828
1829 let data = &[
1830 "apple".to_string(),
1831 "banana".to_string(),
1832 "apple".to_string(),
1833 ];
1834 arr.extend_from_slice(data);
1835
1836 assert_eq!(arr.len(), 4);
1837 assert_eq!(arr.get(0), Some("initial"));
1838 assert_eq!(arr.get(1), Some("apple"));
1839 assert_eq!(arr.get(2), Some("banana"));
1840 assert_eq!(arr.get(3), Some("apple"));
1841
1842 assert_eq!(arr.unique_values().len(), 3);
1844 }
1845
1846 #[test]
1847 fn test_batch_fill_single_category() {
1848 let arr = CategoricalArray::<u32>::fill("repeated".to_string(), 100);
1849
1850 assert_eq!(arr.len(), 100);
1851 assert_eq!(arr.null_count(), 0);
1852
1853 for i in 0..100 {
1855 assert_eq!(arr.get(i), Some("repeated"));
1856 }
1857
1858 assert_eq!(arr.unique_values().len(), 1);
1860 assert_eq!(arr.unique_values()[0], "repeated");
1861
1862 for i in 0..100 {
1864 assert_eq!(arr.data[i], 0u32);
1865 }
1866 }
1867
1868 #[test]
1869 fn test_batch_operations_with_nulls() {
1870 let mut arr = CategoricalArray::<u32>::default();
1871 arr.push("first".to_string());
1872 arr.push_null();
1873
1874 let data = &["second".to_string(), "first".to_string()];
1875 arr.extend_from_slice(data);
1876
1877 assert_eq!(arr.len(), 4);
1878 assert_eq!(arr.get(0), Some("first"));
1879 assert_eq!(arr.get(1), None);
1880 assert_eq!(arr.get(2), Some("second"));
1881 assert_eq!(arr.get(3), Some("first"));
1882 assert!(arr.null_count() >= 1); assert!(arr.unique_values().len() >= 2); }
1887
1888 #[test]
1889 fn test_batch_operations_preserve_categorical_efficiency() {
1890 let mut arr = CategoricalArray::<u32>::default();
1891
1892 let categories = ["A", "B", "C"];
1894 let mut data = Vec::new();
1895 for _ in 0..100 {
1896 for cat in &categories {
1897 data.push(cat.to_string());
1898 }
1899 }
1900
1901 arr.extend_from_slice(&data);
1902
1903 assert_eq!(arr.len(), 300);
1904 assert_eq!(arr.unique_values().len(), 3); for i in 0..300 {
1908 let expected = categories[i % 3];
1909 assert_eq!(arr.get(i), Some(expected));
1910 }
1911 }
1912
1913 #[test]
1914 fn test_categorical_array_concat() {
1915 let arr1 = CategoricalArray::<u32>::from_values(["apple", "banana", "apple"]);
1916 let arr2 = CategoricalArray::<u32>::from_values(["cherry", "apple"]);
1917
1918 let result = arr1.concat(arr2).unwrap();
1919
1920 assert_eq!(result.len(), 5);
1921 assert_eq!(result.get_str(0), Some("apple"));
1922 assert_eq!(result.get_str(1), Some("banana"));
1923 assert_eq!(result.get_str(2), Some("apple"));
1924 assert_eq!(result.get_str(3), Some("cherry"));
1925 assert_eq!(result.get_str(4), Some("apple"));
1926
1927 assert_eq!(result.unique_values().len(), 3);
1929 assert!(result.unique_values().contains(&"apple".to_string()));
1930 assert!(result.unique_values().contains(&"banana".to_string()));
1931 assert!(result.unique_values().contains(&"cherry".to_string()));
1932 }
1933
1934 #[test]
1935 fn test_categorical_array_concat_with_nulls() {
1936 let mut arr1 = CategoricalArray::<u32>::default();
1937 arr1.push_str("red");
1938 arr1.push_null();
1939 arr1.push_str("blue");
1940
1941 let mut arr2 = CategoricalArray::<u32>::default();
1942 arr2.push_str("green");
1943 arr2.push_null();
1944
1945 let result = arr1.concat(arr2).unwrap();
1946
1947 assert_eq!(result.len(), 5);
1948 assert_eq!(result.get_str(0), Some("red"));
1949 assert_eq!(result.get_str(1), None);
1950 assert_eq!(result.get_str(2), Some("blue"));
1951 assert_eq!(result.get_str(3), Some("green"));
1952 assert_eq!(result.get_str(4), None);
1953 assert_eq!(result.null_count(), 2);
1954 }
1955
1956 #[test]
1957 fn test_categorical_array_concat_disjoint_dictionaries() {
1958 let arr1 = CategoricalArray::<u32>::from_values(["red", "blue", "green", "red", "blue"]);
1960
1961 let arr2 = CategoricalArray::<u32>::from_values(["alpha", "beta", "gamma", "alpha"]);
1963
1964 assert_eq!(arr1.unique_values().len(), 3); assert_eq!(arr2.unique_values().len(), 3); assert_eq!(arr1.get_str(0), Some("red"));
1970 assert_eq!(arr1.get_str(1), Some("blue"));
1971 assert_eq!(arr1.get_str(2), Some("green"));
1972 assert_eq!(arr1.get_str(3), Some("red"));
1973 assert_eq!(arr1.get_str(4), Some("blue"));
1974
1975 assert_eq!(arr2.get_str(0), Some("alpha"));
1977 assert_eq!(arr2.get_str(1), Some("beta"));
1978 assert_eq!(arr2.get_str(2), Some("gamma"));
1979 assert_eq!(arr2.get_str(3), Some("alpha"));
1980
1981 let result = arr1.concat(arr2).unwrap();
1982
1983 assert_eq!(result.unique_values().len(), 6);
1985 assert!(result.unique_values().contains(&"red".to_string()));
1986 assert!(result.unique_values().contains(&"blue".to_string()));
1987 assert!(result.unique_values().contains(&"green".to_string()));
1988 assert!(result.unique_values().contains(&"alpha".to_string()));
1989 assert!(result.unique_values().contains(&"beta".to_string()));
1990 assert!(result.unique_values().contains(&"gamma".to_string()));
1991
1992 assert_eq!(result.len(), 9);
1994
1995 assert_eq!(result.get_str(0), Some("red"));
1997 assert_eq!(result.get_str(1), Some("blue"));
1998 assert_eq!(result.get_str(2), Some("green"));
1999 assert_eq!(result.get_str(3), Some("red"));
2000 assert_eq!(result.get_str(4), Some("blue"));
2001
2002 assert_eq!(result.get_str(5), Some("alpha"));
2004 assert_eq!(result.get_str(6), Some("beta"));
2005 assert_eq!(result.get_str(7), Some("gamma"));
2006 assert_eq!(result.get_str(8), Some("alpha"));
2007 }
2008}
2009
2010#[cfg(test)]
2011#[cfg(feature = "parallel_proc")]
2012mod parallel_tests {
2013 use super::*;
2014 use crate::vec64;
2015 #[test]
2016 fn test_categorical_par_iter() {
2017 let arr =
2018 CategoricalArray::from_slices(&[0u32, 1, 2], &["a".into(), "b".into(), "c".into()]);
2019 let vals: Vec<_> = arr.par_iter().collect();
2020 assert_eq!(vals.len(), 3);
2021 let opt: Vec<_> = arr.par_iter_opt().collect();
2022 assert!(opt.iter().all(|v| v.is_some()));
2023 }
2024
2025 #[test]
2026 fn test_categoricalarray_par_iter_opt() {
2027 let mut arr = CategoricalArray::<u32>::default();
2028 arr.push_str("alpha");
2029 arr.push_str("beta");
2030 arr.push_null();
2031 arr.push_str("gamma");
2032
2033 let par: Vec<_> = arr.par_iter_opt().collect();
2034 let expected = vec![Some("alpha"), Some("beta"), None, Some("gamma")];
2035 assert_eq!(par, expected);
2036 }
2037
2038 #[test]
2039 fn test_categoricalarray_par_iter_range_unchecked() {
2040 let dict = vec64!["one".to_string(), "two".to_string(), "three".to_string()];
2041 let arr = CategoricalArray::<u32>::from_parts(vec64![0, 2, 1, 0, 2], dict, None);
2042 let out: Vec<&str> = arr.par_iter_range_unchecked(1, 4).collect();
2043 assert_eq!(out, vec!["three", "two", "one"]);
2044 }
2045
2046 #[test]
2047 fn test_categoricalarray_par_iter_range_opt_unchecked() {
2048 let dict = vec64!["x".to_string(), "y".to_string(), "z".to_string()];
2049 let mut arr = CategoricalArray::<u32>::from_parts(vec64![1, 0, 2, 1, 0], dict, None);
2050 arr.null_mask = Some(Bitmask::from_bools(&[true, false, true, false, true]));
2051 let out: Vec<Option<&str>> = arr.par_iter_range_opt_unchecked(0, 5).collect();
2052 assert_eq!(
2053 out,
2054 vec![
2055 Some("y"), None, Some("z"), None, Some("x") ]
2061 );
2062 }
2063}