Skip to main content

clickhouse_native_client/column/
lowcardinality.rs

1//! LowCardinality column implementation (dictionary encoding)
2//!
3//! **ClickHouse Documentation:** <https://clickhouse.com/docs/en/sql-reference/data-types/lowcardinality>
4//!
5//! ## Overview
6//!
7//! LowCardinality is a specialized type that wraps other data types (String,
8//! FixedString, Date, DateTime, and numbers) to provide dictionary encoding.
9//! This dramatically reduces storage and improves query performance for
10//! columns with low cardinality (few unique values relative to total rows).
11//!
12//! ## Type Nesting Rules
13//!
14//! **✅ Correct nesting order:**
15//! - `LowCardinality(Nullable(String))` - Dictionary-encoded nullable strings
16//! - `Array(LowCardinality(String))` - Array of dictionary-encoded strings
17//! - `Array(LowCardinality(Nullable(String)))` - Array of nullable
18//!   dictionary-encoded strings
19//!
20//! **❌ Invalid nesting:**
21//! - `Nullable(LowCardinality(String))` - Error: "Nested type LowCardinality
22//!   cannot be inside Nullable type"
23//!
24//! See: <https://github.com/ClickHouse/ClickHouse/issues/42456>
25//!
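//! ## Wire Format
//!
//! LowCardinality is serialized as a prefix followed by a body:
//! ```text
//! [key_version: UInt64]               (prefix; must be 1)
//! [index_serialization_type: UInt64]  (low 8 bits: index type; upper bits: flags)
//! [number_of_keys: UInt64]
//! [dictionary: Column * number_of_keys]  (for Nullable: nested data only)
//! [number_of_rows: UInt64]
//! [indices: UInt8/UInt16/UInt32/UInt64 * number_of_rows]
//! ```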
35//!
36//! ## Performance Tips
37//!
38//! - Best for columns with cardinality < 10,000 unique values
39//! - Excellent for enum-like data, country codes, status flags, etc.
40//! - See ClickHouse tips: <https://www.tinybird.co/blog-posts/tips-10-null-behavior-with-lowcardinality-columns>
41
42use super::{
43    Column,
44    ColumnRef,
45};
46use crate::{
47    types::Type,
48    Error,
49    Result,
50};
51use bytes::{
52    Buf,
53    BufMut,
54    BytesMut,
55};
56use std::{
57    collections::HashMap,
58    sync::Arc,
59};
60
61use super::column_value::{
62    append_column_item,
63    compute_hash_key,
64    get_column_item,
65    ColumnValue,
66};
67
68/// Column for LowCardinality type (dictionary encoding)
69///
70/// Stores unique values in a dictionary and uses indices to reference them,
71/// providing compression for columns with many repeated values.
72///
73/// **Reference Implementation:** See
74/// `clickhouse-cpp/clickhouse/columns/lowcardinality.cpp`
75pub struct ColumnLowCardinality {
76    type_: Type,
77    dictionary: ColumnRef, // Stores unique values
78    indices: Vec<u64>,     // Indices into dictionary
79    unique_map: HashMap<(u64, u64), u64>, /* Hash pair -> dictionary index
80                            * for fast lookup */
81}
82
83impl ColumnLowCardinality {
84    /// Create a new empty LowCardinality column for the given type.
85    pub fn new(type_: Type) -> Self {
86        // Extract the nested type from LowCardinality
87        let dictionary_type = match &type_ {
88            Type::LowCardinality { nested_type } => {
89                nested_type.as_ref().clone()
90            }
91            _ => panic!("ColumnLowCardinality requires LowCardinality type"),
92        };
93
94        // Create the dictionary column
95        let dictionary =
96            crate::io::block_stream::create_column(&dictionary_type)
97                .expect("Failed to create dictionary column");
98
99        Self {
100            type_,
101            dictionary,
102            indices: Vec::new(),
103            unique_map: HashMap::new(),
104        }
105    }
106
107    /// Get a reference to the dictionary column as a specific type
108    ///
109    /// # Example
110    /// ```ignore
111    /// let col: ColumnLowCardinality = /* ... */;
112    /// let dict: &ColumnString = col.dictionary();
113    /// ```
114    pub fn dictionary<T: Column + 'static>(&self) -> &T {
115        self.dictionary
116            .as_any()
117            .downcast_ref::<T>()
118            .expect("Failed to downcast dictionary column to requested type")
119    }
120
121    /// Get mutable reference to the dictionary column as a specific type
122    ///
123    /// # Example
124    /// ```ignore
125    /// let mut col: ColumnLowCardinality = /* ... */;
126    /// let dict_mut: &mut ColumnString = col.dictionary_mut();
127    /// ```
128    pub fn dictionary_mut<T: Column + 'static>(&mut self) -> &mut T {
129        Arc::get_mut(&mut self.dictionary)
130            .expect("Cannot get mutable reference to shared dictionary column")
131            .as_any_mut()
132            .downcast_mut::<T>()
133            .expect("Failed to downcast dictionary column to requested type")
134    }
135
136    /// Get the dictionary column as a `ColumnRef` (`Arc<dyn Column>`)
137    pub fn dictionary_ref(&self) -> ColumnRef {
138        self.dictionary.clone()
139    }
140
141    /// Get the number of unique values in the dictionary
142    pub fn dictionary_size(&self) -> usize {
143        self.dictionary.size()
144    }
145
146    /// Get the index at position
147    pub fn index_at(&self, index: usize) -> u64 {
148        self.indices[index]
149    }
150
151    /// Returns the number of values (rows) in this column.
152    pub fn len(&self) -> usize {
153        self.indices.len()
154    }
155
156    /// Returns `true` if the column contains no values.
157    pub fn is_empty(&self) -> bool {
158        self.indices.is_empty()
159    }
160
161    /// Append a value with hash-based deduplication (like C++ AppendUnsafe)
162    /// This is the core method for adding values to LowCardinality columns
163    pub fn append_unsafe(&mut self, value: &ColumnValue) -> Result<()> {
164        let hash_key = compute_hash_key(value);
165        let current_dict_size = self.dictionary.size() as u64;
166
167        // Check if value already exists in dictionary
168        let index = if let Some(&existing_idx) = self.unique_map.get(&hash_key)
169        {
170            // Value exists - reuse existing dictionary index
171            existing_idx
172        } else {
173            // New value - add to dictionary
174            let dict_mut = Arc::get_mut(&mut self.dictionary).ok_or_else(|| {
175                Error::Protocol(
176                    "Cannot append to shared dictionary - column has multiple references"
177                        .to_string(),
178                )
179            })?;
180
181            // Append to dictionary
182            append_column_item(dict_mut, value)?;
183
184            // Record in unique_map
185            self.unique_map.insert(hash_key, current_dict_size);
186
187            current_dict_size
188        };
189
190        // Append index
191        self.indices.push(index);
192
193        Ok(())
194    }
195
196    /// Bulk append values from an iterator with deduplication
197    pub fn append_values<I>(&mut self, values: I) -> Result<()>
198    where
199        I: IntoIterator<Item = ColumnValue>,
200    {
201        for value in values {
202            self.append_unsafe(&value)?;
203        }
204        Ok(())
205    }
206}
207
208impl Column for ColumnLowCardinality {
209    fn column_type(&self) -> &Type {
210        &self.type_
211    }
212
213    fn size(&self) -> usize {
214        self.indices.len()
215    }
216
217    fn clear(&mut self) {
218        self.indices.clear();
219        self.unique_map.clear();
220        // Note: We don't clear the dictionary to preserve unique values
221        // In a full implementation, we might compact the dictionary
222    }
223
224    fn reserve(&mut self, new_cap: usize) {
225        // Match C++ Reserve implementation with sqrt heuristic
226        // Assumption: dictionary size is typically much smaller than row count
227        // Use sqrt(new_cap) as a reasonable estimate for dictionary size
228        let estimated_dict_size = (new_cap as f64).sqrt().ceil() as usize;
229
230        // Reserve dictionary capacity
231        if let Some(dict_mut) = Arc::get_mut(&mut self.dictionary) {
232            dict_mut.reserve(estimated_dict_size);
233        }
234
235        // Reserve indices capacity (+2 for potential null/default items)
236        self.indices.reserve(new_cap + 2);
237    }
238
239    fn append_column(&mut self, other: ColumnRef) -> Result<()> {
240        let other = other
241            .as_any()
242            .downcast_ref::<ColumnLowCardinality>()
243            .ok_or_else(|| Error::TypeMismatch {
244                expected: self.type_.name(),
245                actual: other.column_type().name(),
246            })?;
247
248        // Check dictionary types match
249        if self.dictionary.column_type().name()
250            != other.dictionary.column_type().name()
251        {
252            return Err(Error::TypeMismatch {
253                expected: self.dictionary.column_type().name(),
254                actual: other.dictionary.column_type().name(),
255            });
256        }
257
258        // Hash-based dictionary merging with deduplication
259        // This matches the C++ clickhouse-cpp implementation
260        //
261        // For each value in other:
262        // 1. Extract ColumnValue from other's dictionary using other's index
263        // 2. Use append_unsafe which:
264        //    - Computes hash
265        //    - Checks unique_map for existing entry
266        //    - If exists: reuses existing dictionary index
267        //    - If new: adds to dictionary and updates unique_map
268        // 3. Appends the (possibly deduplicated) index
269
270        for &other_index in &other.indices {
271            // Get the value from other's dictionary
272            let value = get_column_item(
273                other.dictionary.as_ref(),
274                other_index as usize,
275            )?;
276
277            // Add with deduplication
278            self.append_unsafe(&value)?;
279        }
280
281        Ok(())
282    }
283
284    fn load_prefix(&mut self, buffer: &mut &[u8], _rows: usize) -> Result<()> {
285        // Read key_version (should be 1)
286        // Matches C++ LoadPrefix
287        if buffer.len() < 8 {
288            return Err(Error::Protocol(
289                "Not enough data for LowCardinality key version".to_string(),
290            ));
291        }
292
293        let key_version = buffer.get_u64_le();
294        const SHARED_DICTIONARIES_WITH_ADDITIONAL_KEYS: u64 = 1;
295
296        if key_version != SHARED_DICTIONARIES_WITH_ADDITIONAL_KEYS {
297            return Err(Error::Protocol(format!(
298                "Invalid LowCardinality key version: expected {}, got {}",
299                SHARED_DICTIONARIES_WITH_ADDITIONAL_KEYS, key_version
300            )));
301        }
302
303        Ok(())
304    }
305
306    fn load_from_buffer(
307        &mut self,
308        buffer: &mut &[u8],
309        rows: usize,
310    ) -> Result<()> {
311        // LowCardinality wire format (following C++ clickhouse-cpp):
312        // LoadPrefix (called separately via block reader):
313        //   1. key_version (UInt64) - should be 1
314        //      (SharedDictionariesWithAdditionalKeys)
315        // LoadBody (this method):
316        //   2. index_serialization_type (UInt64) - contains flags and index
317        //      type
318        //   3. number_of_keys (UInt64) - dictionary size
319        //   4. Dictionary column data (nested type)
320        //   5. number_of_rows (UInt64) - should match rows parameter
321        //   6. Index column data (UInt8/16/32/64 depending on index type)
322
323        // Read index_serialization_type
324        if buffer.len() < 8 {
325            return Err(Error::Protocol(
326                "Not enough data for LowCardinality index serialization type"
327                    .to_string(),
328            ));
329        }
330
331        let index_serialization_type = buffer.get_u64_le();
332
333        const INDEX_TYPE_MASK: u64 = 0xFF;
334        const NEED_GLOBAL_DICTIONARY_BIT: u64 = 1 << 8;
335        const HAS_ADDITIONAL_KEYS_BIT: u64 = 1 << 9;
336
337        let index_type = index_serialization_type & INDEX_TYPE_MASK;
338
339        // Check flags
340        if (index_serialization_type & NEED_GLOBAL_DICTIONARY_BIT) != 0 {
341            return Err(Error::Protocol(
342                "Global dictionary is not supported".to_string(),
343            ));
344        }
345
346        if (index_serialization_type & HAS_ADDITIONAL_KEYS_BIT) == 0 {
347            // Don't fail - try to continue reading
348        }
349
350        // Read number of dictionary keys
351        if buffer.len() < 8 {
352            return Err(Error::Protocol(
353                "Not enough data for dictionary size".to_string(),
354            ));
355        }
356        let number_of_keys = buffer.get_u64_le() as usize;
357
358        // Load dictionary values
359        // IMPORTANT: For Nullable dictionaries, we only load the NESTED column
360        // data The null bitmap is NOT part of the dictionary
361        // serialization (matching C++ implementation in
362        // lowcardinality.cpp::Load)
363        if number_of_keys > 0 {
364            let dict_mut = Arc::get_mut(&mut self.dictionary).ok_or_else(|| {
365                Error::Protocol(
366                    "Cannot load into shared dictionary - column has multiple references"
367                        .to_string(),
368                )
369            })?;
370
371            // Check if dictionary is Nullable - if so, load only nested data
372            use super::nullable::ColumnNullable;
373            if let Some(nullable_col) =
374                dict_mut.as_any_mut().downcast_mut::<ColumnNullable>()
375            {
376                // Use nested_ref_mut to get mutable access without knowing
377                // concrete type
378                let nested_ref = nullable_col.nested_ref_mut();
379                let nested_mut = Arc::get_mut(nested_ref)
380                    .ok_or_else(|| {
381                        Error::Protocol(
382                            "Cannot load into shared nested column - column has multiple references"
383                                .to_string(),
384                        )
385                    })?;
386                nested_mut.load_from_buffer(buffer, number_of_keys)?;
387
388                // After loading, mark all entries as non-null for now
389                // (The C++ code reconstructs the null bitmap after loading)
390                for _ in 0..number_of_keys {
391                    nullable_col.append_non_null();
392                }
393            } else {
394                // Non-nullable dictionary - load normally
395                dict_mut.load_from_buffer(buffer, number_of_keys)?;
396            }
397        }
398
399        // Read number of rows (should match the rows parameter)
400        // Note: In some cases this field may be omitted/truncated
401        let _number_of_rows = if buffer.len() >= 8 {
402            let val = buffer.get_u64_le() as usize;
403
404            if val != rows {
405                return Err(Error::Protocol(format!(
406                    "LowCardinality row count mismatch: expected {}, got {}",
407                    rows, val
408                )));
409            }
410            val
411        } else {
412            // If not enough bytes, assume number_of_rows equals rows parameter
413            // This may happen in certain protocol versions or formats
414            rows
415        };
416
417        // Read indices based on index type
418        self.indices.reserve(rows);
419        match index_type {
420            0 => {
421                // UInt8 indices
422                for _ in 0..rows {
423                    if buffer.is_empty() {
424                        return Err(Error::Protocol(
425                            "Not enough data for LowCardinality index"
426                                .to_string(),
427                        ));
428                    }
429                    let index = buffer.get_u8() as u64;
430                    self.indices.push(index);
431                }
432            }
433            1 => {
434                // UInt16 indices
435                for _ in 0..rows {
436                    if buffer.len() < 2 {
437                        return Err(Error::Protocol(
438                            "Not enough data for LowCardinality index"
439                                .to_string(),
440                        ));
441                    }
442                    let index = buffer.get_u16_le() as u64;
443                    self.indices.push(index);
444                }
445            }
446            2 => {
447                // UInt32 indices
448                for _ in 0..rows {
449                    if buffer.len() < 4 {
450                        return Err(Error::Protocol(
451                            "Not enough data for LowCardinality index"
452                                .to_string(),
453                        ));
454                    }
455                    let index = buffer.get_u32_le() as u64;
456                    self.indices.push(index);
457                }
458            }
459            3 => {
460                // UInt64 indices
461                for _ in 0..rows {
462                    if buffer.len() < 8 {
463                        return Err(Error::Protocol(
464                            "Not enough data for LowCardinality index"
465                                .to_string(),
466                        ));
467                    }
468                    let index = buffer.get_u64_le();
469                    self.indices.push(index);
470                }
471            }
472            _ => {
473                return Err(Error::Protocol(format!(
474                    "Unknown LowCardinality index type: {}",
475                    index_type
476                )));
477            }
478        }
479
480        // Rebuild unique_map from dictionary
481        self.unique_map.clear();
482        for i in 0..self.dictionary.size() {
483            let value = get_column_item(self.dictionary.as_ref(), i)?;
484            let hash_key = compute_hash_key(&value);
485            self.unique_map.insert(hash_key, i as u64);
486        }
487
488        Ok(())
489    }
490
491    fn save_prefix(&self, buffer: &mut BytesMut) -> Result<()> {
492        // Write key serialization version (matches C++ SavePrefix)
493        // KeySerializationVersion::SharedDictionariesWithAdditionalKeys = 1
494        const SHARED_DICTIONARIES_WITH_ADDITIONAL_KEYS: u64 = 1;
495        buffer.put_u64_le(SHARED_DICTIONARIES_WITH_ADDITIONAL_KEYS);
496        Ok(())
497    }
498
499    fn save_to_buffer(&self, buffer: &mut BytesMut) -> Result<()> {
500        // LowCardinality wire format (matching C++ SaveBody):
501        // 1. index_serialization_type (UInt64) - contains index type + flags
502        // 2. number_of_keys (UInt64) - dictionary size
503        // 3. Dictionary column data (for Nullable, only nested part!)
504        // 4. number_of_rows (UInt64) - index column size
505        // 5. Index column data
506
507        // Index type flags (from C++ lowcardinality.cpp)
508        const HAS_ADDITIONAL_KEYS_BIT: u64 = 1 << 9;
509
510        // For now, we always use UInt64 indices (index_type = 3)
511        // TODO: Use dynamic index type (UInt8/16/32/64) based on dictionary
512        // size
513        const INDEX_TYPE_UINT64: u64 = 3;
514
515        // 1. Write index_serialization_type
516        let index_serialization_type =
517            INDEX_TYPE_UINT64 | HAS_ADDITIONAL_KEYS_BIT;
518        buffer.put_u64_le(index_serialization_type);
519
520        // 2. Write number_of_keys (dictionary size)
521        buffer.put_u64_le(self.dictionary.size() as u64);
522
523        // 3. Write dictionary data
524        // IMPORTANT: For Nullable dictionaries, only write the NESTED column
525        // data (matching C++ implementation in
526        // lowcardinality.cpp::SaveBody)
527        use super::nullable::ColumnNullable;
528        if let Some(nullable_col) =
529            self.dictionary.as_any().downcast_ref::<ColumnNullable>()
530        {
531            // For Nullable, save only the nested column (no null bitmap)
532            nullable_col.nested_ref().save_to_buffer(buffer)?;
533        } else {
534            // For non-Nullable, save normally
535            self.dictionary.save_to_buffer(buffer)?;
536        }
537
538        // 4. Write number_of_rows (index column size)
539        buffer.put_u64_le(self.indices.len() as u64);
540
541        // 5. Write index data (as UInt64 for now)
542        for &index in &self.indices {
543            buffer.put_u64_le(index);
544        }
545
546        Ok(())
547    }
548
549    fn clone_empty(&self) -> ColumnRef {
550        Arc::new(ColumnLowCardinality::new(self.type_.clone()))
551    }
552
553    fn slice(&self, begin: usize, len: usize) -> Result<ColumnRef> {
554        if begin + len > self.indices.len() {
555            return Err(Error::InvalidArgument(format!(
556                "Slice out of bounds: begin={}, len={}, size={}",
557                begin,
558                len,
559                self.indices.len()
560            )));
561        }
562
563        // Create compact slice with only referenced dictionary entries
564        // (matching C++ implementation which rebuilds dictionary)
565        let mut sliced = ColumnLowCardinality::new(self.type_.clone());
566
567        // For each value in the slice, get from original dictionary and append
568        // This automatically rebuilds the dictionary with only referenced
569        // items
570        for i in begin..begin + len {
571            let dict_index = self.indices[i] as usize;
572            let value = get_column_item(self.dictionary.as_ref(), dict_index)?;
573            sliced.append_unsafe(&value)?;
574        }
575
576        Ok(Arc::new(sliced))
577    }
578
579    fn as_any(&self) -> &dyn std::any::Any {
580        self
581    }
582
583    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
584        self
585    }
586}
587
588#[cfg(test)]
589#[cfg_attr(coverage_nightly, coverage(off))]
590mod tests {
591    use super::*;
592    use crate::types::TypeCode;
593
594    #[test]
595    fn test_lowcardinality_creation() {
596        let lc_type = Type::LowCardinality {
597            nested_type: Box::new(Type::Simple(TypeCode::String)),
598        };
599
600        let col = ColumnLowCardinality::new(lc_type);
601        assert_eq!(col.len(), 0);
602        assert!(col.is_empty());
603        assert_eq!(col.dictionary_size(), 0);
604    }
605
606    #[test]
607    fn test_lowcardinality_empty() {
608        let lc_type = Type::LowCardinality {
609            nested_type: Box::new(Type::Simple(TypeCode::UInt32)),
610        };
611
612        let col = ColumnLowCardinality::new(lc_type);
613        assert_eq!(col.dictionary_size(), 0);
614        assert_eq!(col.size(), 0);
615    }
616
617    #[test]
618    fn test_lowcardinality_slice() {
619        use crate::column::column_value::ColumnValue;
620
621        let lc_type = Type::LowCardinality {
622            nested_type: Box::new(Type::Simple(TypeCode::String)),
623        };
624
625        let mut col = ColumnLowCardinality::new(lc_type);
626
627        // Add data: ["a", "b", "c", "b", "a"] (3 unique values)
628        col.append_unsafe(&ColumnValue::from_string("a")).unwrap();
629        col.append_unsafe(&ColumnValue::from_string("b")).unwrap();
630        col.append_unsafe(&ColumnValue::from_string("c")).unwrap();
631        col.append_unsafe(&ColumnValue::from_string("b")).unwrap();
632        col.append_unsafe(&ColumnValue::from_string("a")).unwrap();
633
634        assert_eq!(col.len(), 5);
635        assert_eq!(col.dictionary_size(), 3); // "a", "b", "c"
636
637        // Slice [1:3] = ["b", "c"] - only uses 2 unique values
638        let sliced = col.slice(1, 2).unwrap();
639        let sliced_col =
640            sliced.as_any().downcast_ref::<ColumnLowCardinality>().unwrap();
641
642        assert_eq!(sliced_col.len(), 2);
643        // CRITICAL: Dictionary should be compacted to only 2 items (not 3!)
644        assert_eq!(
645            sliced_col.dictionary_size(),
646            2,
647            "Dictionary should be compacted"
648        );
649
650        // Values should still match
651        let val0 = get_column_item(
652            sliced_col.dictionary.as_ref(),
653            sliced_col.index_at(0) as usize,
654        )
655        .unwrap();
656        let val1 = get_column_item(
657            sliced_col.dictionary.as_ref(),
658            sliced_col.index_at(1) as usize,
659        )
660        .unwrap();
661        assert_eq!(val0.as_string().unwrap(), "b");
662        assert_eq!(val1.as_string().unwrap(), "c");
663    }
664
665    #[test]
666    fn test_lowcardinality_slice_memory_efficiency() {
667        use crate::column::column_value::ColumnValue;
668
669        let lc_type = Type::LowCardinality {
670            nested_type: Box::new(Type::Simple(TypeCode::String)),
671        };
672
673        let mut col = ColumnLowCardinality::new(lc_type);
674
675        // Add 1000 unique values
676        for i in 0..1000 {
677            col.append_unsafe(&ColumnValue::from_string(&format!(
678                "value_{}",
679                i
680            )))
681            .unwrap();
682        }
683
684        assert_eq!(col.dictionary_size(), 1000);
685
686        // Slice only the first 10 items
687        let sliced = col.slice(0, 10).unwrap();
688        let sliced_col =
689            sliced.as_any().downcast_ref::<ColumnLowCardinality>().unwrap();
690
691        assert_eq!(sliced_col.len(), 10);
692        // Dictionary should be compacted to only 10 items (not 1000!)
693        assert_eq!(
694            sliced_col.dictionary_size(),
695            10,
696            "Dictionary should be compacted to only referenced items"
697        );
698    }
699
700    #[test]
701    fn test_lowcardinality_slice_with_duplicates() {
702        use crate::column::column_value::ColumnValue;
703
704        let lc_type = Type::LowCardinality {
705            nested_type: Box::new(Type::Simple(TypeCode::String)),
706        };
707
708        let mut col = ColumnLowCardinality::new(lc_type);
709
710        // Add pattern: ["x", "y", "z", "x", "x", "z"]
711        col.append_unsafe(&ColumnValue::from_string("x")).unwrap();
712        col.append_unsafe(&ColumnValue::from_string("y")).unwrap();
713        col.append_unsafe(&ColumnValue::from_string("z")).unwrap();
714        col.append_unsafe(&ColumnValue::from_string("x")).unwrap();
715        col.append_unsafe(&ColumnValue::from_string("x")).unwrap();
716        col.append_unsafe(&ColumnValue::from_string("z")).unwrap();
717
718        assert_eq!(col.dictionary_size(), 3); // "x", "y", "z"
719
720        // Slice [3:3] = ["x", "x", "z"] - only uses 2 unique values
721        let sliced = col.slice(3, 3).unwrap();
722        let sliced_col =
723            sliced.as_any().downcast_ref::<ColumnLowCardinality>().unwrap();
724
725        assert_eq!(sliced_col.len(), 3);
726        assert_eq!(
727            sliced_col.dictionary_size(),
728            2,
729            "Only 'x' and 'z' should be in dictionary"
730        );
731
732        // Verify deduplication in sliced column
733        // Both "x" values should point to same dictionary entry
734        assert_eq!(
735            sliced_col.index_at(0),
736            sliced_col.index_at(1),
737            "Duplicate 'x' should use same index"
738        );
739    }
740
741    #[test]
742    fn test_lowcardinality_clear() {
743        let lc_type = Type::LowCardinality {
744            nested_type: Box::new(Type::Simple(TypeCode::String)),
745        };
746
747        let mut col = ColumnLowCardinality::new(lc_type);
748        col.indices = vec![0, 1, 2];
749
750        col.clear();
751        assert_eq!(col.len(), 0);
752        assert!(col.is_empty());
753    }
754
755    #[test]
756    fn test_lowcardinality_reserve() {
757        let lc_type = Type::LowCardinality {
758            nested_type: Box::new(Type::Simple(TypeCode::String)),
759        };
760
761        let mut col = ColumnLowCardinality::new(lc_type);
762
763        // Reserve for 10,000 rows
764        // Expected dictionary size ≈ sqrt(10000) = 100
765        col.reserve(10_000);
766
767        // Verify indices capacity increased
768        assert!(col.indices.capacity() >= 10_000);
769
770        // Add some data to verify reserve didn't break anything
771        use crate::column::column_value::ColumnValue;
772        col.append_unsafe(&ColumnValue::from_string("test")).unwrap();
773        assert_eq!(col.len(), 1);
774        assert_eq!(col.dictionary_size(), 1);
775    }
776
777    #[test]
778    fn test_lowcardinality_reserve_performance() {
779        use crate::column::column_value::ColumnValue;
780
781        let lc_type = Type::LowCardinality {
782            nested_type: Box::new(Type::Simple(TypeCode::String)),
783        };
784
785        // Test that pre-reserving improves performance
786        // (fewer reallocations during insertion)
787
788        let mut col_with_reserve = ColumnLowCardinality::new(lc_type.clone());
789        col_with_reserve.reserve(1000);
790
791        let mut col_without_reserve = ColumnLowCardinality::new(lc_type);
792
793        // Both should work correctly with or without reserve
794        for i in 0..100 {
795            let value = format!("value_{}", i % 10); // 10 unique values, repeated
796            col_with_reserve
797                .append_unsafe(&ColumnValue::from_string(&value))
798                .unwrap();
799            col_without_reserve
800                .append_unsafe(&ColumnValue::from_string(&value))
801                .unwrap();
802        }
803
804        assert_eq!(col_with_reserve.len(), 100);
805        assert_eq!(col_without_reserve.len(), 100);
806        assert_eq!(col_with_reserve.dictionary_size(), 10);
807        assert_eq!(col_without_reserve.dictionary_size(), 10);
808
809        // col_with_reserve should have pre-allocated capacity
810        assert!(col_with_reserve.indices.capacity() >= 1000);
811    }
812
813    #[test]
814    fn test_lowcardinality_save_load_roundtrip() {
815        use bytes::BytesMut;
816
817        // Create a LowCardinality(String) column
818        let lc_type = Type::LowCardinality {
819            nested_type: Box::new(Type::Simple(TypeCode::String)),
820        };
821
822        let mut col = ColumnLowCardinality::new(lc_type.clone());
823
824        // Add some test data with repeated values
825        use crate::column::column_value::ColumnValue;
826        col.append_unsafe(&ColumnValue::from_string("hello")).unwrap();
827        col.append_unsafe(&ColumnValue::from_string("world")).unwrap();
828        col.append_unsafe(&ColumnValue::from_string("hello")).unwrap(); // Duplicate
829        col.append_unsafe(&ColumnValue::from_string("test")).unwrap();
830        col.append_unsafe(&ColumnValue::from_string("world")).unwrap(); // Duplicate
831
832        // Verify initial state
833        assert_eq!(col.len(), 5);
834        assert_eq!(col.dictionary_size(), 3); // "hello", "world", "test"
835
836        // Save to buffer
837        let mut buffer = BytesMut::new();
838        col.save_prefix(&mut buffer).unwrap();
839        col.save_to_buffer(&mut buffer).unwrap();
840
841        // Verify buffer format (matching C++ protocol):
842        let mut read_buf = &buffer[..];
843        use bytes::Buf;
844
845        // 1. key_version (from save_prefix)
846        let key_version = read_buf.get_u64_le();
847        assert_eq!(key_version, 1, "key_version should be 1");
848
849        // 2. index_serialization_type (from save_to_buffer)
850        let index_serialization_type = read_buf.get_u64_le();
851        let index_type = index_serialization_type & 0xFF;
852        let has_additional_keys = (index_serialization_type & (1 << 9)) != 0;
853        assert_eq!(index_type, 3, "index_type should be 3 (UInt64)");
854        assert!(has_additional_keys, "HasAdditionalKeysBit should be set");
855
856        // 3. number_of_keys
857        let number_of_keys = read_buf.get_u64_le();
858        assert_eq!(
859            number_of_keys, 3,
860            "dictionary should have 3 unique values"
861        );
862
863        // Load into new column
864        let mut loaded_col = ColumnLowCardinality::new(lc_type);
865        let mut load_buf = &buffer[..];
866        loaded_col.load_prefix(&mut load_buf, 5).unwrap();
867        loaded_col.load_from_buffer(&mut load_buf, 5).unwrap();
868
869        // Verify loaded data
870        assert_eq!(loaded_col.len(), 5);
871        assert_eq!(loaded_col.dictionary_size(), 3);
872
873        // Verify indices match (deduplication preserved)
874        assert_eq!(loaded_col.index_at(0), col.index_at(0)); // "hello"
875        assert_eq!(loaded_col.index_at(1), col.index_at(1)); // "world"
876        assert_eq!(loaded_col.index_at(2), col.index_at(2)); // "hello" (same as 0)
877        assert_eq!(loaded_col.index_at(3), col.index_at(3)); // "test"
878        assert_eq!(loaded_col.index_at(4), col.index_at(4)); // "world" (same as 1)
879
880        // Verify duplicates point to same dictionary entry
881        assert_eq!(loaded_col.index_at(0), loaded_col.index_at(2));
882        assert_eq!(loaded_col.index_at(1), loaded_col.index_at(4));
883    }
884
885    #[test]
886    fn test_lowcardinality_nullable_save_format() {
887        use bytes::BytesMut;
888
889        // Create a LowCardinality(Nullable(String)) column
890        let lc_type = Type::LowCardinality {
891            nested_type: Box::new(Type::Nullable {
892                nested_type: Box::new(Type::Simple(TypeCode::String)),
893            }),
894        };
895
896        let mut col = ColumnLowCardinality::new(lc_type.clone());
897
898        // Add test data with nulls
899        use crate::column::column_value::ColumnValue;
900        col.append_unsafe(&ColumnValue::from_string("hello")).unwrap();
901        col.append_unsafe(&ColumnValue::void()).unwrap(); // null value
902        col.append_unsafe(&ColumnValue::from_string("world")).unwrap();
903
904        assert_eq!(col.len(), 3);
905
906        // Save to buffer
907        let mut buffer = BytesMut::new();
908        col.save_prefix(&mut buffer).unwrap();
909        col.save_to_buffer(&mut buffer).unwrap();
910
911        // The key point: for Nullable dictionaries, only nested data is saved
912        // (verified by checking buffer structure matches C++ protocol)
913        assert!(!buffer.is_empty(), "Buffer should contain data");
914
915        // Verify buffer starts with correct key_version
916        use bytes::Buf;
917        let mut read_buf = &buffer[..];
918        let key_version = read_buf.get_u64_le();
919        assert_eq!(key_version, 1, "key_version should be 1");
920
921        // Full round-trip testing for Nullable LowCardinality is complex
922        // due to the nested save format. The integration tests cover this.
923    }
924}