Skip to main content

nodedb_columnar/memtable/column_data/
dict_encode.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Dictionary encoding conversion for `ColumnData`.
4
5use super::types::ColumnData;
6
7impl ColumnData {
8    /// Attempt to convert a `String` column to `DictEncoded`.
9    pub fn try_dict_encode(col: &ColumnData, max_cardinality: u32) -> Option<ColumnData> {
10        let (data, offsets, valid) = match col {
11            ColumnData::String {
12                data,
13                offsets,
14                valid,
15            } => (data, offsets, valid),
16            _ => return None,
17        };
18
19        let row_count = col.len();
20        let mut dictionary: Vec<String> = Vec::new();
21        let mut reverse: std::collections::HashMap<String, u32> = std::collections::HashMap::new();
22        // no-governor: cold memtable dictionary encode; row_count = memtable size, governed at flush call site
23        let mut ids: Vec<u32> = Vec::with_capacity(row_count);
24
25        for i in 0..row_count {
26            if valid.as_ref().is_some_and(|v| !v[i]) {
27                ids.push(0);
28                continue;
29            }
30            let start = offsets[i] as usize;
31            let end = offsets[i + 1] as usize;
32            let s = match std::str::from_utf8(&data[start..end]) {
33                Ok(s) => s,
34                Err(_) => return None,
35            };
36            let id = if let Some(&existing) = reverse.get(s) {
37                existing
38            } else {
39                if dictionary.len() as u32 >= max_cardinality {
40                    return None;
41                }
42                let new_id = dictionary.len() as u32;
43                dictionary.push(s.to_string());
44                reverse.insert(s.to_string(), new_id);
45                new_id
46            };
47            ids.push(id);
48        }
49
50        Some(ColumnData::DictEncoded {
51            ids,
52            dictionary,
53            reverse,
54            valid: valid.clone(),
55        })
56    }
57}