Skip to main content

scirs2_io/columnar/
types.rs

1//! Column types and data structures for the columnar format.
2//!
3//! Defines the core types for column-oriented storage including
4//! column data variants, encoding strategies, and table structures.
5
6use std::collections::HashMap;
7use std::fmt;
8
9use crate::error::{IoError, Result};
10
/// Magic bytes identifying the columnar format file.
///
/// Eight bytes long: the ASCII text `SCIRCOL` followed by the single byte `0x01`.
pub const COLUMNAR_MAGIC: &[u8; 8] = b"SCIRCOL\x01";

/// Current format version number.
pub const FORMAT_VERSION: u32 = 1;
16
17/// Column data type tag stored in the file header
18#[derive(Debug, Clone, Copy, PartialEq, Eq)]
19#[repr(u8)]
20pub enum ColumnTypeTag {
21    /// 64-bit floating point
22    Float64 = 0,
23    /// 64-bit signed integer
24    Int64 = 1,
25    /// UTF-8 string
26    Str = 2,
27    /// Boolean
28    Bool = 3,
29}
30
31impl TryFrom<u8> for ColumnTypeTag {
32    type Error = IoError;
33
34    fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
35        match value {
36            0 => Ok(ColumnTypeTag::Float64),
37            1 => Ok(ColumnTypeTag::Int64),
38            2 => Ok(ColumnTypeTag::Str),
39            3 => Ok(ColumnTypeTag::Bool),
40            _ => Err(IoError::FormatError(format!(
41                "Unknown column type tag: {}",
42                value
43            ))),
44        }
45    }
46}
47
48/// Encoding strategy for a column
49#[derive(Debug, Clone, Copy, PartialEq, Eq)]
50#[repr(u8)]
51pub enum EncodingType {
52    /// Plain (no encoding)
53    Plain = 0,
54    /// Run-length encoding
55    Rle = 1,
56    /// Dictionary encoding
57    Dictionary = 2,
58    /// Delta encoding (for sorted numeric columns)
59    Delta = 3,
60}
61
62impl TryFrom<u8> for EncodingType {
63    type Error = IoError;
64
65    fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
66        match value {
67            0 => Ok(EncodingType::Plain),
68            1 => Ok(EncodingType::Rle),
69            2 => Ok(EncodingType::Dictionary),
70            3 => Ok(EncodingType::Delta),
71            _ => Err(IoError::FormatError(format!(
72                "Unknown encoding type: {}",
73                value
74            ))),
75        }
76    }
77}
78
79/// A single column's data
80#[derive(Debug, Clone)]
81pub enum ColumnData {
82    /// 64-bit floating point values
83    Float64(Vec<f64>),
84    /// 64-bit signed integer values
85    Int64(Vec<i64>),
86    /// UTF-8 string values
87    Str(Vec<String>),
88    /// Boolean values
89    Bool(Vec<bool>),
90}
91
92impl ColumnData {
93    /// Returns the number of values in this column
94    pub fn len(&self) -> usize {
95        match self {
96            ColumnData::Float64(v) => v.len(),
97            ColumnData::Int64(v) => v.len(),
98            ColumnData::Str(v) => v.len(),
99            ColumnData::Bool(v) => v.len(),
100        }
101    }
102
103    /// Returns true if the column is empty
104    pub fn is_empty(&self) -> bool {
105        self.len() == 0
106    }
107
108    /// Returns the column type tag
109    pub fn type_tag(&self) -> ColumnTypeTag {
110        match self {
111            ColumnData::Float64(_) => ColumnTypeTag::Float64,
112            ColumnData::Int64(_) => ColumnTypeTag::Int64,
113            ColumnData::Str(_) => ColumnTypeTag::Str,
114            ColumnData::Bool(_) => ColumnTypeTag::Bool,
115        }
116    }
117
118    /// Try to get f64 data
119    pub fn as_f64(&self) -> Result<&[f64]> {
120        match self {
121            ColumnData::Float64(v) => Ok(v),
122            _ => Err(IoError::ConversionError(format!(
123                "Column is {:?}, not Float64",
124                self.type_tag()
125            ))),
126        }
127    }
128
129    /// Try to get i64 data
130    pub fn as_i64(&self) -> Result<&[i64]> {
131        match self {
132            ColumnData::Int64(v) => Ok(v),
133            _ => Err(IoError::ConversionError(format!(
134                "Column is {:?}, not Int64",
135                self.type_tag()
136            ))),
137        }
138    }
139
140    /// Try to get string data
141    pub fn as_str(&self) -> Result<&[String]> {
142        match self {
143            ColumnData::Str(v) => Ok(v),
144            _ => Err(IoError::ConversionError(format!(
145                "Column is {:?}, not Str",
146                self.type_tag()
147            ))),
148        }
149    }
150
151    /// Try to get bool data
152    pub fn as_bool(&self) -> Result<&[bool]> {
153        match self {
154            ColumnData::Bool(v) => Ok(v),
155            _ => Err(IoError::ConversionError(format!(
156                "Column is {:?}, not Bool",
157                self.type_tag()
158            ))),
159        }
160    }
161
162    /// Determine best encoding for this column's data
163    pub fn best_encoding(&self) -> EncodingType {
164        match self {
165            ColumnData::Float64(v) => {
166                if is_sorted_f64(v) {
167                    EncodingType::Delta
168                } else if has_runs_f64(v) {
169                    EncodingType::Rle
170                } else {
171                    EncodingType::Plain
172                }
173            }
174            ColumnData::Int64(v) => {
175                if is_sorted_i64(v) {
176                    EncodingType::Delta
177                } else if has_runs_i64(v) {
178                    EncodingType::Rle
179                } else {
180                    EncodingType::Plain
181                }
182            }
183            ColumnData::Str(v) => {
184                let unique_count = count_unique_strings(v);
185                if unique_count < v.len() / 2 {
186                    EncodingType::Dictionary
187                } else if has_runs_str(v) {
188                    EncodingType::Rle
189                } else {
190                    EncodingType::Plain
191                }
192            }
193            ColumnData::Bool(v) => {
194                if has_runs_bool(v) {
195                    EncodingType::Rle
196                } else {
197                    EncodingType::Plain
198                }
199            }
200        }
201    }
202}
203
204impl fmt::Display for ColumnData {
205    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
206        match self {
207            ColumnData::Float64(v) => write!(f, "Float64[{}]", v.len()),
208            ColumnData::Int64(v) => write!(f, "Int64[{}]", v.len()),
209            ColumnData::Str(v) => write!(f, "Str[{}]", v.len()),
210            ColumnData::Bool(v) => write!(f, "Bool[{}]", v.len()),
211        }
212    }
213}
214
215/// A named column in a table
216#[derive(Debug, Clone)]
217pub struct Column {
218    /// Column name
219    pub name: String,
220    /// Column data
221    pub data: ColumnData,
222}
223
224impl Column {
225    /// Create a new column with f64 data
226    pub fn float64(name: impl Into<String>, data: Vec<f64>) -> Self {
227        Column {
228            name: name.into(),
229            data: ColumnData::Float64(data),
230        }
231    }
232
233    /// Create a new column with i64 data
234    pub fn int64(name: impl Into<String>, data: Vec<i64>) -> Self {
235        Column {
236            name: name.into(),
237            data: ColumnData::Int64(data),
238        }
239    }
240
241    /// Create a new column with string data
242    pub fn string(name: impl Into<String>, data: Vec<String>) -> Self {
243        Column {
244            name: name.into(),
245            data: ColumnData::Str(data),
246        }
247    }
248
249    /// Create a new column with bool data
250    pub fn boolean(name: impl Into<String>, data: Vec<bool>) -> Self {
251        Column {
252            name: name.into(),
253            data: ColumnData::Bool(data),
254        }
255    }
256
257    /// Returns the length (number of rows) of this column
258    pub fn len(&self) -> usize {
259        self.data.len()
260    }
261
262    /// Returns true if column is empty
263    pub fn is_empty(&self) -> bool {
264        self.data.is_empty()
265    }
266}
267
268/// A table containing multiple named columns
269#[derive(Debug, Clone)]
270pub struct ColumnarTable {
271    /// Columns in order
272    columns: Vec<Column>,
273    /// Name-to-index lookup
274    index: HashMap<String, usize>,
275}
276
277impl ColumnarTable {
278    /// Create a new empty table
279    pub fn new() -> Self {
280        ColumnarTable {
281            columns: Vec::new(),
282            index: HashMap::new(),
283        }
284    }
285
286    /// Create a table from a list of columns
287    pub fn from_columns(columns: Vec<Column>) -> Result<Self> {
288        // Validate all columns have same length
289        if !columns.is_empty() {
290            let expected_len = columns[0].len();
291            for col in &columns[1..] {
292                if col.len() != expected_len {
293                    return Err(IoError::FormatError(format!(
294                        "Column '{}' has {} rows, expected {}",
295                        col.name,
296                        col.len(),
297                        expected_len
298                    )));
299                }
300            }
301        }
302
303        let mut index = HashMap::new();
304        for (i, col) in columns.iter().enumerate() {
305            if index.contains_key(&col.name) {
306                return Err(IoError::FormatError(format!(
307                    "Duplicate column name: '{}'",
308                    col.name
309                )));
310            }
311            index.insert(col.name.clone(), i);
312        }
313
314        Ok(ColumnarTable { columns, index })
315    }
316
317    /// Add a column to the table
318    pub fn add_column(&mut self, column: Column) -> Result<()> {
319        if !self.columns.is_empty() && column.len() != self.num_rows() {
320            return Err(IoError::FormatError(format!(
321                "Column '{}' has {} rows, expected {}",
322                column.name,
323                column.len(),
324                self.num_rows()
325            )));
326        }
327        if self.index.contains_key(&column.name) {
328            return Err(IoError::FormatError(format!(
329                "Duplicate column name: '{}'",
330                column.name
331            )));
332        }
333        let idx = self.columns.len();
334        self.index.insert(column.name.clone(), idx);
335        self.columns.push(column);
336        Ok(())
337    }
338
339    /// Number of rows in the table
340    pub fn num_rows(&self) -> usize {
341        self.columns.first().map(|c| c.len()).unwrap_or(0)
342    }
343
344    /// Number of columns
345    pub fn num_columns(&self) -> usize {
346        self.columns.len()
347    }
348
349    /// Get column names in order
350    pub fn column_names(&self) -> Vec<&str> {
351        self.columns.iter().map(|c| c.name.as_str()).collect()
352    }
353
354    /// Get a column by name
355    pub fn column(&self, name: &str) -> Result<&Column> {
356        self.index
357            .get(name)
358            .map(|&idx| &self.columns[idx])
359            .ok_or_else(|| IoError::NotFound(format!("Column '{}' not found", name)))
360    }
361
362    /// Get a column by index
363    pub fn column_by_index(&self, idx: usize) -> Result<&Column> {
364        self.columns
365            .get(idx)
366            .ok_or_else(|| IoError::NotFound(format!("Column index {} out of range", idx)))
367    }
368
369    /// Get all columns as a slice
370    pub fn columns(&self) -> &[Column] {
371        &self.columns
372    }
373
374    /// Get f64 column data by name
375    pub fn get_f64(&self, name: &str) -> Result<&[f64]> {
376        self.column(name)?.data.as_f64()
377    }
378
379    /// Get i64 column data by name
380    pub fn get_i64(&self, name: &str) -> Result<&[i64]> {
381        self.column(name)?.data.as_i64()
382    }
383
384    /// Get string column data by name
385    pub fn get_str(&self, name: &str) -> Result<&[String]> {
386        self.column(name)?.data.as_str()
387    }
388
389    /// Get bool column data by name
390    pub fn get_bool(&self, name: &str) -> Result<&[bool]> {
391        self.column(name)?.data.as_bool()
392    }
393}
394
395impl Default for ColumnarTable {
396    fn default() -> Self {
397        Self::new()
398    }
399}
400
401// Helper functions for encoding detection
402
/// Reports whether `data` is in non-decreasing order.
///
/// Slices with fewer than two elements count as sorted. Any adjacent pair
/// involving NaN fails the `<=` comparison, so such data is reported as
/// unsorted.
fn is_sorted_f64(data: &[f64]) -> bool {
    data.iter()
        .zip(data.iter().skip(1))
        .all(|(prev, next)| prev <= next)
}
409
/// Reports whether `data` is in non-decreasing order.
///
/// Slices with fewer than two elements count as sorted.
fn is_sorted_i64(data: &[i64]) -> bool {
    data.iter()
        .zip(data.iter().skip(1))
        .all(|(prev, next)| prev <= next)
}
416
/// Shared run-detection heuristic behind the typed `has_runs_*` helpers.
///
/// The slice is split into maximal groups of consecutive equal elements; a
/// group with more than one element is a "run". Run-length encoding is
/// reported as worthwhile when at least 20% of the groups are runs. Inputs
/// with fewer than four elements are never reported as worthwhile.
///
/// Elements are compared with `PartialEq`, so a `f64::NAN` never extends a run.
fn runs_are_worthwhile<T: PartialEq>(data: &[T]) -> bool {
    if data.len() < 4 {
        return false;
    }

    let mut group_count = 0usize;
    let mut run_count = 0usize;
    let mut rest = data;
    while let Some((head, tail)) = rest.split_first() {
        // Number of elements directly after `head` that repeat it.
        let repeats = tail.iter().take_while(|item| *item == head).count();
        group_count += 1;
        if repeats > 0 {
            run_count += 1;
        }
        rest = &tail[repeats..];
    }

    // Compare against the number of groups, not the number of elements: a
    // long constant column is a single group and must still qualify.
    run_count * 5 >= group_count
}

/// Whether run-length encoding looks worthwhile for `f64` data.
///
/// True when `data` has at least four elements and at least 20% of its groups
/// of consecutive equal values contain more than one element.
fn has_runs_f64(data: &[f64]) -> bool {
    runs_are_worthwhile(data)
}

/// Whether run-length encoding looks worthwhile for `i64` data.
///
/// True when `data` has at least four elements and at least 20% of its groups
/// of consecutive equal values contain more than one element.
fn has_runs_i64(data: &[i64]) -> bool {
    runs_are_worthwhile(data)
}

/// Whether run-length encoding looks worthwhile for string data.
///
/// True when `data` has at least four elements and at least 20% of its groups
/// of consecutive equal values contain more than one element.
fn has_runs_str(data: &[String]) -> bool {
    runs_are_worthwhile(data)
}

/// Whether run-length encoding looks worthwhile for boolean data.
///
/// True when `data` has at least four elements and at least 20% of its groups
/// of consecutive equal values contain more than one element.
fn has_runs_bool(data: &[bool]) -> bool {
    runs_are_worthwhile(data)
}
497
/// Counts the distinct strings in `data`.
///
/// The strings are borrowed into a hash set, so nothing is cloned.
fn count_unique_strings(data: &[String]) -> usize {
    data.iter()
        .map(String::as_str)
        .collect::<std::collections::HashSet<&str>>()
        .len()
}