1use std::collections::HashMap;
7use std::fmt;
8
9use crate::error::{IoError, Result};
10
/// Eight-byte magic value: the ASCII bytes `SCIRCOL` followed by `0x01`.
///
/// NOTE(review): presumably written at the start of a columnar file so
/// readers can recognise the format — confirm against the reader/writer code.
pub const COLUMNAR_MAGIC: &[u8; 8] = b"SCIRCOL\x01";

/// Version number of the columnar format (currently `1`).
///
/// NOTE(review): presumably stored alongside the magic bytes in the file
/// header — confirm against the reader/writer code.
pub const FORMAT_VERSION: u32 = 1;
16
/// One-byte tag identifying the element type of a column.
///
/// The explicit discriminants are the `u8` values accepted by the
/// `TryFrom<u8>` implementation for this type, so they must not be renumbered.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ColumnTypeTag {
    /// 64-bit floating-point values (tag `0`).
    Float64 = 0,
    /// 64-bit signed integer values (tag `1`).
    Int64 = 1,
    /// String values (tag `2`).
    Str = 2,
    /// Boolean values (tag `3`).
    Bool = 3,
}
30
31impl TryFrom<u8> for ColumnTypeTag {
32 type Error = IoError;
33
34 fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
35 match value {
36 0 => Ok(ColumnTypeTag::Float64),
37 1 => Ok(ColumnTypeTag::Int64),
38 2 => Ok(ColumnTypeTag::Str),
39 3 => Ok(ColumnTypeTag::Bool),
40 _ => Err(IoError::FormatError(format!(
41 "Unknown column type tag: {}",
42 value
43 ))),
44 }
45 }
46}
47
/// One-byte tag naming the encoding applied to a column.
///
/// The explicit discriminants are the `u8` values accepted by the
/// `TryFrom<u8>` implementation for this type, so they must not be renumbered.
#[repr(u8)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EncodingType {
    /// Plain encoding (tag `0`).
    Plain = 0,
    /// Run-length encoding (tag `1`).
    Rle = 1,
    /// Dictionary encoding (tag `2`).
    Dictionary = 2,
    /// Delta encoding (tag `3`).
    Delta = 3,
}
61
62impl TryFrom<u8> for EncodingType {
63 type Error = IoError;
64
65 fn try_from(value: u8) -> std::result::Result<Self, Self::Error> {
66 match value {
67 0 => Ok(EncodingType::Plain),
68 1 => Ok(EncodingType::Rle),
69 2 => Ok(EncodingType::Dictionary),
70 3 => Ok(EncodingType::Delta),
71 _ => Err(IoError::FormatError(format!(
72 "Unknown encoding type: {}",
73 value
74 ))),
75 }
76 }
77}
78
/// Owned values of a single column; each variant holds a homogeneous vector.
#[derive(Clone, Debug)]
pub enum ColumnData {
    /// 64-bit floating-point values.
    Float64(Vec<f64>),
    /// 64-bit signed integer values.
    Int64(Vec<i64>),
    /// Owned string values.
    Str(Vec<String>),
    /// Boolean values.
    Bool(Vec<bool>),
}
91
92impl ColumnData {
93 pub fn len(&self) -> usize {
95 match self {
96 ColumnData::Float64(v) => v.len(),
97 ColumnData::Int64(v) => v.len(),
98 ColumnData::Str(v) => v.len(),
99 ColumnData::Bool(v) => v.len(),
100 }
101 }
102
103 pub fn is_empty(&self) -> bool {
105 self.len() == 0
106 }
107
108 pub fn type_tag(&self) -> ColumnTypeTag {
110 match self {
111 ColumnData::Float64(_) => ColumnTypeTag::Float64,
112 ColumnData::Int64(_) => ColumnTypeTag::Int64,
113 ColumnData::Str(_) => ColumnTypeTag::Str,
114 ColumnData::Bool(_) => ColumnTypeTag::Bool,
115 }
116 }
117
118 pub fn as_f64(&self) -> Result<&[f64]> {
120 match self {
121 ColumnData::Float64(v) => Ok(v),
122 _ => Err(IoError::ConversionError(format!(
123 "Column is {:?}, not Float64",
124 self.type_tag()
125 ))),
126 }
127 }
128
129 pub fn as_i64(&self) -> Result<&[i64]> {
131 match self {
132 ColumnData::Int64(v) => Ok(v),
133 _ => Err(IoError::ConversionError(format!(
134 "Column is {:?}, not Int64",
135 self.type_tag()
136 ))),
137 }
138 }
139
140 pub fn as_str(&self) -> Result<&[String]> {
142 match self {
143 ColumnData::Str(v) => Ok(v),
144 _ => Err(IoError::ConversionError(format!(
145 "Column is {:?}, not Str",
146 self.type_tag()
147 ))),
148 }
149 }
150
151 pub fn as_bool(&self) -> Result<&[bool]> {
153 match self {
154 ColumnData::Bool(v) => Ok(v),
155 _ => Err(IoError::ConversionError(format!(
156 "Column is {:?}, not Bool",
157 self.type_tag()
158 ))),
159 }
160 }
161
162 pub fn best_encoding(&self) -> EncodingType {
164 match self {
165 ColumnData::Float64(v) => {
166 if is_sorted_f64(v) {
167 EncodingType::Delta
168 } else if has_runs_f64(v) {
169 EncodingType::Rle
170 } else {
171 EncodingType::Plain
172 }
173 }
174 ColumnData::Int64(v) => {
175 if is_sorted_i64(v) {
176 EncodingType::Delta
177 } else if has_runs_i64(v) {
178 EncodingType::Rle
179 } else {
180 EncodingType::Plain
181 }
182 }
183 ColumnData::Str(v) => {
184 let unique_count = count_unique_strings(v);
185 if unique_count < v.len() / 2 {
186 EncodingType::Dictionary
187 } else if has_runs_str(v) {
188 EncodingType::Rle
189 } else {
190 EncodingType::Plain
191 }
192 }
193 ColumnData::Bool(v) => {
194 if has_runs_bool(v) {
195 EncodingType::Rle
196 } else {
197 EncodingType::Plain
198 }
199 }
200 }
201 }
202}
203
204impl fmt::Display for ColumnData {
205 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
206 match self {
207 ColumnData::Float64(v) => write!(f, "Float64[{}]", v.len()),
208 ColumnData::Int64(v) => write!(f, "Int64[{}]", v.len()),
209 ColumnData::Str(v) => write!(f, "Str[{}]", v.len()),
210 ColumnData::Bool(v) => write!(f, "Bool[{}]", v.len()),
211 }
212 }
213}
214
215#[derive(Debug, Clone)]
217pub struct Column {
218 pub name: String,
220 pub data: ColumnData,
222}
223
224impl Column {
225 pub fn float64(name: impl Into<String>, data: Vec<f64>) -> Self {
227 Column {
228 name: name.into(),
229 data: ColumnData::Float64(data),
230 }
231 }
232
233 pub fn int64(name: impl Into<String>, data: Vec<i64>) -> Self {
235 Column {
236 name: name.into(),
237 data: ColumnData::Int64(data),
238 }
239 }
240
241 pub fn string(name: impl Into<String>, data: Vec<String>) -> Self {
243 Column {
244 name: name.into(),
245 data: ColumnData::Str(data),
246 }
247 }
248
249 pub fn boolean(name: impl Into<String>, data: Vec<bool>) -> Self {
251 Column {
252 name: name.into(),
253 data: ColumnData::Bool(data),
254 }
255 }
256
257 pub fn len(&self) -> usize {
259 self.data.len()
260 }
261
262 pub fn is_empty(&self) -> bool {
264 self.data.is_empty()
265 }
266}
267
268#[derive(Debug, Clone)]
270pub struct ColumnarTable {
271 columns: Vec<Column>,
273 index: HashMap<String, usize>,
275}
276
277impl ColumnarTable {
278 pub fn new() -> Self {
280 ColumnarTable {
281 columns: Vec::new(),
282 index: HashMap::new(),
283 }
284 }
285
286 pub fn from_columns(columns: Vec<Column>) -> Result<Self> {
288 if !columns.is_empty() {
290 let expected_len = columns[0].len();
291 for col in &columns[1..] {
292 if col.len() != expected_len {
293 return Err(IoError::FormatError(format!(
294 "Column '{}' has {} rows, expected {}",
295 col.name,
296 col.len(),
297 expected_len
298 )));
299 }
300 }
301 }
302
303 let mut index = HashMap::new();
304 for (i, col) in columns.iter().enumerate() {
305 if index.contains_key(&col.name) {
306 return Err(IoError::FormatError(format!(
307 "Duplicate column name: '{}'",
308 col.name
309 )));
310 }
311 index.insert(col.name.clone(), i);
312 }
313
314 Ok(ColumnarTable { columns, index })
315 }
316
317 pub fn add_column(&mut self, column: Column) -> Result<()> {
319 if !self.columns.is_empty() && column.len() != self.num_rows() {
320 return Err(IoError::FormatError(format!(
321 "Column '{}' has {} rows, expected {}",
322 column.name,
323 column.len(),
324 self.num_rows()
325 )));
326 }
327 if self.index.contains_key(&column.name) {
328 return Err(IoError::FormatError(format!(
329 "Duplicate column name: '{}'",
330 column.name
331 )));
332 }
333 let idx = self.columns.len();
334 self.index.insert(column.name.clone(), idx);
335 self.columns.push(column);
336 Ok(())
337 }
338
339 pub fn num_rows(&self) -> usize {
341 self.columns.first().map(|c| c.len()).unwrap_or(0)
342 }
343
344 pub fn num_columns(&self) -> usize {
346 self.columns.len()
347 }
348
349 pub fn column_names(&self) -> Vec<&str> {
351 self.columns.iter().map(|c| c.name.as_str()).collect()
352 }
353
354 pub fn column(&self, name: &str) -> Result<&Column> {
356 self.index
357 .get(name)
358 .map(|&idx| &self.columns[idx])
359 .ok_or_else(|| IoError::NotFound(format!("Column '{}' not found", name)))
360 }
361
362 pub fn column_by_index(&self, idx: usize) -> Result<&Column> {
364 self.columns
365 .get(idx)
366 .ok_or_else(|| IoError::NotFound(format!("Column index {} out of range", idx)))
367 }
368
369 pub fn columns(&self) -> &[Column] {
371 &self.columns
372 }
373
374 pub fn get_f64(&self, name: &str) -> Result<&[f64]> {
376 self.column(name)?.data.as_f64()
377 }
378
379 pub fn get_i64(&self, name: &str) -> Result<&[i64]> {
381 self.column(name)?.data.as_i64()
382 }
383
384 pub fn get_str(&self, name: &str) -> Result<&[String]> {
386 self.column(name)?.data.as_str()
387 }
388
389 pub fn get_bool(&self, name: &str) -> Result<&[bool]> {
391 self.column(name)?.data.as_bool()
392 }
393}
394
395impl Default for ColumnarTable {
396 fn default() -> Self {
397 Self::new()
398 }
399}
400
/// Returns `true` when `data` is in non-decreasing order.
///
/// Slices with fewer than two elements count as sorted. Any `NaN` next to
/// another element makes the result `false`, because `<=` is false for `NaN`.
fn is_sorted_f64(data: &[f64]) -> bool {
    // Pair every element with its successor; an empty pairing is sorted.
    data.iter()
        .zip(data.iter().skip(1))
        .all(|(current, next)| current <= next)
}
409
/// Returns `true` when `data` is in non-decreasing order.
///
/// Slices with fewer than two elements count as sorted.
fn is_sorted_i64(data: &[i64]) -> bool {
    let mut values = data.iter();
    let mut previous = match values.next() {
        Some(first) => first,
        None => return true,
    };
    for current in values {
        // A strict decrease anywhere means the slice is not sorted.
        if previous > current {
            return false;
        }
        previous = current;
    }
    true
}
416
/// Shared run-detection heuristic behind the typed `has_runs_*` wrappers.
///
/// A *run* is a maximal stretch of two or more consecutive equal elements.
/// Returns `true` when at least 20% of the elements belong to such runs.
/// Slices with fewer than four elements always yield `false`.
///
/// Elements are counted rather than runs, so data made of a few long runs —
/// where run-length encoding pays off most — is not mistaken for run-free
/// data. Every input with at least one run per five elements still qualifies.
fn has_significant_runs<T: PartialEq>(data: &[T]) -> bool {
    if data.len() < 4 {
        return false;
    }

    let mut elements_in_runs: usize = 0;
    let mut start = 0;
    while start < data.len() {
        let head = &data[start];
        // Length of the maximal stretch of values equal to `head`.
        let run_len = 1 + data[start + 1..]
            .iter()
            .take_while(|item| *item == head)
            .count();
        if run_len > 1 {
            elements_in_runs += run_len;
        }
        start += run_len;
    }

    // `elements_in_runs * 5 >= len`; saturating so huge slices cannot wrap.
    elements_in_runs.saturating_mul(5) >= data.len()
}

/// Returns `true` when at least 20% of `data` lies in runs of equal values.
///
/// `NaN` never equals itself, so `NaN` values never form a run.
fn has_runs_f64(data: &[f64]) -> bool {
    has_significant_runs(data)
}

/// Returns `true` when at least 20% of `data` lies in runs of equal values.
fn has_runs_i64(data: &[i64]) -> bool {
    has_significant_runs(data)
}

/// Returns `true` when at least 20% of `data` lies in runs of equal strings.
fn has_runs_str(data: &[String]) -> bool {
    has_significant_runs(data)
}

/// Returns `true` when at least 20% of `data` lies in runs of equal values.
fn has_runs_bool(data: &[bool]) -> bool {
    has_significant_runs(data)
}
497
/// Returns the number of distinct strings in `data`.
fn count_unique_strings(data: &[String]) -> usize {
    // Collect borrowed `&str`s so no string is cloned.
    data.iter()
        .map(String::as_str)
        .collect::<std::collections::HashSet<&str>>()
        .len()
}