Skip to main content

reifydb_column/
compress.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2025 ReifyDB
3
4use std::sync::Arc;
5
6use reifydb_core::value::column::{
7	array::{Column, canonical::Canonical},
8	encoding::EncodingId,
9};
10use reifydb_type::Result;
11
12use crate::encoding::{self, Encoding};
13
/// Tuning knobs handed to every candidate encoding during compression.
///
/// In this file the struct is only constructed and passed by reference to
/// `Encoding::try_compress`; how each field is interpreted is decided by the
/// individual encodings.
#[derive(Debug, Clone)]
pub struct CompressConfig {
	/// NOTE(review): presumably the number of values in one sample - confirm
	/// against the encodings that read it. `Default` sets this to 1024.
	pub sample_size: usize,
	/// NOTE(review): presumably how many samples are drawn per input - confirm
	/// against the encodings that read it. `Default` sets this to 4.
	pub sample_count: usize,
	/// NOTE(review): presumably a limit on nested/cascaded encodings - confirm
	/// against the encodings that read it. `Default` sets this to 3.
	pub max_depth: u8,
	/// NOTE(review): presumably the compressed/original size ratio an encoding
	/// must reach to be accepted - confirm the direction of the comparison in
	/// the encodings that read it. `Default` sets this to 0.85.
	pub min_compression_ratio: f32,
}
21
22impl Default for CompressConfig {
23	fn default() -> Self {
24		Self {
25			sample_size: 1024,
26			sample_count: 4,
27			max_depth: 3,
28			min_compression_ratio: 0.85,
29		}
30	}
31}
32
33pub struct Compressor {
34	candidates: Vec<Arc<dyn Encoding>>,
35	cfg: CompressConfig,
36}
37
38impl Compressor {
39	pub fn new(cfg: CompressConfig) -> Self {
40		let registry = encoding::global();
41		let order = [
42			EncodingId::CANONICAL_BOOL, // canonical always last via fallback
43			EncodingId::CONSTANT,
44			EncodingId::ALL_NONE,
45			EncodingId::DICT,
46			EncodingId::RLE,
47			EncodingId::DELTA,
48			EncodingId::DELTA_RLE,
49			EncodingId::FOR,
50			EncodingId::BITPACK,
51			EncodingId::SPARSE,
52		];
53		// Start with compressed candidates in a fixed order; the canonical fallback
54		// happens outside the candidate loop, so we skip canonical ids here.
55		let candidates = order
56			.into_iter()
57			.filter(|id| {
58				!matches!(
59					*id,
60					EncodingId::CANONICAL_BOOL
61						| EncodingId::CANONICAL_FIXED | EncodingId::CANONICAL_VARLEN
62						| EncodingId::CANONICAL_BIGNUM
63				)
64			})
65			.filter_map(|id| registry.get(id).cloned())
66			.collect();
67		Self {
68			candidates,
69			cfg,
70		}
71	}
72
73	pub fn compress(&self, input: &Canonical) -> Result<Column> {
74		for candidate in &self.candidates {
75			if let Some(compressed) = candidate.try_compress(input, &self.cfg)? {
76				return Ok(compressed);
77			}
78		}
79		Ok(Column::from_canonical(input.clone()))
80	}
81}
82
83pub fn compress(input: &Canonical) -> Result<Column> {
84	Compressor::new(CompressConfig::default()).compress(input)
85}
86
#[cfg(test)]
mod tests {
	use reifydb_core::value::column::buffer::ColumnBuffer;

	use super::*;

	/// A small int4 column is expected to come back with the canonical
	/// fixed-width encoding and its length intact.
	#[test]
	fn compress_falls_back_to_canonical_when_no_stub_applies() {
		let buffer = ColumnBuffer::int4([1i32, 2, 3, 4]);
		let canonical = Canonical::from_column_buffer(&buffer).unwrap();

		let column = compress(&canonical).unwrap();

		assert_eq!(column.encoding(), EncodingId::CANONICAL_FIXED);
		assert_eq!(column.len(), 4);
	}

	/// A small utf8 column is expected to come back with the canonical
	/// variable-length encoding and its length intact.
	#[test]
	fn compress_utf8_falls_back_to_canonical_varlen() {
		let buffer = ColumnBuffer::utf8(["alpha", "bravo"]);
		let canonical = Canonical::from_column_buffer(&buffer).unwrap();

		let column = compress(&canonical).unwrap();

		assert_eq!(column.encoding(), EncodingId::CANONICAL_VARLEN);
		assert_eq!(column.len(), 2);
	}
}