Skip to main content

reifydb_column/
compress.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2025 ReifyDB
3
4use std::sync::Arc;
5
6use reifydb_core::value::column::{
7	data::{Column, canonical::Canonical},
8	encoding::EncodingId,
9};
10use reifydb_type::Result;
11
12use crate::encoding::{self, Encoding};
13
/// Tuning parameters for compression.
///
/// This file only stores these values and forwards them to each encoding's
/// `try_compress`; it never reads an individual field itself.
/// NOTE(review): the exact meaning of every field is decided by the `Encoding`
/// implementations — confirm the per-field notes below against them.
#[derive(Debug, Clone)]
pub struct CompressConfig {
	/// Size of a single sample (default `1024`).
	/// Presumably counted in values — TODO confirm.
	pub sample_size: usize,
	/// Number of samples (default `4`).
	/// Presumably how many samples are drawn per column — TODO confirm.
	pub sample_count: usize,
	/// Depth limit (default `3`).
	/// Presumably bounds nested/cascaded encodings — TODO confirm.
	pub max_depth: u8,
	/// Ratio threshold (default `0.85`).
	/// NOTE(review): direction of the ratio and of the comparison is not visible here.
	pub min_compression_ratio: f32,
}

impl Default for CompressConfig {
	/// Returns the stock configuration: 4 samples of size 1024, depth 3, ratio 0.85.
	fn default() -> Self {
		let sample_size = 1024;
		let sample_count = 4;
		let max_depth = 3;
		let min_compression_ratio = 0.85;

		CompressConfig {
			sample_size,
			sample_count,
			max_depth,
			min_compression_ratio,
		}
	}
}
32
33pub struct Compressor {
34	candidates: Vec<Arc<dyn Encoding>>,
35	cfg: CompressConfig,
36}
37
38impl Compressor {
39	pub fn new(cfg: CompressConfig) -> Self {
40		let registry = encoding::global();
41		let order = [
42			EncodingId::CANONICAL_BOOL,
43			EncodingId::CONSTANT,
44			EncodingId::ALL_NONE,
45			EncodingId::DICT,
46			EncodingId::RLE,
47			EncodingId::DELTA,
48			EncodingId::DELTA_RLE,
49			EncodingId::FOR,
50			EncodingId::BITPACK,
51			EncodingId::SPARSE,
52		];
53
54		let candidates = order
55			.into_iter()
56			.filter(|id| {
57				!matches!(
58					*id,
59					EncodingId::CANONICAL_BOOL
60						| EncodingId::CANONICAL_FIXED | EncodingId::CANONICAL_VARLEN
61						| EncodingId::CANONICAL_BIGNUM
62				)
63			})
64			.filter_map(|id| registry.get(id).cloned())
65			.collect();
66		Self {
67			candidates,
68			cfg,
69		}
70	}
71
72	pub fn compress(&self, input: &Canonical) -> Result<Column> {
73		for candidate in &self.candidates {
74			if let Some(compressed) = candidate.try_compress(input, &self.cfg)? {
75				return Ok(compressed);
76			}
77		}
78		Ok(Column::from_canonical(input.clone()))
79	}
80}
81
82pub fn compress(input: &Canonical) -> Result<Column> {
83	Compressor::new(CompressConfig::default()).compress(input)
84}
85
#[cfg(test)]
mod tests {
	use reifydb_core::value::column::buffer::ColumnBuffer;

	use super::*;

	/// Canonicalizes `buffer` and runs it through the default `compress`
	/// entry point, panicking if either step fails.
	fn compress_buffer(buffer: &ColumnBuffer) -> Column {
		let canonical = Canonical::from_column_buffer(buffer).unwrap();
		compress(&canonical).unwrap()
	}

	/// A small int4 column is expected to come back as canonical fixed-width data.
	#[test]
	fn compress_falls_back_to_canonical_when_no_stub_applies() {
		let buffer = ColumnBuffer::int4([1i32, 2, 3, 4]);
		let column = compress_buffer(&buffer);
		assert_eq!(column.encoding(), EncodingId::CANONICAL_FIXED);
		assert_eq!(column.len(), 4);
	}

	/// A small utf8 column is expected to come back as canonical variable-length data.
	#[test]
	fn compress_utf8_falls_back_to_canonical_varlen() {
		let buffer = ColumnBuffer::utf8(["alpha", "bravo"]);
		let column = compress_buffer(&buffer);
		assert_eq!(column.encoding(), EncodingId::CANONICAL_VARLEN);
		assert_eq!(column.len(), 2);
	}
}