Skip to main content

nodedb_codec/
detect.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Codec auto-detection from column type and data distribution.
4//!
5//! Analyzes up to the first 1024 values of a column to select the optimal
6//! codec chain. Called at flush time when `ColumnCodec::Auto` is configured.
7//!
8//! Selection strategy:
9//! - Partitions ≥ `CASCADE_THRESHOLD` (128) values → cascading codecs (ALP, FastLanes, etc.)
10//! - Partitions < `CASCADE_THRESHOLD` (128) values → single-step codecs (Gorilla, Delta, etc.)
11//! - f64 with >95% ALP encodability → `AlpFastLanesLz4`
12//! - f64 with ≤95% ALP encodability → `Gorilla` (fallback)
13//! - i64 timestamps/counters → `DeltaFastLanesLz4`
14//! - Symbol columns → `FastLanesLz4` (small integer IDs)
15
16use crate::{ColumnCodec, ColumnTypeHint};
17
18use crate::CODEC_SAMPLE_SIZE;
19
20/// Minimum number of values a partition needs before `detect_i64_codec` and `detect_f64_codec` consider cascading codecs.
21/// Below this, FastLanes block overhead dominates — the legacy single-step codecs (Delta, DoubleDelta, Gorilla) are used.
22const CASCADE_THRESHOLD: usize = 128;
23
24/// Detect the optimal codec for a column based on its type and data.
25///
26/// When `codec` is not `Auto`, returns it unchanged. When `Auto`,
27/// analyzes the column type hint to select the best codec. For
28/// data-aware selection (ALP encodability, monotonicity detection),
29/// use `detect_f64_codec()` or `detect_i64_codec()` with actual values.
30pub fn detect_codec(codec: ColumnCodec, type_hint: ColumnTypeHint) -> ColumnCodec {
31    if codec != ColumnCodec::Auto {
32        return codec;
33    }
34
35    // Default selections (data-unaware). The segment writer calls
36    // data-aware variants (detect_f64_codec, detect_i64_codec) when
37    // it has actual values.
38    match type_hint {
39        ColumnTypeHint::Timestamp => ColumnCodec::DeltaFastLanesLz4,
40        ColumnTypeHint::Float64 => ColumnCodec::AlpFastLanesLz4,
41        ColumnTypeHint::Int64 => ColumnCodec::DeltaFastLanesLz4,
42        ColumnTypeHint::Symbol => ColumnCodec::FastLanesLz4,
43        ColumnTypeHint::String => ColumnCodec::FsstLz4,
44    }
45}
46
47/// Detect the optimal codec for an i64 column by analyzing the data.
48///
49/// For partitions ≥ CASCADE_THRESHOLD values, selects cascading codecs.
50/// For smaller partitions, falls back to legacy single-step codecs.
51pub fn detect_i64_codec(values: &[i64]) -> ColumnCodec {
52    if values.len() < 2 {
53        return ColumnCodec::Delta;
54    }
55
56    // Large partitions → cascading codec (FastLanes handles all patterns).
57    if values.len() >= CASCADE_THRESHOLD {
58        return ColumnCodec::DeltaFastLanesLz4;
59    }
60
61    // Small partitions → legacy codecs. Analyze data to pick best one.
62    let sample_end = values.len().min(CODEC_SAMPLE_SIZE);
63    let sample = &values[..sample_end];
64
65    let mut zero_dod_count = 0usize;
66    let mut prev_delta: Option<i64> = None;
67
68    for i in 1..sample.len() {
69        let delta = sample[i] - sample[i - 1];
70        if let Some(pd) = prev_delta
71            && delta == pd
72        {
73            zero_dod_count += 1;
74        }
75        prev_delta = Some(delta);
76    }
77
78    let total_deltas = sample.len() - 1;
79    let constant_rate_ratio = zero_dod_count as f64 / total_deltas.max(1) as f64;
80
81    if constant_rate_ratio > 0.8 {
82        ColumnCodec::DoubleDelta
83    } else {
84        ColumnCodec::Delta
85    }
86}
87
88/// Detect the optimal codec for an f64 column by analyzing the data.
89///
90/// For partitions ≥ CASCADE_THRESHOLD values with >95% ALP encodability,
91/// selects `AlpFastLanesLz4`. Otherwise falls back to `Gorilla`.
92pub fn detect_f64_codec(values: &[f64]) -> ColumnCodec {
93    if values.len() < 2 {
94        return ColumnCodec::Gorilla;
95    }
96
97    let use_cascade = values.len() >= CASCADE_THRESHOLD;
98
99    if use_cascade {
100        // Check ALP encodability on a sample.
101        let encodability = crate::alp::alp_encodability(values);
102        if encodability > 0.95 {
103            return ColumnCodec::AlpFastLanesLz4;
104        }
105    }
106
107    // Fallback: Gorilla is the best general-purpose f64 codec.
108    ColumnCodec::Gorilla
109}
110
/// Unit tests for codec auto-detection: explicit passthrough, per-type
/// defaults, and the data-aware i64/f64 selection including the
/// `CASCADE_THRESHOLD` boundary.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn explicit_codec_passthrough() {
        assert_eq!(
            detect_codec(ColumnCodec::Lz4, ColumnTypeHint::Timestamp),
            ColumnCodec::Lz4
        );
        assert_eq!(
            detect_codec(ColumnCodec::Zstd, ColumnTypeHint::Float64),
            ColumnCodec::Zstd
        );
    }

    #[test]
    fn auto_timestamp() {
        assert_eq!(
            detect_codec(ColumnCodec::Auto, ColumnTypeHint::Timestamp),
            ColumnCodec::DeltaFastLanesLz4
        );
    }

    #[test]
    fn auto_float64() {
        assert_eq!(
            detect_codec(ColumnCodec::Auto, ColumnTypeHint::Float64),
            ColumnCodec::AlpFastLanesLz4
        );
    }

    #[test]
    fn auto_int64() {
        assert_eq!(
            detect_codec(ColumnCodec::Auto, ColumnTypeHint::Int64),
            ColumnCodec::DeltaFastLanesLz4
        );
    }

    #[test]
    fn auto_symbol() {
        assert_eq!(
            detect_codec(ColumnCodec::Auto, ColumnTypeHint::Symbol),
            ColumnCodec::FastLanesLz4
        );
    }

    #[test]
    fn auto_string() {
        assert_eq!(
            detect_codec(ColumnCodec::Auto, ColumnTypeHint::String),
            ColumnCodec::FsstLz4
        );
    }

    #[test]
    fn detect_large_i64_uses_cascade() {
        // ≥ CASCADE_THRESHOLD values → cascading DeltaFastLanesLz4 regardless of pattern.
        let values: Vec<i64> = (0..1000).map(|i| i * 100).collect();
        assert_eq!(detect_i64_codec(&values), ColumnCodec::DeltaFastLanesLz4);

        let timestamps: Vec<i64> = (0..1000).map(|i| 1_700_000_000_000 + i * 10_000).collect();
        assert_eq!(
            detect_i64_codec(&timestamps),
            ColumnCodec::DeltaFastLanesLz4
        );
    }

    #[test]
    fn detect_small_i64_uses_legacy() {
        // < CASCADE_THRESHOLD values → legacy codecs.
        let constant_rate: Vec<i64> = (0..50).map(|i| i * 100).collect();
        assert_eq!(detect_i64_codec(&constant_rate), ColumnCodec::DoubleDelta);

        let varying: Vec<i64> = vec![1, 3, 7, 15, 22, 30];
        assert_eq!(detect_i64_codec(&varying), ColumnCodec::Delta);
    }

    #[test]
    fn detect_i64_cascade_threshold_boundary() {
        // Exactly CASCADE_THRESHOLD values is already "large" (the check is `>=`).
        let at_threshold: Vec<i64> = (0..CASCADE_THRESHOLD as i64).map(|i| i * 100).collect();
        assert_eq!(
            detect_i64_codec(&at_threshold),
            ColumnCodec::DeltaFastLanesLz4
        );

        // One value fewer falls back to the data-aware legacy selection.
        let below_threshold = &at_threshold[..CASCADE_THRESHOLD - 1];
        assert_eq!(
            detect_i64_codec(below_threshold),
            ColumnCodec::DoubleDelta
        );
    }

    #[test]
    fn detect_large_f64_decimal_uses_alp() {
        // Decimal-origin f64 with ≥ CASCADE_THRESHOLD values → AlpFastLanesLz4.
        let values: Vec<f64> = (0..1000).map(|i| i as f64 * 0.1).collect();
        assert_eq!(detect_f64_codec(&values), ColumnCodec::AlpFastLanesLz4);
    }

    #[test]
    fn detect_large_f64_irrational_uses_gorilla() {
        // Non-ALP-encodable f64 → Gorilla fallback.
        let values: Vec<f64> = (1..1000).map(|i| std::f64::consts::PI * i as f64).collect();
        assert_eq!(detect_f64_codec(&values), ColumnCodec::Gorilla);
    }

    #[test]
    fn detect_small_f64_uses_gorilla() {
        let values: Vec<f64> = (0..50).map(|i| i as f64 * 0.1).collect();
        assert_eq!(detect_f64_codec(&values), ColumnCodec::Gorilla);
    }

    #[test]
    fn detect_f64_below_threshold_uses_gorilla() {
        // Even ALP-friendly decimals stay on Gorilla one value short of the threshold.
        let values: Vec<f64> = (0..CASCADE_THRESHOLD - 1).map(|i| i as f64 * 0.1).collect();
        assert_eq!(detect_f64_codec(&values), ColumnCodec::Gorilla);
    }

    #[test]
    fn small_sample() {
        assert_eq!(detect_i64_codec(&[]), ColumnCodec::Delta);
        assert_eq!(detect_i64_codec(&[42]), ColumnCodec::Delta);
        // Two values give a single delta with nothing to compare it against.
        assert_eq!(detect_i64_codec(&[1, 2]), ColumnCodec::Delta);
        assert_eq!(detect_f64_codec(&[]), ColumnCodec::Gorilla);
        assert_eq!(detect_f64_codec(&[1.5]), ColumnCodec::Gorilla);
    }
}