Skip to main content

nodedb_codec/
detect.rs

//! Codec auto-detection from column type and data distribution.
//!
//! Analyzes up to the first 1024 values of a column to select the optimal
//! codec chain. Called at flush time when `ColumnCodec::Auto` is configured.
//!
//! Selection strategy:
//! - Partitions ≥ 128 values (`CASCADE_THRESHOLD`) → cascading codecs (ALP, FastLanes, etc.)
//! - Partitions < 128 values → single-step codecs (Gorilla, Delta, etc.)
//! - f64 with >95% ALP encodability (large partitions only) → `AlpFastLanesLz4`
//! - f64 with ≤95% ALP encodability → `Gorilla` (fallback)
//! - i64 timestamps/counters → `DeltaFastLanesLz4`
//! - Symbol columns → `FastLanesLz4` (small integer IDs)
13
14use crate::{ColumnCodec, ColumnTypeHint};
15
16use crate::CODEC_SAMPLE_SIZE;
17
/// Minimum partition size (number of values) to use cascading codecs.
///
/// `detect_i64_codec` and `detect_f64_codec` only consider a cascading codec
/// when the partition holds at least this many values (`>=` comparison).
/// Below this, FastLanes block overhead dominates — use legacy codecs.
const CASCADE_THRESHOLD: usize = 128;
21
22/// Detect the optimal codec for a column based on its type and data.
23///
24/// When `codec` is not `Auto`, returns it unchanged. When `Auto`,
25/// analyzes the column type hint to select the best codec. For
26/// data-aware selection (ALP encodability, monotonicity detection),
27/// use `detect_f64_codec()` or `detect_i64_codec()` with actual values.
28pub fn detect_codec(codec: ColumnCodec, type_hint: ColumnTypeHint) -> ColumnCodec {
29    if codec != ColumnCodec::Auto {
30        return codec;
31    }
32
33    // Default selections (data-unaware). The segment writer calls
34    // data-aware variants (detect_f64_codec, detect_i64_codec) when
35    // it has actual values.
36    match type_hint {
37        ColumnTypeHint::Timestamp => ColumnCodec::DeltaFastLanesLz4,
38        ColumnTypeHint::Float64 => ColumnCodec::AlpFastLanesLz4,
39        ColumnTypeHint::Int64 => ColumnCodec::DeltaFastLanesLz4,
40        ColumnTypeHint::Symbol => ColumnCodec::FastLanesLz4,
41        ColumnTypeHint::String => ColumnCodec::FsstLz4,
42    }
43}
44
45/// Detect the optimal codec for an i64 column by analyzing the data.
46///
47/// For partitions ≥ CASCADE_THRESHOLD values, selects cascading codecs.
48/// For smaller partitions, falls back to legacy single-step codecs.
49pub fn detect_i64_codec(values: &[i64]) -> ColumnCodec {
50    if values.len() < 2 {
51        return ColumnCodec::Delta;
52    }
53
54    // Large partitions → cascading codec (FastLanes handles all patterns).
55    if values.len() >= CASCADE_THRESHOLD {
56        return ColumnCodec::DeltaFastLanesLz4;
57    }
58
59    // Small partitions → legacy codecs. Analyze data to pick best one.
60    let sample_end = values.len().min(CODEC_SAMPLE_SIZE);
61    let sample = &values[..sample_end];
62
63    let mut zero_dod_count = 0usize;
64    let mut prev_delta: Option<i64> = None;
65
66    for i in 1..sample.len() {
67        let delta = sample[i] - sample[i - 1];
68        if let Some(pd) = prev_delta
69            && delta == pd
70        {
71            zero_dod_count += 1;
72        }
73        prev_delta = Some(delta);
74    }
75
76    let total_deltas = sample.len() - 1;
77    let constant_rate_ratio = zero_dod_count as f64 / total_deltas.max(1) as f64;
78
79    if constant_rate_ratio > 0.8 {
80        ColumnCodec::DoubleDelta
81    } else {
82        ColumnCodec::Delta
83    }
84}
85
86/// Detect the optimal codec for an f64 column by analyzing the data.
87///
88/// For partitions ≥ CASCADE_THRESHOLD values with >95% ALP encodability,
89/// selects `AlpFastLanesLz4`. Otherwise falls back to `Gorilla`.
90pub fn detect_f64_codec(values: &[f64]) -> ColumnCodec {
91    if values.len() < 2 {
92        return ColumnCodec::Gorilla;
93    }
94
95    let use_cascade = values.len() >= CASCADE_THRESHOLD;
96
97    if use_cascade {
98        // Check ALP encodability on a sample.
99        let encodability = crate::alp::alp_encodability(values);
100        if encodability > 0.95 {
101            return ColumnCodec::AlpFastLanesLz4;
102        }
103    }
104
105    // Fallback: Gorilla is the best general-purpose f64 codec.
106    ColumnCodec::Gorilla
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolve `ColumnCodec::Auto` for the given type hint.
    fn auto(hint: ColumnTypeHint) -> ColumnCodec {
        detect_codec(ColumnCodec::Auto, hint)
    }

    #[test]
    fn explicit_codec_passthrough() {
        // An explicitly configured codec wins over whatever the hint suggests.
        let lz4 = detect_codec(ColumnCodec::Lz4, ColumnTypeHint::Timestamp);
        assert_eq!(lz4, ColumnCodec::Lz4);

        let zstd = detect_codec(ColumnCodec::Zstd, ColumnTypeHint::Float64);
        assert_eq!(zstd, ColumnCodec::Zstd);
    }

    #[test]
    fn auto_timestamp() {
        assert_eq!(
            auto(ColumnTypeHint::Timestamp),
            ColumnCodec::DeltaFastLanesLz4
        );
    }

    #[test]
    fn auto_float64() {
        assert_eq!(auto(ColumnTypeHint::Float64), ColumnCodec::AlpFastLanesLz4);
    }

    #[test]
    fn auto_int64() {
        assert_eq!(auto(ColumnTypeHint::Int64), ColumnCodec::DeltaFastLanesLz4);
    }

    #[test]
    fn auto_symbol() {
        assert_eq!(auto(ColumnTypeHint::Symbol), ColumnCodec::FastLanesLz4);
    }

    #[test]
    fn auto_string() {
        assert_eq!(auto(ColumnTypeHint::String), ColumnCodec::FsstLz4);
    }

    #[test]
    fn detect_large_i64_uses_cascade() {
        // At or above the cascade threshold the data pattern is irrelevant.
        let multiples: Vec<i64> = (0..1000_i64).map(|n| n * 100).collect();
        let epoch_millis: Vec<i64> = (0..1000_i64)
            .map(|n| 1_700_000_000_000 + n * 10_000)
            .collect();

        for series in [&multiples, &epoch_millis] {
            assert_eq!(detect_i64_codec(series), ColumnCodec::DeltaFastLanesLz4);
        }
    }

    #[test]
    fn detect_small_i64_uses_legacy() {
        // Below the cascade threshold the legacy codecs are chosen from the data.
        let steady: Vec<i64> = (0..50_i64).map(|n| n * 100).collect();
        assert_eq!(detect_i64_codec(&steady), ColumnCodec::DoubleDelta);

        let irregular: [i64; 6] = [1, 3, 7, 15, 22, 30];
        assert_eq!(detect_i64_codec(&irregular), ColumnCodec::Delta);
    }

    #[test]
    fn detect_large_f64_decimal_uses_alp() {
        // Decimal-origin f64 in a cascade-sized partition → AlpFastLanesLz4.
        let decimals: Vec<f64> = (0..1000).map(|n| n as f64 * 0.1).collect();
        assert_eq!(detect_f64_codec(&decimals), ColumnCodec::AlpFastLanesLz4);
    }

    #[test]
    fn detect_large_f64_irrational_uses_gorilla() {
        // Multiples of π are not ALP-encodable → Gorilla fallback.
        let multiples_of_pi: Vec<f64> = (1..1000)
            .map(|n| std::f64::consts::PI * n as f64)
            .collect();
        assert_eq!(detect_f64_codec(&multiples_of_pi), ColumnCodec::Gorilla);
    }

    #[test]
    fn detect_small_f64_uses_gorilla() {
        // Too few values for a cascade, even though the data is decimal.
        let decimals: Vec<f64> = (0..50).map(|n| n as f64 * 0.1).collect();
        assert_eq!(detect_f64_codec(&decimals), ColumnCodec::Gorilla);
    }

    #[test]
    fn small_sample() {
        // Degenerate inputs (empty / single value) take the early-return path.
        let no_ints: [i64; 0] = [];
        let no_floats: [f64; 0] = [];

        assert_eq!(detect_i64_codec(&no_ints), ColumnCodec::Delta);
        assert_eq!(detect_i64_codec(&[42]), ColumnCodec::Delta);
        assert_eq!(detect_f64_codec(&no_floats), ColumnCodec::Gorilla);
    }
}