tantivy_columnar/column_values/u64_based/
mod.rs

1mod bitpacked;
2mod blockwise_linear;
3mod line;
4mod linear;
5mod stats_collector;
6
7use std::io;
8use std::io::Write;
9use std::sync::Arc;
10
11use common::{BinarySerializable, OwnedBytes};
12
13use crate::column_values::monotonic_mapping::{
14    StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
15};
16pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
17pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
18pub use crate::column_values::u64_based::linear::LinearCodec;
19pub use crate::column_values::u64_based::stats_collector::StatsCollector;
20use crate::column_values::{ColumnStats, monotonic_map_column};
21use crate::iterable::Iterable;
22use crate::{ColumnValues, MonotonicallyMappableToU64};
23
24/// A `ColumnCodecEstimator` is in charge of gathering all
25/// data required to serialize a column.
26///
27/// This happens during a first pass on data of the column elements.
28/// During that pass, all column estimators receive a call to their
29/// `.collect(el)`.
30///
31/// After this first pass, finalize is called.
32/// `.estimate(..)` then should return an accurate estimation of the
33/// size of the serialized column (were we to pick this codec.).
34/// `.serialize(..)` then serializes the column using this codec.
35pub trait ColumnCodecEstimator<T = u64>: 'static {
36    /// Records a new value for estimation.
37    /// This method will be called for each element of the column during
38    /// `estimation`.
39    fn collect(&mut self, value: u64);
40    /// Finalizes the first pass phase.
41    fn finalize(&mut self) {}
42    /// Returns an accurate estimation of the number of bytes that will
43    /// be used to represent this column.
44    fn estimate(&self, stats: &ColumnStats) -> Option<u64>;
45    /// Serializes the column using the given codec.
46    /// This constitutes a second pass over the columns values.
47    fn serialize(
48        &self,
49        stats: &ColumnStats,
50        vals: &mut dyn Iterator<Item = T>,
51        wrt: &mut dyn io::Write,
52    ) -> io::Result<()>;
53}
54
55/// A column codec describes a colunm serialization format.
56pub trait ColumnCodec<T: PartialOrd = u64> {
57    /// Specialized `ColumnValues` type.
58    type ColumnValues: ColumnValues<T> + 'static;
59    /// `Estimator` for the given codec.
60    type Estimator: ColumnCodecEstimator + Default;
61
62    /// Loads a column that has been serialized using this codec.
63    fn load(bytes: OwnedBytes) -> io::Result<Self::ColumnValues>;
64
65    /// Returns an estimator.
66    fn estimator() -> Self::Estimator {
67        Self::Estimator::default()
68    }
69
70    /// Returns a boxed estimator.
71    fn boxed_estimator() -> Box<dyn ColumnCodecEstimator> {
72        Box::new(Self::estimator())
73    }
74}
75
/// Codecs available to encode data once it has been converted to u64
/// (via [`MonotonicallyMappableToU64`]).
///
/// The discriminant of each variant is the one-byte code returned by `to_code`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
#[repr(u8)]
pub enum CodecType {
    /// Bitpacks all values in the value range. The bit width is determined by the
    /// amplitude `column.max_value() - column.min_value()`.
    Bitpacked = 0,
    /// Linear interpolation: a line is drawn between the first and the last value, and
    /// each value is bitpacked as its offset from that line. The bit width is determined
    /// by the maximum deviation from the line.
    Linear = 1,
    /// Same as [`CodecType::Linear`], but encodes in blocks of 512 elements.
    BlockwiseLinear = 2,
}
90
91/// List of all available u64-base codecs.
92pub const ALL_U64_CODEC_TYPES: [CodecType; 3] = [
93    CodecType::Bitpacked,
94    CodecType::Linear,
95    CodecType::BlockwiseLinear,
96];
97
98impl CodecType {
99    fn to_code(self) -> u8 {
100        self as u8
101    }
102
103    fn try_from_code(code: u8) -> Option<CodecType> {
104        match code {
105            0u8 => Some(CodecType::Bitpacked),
106            1u8 => Some(CodecType::Linear),
107            2u8 => Some(CodecType::BlockwiseLinear),
108            _ => None,
109        }
110    }
111
112    fn load<T: MonotonicallyMappableToU64>(
113        &self,
114        bytes: OwnedBytes,
115    ) -> io::Result<Arc<dyn ColumnValues<T>>> {
116        match self {
117            CodecType::Bitpacked => load_specific_codec::<BitpackedCodec, T>(bytes),
118            CodecType::Linear => load_specific_codec::<LinearCodec, T>(bytes),
119            CodecType::BlockwiseLinear => load_specific_codec::<BlockwiseLinearCodec, T>(bytes),
120        }
121    }
122}
123
124fn load_specific_codec<C: ColumnCodec, T: MonotonicallyMappableToU64>(
125    bytes: OwnedBytes,
126) -> io::Result<Arc<dyn ColumnValues<T>>> {
127    let reader = C::load(bytes)?;
128    let reader_typed = monotonic_map_column(
129        reader,
130        StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<T>::new()),
131    );
132    Ok(Arc::new(reader_typed))
133}
134
135impl CodecType {
136    /// Returns a boxed codec estimator associated to a given `CodecType`.
137    pub fn estimator(&self) -> Box<dyn ColumnCodecEstimator> {
138        match self {
139            CodecType::Bitpacked => BitpackedCodec::boxed_estimator(),
140            CodecType::Linear => LinearCodec::boxed_estimator(),
141            CodecType::BlockwiseLinear => BlockwiseLinearCodec::boxed_estimator(),
142        }
143    }
144}
145
146/// Serializes a given column of u64-mapped values.
147pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
148    vals: &dyn Iterable<T>,
149    codec_types: &[CodecType],
150    wrt: &mut dyn Write,
151) -> io::Result<()> {
152    let mut stats_collector = StatsCollector::default();
153    let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
154        Vec::with_capacity(codec_types.len());
155    for &codec_type in codec_types {
156        estimators.push((codec_type, codec_type.estimator()));
157    }
158    for val in vals.boxed_iter() {
159        let val_u64 = val.to_u64();
160        stats_collector.collect(val_u64);
161        for (_, estimator) in &mut estimators {
162            estimator.collect(val_u64);
163        }
164    }
165    for (_, estimator) in &mut estimators {
166        estimator.finalize();
167    }
168    let stats = stats_collector.stats();
169    let (_, best_codec, best_codec_estimator) = estimators
170        .into_iter()
171        .flat_map(|(codec_type, estimator)| {
172            let num_bytes = estimator.estimate(&stats)?;
173            Some((num_bytes, codec_type, estimator))
174        })
175        .min_by_key(|(num_bytes, _, _)| *num_bytes)
176        .ok_or_else(|| {
177            io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
178        })?;
179    best_codec.to_code().serialize(wrt)?;
180    best_codec_estimator.serialize(
181        &stats,
182        &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
183        wrt,
184    )?;
185    Ok(())
186}
187
188/// Load u64-based column values.
189///
190/// This method first identifies the codec off the first byte.
191pub fn load_u64_based_column_values<T: MonotonicallyMappableToU64>(
192    mut bytes: OwnedBytes,
193) -> io::Result<Arc<dyn ColumnValues<T>>> {
194    let codec_type: CodecType = bytes
195        .first()
196        .copied()
197        .and_then(CodecType::try_from_code)
198        .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Failed to read codec type"))?;
199    bytes.advance(1);
200    codec_type.load(bytes)
201}
202
203/// Helper function to serialize a column (autodetect from all codecs) and then open it
204pub fn serialize_and_load_u64_based_column_values<T: MonotonicallyMappableToU64>(
205    vals: &dyn Iterable,
206    codec_types: &[CodecType],
207) -> Arc<dyn ColumnValues<T>> {
208    let mut buffer = Vec::new();
209    serialize_u64_based_column_values(vals, codec_types, &mut buffer).unwrap();
210    load_u64_based_column_values::<T>(OwnedBytes::new(buffer)).unwrap()
211}
212
213#[cfg(test)]
214mod tests;