tantivy_columnar/column_values/u64_based/
mod.rs1mod bitpacked;
2mod blockwise_linear;
3mod line;
4mod linear;
5mod stats_collector;
6
7use std::io;
8use std::io::Write;
9use std::sync::Arc;
10
11use common::{BinarySerializable, OwnedBytes};
12
13use crate::column_values::monotonic_mapping::{
14 StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
15};
16pub use crate::column_values::u64_based::bitpacked::BitpackedCodec;
17pub use crate::column_values::u64_based::blockwise_linear::BlockwiseLinearCodec;
18pub use crate::column_values::u64_based::linear::LinearCodec;
19pub use crate::column_values::u64_based::stats_collector::StatsCollector;
20use crate::column_values::{ColumnStats, monotonic_map_column};
21use crate::iterable::Iterable;
22use crate::{ColumnValues, MonotonicallyMappableToU64};
23
24pub trait ColumnCodecEstimator<T = u64>: 'static {
36 fn collect(&mut self, value: u64);
40 fn finalize(&mut self) {}
42 fn estimate(&self, stats: &ColumnStats) -> Option<u64>;
45 fn serialize(
48 &self,
49 stats: &ColumnStats,
50 vals: &mut dyn Iterator<Item = T>,
51 wrt: &mut dyn io::Write,
52 ) -> io::Result<()>;
53}
54
55pub trait ColumnCodec<T: PartialOrd = u64> {
57 type ColumnValues: ColumnValues<T> + 'static;
59 type Estimator: ColumnCodecEstimator + Default;
61
62 fn load(bytes: OwnedBytes) -> io::Result<Self::ColumnValues>;
64
65 fn estimator() -> Self::Estimator {
67 Self::Estimator::default()
68 }
69
70 fn boxed_estimator() -> Box<dyn ColumnCodecEstimator> {
72 Box::new(Self::estimator())
73 }
74}
75
/// Identifies one of the available u64-based codecs.
///
/// Every variant has an explicit, fixed `u8` discriminant. That discriminant is the
/// code stored ahead of the column payload, so existing values must never change.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
#[repr(u8)]
pub enum CodecType {
    /// Codec code `0`.
    Bitpacked = 0,
    /// Codec code `1`.
    Linear = 1,
    /// Codec code `2`.
    BlockwiseLinear = 2,
}
90
91pub const ALL_U64_CODEC_TYPES: [CodecType; 3] = [
93 CodecType::Bitpacked,
94 CodecType::Linear,
95 CodecType::BlockwiseLinear,
96];
97
98impl CodecType {
99 fn to_code(self) -> u8 {
100 self as u8
101 }
102
103 fn try_from_code(code: u8) -> Option<CodecType> {
104 match code {
105 0u8 => Some(CodecType::Bitpacked),
106 1u8 => Some(CodecType::Linear),
107 2u8 => Some(CodecType::BlockwiseLinear),
108 _ => None,
109 }
110 }
111
112 fn load<T: MonotonicallyMappableToU64>(
113 &self,
114 bytes: OwnedBytes,
115 ) -> io::Result<Arc<dyn ColumnValues<T>>> {
116 match self {
117 CodecType::Bitpacked => load_specific_codec::<BitpackedCodec, T>(bytes),
118 CodecType::Linear => load_specific_codec::<LinearCodec, T>(bytes),
119 CodecType::BlockwiseLinear => load_specific_codec::<BlockwiseLinearCodec, T>(bytes),
120 }
121 }
122}
123
124fn load_specific_codec<C: ColumnCodec, T: MonotonicallyMappableToU64>(
125 bytes: OwnedBytes,
126) -> io::Result<Arc<dyn ColumnValues<T>>> {
127 let reader = C::load(bytes)?;
128 let reader_typed = monotonic_map_column(
129 reader,
130 StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<T>::new()),
131 );
132 Ok(Arc::new(reader_typed))
133}
134
135impl CodecType {
136 pub fn estimator(&self) -> Box<dyn ColumnCodecEstimator> {
138 match self {
139 CodecType::Bitpacked => BitpackedCodec::boxed_estimator(),
140 CodecType::Linear => LinearCodec::boxed_estimator(),
141 CodecType::BlockwiseLinear => BlockwiseLinearCodec::boxed_estimator(),
142 }
143 }
144}
145
146pub fn serialize_u64_based_column_values<T: MonotonicallyMappableToU64>(
148 vals: &dyn Iterable<T>,
149 codec_types: &[CodecType],
150 wrt: &mut dyn Write,
151) -> io::Result<()> {
152 let mut stats_collector = StatsCollector::default();
153 let mut estimators: Vec<(CodecType, Box<dyn ColumnCodecEstimator>)> =
154 Vec::with_capacity(codec_types.len());
155 for &codec_type in codec_types {
156 estimators.push((codec_type, codec_type.estimator()));
157 }
158 for val in vals.boxed_iter() {
159 let val_u64 = val.to_u64();
160 stats_collector.collect(val_u64);
161 for (_, estimator) in &mut estimators {
162 estimator.collect(val_u64);
163 }
164 }
165 for (_, estimator) in &mut estimators {
166 estimator.finalize();
167 }
168 let stats = stats_collector.stats();
169 let (_, best_codec, best_codec_estimator) = estimators
170 .into_iter()
171 .flat_map(|(codec_type, estimator)| {
172 let num_bytes = estimator.estimate(&stats)?;
173 Some((num_bytes, codec_type, estimator))
174 })
175 .min_by_key(|(num_bytes, _, _)| *num_bytes)
176 .ok_or_else(|| {
177 io::Error::new(io::ErrorKind::InvalidData, "No available applicable codec.")
178 })?;
179 best_codec.to_code().serialize(wrt)?;
180 best_codec_estimator.serialize(
181 &stats,
182 &mut vals.boxed_iter().map(MonotonicallyMappableToU64::to_u64),
183 wrt,
184 )?;
185 Ok(())
186}
187
188pub fn load_u64_based_column_values<T: MonotonicallyMappableToU64>(
192 mut bytes: OwnedBytes,
193) -> io::Result<Arc<dyn ColumnValues<T>>> {
194 let codec_type: CodecType = bytes
195 .first()
196 .copied()
197 .and_then(CodecType::try_from_code)
198 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Failed to read codec type"))?;
199 bytes.advance(1);
200 codec_type.load(bytes)
201}
202
203pub fn serialize_and_load_u64_based_column_values<T: MonotonicallyMappableToU64>(
205 vals: &dyn Iterable,
206 codec_types: &[CodecType],
207) -> Arc<dyn ColumnValues<T>> {
208 let mut buffer = Vec::new();
209 serialize_u64_based_column_values(vals, codec_types, &mut buffer).unwrap();
210 load_u64_based_column_values::<T>(OwnedBytes::new(buffer)).unwrap()
211}
212
213#[cfg(test)]
214mod tests;