//! Compressor configuration: [`ChunkConfig`] and its component specs.
use crate::constants::DEFAULT_MAX_PAGE_N;
use crate::errors::{PcoError, PcoResult};
use crate::{bits, DEFAULT_COMPRESSION_LEVEL};
/// Configures whether integer multiplier detection is enabled.
///
/// Examples where this helps:
/// * nanosecond-precision timestamps that are mostly whole numbers of
/// microseconds, with a few exceptions
/// * integers `[7, 107, 207, 307, ... 100007]` shuffled
///
/// When this is helpful, compression and decompression speeds can be
/// substantially reduced. This configuration may hurt
/// compression speed slightly even when it isn't helpful.
/// However, the compression ratio improvements tend to be large.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum IntMultSpec {
  /// Never try integer multiplier detection on a chunk.
  Disabled,
  /// Try integer multiplier detection on each chunk (the default).
  #[default]
  Enabled,
}
/// Configures whether float multiplier detection is enabled.
///
/// Examples where this helps:
/// * approximate multiples of 0.01
/// * approximate multiples of pi
///
/// Float mults can work even when there are NaNs and infinities.
/// When this is helpful, compression and decompression speeds can be
/// substantially reduced. In rare cases, this configuration
/// may reduce compression speed somewhat even when it isn't helpful.
/// However, the compression ratio improvements tend to be large.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum FloatMultSpec {
  /// Never try float multiplier detection on a chunk.
  Disabled,
  /// Try float multiplier detection on each chunk (the default).
  #[default]
  Enabled,
  // TODO support a LossyEnabled mode that always drops the ULPs latent var
}
/// All configurations available for a compressor.
///
/// Some, like `delta_encoding_order`, are explicitly stored in the
/// compressed bytes.
/// Others, like `compression_level`, affect compression but are not explicitly
/// stored in the output.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct ChunkConfig {
  /// `compression_level` ranges from 0 to 12 inclusive (default: 8).
  ///
  /// At present,
  /// * Level 0 achieves only a small amount of compression.
  /// * Level 8 achieves very good compression and runs
  /// only slightly slower.
  /// * Level 12 achieves marginally better compression than 8
  /// and may run several times slower.
  ///
  /// At present, the compression levels cover a relatively small range of the
  /// compression time vs. ratio tradeoff.
  /// However, the meaning of the compression levels is subject to change with
  /// new releases.
  pub compression_level: usize,
  /// `delta_encoding_order` ranges from 0 to 7 inclusive (default:
  /// `None`, automatically detecting on each chunk).
  ///
  /// It is the number of times to apply delta encoding
  /// before compressing. For instance, say we have the numbers
  /// `[0, 2, 2, 4, 4, 6, 6]` and consider different delta encoding orders.
  /// * 0th order takes numbers as-is.
  /// This is perfect for columnar data where the order is essentially random.
  /// * 1st order takes consecutive differences, leaving
  /// `[0, 2, 0, 2, 0, 2, 0]`. This is best for continuous but noisy time
  /// series data, like stock prices or most time series data.
  /// * 2nd order takes consecutive differences again,
  /// leaving `[2, -2, 2, -2, 2, -2]`. This is best for piecewise-linear or
  /// somewhat quadratic data.
  /// * Even higher-order is best for time series that are very
  /// smooth, like temperature or light sensor readings.
  ///
  /// If you would like to automatically choose this once and reuse it for all
  /// chunks,
  /// [`auto_delta_encoding_order()`][crate::auto_delta_encoding_order] can help.
  pub delta_encoding_order: Option<usize>,
  /// Integer multiplier mode improves compression ratio in cases where many
  /// numbers are congruent modulo an integer `base`
  /// (default: `Enabled`).
  ///
  /// See [`IntMultSpec`][crate::IntMultSpec] for more detail.
  pub int_mult_spec: IntMultSpec,
  /// Float multiplier mode improves compression ratio in cases where the data
  /// type is a float and all numbers are close to a multiple of a float
  /// `base`
  /// (default: `Enabled`).
  ///
  /// See [`FloatMultSpec`][crate::FloatMultSpec] for more detail.
  pub float_mult_spec: FloatMultSpec,
  /// `paging_spec` specifies how the chunk should be split into pages
  /// (default: equal pages up to 2^18 numbers each).
  ///
  /// See [`PagingSpec`][crate::PagingSpec] for more information.
  pub paging_spec: PagingSpec,
}
impl Default for ChunkConfig {
fn default() -> Self {
Self {
compression_level: DEFAULT_COMPRESSION_LEVEL,
delta_encoding_order: None,
int_mult_spec: IntMultSpec::Enabled,
float_mult_spec: FloatMultSpec::Enabled,
paging_spec: PagingSpec::EqualPagesUpTo(DEFAULT_MAX_PAGE_N),
}
}
}
impl ChunkConfig {
  /// Sets [`compression_level`][ChunkConfig::compression_level].
  pub fn with_compression_level(self, level: usize) -> Self {
    Self {
      compression_level: level,
      ..self
    }
  }

  /// Sets [`delta_encoding_order`][ChunkConfig::delta_encoding_order].
  pub fn with_delta_encoding_order(self, order: Option<usize>) -> Self {
    Self {
      delta_encoding_order: order,
      ..self
    }
  }

  /// Sets [`int_mult_spec`][ChunkConfig::int_mult_spec].
  pub fn with_int_mult_spec(self, int_mult_spec: IntMultSpec) -> Self {
    Self {
      int_mult_spec,
      ..self
    }
  }

  /// Sets [`float_mult_spec`][ChunkConfig::float_mult_spec].
  pub fn with_float_mult_spec(self, float_mult_spec: FloatMultSpec) -> Self {
    Self {
      float_mult_spec,
      ..self
    }
  }

  /// Sets [`paging_spec`][ChunkConfig::paging_spec].
  pub fn with_paging_spec(self, paging_spec: PagingSpec) -> Self {
    Self {
      paging_spec,
      ..self
    }
  }
}
/// `PagingSpec` specifies how a chunk is split into pages.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum PagingSpec {
  /// Divide the chunk into equal pages of up to this many numbers.
  ///
  /// For example, with equal pages up to 100,000, a chunk of 150,000
  /// numbers would be divided into 2 pages, each of 75,000 numbers.
  /// Page counts are made as equal as possible, differing by at most 1.
  EqualPagesUpTo(usize),
  /// Divide the chunk into the exactly provided counts.
  ///
  /// Will return an InvalidArgument error during compression if
  /// any of the counts are 0 or the sum does not equal the chunk count.
  ExactPageSizes(Vec<usize>),
}
impl Default for PagingSpec {
fn default() -> Self {
Self::EqualPagesUpTo(DEFAULT_MAX_PAGE_N)
}
}
impl PagingSpec {
  /// Returns the count of numbers to place in each page of a chunk of `n`
  /// numbers.
  ///
  /// Returns an `InvalidArgument` error when the spec cannot produce a valid
  /// paging: a zero `EqualPagesUpTo` size, exact page sizes that don't sum
  /// to `n`, or any page of 0 numbers.
  pub(crate) fn n_per_page(&self, n: usize) -> PcoResult<Vec<usize>> {
    let n_per_page: Vec<usize> = match self {
      // TODO in 0.2 make this error if max_size isn't a multiple of full batch size
      // and try to make all but one page a multiple of full batch size
      PagingSpec::EqualPagesUpTo(max_size) => {
        // A zero page size can never hold any numbers, and ceil_div with a
        // zero divisor cannot yield a meaningful page count (the usual
        // formulation divides by zero), so reject it explicitly.
        if *max_size == 0 {
          return Err(PcoError::invalid_argument(
            "equal pages max size must be positive",
          ));
        }
        let n_pages = bits::ceil_div(n, *max_size);
        // Distribute n as evenly as possible: page i covers the half-open
        // index range [i * n / n_pages, (i + 1) * n / n_pages).
        (0..n_pages)
          .map(|i| (i + 1) * n / n_pages - i * n / n_pages)
          .collect()
      }
      PagingSpec::ExactPageSizes(n_per_page) => n_per_page.to_vec(),
    };

    // Validate the computed or user-provided counts against the chunk count.
    let summed_n: usize = n_per_page.iter().sum();
    if summed_n != n {
      return Err(PcoError::invalid_argument(format!(
        "paging spec suggests {} numbers but {} were given",
        summed_n, n,
      )));
    }

    // Empty pages are unwritable; only ExactPageSizes can produce them,
    // since the EqualPagesUpTo arithmetic above never emits a 0 count.
    if n_per_page.iter().any(|&page_n| page_n == 0) {
      return Err(PcoError::invalid_argument(
        "cannot write data page of 0 numbers",
      ));
    }

    Ok(n_per_page)
  }
}