csf/fp/
level_size_chooser.rs

1use bitm::ceiling_div;
2use std::mem::MaybeUninit;
3use fsum::FSum;
4use std::fmt;
5use std::fmt::Formatter;
6use crate::coding::Coding;
7
8/// Chooses the size of level for the given level input.
9pub trait LevelSizeChooser {
10
11    /// Returns number of 64-bit segments to use for given level input.
12    fn size_segments<C: Coding>(&self, _coding: &C, values: &[C::Codeword], _value_rev_indices: &[u8]) -> usize {
13        self.max_size_segments(values.len())
14    }
15
16    /// Returns maximal number of segment that can be returned by `size_segments` for level of size `max_level_size` or less.
17    fn max_size_segments(&self, max_level_size: usize) -> usize;
18}
19
20pub trait SimpleLevelSizeChooser {
21
22    /// Returns number of 64-bit segments to use for given level input.
23    fn size_segments(&self, values: &[u8], _bits_per_value: u8) -> usize {
24        self.max_size_segments(values.len())
25    }
26
27    /// Returns maximal number of segment that can be returned by `size_segments` for level of size `max_level_size` or less.
28    fn max_size_segments(&self, max_level_size: usize) -> usize;
29}
30
31/// Choose level size as a percent of the input size.
32#[derive(Copy, Clone)]
33pub struct ProportionalLevelSize {
34    pub percent: u16
35}
36
37impl ProportionalLevelSize {
38    pub fn with_percent(percent: u16) -> Self { Self{percent} }
39}
40
41impl Default for ProportionalLevelSize {
42    fn default() -> Self { Self::with_percent(90) } // 80 is a bit better than 90 but slower
43}
44
45impl LevelSizeChooser for ProportionalLevelSize {
46    fn max_size_segments(&self, max_level_size: usize) -> usize {
47        ceiling_div(max_level_size*self.percent as usize, 64*100)
48    }
49}
50
51impl SimpleLevelSizeChooser for ProportionalLevelSize {
52    fn max_size_segments(&self, max_level_size: usize) -> usize {
53        ceiling_div(max_level_size*self.percent as usize, 64*100)
54    }
55}
56
57impl fmt::Display for ProportionalLevelSize {
58    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
59        write!(f, "{}percent", self.percent)
60    }
61}
62
63/// Chooses optimal level size considering distribution of incidence of values.
64#[derive(Default, Copy, Clone)]
65pub struct OptimalLevelSize;
66
67/// Removes zeros from `count` and returns its prefix without zeros.
68fn remove_zeros(counts: &mut [u32]) -> &[u32] {
69    let mut counts_len = 0usize;
70    for i in 0usize..counts.len() {
71        if counts[i] != 0 {
72            counts[counts_len] = counts[i];
73            counts_len += 1;
74        }
75    }
76    &counts[0..counts_len]
77}
78
79/// For given distribution of incidence of values `counts` and `input_size` (sum of counts),
80/// returns probabilities of k positive collisions, for all k = 0, 1, ..., 15.
81pub(crate) fn positive_collisions_prob(counts: &mut [u32], input_size: usize) -> [f64; 16] {
82    let counts = remove_zeros(counts);
83    let mut array: [MaybeUninit<f64>; 16] = unsafe { MaybeUninit::uninit().assume_init() };
84    for i in 0usize..array.len() {   // k = i + 1 = 1, 2, ...
85        let mut r = FSum::new();
86        for c in counts {
87            //if *c > i as u32 {
88            //    r += (*c as f64 / input_size as f64).powi(i as i32+1);
89            //}
90            if *c > i as u32 {
91                r += (0..=i)
92                    .map(|v| (*c - v as u32) as f64 / (input_size - v) as f64)
93                    .fold(1.0, |a, b| a * b);
94            }
95        }
96        array[i] = MaybeUninit::new(r.value());
97    }
98    unsafe { std::mem::transmute::<_, [f64; 16]>(array) }
99}
100
101impl OptimalLevelSize {
102    fn size_segments_for_dist(counts: &mut [u32], input_size: usize, bits_per_fragment: u8) -> usize {
103        let mut result = ceiling_div(input_size, 64);
104        if result == 1 { return 1; }
105        let positive_collisions_p = positive_collisions_prob(counts, input_size);
106        let mut result_eval = f64::MAX;
107        while result >= 1 {
108            let mut numerator = FSum::with_value(1.0625);
109            /*#[cfg(feature = "simple_rank")] let mut numerator = FSum::with_value(1.0625);
110            #[cfg(not(feature = ""))] let mut numerator = FSum::with_value(1.03125);*/
111            numerator += 1.0 / result as f64;   // = 64bit / (level size * 64bit)
112            let mut denominator = FSum::new();
113
114            let lambda = input_size as f64 / (result * 64) as f64;
115            let mut lambda_to_power_k = lambda;
116            let mut k_factorial = 1u64;
117            for i in 0usize..16 {
118                let k = i as u32 + 1;
119                k_factorial *= k as u64;
120                let pk = positive_collisions_p[i] * lambda_to_power_k * (-lambda).exp() / k_factorial as f64;
121                lambda_to_power_k *= lambda;
122
123                numerator += pk * bits_per_fragment as f64;
124                denominator += pk * k as f64;
125            }
126            let new_result_eval = numerator.value() / denominator.value();
127            if new_result_eval >= result_eval {  // impossible in the first iteration
128                return result + 1;
129            }
130            result_eval = new_result_eval;
131            result -= 1;
132        }
133        1
134    }
135    //Licznik = (suma po k = 1... z pk(k)) * bits_per_fragment + 1
136    //Mianownik = suma po k = 1 ... z k*pk(k)
137    //pk(k) = positive_collisions_p(k) * d.pmf(k)
138    //positive_collisions_p(k) = suma po v = [prawdopdobieństwa (udziały) wystąpień różnych wartości] z v**k
139    //    (positive_collisions_p to prawdopobieństwo pozytywnej kolizji dla k elementów trafiających w ten sam indeks)
140    //d.pmf(k) = prawdopobieństwo k sukcesów według rozkładu
141    //  poisson(licza fragmentów do zapisania, wielkość wejścia / wielkość tablicy, liczba wpisów)
142}
143
144impl LevelSizeChooser for OptimalLevelSize {
145    fn size_segments<C: Coding>(&self, coding: &C, values: &[C::Codeword], value_rev_indices: &[u8]) -> usize {
146        let mut counts = [0u32; 256];
147        for (c, ri) in values.iter().zip(value_rev_indices.iter()) {
148            counts[coding.rev_fragment_of(*c, *ri) as usize] += 1;
149        }
150        Self::size_segments_for_dist(
151            &mut counts[0..=coding.max_fragment_value() as usize],
152            values.len(),
153            coding.bits_per_fragment()
154        )
155    }
156
157    fn max_size_segments(&self, max_level_size: usize) -> usize {
158        ceiling_div(max_level_size, 64)
159    }
160}
161
162impl SimpleLevelSizeChooser for OptimalLevelSize {
163    fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize {
164        let mut counts = [0u32; 256];
165        for v in values { counts[*v as usize] += 1; }
166        Self::size_segments_for_dist(
167            &mut counts[0..(1usize<<bits_per_value)],
168            values.len(),
169            bits_per_value
170        )
171    }
172
173    fn max_size_segments(&self, max_level_size: usize) -> usize {
174        ceiling_div(max_level_size, 64)
175    }
176}
177
178impl fmt::Display for OptimalLevelSize {
179    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
180        write!(f, "optimal")
181    }
182}
183
184
185#[derive(Copy, Clone)]
186pub struct OptimalGroupedLevelSize {
187    pub divider: u8
188}
189
190impl Default for OptimalGroupedLevelSize {
191    fn default() -> Self {
192        Self { divider: 1 }
193    }
194}
195
196impl OptimalGroupedLevelSize {
197    pub fn with_divider(divider: u8) -> Self {
198        Self { divider: divider.max(1) }
199    }
200}
201
202impl SimpleLevelSizeChooser for OptimalGroupedLevelSize {
203    fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize {
204        let divider = self.divider as usize;
205        let max_value = (1usize<<bits_per_value) - 1;
206        (0..divider).map(|delta| {
207            let mut counts = [0u32; 256];
208            for v in values { counts[(*v as usize + delta) / divider] += 1; }
209            OptimalLevelSize::size_segments_for_dist(
210                &mut counts[0 ..= (max_value + delta) / divider],
211                values.len(),
212                bits_per_value  // this must be unchanged as it is used to calculate memory used by a value
213            )
214        }).min().unwrap()
215    }
216
217    fn max_size_segments(&self, max_level_size: usize) -> usize {
218        ceiling_div(max_level_size, 64)
219    }
220}
221
222impl fmt::Display for OptimalGroupedLevelSize {
223    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
224        write!(f, "optimal_grouped{}", self.divider)
225    }
226}
227
228/// Resize level obtained from another chooser.
229#[derive(Copy, Clone)]
230pub struct ResizedLevel<LSC> {
231    pub level_size_chooser: LSC,
232    pub percent: u16
233}
234
235impl<LSC> ResizedLevel<LSC> {
236    #[inline(always)] pub fn new(percent: u16, level_size_chooser: LSC) -> Self {
237        Self { level_size_chooser, percent }
238    }
239
240    #[inline(always)] fn resized(&self, size: usize) -> usize {
241        ceiling_div(size * self.percent as usize, 100)
242    }
243}
244
245impl<LSC: LevelSizeChooser> LevelSizeChooser for ResizedLevel<LSC> {
246    fn size_segments<C: Coding>(&self, coding: &C, values: &[C::Codeword], value_rev_indices: &[u8]) -> usize {
247        self.resized(self.level_size_chooser.size_segments(coding, values, value_rev_indices))
248    }
249
250    fn max_size_segments(&self, max_level_size: usize) -> usize {
251        self.resized(self.level_size_chooser.max_size_segments(max_level_size))
252    }
253}
254
255impl<LSC: SimpleLevelSizeChooser> SimpleLevelSizeChooser for ResizedLevel<LSC> {
256    #[inline(always)] fn size_segments(&self, values: &[u8], bits_per_value: u8) -> usize {
257        self.resized(self.level_size_chooser.size_segments(values, bits_per_value))
258    }
259
260    #[inline(always)] fn max_size_segments(&self, max_level_size: usize) -> usize {
261        self.resized(max_level_size)
262    }
263}