Skip to main content

ph_temp/phast/
compressed_array.rs

1use std::{io, isize, marker::PhantomData};
2
3use binout::{AsIs, Serializer, VByte};
4use bitm::{bits_to_store, ceiling_div, get_bits57, init_bits57, n_lowest_bits, BitAccess, BitVec};
5use dyn_size_of::GetSize;
6#[cfg(feature = "sux")] use sux::traits::IndexedSeq;
7
8/// Compressed array of usize integers that can be used by `PHast`.
9pub trait CompressedArray {
10
11    /// Expect `values` to have `usize::MAX` for unused values; `false` if `values` must be sorted.
12    const MAX_FOR_UNUSED: bool = false;
13
14    /// Construct `Self`.
15    fn new(values: Vec<usize>, last_in_value: usize, number_of_keys: usize) -> Self;
16
17    /// Get `index`-th item from the array.
18    fn get(&self, index: usize) -> usize;
19
20    /// Writes `self` to the `output`.
21    fn write(&self, output: &mut dyn io::Write) -> io::Result<()>;
22
23    /// Returns number of bytes which `write` will write.
24    fn write_bytes(&self) -> usize;
25
26    /// Read `Self` from the `input`.
27    fn read(input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized;
28}
29
30/// Builder used to construct `CompressedArray`.
31pub trait CompressedBuilder: Sized {
32    fn new(num_of_values: usize, max_value: usize) -> Self;
33    fn push(&mut self, value: usize);
34
35    #[inline]
36    fn with_all(values: Vec<usize>, last: usize) -> Self {
37        let mut builder = Self::new(values.len(), last);
38        for value in values { builder.push(value); }
39        builder
40    }
41}
42
43/// CompressedArray implementation by Elias-Fano from `cseq` crate.
44#[cfg(feature = "cseq")] pub type CSeqEliasFano = cseq::elias_fano::Sequence<bitm::CombinedSampling<bitm::ConstCombinedSamplingDensity<11>>, bitm::BinaryRankSearch>;
45
46#[cfg(feature = "cseq")]
47impl CompressedBuilder for cseq::elias_fano::Builder {
48    #[inline] fn new(num_of_values: usize, max_value: usize) -> Self {
49        cseq::elias_fano::Builder::new(num_of_values, max_value as u64+1)
50    }
51
52    #[inline] fn push(&mut self, value: usize) {
53        unsafe { self.push_unchecked(value as u64); }
54    }
55}
56
57#[cfg(feature = "cseq")]
58impl CompressedArray for CSeqEliasFano {
59
60    fn new(values: Vec<usize>, last: usize, _num_of_keys: usize) -> Self {
61        cseq::elias_fano::Builder::with_all(values, last).finish_s()
62    }
63
64    #[inline]
65    fn get(&self, index: usize) -> usize {
66        //self.get_or_panic(index) as usize
67        unsafe { self.get_unchecked(index) as usize }
68    }
69    
70    fn write(&self, output: &mut dyn io::Write) -> io::Result<()> {
71        CSeqEliasFano::write(&self, output)
72    }
73    
74    fn write_bytes(&self) -> usize {
75        CSeqEliasFano::write_bytes(&self)
76    }
77    
78    fn read(input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized {
79        CSeqEliasFano::read_s(input)
80    }
81}
82
83/// Represents linear function f(i) = floor((multiplier*i + offset) / divider).
84pub struct LinearRegression {
85    multiplier: isize,   // can be usize
86    divider: isize, // can be usize
87    offset: isize,  // must be isize
88}
89
90impl LinearRegression {
91    /// Constructs `LinearRegression` (with given `multiplier/divider` linear coefficient) and possibly small array of corrections
92    /// that can produce given `values`.
93    pub fn new(multiplier: usize, divider: usize, values: Vec<usize>) -> (Self, CompactFast) {
94        let mut max_diff = isize::MIN;   // max value - predicted difference = max correction
95        let mut min_diff = isize::MAX;   // min value - predicted difference = min correction
96        for (i, v) in values.iter().copied().enumerate() {
97            if v == usize::MAX { continue; }
98            let diff = (i * multiplier) as isize - (v * divider) as isize;   // divide by divider here?
99            if diff > max_diff { max_diff = diff }
100            if diff < min_diff { min_diff = diff }
101        }
102        let regression = LinearRegression {
103            multiplier: multiplier as isize,
104            divider: divider as isize,
105            offset: min_diff
106        };
107        let max_correction = (max_diff - min_diff) as usize / divider;
108        let mut corrections = CompactFastBuilder::new(values.len(), max_correction);
109        //let mut real_max_correction = usize::MIN;
110        //let mut real_min_correction = usize::MAX;
111        for (i, v) in values.iter().copied().enumerate() {
112            if v == usize::MAX {
113                corrections.push(0);
114            } else {
115                let correction = regression.get(i) - v as isize;
116                debug_assert!(correction >= 0);
117                let correction = correction as usize;
118                debug_assert!(correction <= max_correction, "{correction} <= {max_correction}");
119                corrections.push(correction as usize);
120                //if correction > real_max_correction { real_max_correction = correction; }
121                //if correction < real_min_correction { real_min_correction = correction; }
122            }
123        }
124        //assert_eq!(real_min_correction, 0);
125        //assert_eq!(real_max_correction, max_correction);
126        (regression, corrections.compact)
127    }
128
129    /// Add `total_offset` to each value returned by `get`.
130    /* #[inline] pub fn add_total_offset(&mut self, total_offset: usize) {
131        self.offset += dbg!(total_offset * self.divider) as isize;
132    }*/
133
134    /// Returns the value of function.
135    #[inline(always)] pub fn get(&self, i: usize) -> isize {
136        (self.multiplier * i as isize - self.offset) / self.divider 
137    }
138}
139
140pub trait LinearRegressionConstructor {
141    /// Returns linear coefficient as numerator and denominator.
142    fn new(values: &[usize], num_of_keys: usize) -> (usize, usize);
143}
144
145pub struct Simple;
146
147impl LinearRegressionConstructor for Simple {
148    #[inline] fn new(values: &[usize], num_of_keys: usize) -> (usize, usize) {
149        (num_of_keys, values.len()+1)
150    }
151}
152
153pub struct LeastSquares;
154
155impl LinearRegressionConstructor for LeastSquares {
156    fn new(values: &[usize], _num_of_keys: usize) -> (usize, usize) {        
157        let mut n= 0u128;
158        let mut x_sum = 0;
159        let mut y_sum = 0;
160        let mut x_sqr_sum = 0;
161        let mut xy_sum = 0;
162        for (x, y) in values.iter().copied().enumerate() {
163            if y == usize::MAX { continue; }
164            n += 1;
165            x_sum += x as u128;
166            y_sum += y as u128;
167            x_sqr_sum += (x as u128) * (x as u128);
168            xy_sum += (x as u128) * (y as u128);
169        }
170        if n == 0 { return (1, 1); }
171        let mut multiplier = (n * xy_sum).abs_diff(x_sum * y_sum);
172        let mut divider = (n * x_sqr_sum).abs_diff(x_sum * x_sum);
173        let max_vals = (1<<(isize::BITS-2)) / n;
174        if multiplier > max_vals || divider > max_vals {
175            let div = (multiplier / max_vals).max(divider / max_vals);
176            let divh = div / 2;
177            multiplier = (multiplier + divh) / div;
178            divider = (divider + divh) / div;
179        }
180        (multiplier as usize, divider as usize)
181    }
182}
183
184/// Implementation of `CompressedArray` that stores differences of values and linear regression
185/// with the same number of bits required to store the largest difference.
186pub struct LinearRegressionArray<C> {
187    regression: LinearRegression,
188    corrections: CompactFast,
189    constructor: PhantomData<C>
190}
191
192impl<C: LinearRegressionConstructor> CompressedArray for LinearRegressionArray<C> {
193    const MAX_FOR_UNUSED: bool = true;
194    
195    fn new(values: Vec<usize>, _last: usize, num_of_keys: usize) -> Self {
196        let (multiplier, divider) = C::new(&values, num_of_keys);
197        let (regression, corrections) = LinearRegression::new(multiplier, divider, values);
198        Self { regression, corrections, constructor: PhantomData }
199    }
200
201    fn get(&self, index: usize) -> usize {
202        (self.regression.get(index) - self.corrections.get(index) as isize) as usize
203        //(unsafe { get_bits57(self.corrections.as_ptr(), index * self.bits_per_correction as usize) & n_lowest_bits(self.bits_per_correction) }) as usize
204    }
205    
206    fn write(&self, output: &mut dyn io::Write) -> io::Result<()> {
207        AsIs::write(output, self.regression.multiplier as usize)?;
208        AsIs::write(output, self.regression.divider as usize)?;
209        AsIs::write(output, self.regression.offset as usize)?;
210        self.corrections.write(output)
211    }
212        
213    fn write_bytes(&self) -> usize {
214        AsIs::size(self.regression.multiplier as usize)
215        + AsIs::size(self.regression.divider as usize)
216        + AsIs::size(self.regression.offset as usize)
217        + self.corrections.write_bytes()
218    }
219
220    /// Read `Self` from the `input`.
221    fn read(input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized {
222        let multiplier: usize = AsIs::read(input)?;
223        let divider: usize = AsIs::read(input)?;
224        let offset: usize = AsIs::read(input)?;
225        let corrections = CompactFast::read(input)?;
226        Ok(Self {
227            regression: LinearRegression {
228                multiplier: multiplier as isize,
229                divider: divider as isize,
230                offset: offset as isize,
231            },
232            corrections,
233            constructor: PhantomData
234        })
235    }
236}
237
238impl<C> GetSize for LinearRegressionArray<C> {
239    fn size_bytes_dyn(&self) -> usize { self.corrections.size_bytes_dyn() }
240    fn size_bytes_content_dyn(&self) -> usize { self.corrections.size_bytes_dyn() }
241    const USES_DYN_MEM: bool = true;
242}
243
244/// Implementation of `CompressedArray` that stores each value with the same number of bits required to store the largest one.
245pub struct Compact {
246    pub items: Box<[u64]>,
247    pub item_size: u8,
248}
249
250pub struct CompactBuilder {
251    compact: Compact,
252    index: usize
253}
254
255impl CompressedBuilder for CompactBuilder {
256    fn new(num_of_values: usize, max_value: usize) -> Self {
257        let item_size = bits_to_store(max_value as u64);
258        Self {
259            compact: Compact { items: Box::with_zeroed_bits(item_size as usize * num_of_values), item_size },
260            index: 0
261        }
262    }
263
264    #[inline] fn push(&mut self, value: usize) {
265        self.compact.items.init_successive_bits(&mut self.index, value as u64, self.compact.item_size);
266    }
267}
268
269impl GetSize for Compact {
270    fn size_bytes_dyn(&self) -> usize { self.items.size_bytes_dyn() }
271    fn size_bytes_content_dyn(&self) -> usize { self.items.size_bytes_dyn() }
272    const USES_DYN_MEM: bool = true;
273}
274
275impl CompressedArray for Compact {
276    fn new(values: Vec<usize>, last: usize, _num_of_keys: usize) -> Self {
277        CompactBuilder::with_all(values, last).compact
278    }
279
280    #[inline]
281    fn get(&self, index: usize) -> usize {
282        unsafe { self.items.get_fragment_unchecked(index, self.item_size) as usize }
283    }
284
285    fn write(&self, output: &mut dyn io::Write) -> io::Result<()> {
286        VByte::write(output, self.item_size)?;
287        AsIs::write_array(output, &self.items)
288    }
289
290    fn write_bytes(&self) -> usize {
291        VByte::size(self.item_size) + AsIs::array_size(&self.items)
292    }
293
294    fn read(input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized {
295        let item_size = VByte::read(input)?;
296        let items = AsIs::read_array(input)?;
297        Ok(Self { items, item_size })
298    }
299}
300
301
302/// Implementation of `CompressedArray` that stores each value with the same number of bits required to store the largest one.
303/// It uses unaligned memory reading and writing.
304pub struct CompactFast {
305    pub items: Box<[u8]>,
306    pub item_size: u8,
307}
308
309pub struct CompactFastBuilder {
310    compact: CompactFast,
311    first_bit: usize,
312}
313
314impl CompressedBuilder for CompactFastBuilder {
315    #[inline] fn new(num_of_values: usize, max_value: usize) -> Self {
316        let item_size = bits_to_store(max_value as u64);
317        Self {
318            compact: CompactFast { items: vec![0; ceiling_div(item_size as usize * num_of_values, 8) + 7].into_boxed_slice(), item_size },
319            first_bit: 0,
320        }
321    }
322
323    #[inline] fn push(&mut self, value: usize) {
324        unsafe{init_bits57(self.compact.items.as_mut_ptr(), self.first_bit, value as u64)};
325        self.first_bit += self.compact.item_size as usize;
326    }
327}
328
329impl GetSize for CompactFast {
330    fn size_bytes_dyn(&self) -> usize { self.items.size_bytes_dyn() }
331    fn size_bytes_content_dyn(&self) -> usize { self.items.size_bytes_dyn() }
332    const USES_DYN_MEM: bool = true;
333}
334
335impl CompressedArray for CompactFast {
336    fn new(values: Vec<usize>, last: usize, _num_of_keys: usize) -> Self {
337        CompactFastBuilder::with_all(values, last).compact
338    }
339
340    #[inline]
341    fn get(&self, index: usize) -> usize {
342        (unsafe { get_bits57(self.items.as_ptr(), index * self.item_size as usize) & n_lowest_bits(self.item_size) }) as usize
343    }
344
345    #[inline]
346    fn write(&self, output: &mut dyn io::Write) -> io::Result<()> {
347        VByte::write(output, self.item_size)?;
348        AsIs::write_array(output, &self.items)
349    }
350
351    fn write_bytes(&self) -> usize {
352        VByte::size(self.item_size) + AsIs::array_size(&self.items)
353    }
354
355    fn read(input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized {
356        let item_size = VByte::read(input)?;
357        let items = AsIs::read_array(input)?;
358        Ok(Self { items, item_size })
359    }
360}
361
362
363/// CompressedArray implementation by Elias-Fano from `sux` crate.
364#[cfg(feature = "sux")] pub struct SuxEliasFano(sux::dict::elias_fano::EfSeq);
365
366#[cfg(feature = "sux")] impl CompressedBuilder for sux::dict::EliasFanoBuilder {
367    #[inline] fn new(num_of_values: usize, max_value: usize) -> Self {
368        sux::dict::EliasFanoBuilder::new(num_of_values, max_value)
369    }
370
371    #[inline] fn push(&mut self, value: usize) {
372        unsafe{ self.push_unchecked(value); }
373    }
374}
375
376#[cfg(feature = "sux")]
377impl CompressedArray for SuxEliasFano {
378    fn new(values: Vec<usize>, last: usize, _num_of_keys: usize) -> Self {
379        SuxEliasFano(sux::dict::EliasFanoBuilder::with_all(values, last).build_with_seq())
380    }
381
382    #[inline]
383    fn get(&self, index: usize) -> usize {
384        unsafe { self.0.get_unchecked(index) }
385    }
386    
387    fn write(&self, output: &mut dyn io::Write) -> io::Result<()> {
388        use std::io::Write;
389
390        use epserde::ser::Serialize;
391        let mut bw = std::io::BufWriter::new(output);
392        match unsafe {self.0.serialize(&mut bw)} {
393            Ok(_) => Ok(()),
394            Err(epserde::ser::Error::FileOpenError(io_err)) => Err(io_err),
395            Err(e) => Err(io::Error::new(io::ErrorKind::Other, e))
396        }?;
397        bw.flush()
398    }
399
400    fn write_bytes(&self) -> usize {
401        let mut buf = Vec::<u8>::new();
402        use epserde::ser::Serialize;
403        unsafe { self.0.serialize(&mut buf).unwrap(); }
404        buf.len()
405    }
406
407    fn read(input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized {     
408        use epserde::deser::Deserialize;
409        match unsafe{ sux::dict::elias_fano::EfSeq::deserialize_full(&mut std::io::BufReader::with_capacity(1, input))} {
410            Ok(v) => Ok(Self(v)),
411            Err(epserde::deser::Error::FileOpenError(io_err)) => Err(io_err),
412            Err(e) => Err(io::Error::new(io::ErrorKind::Other, e))
413        }
414    }
415}
416
417#[cfg(feature = "sux")]
418impl GetSize for SuxEliasFano {
419    fn size_bytes_dyn(&self) -> usize {
420        mem_dbg::MemSize::mem_size(&self.0, mem_dbg::SizeFlags::default()) - std::mem::size_of_val(self)
421    }
422
423    fn size_bytes_content_dyn(&self) -> usize { 
424        mem_dbg::MemSize::mem_size(&self.0, mem_dbg::SizeFlags::default() | mem_dbg::SizeFlags::CAPACITY) - std::mem::size_of_val(self)
425    }
426
427    fn size_bytes(&self) -> usize {
428        mem_dbg::MemSize::mem_size(&self.0, mem_dbg::SizeFlags::default())
429    }
430
431    const USES_DYN_MEM: bool = true;
432}
433
434#[cfg(feature = "cacheline-ef")]
435/// CompressedArray implementation by Elias-Fano from `cacheline_ef` crate. Experimental.
436pub struct CachelineEF(cacheline_ef::CachelineEfVec);
437
438#[cfg(feature = "cacheline-ef")]
439impl CompressedArray for CachelineEF {
440    fn new(values: Vec<usize>, _last: usize, _num_of_keys: usize) -> Self {
441        let v: Vec<_> = values.iter().map(|v| *v as u64).collect();
442        CachelineEF(cacheline_ef::CachelineEfVec::new(&v))
443    }
444
445    //#[inline]
446    fn get(&self, index: usize) -> usize {
447        unsafe { self.0.index_unchecked(index) as usize }
448        //self.0.index(index) as usize
449    }
450    
451    fn write(&self, _output: &mut dyn io::Write) -> io::Result<()> {
452        todo!()
453    }
454    
455    fn write_bytes(&self) -> usize {
456        todo!()
457    }
458    
459    fn read(_input: &mut dyn io::Read) -> io::Result<Self> where Self: Sized {
460        todo!()
461    }
462}
463
464#[cfg(feature = "cacheline-ef")]
465impl GetSize for CachelineEF {
466    fn size_bytes_dyn(&self) -> usize {
467        mem_dbg::MemSize::mem_size(&self.0, mem_dbg::SizeFlags::default()) - std::mem::size_of_val(self)
468    }
469
470    fn size_bytes_content_dyn(&self) -> usize { 
471        mem_dbg::MemSize::mem_size(&self.0, mem_dbg::SizeFlags::default() | mem_dbg::SizeFlags::CAPACITY) - std::mem::size_of_val(self)
472    }
473
474    fn size_bytes(&self) -> usize {
475        mem_dbg::MemSize::mem_size(&self.0, mem_dbg::SizeFlags::default())
476    }
477
478    const USES_DYN_MEM: bool = true;
479}
480
481#[cfg(feature = "sux")] pub type DefaultCompressedArray = SuxEliasFano;
482#[cfg(all(feature = "cacheline-ef", not(feature = "sux")))] pub type DefaultCompressedArray = CachelineEF;
483#[cfg(all(feature = "cseq", not(feature = "sux"), not(feature="cacheline-ef")))] pub type DefaultCompressedArray = CSeqEliasFano;
484#[cfg(all(not(feature="cseq"), not(feature = "sux"), not(feature="cacheline-ef")))] pub type DefaultCompressedArray = Compact;