orc_format/read/decode/
rle_v2.rs

1use std::io::Read;
2
3use crate::error::Error;
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
6enum EncodingTypeV2 {
7    ShortRepeat,
8    Direct,
9    PatchedBase,
10    Delta,
11}
12
13fn header_to_rle_v2_short_repeated_width(header: u8) -> u8 {
14    (header & 0b00111000) >> 3
15}
16
17fn header_to_rle_v2_short_repeated_count(header: u8) -> u8 {
18    header & 0b00000111
19}
20
21fn rle_v2_direct_bit_width(value: u8) -> u8 {
22    match value {
23        0 => 1,
24        1 => 2,
25        3 => 4,
26        7 => 8,
27        15 => 16,
28        23 => 24,
29        27 => 32,
30        28 => 40,
31        29 => 48,
32        30 => 56,
33        31 => 64,
34        other => todo!("{other}"),
35    }
36}
37
38fn header_to_rle_v2_direct_bit_width(header: u8) -> u8 {
39    let bit_width = (header & 0b00111110) >> 1;
40    rle_v2_direct_bit_width(bit_width)
41}
42
43fn rle_v2_delta_bit_width(value: u8) -> u8 {
44    match value {
45        0 => 0,
46        1 => 2,
47        3 => 4,
48        7 => 8,
49        15 => 16,
50        23 => 24,
51        27 => 32,
52        28 => 40,
53        29 => 48,
54        30 => 56,
55        31 => 64,
56        other => todo!("{other}"),
57    }
58}
59
60fn header_to_rle_v2_delta_bit_width(header: u8) -> u8 {
61    let bit_width = (header & 0b00111110) >> 1;
62    rle_v2_delta_bit_width(bit_width)
63}
64
65fn header_to_rle_v2_direct_length(header: u8, header1: u8) -> u16 {
66    let bit = header & 0b00000001;
67    let r = u16::from_be_bytes([bit, header1]);
68    1 + r
69}
70
71fn unsigned_varint<R: Read>(reader: &mut R) -> Result<u64, Error> {
72    let mut i = 0u64;
73    let mut buf = [0u8; 1];
74    let mut j = 0;
75    loop {
76        if j > 9 {
77            // if j * 7 > 64
78            return Err(Error::OutOfSpec);
79        }
80        reader.read_exact(&mut buf[..])?;
81        i |= (u64::from(buf[0] & 0x7F)) << (j * 7);
82        if (buf[0] >> 7) == 0 {
83            break;
84        } else {
85            j += 1;
86        }
87    }
88    Ok(i)
89}
90
91#[inline]
92fn zigzag(z: u64) -> i64 {
93    if z & 0x1 == 0 {
94        (z >> 1) as i64
95    } else {
96        !(z >> 1) as i64
97    }
98}
99
100fn signed_varint<R: Read>(reader: &mut R) -> Result<i64, Error> {
101    unsigned_varint(reader).map(zigzag)
102}
103
104#[inline]
105fn unpack(bytes: &[u8], num_bits: u8, index: usize) -> u64 {
106    if num_bits == 0 {
107        return 0;
108    };
109    let num_bits = num_bits as usize;
110    let start = num_bits * index; // in bits
111    let length = num_bits; // in bits
112    let byte_start = start / 8;
113    let byte_end = (start + length + 7) / 8;
114    // copy swapped
115    let slice = &bytes[byte_start..byte_end];
116    let mut a = [0u8; 8];
117    for (i, item) in slice.iter().rev().enumerate() {
118        a[i] = *item;
119    }
120    let bits = u64::from_le_bytes(a);
121    let offset = (slice.len() * 8 - num_bits) % 8 - start % 8;
122    (bits >> offset) & (!0u64 >> (64 - num_bits))
123}
124
125#[derive(Debug)]
126pub struct UnsignedDirectRun {
127    data: Vec<u8>,
128    bit_width: u8,
129    index: usize,
130    length: usize,
131}
132
133impl UnsignedDirectRun {
134    #[inline]
135    pub fn try_new<R: Read>(
136        header: u8,
137        reader: &mut R,
138        mut scratch: Vec<u8>,
139    ) -> Result<Self, Error> {
140        let mut header1 = [0u8];
141        reader.read_exact(&mut header1)?;
142        let bit_width = header_to_rle_v2_direct_bit_width(header);
143
144        let length = header_to_rle_v2_direct_length(header, header1[0]);
145
146        let additional = ((bit_width as usize) * (length as usize) + 7) / 8;
147        scratch.clear();
148        scratch.reserve(additional);
149        reader.take(additional as u64).read_to_end(&mut scratch)?;
150
151        Ok(Self {
152            data: scratch,
153            bit_width,
154            index: 0,
155            length: length as usize,
156        })
157    }
158
159    #[inline]
160    pub fn len(&self) -> usize {
161        self.length - self.index
162    }
163}
164
165impl Iterator for UnsignedDirectRun {
166    type Item = u64;
167
168    #[inline]
169    fn next(&mut self) -> Option<Self::Item> {
170        (self.index != self.length).then(|| {
171            let index = self.index;
172            self.index += 1;
173            unpack(&self.data, self.bit_width, index)
174        })
175    }
176
177    #[inline]
178    fn size_hint(&self) -> (usize, Option<usize>) {
179        let remaining = self.len();
180        (remaining, Some(remaining))
181    }
182}
183
184pub struct UnsignedDeltaRun {
185    encoded_deltas: Vec<u8>,
186    bit_width: u8,
187    index: usize,
188    length: usize,
189    base: u64,
190    delta_base: i64,
191}
192
193impl UnsignedDeltaRun {
194    #[inline]
195    pub fn try_new<R: Read>(
196        header: u8,
197        reader: &mut R,
198        mut scratch: Vec<u8>,
199    ) -> Result<Self, Error> {
200        let mut header1 = [0u8];
201        reader.read_exact(&mut header1)?;
202        let bit_width = header_to_rle_v2_delta_bit_width(header);
203
204        let length = header_to_rle_v2_direct_length(header, header1[0]);
205
206        let base = unsigned_varint(reader)?;
207        let delta_base = signed_varint(reader)?;
208        let additional = ((length as usize - 2) * bit_width as usize + 7) / 8;
209
210        scratch.clear();
211        scratch.reserve(additional);
212        reader.take(additional as u64).read_to_end(&mut scratch)?;
213
214        Ok(Self {
215            base,
216            encoded_deltas: scratch,
217            bit_width,
218            index: 0,
219            length: length as usize,
220            delta_base,
221        })
222    }
223
224    #[inline]
225    pub fn len(&self) -> usize {
226        self.length - self.index
227    }
228
229    #[inline]
230    pub fn into_inner(mut self) -> Vec<u8> {
231        self.encoded_deltas.clear();
232        self.encoded_deltas
233    }
234}
235
236impl Iterator for UnsignedDeltaRun {
237    type Item = u64;
238
239    #[inline]
240    fn next(&mut self) -> Option<Self::Item> {
241        (self.index != self.length).then(|| {
242            let index = self.index;
243            if index == 0 {
244                self.index += 1;
245                return self.base;
246            }
247            if index == 1 || self.bit_width == 0 {
248                self.index += 1;
249                if self.delta_base > 0 {
250                    self.base += self.delta_base as u64;
251                } else {
252                    self.base -= (-self.delta_base) as u64;
253                }
254                return self.base;
255            }
256            self.index += 1;
257            let delta = unpack(&self.encoded_deltas, self.bit_width, index - 2);
258            if self.delta_base > 0 {
259                self.base += delta;
260            } else {
261                self.base -= delta;
262            }
263            self.base
264        })
265    }
266
267    #[inline]
268    fn size_hint(&self) -> (usize, Option<usize>) {
269        let remaining = self.len();
270        (remaining, Some(remaining))
271    }
272}
273
274#[derive(Debug)]
275pub struct UnsignedShortRepeat {
276    value: u64,
277    remaining: usize,
278    scratch: Vec<u8>,
279}
280
281impl UnsignedShortRepeat {
282    #[inline]
283    fn try_new<R: Read>(header: u8, reader: &mut R, mut scratch: Vec<u8>) -> Result<Self, Error> {
284        let width = 1 + header_to_rle_v2_short_repeated_width(header);
285        let count = 3 + header_to_rle_v2_short_repeated_count(header);
286
287        scratch.clear();
288        scratch.reserve(width as usize);
289        reader.take(width as u64).read_to_end(&mut scratch)?;
290
291        let mut a = [0u8; 8];
292        a[8 - scratch.len()..].copy_from_slice(&scratch);
293        let value = u64::from_be_bytes(a);
294        scratch.clear();
295
296        Ok(Self {
297            value,
298            remaining: count as usize,
299            scratch,
300        })
301    }
302
303    #[inline]
304    pub fn len(&self) -> usize {
305        self.remaining
306    }
307
308    #[inline]
309    pub fn into_inner(self) -> Vec<u8> {
310        self.scratch
311    }
312}
313
314impl Iterator for UnsignedShortRepeat {
315    type Item = u64;
316
317    #[inline]
318    fn next(&mut self) -> Option<Self::Item> {
319        (self.remaining != 0).then(|| {
320            self.remaining -= 1;
321            self.value
322        })
323    }
324
325    #[inline]
326    fn size_hint(&self) -> (usize, Option<usize>) {
327        (self.len(), Some(self.len()))
328    }
329}
330
331#[derive(Debug)]
332pub struct SignedDeltaRun {
333    encoded_deltas: Vec<u8>,
334    bit_width: u8,
335    index: usize,
336    length: usize,
337    base: i64,
338    delta_base: i64,
339}
340
341impl SignedDeltaRun {
342    #[inline]
343    fn try_new<R: Read>(header: u8, reader: &mut R, mut scratch: Vec<u8>) -> Result<Self, Error> {
344        let mut header1 = [0u8];
345        reader.read_exact(&mut header1)?;
346        let bit_width = header_to_rle_v2_delta_bit_width(header);
347
348        let length = header_to_rle_v2_direct_length(header, header1[0]);
349
350        let base = unsigned_varint(reader).map(zigzag)?;
351        let delta_base = signed_varint(reader)?;
352        let additional = ((length as usize - 2) * bit_width as usize + 7) / 8;
353
354        scratch.clear();
355        scratch.reserve(additional);
356        reader.take(additional as u64).read_to_end(&mut scratch)?;
357
358        Ok(Self {
359            base,
360            encoded_deltas: scratch,
361            bit_width,
362            index: 0,
363            length: length as usize,
364            delta_base,
365        })
366    }
367
368    pub fn len(&self) -> usize {
369        self.length - self.index
370    }
371
372    #[must_use]
373    pub fn is_empty(&self) -> bool {
374        self.len() == 0
375    }
376}
377
378impl Iterator for SignedDeltaRun {
379    type Item = i64;
380
381    #[inline]
382    fn next(&mut self) -> Option<Self::Item> {
383        (self.index != self.length).then(|| {
384            let index = self.index;
385            if index == 0 {
386                self.index += 1;
387                return self.base;
388            }
389            if index == 1 || self.bit_width == 0 {
390                self.index += 1;
391                if self.delta_base > 0 {
392                    self.base += self.delta_base as i64;
393                } else {
394                    self.base -= (-self.delta_base) as i64;
395                }
396                return self.base;
397            }
398            self.index += 1;
399            // edge case where `bit_width == 0`, where deltas are equal to base delta
400            let delta = unpack(&self.encoded_deltas, self.bit_width, index - 2);
401            if self.delta_base > 0 {
402                self.base += delta as i64;
403            } else {
404                self.base -= delta as i64;
405            }
406            self.base
407        })
408    }
409
410    #[inline]
411    fn size_hint(&self) -> (usize, Option<usize>) {
412        let remaining = self.length - self.index;
413        (remaining, Some(remaining))
414    }
415}
416
417#[inline]
418fn run_encoding(header: u8) -> EncodingTypeV2 {
419    match (header & 128 == 128, header & 64 == 64) {
420        // 11... = 3
421        (true, true) => EncodingTypeV2::Delta,
422        // 10... = 2
423        (true, false) => EncodingTypeV2::PatchedBase,
424        // 01... = 1
425        (false, true) => EncodingTypeV2::Direct,
426        // 00... = 0
427        (false, false) => EncodingTypeV2::ShortRepeat,
428    }
429}
430
431/// An enum describing one of the RLE v2 runs for unsigned integers
432pub enum UnsignedRleV2Run {
433    /// Direct
434    Direct(UnsignedDirectRun),
435    /// Delta
436    Delta(UnsignedDeltaRun),
437    /// Short repeat
438    ShortRepeat(UnsignedShortRepeat),
439}
440
441impl UnsignedRleV2Run {
442    /// Returns a new [`UnsignedRleV2Run`] owning `scratch`.
443    pub fn try_new<R: Read>(reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
444        let mut header = [0u8];
445        reader.read_exact(&mut header)?;
446        let header = header[0];
447        let encoding = run_encoding(header);
448
449        match encoding {
450            EncodingTypeV2::Direct => {
451                UnsignedDirectRun::try_new(header, reader, scratch).map(Self::Direct)
452            }
453            EncodingTypeV2::Delta => {
454                UnsignedDeltaRun::try_new(header, reader, scratch).map(Self::Delta)
455            }
456            EncodingTypeV2::ShortRepeat => {
457                UnsignedShortRepeat::try_new(header, reader, scratch).map(Self::ShortRepeat)
458            }
459            other => todo!("{other:?}"),
460        }
461    }
462
463    /// The number of items remaining
464    pub fn len(&self) -> usize {
465        match self {
466            Self::Direct(run) => run.len(),
467            Self::Delta(run) => run.len(),
468            Self::ShortRepeat(run) => run.len(),
469        }
470    }
471
472    /// Whether the iterator is empty
473    #[must_use]
474    pub fn is_empty(&self) -> bool {
475        self.len() == 0
476    }
477}
478
479/// A fallible [`Iterator`] of [`UnsignedRleV2Run`].
480pub struct UnsignedRleV2RunIter<R: Read> {
481    reader: R,
482    scratch: Vec<u8>,
483    length: usize,
484}
485
486impl<R: Read> UnsignedRleV2RunIter<R> {
487    /// Returns a new [`UnsignedRleV2RunIter`].
488    pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
489        Self {
490            reader,
491            scratch,
492            length,
493        }
494    }
495
496    /// Returns its internal buffer
497    pub fn into_inner(mut self) -> (R, Vec<u8>) {
498        self.scratch.clear();
499        (self.reader, self.scratch)
500    }
501}
502
503impl<R: Read> Iterator for UnsignedRleV2RunIter<R> {
504    type Item = Result<UnsignedRleV2Run, Error>;
505
506    fn next(&mut self) -> Option<Self::Item> {
507        (self.length != 0).then(|| {
508            let run =
509                UnsignedRleV2Run::try_new(&mut self.reader, std::mem::take(&mut self.scratch))?;
510            self.length -= run.len();
511            Ok(run)
512        })
513    }
514}
515
516/// A fallible [`Iterator`] of [`i64`].
517pub struct UnsignedRleV2Iter<R: Read> {
518    current: Option<UnsignedRleV2Run>,
519    runs: UnsignedRleV2RunIter<R>,
520}
521
522impl<R: Read> UnsignedRleV2Iter<R> {
523    /// Returns a new [`SignedRleV2Iter`].
524    pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
525        Self {
526            runs: UnsignedRleV2RunIter::new(reader, length, scratch),
527            current: None,
528        }
529    }
530
531    /// Returns its internal buffer
532    pub fn into_inner(self) -> (R, Vec<u8>) {
533        self.runs.into_inner()
534    }
535}
536
537impl<R: Read> Iterator for UnsignedRleV2Iter<R> {
538    type Item = Result<u64, Error>;
539
540    #[inline]
541    fn next(&mut self) -> Option<Self::Item> {
542        let next = if let Some(run) = &mut self.current {
543            match run {
544                UnsignedRleV2Run::Direct(values_iter) => values_iter.next(),
545                UnsignedRleV2Run::Delta(values_iter) => values_iter.next(),
546                UnsignedRleV2Run::ShortRepeat(values_iter) => values_iter.next(),
547            }
548        } else {
549            None
550        };
551
552        if next.is_none() {
553            match self.runs.next()? {
554                Ok(run) => self.current = Some(run),
555                Err(e) => return Some(Err(e)),
556            }
557            self.next()
558        } else {
559            next.map(Ok)
560        }
561    }
562}
563
564#[derive(Debug)]
565pub struct SignedDirectRun(UnsignedDirectRun);
566
567impl SignedDirectRun {
568    pub fn try_new<R: Read>(header: u8, reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
569        UnsignedDirectRun::try_new(header, reader, scratch).map(Self)
570    }
571
572    pub fn len(&self) -> usize {
573        self.0.len()
574    }
575
576    /// Whether the iterator is empty
577    #[must_use]
578    pub fn is_empty(&self) -> bool {
579        self.len() == 0
580    }
581}
582
583impl Iterator for SignedDirectRun {
584    type Item = i64;
585
586    fn next(&mut self) -> Option<Self::Item> {
587        self.0.next().map(zigzag)
588    }
589
590    fn size_hint(&self) -> (usize, Option<usize>) {
591        self.0.size_hint()
592    }
593}
594
595#[derive(Debug)]
596pub struct SignedShortRepeat(UnsignedShortRepeat);
597
598impl SignedShortRepeat {
599    pub fn try_new<R: Read>(header: u8, reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
600        UnsignedShortRepeat::try_new(header, reader, scratch).map(Self)
601    }
602
603    /// The number of items remaining
604    pub fn len(&self) -> usize {
605        self.0.len()
606    }
607
608    /// Whether the iterator is empty
609    #[must_use]
610    pub fn is_empty(&self) -> bool {
611        self.len() == 0
612    }
613}
614
615impl Iterator for SignedShortRepeat {
616    type Item = i64;
617
618    fn next(&mut self) -> Option<Self::Item> {
619        self.0.next().map(zigzag)
620    }
621
622    fn size_hint(&self) -> (usize, Option<usize>) {
623        self.0.size_hint()
624    }
625}
626
627/// An enum describing one of the RLE v2 runs for signed integers
628#[derive(Debug)]
629pub enum SignedRleV2Run {
630    /// Direct
631    Direct(SignedDirectRun),
632    /// Delta
633    Delta(SignedDeltaRun),
634    /// Short repeat
635    ShortRepeat(SignedShortRepeat),
636}
637
638impl SignedRleV2Run {
639    /// Returns a new [`SignedRleV2Run`], moving `scratch` to itself
640    pub fn try_new<R: Read>(reader: &mut R, scratch: Vec<u8>) -> Result<Self, Error> {
641        let mut header = [0u8];
642        reader.read_exact(&mut header)?;
643        let header = header[0];
644        let encoding = run_encoding(header);
645
646        match encoding {
647            EncodingTypeV2::Direct => {
648                SignedDirectRun::try_new(header, reader, scratch).map(Self::Direct)
649            }
650            EncodingTypeV2::Delta => {
651                SignedDeltaRun::try_new(header, reader, scratch).map(Self::Delta)
652            }
653            EncodingTypeV2::ShortRepeat => {
654                SignedShortRepeat::try_new(header, reader, scratch).map(Self::ShortRepeat)
655            }
656            other => todo!("{other:?}"),
657        }
658    }
659
660    /// The number of items remaining
661    pub fn len(&self) -> usize {
662        match self {
663            Self::Direct(run) => run.len(),
664            Self::Delta(run) => run.len(),
665            Self::ShortRepeat(run) => run.len(),
666        }
667    }
668
669    /// Whether the iterator is empty
670    #[must_use]
671    pub fn is_empty(&self) -> bool {
672        self.len() == 0
673    }
674}
675
676/// A fallible [`Iterator`] of [`SignedRleV2Run`].
677pub struct SignedRleV2RunIter<R: Read> {
678    reader: R,
679    scratch: Vec<u8>,
680    length: usize,
681}
682
683impl<R: Read> SignedRleV2RunIter<R> {
684    /// Returns a new [`SignedRleV2RunIter`].
685    pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
686        Self {
687            reader,
688            scratch,
689            length,
690        }
691    }
692
693    pub fn into_inner(mut self) -> (R, Vec<u8>) {
694        self.scratch.clear();
695        (self.reader, self.scratch)
696    }
697}
698
699impl<R: Read> Iterator for SignedRleV2RunIter<R> {
700    type Item = Result<SignedRleV2Run, Error>;
701
702    #[inline]
703    fn next(&mut self) -> Option<Self::Item> {
704        (self.length != 0).then(|| {
705            let run = SignedRleV2Run::try_new(&mut self.reader, std::mem::take(&mut self.scratch))?;
706            self.length -= run.len();
707            Ok(run)
708        })
709    }
710}
711
712/// A fallible [`Iterator`] of [`i64`].
713pub struct SignedRleV2Iter<R: Read> {
714    current: Option<SignedRleV2Run>,
715    runs: SignedRleV2RunIter<R>,
716}
717
718impl<R: Read> SignedRleV2Iter<R> {
719    /// Returns a new [`SignedRleV2Iter`].
720    pub fn new(reader: R, length: usize, scratch: Vec<u8>) -> Self {
721        Self {
722            runs: SignedRleV2RunIter::new(reader, length, scratch),
723            current: None,
724        }
725    }
726
727    /// Returns its internal buffer
728    pub fn into_inner(self) -> (R, Vec<u8>) {
729        self.runs.into_inner()
730    }
731}
732
733impl<R: Read> Iterator for SignedRleV2Iter<R> {
734    type Item = Result<i64, Error>;
735
736    #[inline]
737    fn next(&mut self) -> Option<Self::Item> {
738        let next = if let Some(run) = &mut self.current {
739            match run {
740                SignedRleV2Run::Direct(values_iter) => values_iter.next(),
741                SignedRleV2Run::Delta(values_iter) => values_iter.next(),
742                SignedRleV2Run::ShortRepeat(values_iter) => values_iter.next(),
743            }
744        } else {
745            None
746        };
747
748        if next.is_none() {
749            match self.runs.next()? {
750                Ok(run) => self.current = Some(run),
751                Err(e) => return Some(Err(e)),
752            }
753            self.next()
754        } else {
755            next.map(Ok)
756        }
757    }
758}
759
760#[cfg(test)]
761mod test {
762    use super::*;
763
764    #[test]
765    fn test_zigzag() {
766        assert_eq!(zigzag(2), 1);
767        assert_eq!(zigzag(4), 2);
768    }
769
770    #[test]
771    fn unpacking() {
772        let bytes = [0b01000000u8];
773        assert_eq!(unpack(&bytes, 2, 0), 1);
774        assert_eq!(unpack(&bytes, 2, 1), 0);
775    }
776
777    #[test]
778    fn short_repeat() {
779        // [10000, 10000, 10000, 10000, 10000]
780        let data: [u8; 3] = [0x0a, 0x27, 0x10];
781
782        let a = UnsignedShortRepeat::try_new(data[0], &mut &data[1..], vec![])
783            .unwrap()
784            .collect::<Vec<_>>();
785        assert_eq!(a, vec![10000, 10000, 10000, 10000, 10000]);
786    }
787
788    #[test]
789    fn direct() {
790        // [23713, 43806, 57005, 48879]
791        let data: [u8; 10] = [0x5e, 0x03, 0x5c, 0xa1, 0xab, 0x1e, 0xde, 0xad, 0xbe, 0xef];
792
793        let data = &mut data.as_ref();
794
795        let a = UnsignedDirectRun::try_new(data[0], &mut &data[1..], vec![])
796            .unwrap()
797            .collect::<Vec<_>>();
798        assert_eq!(a, vec![23713, 43806, 57005, 48879]);
799    }
800
801    #[test]
802    fn delta() {
803        // [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]
804        // 0x22 = 34
805        // 0x42 = 66
806        // 0x46 = 70
807        let data: [u8; 8] = [0xc6, 0x09, 0x02, 0x02, 0x22, 0x42, 0x42, 0x46];
808
809        let data = &mut data.as_ref();
810
811        let a = UnsignedDeltaRun::try_new(data[0], &mut &data[1..], vec![])
812            .unwrap()
813            .collect::<Vec<_>>();
814        assert_eq!(a, vec![2, 3, 5, 7, 11, 13, 17, 19, 23, 29]);
815    }
816}