1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/*! # Fast Byte-Aligned Integer Coding
This crate is a Rust port of [Daniel Lemire's `streamvbyte` library](https://github.com/lemire/streamvbyte).
It contains multiple implementations of this format aimed at different integer sizes and value distributions.

Each `Coder` implementation produces different formats that are incompatible with one another. Names
provide the length of each of the 4 possible tags for each value so `Coder1234` encodes each entry
as 1, 2, 3, or 4 bytes. A scalar implementation is always available at a large speed penalty but
the implementation will automatically use an accelerated implementation for the target if available.

At the moment group implementations only have acceleration on little-endian `aarch64` targets with
`NEON` instruction support.

## Example without delta-coding

```
use streamvbyte64::{Coder, Coder1234};

let coder = Coder1234::new();
let values = vec![
    0u32, 128, 256, 1024, 70, 36, 1000000,
    378, 45, 888, 26, 262144, 88, 89, 90, 16777216
];
let (tag_len, data_len) = Coder1234::max_compressed_bytes(values.len());
let mut encoded = vec![0u8; tag_len + data_len];
let (tags, data) = encoded.split_at_mut(tag_len);
// This is the number of bytes in data actually used. If you're writing the encoded data
// to an output you would write tags and &data[..encoded_data_len].
let encoded_data_len = coder.encode(&values, tags, data);

let mut decoded = vec![0u32; values.len()];
coder.decode(tags, &data[..encoded_data_len], &mut decoded);
assert_eq!(values, decoded);

// You can also use this a bit like an array using data_len() to skip whole groups
// at the cost of reading all the tags before the target group.
let mut group = [0u32; 4];
coder.decode(&tags[2..3], &data[coder.data_len(&tags[..2])..], &mut group);
assert_eq!(values[11], group[3]);
```

## Example with delta coding

```
use streamvbyte64::{Coder, Coder1234};

let coder = Coder1234::new();
let mut sum = 0u32;
let values = [
    0u32, 128, 256, 1024, 70, 36, 1000000,
    378, 45, 888, 26, 262144, 88, 89, 90, 16777216
].iter().map(|x| {
    sum += x;
    sum
}).collect::<Vec<_>>();
let (tag_len, data_len) = Coder1234::max_compressed_bytes(values.len());
let mut encoded = vec![0u8; tag_len + data_len];
let (tags, data) = encoded.split_at_mut(tag_len);
let nodelta_data_len = coder.encode(&values, tags, data);
// encode_deltas() and decode_deltas() both accept an initial value that is subtracted from
// or added to all the value in the stream. At the beginning of the stream this is usually
// zero but may be non-zero if you are encoding/decoding in the middle of a stream.
let encoded_data_len = coder.encode_deltas(0, &values, tags, data);
// Fewer bytes are written with the delta encoding as we only record the distance between
// each value and not the value itself.
assert!(encoded_data_len < nodelta_data_len);

let mut decoded = vec![0u32; values.len()];
coder.decode_deltas(0, tags, &data[..encoded_data_len], &mut decoded);
assert_eq!(values, decoded);

// You can also use this a bit like an array using skip_deltas() to skip whole groups
// at a cost that is less than straight decoding.
let (skip_data_len, initial) = coder.skip_deltas(&tags[..2], data);
let mut group = [0u32; 4];
coder.decode_deltas(initial, &tags[2..3], &data[skip_data_len..], &mut group);
assert_eq!(values[11], group[3]);
```
*/

mod arch;
mod coder_impl;
mod coding_descriptor;
mod raw_group;
mod tag_utils;

mod coder0124;
mod coder1234;
mod coder1248;

pub use num_traits::{ops::wrapping::WrappingAdd, ops::wrapping::WrappingSub, PrimInt};

/// `Coder` compresses and decompresses integers in a byte-aligned format compose of two streams.
///
/// Groups of 4 integers are coded into two separate streams: a tag stream where each byte describes
/// a group, and a data stream containing values as described by the tag. The coder _does not_
/// record the number of entries in the stream, instead assuming a multiple of 4.
///
/// Different coder implementations support different integer widths (32 or 64 bit) as well as
/// different byte length distributions to better compress some data sets.
///
/// Use `max_compressed_bytes()` to compute the number of tag and data bytes that must be allocated
/// to safely encode a slice of input values.
pub trait Coder: Sized + Copy + Clone {
    /// The input/output element type for this coder, typically `u32` or `u64`.
    type Elem: PrimInt + WrappingAdd + WrappingSub + std::fmt::Debug + Sized + Copy + Clone;

    /// Create a new `Coder`, selecting the fastest implementation available.
    ///
    /// These objects should be relatively cheap to create and require no heap allocation.
    fn new() -> Self;

    /// Returns the number of `(tag_bytes, data_bytes)` required to compress a slice of length `len`.
    fn max_compressed_bytes(len: usize) -> (usize, usize) {
        let num_groups = (len + 3) / 4;
        (
            num_groups,
            num_groups * 4 * std::mem::size_of::<Self::Elem>(),
        )
    }

    /// Encodes a slice of values, writing tags and data to separate streams.
    ///
    /// For every 4 input values one tag byte and up to `std::mem::size_of::<Elem>() * 4` data bytes
    /// may be written to output.
    ///
    /// Returns the number of bytes written to the data stream.
    ///
    /// # Panics
    ///
    /// - If `values.len() % 4 != 0`
    /// - If `tags` or `data` are too small to fit all of the output data.
    fn encode(&self, values: &[Self::Elem], tags: &mut [u8], data: &mut [u8]) -> usize;

    /// Encodes a slice of values, writing tags and data to separate streams.
    ///
    /// Values are interpreted as deltas starting from `initial` which produces a more compact
    /// output that is also more expensive to encode and decode.
    ///
    /// For every 4 input values one tag byte and up to `std::mem::size_of::<Elem>() * 4` data bytes
    /// may be written to output.
    ///
    /// Returns the number of bytes written to the data stream.
    ///
    /// # Panics
    ///
    /// - If `values.len() % 4 != 0`
    /// - If `tags` or `data` are too small to fit all of the output data.
    fn encode_deltas(
        &self,
        initial: Self::Elem,
        values: &[Self::Elem],
        tags: &mut [u8],
        data: &mut [u8],
    ) -> usize;

    /// Decodes input tags and data streams to an output slice.
    ///
    /// Consumes all tag values in the input stream to produce `tags.len() * 4` values. May consume
    /// up to `std::mem::size_of<Elem>() * tags.len() * 4` bytes from data.
    ///
    /// Returns the number of bytes consumed from the data stream.
    ///
    /// # Panics
    ///
    /// - If `values.len() % 4 != 0`.
    /// - If `tags.len() < values.len() / 4`.
    /// - If decoding would consume bytes past the end of `data`.
    fn decode(&self, tags: &[u8], data: &[u8], values: &mut [Self::Elem]) -> usize;

    /// Decodes input tags and data streams to an output slice.
    ///
    /// Values are interepreted as deltas starting from `initial`.
    ///
    /// Consumes all tag values in the input stream to produce `tags.len() * 4` values. May consume
    /// up to `std::mem::size_of<Elem>() * tags.len() * 4` bytes from data.
    ///
    /// Returns the number of bytes consumed from the data stream.
    ///
    /// # Panics
    ///
    /// - If `values.len() % 4 != 0`.
    /// - If `tags.len() < values.len() / 4`.
    /// - If decoding would consume bytes past the end of `data`.
    fn decode_deltas(
        &self,
        initial: Self::Elem,
        tags: &[u8],
        data: &[u8],
        values: &mut [Self::Elem],
    ) -> usize;

    /// Returns the data length of all the groups encoded by `tags`.
    fn data_len(&self, tags: &[u8]) -> usize;

    /// Skip `tags.len() * 4` deltas read from input tag and data streams.
    ///
    /// Returns the number of bytes consumed from the data stream and the sum of all the deltas that
    /// were skipped.
    ///
    /// # Panics
    ///
    ///  - If decoding would consume bytes past the end of `data`.
    fn skip_deltas(&self, tags: &[u8], data: &[u8]) -> (usize, Self::Elem);
}

pub use coder0124::Coder0124;
pub use coder1234::Coder1234;
pub use coder1248::Coder1248;

#[cfg(test)]
pub(crate) mod tests;