1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
//! Golomb Coded Set is a probabilistic data structure which is typically smaller than a bloom
//! filter, at the cost of query performance.. A sorted list of differences between samples of a
//! random distribution will roughly have a geometric distribution, which makes for an optimal
//! prefix for Golomb-Rice coding. Since a good hash algorithm will be randomly distributed, this
//! encoding makes for efficient storage of hashed values.
//!
//! Giovanni Bajo's blog post as well as their Python and C++ implementations were a huge help when
//! writing and testing this library, found
//! [here](http://giovanni.bajo.it/post/47119962313golomb-coded-sets-smaller-than-bloom-filters)
//! and [here](https://github.com/rasky/gcs) respectively.
//!
//! ## Usage and Behaviour
//!
//! There are 3 main parameters to select when creating a Golomb Coded Set: the hash algorithm, `N`
//! and `P`. `N` is the desired maximum number of elements that will be inserted into the set, and
//! `1 / 2 ^ P` is the desired probability of a false positive when the set is full. If fewer items
//! have been inserted the real probability will be significantly lower.
//!
//! The chosen hashing algorithm must have a uniform distribution (which is not the same as being
//! cryptograpically secure) and the output length of the hash in bits must be greater than
//! `log2(N * 2 ^ P)` bits. This is not currently enforced by the library and failing to do so
//! could result in far more false positives than expected. Beyond meeting those requirements,
//! selecting an algorithm for speed would be appropriate. If the hardware acceleration is present,
//! CRC32 would be a good choice for up to a million elements and a false positive probability of
//! 0.001%. For larger sets and/or lower probabilities a hashing algorithm with a longer output is
//! needed.
//!
//! ## Example
//!
//! ```rust
//! use {golomb_set::UnpackedGcs, md5::Md5};
//!
//! // Create a GCS where when 3 items are stored in the set, the
//! // probability of a false positive will be `1/(2^5)`, or 3.1%
//! let mut gcs = UnpackedGcs::<Md5>::new(3, 5);
//!
//! // Insert the MD5 hashes of "alpha" and "bravo"
//! gcs.insert(b"alpha");
//! gcs.insert(b"bravo");
//!
//! assert!(gcs.contains(b"alpha"));
//! assert!(gcs.contains(b"bravo"));
//! assert!(!gcs.contains(b"charlie"));
//!
//! // Reduces memory footprint in exchange for slower querying
//! let gcs = gcs.pack();
//!
//! assert!(gcs.contains(b"alpha"));
//! assert!(gcs.contains(b"bravo"));
//! assert!(!gcs.contains(b"charlie"));
//! ```

#![deny(missing_docs)]

#[macro_use]
extern crate failure_derive;

use {
    bitvec::{
        prelude::{BigEndian, BitVec, LittleEndian},
        store::BitStore,
    },
    byteorder::ByteOrder,
    digest::Digest,
    num_integer::div_rem,
    std::{
        io::{self, Read, Write},
        marker::PhantomData,
    },
};

/// Errors that may occur when handling Golomb Coded Sets.
#[derive(Debug, Fail)]
pub enum Error {
    /// Returned when attempting to insert an additional element into an
    /// already full Golomb Coded Set.
    #[fail(display = "Limit for the number of elements has been reached")]
    LimitReached,
    /// The Golomb-Rice encoded sequence of bits could not be decoded, returned
    /// when unpacking or calling the `contains` method on a a packed GCS.
    #[fail(display = "Decoding failed due to invalid Golomb-Rice bit sequence")]
    Decode,
    /// todo
    #[fail(display = "IO error: {}", _0)]
    Io(io::Error),
}

impl From<io::Error> for Error {
    fn from(err: io::Error) -> Self {
        Error::Io(err)
    }
}

/// An unpacked Golomb Coded Set.
#[derive(Clone, Debug, PartialEq)]
pub struct UnpackedGcs<D: Digest> {
    n: usize,
    p: u8,
    values: Vec<u64>,
    digest: PhantomData<D>,
}

impl<D: Digest> UnpackedGcs<D> {
    /// Creates a new `UnpackedGcs` from `n` and `p`, where `1/2^p` is the probability
    /// of a false positive when n items have been inserted into the set.
    pub fn new(n: usize, p: u8) -> Self {
        Self {
            n,
            p,
            values: Vec::new(),
            digest: PhantomData,
        }
    }

    /// Copies data from the reader and inserts into into the set.
    ///
    /// # Errors
    /// * If there is an error reading data from `reader`.
    /// * If more than `n` items have been inserted.
    pub fn insert_from_reader<R: Read>(&mut self, mut reader: R) -> Result<(), Error> {
        let mut vec = Vec::new();
        reader.read_exact(&mut vec)?;
        self.insert(&vec)
    }

    /// Adds an entry to the set, and returns an error if more than N items are added.
    ///
    /// # Errors
    /// * If more than `n` items have been inserted.
    pub fn insert<A: AsRef<[u8]>>(&mut self, input: A) -> Result<(), Error> {
        if self.values.len() < self.n {
            self.values
                .push(digest_value::<D>(self.n as u64, self.p, input.as_ref()));
            self.values.sort();
            Ok(())
        } else {
            Err(Error::LimitReached)
        }
    }

    /// Returns whether or not an input is contained in the set. If false the
    /// input is definitely not present, if true the input is probably present.
    pub fn contains<A: AsRef<[u8]>>(&self, input: A) -> bool {
        self.values
            .binary_search(&digest_value::<D>(self.n as u64, self.p, input.as_ref()))
            .is_ok()
    }

    /// Packs an `UnpackedGcs` into a `Gcs`.
    ///
    /// This will will reduce the memory footprint, but also reduce query
    /// performance.
    pub fn pack(&self) -> Gcs<D> {
        let mut values = self.values.clone();

        // Sort then calculate differences
        values.sort();
        for i in (1..values.len()).rev() {
            values[i] -= values[i - 1];
        }

        // Apply golomb encoding
        let mut data = BitVec::<BigEndian, u8>::new();
        for val in values {
            data.append(&mut golomb_encode(val, self.p))
        }

        Gcs {
            n: self.n,
            p: self.p,
            data,
            digest: self.digest,
        }
    }
}

/// A packed Golomb-coded Set.
#[derive(Clone, Debug, PartialEq)]
pub struct Gcs<D: Digest> {
    n: usize,
    p: u8,
    data: BitVec,
    digest: PhantomData<D>,
}

impl<D: Digest> Gcs<D> {
    /// Read a packed `Gcs` from any Reader.
    ///
    /// # Errors
    /// * If there is an error reading data from `reader`.
    pub fn from_reader<R: Read>(reader: &mut R, n: usize, p: u8) -> Result<Self, Error> {
        let mut buf = Vec::new();
        reader.read_to_end(&mut buf)?;
        let data = BitVec::<BigEndian, u8>::from_vec(buf);

        let mut iter = data.iter().peekable();
        while iter.peek().is_some() {
            golomb_decode(&mut iter, p)?;
        }

        Ok(Self {
            n,
            p,
            data,
            digest: PhantomData,
        })
    }

    /// Writes a packed `Gcs` to a Writer.
    ///
    /// # Errors
    /// * If there is an error writing data to `writer`.
    pub fn write<W: Write>(&self, writer: &mut W) -> Result<(), Error> {
        writer.write_all(&self.data.clone().into_vec())?;
        Ok(())
    }

    /// Returns whether or not an input is contained in the set. If false the
    /// input is definitely not present, if true the input is probably present.
    ///
    /// # Errors
    /// * If the inner data is not a valid Golomb-Rice encoding.
    pub fn contains<A: AsRef<[u8]>>(&self, input: A) -> bool {
        let input = digest_value::<D>(self.n as u64, self.p, input.as_ref());

        let mut iter = self.data.iter().peekable();

        let mut last = 0;

        while iter.peek().is_some() {
            // This should never happen because data is checked on creation
            let decoded = golomb_decode(&mut iter, self.p).expect("Golomb decoding failed");

            if input == (decoded + last) {
                return true;
            } else {
                last += decoded;
            }
        }

        false
    }

    /// Unpacks a `Gcs` into an `UnpackedGcs`.
    ///
    /// This will will increase query performance, but also increase the memory
    /// footprint.
    ///
    /// # Errors
    /// * If the inner data is not a valid Golomb-Rice encoding.
    pub fn unpack(&self) -> UnpackedGcs<D> {
        let mut values = {
            let mut iter = self.data.iter().peekable();
            let mut values = Vec::new();

            while iter.peek().is_some() {
                // This should never happen because data is checked on creation
                values.push(golomb_decode(&mut iter, self.p).expect("Golomb decoding failed"));
            }

            values
        };

        for i in 1..values.len() {
            values[i] += values[i - 1];
        }

        values.sort();

        UnpackedGcs {
            n: self.n,
            p: self.p,
            values,
            digest: self.digest,
        }
    }
}

/// Perform Golomb-Rice encoding of n, with modulus 2^p.
///
/// # Panics
/// * Panics if `p == 0`.
fn golomb_encode(n: u64, p: u8) -> BitVec {
    if p == 0 {
        panic!("p cannot be 0");
    }
    let (quo, rem) = div_rem(n, 2u64.pow(u32::from(p)));

    let mut out = BitVec::new();

    // Unary encoding of quotient
    for _ in 0..quo {
        out.push(true);
    }
    out.push(false);

    // Binary encoding of remainder in p bits
    // remove vec and change to big end?
    for i in (0..p).rev() {
        out.push(rem.get::<LittleEndian>(i.into()));
    }

    out
}

/// Perform Golomb-Rice decoding of n, with modulus 2^p.
///
/// # Errors
/// * If `iter` is not a valid Golomb-Rice encoding
fn golomb_decode<I>(iter: &mut I, p: u8) -> Result<u64, Error>
where
    I: Iterator<Item = bool>,
{
    // parse unary encoded quotient
    let quo = iter.take_while(|i| *i).count() as u64;

    // parse binary encoded remainder
    let mut rem = 0u64;
    for _ in 0..p {
        match iter.next() {
            Some(true) => {
                rem += 1;
            }

            Some(false) => {}

            None => {
                return Err(Error::Decode);
            }
        }

        rem <<= 1;
    }
    rem >>= 1;

    // push quo * p + rem
    Ok(quo * 2u64.pow(u32::from(p)) + rem)
}

fn digest_value<D: Digest>(n: u64, p: u8, input: &[u8]) -> u64 {
    let val = if D::output_size() < 8 {
        let mut buf = [0u8; 8];
        let digest = D::digest(input);
        for i in 0..D::output_size() {
            buf[i + D::output_size()] = digest[i];
        }

        byteorder::BigEndian::read_u64(&buf)
    } else {
        byteorder::BigEndian::read_u64(&D::digest(input)[..8])
    };

    val % (n * 2u64.pow(u32::from(p)))
}

#[cfg(test)]
mod tests {
    use {super::*, proptest::prelude::*};

    proptest! {
        // Ranges need to be extended after improving performance
        #[test]
        fn golomb_single(n in 0u64..100000u64, p in 2u8..16) {
            assert_eq!(
                n,
                golomb_decode(&mut golomb_encode(n, p).iter().peekable(), p).unwrap()
            );
        }
    }
}