runiq/
filters.rs

1//! Module containing filter implementations for Runiq.
2//!
3//! Each structure in this module has different filtering properties
4//! and should be chosen based on the specific use case of the user.
5//!
6//! Please see the struct documentation for further information on
7//! each filter, including their runtime characteristics.
8use clap::*;
9use fnv::FnvHashSet;
10use scalable_bloom_filter::ScalableBloomFilter;
11use twox_hash::XxHash64;
12
13use std::collections::HashSet;
14use std::hash::Hasher;
15
16// Enumerable filters for clap-rs.
17arg_enum! {
18    /// Enum to store all possible variants of filters.
19    ///
20    /// This will implement the `Into` trait in order to create a new
21    /// boxed filter from a filter kind to keep conversion contained.
22    #[doc(hidden)]
23    #[derive(Copy, Clone, Debug)]
24    pub enum FilterKind {
25        Sorted,
26        Digest,
27        Naive,
28        Bloom,
29    }
30}
31
32/// Trait for any type which can be used to filter unique values.
33///
34/// The filter only supports a single operation to detect a unique
35/// which will provide the ability to check/insert in a single operation.
36pub trait Filter {
37    /// Create a new instance using defaults.
38    fn new() -> Self
39    where
40        Self: Sized;
41
42    /// Detects a unique value.
43    ///
44    /// Return values are booleans to represent whether the value
45    /// was added to the internal filter or not (i.e. `true` if
46    /// this is the first time the value has been seen).
47    fn detect(&mut self, input: &[u8]) -> bool;
48}
49
50/// Implement `Into` to convert to `Filter`.
51impl Into<Box<dyn Filter>> for FilterKind {
52    /// Creates a new `Filter` type based on the enum value.
53    fn into(self) -> Box<dyn Filter> {
54        match self {
55            FilterKind::Sorted => Box::new(SortedFilter::new()),
56            FilterKind::Digest => Box::new(DigestFilter::new()),
57            FilterKind::Naive => Box::new(NaiveFilter::new()),
58            FilterKind::Bloom => Box::new(BloomFilter::new()),
59        }
60    }
61}
62
63/// Basic filter implementation backed by a `HashSet`.
64///
65/// This implementation offers nothing more than abstraction over
66/// using a `HashSet` directly, and will store raw values in the
67/// set. Naturally this means that memory will not be particularly
68/// efficient, but it is guaranteed to be completely accurate when
69/// calculating unique collisions in inputs.
70#[derive(Clone, Debug, Default)]
71pub struct NaiveFilter {
72    inner: HashSet<Vec<u8>>,
73}
74
75/// Implement all trait methods.
76impl Filter for NaiveFilter {
77    /// Creates a new `NaiveFilter`.
78    fn new() -> NaiveFilter {
79        NaiveFilter::default()
80    }
81
82    /// Detects a unique value.
83    #[inline]
84    fn detect(&mut self, input: &[u8]) -> bool {
85        self.inner.insert(input.to_vec())
86    }
87}
88
89/// Digest filter implementation backed by a `HashSet`.
90///
91/// This implementation offers much better memory efficiency when
92/// compared to the `NaiveFilter` due to the fact that raw values
93/// are hashed to `usize` values before being stored in the set.
94///
95/// It's also a little faster due to some improved efficiency
96/// when comparing values in the set itself, but it's not of any
97/// real consequence and is barely noticeable.
98#[derive(Clone, Debug, Default)]
99pub struct DigestFilter {
100    inner: FnvHashSet<u64>,
101}
102
103/// Implement all trait methods.
104impl Filter for DigestFilter {
105    /// Creates a new `DigestFilter`.
106    fn new() -> DigestFilter {
107        DigestFilter::default()
108    }
109
110    /// Detects a unique value.
111    #[inline]
112    fn detect(&mut self, input: &[u8]) -> bool {
113        // insert as a hashed digest
114        self.inner.insert(hash(input))
115    }
116}
117
118/// Uniq filter implementation to only remove consecutive duplicates.
119///
120/// This is the fastest filter (although not by much), and the best in
121/// terms of memory efficiency as it only requires a single value stored
122/// in the filter memory at once. It operates in the same was as the Unix
123/// `uniq` utility, and thus requires your data be sorted prior to any
124/// execution.
125///
126/// Remember that repeatedly running Runiq on the same input would be
127/// a good candidate for sorting your data initially and then making
128/// use of this filter to optimize memory usage going forward.
129#[derive(Clone, Debug)]
130pub struct SortedFilter {
131    inner: Vec<u8>,
132}
133
134/// Implement all trait methods.
135impl Filter for SortedFilter {
136    /// Creates a new `SortedFilter`.
137    fn new() -> SortedFilter {
138        SortedFilter { inner: Vec::new() }
139    }
140
141    /// Detects a unique value.
142    #[inline]
143    fn detect(&mut self, input: &[u8]) -> bool {
144        // check for consec collision
145        if input == &self.inner[..] {
146            return false;
147        }
148
149        // overwrite the previous value
150        self.inner = input.to_vec();
151        true
152    }
153}
154
155/// Bitset filter backed by a scalable Bloom Filter.
156///
157/// This filter operates with the least amount of memory, with a cost
158/// of speed (roughly 60-70% of the speed of the `DigestFilter`, using
159/// only 25% of the memory).
160///
161/// The backing bloom filter initializes with `1e6` bits by default, with
162/// `1e-7` probability of collisions. This is roughly comparable to the
163/// collision rate of the digest filter, so this should be chosen when
164/// memory is critical.
165#[derive(Debug)]
166pub struct BloomFilter {
167    inner: ScalableBloomFilter<u64>,
168}
169
170/// Implement all trait methods.
171impl Filter for BloomFilter {
172    /// Creates a new `BloomFilter`.
173    fn new() -> BloomFilter {
174        BloomFilter {
175            inner: ScalableBloomFilter::new(1_000_000, 1e-8),
176        }
177    }
178
179    /// Detects a unique value.
180    #[inline]
181    fn detect(&mut self, input: &[u8]) -> bool {
182        // create a digest from the input
183        let digest = hash(input);
184
185        // short circuit if duplicated
186        if self.inner.contains(&digest) {
187            return false;
188        }
189
190        // insert on duplicates
191        self.inner.insert(&digest);
192        true
193    }
194}
195
196/// Small hash binding around `Hasher`.
197fn hash(input: &[u8]) -> u64 {
198    // create a new default hasher
199    let mut hasher = XxHash64::default();
200
201    // write the bytes to the hasher
202    hasher.write(input);
203
204    // finish the hash
205    hasher.finish()
206}
207
208#[cfg(test)]
209mod tests {
210    use super::*;
211
212    #[test]
213    fn naive_filter_detection() {
214        let mut filter = NaiveFilter::new();
215
216        let ins1 = filter.detect(b"input1");
217        let ins2 = filter.detect(b"input1");
218
219        assert_eq!(ins1, true);
220        assert_eq!(ins2, false);
221    }
222
223    #[test]
224    fn digest_filter_detection() {
225        let mut filter = DigestFilter::new();
226
227        let ins1 = filter.detect(b"input1");
228        let ins2 = filter.detect(b"input1");
229
230        assert_eq!(ins1, true);
231        assert_eq!(ins2, false);
232    }
233
234    #[test]
235    fn sorted_filter_detection() {
236        let mut filter = SortedFilter::new();
237
238        let ins1 = filter.detect(b"input1");
239        let ins2 = filter.detect(b"input1");
240        let ins3 = filter.detect(b"input2");
241        let ins4 = filter.detect(b"input1");
242
243        assert_eq!(ins1, true);
244        assert_eq!(ins2, false);
245        assert_eq!(ins3, true);
246        assert_eq!(ins4, true);
247    }
248
249    #[test]
250    fn bloom_filter_detection() {
251        let mut filter = BloomFilter::new();
252
253        let ins1 = filter.detect(b"input1");
254        let ins2 = filter.detect(b"input1");
255
256        assert_eq!(ins1, true);
257        assert_eq!(ins2, false);
258    }
259}