runiq/filters.rs
1//! Module containing filter implementations for Runiq.
2//!
3//! Each structure in this module has different filtering properties
4//! and should be chosen based on the specific use case of the user.
5//!
6//! Please see the struct documentation for further information on
7//! each filter, including their runtime characteristics.
8use clap::*;
9use fnv::FnvHashSet;
10use scalable_bloom_filter::ScalableBloomFilter;
11use twox_hash::XxHash64;
12
13use std::collections::HashSet;
14use std::hash::Hasher;
15
16// Enumerable filters for clap-rs.
17arg_enum! {
18 /// Enum to store all possible variants of filters.
19 ///
20 /// This will implement the `Into` trait in order to create a new
21 /// boxed filter from a filter kind to keep conversion contained.
22 #[doc(hidden)]
23 #[derive(Copy, Clone, Debug)]
24 pub enum FilterKind {
25 Sorted,
26 Digest,
27 Naive,
28 Bloom,
29 }
30}
31
32/// Trait for any type which can be used to filter unique values.
33///
34/// The filter only supports a single operation to detect a unique
35/// which will provide the ability to check/insert in a single operation.
36pub trait Filter {
37 /// Create a new instance using defaults.
38 fn new() -> Self
39 where
40 Self: Sized;
41
42 /// Detects a unique value.
43 ///
44 /// Return values are booleans to represent whether the value
45 /// was added to the internal filter or not (i.e. `true` if
46 /// this is the first time the value has been seen).
47 fn detect(&mut self, input: &[u8]) -> bool;
48}
49
50/// Implement `Into` to convert to `Filter`.
51impl Into<Box<dyn Filter>> for FilterKind {
52 /// Creates a new `Filter` type based on the enum value.
53 fn into(self) -> Box<dyn Filter> {
54 match self {
55 FilterKind::Sorted => Box::new(SortedFilter::new()),
56 FilterKind::Digest => Box::new(DigestFilter::new()),
57 FilterKind::Naive => Box::new(NaiveFilter::new()),
58 FilterKind::Bloom => Box::new(BloomFilter::new()),
59 }
60 }
61}
62
63/// Basic filter implementation backed by a `HashSet`.
64///
65/// This implementation offers nothing more than abstraction over
66/// using a `HashSet` directly, and will store raw values in the
67/// set. Naturally this means that memory will not be particularly
68/// efficient, but it is guaranteed to be completely accurate when
69/// calculating unique collisions in inputs.
70#[derive(Clone, Debug, Default)]
71pub struct NaiveFilter {
72 inner: HashSet<Vec<u8>>,
73}
74
75/// Implement all trait methods.
76impl Filter for NaiveFilter {
77 /// Creates a new `NaiveFilter`.
78 fn new() -> NaiveFilter {
79 NaiveFilter::default()
80 }
81
82 /// Detects a unique value.
83 #[inline]
84 fn detect(&mut self, input: &[u8]) -> bool {
85 self.inner.insert(input.to_vec())
86 }
87}
88
89/// Digest filter implementation backed by a `HashSet`.
90///
91/// This implementation offers much better memory efficiency when
92/// compared to the `NaiveFilter` due to the fact that raw values
93/// are hashed to `usize` values before being stored in the set.
94///
95/// It's also a little faster due to some improved efficiency
96/// when comparing values in the set itself, but it's not of any
97/// real consequence and is barely noticeable.
98#[derive(Clone, Debug, Default)]
99pub struct DigestFilter {
100 inner: FnvHashSet<u64>,
101}
102
103/// Implement all trait methods.
104impl Filter for DigestFilter {
105 /// Creates a new `DigestFilter`.
106 fn new() -> DigestFilter {
107 DigestFilter::default()
108 }
109
110 /// Detects a unique value.
111 #[inline]
112 fn detect(&mut self, input: &[u8]) -> bool {
113 // insert as a hashed digest
114 self.inner.insert(hash(input))
115 }
116}
117
118/// Uniq filter implementation to only remove consecutive duplicates.
119///
120/// This is the fastest filter (although not by much), and the best in
121/// terms of memory efficiency as it only requires a single value stored
122/// in the filter memory at once. It operates in the same was as the Unix
123/// `uniq` utility, and thus requires your data be sorted prior to any
124/// execution.
125///
126/// Remember that repeatedly running Runiq on the same input would be
127/// a good candidate for sorting your data initially and then making
128/// use of this filter to optimize memory usage going forward.
129#[derive(Clone, Debug)]
130pub struct SortedFilter {
131 inner: Vec<u8>,
132}
133
134/// Implement all trait methods.
135impl Filter for SortedFilter {
136 /// Creates a new `SortedFilter`.
137 fn new() -> SortedFilter {
138 SortedFilter { inner: Vec::new() }
139 }
140
141 /// Detects a unique value.
142 #[inline]
143 fn detect(&mut self, input: &[u8]) -> bool {
144 // check for consec collision
145 if input == &self.inner[..] {
146 return false;
147 }
148
149 // overwrite the previous value
150 self.inner = input.to_vec();
151 true
152 }
153}
154
155/// Bitset filter backed by a scalable Bloom Filter.
156///
157/// This filter operates with the least amount of memory, with a cost
158/// of speed (roughly 60-70% of the speed of the `DigestFilter`, using
159/// only 25% of the memory).
160///
161/// The backing bloom filter initializes with `1e6` bits by default, with
162/// `1e-7` probability of collisions. This is roughly comparable to the
163/// collision rate of the digest filter, so this should be chosen when
164/// memory is critical.
165#[derive(Debug)]
166pub struct BloomFilter {
167 inner: ScalableBloomFilter<u64>,
168}
169
170/// Implement all trait methods.
171impl Filter for BloomFilter {
172 /// Creates a new `BloomFilter`.
173 fn new() -> BloomFilter {
174 BloomFilter {
175 inner: ScalableBloomFilter::new(1_000_000, 1e-8),
176 }
177 }
178
179 /// Detects a unique value.
180 #[inline]
181 fn detect(&mut self, input: &[u8]) -> bool {
182 // create a digest from the input
183 let digest = hash(input);
184
185 // short circuit if duplicated
186 if self.inner.contains(&digest) {
187 return false;
188 }
189
190 // insert on duplicates
191 self.inner.insert(&digest);
192 true
193 }
194}
195
196/// Small hash binding around `Hasher`.
197fn hash(input: &[u8]) -> u64 {
198 // create a new default hasher
199 let mut hasher = XxHash64::default();
200
201 // write the bytes to the hasher
202 hasher.write(input);
203
204 // finish the hash
205 hasher.finish()
206}
207
208#[cfg(test)]
209mod tests {
210 use super::*;
211
212 #[test]
213 fn naive_filter_detection() {
214 let mut filter = NaiveFilter::new();
215
216 let ins1 = filter.detect(b"input1");
217 let ins2 = filter.detect(b"input1");
218
219 assert_eq!(ins1, true);
220 assert_eq!(ins2, false);
221 }
222
223 #[test]
224 fn digest_filter_detection() {
225 let mut filter = DigestFilter::new();
226
227 let ins1 = filter.detect(b"input1");
228 let ins2 = filter.detect(b"input1");
229
230 assert_eq!(ins1, true);
231 assert_eq!(ins2, false);
232 }
233
234 #[test]
235 fn sorted_filter_detection() {
236 let mut filter = SortedFilter::new();
237
238 let ins1 = filter.detect(b"input1");
239 let ins2 = filter.detect(b"input1");
240 let ins3 = filter.detect(b"input2");
241 let ins4 = filter.detect(b"input1");
242
243 assert_eq!(ins1, true);
244 assert_eq!(ins2, false);
245 assert_eq!(ins3, true);
246 assert_eq!(ins4, true);
247 }
248
249 #[test]
250 fn bloom_filter_detection() {
251 let mut filter = BloomFilter::new();
252
253 let ins1 = filter.detect(b"input1");
254 let ins2 = filter.detect(b"input1");
255
256 assert_eq!(ins1, true);
257 assert_eq!(ins2, false);
258 }
259}