all_pairs_hamming/
chunked_join.rs

1//! A fast and compact implementation of similarity self-join on binary sketches in the Hamming space.
2use hashbrown::HashSet;
3
4use crate::errors::{AllPairsHammingError, Result};
5use crate::multi_sort::MultiSort;
6use crate::sketch::Sketch;
7
8/// A fast and compact implementation of similarity self-join on binary sketches in the Hamming space.
9/// The algorithm employs a modified variant of the sketch sorting with the multi-index approach.
10///
11/// # Complexities
12///
13/// The time and memory complexities are linear in the input and output size.
14///
15/// # Examples
16///
17/// ```
18/// use all_pairs_hamming::ChunkedJoiner;
19///
20/// let mut joiner = ChunkedJoiner::<u8>::new(2);
21/// joiner.add([0b1111, 0b1001]);
22/// joiner.add([0b1101, 0b1001]);
23/// joiner.add([0b0101, 0b0001]);
24///
25/// let mut results = joiner.similar_pairs(0.15);
26/// assert_eq!(results, vec![(0, 1, 0.0625), (1, 2, 0.125)]);
27/// ```
28///
29/// # References
30///
31/// - Tabei, Uno, Sugiyama, and Tsuda.
32///   [Single versus Multiple Sorting in All Pairs Similarity Search](https://proceedings.mlr.press/v13/tabei10a.html).
33///   ACML, 2010
34/// - J. Qin et al.
35///   [Generalizing the Pigeonhole Principle for Similarity Search in Hamming Space](https://doi.org/10.1109/TKDE.2019.2899597).
36///   IEEE Transactions on Knowledge and Data Engineering, 2021
37pub struct ChunkedJoiner<S> {
38    chunks: Vec<Vec<S>>,
39    shows_progress: bool,
40}
41
42impl<S> ChunkedJoiner<S>
43where
44    S: Sketch,
45{
46    /// Creates an instance, handling sketches of `num_chunks` chunks, i.e.,
47    /// in `S::dim() * num_chunks` dimensions.
48    pub fn new(num_chunks: usize) -> Self {
49        Self {
50            chunks: vec![vec![]; num_chunks],
51            shows_progress: false,
52        }
53    }
54
55    /// Prints the progress with stderr?
56    pub const fn shows_progress(mut self, yes: bool) -> Self {
57        self.shows_progress = yes;
58        self
59    }
60
61    /// Appends a sketch of [`Self::num_chunks()`] chunks.
62    /// The first [`Self::num_chunks()`] elements of an input iterator is stored.
63    /// If the iterator is consumed until obtaining the elements, an error is returned.
64    pub fn add<I>(&mut self, sketch: I) -> Result<()>
65    where
66        I: IntoIterator<Item = S>,
67    {
68        let num_chunks = self.num_chunks();
69        let mut iter = sketch.into_iter();
70        for chunk in self.chunks.iter_mut() {
71            chunk.push(iter.next().ok_or_else(|| {
72                let msg = format!("The input sketch must include {num_chunks} chunks at least.");
73                AllPairsHammingError::input(msg)
74            })?);
75        }
76        Ok(())
77    }
78
79    /// Finds all similar pairs whose normalized Hamming distance is within `radius`,
80    /// returning triplets of the left-side id, the right-side id, and thier distance.
81    pub fn similar_pairs(&self, radius: f64) -> Vec<(usize, usize, f64)> {
82        let dimension = S::dim() * self.num_chunks();
83        let hamradius = (dimension as f64 * radius).ceil() as usize;
84        if self.shows_progress {
85            eprintln!(
86                "[ChunkedJoiner::similar_pairs] #dimensions={dimension}, hamradius={hamradius}"
87            );
88        }
89
90        // TODO: Threading.
91        let mut candidates = HashSet::new();
92        for (j, chunk) in self.chunks.iter().enumerate() {
93            // Based on the general pigeonhole principle.
94            // https://doi.org/10.1109/TKDE.2019.2899597
95            if j + hamradius + 1 < self.chunks.len() {
96                continue;
97            }
98            let r = (j + hamradius + 1 - self.chunks.len()) / self.chunks.len();
99            MultiSort::new().similar_pairs(chunk, r, &mut candidates);
100
101            if self.shows_progress {
102                eprintln!(
103                    "[ChunkedJoiner::similar_pairs] Processed {}/{}...",
104                    j + 1,
105                    self.chunks.len()
106                );
107                eprintln!(
108                    "[ChunkedJoiner::similar_pairs] #candidates={}",
109                    candidates.len()
110                );
111            }
112        }
113        if self.shows_progress {
114            eprintln!("[ChunkedJoiner::similar_pairs] Done");
115        }
116
117        let mut candidates: Vec<_> = candidates.into_iter().collect();
118        candidates.sort_unstable();
119
120        let bound = (dimension as f64 * radius) as usize;
121        let mut matched = vec![];
122
123        for (i, j) in candidates {
124            if let Some(dist) = self.hamming_distance(i, j, bound) {
125                let dist = dist as f64 / dimension as f64;
126                if dist <= radius {
127                    matched.push((i, j, dist));
128                }
129            }
130        }
131        if self.shows_progress {
132            eprintln!("[ChunkedJoiner::similar_pairs] #matched={}", matched.len());
133        }
134        matched
135    }
136
137    /// Gets the number of chunks.
138    pub fn num_chunks(&self) -> usize {
139        self.chunks.len()
140    }
141
142    /// Gets the number of stored sketches.
143    pub fn num_sketches(&self) -> usize {
144        self.chunks.get(0).map(|v| v.len()).unwrap_or(0)
145    }
146
147    /// Gets the memory usage in bytes.
148    pub fn memory_in_bytes(&self) -> usize {
149        self.num_chunks() * self.num_sketches() * std::mem::size_of::<S>()
150    }
151
152    fn hamming_distance(&self, i: usize, j: usize, bound: usize) -> Option<usize> {
153        let mut dist = 0;
154        for chunk in &self.chunks {
155            dist += chunk[i].hamdist(chunk[j]);
156            if bound < dist {
157                return None;
158            }
159        }
160        Some(dist)
161    }
162}
163
164#[cfg(test)]
165mod tests {
166    use super::*;
167
168    fn example_sketches() -> Vec<u16> {
169        vec![
170            0b_1110_0011_1111_1011, // 0
171            0b_0001_0111_0111_1101, // 1
172            0b_1100_1101_1000_1100, // 2
173            0b_1100_1101_0001_0100, // 3
174            0b_1010_1110_0010_1010, // 4
175            0b_0111_1001_0011_1111, // 5
176            0b_1110_0011_0001_0000, // 6
177            0b_1000_0111_1001_0101, // 7
178            0b_1110_1101_1000_1101, // 8
179            0b_0111_1001_0011_1001, // 9
180        ]
181    }
182
183    fn naive_search(sketches: &[u16], radius: f64) -> Vec<(usize, usize, f64)> {
184        let mut results = vec![];
185        for i in 0..sketches.len() {
186            let x = sketches[i];
187            for j in i + 1..sketches.len() {
188                let y = sketches[j];
189                let dist = x.hamdist(y);
190                let dist = dist as f64 / 16.;
191                if dist <= radius {
192                    results.push((i, j, dist));
193                }
194            }
195        }
196        results
197    }
198
199    fn test_similar_pairs(radius: f64) {
200        let sketches = example_sketches();
201        let expected = naive_search(&sketches, radius);
202
203        let mut joiner = ChunkedJoiner::new(2);
204        for s in sketches {
205            joiner.add([(s & 0xFF) as u8, (s >> 8) as u8]).unwrap();
206        }
207        let mut results = joiner.similar_pairs(radius);
208        results.sort_by_key(|&(i, j, _)| (i, j));
209        assert_eq!(results, expected);
210    }
211
212    #[test]
213    fn test_similar_pairs_for_all() {
214        for radius in 0..=10 {
215            test_similar_pairs(radius as f64 / 10.);
216        }
217    }
218
219    #[test]
220    fn test_short_sketch() {
221        let mut joiner = ChunkedJoiner::new(2);
222        let result = joiner.add([0u64]);
223        assert!(result.is_err());
224    }
225}