Skip to main content

dyntext/
postings.rs

1//! Inverted-index data structure mapping trigram hash to the
2//! set of document ids whose text contains the trigram.
3//!
4//! The mapping is stored as a [`BTreeMap`] keyed by the trigram
5//! `u64` (so iteration is deterministic) with [`RoaringBitmap`]
6//! values. Roaring is the right shape for this workload because
7//! the postings lists are dense in popular trigrams and sparse
8//! in rare ones, and Roaring's hybrid container layout
9//! (run / array / bitmap) is competitive with both extremes
10//! while keeping the compressed size proportional to the
11//! shannon entropy of the doc-id stream rather than to its
12//! cardinality.
13//!
14//! Intersection and union operate on slices of trigram hashes;
15//! the slice ordering does not affect the result. Intersection
16//! sorts by posting-list cardinality before reducing so the
17//! shortest list bounds the working-set size of the reduce.
18
19use std::collections::BTreeMap;
20
21use roaring::RoaringBitmap;
22use serde::{Deserialize, Serialize};
23
24/// Inverted index: `trigram_hash -> bitmap of doc ids`.
25#[derive(Debug, Default, Clone, Serialize, Deserialize)]
26pub struct Postings {
27    map: BTreeMap<u64, RoaringBitmap>,
28}
29
30impl Postings {
31    /// Construct an empty postings index.
32    #[must_use]
33    pub fn new() -> Self {
34        Self {
35            map: BTreeMap::new(),
36        }
37    }
38
39    /// Number of distinct trigrams present in the index.
40    #[must_use]
41    pub fn len(&self) -> usize {
42        self.map.len()
43    }
44
45    /// Whether the index has any entries.
46    #[must_use]
47    pub fn is_empty(&self) -> bool {
48        self.map.is_empty()
49    }
50
51    /// Insert `doc_id` into the postings list for `trigram`.
52    ///
53    /// Idempotent: re-inserting the same `(trigram, doc_id)`
54    /// pair is a no-op.
55    pub fn insert(&mut self, trigram: u64, doc_id: u32) {
56        self.map.entry(trigram).or_default().insert(doc_id);
57    }
58
59    /// Remove `doc_id` from the postings list for `trigram`.
60    ///
61    /// If removing leaves the postings list empty, the trigram
62    /// entry is dropped from the map (garbage collection).
63    pub fn remove(&mut self, trigram: u64, doc_id: u32) {
64        let drop = match self.map.get_mut(&trigram) {
65            Some(bm) => {
66                bm.remove(doc_id);
67                bm.is_empty()
68            }
69            None => false,
70        };
71        if drop {
72            self.map.remove(&trigram);
73        }
74    }
75
76    /// Borrow the postings bitmap for `trigram`, if any.
77    #[must_use]
78    pub fn lookup(&self, trigram: u64) -> Option<&RoaringBitmap> {
79        self.map.get(&trigram)
80    }
81
82    /// Intersect the postings lists for the given trigrams.
83    ///
84    /// Returns the set of doc ids that appear in EVERY trigram's
85    /// list. If `trigrams` is empty, the result is empty (the
86    /// index does not invent a "universal" set: callers asking
87    /// to intersect "no" trigrams should treat this as a
88    /// no-evidence query and either fall back to a full scan or
89    /// return the empty set, depending on policy).
90    ///
91    /// If any trigram has no postings entry, the result is empty
92    /// (a missing trigram is provably absent from every doc).
93    ///
94    /// The reduction order is by ascending posting-list size so
95    /// the working set shrinks as fast as possible.
96    #[must_use]
97    pub fn intersect(&self, trigrams: &[u64]) -> RoaringBitmap {
98        if trigrams.is_empty() {
99            return RoaringBitmap::new();
100        }
101        let mut order: Vec<u64> = trigrams.to_vec();
102        order.sort_unstable();
103        order.dedup();
104        order.sort_by_key(|t| self.map.get(t).map_or(0, RoaringBitmap::len));
105
106        let mut iter = order.into_iter();
107        let Some(first_trigram) = iter.next() else {
108            return RoaringBitmap::new();
109        };
110        let mut acc = match self.map.get(&first_trigram) {
111            Some(b) => b.clone(),
112            None => return RoaringBitmap::new(),
113        };
114        for t in iter {
115            if acc.is_empty() {
116                return acc;
117            }
118            match self.map.get(&t) {
119                Some(b) => {
120                    acc &= b;
121                }
122                None => return RoaringBitmap::new(),
123            }
124        }
125        acc
126    }
127
128    /// Union the postings lists for the given trigrams.
129    ///
130    /// Returns the set of doc ids that appear in AT LEAST ONE of
131    /// the trigrams' lists. Missing trigrams contribute nothing.
132    /// An empty input yields the empty set.
133    #[must_use]
134    pub fn union(&self, trigrams: &[u64]) -> RoaringBitmap {
135        let mut acc = RoaringBitmap::new();
136        let mut seen: Vec<u64> = trigrams.to_vec();
137        seen.sort_unstable();
138        seen.dedup();
139        for t in seen {
140            if let Some(b) = self.map.get(&t) {
141                acc |= b;
142            }
143        }
144        acc
145    }
146}
147
148#[cfg(test)]
149mod tests {
150    use super::*;
151
152    #[test]
153    fn postings_insert_and_lookup() {
154        let mut p = Postings::new();
155        p.insert(7, 1);
156        p.insert(7, 2);
157        p.insert(8, 2);
158        let bm = p.lookup(7).expect("trigram 7 present");
159        assert!(bm.contains(1));
160        assert!(bm.contains(2));
161        let bm = p.lookup(8).expect("trigram 8 present");
162        assert!(bm.contains(2));
163        assert!(!bm.contains(1));
164        assert_eq!(p.len(), 2);
165    }
166
167    #[test]
168    fn postings_lookup_missing_returns_none() {
169        let p = Postings::new();
170        assert!(p.lookup(99).is_none());
171    }
172
173    #[test]
174    fn postings_intersection_of_two_trigrams_yields_intersection() {
175        let mut p = Postings::new();
176        p.insert(1, 10);
177        p.insert(1, 20);
178        p.insert(1, 30);
179        p.insert(2, 20);
180        p.insert(2, 30);
181        p.insert(2, 40);
182        let r = p.intersect(&[1, 2]);
183        assert!(r.contains(20));
184        assert!(r.contains(30));
185        assert!(!r.contains(10));
186        assert!(!r.contains(40));
187        assert_eq!(r.len(), 2);
188    }
189
190    #[test]
191    fn postings_intersection_of_disjoint_trigrams_is_empty() {
192        let mut p = Postings::new();
193        p.insert(1, 10);
194        p.insert(1, 20);
195        p.insert(2, 30);
196        p.insert(2, 40);
197        let r = p.intersect(&[1, 2]);
198        assert!(r.is_empty());
199    }
200
201    #[test]
202    fn postings_intersection_with_missing_trigram_is_empty() {
203        let mut p = Postings::new();
204        p.insert(1, 10);
205        let r = p.intersect(&[1, 999]);
206        assert!(r.is_empty());
207    }
208
209    #[test]
210    fn postings_intersection_empty_input_is_empty() {
211        let p = Postings::new();
212        assert!(p.intersect(&[]).is_empty());
213    }
214
215    #[test]
216    fn postings_intersection_single_trigram_returns_that_list() {
217        let mut p = Postings::new();
218        p.insert(1, 10);
219        p.insert(1, 20);
220        let r = p.intersect(&[1]);
221        assert_eq!(r.len(), 2);
222    }
223
224    #[test]
225    fn postings_union_basic() {
226        let mut p = Postings::new();
227        p.insert(1, 10);
228        p.insert(2, 20);
229        p.insert(3, 30);
230        let r = p.union(&[1, 2]);
231        assert!(r.contains(10));
232        assert!(r.contains(20));
233        assert!(!r.contains(30));
234    }
235
236    #[test]
237    fn postings_remove_clears_doc_id_from_one_trigram() {
238        let mut p = Postings::new();
239        p.insert(1, 10);
240        p.insert(1, 20);
241        p.remove(1, 10);
242        let bm = p.lookup(1).expect("trigram 1 still present");
243        assert!(!bm.contains(10));
244        assert!(bm.contains(20));
245    }
246
247    #[test]
248    fn postings_dropping_a_trigram_when_no_docs_left_garbage_collects() {
249        let mut p = Postings::new();
250        p.insert(1, 10);
251        assert_eq!(p.len(), 1);
252        p.remove(1, 10);
253        assert_eq!(p.len(), 0);
254        assert!(p.lookup(1).is_none());
255    }
256
257    #[test]
258    fn postings_remove_missing_is_a_noop() {
259        let mut p = Postings::new();
260        p.insert(1, 10);
261        p.remove(1, 99);
262        p.remove(2, 10);
263        let bm = p.lookup(1).expect("trigram 1 still present");
264        assert!(bm.contains(10));
265    }
266}