Skip to main content

rgx/
trigram.rs

1//! Byte trigrams: the atomic unit of the candidate index.
2//!
3//! A trigram is any three consecutive bytes of a file's contents. A file is represented in the
4//! index by the *set* of distinct trigrams it contains; a query is represented by a boolean
5//! formula over trigrams that every matching file must satisfy. See `docs/index-and-storage.md`.
6
7use std::collections::BTreeSet;
8
9/// A single trigram. Three raw bytes; works for arbitrary (incl. non-UTF-8) content.
10pub type Trigram = [u8; 3];
11
12/// Pack a trigram into the low 24 bits of a `u32`, for compact keys.
13#[inline]
14pub fn pack(t: Trigram) -> u32 {
15    ((t[0] as u32) << 16) | ((t[1] as u32) << 8) | (t[2] as u32)
16}
17
18/// Invoke `f` once per (not-necessarily-distinct) trigram window over `bytes`.
19#[inline]
20pub fn for_each(bytes: &[u8], mut f: impl FnMut(Trigram)) {
21    for w in bytes.windows(3) {
22        f([w[0], w[1], w[2]]);
23    }
24}
25
26/// The set of distinct trigrams in `bytes`. A `BTreeSet` keeps results deterministic for tests;
27/// the indexer will use a faster set.
28pub fn distinct(bytes: &[u8]) -> BTreeSet<Trigram> {
29    let mut set = BTreeSet::new();
30    for_each(bytes, |t| {
31        set.insert(t);
32    });
33    set
34}
35
36/// The trigrams of a literal byte string, in order (with repeats). Empty if `lit.len() < 3`.
37pub fn of_literal(lit: &[u8]) -> Vec<Trigram> {
38    lit.windows(3).map(|w| [w[0], w[1], w[2]]).collect()
39}
40
41#[cfg(test)]
42mod tests {
43    use super::*;
44
45    #[test]
46    fn distinct_and_literal_agree() {
47        assert_eq!(of_literal(b"abcd"), vec![*b"abc", *b"bcd"]);
48        assert!(of_literal(b"ab").is_empty());
49        let d = distinct(b"abcabc");
50        // "abc","bca","cab","abc" -> distinct {abc,bca,cab}
51        assert_eq!(d.len(), 3);
52        assert!(d.contains(b"abc"));
53    }
54}