Skip to main content

sift_core/index/
trigram.rs

1//! Overlapping byte trigrams.
2
3use std::collections::HashSet;
4
5/// Extract overlapping 3-byte windows from `text`, handling invalid UTF-8 with lossy replacement.
6///
7/// Valid UTF-8 takes a fast path (no replacement string allocated). Invalid sequences fall back to
8/// [`String::from_utf8_lossy`] semantics, matching what [`extract_trigrams_utf8_lossy`] would do on the
9/// raw bytes — but in a single pass without an intermediate `String`.
10#[must_use]
11pub fn extract_trigrams(text: &str) -> Vec<[u8; 3]> {
12    extract_trigrams_from_bytes(text.as_bytes())
13}
14
15/// Core: sliding 3-byte windows over raw bytes.
16#[must_use]
17pub fn extract_trigrams_from_bytes(b: &[u8]) -> Vec<[u8; 3]> {
18    if b.len() < 3 {
19        return Vec::new();
20    }
21    let mut out = Vec::with_capacity(b.len() - 2);
22    for i in 0..=b.len() - 3 {
23        out.push([b[i], b[i + 1], b[i + 2]]);
24    }
25    out
26}
27
28#[must_use]
29pub fn extract_unique_trigrams_from_bytes(b: &[u8]) -> HashSet<[u8; 3]> {
30    let mut out = HashSet::new();
31    if b.len() < 3 {
32        return out;
33    }
34    out.reserve(b.len().min(1 << 16));
35    for i in 0..=b.len() - 3 {
36        out.insert([b[i], b[i + 1], b[i + 2]]);
37    }
38    out
39}
40
41/// Extract trigrams from raw bytes, falling back to lossy UTF-8 for invalid sequences.
42#[must_use]
43#[cfg(test)]
44pub fn extract_trigrams_utf8_lossy(bytes: &[u8]) -> Vec<[u8; 3]> {
45    std::str::from_utf8(bytes).map_or_else(
46        |_| extract_trigrams(String::from_utf8_lossy(bytes).as_ref()),
47        extract_trigrams,
48    )
49}
50
51#[must_use]
52pub fn extract_unique_trigrams_utf8_lossy(bytes: &[u8]) -> HashSet<[u8; 3]> {
53    std::str::from_utf8(bytes).map_or_else(
54        |_| extract_unique_trigrams_from_bytes(String::from_utf8_lossy(bytes).as_ref().as_bytes()),
55        |text| extract_unique_trigrams_from_bytes(text.as_bytes()),
56    )
57}
58
59#[cfg(test)]
60mod tests {
61    use super::*;
62
63    fn reference_lossy(bytes: &[u8]) -> Vec<[u8; 3]> {
64        extract_trigrams(String::from_utf8_lossy(bytes).as_ref())
65    }
66
67    #[test]
68    fn utf8_lossy_matches_reference_valid_ascii() {
69        let b = b"hello world";
70        assert_eq!(extract_trigrams_utf8_lossy(b), reference_lossy(b));
71    }
72
73    #[test]
74    fn utf8_lossy_matches_reference_multibyte() {
75        let b = "café résumé 日本語".as_bytes();
76        assert_eq!(extract_trigrams_utf8_lossy(b), reference_lossy(b));
77    }
78
79    #[test]
80    fn utf8_lossy_matches_reference_invalid() {
81        for b in [
82            &[0xff, 0xfe, 0xfd][..],
83            b"ok\xff\xfe trail",
84            &[0x80][..],
85            b"a\xe0\x80\x80b",
86        ] {
87            assert_eq!(
88                extract_trigrams_utf8_lossy(b),
89                reference_lossy(b),
90                "bytes={b:?}"
91            );
92        }
93    }
94
95    #[test]
96    fn utf8_lossy_matches_reference_mixed() {
97        let b: Vec<u8> = (0_u8..=255)
98            .cycle()
99            .take(512)
100            .chain(std::iter::once(0xff))
101            .collect();
102        assert_eq!(extract_trigrams_utf8_lossy(&b), reference_lossy(&b));
103    }
104
105    #[test]
106    fn short_string_empty() {
107        assert!(extract_trigrams("").is_empty());
108        assert!(extract_trigrams("ab").is_empty());
109    }
110
111    #[test]
112    fn ascii_three_chars_one_trigram() {
113        assert_eq!(extract_trigrams("abc"), vec![[b'a', b'b', b'c']]);
114    }
115
116    #[test]
117    fn overlapping_windows() {
118        assert_eq!(
119            extract_trigrams("abcd"),
120            vec![[b'a', b'b', b'c'], [b'b', b'c', b'd']]
121        );
122    }
123}