sift_core/index/
trigram.rs1use std::collections::HashSet;
4
5#[must_use]
11pub fn extract_trigrams(text: &str) -> Vec<[u8; 3]> {
12 extract_trigrams_from_bytes(text.as_bytes())
13}
14
15#[must_use]
17pub fn extract_trigrams_from_bytes(b: &[u8]) -> Vec<[u8; 3]> {
18 if b.len() < 3 {
19 return Vec::new();
20 }
21 let mut out = Vec::with_capacity(b.len() - 2);
22 for i in 0..=b.len() - 3 {
23 out.push([b[i], b[i + 1], b[i + 2]]);
24 }
25 out
26}
27
28#[must_use]
29pub fn extract_unique_trigrams_from_bytes(b: &[u8]) -> HashSet<[u8; 3]> {
30 let mut out = HashSet::new();
31 if b.len() < 3 {
32 return out;
33 }
34 out.reserve(b.len().min(1 << 16));
35 for i in 0..=b.len() - 3 {
36 out.insert([b[i], b[i + 1], b[i + 2]]);
37 }
38 out
39}
40
41#[must_use]
43#[cfg(test)]
44pub fn extract_trigrams_utf8_lossy(bytes: &[u8]) -> Vec<[u8; 3]> {
45 std::str::from_utf8(bytes).map_or_else(
46 |_| extract_trigrams(String::from_utf8_lossy(bytes).as_ref()),
47 extract_trigrams,
48 )
49}
50
51#[must_use]
52pub fn extract_unique_trigrams_utf8_lossy(bytes: &[u8]) -> HashSet<[u8; 3]> {
53 std::str::from_utf8(bytes).map_or_else(
54 |_| extract_unique_trigrams_from_bytes(String::from_utf8_lossy(bytes).as_ref().as_bytes()),
55 |text| extract_unique_trigrams_from_bytes(text.as_bytes()),
56 )
57}
58
59#[cfg(test)]
60mod tests {
61 use super::*;
62
63 fn reference_lossy(bytes: &[u8]) -> Vec<[u8; 3]> {
64 extract_trigrams(String::from_utf8_lossy(bytes).as_ref())
65 }
66
67 #[test]
68 fn utf8_lossy_matches_reference_valid_ascii() {
69 let b = b"hello world";
70 assert_eq!(extract_trigrams_utf8_lossy(b), reference_lossy(b));
71 }
72
73 #[test]
74 fn utf8_lossy_matches_reference_multibyte() {
75 let b = "café résumé 日本語".as_bytes();
76 assert_eq!(extract_trigrams_utf8_lossy(b), reference_lossy(b));
77 }
78
79 #[test]
80 fn utf8_lossy_matches_reference_invalid() {
81 for b in [
82 &[0xff, 0xfe, 0xfd][..],
83 b"ok\xff\xfe trail",
84 &[0x80][..],
85 b"a\xe0\x80\x80b",
86 ] {
87 assert_eq!(
88 extract_trigrams_utf8_lossy(b),
89 reference_lossy(b),
90 "bytes={b:?}"
91 );
92 }
93 }
94
95 #[test]
96 fn utf8_lossy_matches_reference_mixed() {
97 let b: Vec<u8> = (0_u8..=255)
98 .cycle()
99 .take(512)
100 .chain(std::iter::once(0xff))
101 .collect();
102 assert_eq!(extract_trigrams_utf8_lossy(&b), reference_lossy(&b));
103 }
104
105 #[test]
106 fn short_string_empty() {
107 assert!(extract_trigrams("").is_empty());
108 assert!(extract_trigrams("ab").is_empty());
109 }
110
111 #[test]
112 fn ascii_three_chars_one_trigram() {
113 assert_eq!(extract_trigrams("abc"), vec![[b'a', b'b', b'c']]);
114 }
115
116 #[test]
117 fn overlapping_windows() {
118 assert_eq!(
119 extract_trigrams("abcd"),
120 vec![[b'a', b'b', b'c'], [b'b', b'c', b'd']]
121 );
122 }
123}