1pub type Trigram = u32;
9
10#[inline]
12pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
13 ((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
14}
15
16pub struct Extractor {
18 trigram_offsets: Vec<(Trigram, u32)>,
19}
20
21impl Extractor {
22 pub fn new() -> Self {
23 Self {
24 trigram_offsets: Vec::with_capacity(4096),
25 }
26 }
27
28 pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
31 self.trigram_offsets.clear();
32
33 if data.len() < 3 {
34 return &self.trigram_offsets;
35 }
36
37 for i in 0..=(data.len() - 3) {
39 if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
41 continue;
42 }
43
44 let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
45 self.trigram_offsets.push((tri, i as u32));
46 }
47
48 self.trigram_offsets.sort_unstable();
50
51 &self.trigram_offsets
52 }
53
54 pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
56 if query.len() < 3 {
57 return vec![];
58 }
59 let mut set = Vec::with_capacity(query.len() - 2);
60 for i in 0..=(query.len() - 3) {
61 let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
62 set.push(tri);
63 }
64 set.sort_unstable();
65 set.dedup();
66 set
67 }
68
69 pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
74 if query.len() < 3 {
75 return vec![];
76 }
77 let mut groups = Vec::with_capacity(query.len() - 2);
78 for i in 0..=(query.len() - 3) {
79 let bytes = [query[i], query[i + 1], query[i + 2]];
80 let mut variants = case_variants(bytes);
81 variants.sort_unstable();
82 variants.dedup();
83 groups.push(variants);
84 }
85 groups
86 }
87}
88
89fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
92 let alts: [Vec<u8>; 3] = [
93 byte_cases(bytes[0]),
94 byte_cases(bytes[1]),
95 byte_cases(bytes[2]),
96 ];
97 let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
98 for &a in &alts[0] {
99 for &b in &alts[1] {
100 for &c in &alts[2] {
101 out.push(from_bytes(a, b, c));
102 }
103 }
104 }
105 out
106}
107
108#[inline]
110fn byte_cases(b: u8) -> Vec<u8> {
111 if b.is_ascii_uppercase() {
112 vec![b, b.to_ascii_lowercase()]
113 } else if b.is_ascii_lowercase() {
114 vec![b, b.to_ascii_uppercase()]
115 } else {
116 vec![b]
117 }
118}
119
120impl Default for Extractor {
121 fn default() -> Self {
122 Self::new()
123 }
124}
125
126#[cfg(test)]
127mod tests {
128 use super::*;
129
130 #[test]
131 fn empty() {
132 let mut e = Extractor::new();
133 assert!(e.extract_with_offsets(b"").is_empty());
134 assert!(e.extract_with_offsets(b"ab").is_empty());
135 }
136
137 #[test]
138 fn single_trigram() {
139 let mut e = Extractor::new();
140 let result = e.extract_with_offsets(b"abc");
141 assert_eq!(result.len(), 1);
142 assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
143 }
144
145 #[test]
146 fn deduplication() {
147 let mut e = Extractor::new();
148 let result = e.extract_with_offsets(b"abcabc");
149 let abc = from_bytes(b'a', b'b', b'c');
152 let bca = from_bytes(b'b', b'c', b'a');
153 let cab = from_bytes(b'c', b'a', b'b');
154
155 assert_eq!(result.len(), 4);
156 assert_eq!(result[0], (abc, 0));
157 assert_eq!(result[1], (abc, 3));
158 assert_eq!(result[2], (bca, 1));
159 assert_eq!(result[3], (cab, 2));
160 }
161
162 #[test]
163 fn null_bytes_skipped() {
164 let mut e = Extractor::new();
165 let result = e.extract_with_offsets(b"a\x00b");
166 assert!(result.is_empty());
167 }
168
169 #[test]
170 fn high_bytes_work() {
171 let mut e = Extractor::new();
172 let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
173 let tri = from_bytes(0xFF, 0xFE, 0xFD);
174 assert_eq!(result[0], (tri, 0));
175 }
176
177 #[test]
178 fn utf8_cafe() {
179 let data = "caf\u{00e9}".as_bytes();
181 let set = Extractor::extract_set(data);
182 assert!(!set.is_empty());
183 assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
185 }
186
187 #[test]
188 fn query_set_deduped() {
189 let set = Extractor::extract_set(b"errorerror");
190 let unique_count = set.len();
192 assert!(unique_count <= 8); }
194}