1pub type Trigram = u32;
9
10#[inline]
12#[must_use]
13pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
14 (u32::from(a) << 16) | (u32::from(b) << 8) | u32::from(c)
15}
16
17pub struct Extractor {
19 trigram_offsets: Vec<(Trigram, u32)>,
20}
21
22impl Extractor {
23 #[must_use]
25 pub fn new() -> Self {
26 Self {
27 trigram_offsets: Vec::with_capacity(4096),
28 }
29 }
30
31 #[must_use]
35 #[allow(clippy::indexing_slicing)] #[allow(clippy::as_conversions)] pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
38 self.trigram_offsets.clear();
39
40 if data.len() < 3 {
41 return &self.trigram_offsets;
42 }
43
44 for i in 0..=(data.len() - 3) {
45 if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
46 continue;
47 }
48
49 let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
50 self.trigram_offsets.push((tri, i as u32));
51
52 if self.trigram_offsets.len() >= 1_000_000 {
53 break;
54 }
55 }
56
57 self.trigram_offsets.sort_unstable();
58
59 &self.trigram_offsets
60 }
61
62 #[must_use]
64 #[allow(clippy::indexing_slicing)] pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
66 if query.len() < 3 {
67 return vec![];
68 }
69 let mut set = Vec::with_capacity(query.len() - 2);
70 for i in 0..=(query.len() - 3) {
71 let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
72 set.push(tri);
73 }
74 set.sort_unstable();
75 set.dedup();
76 set
77 }
78
79 #[must_use]
85 #[allow(clippy::indexing_slicing)] pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
87 if query.len() < 3 {
88 return vec![];
89 }
90 let mut groups = Vec::with_capacity(query.len() - 2);
91 for i in 0..=(query.len() - 3) {
92 let bytes = [query[i], query[i + 1], query[i + 2]];
93 let mut variants = case_variants(bytes);
94 variants.sort_unstable();
95 variants.dedup();
96 groups.push(variants);
97 }
98 groups
99 }
100}
101
102fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
105 let alts: [Vec<u8>; 3] = [
106 byte_cases(bytes[0]),
107 byte_cases(bytes[1]),
108 byte_cases(bytes[2]),
109 ];
110 let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
111 for &a in &alts[0] {
112 for &b in &alts[1] {
113 for &c in &alts[2] {
114 out.push(from_bytes(a, b, c));
115 }
116 }
117 }
118 out
119}
120
121#[inline]
123fn byte_cases(b: u8) -> Vec<u8> {
124 if b.is_ascii_uppercase() {
125 vec![b, b.to_ascii_lowercase()]
126 } else if b.is_ascii_lowercase() {
127 vec![b, b.to_ascii_uppercase()]
128 } else {
129 vec![b]
130 }
131}
132
133impl Default for Extractor {
134 fn default() -> Self {
135 Self::new()
136 }
137}
138
139#[cfg(test)]
140#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
141mod tests {
142 use super::*;
143
144 #[test]
145 fn empty() {
146 let mut e = Extractor::new();
147 assert!(e.extract_with_offsets(b"").is_empty());
148 assert!(e.extract_with_offsets(b"ab").is_empty());
149 }
150
151 #[test]
152 fn single_trigram() {
153 let mut e = Extractor::new();
154 let result = e.extract_with_offsets(b"abc");
155 assert_eq!(result.len(), 1);
156 assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
157 }
158
159 #[test]
160 fn deduplication() {
161 let mut e = Extractor::new();
162 let result = e.extract_with_offsets(b"abcabc");
163 let abc = from_bytes(b'a', b'b', b'c');
164 let bca = from_bytes(b'b', b'c', b'a');
165 let cab = from_bytes(b'c', b'a', b'b');
166
167 assert_eq!(result.len(), 4);
168 assert_eq!(result[0], (abc, 0));
169 assert_eq!(result[1], (abc, 3));
170 assert_eq!(result[2], (bca, 1));
171 assert_eq!(result[3], (cab, 2));
172 }
173
174 #[test]
175 fn null_bytes_skipped() {
176 let mut e = Extractor::new();
177 let result = e.extract_with_offsets(b"a\x00b");
178 assert!(result.is_empty());
179 }
180
181 #[test]
182 fn high_bytes_work() {
183 let mut e = Extractor::new();
184 let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
185 let tri = from_bytes(0xFF, 0xFE, 0xFD);
186 assert_eq!(result[0], (tri, 0));
187 }
188
189 #[test]
190 fn utf8_cafe() {
191 let data = "caf\u{00e9}".as_bytes();
192 let set = Extractor::extract_set(data);
193 assert!(!set.is_empty());
194 assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
195 }
196
197 #[test]
198 fn query_set_deduped() {
199 let set = Extractor::extract_set(b"errorerror");
200 let unique_count = set.len();
201 assert!(unique_count <= 8);
202 }
203}