1pub type Trigram = u32;
9
10#[inline]
12#[must_use]
13pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
14 (u32::from(a) << 16) | (u32::from(b) << 8) | u32::from(c)
15}
16
17pub struct Extractor {
19 trigram_offsets: Vec<(Trigram, u32)>,
20}
21
22impl Extractor {
23 #[must_use]
25 pub fn new() -> Self {
26 Self {
27 trigram_offsets: Vec::with_capacity(4096),
28 }
29 }
30
31 #[must_use]
35 #[allow(clippy::indexing_slicing)] #[allow(clippy::as_conversions)] pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
38 const MAX_OFFSETS_PER_TRIGRAM: usize = 64;
39 const SMALL_FILE_LIMIT: usize = 1_000_000;
40
41 self.trigram_offsets.clear();
42
43 if data.len() < 3 {
44 return &self.trigram_offsets;
45 }
46
47 if data.len() <= SMALL_FILE_LIMIT {
48 for i in 0..=(data.len() - 3) {
49 if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
50 continue;
51 }
52 let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
53 self.trigram_offsets.push((tri, i as u32));
54 }
55 } else {
56 let mut seen_counts: std::collections::HashMap<Trigram, usize> =
57 std::collections::HashMap::new();
58 for i in 0..=(data.len() - 3) {
59 if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
60 continue;
61 }
62 let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
63 let count = seen_counts.entry(tri).or_insert(0);
64 if *count < MAX_OFFSETS_PER_TRIGRAM {
65 self.trigram_offsets.push((tri, i as u32));
66 *count += 1;
67 }
68 }
69 }
70
71 self.trigram_offsets.sort_unstable();
72
73 &self.trigram_offsets
74 }
75
76 #[must_use]
78 #[allow(clippy::indexing_slicing)] pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
80 if query.len() < 3 {
81 return vec![];
82 }
83 let mut set = Vec::with_capacity(query.len() - 2);
84 for i in 0..=(query.len() - 3) {
85 let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
86 set.push(tri);
87 }
88 set.sort_unstable();
89 set.dedup();
90 set
91 }
92
93 #[must_use]
99 #[allow(clippy::indexing_slicing)] pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
101 if query.len() < 3 {
102 return vec![];
103 }
104 let mut groups = Vec::with_capacity(query.len() - 2);
105 for i in 0..=(query.len() - 3) {
106 let bytes = [query[i], query[i + 1], query[i + 2]];
107 let mut variants = case_variants(bytes);
108 variants.sort_unstable();
109 variants.dedup();
110 groups.push(variants);
111 }
112 groups
113 }
114}
115
116fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
119 let alts: [Vec<u8>; 3] = [
120 byte_cases(bytes[0]),
121 byte_cases(bytes[1]),
122 byte_cases(bytes[2]),
123 ];
124 let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
125 for &a in &alts[0] {
126 for &b in &alts[1] {
127 for &c in &alts[2] {
128 out.push(from_bytes(a, b, c));
129 }
130 }
131 }
132 out
133}
134
135#[inline]
137fn byte_cases(b: u8) -> Vec<u8> {
138 if b.is_ascii_uppercase() {
139 vec![b, b.to_ascii_lowercase()]
140 } else if b.is_ascii_lowercase() {
141 vec![b, b.to_ascii_uppercase()]
142 } else {
143 vec![b]
144 }
145}
146
147impl Default for Extractor {
148 fn default() -> Self {
149 Self::new()
150 }
151}
152
153#[cfg(test)]
154#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
155mod tests {
156 use super::*;
157
158 #[test]
159 fn empty() {
160 let mut e = Extractor::new();
161 assert!(e.extract_with_offsets(b"").is_empty());
162 assert!(e.extract_with_offsets(b"ab").is_empty());
163 }
164
165 #[test]
166 fn single_trigram() {
167 let mut e = Extractor::new();
168 let result = e.extract_with_offsets(b"abc");
169 assert_eq!(result.len(), 1);
170 assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
171 }
172
173 #[test]
174 fn deduplication() {
175 let mut e = Extractor::new();
176 let result = e.extract_with_offsets(b"abcabc");
177 let abc = from_bytes(b'a', b'b', b'c');
178 let bca = from_bytes(b'b', b'c', b'a');
179 let cab = from_bytes(b'c', b'a', b'b');
180
181 assert_eq!(result.len(), 4);
182 assert_eq!(result[0], (abc, 0));
183 assert_eq!(result[1], (abc, 3));
184 assert_eq!(result[2], (bca, 1));
185 assert_eq!(result[3], (cab, 2));
186 }
187
188 #[test]
189 fn null_bytes_skipped() {
190 let mut e = Extractor::new();
191 let result = e.extract_with_offsets(b"a\x00b");
192 assert!(result.is_empty());
193 }
194
195 #[test]
196 fn high_bytes_work() {
197 let mut e = Extractor::new();
198 let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
199 let tri = from_bytes(0xFF, 0xFE, 0xFD);
200 assert_eq!(result[0], (tri, 0));
201 }
202
203 #[test]
204 fn utf8_cafe() {
205 let data = "caf\u{00e9}".as_bytes();
206 let set = Extractor::extract_set(data);
207 assert!(!set.is_empty());
208 assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
209 }
210
211 #[test]
212 fn query_set_deduped() {
213 let set = Extractor::extract_set(b"errorerror");
214 let unique_count = set.len();
215 assert!(unique_count <= 8);
216 }
217}