1pub type Trigram = u32;
9
10#[inline]
12pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
13 ((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
14}
15
16pub struct Extractor {
18 trigram_offsets: Vec<(Trigram, u32)>,
19}
20
21impl Extractor {
22 pub fn new() -> Self {
23 Self {
24 trigram_offsets: Vec::with_capacity(4096),
25 }
26 }
27
28 pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
31 self.trigram_offsets.clear();
32
33 if data.len() < 3 {
34 return &self.trigram_offsets;
35 }
36
37 for i in 0..=(data.len() - 3) {
39 if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
41 continue;
42 }
43
44 let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
45 self.trigram_offsets.push((tri, i as u32));
46
47 if self.trigram_offsets.len() >= 1_000_000 {
49 break;
50 }
51 }
52
53 self.trigram_offsets.sort_unstable();
55
56 &self.trigram_offsets
57 }
58
59 pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
61 if query.len() < 3 {
62 return vec![];
63 }
64 let mut set = Vec::with_capacity(query.len() - 2);
65 for i in 0..=(query.len() - 3) {
66 let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
67 set.push(tri);
68 }
69 set.sort_unstable();
70 set.dedup();
71 set
72 }
73
74 pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
79 if query.len() < 3 {
80 return vec![];
81 }
82 let mut groups = Vec::with_capacity(query.len() - 2);
83 for i in 0..=(query.len() - 3) {
84 let bytes = [query[i], query[i + 1], query[i + 2]];
85 let mut variants = case_variants(bytes);
86 variants.sort_unstable();
87 variants.dedup();
88 groups.push(variants);
89 }
90 groups
91 }
92}
93
94fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
97 let alts: [Vec<u8>; 3] = [
98 byte_cases(bytes[0]),
99 byte_cases(bytes[1]),
100 byte_cases(bytes[2]),
101 ];
102 let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
103 for &a in &alts[0] {
104 for &b in &alts[1] {
105 for &c in &alts[2] {
106 out.push(from_bytes(a, b, c));
107 }
108 }
109 }
110 out
111}
112
113#[inline]
115fn byte_cases(b: u8) -> Vec<u8> {
116 if b.is_ascii_uppercase() {
117 vec![b, b.to_ascii_lowercase()]
118 } else if b.is_ascii_lowercase() {
119 vec![b, b.to_ascii_uppercase()]
120 } else {
121 vec![b]
122 }
123}
124
125impl Default for Extractor {
126 fn default() -> Self {
127 Self::new()
128 }
129}
130
131#[cfg(test)]
132mod tests {
133 use super::*;
134
135 #[test]
136 fn empty() {
137 let mut e = Extractor::new();
138 assert!(e.extract_with_offsets(b"").is_empty());
139 assert!(e.extract_with_offsets(b"ab").is_empty());
140 }
141
142 #[test]
143 fn single_trigram() {
144 let mut e = Extractor::new();
145 let result = e.extract_with_offsets(b"abc");
146 assert_eq!(result.len(), 1);
147 assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
148 }
149
150 #[test]
151 fn deduplication() {
152 let mut e = Extractor::new();
153 let result = e.extract_with_offsets(b"abcabc");
154 let abc = from_bytes(b'a', b'b', b'c');
157 let bca = from_bytes(b'b', b'c', b'a');
158 let cab = from_bytes(b'c', b'a', b'b');
159
160 assert_eq!(result.len(), 4);
161 assert_eq!(result[0], (abc, 0));
162 assert_eq!(result[1], (abc, 3));
163 assert_eq!(result[2], (bca, 1));
164 assert_eq!(result[3], (cab, 2));
165 }
166
167 #[test]
168 fn null_bytes_skipped() {
169 let mut e = Extractor::new();
170 let result = e.extract_with_offsets(b"a\x00b");
171 assert!(result.is_empty());
172 }
173
174 #[test]
175 fn high_bytes_work() {
176 let mut e = Extractor::new();
177 let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
178 let tri = from_bytes(0xFF, 0xFE, 0xFD);
179 assert_eq!(result[0], (tri, 0));
180 }
181
182 #[test]
183 fn utf8_cafe() {
184 let data = "caf\u{00e9}".as_bytes();
186 let set = Extractor::extract_set(data);
187 assert!(!set.is_empty());
188 assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
190 }
191
192 #[test]
193 fn query_set_deduped() {
194 let set = Extractor::extract_set(b"errorerror");
195 let unique_count = set.len();
197 assert!(unique_count <= 8); }
199}