1use std::collections::HashMap;
7
8pub type Trigram = u32;
11
12#[inline]
14pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
15 ((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
16}
17
18pub struct Extractor {
20 trigram_offsets: HashMap<Trigram, Vec<u32>>,
21}
22
23impl Extractor {
24 pub fn new() -> Self {
25 Self {
26 trigram_offsets: HashMap::with_capacity(4096),
27 }
28 }
29
30 pub fn extract_with_offsets(&mut self, data: &[u8]) -> &HashMap<Trigram, Vec<u32>> {
32 self.trigram_offsets.clear();
33
34 if data.len() < 3 {
35 return &self.trigram_offsets;
36 }
37
38 for i in 0..=(data.len() - 3) {
39 if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
41 continue;
42 }
43
44 let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
45 self.trigram_offsets.entry(tri).or_default().push(i as u32);
46 }
47
48 &self.trigram_offsets
49 }
50
51 pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
53 if query.len() < 3 {
54 return vec![];
55 }
56 let mut set = Vec::with_capacity(query.len() - 2);
57 for i in 0..=(query.len() - 3) {
58 let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
59 set.push(tri);
60 }
61 set.sort_unstable();
62 set.dedup();
63 set
64 }
65
66 pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
71 if query.len() < 3 {
72 return vec![];
73 }
74 let mut groups = Vec::with_capacity(query.len() - 2);
75 for i in 0..=(query.len() - 3) {
76 let bytes = [query[i], query[i + 1], query[i + 2]];
77 let mut variants = case_variants(bytes);
78 variants.sort_unstable();
79 variants.dedup();
80 groups.push(variants);
81 }
82 groups
83 }
84}
85
86fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
89 let alts: [Vec<u8>; 3] = [
90 byte_cases(bytes[0]),
91 byte_cases(bytes[1]),
92 byte_cases(bytes[2]),
93 ];
94 let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
95 for &a in &alts[0] {
96 for &b in &alts[1] {
97 for &c in &alts[2] {
98 out.push(from_bytes(a, b, c));
99 }
100 }
101 }
102 out
103}
104
105#[inline]
107fn byte_cases(b: u8) -> Vec<u8> {
108 if b.is_ascii_uppercase() {
109 vec![b, b.to_ascii_lowercase()]
110 } else if b.is_ascii_lowercase() {
111 vec![b, b.to_ascii_uppercase()]
112 } else {
113 vec![b]
114 }
115}
116
117impl Default for Extractor {
118 fn default() -> Self {
119 Self::new()
120 }
121}
122
123#[cfg(test)]
124mod tests {
125 use super::*;
126
127 #[test]
128 fn empty() {
129 let mut e = Extractor::new();
130 assert!(e.extract_with_offsets(b"").is_empty());
131 assert!(e.extract_with_offsets(b"ab").is_empty());
132 }
133
134 #[test]
135 fn single_trigram() {
136 let mut e = Extractor::new();
137 let result = e.extract_with_offsets(b"abc");
138 assert_eq!(result.len(), 1);
139 assert!(result.contains_key(&from_bytes(b'a', b'b', b'c')));
140 }
141
142 #[test]
143 fn deduplication() {
144 let mut e = Extractor::new();
145 let result = e.extract_with_offsets(b"abcabc");
146 let abc = from_bytes(b'a', b'b', b'c');
148 assert_eq!(result[&abc].len(), 2);
149 assert_eq!(result[&abc], vec![0, 3]);
150 }
151
152 #[test]
153 fn null_bytes_skipped() {
154 let mut e = Extractor::new();
155 let result = e.extract_with_offsets(b"a\x00b");
156 assert!(result.is_empty());
157 }
158
159 #[test]
160 fn high_bytes_work() {
161 let mut e = Extractor::new();
162 let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
163 let tri = from_bytes(0xFF, 0xFE, 0xFD);
164 assert!(result.contains_key(&tri));
165 }
166
167 #[test]
168 fn utf8_cafe() {
169 let data = "caf\u{00e9}".as_bytes();
171 let set = Extractor::extract_set(data);
172 assert!(!set.is_empty());
173 assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
175 }
176
177 #[test]
178 fn query_set_deduped() {
179 let set = Extractor::extract_set(b"errorerror");
180 let unique_count = set.len();
182 assert!(unique_count <= 8); }
184}