boreal/scanner/ac_scan.rs
1//! Provides the [`AcScan`] object, used to scan for all variables in a single AC pass.
2use std::collections::hash_map::Entry;
3use std::collections::{HashMap, HashSet};
4
5use aho_corasick::{AhoCorasick, AhoCorasickBuilder, AhoCorasickKind};
6
7use super::{
8 CallbackEvents, ScanCallbackResult, ScanData, ScanError, ScanEvent, StringIdentifier,
9 StringMatch,
10};
11use crate::atoms::pick_atom_in_literal;
12use crate::compiler::variable::Variable;
13use crate::compiler::CompilerProfile;
14use crate::matcher::{AcMatchStatus, Matcher};
15use crate::memory::Region;
16
17/// Factorize atoms from all variables, to scan for them in a single pass.
18///
19/// For every variable, literals named atoms are extracted from the variables expressions. A single
20/// Aho-Corasick object is built from all those literals, and a single pass on the scanned bytes
21/// is done with this object. For every match on a literal, the match is then verified to see if
22/// it matches the whole variable expression.
23///
24/// An exception to this is for variables that we either:
25/// - cannot manage to extract atoms from
26/// - need to or prefer scanning on their own
27///
28/// For those variables, the AC pass does not provide any result, and the variable will be scanned
29/// on its own during evaluation of the rules.
30#[derive(Debug)]
31pub(crate) struct AcScan {
32 /// Aho Corasick for variables that are literals.
33 aho: AhoCorasick,
34
35 /// Map from a aho pattern index to a list details on the literals.
36 aho_index_to_literal_info: Vec<Vec<LiteralInfo>>,
37
38 /// List of indexes for vars that are not part of the aho corasick.
39 non_handled_var_indexes: Vec<usize>,
40}
41
42/// Details on a literal of a variable.
43#[derive(Debug)]
44struct LiteralInfo {
45 /// Index of the variable in the variable array.
46 variable_index: usize,
47
48 /// Index of the literal for the variable.
49 literal_index: usize,
50
51 /// Left and right offset for the slice picked in the Aho-Corasick.
52 slice_offset: (usize, usize),
53}
54
55impl AcScan {
56 pub(crate) fn new(variables: &[Variable], profile: CompilerProfile) -> Self {
57 let mut lits = Vec::new();
58 let mut known_lits = HashMap::new();
59 let mut aho_index_to_literal_info = Vec::new();
60 let mut non_handled_var_indexes = Vec::new();
61
62 for (variable_index, var) in variables.iter().enumerate() {
63 if var.matcher.literals.is_empty() {
64 non_handled_var_indexes.push(variable_index);
65 } else {
66 let mut known_literals_of_var = HashSet::new();
67
68 for (literal_index, lit) in var.matcher.literals.iter().enumerate() {
69 let (start, end) = pick_atom_in_literal(lit);
70 let mut atom = lit[start..(lit.len() - end)].to_vec();
71 let literal_info = LiteralInfo {
72 variable_index,
73 literal_index,
74 slice_offset: (start, end),
75 };
76
77 // Sometimes, two literals of the same variable can provide the same atom.
78 // This can happen if the two literals are identical (for example, someone
79 // like me writing a test on `/(abc|abc)/`), or if the literals are
80 // different but contain the same atom (for example,
81 // `{ ( 00 AB CD | AB CD 00 ) }`).
82 //
83 // In those cases, we must *not* use the same atom twice in the Aho-Corasick,
84 // as this would result in two identical matches for the same variable.
85 // To prevent this, a set is used here. Both the atom itself and its position
86 // in the literal are important.
87 if !known_literals_of_var.insert((atom.clone(), start)) {
88 continue;
89 }
90
91 // Ensure the literals provided to the aho corasick are not
92 // duplicated. If multiple variables uses the same atoms,
93 // we will iterate on every variable in this module, instead
94 // of going back into the aho-corasick just for it to
95 // iterate over the matching ids and return immediately
96 // to this code. This improves performances significantly.
97 //
98 // In addition, since the aho-corasick is case insensitive,
99 // normalize before de-duplicating.
100 atom.make_ascii_lowercase();
101
102 match known_lits.entry(atom.clone()) {
103 Entry::Vacant(v) => {
104 let _r = v.insert(lits.len());
105 aho_index_to_literal_info.push(vec![literal_info]);
106 lits.push(atom);
107 }
108 Entry::Occupied(o) => {
109 let index = o.get();
110 aho_index_to_literal_info[*index].push(literal_info);
111 }
112 }
113 }
114 }
115 }
116
117 // TODO: Should this AC be case insensitive or not? Redo some benches once other
118 // optimizations are done.
119
120 let mut builder = AhoCorasickBuilder::new();
121 let builder = builder.ascii_case_insensitive(true);
122 let builder = builder.kind(Some(match profile {
123 CompilerProfile::Speed => AhoCorasickKind::DFA,
124 CompilerProfile::Memory => AhoCorasickKind::ContiguousNFA,
125 }));
126
127 // First try with a smaller size to reduce memory use and improve performances, otherwise
128 // use the default version.
129 let aho = builder.build(&lits).unwrap();
130
131 Self {
132 aho,
133 aho_index_to_literal_info,
134 non_handled_var_indexes,
135 }
136 }
137
138 pub(super) fn scan_region<'scanner>(
139 &self,
140 region: &Region,
141 scanner: &'scanner super::Inner,
142 scan_data: &mut ScanData<'scanner, '_>,
143 matches: &mut [Vec<StringMatch>],
144 ) -> Result<(), ScanError> {
145 #[cfg(feature = "profiling")]
146 if let Some(stats) = scan_data.statistics.as_mut() {
147 stats.nb_memory_chunks += 1;
148 stats.memory_scanned_size += region.mem.len();
149 }
150
151 // Iterate over aho-corasick matches, validating those matches
152 for mat in self.aho.find_overlapping_iter(region.mem) {
153 if scan_data.check_timeout() {
154 return Err(ScanError::Timeout);
155 }
156 self.handle_possible_match(region, scanner, &mat, scan_data, matches)?;
157 }
158
159 if !self.non_handled_var_indexes.is_empty() {
160 #[cfg(feature = "profiling")]
161 let start = std::time::Instant::now();
162
163 // For every "raw" variable, scan the memory for this variable.
164 for variable_index in &self.non_handled_var_indexes {
165 let var = &scanner.variables[*variable_index].matcher;
166
167 scan_single_variable(region, var, scan_data, &mut matches[*variable_index]);
168 }
169
170 #[cfg(feature = "profiling")]
171 if let Some(stats) = scan_data.statistics.as_mut() {
172 stats.raw_regexes_eval_duration += start.elapsed();
173 }
174 }
175
176 Ok(())
177 }
178
179 fn handle_possible_match<'scanner>(
180 &self,
181 region: &Region,
182 scanner: &'scanner super::Inner,
183 mat: &aho_corasick::Match,
184 scan_data: &mut ScanData<'scanner, '_>,
185 matches: &mut [Vec<StringMatch>],
186 ) -> Result<(), ScanError> {
187 for literal_info in &self.aho_index_to_literal_info[mat.pattern()] {
188 let LiteralInfo {
189 variable_index,
190 literal_index,
191 slice_offset: (start_offset, end_offset),
192 } = *literal_info;
193 let var = &scanner.variables[variable_index].matcher;
194
195 #[cfg(feature = "profiling")]
196 if let Some(stats) = scan_data.statistics.as_mut() {
197 stats.nb_ac_matches += 1;
198 }
199 #[cfg(feature = "profiling")]
200 let start_instant = std::time::Instant::now();
201
202 // Upscale to the original literal shape before feeding it to the matcher verification
203 // function.
204 let Some(start) = mat.start().checked_sub(start_offset) else {
205 continue;
206 };
207 let end = match mat.end().checked_add(end_offset) {
208 Some(v) if v <= region.mem.len() => v,
209 _ => continue,
210 };
211 let m = start..end;
212
213 // Verify the literal is valid.
214 let Some(match_type) = var.confirm_ac_literal(region.mem, &m, literal_index) else {
215 continue;
216 };
217
218 let var_matches = &mut matches[variable_index];
219
220 // Shorten the mem to prevent new matches on the same starting byte.
221 // For example, for `a.*?bb`, and input `abbb`, this can happen:
222 // - extract atom `bb`
223 // - get AC match on `a(bb)b`: call check_ac_match, this will return the
224 // match `(abb)b`.
225 // - get AC match on `ab(bb)`: call check_ac_match, this will return the
226 // match `(abbb)`.
227 // This is invalid, only one match per starting byte can happen.
228 // To avoid this, ensure the mem given to check_ac_match starts one byte after the last
229 // saved match.
230 //
231 // This must only be done if the match is in the same region, otherwise the offset
232 // of the previous match makes no sense for this match, and will falsify results.
233 let start_position = match var_matches.last() {
234 Some(mat) if mat.base == region.start => mat.offset + 1,
235 _ => 0,
236 };
237
238 let res = var.process_ac_match(region.mem, m, start_position, match_type);
239
240 #[cfg(feature = "profiling")]
241 {
242 if let Some(stats) = scan_data.statistics.as_mut() {
243 stats.ac_confirm_duration += start_instant.elapsed();
244 }
245 }
246
247 match res {
248 AcMatchStatus::None => (),
249 AcMatchStatus::Multiple(v) if v.is_empty() => (),
250 AcMatchStatus::Multiple(found_matches) => {
251 var_matches.extend(found_matches.into_iter().map(|m| {
252 StringMatch::new(region, m, scan_data.params.match_max_length, 0)
253 }));
254 }
255 AcMatchStatus::Single(m) => {
256 let xor_key = var.get_xor_key(literal_index);
257 var_matches.push(StringMatch::new(
258 region,
259 m,
260 scan_data.params.match_max_length,
261 xor_key,
262 ));
263 }
264 }
265
266 if var_matches.len() > (scan_data.params.string_max_nb_matches as usize) {
267 var_matches.truncate(scan_data.params.string_max_nb_matches as usize);
268 if (scan_data.params.callback_events & CallbackEvents::STRING_REACHED_MATCH_LIMIT).0
269 != 0
270 && scan_data.string_reached_match_limit.insert(variable_index)
271 {
272 if let Some(cb) = &mut scan_data.callback {
273 if let Some(string_identifier) =
274 build_string_identifier(scanner, variable_index)
275 {
276 match (cb)(ScanEvent::StringReachedMatchLimit(string_identifier)) {
277 ScanCallbackResult::Continue => (),
278 ScanCallbackResult::Abort => return Err(ScanError::CallbackAbort),
279 }
280 }
281 }
282 }
283 }
284 }
285
286 Ok(())
287 }
288}
289
290fn scan_single_variable(
291 region: &Region,
292 matcher: &Matcher,
293 scan_data: &mut ScanData,
294 string_matches: &mut Vec<StringMatch>,
295) {
296 let mut offset = 0;
297 while offset < region.mem.len() {
298 let mat = matcher.find_next_match_at(region.mem, offset);
299
300 match mat {
301 None => break,
302 Some(mat) => {
303 offset = mat.start + 1;
304 string_matches.push(StringMatch::new(
305 region,
306 mat,
307 scan_data.params.match_max_length,
308 // No xor key, since this function is only used for regex variables
309 0,
310 ));
311
312 // This is safe to allow because this is called on every iterator of self.matches, so
313 // it cannot overflow u32 before this condition is true.
314 #[allow(clippy::cast_possible_truncation)]
315 if (string_matches.len() as u32) >= scan_data.params.string_max_nb_matches {
316 break;
317 }
318 }
319 }
320 }
321}
322
323fn build_string_identifier(
324 scanner: &super::Inner,
325 variable_index: usize,
326) -> Option<StringIdentifier<'_>> {
327 let mut index = 0;
328 // Go through all the rules of the scanner to find the right one.
329 // This is O(n) on the rules, which isn't ideal. But this is only done
330 // iff:
331 // - the callback API is used
332 // - the "string reaches match limit" event is enabled
333 // - a string reaches the match limit
334 // This thus should not be called frequently, and a O(n) search through
335 // the rules should not take that long.
336 //
337 // A solution to improve this would be to store in each rule the index
338 // of its first variable, which would make a binary search through
339 // the rules possible. However, this means an additional word to store
340 // with each rule, only to alleviate this very specific event. For
341 // the moment, this is not considered to be worth the cost.
342 for rule in scanner.global_rules.iter().chain(scanner.rules.iter()) {
343 if index + rule.nb_variables > variable_index {
344 return Some(StringIdentifier {
345 rule_namespace: scanner.namespaces[rule.namespace_index].as_ref(),
346 rule_name: &rule.name,
347 string_name: &scanner.variables[variable_index].name,
348 string_index: variable_index - index,
349 });
350 }
351 index += rule.nb_variables;
352 }
353 // Should technically be impossible to reach.
354 debug_assert!(false);
355 None
356}
357
358#[cfg(test)]
359mod tests {
360 use super::*;
361 use crate::test_helpers::test_type_traits_non_clonable;
362
363 #[test]
364 fn test_types_traits() {
365 test_type_traits_non_clonable(AcScan::new(&[], CompilerProfile::Speed));
366 test_type_traits_non_clonable(LiteralInfo {
367 variable_index: 0,
368 literal_index: 0,
369 slice_offset: (0, 0),
370 });
371 }
372}