boreal/scanner/
ac_scan.rs

1//! Provides the [`AcScan`] object, used to scan for all variables in a single AC pass.
2use std::collections::hash_map::Entry;
3use std::collections::{HashMap, HashSet};
4
5use aho_corasick::{AhoCorasick, AhoCorasickBuilder, AhoCorasickKind};
6
7use super::{
8    CallbackEvents, ScanCallbackResult, ScanData, ScanError, ScanEvent, StringIdentifier,
9    StringMatch,
10};
11use crate::atoms::pick_atom_in_literal;
12use crate::compiler::variable::Variable;
13use crate::compiler::CompilerProfile;
14use crate::matcher::{AcMatchStatus, Matcher};
15use crate::memory::Region;
16
17/// Factorize atoms from all variables, to scan for them in a single pass.
18///
19/// For every variable, literals named atoms are extracted from the variables expressions. A single
20/// Aho-Corasick object is built from all those literals, and a single pass on the scanned bytes
21/// is done with this object. For every match on a literal, the match is then verified to see if
22/// it matches the whole variable expression.
23///
24/// An exception to this is for variables that we either:
25/// - cannot manage to extract atoms from
26/// - need to or prefer scanning on their own
27///
28/// For those variables, the AC pass does not provide any result, and the variable will be scanned
29/// on its own during evaluation of the rules.
30#[derive(Debug)]
31pub(crate) struct AcScan {
32    /// Aho Corasick for variables that are literals.
33    aho: AhoCorasick,
34
35    /// Map from a aho pattern index to a list details on the literals.
36    aho_index_to_literal_info: Vec<Vec<LiteralInfo>>,
37
38    /// List of indexes for vars that are not part of the aho corasick.
39    non_handled_var_indexes: Vec<usize>,
40}
41
42/// Details on a literal of a variable.
43#[derive(Debug)]
44struct LiteralInfo {
45    /// Index of the variable in the variable array.
46    variable_index: usize,
47
48    /// Index of the literal for the variable.
49    literal_index: usize,
50
51    /// Left and right offset for the slice picked in the Aho-Corasick.
52    slice_offset: (usize, usize),
53}
54
55impl AcScan {
56    pub(crate) fn new(variables: &[Variable], profile: CompilerProfile) -> Self {
57        let mut lits = Vec::new();
58        let mut known_lits = HashMap::new();
59        let mut aho_index_to_literal_info = Vec::new();
60        let mut non_handled_var_indexes = Vec::new();
61
62        for (variable_index, var) in variables.iter().enumerate() {
63            if var.matcher.literals.is_empty() {
64                non_handled_var_indexes.push(variable_index);
65            } else {
66                let mut known_literals_of_var = HashSet::new();
67
68                for (literal_index, lit) in var.matcher.literals.iter().enumerate() {
69                    let (start, end) = pick_atom_in_literal(lit);
70                    let mut atom = lit[start..(lit.len() - end)].to_vec();
71                    let literal_info = LiteralInfo {
72                        variable_index,
73                        literal_index,
74                        slice_offset: (start, end),
75                    };
76
77                    // Sometimes, two literals of the same variable can provide the same atom.
78                    // This can happen if the two literals are identical (for example, someone
79                    // like me writing a test on `/(abc|abc)/`), or if the literals are
80                    // different but contain the same atom (for example,
81                    // `{ ( 00 AB CD | AB CD 00 ) }`).
82                    //
83                    // In those cases, we must *not* use the same atom twice in the Aho-Corasick,
84                    // as this would result in two identical matches for the same variable.
85                    // To prevent this, a set is used here. Both the atom itself and its position
86                    // in the literal are important.
87                    if !known_literals_of_var.insert((atom.clone(), start)) {
88                        continue;
89                    }
90
91                    // Ensure the literals provided to the aho corasick are not
92                    // duplicated. If multiple variables uses the same atoms,
93                    // we will iterate on every variable in this module, instead
94                    // of going back into the aho-corasick just for it to
95                    // iterate over the matching ids and return immediately
96                    // to this code. This improves performances significantly.
97                    //
98                    // In addition, since the aho-corasick is case insensitive,
99                    // normalize before de-duplicating.
100                    atom.make_ascii_lowercase();
101
102                    match known_lits.entry(atom.clone()) {
103                        Entry::Vacant(v) => {
104                            let _r = v.insert(lits.len());
105                            aho_index_to_literal_info.push(vec![literal_info]);
106                            lits.push(atom);
107                        }
108                        Entry::Occupied(o) => {
109                            let index = o.get();
110                            aho_index_to_literal_info[*index].push(literal_info);
111                        }
112                    }
113                }
114            }
115        }
116
117        // TODO: Should this AC be case insensitive or not? Redo some benches once other
118        // optimizations are done.
119
120        let mut builder = AhoCorasickBuilder::new();
121        let builder = builder.ascii_case_insensitive(true);
122        let builder = builder.kind(Some(match profile {
123            CompilerProfile::Speed => AhoCorasickKind::DFA,
124            CompilerProfile::Memory => AhoCorasickKind::ContiguousNFA,
125        }));
126
127        // First try with a smaller size to reduce memory use and improve performances, otherwise
128        // use the default version.
129        let aho = builder.build(&lits).unwrap();
130
131        Self {
132            aho,
133            aho_index_to_literal_info,
134            non_handled_var_indexes,
135        }
136    }
137
138    pub(super) fn scan_region<'scanner>(
139        &self,
140        region: &Region,
141        scanner: &'scanner super::Inner,
142        scan_data: &mut ScanData<'scanner, '_>,
143        matches: &mut [Vec<StringMatch>],
144    ) -> Result<(), ScanError> {
145        #[cfg(feature = "profiling")]
146        if let Some(stats) = scan_data.statistics.as_mut() {
147            stats.nb_memory_chunks += 1;
148            stats.memory_scanned_size += region.mem.len();
149        }
150
151        // Iterate over aho-corasick matches, validating those matches
152        for mat in self.aho.find_overlapping_iter(region.mem) {
153            if scan_data.check_timeout() {
154                return Err(ScanError::Timeout);
155            }
156            self.handle_possible_match(region, scanner, &mat, scan_data, matches)?;
157        }
158
159        if !self.non_handled_var_indexes.is_empty() {
160            #[cfg(feature = "profiling")]
161            let start = std::time::Instant::now();
162
163            // For every "raw" variable, scan the memory for this variable.
164            for variable_index in &self.non_handled_var_indexes {
165                let var = &scanner.variables[*variable_index].matcher;
166
167                scan_single_variable(region, var, scan_data, &mut matches[*variable_index]);
168            }
169
170            #[cfg(feature = "profiling")]
171            if let Some(stats) = scan_data.statistics.as_mut() {
172                stats.raw_regexes_eval_duration += start.elapsed();
173            }
174        }
175
176        Ok(())
177    }
178
179    fn handle_possible_match<'scanner>(
180        &self,
181        region: &Region,
182        scanner: &'scanner super::Inner,
183        mat: &aho_corasick::Match,
184        scan_data: &mut ScanData<'scanner, '_>,
185        matches: &mut [Vec<StringMatch>],
186    ) -> Result<(), ScanError> {
187        for literal_info in &self.aho_index_to_literal_info[mat.pattern()] {
188            let LiteralInfo {
189                variable_index,
190                literal_index,
191                slice_offset: (start_offset, end_offset),
192            } = *literal_info;
193            let var = &scanner.variables[variable_index].matcher;
194
195            #[cfg(feature = "profiling")]
196            if let Some(stats) = scan_data.statistics.as_mut() {
197                stats.nb_ac_matches += 1;
198            }
199            #[cfg(feature = "profiling")]
200            let start_instant = std::time::Instant::now();
201
202            // Upscale to the original literal shape before feeding it to the matcher verification
203            // function.
204            let Some(start) = mat.start().checked_sub(start_offset) else {
205                continue;
206            };
207            let end = match mat.end().checked_add(end_offset) {
208                Some(v) if v <= region.mem.len() => v,
209                _ => continue,
210            };
211            let m = start..end;
212
213            // Verify the literal is valid.
214            let Some(match_type) = var.confirm_ac_literal(region.mem, &m, literal_index) else {
215                continue;
216            };
217
218            let var_matches = &mut matches[variable_index];
219
220            // Shorten the mem to prevent new matches on the same starting byte.
221            // For example, for `a.*?bb`, and input `abbb`, this can happen:
222            // - extract atom `bb`
223            // - get AC match on `a(bb)b`: call check_ac_match, this will return the
224            //   match `(abb)b`.
225            // - get AC match on `ab(bb)`: call check_ac_match, this will return the
226            //   match `(abbb)`.
227            // This is invalid, only one match per starting byte can happen.
228            // To avoid this, ensure the mem given to check_ac_match starts one byte after the last
229            // saved match.
230            //
231            // This must only be done if the match is in the same region, otherwise the offset
232            // of the previous match makes no sense for this match, and will falsify results.
233            let start_position = match var_matches.last() {
234                Some(mat) if mat.base == region.start => mat.offset + 1,
235                _ => 0,
236            };
237
238            let res = var.process_ac_match(region.mem, m, start_position, match_type);
239
240            #[cfg(feature = "profiling")]
241            {
242                if let Some(stats) = scan_data.statistics.as_mut() {
243                    stats.ac_confirm_duration += start_instant.elapsed();
244                }
245            }
246
247            match res {
248                AcMatchStatus::None => (),
249                AcMatchStatus::Multiple(v) if v.is_empty() => (),
250                AcMatchStatus::Multiple(found_matches) => {
251                    var_matches.extend(found_matches.into_iter().map(|m| {
252                        StringMatch::new(region, m, scan_data.params.match_max_length, 0)
253                    }));
254                }
255                AcMatchStatus::Single(m) => {
256                    let xor_key = var.get_xor_key(literal_index);
257                    var_matches.push(StringMatch::new(
258                        region,
259                        m,
260                        scan_data.params.match_max_length,
261                        xor_key,
262                    ));
263                }
264            }
265
266            if var_matches.len() > (scan_data.params.string_max_nb_matches as usize) {
267                var_matches.truncate(scan_data.params.string_max_nb_matches as usize);
268                if (scan_data.params.callback_events & CallbackEvents::STRING_REACHED_MATCH_LIMIT).0
269                    != 0
270                    && scan_data.string_reached_match_limit.insert(variable_index)
271                {
272                    if let Some(cb) = &mut scan_data.callback {
273                        if let Some(string_identifier) =
274                            build_string_identifier(scanner, variable_index)
275                        {
276                            match (cb)(ScanEvent::StringReachedMatchLimit(string_identifier)) {
277                                ScanCallbackResult::Continue => (),
278                                ScanCallbackResult::Abort => return Err(ScanError::CallbackAbort),
279                            }
280                        }
281                    }
282                }
283            }
284        }
285
286        Ok(())
287    }
288}
289
290fn scan_single_variable(
291    region: &Region,
292    matcher: &Matcher,
293    scan_data: &mut ScanData,
294    string_matches: &mut Vec<StringMatch>,
295) {
296    let mut offset = 0;
297    while offset < region.mem.len() {
298        let mat = matcher.find_next_match_at(region.mem, offset);
299
300        match mat {
301            None => break,
302            Some(mat) => {
303                offset = mat.start + 1;
304                string_matches.push(StringMatch::new(
305                    region,
306                    mat,
307                    scan_data.params.match_max_length,
308                    // No xor key, since this function is only used for regex variables
309                    0,
310                ));
311
312                // This is safe to allow because this is called on every iterator of self.matches, so
313                // it cannot overflow u32 before this condition is true.
314                #[allow(clippy::cast_possible_truncation)]
315                if (string_matches.len() as u32) >= scan_data.params.string_max_nb_matches {
316                    break;
317                }
318            }
319        }
320    }
321}
322
323fn build_string_identifier(
324    scanner: &super::Inner,
325    variable_index: usize,
326) -> Option<StringIdentifier<'_>> {
327    let mut index = 0;
328    // Go through all the rules of the scanner to find the right one.
329    // This is O(n) on the rules, which isn't ideal. But this is only done
330    // iff:
331    // - the callback API is used
332    // - the "string reaches match limit" event is enabled
333    // - a string reaches the match limit
334    // This thus should not be called frequently, and a O(n) search through
335    // the rules should not take that long.
336    //
337    // A solution to improve this would be to store in each rule the index
338    // of its first variable, which would make a binary search through
339    // the rules possible. However, this means an additional word to store
340    // with each rule, only to alleviate this very specific event. For
341    // the moment, this is not considered to be worth the cost.
342    for rule in scanner.global_rules.iter().chain(scanner.rules.iter()) {
343        if index + rule.nb_variables > variable_index {
344            return Some(StringIdentifier {
345                rule_namespace: scanner.namespaces[rule.namespace_index].as_ref(),
346                rule_name: &rule.name,
347                string_name: &scanner.variables[variable_index].name,
348                string_index: variable_index - index,
349            });
350        }
351        index += rule.nb_variables;
352    }
353    // Should technically be impossible to reach.
354    debug_assert!(false);
355    None
356}
357
358#[cfg(test)]
359mod tests {
360    use super::*;
361    use crate::test_helpers::test_type_traits_non_clonable;
362
363    #[test]
364    fn test_types_traits() {
365        test_type_traits_non_clonable(AcScan::new(&[], CompilerProfile::Speed));
366        test_type_traits_non_clonable(LiteralInfo {
367            variable_index: 0,
368            literal_index: 0,
369            slice_offset: (0, 0),
370        });
371    }
372}