Skip to main content

regex_filtered/
model.rs

1#![doc(hidden)]
2
3use itertools::iproduct;
4use regex_syntax::hir::{self, Hir, HirKind, Visitor, visit};
5use std::cell::Cell;
6use std::fmt::{Display, Formatter, Write};
7use std::str::Utf8Error;
8use std::{collections::BTreeSet, ops::Deref};
9
10const EXACT_CROSS_LIMIT: usize = 16;
11
12#[derive(Clone, Debug)]
13pub enum Model {
14    /// Everything matches.
15    All(Cell<usize>),
16    /// Nothing matches.
17    None(Cell<usize>),
18    /// The string matches.
19    Atom(Cell<usize>, String),
20    /// All sub-filters must match.
21    And(Cell<usize>, Vec<Model>),
22    /// One sub-filter must match.
23    Or(Cell<usize>, Vec<Model>),
24}
25use Model::{All, And, Atom, None, Or};
26
27impl std::hash::Hash for Model {
28    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
29        state.write_u8(self.op());
30        match self {
31            All(_) | None(_) => (),
32            Atom(_, s) => s.hash(state),
33            And(_, ps) | Or(_, ps) => {
34                state.write_usize(ps.len());
35                for p in ps {
36                    state.write_usize(p.unique_id());
37                }
38            }
39        }
40    }
41}
42
43impl std::cmp::PartialEq for Model {
44    fn eq(&self, other: &Self) -> bool {
45        match (self, other) {
46            (All(_), All(_)) | (None(_), None(_)) => true,
47            (Atom(_, a), Atom(_, b)) => a == b,
48            (And(_, va), And(_, vb)) | (Or(_, va), Or(_, vb)) => {
49                va.len() == vb.len()
50                    && std::iter::zip(va, vb).all(|(a, b)| a.unique_id() == b.unique_id())
51            }
52            _ => false,
53        }
54    }
55}
56impl Eq for Model {}
57
58impl From<String> for Model {
59    fn from(s: String) -> Self {
60        Atom(Cell::new(usize::MAX), s)
61    }
62}
63
64impl Display for Model {
65    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
66        match &self {
67            All(_) => f.write_str(""),
68            None(_) => f.write_str("*no-matches*"),
69            Atom(_, s) => f.write_str(s),
70            And(_, subs) => {
71                for (i, s) in subs.iter().enumerate() {
72                    if i != 0 {
73                        f.write_char(' ')?;
74                    }
75                    write!(f, "{s}")?;
76                }
77                Ok(())
78            }
79            Or(_, subs) => {
80                f.write_char('(')?;
81                for (i, s) in subs.iter().enumerate() {
82                    if i != 0 {
83                        f.write_char('|')?;
84                    }
85                    write!(f, "{s}")?;
86                }
87                f.write_char(')')
88            }
89        }
90    }
91}
92
93/// Processing errors
94#[derive(Debug)]
95pub enum Error {
96    /// Processing missed or exceeded some of the stack
97    FinalizationError,
98    /// Processing reached HIR nodes limit
99    EarlyStop,
100    /// Literal was not a valid string
101    DecodeError(Utf8Error),
102    /// Non-decodable character class
103    ClassError(hir::ClassBytes),
104}
105impl Display for Error {
106    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
107        write!(f, "{self:?}")
108    }
109}
110impl std::error::Error for Error {}
111impl From<Utf8Error> for Error {
112    fn from(value: Utf8Error) -> Self {
113        Error::DecodeError(value)
114    }
115}
116
117impl Model {
118    pub fn new(r: &Hir) -> Result<Self, Error> {
119        visit(r, InfoVisitor::default())
120    }
121
122    pub fn unique_id(&self) -> usize {
123        match self {
124            All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.get(),
125        }
126    }
127    pub fn set_unique_id(&self, value: usize) {
128        match self {
129            All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.set(value),
130        }
131    }
132
133    pub fn all() -> Self {
134        All(Cell::new(usize::MAX))
135    }
136
137    pub fn none() -> Self {
138        None(Cell::new(usize::MAX))
139    }
140
141    fn or_strings(strings: SSet) -> Self {
142        Model::Or(
143            Cell::new(usize::MAX),
144            simplify_string_set(strings).map(From::from).collect(),
145        )
146    }
147
148    fn op(&self) -> u8 {
149        match self {
150            All(_) => 0,
151            None(_) => 1,
152            Atom(_, _) => 2,
153            And(_, _) => 3,
154            Or(_, _) => 4,
155        }
156    }
157
158    /// Simplifies And and Or nodes
159    fn simplify(self) -> Self {
160        match self {
161            And(uid, v) if v.is_empty() => All(uid),
162            Or(uid, v) if v.is_empty() => None(uid),
163            And(_, mut v) | Or(_, mut v) if v.len() == 1 => {
164                v.pop().expect("we checked the length").simplify()
165            }
166            s => s,
167        }
168    }
169
170    // re2 merges those into separate functions but it only saves on
171    // the header and increases the branching complexity of the rest
172    // so y?
173    fn and(self, mut b: Self) -> Self {
174        let mut a = self.simplify();
175        b = b.simplify();
176
177        // Canonicalize: a->op <= b->op.
178        if a.op() > b.op() {
179            std::mem::swap(&mut a, &mut b);
180        }
181
182        // ALL and NONE are smallest opcodes.
183        a = match a {
184            // ALL and b = b
185            All(..) => return b,
186            // NONE and b = None
187            None(uid) => return None(uid),
188            a => a,
189        };
190
191        match (a, b) {
192            // If a and b match op, merge their contents.
193            (And(unique_id, mut va), And(_, vb)) => {
194                va.extend(vb);
195                And(unique_id, va)
196            }
197            // If a or b matches the operation, merge the other one in
198            (And(unique_id, mut v), vv) | (vv, And(unique_id, mut v)) => {
199                v.push(vv);
200                And(unique_id, v)
201            }
202            (a, b) => And(Cell::new(usize::MAX), vec![a, b]),
203        }
204    }
205
206    fn or(self, mut b: Self) -> Self {
207        let mut a = self.simplify();
208        b = b.simplify();
209
210        // Canonicalize: a->op <= b->op.
211        if a.op() > b.op() {
212            std::mem::swap(&mut a, &mut b);
213        }
214
215        a = match a {
216            // NONE or b = b
217            None(..) => return b,
218            // ALL or b = ALL
219            All(uid) => return All(uid),
220            a => a,
221        };
222
223        match (a, b) {
224            // If a and b match op, merge their contents.
225            (Or(unique_id, mut va), Or(_, vb)) => {
226                va.extend(vb);
227                Or(unique_id, va)
228            }
229            // If a or b matches the operation, merge the other one in
230            (Or(unique_id, mut v), vv) | (vv, Or(unique_id, mut v)) => {
231                v.push(vv);
232                Or(unique_id, v)
233            }
234            (a, b) => Or(Cell::new(usize::MAX), vec![a, b]),
235        }
236    }
237}
238
239// Necessary for simplify_string_set to work: the simplification
240// consists of removing every "superset" of an other string of the
241// set, that is any strings which contains an other (non-empty) string
242// of the set, because the smaller atom will already indicate that the
243// pattern is a candidate, so also matching the larger atom is useless
244//
245// In order to make the implementation simpler and more efficient,
246// visit the smaller strings first that way we only need to visit the
247// following siblings (larger strings which *might* contain the
248// current one).
249#[derive(PartialEq, Eq, Debug, Clone)]
250struct LengthThenLex(pub String);
251impl Deref for LengthThenLex {
252    type Target = String;
253
254    fn deref(&self) -> &Self::Target {
255        &self.0
256    }
257}
258impl Ord for LengthThenLex {
259    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
260        self.0
261            .len()
262            .cmp(&other.0.len())
263            .then_with(|| self.0.cmp(&other.0))
264    }
265}
266impl PartialOrd for LengthThenLex {
267    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
268        Some(self.cmp(other))
269    }
270}
271type SSet = BTreeSet<LengthThenLex>;
272fn simplify_string_set(strings: SSet) -> impl Iterator<Item = String> {
273    let mut to_keep = vec![true; strings.len()];
274    let mut e = strings.iter().enumerate();
275    while let Some((i, s)) = e.next() {
276        if s.is_empty() || !to_keep[i] {
277            continue;
278        }
279
280        for (keep, (_, s2)) in to_keep[i..].iter_mut().skip(1).zip(e.clone()) {
281            if *keep && s2.len() > s.len() && s2.0.contains(&s.0) {
282                *keep = false;
283            }
284        }
285    }
286
287    std::iter::zip(to_keep, strings)
288        .filter(|v| v.0)
289        .map(|v| v.1.0)
290}
291
292/// Intermediate information about the set of strings a regex matches,
293/// used for the computation of a prefilter.
294#[derive(Debug)]
295enum Info {
296    Match(Model),
297    Exact(SSet),
298}
299impl Info {
300    fn take_match(self) -> Model {
301        match self {
302            Self::Match(p) => p,
303            Self::Exact(s) => Model::or_strings(s),
304        }
305    }
306
307    fn into_exact(self) -> Option<SSet> {
308        match self {
309            Self::Exact(s) => Some(s),
310            Self::Match(_) => Option::None,
311        }
312    }
313}
314
315struct InfoVisitor {
316    stack: Vec<Info>,
317    max_visits: usize,
318}
319impl Default for InfoVisitor {
320    fn default() -> Self {
321        Self {
322            max_visits: 100_000,
323            stack: Vec::new(),
324        }
325    }
326}
327
328// [`regex_syntax::hir::Visitor`] works pretty differently than
329// `re2::Regexp::Walker` as it does not return / merge anything, so we
330// need to merge down into the stack on post.
331impl Visitor for InfoVisitor {
332    type Output = Model;
333    type Err = Error;
334
335    fn finish(mut self) -> Result<Self::Output, Self::Err> {
336        (self.stack.len() == 1)
337            .then_some(&mut self.stack)
338            .and_then(|s| s.pop())
339            .map(Info::take_match)
340            .ok_or(Error::FinalizationError)
341    }
342
343    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
344        // re2 sets `stopped_early` and calls `ShortVisit` but keeps
345        // on keeping on, not clear why & ultimately BuildInfo only
346        // cares about having stopped early
347        self.max_visits = self.max_visits.checked_sub(1).ok_or(Error::EarlyStop)?;
348
349        Ok(())
350    }
351
352    fn visit_post(&mut self, hir: &Hir) -> Result<(), Self::Err> {
353        match hir.kind() {
354            HirKind::Empty | HirKind::Look(_) => {
355                self.stack
356                    .push(Info::Exact([LengthThenLex(String::new())].into()));
357            }
358            HirKind::Literal(hir::Literal(data)) => {
359                if data.is_empty() {
360                    // NoMatch
361                    self.stack.push(Info::Match(Model::none()));
362                } else {
363                    // re2 does this weird as it performs a cross
364                    // product of individual characters, but as far as
365                    // I understand that's just a complicated way to
366                    // build a singleton set of the payload?
367                    self.stack.push(Info::Exact(
368                        [LengthThenLex(
369                            std::str::from_utf8(data)?.to_ascii_lowercase(),
370                        )]
371                        .into(),
372                    ));
373                }
374            }
375            HirKind::Class(cls) => {
376                let uc;
377                let c = match cls {
378                    hir::Class::Unicode(c) => c,
379                    hir::Class::Bytes(b) => {
380                        uc = b
381                            .to_unicode_class()
382                            .ok_or_else(|| Error::ClassError(b.clone()))?;
383                        &uc
384                    }
385                };
386                self.stack
387                    .push(if c.iter().map(|r| r.len()).sum::<usize>() > 10 {
388                        Info::Match(Model::all())
389                    } else {
390                        Info::Exact(
391                            c.iter()
392                                .flat_map(|r| r.start()..=r.end())
393                                .map(|c| c.to_ascii_lowercase())
394                                .map(String::from)
395                                .map(LengthThenLex)
396                                .collect(),
397                        )
398                    });
399            }
400            // Apparently re2 and regex have inverse choices, re2
401            // normalises repetitions to */+/?, regex normalises
402            // everything to {a, b}, so this may or may make any sense
403            HirKind::Repetition(hir::Repetition { min, max, .. }) => {
404                match min {
405                    0 => {
406                        self.stack.pop();
407                        self.stack.push(Info::Match(Model::all()));
408                    }
409                    &min => {
410                        let arg = self
411                            .stack
412                            .pop()
413                            .expect("a repetition to be associated with a pattern to repeat");
414                        match arg {
415                            Info::Exact(mut arg) if arg.len() == 1 => {
416                                let s = arg.pop_first().unwrap();
417                                let minsize = min as usize;
418                                // re2 limits repetitions to 1000, but
419                                // allows literal with ~no length
420                                // limit to be repeated which feels a
421                                // bit strange and irregular, instead
422                                // limit atoms expansion to 2KB
423                                if Some(min) == *max && (minsize * s.len() < 2048) {
424                                    let set = [LengthThenLex(s.repeat(minsize))].into();
425                                    self.stack.push(Info::Exact(set));
426                                } else {
427                                    let min = (2048 / s.len()).clamp(1, minsize);
428                                    let set = [LengthThenLex(s.repeat(min))].into();
429                                    self.stack.push(Info::Match(Model::or_strings(set)));
430                                }
431                            }
432                            // same limit as Concat
433                            // TODO: if arg.len() is < 4, we can splat it
434                            //       to the limit and decrease min by that
435                            //       much...
436                            Info::Exact(arg) if arg.len().pow(min) <= EXACT_CROSS_LIMIT => {
437                                let mut acc = arg.clone();
438                                for _ in 1..min {
439                                    acc = iproduct!(&acc, &arg)
440                                        .map(|(s, ss)| {
441                                            let mut r = String::with_capacity(s.len() + ss.len());
442                                            r.push_str(s);
443                                            r.push_str(ss);
444                                            LengthThenLex(r)
445                                        })
446                                        .collect();
447                                }
448                                if Some(min) == *max {
449                                    self.stack.push(Info::Exact(acc));
450                                } else {
451                                    self.stack.push(Info::Match(Model::or_strings(acc)));
452                                }
453                            }
454                            arg => {
455                                self.stack.push(Info::Match(arg.take_match()));
456                            }
457                        }
458                    }
459                }
460            }
461            // should just leave its child on the stack for whoever
462            // lives up
463            HirKind::Capture(_) => (),
464            HirKind::Alternation(alt) => {
465                // needs to pop alt.len() items from the stack, and if
466                // they're ``exact`` then just merge them, otherwise
467                // ``Prefilter::Or`` them
468
469                // sort the topn to have the exacts at the top, largest top
470                let topn = self.stack.len() - alt.len()..;
471                let infos = &mut self.stack[topn.clone()];
472
473                let matches =
474                    topn.start + infos.iter().filter(|v| matches!(v, Info::Match(_))).count();
475                // I think we can do that because we don't actually
476                // regex match so order should not matter question
477                // mark
478                infos.sort_unstable_by_key(|v| match v {
479                    Info::Match(_) => (false, 0),
480                    Info::Exact(s) => (true, s.len()),
481                });
482                // there are exact matches, merge them
483                let exacts = self
484                    .stack
485                    .drain(matches..)
486                    .rev()
487                    .fold(BTreeSet::new(), |mut s, i| {
488                        s.append(
489                            &mut i
490                                .into_exact()
491                                .expect("the top `matches` records should be exacts"),
492                        );
493                        s
494                    });
495                let mut matches = self
496                    .stack
497                    .drain(topn)
498                    .map(Info::take_match)
499                    .collect::<Vec<_>>();
500                self.stack.push(if matches.is_empty() {
501                    Info::Exact(exacts)
502                } else {
503                    if !exacts.is_empty() {
504                        matches.push(Model::or_strings(exacts));
505                    }
506                    Info::Match(matches.into_iter().fold(Model::none(), Model::or))
507                });
508            }
509            HirKind::Concat(c) => {
510                let topn = self.stack.len() - c.len()..;
511
512                // ALL is the identity element of AND
513                let mut result = Info::Match(Model::all());
514                let mut resulted = false;
515                let mut exacts = BTreeSet::new();
516                for info in self.stack.drain(topn) {
517                    match info {
518                        Info::Exact(set) if exacts.is_empty() => {
519                            exacts = set;
520                        }
521                        Info::Exact(set) if exacts.len() == 1 && set.len() == 1 => {
522                            let r = exacts.pop_first().expect("exacts to be non-empty").0
523                                + set.first().expect("set to be non-empty");
524
525                            exacts.insert(LengthThenLex(r));
526                        }
527                        Info::Exact(set) => {
528                            if set.len() * exacts.len() <= EXACT_CROSS_LIMIT {
529                                // Not useful to consume the existing
530                                // `exacts` up-front, as each item has to
531                                // be splatted over `set`.
532                                exacts = iproduct!(&exacts, &set)
533                                    .map(|(s, ss)| {
534                                        let mut r = String::with_capacity(s.len() + ss.len());
535                                        r.push_str(s);
536                                        r.push_str(ss);
537                                        LengthThenLex(r)
538                                    })
539                                    .collect();
540                            } else {
541                                resulted = true;
542                                result = Info::Match(Model::and(
543                                    result.take_match(),
544                                    Model::or_strings(exacts),
545                                ));
546                                exacts = set;
547                            }
548                        }
549                        i => {
550                            resulted = true;
551                            // here AND the combination of info,
552                            // exact, and the existing garbage
553                            let mut p = result.take_match();
554                            if !exacts.is_empty() {
555                                p = Model::and(p, Model::or_strings(std::mem::take(&mut exacts)));
556                            }
557                            p = Model::and(p, i.take_match());
558                            result = Info::Match(p);
559                        }
560                    }
561                }
562
563                self.stack.push(if exacts.is_empty() {
564                    result
565                } else if !resulted {
566                    Info::Exact(exacts)
567                } else {
568                    Info::Match(Model::and(result.take_match(), Model::or_strings(exacts)))
569                });
570            }
571        }
572        Ok(())
573    }
574}