regex_intersect/
lib.rs

1//!
2//! `regex-intersect` is a library for finding out if two regexes have a non-empty intersection.
3//!
4//! For example `.*` and `abc` obviously intersect (because `.*` matches everything, including the string "abc").
5//! However, `d.*` and `abc` or `a[h-z]c` and `abc` do not, and could never intersect, because `abc` could never start with a `d` in the first case,
6//! and `b` is not in the set `[h-z]` in the second case.
7//!
8//!
9//! ## Examples
10//!
11//! Check if non empty:
12//!
13//! ```
14//! use regex_intersect::non_empty;
15//! assert!(non_empty("a.*", "ab.*cd").expect("regex expressions should parse"))
16//!```
17//!
18//! ## Tracing
19//!
20//! This libary incorporates tracing, which helps follow the way it operates. Tracing is not enabled by
21//! default but you can enable it:
22//!
23//! ```toml
24//! regex-intersect = { version="0.1", features=["tracing"] }
25//! ```
26//!
27#![warn(missing_docs)]
28#![allow(clippy::similar_names)]
29
30#[cfg(feature = "glob")]
31/// Glob intersect
32pub mod glob;
33
34use regex_syntax::hir::{Class, ClassBytes, ClassUnicode, Hir, HirKind, Literal, RepetitionKind};
35use regex_syntax::ParserBuilder;
36use std::borrow::BorrowMut;
37use thiserror::Error;
38
39/// An error in intersecting expressions
40#[derive(Debug, Error)]
41pub enum IntersectError {
42    /// An error parsing an expression
43    #[error("error while parsing expression: {0}")]
44    Parse(#[from] regex_syntax::Error),
45
46    /// A general error
47    #[error("error: {0}")]
48    Error(String),
49}
50
51fn maybe_trim<'a>(
52    x: &'a Hir,
53    y: &'a Hir,
54    xs: &'a [Hir],
55    ys: &'a [Hir],
56    reversed: bool,
57) -> Option<(&'a [Hir], &'a [Hir])> {
58    if x.eq(y) {
59        trim_slice(xs, ys, reversed)
60    } else {
61        None
62    }
63}
64fn trim_slice<'a>(
65    left: &'a [Hir],
66    right: &'a [Hir],
67    reversed: bool,
68) -> Option<(&'a [Hir], &'a [Hir])> {
69    match (left, right) {
70        ([xs @ .., x], [ys @ .., y]) | ([x, xs @ ..], [y, ys @ ..])
71            if reversed && x.is_literal() && y.is_literal() =>
72        {
73            maybe_trim(x, y, xs, ys, reversed)
74        }
75        ([x, xs @ ..], [y, ys @ ..]) if !reversed && x.is_literal() && y.is_literal() => {
76            maybe_trim(x, y, xs, ys, reversed)
77        }
78        _ => Some((left, right)),
79    }
80}
81
82fn trim<'a>(left: &'a Hir, right: &'a Hir, reversed: bool) -> Option<(Hir, Hir)> {
83    match (left.kind(), right.kind()) {
84        (HirKind::Concat(left), HirKind::Concat(right)) => {
85            trim_slice(left.as_slice(), right.as_slice(), reversed)
86                .map(|(left, right)| (Hir::concat(left.to_vec()), Hir::concat(right.to_vec())))
87        }
88        _ => Some((left.clone(), right.clone())),
89    }
90}
91
92#[cfg_attr(feature="tracing", tracing::instrument(skip_all, fields(
93  left = format!("{}", left.iter().map(|f| format!("{}", f)).collect::<Vec<_>>().join(", ")),
94  right = format!("{:?}", right.iter().map(|f| format!("{}", f)).collect::<Vec<_>>().join(", ")))))]
95fn concats(left: &[Hir], right: &[Hir]) -> bool {
96    match (left, right) {
97        // nothing matches with nothing
98        (&[], &[]) => true,
99        // nothing only matches *something* if that something is 'zero or X' (meaning, can match nothing)
100        (&[], [x, ..]) | ([x, ..], &[]) if matches!(x.kind(), HirKind::Repetition(r) if r.kind == RepetitionKind::ZeroOrMore || r.kind == RepetitionKind::ZeroOrOne) => {
101            true
102        }
103        // last item, and we're done only if repetition is one or more (+), which makes it symmetric to the case with zero or more above.
104        ([x], [rep, ..]) | ([rep, ..], [x]) if matches!(rep.kind(), HirKind::Repetition(r) if r.kind == RepetitionKind::OneOrMore) => {
105            exp(rep, x)
106        }
107        // otherwise, keep moving recursively
108        (xall @ [x, xs @ ..], yall @ [y, ys @ ..]) => {
109            match (x.kind(), y.kind()) {
110                // need to incorporate the class of repetition (zeroorone, one, many, etc.)
111                (HirKind::Repetition(_), _) => exp(x, y) && concats(xall, ys),
112                (_, HirKind::Repetition(_)) => exp(x, y) && concats(xs, yall),
113                (_, _) => exp(x, y) && concats(xs, ys),
114            }
115        }
116        _ => false,
117    }
118}
119
120#[cfg_attr(feature = "tracing", tracing::instrument)]
121fn unicode_range(left: &ClassUnicode, right: &ClassUnicode) -> bool {
122    let mut rl = left.clone();
123    rl.borrow_mut().intersect(right);
124    rl.iter().count() > 0
125}
126
127#[cfg_attr(feature = "tracing", tracing::instrument)]
128fn bytes_range(left: &ClassBytes, right: &ClassBytes) -> bool {
129    let mut rl = left.clone();
130    rl.borrow_mut().intersect(right);
131    rl.iter().count() > 0
132}
133
134#[cfg_attr(feature = "tracing", tracing::instrument)]
135fn literal(left: &Literal, right: &Literal) -> bool {
136    left.eq(right)
137}
138
139#[cfg_attr(feature="tracing", tracing::instrument(skip_all, fields(left = format!("{}", left), right = format!("{}", right))))]
140fn exp(left: &Hir, right: &Hir) -> bool {
141    match (left.kind(), right.kind()) {
142        (HirKind::Concat(xs), HirKind::Concat(ys)) => concats(xs, ys),
143        (HirKind::Concat(xs), _) => concats(xs, &[right.clone()]),
144        (_, HirKind::Concat(ys)) => concats(&[left.clone()], ys),
145        (HirKind::Class(Class::Unicode(left)), HirKind::Class(Class::Unicode(right))) => {
146            unicode_range(left, right)
147        }
148        (HirKind::Class(Class::Bytes(left)), HirKind::Class(Class::Bytes(right))) => {
149            bytes_range(left, right)
150        }
151        (HirKind::Class(Class::Unicode(cls)), HirKind::Literal(Literal::Unicode(lit)))
152        | (HirKind::Literal(Literal::Unicode(lit)), HirKind::Class(Class::Unicode(cls))) => {
153            cls.iter().any(|c| *lit >= c.start() && *lit <= c.end())
154        }
155        (HirKind::Class(Class::Bytes(cls)), HirKind::Literal(Literal::Unicode(lit)))
156        | (HirKind::Literal(Literal::Unicode(lit)), HirKind::Class(Class::Bytes(cls))) => {
157            cls.iter().any(|c| {
158                let s = lit.to_string();
159                let litb = s.as_bytes();
160                litb.len() == 1 && litb[0] >= c.start() && litb[0] <= c.end()
161            })
162        }
163        (HirKind::Literal(left), HirKind::Literal(right)) => literal(left, right),
164        (HirKind::Empty, HirKind::Repetition(rep)) | (HirKind::Repetition(rep), HirKind::Empty)
165            if rep.kind == RepetitionKind::ZeroOrMore || rep.kind == RepetitionKind::ZeroOrOne =>
166        {
167            true
168        }
169        (HirKind::Repetition(left), HirKind::Repetition(right)) => exp(&left.hir, &right.hir),
170        (HirKind::Repetition(left), _) => exp(&left.hir, right),
171        (_, HirKind::Repetition(right)) => exp(left, &right.hir),
172
173        // don't care about empty and anchors
174        (HirKind::Empty, HirKind::Empty) => true,
175        _tup => {
176            #[cfg(feature = "tracing")]
177            tracing::warn!("not found: {:?}", _tup);
178
179            false
180        }
181    }
182}
183
184#[cfg_attr(feature = "tracing", tracing::instrument)]
185fn hir(exp: &str) -> Result<Hir, IntersectError> {
186    let mut parser = ParserBuilder::new().allow_invalid_utf8(true).build();
187    Ok(parser.parse(exp)?)
188}
189
190///
191/// Check if `left` and `right` contains a non empty intersection.
192///
193/// ## Examples
194///
195/// ```
196/// use regex_intersect::non_empty;
197/// assert!(non_empty("a.*", "ab.*cd").expect("regex expressions should parse"))
198///```
199///
200/// ## Errors
201/// Returns an error if cannot parse one of the expressions
202///
203#[cfg_attr(feature = "tracing", tracing::instrument)]
204pub fn non_empty(left: &str, right: &str) -> Result<bool, IntersectError> {
205    let trimmed =
206        trim(&hir(left)?, &hir(right)?, false).and_then(|(left, right)| trim(&left, &right, true));
207    Ok(trimmed.map_or(false, |(hl, hr)| exp(&hl, &hr)))
208}
209
210#[cfg(test)]
211mod tests {
212    use super::*;
213    use pretty_assertions::assert_eq;
214
215    const NON_EMPTY: &[(&str, &[&str])] = &[
216        ("abcd", &["abcd", "....", "[a-d]*"]),
217        ("pqrs", &[".qrs", "p.rs", "pq.s", "pqr."]),
218        (".*", &["asdklfj", "jasdfh", "asdhfajfh", "asdflkasdfjl"]),
219        ("d*", &["[abcd][abcd]", "d[a-z]+", ".....", "[d]*"]),
220        ("[a-p]+", &["[p-z]+", "apapapaapapapap", ".*", "abcdefgh*"]),
221        (
222            "abcd[a-c]z+",
223            &["abcd[b-d][yz]*", "abcdazzzz", "abcdbzzz", "abcdcz"],
224        ),
225        (".*\\\\", &[".*", "asdfasdf\\\\"]),
226        (".a.a", &["b.b.", "c.c.", "d.d.", "e.e."]),
227        (
228            ".*.*.*.*.*.*.*.*.*.*.*.*.*.*.*",
229            &[".*.*.*.*.*.*.*.*.*.*.*"],
230        ),
231        ("foo.*bar", &["foobar", "fooalkdsjfbar"]),
232        ("[a-b]+c", &["[b-c]+"]),
233        (".xyz.", &[".*pqr.*"]), // this was moved from the 'EMPTY' testset because it didn't make sense there
234    ];
235    const EMPTY: &[(&str, &[&str])] = &[
236        ("abcd", &["lsdfhda", "abcdla", "asdlfk", "ksdfj"]),
237        ("[a-d]+", &["xyz", "p+", "[e-f]+"]),
238        ("[0-9]*", &["[a-z]", ".\\*"]),
239        ("ab+", &["a", "b", "abc"]),
240        ("mamama.*", &["dadada.*", "nanana.*"]),
241        (".*mamama", &[".*dadada", ".*nanana"]),
242        (".xyz.", &["paaap"]),
243        (".*.*.*.*f", &[".*.*.*.*g"]),
244    ];
245
246    const EXTRAS_NON_EMPTY: &[(&str, &[&str])] = &[
247        ("fabcd", &["f.*"]),
248        ("f.*abcd", &["fd.*"]),
249        ("f[a-n]*abcd", &["fd.*"]),
250    ];
251
252    const EXTRAS_EMPTY: &[(&str, &[&str])] = &[
253        ("fabcd", &["fd.*"]),
254        ("f.*abcd", &["fd.*z"]),
255        ("f[a-n]*abcd", &["fd.*z"]),
256    ];
257
258    #[test]
259    fn test_error() {
260        let err = format!("{}", non_empty("*", ".*g").unwrap_err());
261        assert_eq!(
262            r#"error while parsing expression: regex parse error:
263    *
264    ^
265error: repetition operator missing expression"#,
266            err
267        );
268    }
269
270    #[test]
271    fn test_one() {
272        assert!(!non_empty(".*f", ".*g").unwrap());
273    }
274
275    #[test]
276    fn test_non_empty() {
277        for (left, rights) in NON_EMPTY {
278            for right in rights.iter() {
279                assert!(non_empty(left, right).unwrap());
280                assert!(non_empty(right, left).unwrap());
281            }
282        }
283    }
284
285    #[test]
286    fn test_empty() {
287        for (left, rights) in EMPTY {
288            for right in rights.iter() {
289                assert!(!non_empty(left, right).unwrap());
290                assert!(!non_empty(right, left).unwrap());
291            }
292        }
293    }
294
295    #[test]
296    fn test_extras_non_empty() {
297        for (left, rights) in EXTRAS_NON_EMPTY {
298            for right in rights.iter() {
299                assert!(non_empty(left, right).unwrap());
300                assert!(non_empty(right, left).unwrap());
301            }
302        }
303    }
304    #[test]
305    fn test_extras_empty() {
306        for (left, rights) in EXTRAS_EMPTY {
307            for right in rights.iter() {
308                assert!(!non_empty(left, right).unwrap());
309                assert!(!non_empty(right, left).unwrap());
310            }
311        }
312    }
313}