regex_test/
lib.rs

1/*!
2A crate for defining tests in a TOML format and applying them to regex engine
3implementations.
4
5Generally speaking, if you aren't writing your own regex engine and looking to
6test it, then this crate is probably not for you. Moreover, this crate does not
7come with any actual tests. It merely defines the test format and provides some
8convenient routines for executing tests within the context of Rust unit tests.
9
10# Format
11
12The entire test corpus is derived from zero or more TOML files. Each TOML
13file contains zero or more tests, where each test is defined as a table via
14`[[test]]`.
15
16Each test has the following fields:
17
* `name` - A name for the test. It must be unique within its file. A test's
[`RegexTest::full_name`] is derived either via `{group_name}/{name}` or
`{group_name}/{name}/{additional_name}`, with the latter being used only when
[`TestRunner::expand`] is used. The `group_name` is derived from the file stem
(the file name without the `.toml` suffix).
23* `regex` - The regex to test. This is either a string or a (possibly empty)
24list of regex patterns. When using a list, the underlying regex engine is
25expected to support multiple patterns where each are identified starting from
26`0` and incrementing by 1 for each subsequent pattern.
27* `haystack` - The text to search.
28* `bounds` - An optional field whose value is a table with `start` and `end`
29fields, whose values must be valid for the given `haystack`. When set,
30the search will only execute within these bounds. When absent, the bounds
31correspond to `start = 0` and `end = haystack.len()`.
32* `matches` - Zero or more match values. Each match value can be in one of four
33formats:
34    * A simple span, i.e., `[5, 12]`, corresponding to the start and end of the
35    match, in byte offsets. The start is inclusive and the end is exclusive.
36    The pattern ID for the match is assumed to be `0`.
37    * A table corresponding to the matching pattern ID and the span of the
38    match. For example, `{ id = 5, span = [20, 21] }`.
39    * A list of capture group spans, with the first corresponding to the
40    overall match and the pattern ID assumed to be `0`. For example,
41    `[[5, 10], [6, 8], [], [9, 10]]`, where `[]` corresponds to a group
42    present in the regex but one that did not participate in a match.
43    * A table corresponding to the matching pattern ID and a list of spans
44    corresponding to the capture groups. For example,
45    `{ id = 5, spans = [[5, 10], [6, 8], [], [9, 10]] }`. This is the most
46    general, but also most verbose, syntax.
47* `match-limit` - An optional field that specifies a limit on the number of
48matches. When absent, no limit is enforced and all matches should be reported
49by the regex engine. This can be useful, for example, when one only cares about
50the first match.
51* `compiles` - An optional field indicating whether the regex is expected to
52compile. It defaults to `true` when absent. When `true`, if the regex does not
53compile, then the test fails. Conversely, when `false`, if the regex _does_
54compile, then the test fails.
55* `anchored` - Whether to execute an anchored search or not. Note that this is
56not the same as adding a `^` to the beginning of your regex pattern. `^` always
57requires the regex to match at position `0`, but an anchored search simply
58requires that the regex match at the starting position of the search. (The
59starting position of the search can be configured via the optional `bounds`
60field.)
61* `case-insensitive` - Whether to match the regex case insensitively. This is
62disabled by default. There is no real difference between using this field and
63adding a `(?i)` to the beginning of your regex. (Some regex engines may not
64support `(?i)`.)
65* `unescape` - When enabled, the haystack is unescaped. Sequences like `\x00`
66are turned into their corresponding byte values. This permits one to write
67haystacks that contain invalid UTF-8 without embedding actual invalid UTF-8
68into a TOML file (which is not allowed). There is generally no other reason to
69enable `unescape`.
* `unicode` - When enabled, the regex pattern should be compiled with its
corresponding Unicode mode enabled. For example, `[^a]` matches any UTF-8
encoding of any codepoint other than `a`. Case insensitivity should be Unicode
aware. Unicode classes like `\pL` are available. The Perl classes `\w`, `\s`
and `\d` should be Unicode aware. And so on. This is an optional field and is
enabled by default.
76* `utf8` - When this is enabled, all regex match substrings should be entirely
77valid UTF-8. While parts of the haystack the regex searches through may not be
78valid UTF-8, only the portions that are valid UTF-8 may be reported in match
79spans. Importantly, this includes zero-width matches. Zero-width matches must
80never split the UTF-8 encoding of a single codepoint when this is enabled. This
81is an optional field and is enabled by default.
82* `line-terminator` - This sets the line terminator used by the multi-line
83assertions `(?m:^)` and `(?m:$)`. It defaults to `\n`. It must be exactly one
84byte. This field is automatically unescaped in order to permit a non-ASCII
85byte.
86* `match-kind` - May be one of `all`, `leftmost-first` or `leftmost-longest`.
87See [`MatchKind`] for more details. This is an optional field and defaults to
88`leftmost-first`.
89* `search-kind` - May be one of `earliest`, `leftmost` or `overlapping`. See
90[`SearchKind`] for more details. This is an optional field and defaults to
91`leftmost`.
92*/
93
94#![deny(missing_docs)]
95
/// For convenience, `anyhow::Error` is used to represent errors in this
/// crate.
///
/// For this reason, `anyhow` is a public dependency and is re-exported here.
100pub extern crate anyhow;
101
102use std::{borrow::Borrow, collections::HashSet, fs, path::Path};
103
104use {
105    anyhow::{bail, Context, Result},
106    bstr::{BString, ByteSlice, ByteVec},
107    serde::Deserialize,
108};
109
/// Name of the environment variable that `TestRunner::new` reads to obtain
/// the comma separated list of whitelist/blacklist substrings.
const ENV_REGEX_TEST: &str = "REGEX_TEST";
/// Name of the environment variable that requests the longer report
/// described on `TestRunner::assert`.
// NOTE(review): the code that reads this variable is outside this view.
const ENV_REGEX_TEST_VERBOSE: &str = "REGEX_TEST_VERBOSE";
112
113/// A collection of regex tests.
114#[derive(Clone, Debug, Deserialize)]
115pub struct RegexTests {
116    /// 'default' permits an empty TOML file.
117    #[serde(default, rename = "test")]
118    tests: Vec<RegexTest>,
119    #[serde(skip)]
120    seen: HashSet<String>,
121}
122
123impl RegexTests {
124    /// Create a new empty collection of glob tests.
125    pub fn new() -> RegexTests {
126        RegexTests { tests: vec![], seen: HashSet::new() }
127    }
128
129    /// Loads all of the tests in the given TOML file. The group name assigned
130    /// to each test is the stem of the file name. For example, if one loads
131    /// `foo/bar.toml`, then the group name for each test will be `bar`.
132    pub fn load<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
133        let path = path.as_ref();
134        let data = fs::read(path)
135            .with_context(|| format!("failed to read {}", path.display()))?;
136        let group_name = path
137            .file_stem()
138            .with_context(|| {
139                format!("failed to get file name of {}", path.display())
140            })?
141            .to_str()
142            .with_context(|| {
143                format!("invalid UTF-8 found in {}", path.display())
144            })?;
145        self.load_slice(&group_name, &data)
146            .with_context(|| format!("error loading {}", path.display()))?;
147        Ok(())
148    }
149
150    /// Load all of the TOML encoded tests in `data` into this collection.
151    /// The given group name is assigned to all loaded tests.
152    pub fn load_slice(&mut self, group_name: &str, data: &[u8]) -> Result<()> {
153        let data = std::str::from_utf8(&data).with_context(|| {
154            format!("data in {} is not valid UTF-8", group_name)
155        })?;
156        let mut index = 1;
157        let mut tests: RegexTests =
158            toml::from_str(&data).with_context(|| {
159                format!("error decoding TOML for '{}'", group_name)
160            })?;
161        for t in &mut tests.tests {
162            t.group = group_name.to_string();
163            if t.name.is_empty() {
164                t.name = format!("{}", index);
165                index += 1;
166            }
167            t.full_name = format!("{}/{}", t.group, t.name);
168            if t.unescape {
169                t.haystack = BString::from(Vec::unescape_bytes(
170                    // OK because TOML requires valid UTF-8.
171                    t.haystack.to_str().unwrap(),
172                ));
173            }
174            if t.line_terminator.is_empty() {
175                t.line_terminator = BString::from("\n");
176            } else {
177                t.line_terminator = BString::from(Vec::unescape_bytes(
178                    // OK because TOML requires valid UTF-8.
179                    t.line_terminator.to_str().unwrap(),
180                ));
181                anyhow::ensure!(
182                    t.line_terminator.len() == 1,
183                    "line terminator '{:?}' has length not equal to 1",
184                    t.line_terminator,
185                );
186            }
187            if self.seen.contains(t.full_name()) {
188                bail!("found duplicate tests for name '{}'", t.full_name());
189            }
190            self.seen.insert(t.full_name().to_string());
191        }
192        self.tests.extend(tests.tests);
193        Ok(())
194    }
195
196    /// Return an iterator over all regex tests that have been loaded. The
197    /// order of the iterator corresponds to the order in which the tests were
198    /// loaded.
199    ///
200    /// This is useful to pass to [`TestRunner::test_iter`].
201    pub fn iter(&self) -> RegexTestsIter {
202        RegexTestsIter(self.tests.iter())
203    }
204}
205
206/// A regex test describes the inputs and expected outputs of a regex match.
207///
208/// Each `RegexTest` represents a single `[[test]]` table in a TOML test file.
209#[derive(Clone, Debug, Deserialize)]
210#[serde(deny_unknown_fields)]
211pub struct RegexTest {
212    #[serde(skip)]
213    group: String,
214    #[serde(default)]
215    name: String,
216    #[serde(skip)]
217    additional_name: String,
218    #[serde(skip)]
219    full_name: String,
220    regex: RegexesFormat,
221    haystack: BString,
222    bounds: Option<Span>,
223    matches: Vec<Captures>,
224    #[serde(rename = "match-limit")]
225    match_limit: Option<usize>,
226    #[serde(default = "default_true")]
227    compiles: bool,
228    #[serde(default)]
229    anchored: bool,
230    #[serde(default, rename = "case-insensitive")]
231    case_insensitive: bool,
232    #[serde(default)]
233    unescape: bool,
234    #[serde(default = "default_true")]
235    unicode: bool,
236    #[serde(default = "default_true")]
237    utf8: bool,
238    #[serde(default, rename = "line-terminator")]
239    line_terminator: BString,
240    #[serde(default, rename = "match-kind")]
241    match_kind: MatchKind,
242    #[serde(default, rename = "search-kind")]
243    search_kind: SearchKind,
244}
245
246impl RegexTest {
247    /// Return the group name of this test.
248    ///
249    /// Usually the group name corresponds to a collection of related
250    /// tests. More specifically, when using [`RegexTests::load`], the
251    /// group name corresponds to the file stem (the file name without the
252    /// `.toml` suffix). Otherwise, the group name is whatever is given to
253    /// [`RegexTests::load_slice`].
254    pub fn group(&self) -> &str {
255        &self.group
256    }
257
258    /// The name of this test.
259    ///
260    /// Note that this is only the name as given in the `[[test]]` block. The
261    /// actual full name used for filtering and reporting can be retrieved with
262    /// [`RegexTest::full_name`].
263    pub fn name(&self) -> &str {
264        &self.name
265    }
266
267    /// The additional name for this test.
268    ///
269    /// This is only non-empty when the test runner was expanded with
270    /// [`TestRunner::expand`].
271    pub fn additional_name(&self) -> &str {
272        &self.additional_name
273    }
274
275    /// The full name of this test, which is formed by joining the group
276    /// name, the test name and the additional name with a `/`.
277    pub fn full_name(&self) -> &str {
278        &self.full_name
279    }
280
281    /// Return all of the regexes that should be matched for this test. This
282    /// slice may be empty!
283    pub fn regexes(&self) -> &[String] {
284        self.regex.patterns()
285    }
286
287    /// Return the bytes on which the regex should be matched.
288    pub fn haystack(&self) -> &[u8] {
289        &self.haystack
290    }
291
292    /// Returns the bounds of a search.
293    ///
294    /// If the test didn't specify any bounds, then the bounds returned are
295    /// equivalent to the entire haystack.
296    pub fn bounds(&self) -> Span {
297        self.bounds.unwrap_or(Span { start: 0, end: self.haystack().len() })
298    }
299
300    /// Returns the limit on the number of matches that should be reported,
301    /// if specified in the test.
302    ///
303    /// This is useful for tests that only want to check for the first
304    /// match. In which case, the match limit is set to 1.
305    ///
306    /// If there is no match limit, then regex engines are expected to report
307    /// all matches.
308    pub fn match_limit(&self) -> Option<usize> {
309        self.match_limit
310    }
311
312    /// Returns true if the regex(es) in this test are expected to compile.
313    pub fn compiles(&self) -> bool {
314        self.compiles
315    }
316
317    /// Whether the regex should perform an anchored search.
318    ///
319    /// This is distinct from putting a `^` in the regex in that `bounds` may
320    /// be specified that permit the regex search to start at a position
321    /// `i > 0`. In which case, enabling anchored mode here requires that any
322    /// matches produced must have a start offset at `i`.
323    pub fn anchored(&self) -> bool {
324        self.anchored
325    }
326
327    /// Returns true if regex matching should be performed without regard to
328    /// case.
329    pub fn case_insensitive(&self) -> bool {
330        self.case_insensitive
331    }
332
333    /// Returns true if regex matching should have Unicode mode enabled.
334    ///
335    /// For example, `[^a]` matches any UTF-8 encoding of any codepoint other
336    /// than `a`. Case insensitivty should be Unicode aware. Unicode classes
337    /// like `\pL` are available. The Perl classes `\w`, `\s` and `\d` should
338    /// be Unicode aware. And so on.
339    ///
340    /// This is enabled by default.
341    pub fn unicode(&self) -> bool {
342        self.unicode
343    }
344
345    /// Returns true if regex matching should exclusively match valid UTF-8.
346    /// When this is disabled, matching on arbitrary bytes is permitted.
347    ///
348    /// When this is enabled, all regex match substrings should be entirely
349    /// valid UTF-8. While parts of the haystack the regex searches through
350    /// may not be valid UTF-8, only the portions that are valid UTF-8 may be
351    /// reported in match spans.
352    ///
353    /// Importantly, this includes zero-width matches. Zero-width matches must
354    /// never split the UTF-8 encoding of a single codepoint when this is
355    /// enabled.
356    ///
357    /// This is enabled by default.
358    pub fn utf8(&self) -> bool {
359        self.utf8
360    }
361
362    /// Returns the line terminator that should be used for the multi-line
363    /// assertions `(?m:^)` and `(?m:$)`.
364    ///
365    /// If it isn't set, then this defaults to `\n`.
366    pub fn line_terminator(&self) -> u8 {
367        self.line_terminator[0]
368    }
369
370    /// Return the match semantics required by this test.
371    pub fn match_kind(&self) -> MatchKind {
372        self.match_kind
373    }
374
375    /// Return the search semantics required by this test.
376    pub fn search_kind(&self) -> SearchKind {
377        self.search_kind
378    }
379
380    /// Run the test and return the result produced by the given compiled
381    /// regex.
382    fn test(&self, regex: &mut CompiledRegex) -> TestResult {
383        match regex.matcher {
384            None => TestResult::skip(),
385            Some(ref mut match_regex) => match_regex(self),
386        }
387    }
388
389    /// Append `/name` to the `full_name` of this test.
390    ///
391    /// This is used to support [`TestRunner::expand`].
392    fn with_additional_name(&self, name: &str) -> RegexTest {
393        let additional_name = name.to_string();
394        let full_name = format!("{}/{}", self.full_name, additional_name);
395        RegexTest { additional_name, full_name, ..self.clone() }
396    }
397
398    /// Returns true if and only if this test expects at least one of the
399    /// regexes to match the haystack.
400    fn is_match(&self) -> bool {
401        !self.matches.is_empty()
402    }
403
404    /// Returns a slice of pattern IDs that are expected to match the haystack.
405    /// The slice is empty if no match is expected to occur. The IDs returned
406    /// are deduplicated and sorted in ascending order.
407    fn which_matches(&self) -> Vec<usize> {
408        let mut seen = HashSet::new();
409        let mut ids = vec![];
410        for cap in self.matches.iter() {
411            if !seen.contains(&cap.id) {
412                seen.insert(cap.id);
413                ids.push(cap.id);
414            }
415        }
416        ids.sort();
417        ids
418    }
419
420    /// Extracts the overall match from each `Captures` match in this test
421    /// and returns it.
422    fn matches(&self) -> Vec<Match> {
423        let mut matches = vec![];
424        for cap in self.matches.iter() {
425            matches.push(cap.to_match());
426        }
427        matches
428    }
429
430    /// Returns the matches expected by this test, including the spans of any
431    /// matching capture groups.
432    fn captures(&self) -> Vec<Captures> {
433        self.matches.clone()
434    }
435}
436
437/// The result of compiling a regex.
438///
439/// In many implementations, the act of matching a regex can be separated from
440/// the act of compiling a regex. A `CompiledRegex` represents a regex that has
441/// been compiled and is ready to be used for matching.
442///
443/// The matching implementation is represented by a closure that accepts a
444/// [`&RegexTest`](RegexTest) and returns a [`TestResult`].
445pub struct CompiledRegex {
446    matcher: Option<Box<dyn FnMut(&RegexTest) -> TestResult + 'static>>,
447}
448
449impl CompiledRegex {
450    /// Provide a closure that represents the compiled regex and executes a
451    /// regex match on any `RegexTest`. The `RegexTest` given to the closure
452    /// provided is the exact same `RegexTest` that is used to compile this
453    /// regex.
454    pub fn compiled(
455        matcher: impl FnMut(&RegexTest) -> TestResult + 'static,
456    ) -> CompiledRegex {
457        CompiledRegex { matcher: Some(Box::new(matcher)) }
458    }
459
460    /// Indicate that tests on this regex should be skipped. This typically
461    /// occurs if the `RegexTest` requires something that an implementation
462    /// does not support.
463    pub fn skip() -> CompiledRegex {
464        CompiledRegex { matcher: None }
465    }
466
467    /// Returns true if the test runner decided to skip the test when
468    /// attempting to compile the regex.
469    pub fn is_skip(&self) -> bool {
470        self.matcher.is_none()
471    }
472}
473
474impl std::fmt::Debug for CompiledRegex {
475    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
476        let status = match self.matcher {
477            None => "Skip",
478            Some(_) => "Run(...)",
479        };
480        f.debug_struct("CompiledRegex").field("matcher", &status).finish()
481    }
482}
483
484/// The result of executing a regex search.
485///
486/// When using the test runner, callers must provide a closure that takes
487/// a `RegexTest` and returns a `TestResult`. The `TestResult` is meant to
488/// capture the results of matching the haystack against the regex specified by
489/// the `RegexTest`.
490///
491/// Usually this consists of one or more matches, which can be constructed via
492/// `TestResult::matches` (for just overall matches) or `TestResult::captures`
493/// (for matches with capture group spans). But the regex engine may also
494/// report whether a match exists, or just whether a pattern matched or not.
495/// That can be done via `TestResult::matched` and `TestResult::which`,
496/// respectively.
497#[derive(Debug, Clone)]
498pub struct TestResult {
499    kind: TestResultKind,
500}
501
502#[derive(Debug, Clone)]
503enum TestResultKind {
504    Match(bool),
505    Which(Vec<usize>),
506    StartEnd(Vec<Match>),
507    Captures(Vec<Captures>),
508    Skip,
509    Fail { why: String },
510}
511
512impl TestResult {
513    /// Create a test result that indicates just whether any match was found
514    /// or not.
515    pub fn matched(yes: bool) -> TestResult {
516        TestResult { kind: TestResultKind::Match(yes) }
517    }
518
519    /// Create a test result that indicates which out of possibly many regexes
520    /// matched the haystack. If `which` is empty, then this is equivalent to
521    /// `TestResult::matched(false)`.
522    ///
523    /// Note that the iterator should consist of pattern IDs, where each
524    /// ID corresponds to a pattern that matches anywhere in the haystack.
525    /// Multiple patterns may match the same region of the haystack. That is,
526    /// this supports overlapping matches.
527    pub fn which<I: IntoIterator<Item = usize>>(it: I) -> TestResult {
528        let mut which: Vec<usize> = it.into_iter().collect();
529        which.sort();
530        TestResult { kind: TestResultKind::Which(which) }
531    }
532
533    /// Create a test result containing a sequence of all matches in the test's
534    /// haystack. This is useful when the regex engine only reports overall
535    /// matches and not the spans of each matching capture group.
536    ///
537    /// If the sequence is empty, then this is equivalent to
538    /// `TestResult::matched(false)`.
539    pub fn matches<I: IntoIterator<Item = Match>>(it: I) -> TestResult {
540        TestResult { kind: TestResultKind::StartEnd(it.into_iter().collect()) }
541    }
542
543    /// Create a test result containing a sequence of all capturing matches in
544    /// the test's haystack. Each match is a `Captures`, and each `Captures`
545    /// should include the spans of all matching capturing groups.
546    ///
547    /// If the sequence is empty, then this is equivalent to
548    /// `TestResult::matched(false)`.
549    pub fn captures<I: IntoIterator<Item = Captures>>(it: I) -> TestResult {
550        TestResult { kind: TestResultKind::Captures(it.into_iter().collect()) }
551    }
552
553    /// Indicate that this test should be skipped. It will not be counted as
554    /// a failure.
555    pub fn skip() -> TestResult {
556        TestResult { kind: TestResultKind::Skip }
557    }
558
559    /// Indicate that this test should be failed for the reason given.
560    ///
561    /// This is useful when a test needs to be failed for reasons that the
562    /// test runner itself cannot check. That is, the test is failed by the
563    /// implementation being tested.
564    pub fn fail(why: &str) -> TestResult {
565        TestResult { kind: TestResultKind::Fail { why: why.to_string() } }
566    }
567}
568
569/// A runner for executing regex tests.
570///
571/// This runner is intended to be used within a Rust unit test, marked with the
572/// `#[test]` attribute.
573///
574/// A test runner is responsible for running tests against a regex
575/// implementation. It contains logic for skipping tests and collects test
576/// results. Typical usage corresponds to calling [`TestRunner::test_iter`] on
577/// an iterator of `RegexTest`s, and then calling `assert` once done. If any
578/// tests failed, then `assert` will panic with an error message containing all
579/// test failures. `assert` must be called before the test completes.
580///
581/// # Skipping tests
582///
583/// If the `REGEX_TEST` environment variable is set, then it may contain
584/// a comma separated list of substrings. Each substring corresponds to a
585/// whitelisted item, unless it starts with a `-`, in which case it corresponds
586/// to a blacklisted item.
587///
588/// If there are any whitelist items, then a test's full name must contain at
589/// least one of the whitelist substrings in order to be run, and does not
590/// contain and blacklist substrings. If there are no whitelist substrings,
591/// then a test is run only when it does not match any blacklist substrings.
592///
593/// The last substring that a test name matches takes precedent.
594///
595/// Callers may also specify explicit whitelist or blacklist substrings using
596/// the corresponding methods on this type, which has the effect of always
597/// having those rules in place for that specific test. For example, if you're
598/// testing a search by building a DFA and then minimizing it, you may want to
599/// skip tests with bigger regexes, since they could take quite some time to
600/// run.
601///
602/// Whitelist and blacklist substrings are matched on the full name of each
603/// test, which typically looks like `group_name/test_name`.
604///
605/// Currently there is no way to escape either a `-` or a `,` in `REGEX_TEST`.
606/// This usually isn't required because test names usually don't include either
607/// character.
608#[derive(Debug)]
609pub struct TestRunner {
610    include: Vec<IncludePattern>,
611    results: RegexTestResults,
612    expanders: Vec<Expander>,
613}
614
615impl TestRunner {
616    /// Create a new runner for executing tests.
617    ///
618    /// The test runner maintains a full list of tests that have succeeded,
619    /// failed or been skipped. Moreover, the test runner may control which
620    /// tests get run via its whitelist and blacklist.
621    ///
622    /// This returns an error if there was a problem reading the `REGEX_TEST`
623    /// environment variable, which may be set to include or exclude tests.
624    /// See the docs on `TestRunner` for its format.
625    pub fn new() -> Result<TestRunner> {
626        let mut runner = TestRunner {
627            include: vec![],
628            results: RegexTestResults::new(),
629            expanders: vec![],
630        };
631        for mut substring in read_env(ENV_REGEX_TEST)?.split(",") {
632            substring = substring.trim();
633            if substring.is_empty() {
634                continue;
635            }
636            if substring.starts_with("-") {
637                runner.blacklist(&substring[1..]);
638            } else {
639                runner.whitelist(substring);
640            }
641        }
642        Ok(runner)
643    }
644
645    /// Assert that all tests run have either passed or have been skipped.
646    ///
647    /// If any tests have failed, then a panic occurs with a report of all
648    /// failures.
649    ///
650    /// If `REGEX_TEST_VERBOSE` is set to `1`, then a longer report of tests
651    /// that passed, failed or skipped is printed.
652    pub fn assert(&mut self) {
653        self.results.assert();
654    }
655
656    /// Whitelist the given substring.
657    ///
658    /// Whitelist and blacklist rules are only applied when
659    /// [`TestRunner::test_iter`] is called.
660    pub fn whitelist(&mut self, substring: &str) -> &mut TestRunner {
661        self.include.push(IncludePattern {
662            blacklist: false,
663            substring: BString::from(substring),
664        });
665        self
666    }
667
668    /// Whitelist the given iterator substrings.
669    ///
670    /// This is a convenience routine for calling `whitelist` on each of the
671    /// substrings in the iterator provided.
672    ///
673    /// Whitelist and blacklist rules are only applied when
674    /// [`TestRunner::test_iter`] is called.
675    pub fn whitelist_iter<I, S>(&mut self, substrings: I) -> &mut TestRunner
676    where
677        I: IntoIterator<Item = S>,
678        S: AsRef<str>,
679    {
680        for substring in substrings {
681            self.whitelist(substring.as_ref());
682        }
683        self
684    }
685
686    /// Blacklist the given substring.
687    ///
688    /// A blacklisted test is never run, unless a whitelisted substring added
689    /// after the blacklisted substring matches it.
690    ///
691    /// Whitelist and blacklist rules are only applied when
692    /// [`TestRunner::test_iter`] is called.
693    pub fn blacklist(&mut self, substring: &str) -> &mut TestRunner {
694        self.include.push(IncludePattern {
695            blacklist: true,
696            substring: BString::from(substring),
697        });
698        self
699    }
700
701    /// Blacklist the given iterator substrings.
702    ///
703    /// A blacklisted test is never run, unless a whitelisted substring added
704    /// after the blacklisted substring matches it.
705    ///
706    /// This is a convenience routine for calling `blacklist` on each of the
707    /// substrings in the iterator provided.
708    ///
709    /// Whitelist and blacklist rules are only applied when
710    /// [`TestRunner::test_iter`] is called.
711    pub fn blacklist_iter<I, S>(&mut self, substrings: I) -> &mut TestRunner
712    where
713        I: IntoIterator<Item = S>,
714        S: AsRef<str>,
715    {
716        for substring in substrings {
717            self.blacklist(substring.as_ref());
718        }
719        self
720    }
721
722    /// Set an expansion predicate that appends each entry in
723    /// `additional_names` to the end the name for every test that `predicate`
724    /// returns true. Moreover, the corresponding additional name is made
725    /// available via [`RegexTest::additional_name`].
726    ///
727    /// This permits implementors to create multiple copies of each test, and
728    /// then do specifically different tasks with each, while making it so each
729    /// test is distinct.
730    ///
731    /// For example, you might write something like this:
732    ///
733    /// ```ignore
734    /// TestRunner::new()?
735    ///     .expand(&["is_match", "find"], |t| t.compiles())
736    ///     .test_iter(tests, compiler)
737    ///     .assert()
738    /// ```
739    ///
740    /// where each test that is expected to have a regex compile gets copied
741    /// with `/is_match` and `/find` appends to the end of its name. Then, in
742    /// your own test runner, you can inspect [`RegexTest::additional_name`] to
743    /// decide what to do. In the case of `is_match`, you might test your regex
744    /// engines "has a match" API, which might exercise different logic than
745    /// your "find where the matches are" API.
746    pub fn expand<S: AsRef<str>>(
747        &mut self,
748        additional_names: &[S],
749        predicate: impl FnMut(&RegexTest) -> bool + 'static,
750    ) -> &mut TestRunner {
751        self.expanders.push(Expander {
752            predicate: Box::new(predicate),
753            additional_names: additional_names
754                .iter()
755                .map(|s| s.as_ref().to_string())
756                .collect(),
757        });
758        self
759    }
760
761    /// Run all of the given tests using the given regex compiler.
762    ///
763    /// The compiler given is a closure that accepts a
764    /// [`&RegexTest`](RegexTest) and a sequence of patterns, and returns (if
765    /// successful) a [`CompiledRegex`] which can execute a search.
766    ///
767    /// Note that if there are test failures, this merely _collects_ them. Use
768    /// [`TestRunner::assert`] to fail the current test by panicking if there
769    /// any failures.
770    ///
771    /// Typically, one provides [`RegexTests::iter`] as the iterator of
772    /// `RegexTest` values.
773    pub fn test_iter<I, T>(
774        &mut self,
775        it: I,
776        mut compile: impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex>,
777    ) -> &mut TestRunner
778    where
779        I: IntoIterator<Item = T>,
780        T: Borrow<RegexTest>,
781    {
782        for test in it {
783            let test = test.borrow();
784            let mut additional = vec![];
785            for expander in &mut self.expanders {
786                if (expander.predicate)(test) {
787                    for name in expander.additional_names.iter() {
788                        additional.push(test.with_additional_name(name));
789                    }
790                    break;
791                }
792            }
793            if additional.is_empty() {
794                additional.push(test.to_owned());
795            }
796            for test in &additional {
797                if self.should_skip(test) {
798                    self.results.skip(test);
799                    continue;
800                }
801                self.test(test, |regexes| compile(test, regexes));
802            }
803        }
804        self
805    }
806
807    /// Run a single test.
808    ///
809    /// This records the result of running the test in this runner. This does
810    /// not fail the test immediately if the given regex test fails. Instead,
811    /// this is only done when the `assert` method is called.
812    ///
813    /// Note that using this method bypasses any whitelist or blacklist applied
814    /// to this runner. Whitelisted (and blacklisted) substrings are only
815    /// applied when using `test_iter`.
816    pub fn test(
817        &mut self,
818        test: &RegexTest,
819        mut compile: impl FnMut(&[String]) -> Result<CompiledRegex>,
820    ) -> &mut TestRunner {
821        let mut compiled = match safe(|| compile(test.regexes())) {
822            Err(msg) => {
823                // Regex tests should never panic. It's auto-fail if they do.
824                self.results.fail(
825                    test,
826                    RegexTestFailureKind::UnexpectedPanicCompile(msg),
827                );
828                return self;
829            }
830            Ok(Ok(compiled)) => compiled,
831            Ok(Err(err)) => {
832                if !test.compiles() {
833                    self.results.pass(test);
834                } else {
835                    self.results.fail(
836                        test,
837                        RegexTestFailureKind::CompileError { err },
838                    );
839                }
840                return self;
841            }
842        };
843        // We fail the test if we didn't expect the regex to compile. However,
844        // it's possible the caller decided to skip the test when attempting
845        // to compile the regex, so we check for that. If the compiled regex
846        // is marked as skipped, then 'test.test(..)' below handles it
847        // correctly.
848        if !test.compiles() && !compiled.is_skip() {
849            self.results.fail(test, RegexTestFailureKind::NoCompileError);
850            return self;
851        }
852        let result = match safe(|| test.test(&mut compiled)) {
853            Ok(result) => result,
854            Err(msg) => {
855                self.results.fail(
856                    test,
857                    RegexTestFailureKind::UnexpectedPanicSearch(msg),
858                );
859                return self;
860            }
861        };
862        match result.kind {
863            TestResultKind::Match(yes) => {
864                if yes == test.is_match() {
865                    self.results.pass(test);
866                } else {
867                    self.results.fail(test, RegexTestFailureKind::IsMatch);
868                }
869            }
870            TestResultKind::Which(which) => {
871                if which != test.which_matches() {
872                    self.results
873                        .fail(test, RegexTestFailureKind::Many { got: which });
874                } else {
875                    self.results.pass(test);
876                }
877            }
878            TestResultKind::StartEnd(matches) => {
879                let expected = test.matches();
880                if expected != matches {
881                    self.results.fail(
882                        test,
883                        RegexTestFailureKind::StartEnd { got: matches },
884                    );
885                } else {
886                    self.results.pass(test);
887                }
888            }
889            TestResultKind::Captures(caps) => {
890                let expected = test.captures();
891                if expected != caps {
892                    self.results.fail(
893                        test,
894                        RegexTestFailureKind::Captures { got: caps },
895                    );
896                } else {
897                    self.results.pass(test);
898                }
899            }
900            TestResultKind::Skip => {
901                self.results.skip(test);
902            }
903            TestResultKind::Fail { why } => {
904                self.results
905                    .fail(test, RegexTestFailureKind::UserFailure { why });
906            }
907        }
908        self
909    }
910
911    /// Return true if and only if the given test should be skipped.
912    fn should_skip(&self, test: &RegexTest) -> bool {
913        if self.include.is_empty() {
914            return false;
915        }
916
917        // If we don't have any whitelist patterns, then the test will be run
918        // unless it is blacklisted. Otherwise, if there are whitelist
919        // patterns, then the test must match at least one of them.
920        let mut skip = self.include.iter().any(|pat| !pat.blacklist);
921        for pat in &self.include {
922            if test.full_name().as_bytes().contains_str(&pat.substring) {
923                skip = pat.blacklist;
924            }
925        }
926        skip
927    }
928}
929
930#[derive(Debug)]
931struct IncludePattern {
932    blacklist: bool,
933    substring: BString,
934}
935
936struct Expander {
937    predicate: Box<dyn FnMut(&RegexTest) -> bool>,
938    additional_names: Vec<String>,
939}
940
941impl std::fmt::Debug for Expander {
942    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
943        f.debug_struct("Expander")
944            .field("predicate", &"<FnMut(..)>")
945            .field("additional_names", &self.additional_names)
946            .finish()
947    }
948}
949
950/// A collection of test results, corresponding to passed, skipped and failed
951/// tests.
952#[derive(Debug)]
953struct RegexTestResults {
954    pass: Vec<RegexTestResult>,
955    fail: Vec<RegexTestFailure>,
956    skip: Vec<RegexTestResult>,
957}
958
959/// A test that passed or skipped, along with its specific result.
960#[derive(Debug)]
961struct RegexTestResult {
962    test: RegexTest,
963}
964
965/// A test that failed along with the reason why.
966#[derive(Debug)]
967struct RegexTestFailure {
968    test: RegexTest,
969    kind: RegexTestFailureKind,
970}
971
972/// Describes the nature of the failed test.
973#[derive(Debug)]
974enum RegexTestFailureKind {
975    /// UserFailure indicates that the test failed because the test function
976    /// explicitly failed it for the reason in the message given.
977    UserFailure { why: String },
978    /// This occurs when the test expected a match (or didn't expect a match),
979    /// but the actual regex implementation didn't match (or did match).
980    IsMatch,
981    /// This occurs when a set of regexes is tested, and the matching regexes
982    /// returned by the regex implementation don't match the expected matching
983    /// regexes. This error contains the indices of the regexes that matched.
984    Many { got: Vec<usize> },
985    /// This occurs when a single regex is used to find all non-overlapping
986    /// matches in a haystack, where the result did not match what was
987    /// expected. This reports the incorrect matches returned by the regex
988    /// implementation under test.
989    StartEnd { got: Vec<Match> },
990    /// Like StartEnd, but for capturing groups.
991    Captures { got: Vec<Captures> },
992    /// This occurs when the test expected the regex to fail to compile, but it
993    /// compiled successfully.
994    NoCompileError,
995    /// This occurs when the test expected the regex to compile successfully,
996    /// but it failed to compile.
997    CompileError { err: anyhow::Error },
998    /// While compiling, a panic occurred. If possible, the panic message
999    /// is captured.
1000    UnexpectedPanicCompile(String),
1001    /// While searching, a panic occurred. If possible, the panic message
1002    /// is captured.
1003    UnexpectedPanicSearch(String),
1004}
1005
1006impl RegexTestResults {
1007    fn new() -> RegexTestResults {
1008        RegexTestResults { pass: vec![], fail: vec![], skip: vec![] }
1009    }
1010
1011    fn pass(&mut self, test: &RegexTest) {
1012        self.pass.push(RegexTestResult { test: test.clone() });
1013    }
1014
1015    fn fail(&mut self, test: &RegexTest, kind: RegexTestFailureKind) {
1016        self.fail.push(RegexTestFailure { test: test.clone(), kind });
1017    }
1018
1019    fn skip(&mut self, test: &RegexTest) {
1020        self.skip.push(RegexTestResult { test: test.clone() });
1021    }
1022
1023    fn assert(&self) {
1024        if read_env(ENV_REGEX_TEST_VERBOSE).map_or(false, |s| s == "1") {
1025            self.verbose();
1026        }
1027        if self.fail.is_empty() {
1028            return;
1029        }
1030        let failures = self
1031            .fail
1032            .iter()
1033            .map(|f| f.to_string())
1034            .collect::<Vec<String>>()
1035            .join("\n\n");
1036        panic!(
1037            "found {} failures:\n{}\n{}\n{}\n\n\
1038             Set the REGEX_TEST environment variable to filter tests, \n\
1039             e.g., REGEX_TEST=foo,-foo2 runs every test whose name contains \n\
1040             foo but not foo2\n\n",
1041            self.fail.len(),
1042            "~".repeat(79),
1043            failures.trim(),
1044            "~".repeat(79),
1045        )
1046    }
1047
1048    fn verbose(&self) {
1049        println!("{}", "~".repeat(79));
1050        for t in &self.skip {
1051            println!("skip: {}", t.full_name());
1052        }
1053        for t in &self.pass {
1054            println!("pass: {}", t.full_name());
1055        }
1056        for t in &self.fail {
1057            println!("FAIL: {}", t.test.full_name());
1058        }
1059        println!(
1060            "\npassed: {}, skipped: {}, failed: {}",
1061            self.pass.len(),
1062            self.skip.len(),
1063            self.fail.len()
1064        );
1065        println!("{}", "~".repeat(79));
1066    }
1067}
1068
1069impl RegexTestResult {
1070    fn full_name(&self) -> String {
1071        self.test.full_name().to_string()
1072    }
1073}
1074
1075impl RegexTestFailure {
1076    fn full_name(&self) -> String {
1077        self.test.full_name().to_string()
1078    }
1079}
1080
1081impl std::fmt::Display for RegexTestFailure {
1082    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1083        write!(
1084            f,
1085            "{}: {}\n\
1086             pattern:     {:?}\n\
1087             haystack:    {:?}",
1088            self.full_name(),
1089            self.kind.fmt(&self.test)?,
1090            self.test.regexes(),
1091            self.test.haystack().as_bstr(),
1092        )?;
1093        Ok(())
1094    }
1095}
1096
1097impl RegexTestFailureKind {
1098    fn fmt(&self, test: &RegexTest) -> Result<String, std::fmt::Error> {
1099        use std::fmt::Write;
1100
1101        let mut buf = String::new();
1102        match *self {
1103            RegexTestFailureKind::UserFailure { ref why } => {
1104                write!(buf, "failed by implementor because: {}", why)?;
1105            }
1106            RegexTestFailureKind::IsMatch => {
1107                if test.is_match() {
1108                    write!(buf, "expected match, but none found")?;
1109                } else {
1110                    write!(buf, "expected no match, but found a match")?;
1111                }
1112            }
1113            RegexTestFailureKind::Many { ref got } => {
1114                write!(
1115                    buf,
1116                    "expected regexes {:?} to match, but found {:?}",
1117                    test.which_matches(),
1118                    got
1119                )?;
1120            }
1121            RegexTestFailureKind::StartEnd { ref got } => {
1122                write!(
1123                    buf,
1124                    "did not find expected matches\n\
1125                     expected: {:?}\n     \
1126                     got: {:?}",
1127                    test.matches(),
1128                    got,
1129                )?;
1130            }
1131            RegexTestFailureKind::Captures { ref got } => {
1132                write!(
1133                    buf,
1134                    "expected to find {:?} captures, but got {:?}",
1135                    test.captures(),
1136                    got,
1137                )?;
1138            }
1139            RegexTestFailureKind::NoCompileError => {
1140                write!(buf, "expected regex to NOT compile, but it did")?;
1141            }
1142            RegexTestFailureKind::CompileError { ref err } => {
1143                write!(buf, "expected regex to compile, failed: {}", err)?;
1144            }
1145            RegexTestFailureKind::UnexpectedPanicCompile(ref msg) => {
1146                write!(buf, "got unexpected panic while compiling:\n{}", msg)?;
1147            }
1148            RegexTestFailureKind::UnexpectedPanicSearch(ref msg) => {
1149                write!(buf, "got unexpected panic while searching:\n{}", msg)?;
1150            }
1151        }
1152        Ok(buf)
1153    }
1154}
1155
1156/// An iterator over regex tests.
1157///
1158/// This iterator is created by the [`RegexTests::iter`] method.
1159#[derive(Debug)]
1160pub struct RegexTestsIter<'a>(std::slice::Iter<'a, RegexTest>);
1161
1162impl<'a> Iterator for RegexTestsIter<'a> {
1163    type Item = &'a RegexTest;
1164
1165    fn next(&mut self) -> Option<&'a RegexTest> {
1166        self.0.next()
1167    }
1168}
1169
1170/// Represents either a single regex or a list of regexes in a TOML.
1171#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
1172#[serde(untagged)]
1173enum RegexesFormat {
1174    Single(String),
1175    Many(Vec<String>),
1176}
1177
1178impl RegexesFormat {
1179    fn patterns(&self) -> &[String] {
1180        match *self {
1181            RegexesFormat::Single(ref pat) => std::slice::from_ref(pat),
1182            RegexesFormat::Many(ref pats) => pats,
1183        }
1184    }
1185}
1186
1187/// Captures represents a single group of captured matches from a regex search.
1188///
1189/// There is always at least 1 group, and the first group is always present and
1190/// corresponds to the overall match.
1191#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
1192#[serde(try_from = "CapturesFormat")]
1193pub struct Captures {
1194    /// The ID of the regex that matched.
1195    ///
1196    /// The ID is the index of the regex provided to the regex compiler,
1197    /// starting from `0`. In the case of a single regex search, the only
1198    /// possible ID is `0`.
1199    id: usize,
1200    /// The capturing groups that matched, along with the match offsets for
1201    /// each. The first group should always be non-None, as it corresponds to
1202    /// the overall match.
1203    ///
1204    /// This should either have length 1 (when not capturing group offsets are
1205    /// included in the tes tresult) or it should have length equal to the
1206    /// number of capturing groups in the regex pattern.
1207    groups: Vec<Option<Span>>,
1208}
1209
1210impl Captures {
1211    /// Create a new set of captures for a single match of a regex.
1212    ///
1213    /// If available, iterator should provide items for every capturing group
1214    /// in the regex, including the 0th capturing group corresponding to the
1215    /// entire match. At minimum, the 0th capturing group should be provided.
1216    ///
1217    /// If a capturing group did not participate in the match, then a `None`
1218    /// value should be used. (The 0th capturing group should never be `None`.)
1219    ///
1220    /// If the iterator yields no elements or the first group is `None`, then
1221    /// this returns an error.
1222    ///
1223    /// The `id` should be the ID of the pattern that matched. This is always
1224    /// `0` for single-pattern regexes. Otherwise, the ID of a pattern starts
1225    /// at `0` and is incremented by 1 for each subsequent pattern.
1226    ///
1227    /// Note that there are possibly more convenient and infallible `From`
1228    /// impls for converting a `Match` or a `Span` into a `Captures`.
1229    pub fn new<I: IntoIterator<Item = Option<Span>>>(
1230        id: usize,
1231        it: I,
1232    ) -> Result<Captures> {
1233        let groups: Vec<Option<Span>> = it.into_iter().collect();
1234        if groups.is_empty() {
1235            bail!("captures must contain at least one group");
1236        } else if groups[0].is_none() {
1237            bail!("first group (index 0) of captures must be non-None");
1238        }
1239        Ok(Captures { id, groups })
1240    }
1241
1242    /// Returns the ID of the pattern that matched.
1243    ///
1244    /// For any single pattern regexes, this should always be zero.
1245    pub fn id(&self) -> usize {
1246        self.id
1247    }
1248
1249    /// Returns a slice of the underlying spans, each group corresponding to
1250    /// the (possibly) matched span. The first group in the slice returned
1251    /// is guaranteed to correspond to the overall match span and is thus
1252    /// non-`None`. All other groups may be `None`. Similarly, the slice is
1253    /// guaranteed to have length at least 1.
1254    pub fn groups(&self) -> &[Option<Span>] {
1255        &self.groups
1256    }
1257
1258    /// Returns the number of groups (including the first) in these captures.
1259    ///
1260    /// The length returned is guaranteed to be greater than zero.
1261    pub fn len(&self) -> usize {
1262        self.groups.len()
1263    }
1264
1265    /// Returns the overall match, including the pattern ID, for this group
1266    /// of captures.
1267    pub fn to_match(&self) -> Match {
1268        Match { id: self.id(), span: self.to_span() }
1269    }
1270
1271    /// Returns the overall match span for this group of captures.
1272    pub fn to_span(&self) -> Span {
1273        // This is OK because a Captures value must always have at least one
1274        // group where the first group always corresponds to match offsets.
1275        self.groups[0].unwrap()
1276    }
1277}
1278
1279/// Converts a plain `Match` to a `Captures` value, where the match corresponds
1280/// to the first and only group in `Captures`.
1281impl From<Match> for Captures {
1282    fn from(m: Match) -> Captures {
1283        Captures { id: m.id, groups: vec![Some(m.span)] }
1284    }
1285}
1286
1287/// Converts a plain `Span` to a `Captures` value, where the span corresponds to
1288/// the first and only group in `Captures`. Since a `Span` does not contain a
1289/// pattern ID, the pattern ID used in this conversion is always `0`.
1290impl From<Span> for Captures {
1291    fn from(sp: Span) -> Captures {
1292        Captures { id: 0, groups: vec![Some(sp)] }
1293    }
1294}
1295
1296/// Represents the actual 'captures' key format more faithfully such that
1297/// Serde can deserialize it.
1298///
1299/// Namely, we need a way to represent a 'None' value inside a TOML array, and
1300/// TOML has no 'null' value. So we make '[]' be 'None', and we use 'MaybeSpan'
1301/// to recognize it.
1302#[derive(Deserialize)]
1303#[serde(untagged)]
1304enum CapturesFormat {
1305    Span([usize; 2]),
1306    Match { id: usize, span: [usize; 2] },
1307    Spans(Vec<MaybeSpan>),
1308    Captures { id: usize, spans: Vec<MaybeSpan> },
1309}
1310
1311impl TryFrom<CapturesFormat> for Captures {
1312    type Error = anyhow::Error;
1313
1314    fn try_from(data: CapturesFormat) -> Result<Captures> {
1315        match data {
1316            CapturesFormat::Span([start, end]) => {
1317                Ok(Captures { id: 0, groups: vec![Some(Span { start, end })] })
1318            }
1319            CapturesFormat::Match { id, span: [start, end] } => {
1320                Ok(Captures { id, groups: vec![Some(Span { start, end })] })
1321            }
1322            CapturesFormat::Spans(spans) => {
1323                Captures::new(0, spans.into_iter().map(|s| s.into_option()))
1324            }
1325            CapturesFormat::Captures { id, spans } => {
1326                Captures::new(id, spans.into_iter().map(|s| s.into_option()))
1327            }
1328        }
1329    }
1330}
1331
1332/// A single match, consisting of the pattern that matched and its span.
1333#[derive(Clone, Copy, Eq, PartialEq)]
1334pub struct Match {
1335    /// The ID of the pattern that matched.
1336    ///
1337    /// This is always `0` for single-pattern regexes. Otherwise, patterns
1338    /// start at `0` and count upwards in increments of `1`.
1339    pub id: usize,
1340    /// The span of the overall match.
1341    pub span: Span,
1342}
1343
1344impl std::fmt::Debug for Match {
1345    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1346        write!(f, "Match({:?}: {:?})", self.id, self.span)
1347    }
1348}
1349
1350/// A span of contiguous bytes, from start to end, represented via byte
1351/// offsets.
1352///
1353/// The range is inclusive at the beginning and exclusive at the end.
1354#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
1355pub struct Span {
1356    /// The starting byte offset of the match.
1357    pub start: usize,
1358    /// The ending byte offset of the match.
1359    pub end: usize,
1360}
1361
1362impl std::fmt::Debug for Span {
1363    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1364        write!(f, "{:?}..{:?}", self.start, self.end)
1365    }
1366}
1367
1368/// Represents a single span, either present or empty.
1369///
1370/// An empty span is spelled `[]` in TOML, and a present span is spelled `[m,
1371/// n]`.
1372#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
1373#[serde(untagged)]
1374enum MaybeSpan {
1375    None([usize; 0]),
1376    Some([usize; 2]),
1377}
1378
1379impl MaybeSpan {
1380    /// Converts this TOML representation of a possibly absent span to a proper
1381    /// `Option<Span>`.
1382    fn into_option(self) -> Option<Span> {
1383        match self {
1384            MaybeSpan::None(_) => None,
1385            MaybeSpan::Some([start, end]) => Some(Span { start, end }),
1386        }
1387    }
1388}
1389
1390/// The match semantics to use for a search.
1391///
1392/// When not specified in a test, the default is `MatchKind::LeftmostFirst`.
1393#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
1394#[serde(rename_all = "kebab-case")]
1395pub enum MatchKind {
1396    /// All possible matches should be reported.
1397    ///
1398    /// Usually this makes it impossible for non-greedy repetition operators
1399    /// to exist. That is, they behave as greedy repetition operators.
1400    All,
1401    /// Report only the leftmost match. When there are multiple leftmost
1402    /// matches that start at the same position, prefer the one that comes
1403    /// "first" in the pattern. For example, `sam|samwise` matches `sam` in
1404    /// `samwise`.
1405    ///
1406    /// This typically corresponds to the semantics implemented by backtracking
1407    /// engines.
1408    LeftmostFirst,
1409    /// Report only the leftmost match. When there are multiple leftmost
1410    /// matches that start at the same position, prefer the one the longest
1411    /// match. For example, `sam|samwise` matches `samwise` in `samwise`.
1412    ///
1413    /// This typically corresponds to the semantics implemented by POSIX
1414    /// engines.
1415    LeftmostLongest,
1416}
1417
1418impl Default for MatchKind {
1419    fn default() -> MatchKind {
1420        MatchKind::LeftmostFirst
1421    }
1422}
1423
1424/// Represents the type of search to perform.
1425///
1426/// When not specified in a test, the default is `SearchKind::Leftmost`.
1427#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
1428#[serde(rename_all = "kebab-case")]
1429pub enum SearchKind {
1430    /// Report matches as soon as they are found.
1431    ///
1432    /// This is somewhat tricky to test, as this semantic is specified in terms
1433    /// of whatever the regex engine can do. For example, an automata oriented
1434    /// engine might be able to report a match earlier than a backtracking
1435    /// engine.
1436    Earliest,
1437    /// A standard leftmost search, returning either the leftmost-first or
1438    /// leftmost-longest match. Generally speaking, it doesn't make sense to
1439    /// use this type of search in combination with [`MatchKind::All`].
1440    Leftmost,
1441    /// Return all possible matches, including ones that overlap. Typically
1442    /// this search kind is used in combination with [`MatchKind::All`].
1443    Overlapping,
1444}
1445
1446impl Default for SearchKind {
1447    fn default() -> SearchKind {
1448        SearchKind::Leftmost
1449    }
1450}
1451
1452/// Read the environment variable given. If it doesn't exist, then return an
1453/// empty string. Otherwise, check that it is valid UTF-8. If it isn't, return
1454/// a useful error message.
1455fn read_env(var: &str) -> Result<String> {
1456    let val = match std::env::var_os(var) {
1457        None => return Ok("".to_string()),
1458        Some(val) => val,
1459    };
1460    let val = val.into_string().map_err(|os| {
1461        anyhow::anyhow!(
1462            "invalid UTF-8 in env var {}={:?}",
1463            var,
1464            Vec::from_os_str_lossy(&os)
1465        )
1466    })?;
1467    Ok(val)
1468}
1469
/// Calls the given closure, turning a panic inside it into an `Err` that
/// holds the panic message. When the panic payload is neither a `&str` nor
/// a `String`, a generic message is used instead.
///
/// The test runner uses this so that a bug triggered by one test does not
/// prevent the other tests from running.
fn safe<T, F>(fun: F) -> Result<T, String>
where
    F: FnOnce() -> T,
{
    use std::panic::{catch_unwind, AssertUnwindSafe};

    match catch_unwind(AssertUnwindSafe(fun)) {
        Ok(value) => Ok(value),
        Err(payload) => {
            // panic! and assert! produce either a &str or a String payload.
            let message = payload
                .downcast_ref::<&str>()
                .map(|text| (*text).to_owned())
                .or_else(|| payload.downcast_ref::<String>().cloned())
                .unwrap_or_else(|| {
                    "UNABLE TO SHOW RESULT OF PANIC.".to_owned()
                });
            Err(message)
        }
    }
}
1494
/// Always returns `true`.
///
/// Serde takes the default for a field as a path to a function, so this
/// exists to give boolean fields a default of 'true'.
fn default_true() -> bool {
    true
}
1500
1501#[cfg(test)]
1502mod tests {
1503    use super::*;
1504
1505    #[test]
1506    fn err_no_regexes() {
1507        let data = r#"
1508[[test]]
1509name = "foo"
1510haystack = "lib.rs"
1511matches = true
1512case-insensitive = true
1513"#;
1514
1515        let mut tests = RegexTests::new();
1516        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1517    }
1518
1519    #[test]
1520    fn err_unknown_field() {
1521        let data = r#"
1522[[test]]
1523name = "foo"
1524regex = ".*.rs"
1525haystack = "lib.rs"
1526matches = true
1527something = 0
1528"#;
1529
1530        let mut tests = RegexTests::new();
1531        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1532    }
1533
1534    #[test]
1535    fn err_no_matches() {
1536        let data = r#"
1537[[test]]
1538name = "foo"
1539regex = ".*.rs"
1540haystack = "lib.rs"
1541"#;
1542
1543        let mut tests = RegexTests::new();
1544        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1545    }
1546
1547    #[test]
1548    fn load_match() {
1549        let data = r#"
1550[[test]]
1551name = "foo"
1552regex = ".*.rs"
1553haystack = "lib.rs"
1554matches = [[0, 6]]
1555compiles = false
1556anchored = true
1557case-insensitive = true
1558unicode = false
1559utf8 = false
1560"#;
1561
1562        let mut tests = RegexTests::new();
1563        tests.load_slice("test", data.as_bytes()).unwrap();
1564
1565        let t0 = &tests.tests[0];
1566        assert_eq!("test", t0.group());
1567        assert_eq!("foo", t0.name());
1568        assert_eq!("test/foo", t0.full_name());
1569        assert_eq!(&[".*.rs"], t0.regexes());
1570        assert_eq!(true, t0.is_match());
1571        assert_eq!(vec![0], t0.which_matches());
1572
1573        assert!(!t0.compiles());
1574        assert!(t0.anchored());
1575        assert!(t0.case_insensitive());
1576        assert!(!t0.unicode());
1577        assert!(!t0.utf8());
1578    }
1579
1580    #[test]
1581    fn load_which_matches() {
1582        let data = r#"
1583[[test]]
1584name = "foo"
1585regex = [".*.rs", ".*.toml"]
1586haystack = "lib.rs"
1587matches = [
1588    { id = 0, spans = [[0, 0]] },
1589    { id = 2, spans = [[0, 0]] },
1590    { id = 5, spans = [[0, 0]] },
1591]
1592"#;
1593
1594        let mut tests = RegexTests::new();
1595        tests.load_slice("test", data.as_bytes()).unwrap();
1596
1597        let t0 = &tests.tests[0];
1598        assert_eq!(&[".*.rs", ".*.toml"], t0.regexes());
1599        assert_eq!(true, t0.is_match());
1600        assert_eq!(vec![0, 2, 5], t0.which_matches());
1601
1602        assert!(t0.compiles());
1603        assert!(!t0.anchored());
1604        assert!(!t0.case_insensitive());
1605        assert!(t0.unicode());
1606        assert!(t0.utf8());
1607    }
1608
1609    #[test]
1610    fn load_spans() {
1611        let data = r#"
1612[[test]]
1613name = "foo"
1614regex = ".*.rs"
1615haystack = "lib.rs"
1616matches = [[0, 2], [5, 10]]
1617"#;
1618
1619        let mut tests = RegexTests::new();
1620        tests.load_slice("test", data.as_bytes()).unwrap();
1621
1622        let spans =
1623            vec![Span { start: 0, end: 2 }, Span { start: 5, end: 10 }];
1624        let t0 = &tests.tests[0];
1625        assert_eq!(t0.regexes(), &[".*.rs"]);
1626        assert_eq!(t0.is_match(), true);
1627        assert_eq!(t0.which_matches(), &[0]);
1628        assert_eq!(
1629            t0.matches(),
1630            vec![
1631                Match { id: 0, span: spans[0] },
1632                Match { id: 0, span: spans[1] },
1633            ]
1634        );
1635        assert_eq!(
1636            t0.captures(),
1637            vec![
1638                Captures::new(0, vec![Some(spans[0])]).unwrap(),
1639                Captures::new(0, vec![Some(spans[1])]).unwrap(),
1640            ]
1641        );
1642    }
1643
1644    #[test]
1645    fn load_capture_spans() {
1646        let data = r#"
1647[[test]]
1648name = "foo"
1649regex = ".*.rs"
1650haystack = "lib.rs"
1651matches = [
1652  [[0, 15], [5, 10], [], [13, 14]],
1653  [[20, 30], [22, 24], [25, 27], []],
1654]
1655"#;
1656
1657        let mut tests = RegexTests::new();
1658        tests.load_slice("test", data.as_bytes()).unwrap();
1659
1660        let t0 = &tests.tests[0];
1661        assert_eq!(t0.regexes(), &[".*.rs"]);
1662        assert_eq!(t0.is_match(), true);
1663        assert_eq!(t0.which_matches(), &[0]);
1664        assert_eq!(
1665            t0.matches(),
1666            vec![
1667                Match { id: 0, span: Span { start: 0, end: 15 } },
1668                Match { id: 0, span: Span { start: 20, end: 30 } },
1669            ]
1670        );
1671        assert_eq!(
1672            t0.captures(),
1673            vec![
1674                Captures::new(
1675                    0,
1676                    vec![
1677                        Some(Span { start: 0, end: 15 }),
1678                        Some(Span { start: 5, end: 10 }),
1679                        None,
1680                        Some(Span { start: 13, end: 14 }),
1681                    ]
1682                )
1683                .unwrap(),
1684                Captures::new(
1685                    0,
1686                    vec![
1687                        Some(Span { start: 20, end: 30 }),
1688                        Some(Span { start: 22, end: 24 }),
1689                        Some(Span { start: 25, end: 27 }),
1690                        None,
1691                    ]
1692                )
1693                .unwrap(),
1694            ]
1695        );
1696    }
1697
1698    #[test]
1699    fn fail_spans_empty1() {
1700        let data = r#"
1701[[test]]
1702name = "foo"
1703regex = ".*.rs"
1704haystack = "lib.rs"
1705matches = [
1706  [],
1707]
1708"#;
1709
1710        let mut tests = RegexTests::new();
1711        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1712    }
1713
1714    #[test]
1715    fn fail_spans_empty2() {
1716        let data = r#"
1717[[test]]
1718name = "foo"
1719regex = ".*.rs"
1720haystack = "lib.rs"
1721matches = [
1722  [[]],
1723]
1724"#;
1725
1726        let mut tests = RegexTests::new();
1727        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1728    }
1729
1730    #[test]
1731    fn fail_spans_empty3() {
1732        let data = r#"
1733[[test]]
1734name = "foo"
1735regex = ".*.rs"
1736haystack = "lib.rs"
1737matches = [
1738  [[], [0, 2]],
1739]
1740"#;
1741
1742        let mut tests = RegexTests::new();
1743        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1744    }
1745
1746    #[test]
1747    fn fail_captures_empty1() {
1748        let data = r#"
1749[[test]]
1750name = "foo"
1751regex = ".*.rs"
1752haystack = "lib.rs"
1753matches = [
1754  { id = 0, spans = [] },
1755]
1756"#;
1757
1758        let mut tests = RegexTests::new();
1759        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1760    }
1761
1762    #[test]
1763    fn fail_captures_empty2() {
1764        let data = r#"
1765[[test]]
1766name = "foo"
1767regex = ".*.rs"
1768haystack = "lib.rs"
1769matches = [
1770  { id = 0, spans = [[]] },
1771]
1772"#;
1773
1774        let mut tests = RegexTests::new();
1775        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1776    }
1777
1778    #[test]
1779    fn fail_captures_empty3() {
1780        let data = r#"
1781[[test]]
1782name = "foo"
1783regex = ".*.rs"
1784haystack = "lib.rs"
1785matches = [
1786  { id = 0, spans = [[], [0, 2]] },
1787]
1788"#;
1789
1790        let mut tests = RegexTests::new();
1791        assert!(tests.load_slice("test", data.as_bytes()).is_err());
1792    }
1793}