regex_test/lib.rs
1/*!
2A crate for defining tests in a TOML format and applying them to regex engine
3implementations.
4
5Generally speaking, if you aren't writing your own regex engine and looking to
6test it, then this crate is probably not for you. Moreover, this crate does not
7come with any actual tests. It merely defines the test format and provides some
8convenient routines for executing tests within the context of Rust unit tests.
9
10# Format
11
12The entire test corpus is derived from zero or more TOML files. Each TOML
13file contains zero or more tests, where each test is defined as a table via
14`[[test]]`.
15
16Each test has the following fields:
17
18* `name` - A name for the test. It must be unique within its file. A test's
19[`RegexTest::full_name`] is derived either via `{group_name}/{name}` or
20`{group_name}/{name}/{additional_name}`, with the latter being used only when
21[`TestRunner::expand`] is used. The `group_name` is derived from the file stem
(the file name without the `.toml` suffix).
23* `regex` - The regex to test. This is either a string or a (possibly empty)
24list of regex patterns. When using a list, the underlying regex engine is
expected to support multiple patterns where each is identified starting from
26`0` and incrementing by 1 for each subsequent pattern.
27* `haystack` - The text to search.
28* `bounds` - An optional field whose value is a table with `start` and `end`
29fields, whose values must be valid for the given `haystack`. When set,
30the search will only execute within these bounds. When absent, the bounds
31correspond to `start = 0` and `end = haystack.len()`.
32* `matches` - Zero or more match values. Each match value can be in one of four
33formats:
34 * A simple span, i.e., `[5, 12]`, corresponding to the start and end of the
35 match, in byte offsets. The start is inclusive and the end is exclusive.
36 The pattern ID for the match is assumed to be `0`.
37 * A table corresponding to the matching pattern ID and the span of the
38 match. For example, `{ id = 5, span = [20, 21] }`.
39 * A list of capture group spans, with the first corresponding to the
40 overall match and the pattern ID assumed to be `0`. For example,
41 `[[5, 10], [6, 8], [], [9, 10]]`, where `[]` corresponds to a group
42 present in the regex but one that did not participate in a match.
43 * A table corresponding to the matching pattern ID and a list of spans
44 corresponding to the capture groups. For example,
45 `{ id = 5, spans = [[5, 10], [6, 8], [], [9, 10]] }`. This is the most
46 general, but also most verbose, syntax.
47* `match-limit` - An optional field that specifies a limit on the number of
48matches. When absent, no limit is enforced and all matches should be reported
49by the regex engine. This can be useful, for example, when one only cares about
50the first match.
51* `compiles` - An optional field indicating whether the regex is expected to
52compile. It defaults to `true` when absent. When `true`, if the regex does not
53compile, then the test fails. Conversely, when `false`, if the regex _does_
54compile, then the test fails.
55* `anchored` - Whether to execute an anchored search or not. Note that this is
56not the same as adding a `^` to the beginning of your regex pattern. `^` always
57requires the regex to match at position `0`, but an anchored search simply
58requires that the regex match at the starting position of the search. (The
59starting position of the search can be configured via the optional `bounds`
60field.)
61* `case-insensitive` - Whether to match the regex case insensitively. This is
62disabled by default. There is no real difference between using this field and
63adding a `(?i)` to the beginning of your regex. (Some regex engines may not
64support `(?i)`.)
65* `unescape` - When enabled, the haystack is unescaped. Sequences like `\x00`
66are turned into their corresponding byte values. This permits one to write
67haystacks that contain invalid UTF-8 without embedding actual invalid UTF-8
68into a TOML file (which is not allowed). There is generally no other reason to
69enable `unescape`.
70* `unicode` - When enabled, the regex pattern should be compiled with its
71corresponding Unicode mode enabled. For example, `[^a]` matches any UTF-8
encoding of any codepoint other than `a`. Case insensitivity should be Unicode
73aware. Unicode classes like `\pL` are available. The Perl classes `\w`, `\s`
74and `\d` should be Unicode aware. And so on. This is an optional field and is
75enabled by default.
76* `utf8` - When this is enabled, all regex match substrings should be entirely
77valid UTF-8. While parts of the haystack the regex searches through may not be
78valid UTF-8, only the portions that are valid UTF-8 may be reported in match
79spans. Importantly, this includes zero-width matches. Zero-width matches must
80never split the UTF-8 encoding of a single codepoint when this is enabled. This
81is an optional field and is enabled by default.
82* `line-terminator` - This sets the line terminator used by the multi-line
83assertions `(?m:^)` and `(?m:$)`. It defaults to `\n`. It must be exactly one
84byte. This field is automatically unescaped in order to permit a non-ASCII
85byte.
86* `match-kind` - May be one of `all`, `leftmost-first` or `leftmost-longest`.
87See [`MatchKind`] for more details. This is an optional field and defaults to
88`leftmost-first`.
89* `search-kind` - May be one of `earliest`, `leftmost` or `overlapping`. See
90[`SearchKind`] for more details. This is an optional field and defaults to
91`leftmost`.
92*/
93
94#![deny(missing_docs)]
95
/// For convenience, `anyhow::Error` is used to represent errors in this
/// crate.
98///
99/// For this reason, `anyhow` is a public dependency and is re-exported here.
100pub extern crate anyhow;
101
102use std::{borrow::Borrow, collections::HashSet, fs, path::Path};
103
104use {
105 anyhow::{bail, Context, Result},
106 bstr::{BString, ByteSlice, ByteVec},
107 serde::Deserialize,
108};
109
/// Name of the environment variable that `TestRunner::new` reads to obtain
/// the comma separated list of test name substrings to include or exclude.
const ENV_REGEX_TEST: &str = "REGEX_TEST";
/// Name of the environment variable associated with verbose test reports.
///
/// NOTE(review): the code reading this variable is not visible from here;
/// per the docs on `TestRunner::assert`, setting it to `1` requests a longer
/// report of passed, failed and skipped tests.
const ENV_REGEX_TEST_VERBOSE: &str = "REGEX_TEST_VERBOSE";
112
113/// A collection of regex tests.
114#[derive(Clone, Debug, Deserialize)]
115pub struct RegexTests {
116 /// 'default' permits an empty TOML file.
117 #[serde(default, rename = "test")]
118 tests: Vec<RegexTest>,
119 #[serde(skip)]
120 seen: HashSet<String>,
121}
122
123impl RegexTests {
124 /// Create a new empty collection of glob tests.
125 pub fn new() -> RegexTests {
126 RegexTests { tests: vec![], seen: HashSet::new() }
127 }
128
129 /// Loads all of the tests in the given TOML file. The group name assigned
130 /// to each test is the stem of the file name. For example, if one loads
131 /// `foo/bar.toml`, then the group name for each test will be `bar`.
132 pub fn load<P: AsRef<Path>>(&mut self, path: P) -> Result<()> {
133 let path = path.as_ref();
134 let data = fs::read(path)
135 .with_context(|| format!("failed to read {}", path.display()))?;
136 let group_name = path
137 .file_stem()
138 .with_context(|| {
139 format!("failed to get file name of {}", path.display())
140 })?
141 .to_str()
142 .with_context(|| {
143 format!("invalid UTF-8 found in {}", path.display())
144 })?;
145 self.load_slice(&group_name, &data)
146 .with_context(|| format!("error loading {}", path.display()))?;
147 Ok(())
148 }
149
150 /// Load all of the TOML encoded tests in `data` into this collection.
151 /// The given group name is assigned to all loaded tests.
152 pub fn load_slice(&mut self, group_name: &str, data: &[u8]) -> Result<()> {
153 let data = std::str::from_utf8(&data).with_context(|| {
154 format!("data in {} is not valid UTF-8", group_name)
155 })?;
156 let mut index = 1;
157 let mut tests: RegexTests =
158 toml::from_str(&data).with_context(|| {
159 format!("error decoding TOML for '{}'", group_name)
160 })?;
161 for t in &mut tests.tests {
162 t.group = group_name.to_string();
163 if t.name.is_empty() {
164 t.name = format!("{}", index);
165 index += 1;
166 }
167 t.full_name = format!("{}/{}", t.group, t.name);
168 if t.unescape {
169 t.haystack = BString::from(Vec::unescape_bytes(
170 // OK because TOML requires valid UTF-8.
171 t.haystack.to_str().unwrap(),
172 ));
173 }
174 if t.line_terminator.is_empty() {
175 t.line_terminator = BString::from("\n");
176 } else {
177 t.line_terminator = BString::from(Vec::unescape_bytes(
178 // OK because TOML requires valid UTF-8.
179 t.line_terminator.to_str().unwrap(),
180 ));
181 anyhow::ensure!(
182 t.line_terminator.len() == 1,
183 "line terminator '{:?}' has length not equal to 1",
184 t.line_terminator,
185 );
186 }
187 if self.seen.contains(t.full_name()) {
188 bail!("found duplicate tests for name '{}'", t.full_name());
189 }
190 self.seen.insert(t.full_name().to_string());
191 }
192 self.tests.extend(tests.tests);
193 Ok(())
194 }
195
196 /// Return an iterator over all regex tests that have been loaded. The
197 /// order of the iterator corresponds to the order in which the tests were
198 /// loaded.
199 ///
200 /// This is useful to pass to [`TestRunner::test_iter`].
201 pub fn iter(&self) -> RegexTestsIter {
202 RegexTestsIter(self.tests.iter())
203 }
204}
205
206/// A regex test describes the inputs and expected outputs of a regex match.
207///
208/// Each `RegexTest` represents a single `[[test]]` table in a TOML test file.
209#[derive(Clone, Debug, Deserialize)]
210#[serde(deny_unknown_fields)]
211pub struct RegexTest {
212 #[serde(skip)]
213 group: String,
214 #[serde(default)]
215 name: String,
216 #[serde(skip)]
217 additional_name: String,
218 #[serde(skip)]
219 full_name: String,
220 regex: RegexesFormat,
221 haystack: BString,
222 bounds: Option<Span>,
223 matches: Vec<Captures>,
224 #[serde(rename = "match-limit")]
225 match_limit: Option<usize>,
226 #[serde(default = "default_true")]
227 compiles: bool,
228 #[serde(default)]
229 anchored: bool,
230 #[serde(default, rename = "case-insensitive")]
231 case_insensitive: bool,
232 #[serde(default)]
233 unescape: bool,
234 #[serde(default = "default_true")]
235 unicode: bool,
236 #[serde(default = "default_true")]
237 utf8: bool,
238 #[serde(default, rename = "line-terminator")]
239 line_terminator: BString,
240 #[serde(default, rename = "match-kind")]
241 match_kind: MatchKind,
242 #[serde(default, rename = "search-kind")]
243 search_kind: SearchKind,
244}
245
246impl RegexTest {
247 /// Return the group name of this test.
248 ///
249 /// Usually the group name corresponds to a collection of related
250 /// tests. More specifically, when using [`RegexTests::load`], the
251 /// group name corresponds to the file stem (the file name without the
252 /// `.toml` suffix). Otherwise, the group name is whatever is given to
253 /// [`RegexTests::load_slice`].
254 pub fn group(&self) -> &str {
255 &self.group
256 }
257
258 /// The name of this test.
259 ///
260 /// Note that this is only the name as given in the `[[test]]` block. The
261 /// actual full name used for filtering and reporting can be retrieved with
262 /// [`RegexTest::full_name`].
263 pub fn name(&self) -> &str {
264 &self.name
265 }
266
267 /// The additional name for this test.
268 ///
269 /// This is only non-empty when the test runner was expanded with
270 /// [`TestRunner::expand`].
271 pub fn additional_name(&self) -> &str {
272 &self.additional_name
273 }
274
275 /// The full name of this test, which is formed by joining the group
276 /// name, the test name and the additional name with a `/`.
277 pub fn full_name(&self) -> &str {
278 &self.full_name
279 }
280
281 /// Return all of the regexes that should be matched for this test. This
282 /// slice may be empty!
283 pub fn regexes(&self) -> &[String] {
284 self.regex.patterns()
285 }
286
287 /// Return the bytes on which the regex should be matched.
288 pub fn haystack(&self) -> &[u8] {
289 &self.haystack
290 }
291
292 /// Returns the bounds of a search.
293 ///
294 /// If the test didn't specify any bounds, then the bounds returned are
295 /// equivalent to the entire haystack.
296 pub fn bounds(&self) -> Span {
297 self.bounds.unwrap_or(Span { start: 0, end: self.haystack().len() })
298 }
299
300 /// Returns the limit on the number of matches that should be reported,
301 /// if specified in the test.
302 ///
303 /// This is useful for tests that only want to check for the first
304 /// match. In which case, the match limit is set to 1.
305 ///
306 /// If there is no match limit, then regex engines are expected to report
307 /// all matches.
308 pub fn match_limit(&self) -> Option<usize> {
309 self.match_limit
310 }
311
312 /// Returns true if the regex(es) in this test are expected to compile.
313 pub fn compiles(&self) -> bool {
314 self.compiles
315 }
316
317 /// Whether the regex should perform an anchored search.
318 ///
319 /// This is distinct from putting a `^` in the regex in that `bounds` may
320 /// be specified that permit the regex search to start at a position
321 /// `i > 0`. In which case, enabling anchored mode here requires that any
322 /// matches produced must have a start offset at `i`.
323 pub fn anchored(&self) -> bool {
324 self.anchored
325 }
326
327 /// Returns true if regex matching should be performed without regard to
328 /// case.
329 pub fn case_insensitive(&self) -> bool {
330 self.case_insensitive
331 }
332
333 /// Returns true if regex matching should have Unicode mode enabled.
334 ///
335 /// For example, `[^a]` matches any UTF-8 encoding of any codepoint other
336 /// than `a`. Case insensitivty should be Unicode aware. Unicode classes
337 /// like `\pL` are available. The Perl classes `\w`, `\s` and `\d` should
338 /// be Unicode aware. And so on.
339 ///
340 /// This is enabled by default.
341 pub fn unicode(&self) -> bool {
342 self.unicode
343 }
344
345 /// Returns true if regex matching should exclusively match valid UTF-8.
346 /// When this is disabled, matching on arbitrary bytes is permitted.
347 ///
348 /// When this is enabled, all regex match substrings should be entirely
349 /// valid UTF-8. While parts of the haystack the regex searches through
350 /// may not be valid UTF-8, only the portions that are valid UTF-8 may be
351 /// reported in match spans.
352 ///
353 /// Importantly, this includes zero-width matches. Zero-width matches must
354 /// never split the UTF-8 encoding of a single codepoint when this is
355 /// enabled.
356 ///
357 /// This is enabled by default.
358 pub fn utf8(&self) -> bool {
359 self.utf8
360 }
361
362 /// Returns the line terminator that should be used for the multi-line
363 /// assertions `(?m:^)` and `(?m:$)`.
364 ///
365 /// If it isn't set, then this defaults to `\n`.
366 pub fn line_terminator(&self) -> u8 {
367 self.line_terminator[0]
368 }
369
370 /// Return the match semantics required by this test.
371 pub fn match_kind(&self) -> MatchKind {
372 self.match_kind
373 }
374
375 /// Return the search semantics required by this test.
376 pub fn search_kind(&self) -> SearchKind {
377 self.search_kind
378 }
379
380 /// Run the test and return the result produced by the given compiled
381 /// regex.
382 fn test(&self, regex: &mut CompiledRegex) -> TestResult {
383 match regex.matcher {
384 None => TestResult::skip(),
385 Some(ref mut match_regex) => match_regex(self),
386 }
387 }
388
389 /// Append `/name` to the `full_name` of this test.
390 ///
391 /// This is used to support [`TestRunner::expand`].
392 fn with_additional_name(&self, name: &str) -> RegexTest {
393 let additional_name = name.to_string();
394 let full_name = format!("{}/{}", self.full_name, additional_name);
395 RegexTest { additional_name, full_name, ..self.clone() }
396 }
397
398 /// Returns true if and only if this test expects at least one of the
399 /// regexes to match the haystack.
400 fn is_match(&self) -> bool {
401 !self.matches.is_empty()
402 }
403
404 /// Returns a slice of pattern IDs that are expected to match the haystack.
405 /// The slice is empty if no match is expected to occur. The IDs returned
406 /// are deduplicated and sorted in ascending order.
407 fn which_matches(&self) -> Vec<usize> {
408 let mut seen = HashSet::new();
409 let mut ids = vec![];
410 for cap in self.matches.iter() {
411 if !seen.contains(&cap.id) {
412 seen.insert(cap.id);
413 ids.push(cap.id);
414 }
415 }
416 ids.sort();
417 ids
418 }
419
420 /// Extracts the overall match from each `Captures` match in this test
421 /// and returns it.
422 fn matches(&self) -> Vec<Match> {
423 let mut matches = vec![];
424 for cap in self.matches.iter() {
425 matches.push(cap.to_match());
426 }
427 matches
428 }
429
430 /// Returns the matches expected by this test, including the spans of any
431 /// matching capture groups.
432 fn captures(&self) -> Vec<Captures> {
433 self.matches.clone()
434 }
435}
436
437/// The result of compiling a regex.
438///
439/// In many implementations, the act of matching a regex can be separated from
440/// the act of compiling a regex. A `CompiledRegex` represents a regex that has
441/// been compiled and is ready to be used for matching.
442///
443/// The matching implementation is represented by a closure that accepts a
444/// [`&RegexTest`](RegexTest) and returns a [`TestResult`].
445pub struct CompiledRegex {
446 matcher: Option<Box<dyn FnMut(&RegexTest) -> TestResult + 'static>>,
447}
448
449impl CompiledRegex {
450 /// Provide a closure that represents the compiled regex and executes a
451 /// regex match on any `RegexTest`. The `RegexTest` given to the closure
452 /// provided is the exact same `RegexTest` that is used to compile this
453 /// regex.
454 pub fn compiled(
455 matcher: impl FnMut(&RegexTest) -> TestResult + 'static,
456 ) -> CompiledRegex {
457 CompiledRegex { matcher: Some(Box::new(matcher)) }
458 }
459
460 /// Indicate that tests on this regex should be skipped. This typically
461 /// occurs if the `RegexTest` requires something that an implementation
462 /// does not support.
463 pub fn skip() -> CompiledRegex {
464 CompiledRegex { matcher: None }
465 }
466
467 /// Returns true if the test runner decided to skip the test when
468 /// attempting to compile the regex.
469 pub fn is_skip(&self) -> bool {
470 self.matcher.is_none()
471 }
472}
473
474impl std::fmt::Debug for CompiledRegex {
475 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
476 let status = match self.matcher {
477 None => "Skip",
478 Some(_) => "Run(...)",
479 };
480 f.debug_struct("CompiledRegex").field("matcher", &status).finish()
481 }
482}
483
484/// The result of executing a regex search.
485///
486/// When using the test runner, callers must provide a closure that takes
487/// a `RegexTest` and returns a `TestResult`. The `TestResult` is meant to
488/// capture the results of matching the haystack against the regex specified by
489/// the `RegexTest`.
490///
491/// Usually this consists of one or more matches, which can be constructed via
492/// `TestResult::matches` (for just overall matches) or `TestResult::captures`
493/// (for matches with capture group spans). But the regex engine may also
494/// report whether a match exists, or just whether a pattern matched or not.
495/// That can be done via `TestResult::matched` and `TestResult::which`,
496/// respectively.
497#[derive(Debug, Clone)]
498pub struct TestResult {
499 kind: TestResultKind,
500}
501
502#[derive(Debug, Clone)]
503enum TestResultKind {
504 Match(bool),
505 Which(Vec<usize>),
506 StartEnd(Vec<Match>),
507 Captures(Vec<Captures>),
508 Skip,
509 Fail { why: String },
510}
511
512impl TestResult {
513 /// Create a test result that indicates just whether any match was found
514 /// or not.
515 pub fn matched(yes: bool) -> TestResult {
516 TestResult { kind: TestResultKind::Match(yes) }
517 }
518
519 /// Create a test result that indicates which out of possibly many regexes
520 /// matched the haystack. If `which` is empty, then this is equivalent to
521 /// `TestResult::matched(false)`.
522 ///
523 /// Note that the iterator should consist of pattern IDs, where each
524 /// ID corresponds to a pattern that matches anywhere in the haystack.
525 /// Multiple patterns may match the same region of the haystack. That is,
526 /// this supports overlapping matches.
527 pub fn which<I: IntoIterator<Item = usize>>(it: I) -> TestResult {
528 let mut which: Vec<usize> = it.into_iter().collect();
529 which.sort();
530 TestResult { kind: TestResultKind::Which(which) }
531 }
532
533 /// Create a test result containing a sequence of all matches in the test's
534 /// haystack. This is useful when the regex engine only reports overall
535 /// matches and not the spans of each matching capture group.
536 ///
537 /// If the sequence is empty, then this is equivalent to
538 /// `TestResult::matched(false)`.
539 pub fn matches<I: IntoIterator<Item = Match>>(it: I) -> TestResult {
540 TestResult { kind: TestResultKind::StartEnd(it.into_iter().collect()) }
541 }
542
543 /// Create a test result containing a sequence of all capturing matches in
544 /// the test's haystack. Each match is a `Captures`, and each `Captures`
545 /// should include the spans of all matching capturing groups.
546 ///
547 /// If the sequence is empty, then this is equivalent to
548 /// `TestResult::matched(false)`.
549 pub fn captures<I: IntoIterator<Item = Captures>>(it: I) -> TestResult {
550 TestResult { kind: TestResultKind::Captures(it.into_iter().collect()) }
551 }
552
553 /// Indicate that this test should be skipped. It will not be counted as
554 /// a failure.
555 pub fn skip() -> TestResult {
556 TestResult { kind: TestResultKind::Skip }
557 }
558
559 /// Indicate that this test should be failed for the reason given.
560 ///
561 /// This is useful when a test needs to be failed for reasons that the
562 /// test runner itself cannot check. That is, the test is failed by the
563 /// implementation being tested.
564 pub fn fail(why: &str) -> TestResult {
565 TestResult { kind: TestResultKind::Fail { why: why.to_string() } }
566 }
567}
568
569/// A runner for executing regex tests.
570///
571/// This runner is intended to be used within a Rust unit test, marked with the
572/// `#[test]` attribute.
573///
574/// A test runner is responsible for running tests against a regex
575/// implementation. It contains logic for skipping tests and collects test
576/// results. Typical usage corresponds to calling [`TestRunner::test_iter`] on
577/// an iterator of `RegexTest`s, and then calling `assert` once done. If any
578/// tests failed, then `assert` will panic with an error message containing all
579/// test failures. `assert` must be called before the test completes.
580///
581/// # Skipping tests
582///
583/// If the `REGEX_TEST` environment variable is set, then it may contain
584/// a comma separated list of substrings. Each substring corresponds to a
585/// whitelisted item, unless it starts with a `-`, in which case it corresponds
586/// to a blacklisted item.
587///
588/// If there are any whitelist items, then a test's full name must contain at
589/// least one of the whitelist substrings in order to be run, and does not
590/// contain and blacklist substrings. If there are no whitelist substrings,
591/// then a test is run only when it does not match any blacklist substrings.
592///
593/// The last substring that a test name matches takes precedent.
594///
595/// Callers may also specify explicit whitelist or blacklist substrings using
596/// the corresponding methods on this type, which has the effect of always
597/// having those rules in place for that specific test. For example, if you're
598/// testing a search by building a DFA and then minimizing it, you may want to
599/// skip tests with bigger regexes, since they could take quite some time to
600/// run.
601///
602/// Whitelist and blacklist substrings are matched on the full name of each
603/// test, which typically looks like `group_name/test_name`.
604///
605/// Currently there is no way to escape either a `-` or a `,` in `REGEX_TEST`.
606/// This usually isn't required because test names usually don't include either
607/// character.
608#[derive(Debug)]
609pub struct TestRunner {
610 include: Vec<IncludePattern>,
611 results: RegexTestResults,
612 expanders: Vec<Expander>,
613}
614
615impl TestRunner {
616 /// Create a new runner for executing tests.
617 ///
618 /// The test runner maintains a full list of tests that have succeeded,
619 /// failed or been skipped. Moreover, the test runner may control which
620 /// tests get run via its whitelist and blacklist.
621 ///
622 /// This returns an error if there was a problem reading the `REGEX_TEST`
623 /// environment variable, which may be set to include or exclude tests.
624 /// See the docs on `TestRunner` for its format.
625 pub fn new() -> Result<TestRunner> {
626 let mut runner = TestRunner {
627 include: vec![],
628 results: RegexTestResults::new(),
629 expanders: vec![],
630 };
631 for mut substring in read_env(ENV_REGEX_TEST)?.split(",") {
632 substring = substring.trim();
633 if substring.is_empty() {
634 continue;
635 }
636 if substring.starts_with("-") {
637 runner.blacklist(&substring[1..]);
638 } else {
639 runner.whitelist(substring);
640 }
641 }
642 Ok(runner)
643 }
644
645 /// Assert that all tests run have either passed or have been skipped.
646 ///
647 /// If any tests have failed, then a panic occurs with a report of all
648 /// failures.
649 ///
650 /// If `REGEX_TEST_VERBOSE` is set to `1`, then a longer report of tests
651 /// that passed, failed or skipped is printed.
652 pub fn assert(&mut self) {
653 self.results.assert();
654 }
655
656 /// Whitelist the given substring.
657 ///
658 /// Whitelist and blacklist rules are only applied when
659 /// [`TestRunner::test_iter`] is called.
660 pub fn whitelist(&mut self, substring: &str) -> &mut TestRunner {
661 self.include.push(IncludePattern {
662 blacklist: false,
663 substring: BString::from(substring),
664 });
665 self
666 }
667
668 /// Whitelist the given iterator substrings.
669 ///
670 /// This is a convenience routine for calling `whitelist` on each of the
671 /// substrings in the iterator provided.
672 ///
673 /// Whitelist and blacklist rules are only applied when
674 /// [`TestRunner::test_iter`] is called.
675 pub fn whitelist_iter<I, S>(&mut self, substrings: I) -> &mut TestRunner
676 where
677 I: IntoIterator<Item = S>,
678 S: AsRef<str>,
679 {
680 for substring in substrings {
681 self.whitelist(substring.as_ref());
682 }
683 self
684 }
685
686 /// Blacklist the given substring.
687 ///
688 /// A blacklisted test is never run, unless a whitelisted substring added
689 /// after the blacklisted substring matches it.
690 ///
691 /// Whitelist and blacklist rules are only applied when
692 /// [`TestRunner::test_iter`] is called.
693 pub fn blacklist(&mut self, substring: &str) -> &mut TestRunner {
694 self.include.push(IncludePattern {
695 blacklist: true,
696 substring: BString::from(substring),
697 });
698 self
699 }
700
701 /// Blacklist the given iterator substrings.
702 ///
703 /// A blacklisted test is never run, unless a whitelisted substring added
704 /// after the blacklisted substring matches it.
705 ///
706 /// This is a convenience routine for calling `blacklist` on each of the
707 /// substrings in the iterator provided.
708 ///
709 /// Whitelist and blacklist rules are only applied when
710 /// [`TestRunner::test_iter`] is called.
711 pub fn blacklist_iter<I, S>(&mut self, substrings: I) -> &mut TestRunner
712 where
713 I: IntoIterator<Item = S>,
714 S: AsRef<str>,
715 {
716 for substring in substrings {
717 self.blacklist(substring.as_ref());
718 }
719 self
720 }
721
722 /// Set an expansion predicate that appends each entry in
723 /// `additional_names` to the end the name for every test that `predicate`
724 /// returns true. Moreover, the corresponding additional name is made
725 /// available via [`RegexTest::additional_name`].
726 ///
727 /// This permits implementors to create multiple copies of each test, and
728 /// then do specifically different tasks with each, while making it so each
729 /// test is distinct.
730 ///
731 /// For example, you might write something like this:
732 ///
733 /// ```ignore
734 /// TestRunner::new()?
735 /// .expand(&["is_match", "find"], |t| t.compiles())
736 /// .test_iter(tests, compiler)
737 /// .assert()
738 /// ```
739 ///
740 /// where each test that is expected to have a regex compile gets copied
741 /// with `/is_match` and `/find` appends to the end of its name. Then, in
742 /// your own test runner, you can inspect [`RegexTest::additional_name`] to
743 /// decide what to do. In the case of `is_match`, you might test your regex
744 /// engines "has a match" API, which might exercise different logic than
745 /// your "find where the matches are" API.
746 pub fn expand<S: AsRef<str>>(
747 &mut self,
748 additional_names: &[S],
749 predicate: impl FnMut(&RegexTest) -> bool + 'static,
750 ) -> &mut TestRunner {
751 self.expanders.push(Expander {
752 predicate: Box::new(predicate),
753 additional_names: additional_names
754 .iter()
755 .map(|s| s.as_ref().to_string())
756 .collect(),
757 });
758 self
759 }
760
761 /// Run all of the given tests using the given regex compiler.
762 ///
763 /// The compiler given is a closure that accepts a
764 /// [`&RegexTest`](RegexTest) and a sequence of patterns, and returns (if
765 /// successful) a [`CompiledRegex`] which can execute a search.
766 ///
767 /// Note that if there are test failures, this merely _collects_ them. Use
768 /// [`TestRunner::assert`] to fail the current test by panicking if there
769 /// any failures.
770 ///
771 /// Typically, one provides [`RegexTests::iter`] as the iterator of
772 /// `RegexTest` values.
773 pub fn test_iter<I, T>(
774 &mut self,
775 it: I,
776 mut compile: impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex>,
777 ) -> &mut TestRunner
778 where
779 I: IntoIterator<Item = T>,
780 T: Borrow<RegexTest>,
781 {
782 for test in it {
783 let test = test.borrow();
784 let mut additional = vec![];
785 for expander in &mut self.expanders {
786 if (expander.predicate)(test) {
787 for name in expander.additional_names.iter() {
788 additional.push(test.with_additional_name(name));
789 }
790 break;
791 }
792 }
793 if additional.is_empty() {
794 additional.push(test.to_owned());
795 }
796 for test in &additional {
797 if self.should_skip(test) {
798 self.results.skip(test);
799 continue;
800 }
801 self.test(test, |regexes| compile(test, regexes));
802 }
803 }
804 self
805 }
806
807 /// Run a single test.
808 ///
809 /// This records the result of running the test in this runner. This does
810 /// not fail the test immediately if the given regex test fails. Instead,
811 /// this is only done when the `assert` method is called.
812 ///
813 /// Note that using this method bypasses any whitelist or blacklist applied
814 /// to this runner. Whitelisted (and blacklisted) substrings are only
815 /// applied when using `test_iter`.
816 pub fn test(
817 &mut self,
818 test: &RegexTest,
819 mut compile: impl FnMut(&[String]) -> Result<CompiledRegex>,
820 ) -> &mut TestRunner {
821 let mut compiled = match safe(|| compile(test.regexes())) {
822 Err(msg) => {
823 // Regex tests should never panic. It's auto-fail if they do.
824 self.results.fail(
825 test,
826 RegexTestFailureKind::UnexpectedPanicCompile(msg),
827 );
828 return self;
829 }
830 Ok(Ok(compiled)) => compiled,
831 Ok(Err(err)) => {
832 if !test.compiles() {
833 self.results.pass(test);
834 } else {
835 self.results.fail(
836 test,
837 RegexTestFailureKind::CompileError { err },
838 );
839 }
840 return self;
841 }
842 };
843 // We fail the test if we didn't expect the regex to compile. However,
844 // it's possible the caller decided to skip the test when attempting
845 // to compile the regex, so we check for that. If the compiled regex
846 // is marked as skipped, then 'test.test(..)' below handles it
847 // correctly.
848 if !test.compiles() && !compiled.is_skip() {
849 self.results.fail(test, RegexTestFailureKind::NoCompileError);
850 return self;
851 }
852 let result = match safe(|| test.test(&mut compiled)) {
853 Ok(result) => result,
854 Err(msg) => {
855 self.results.fail(
856 test,
857 RegexTestFailureKind::UnexpectedPanicSearch(msg),
858 );
859 return self;
860 }
861 };
862 match result.kind {
863 TestResultKind::Match(yes) => {
864 if yes == test.is_match() {
865 self.results.pass(test);
866 } else {
867 self.results.fail(test, RegexTestFailureKind::IsMatch);
868 }
869 }
870 TestResultKind::Which(which) => {
871 if which != test.which_matches() {
872 self.results
873 .fail(test, RegexTestFailureKind::Many { got: which });
874 } else {
875 self.results.pass(test);
876 }
877 }
878 TestResultKind::StartEnd(matches) => {
879 let expected = test.matches();
880 if expected != matches {
881 self.results.fail(
882 test,
883 RegexTestFailureKind::StartEnd { got: matches },
884 );
885 } else {
886 self.results.pass(test);
887 }
888 }
889 TestResultKind::Captures(caps) => {
890 let expected = test.captures();
891 if expected != caps {
892 self.results.fail(
893 test,
894 RegexTestFailureKind::Captures { got: caps },
895 );
896 } else {
897 self.results.pass(test);
898 }
899 }
900 TestResultKind::Skip => {
901 self.results.skip(test);
902 }
903 TestResultKind::Fail { why } => {
904 self.results
905 .fail(test, RegexTestFailureKind::UserFailure { why });
906 }
907 }
908 self
909 }
910
911 /// Return true if and only if the given test should be skipped.
912 fn should_skip(&self, test: &RegexTest) -> bool {
913 if self.include.is_empty() {
914 return false;
915 }
916
917 // If we don't have any whitelist patterns, then the test will be run
918 // unless it is blacklisted. Otherwise, if there are whitelist
919 // patterns, then the test must match at least one of them.
920 let mut skip = self.include.iter().any(|pat| !pat.blacklist);
921 for pat in &self.include {
922 if test.full_name().as_bytes().contains_str(&pat.substring) {
923 skip = pat.blacklist;
924 }
925 }
926 skip
927 }
928}
929
/// A substring filter that is matched against the full name of a test.
#[derive(Debug)]
struct IncludePattern {
    /// When true, a test whose full name contains `substring` is skipped.
    /// When false, this is a whitelist pattern and a matching test is run.
    blacklist: bool,
    /// The bytes to look for inside a test's full name.
    substring: BString,
}
935
/// Pairs a predicate over tests with a list of additional names.
struct Expander {
    /// Decides whether this expander applies to a given test.
    predicate: Box<dyn FnMut(&RegexTest) -> bool>,
    /// The additional names associated with tests accepted by `predicate`.
    additional_names: Vec<String>,
}
940
941impl std::fmt::Debug for Expander {
942 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
943 f.debug_struct("Expander")
944 .field("predicate", &"<FnMut(..)>")
945 .field("additional_names", &self.additional_names)
946 .finish()
947 }
948}
949
/// A collection of test results, corresponding to passed, skipped and failed
/// tests.
#[derive(Debug)]
struct RegexTestResults {
    /// Tests recorded as passed.
    pass: Vec<RegexTestResult>,
    /// Tests recorded as failed, each with the reason for the failure.
    fail: Vec<RegexTestFailure>,
    /// Tests recorded as skipped.
    skip: Vec<RegexTestResult>,
}
958
/// A test that passed or skipped, along with its specific result.
#[derive(Debug)]
struct RegexTestResult {
    /// A copy of the test this result belongs to.
    test: RegexTest,
}
964
/// A test that failed along with the reason why.
#[derive(Debug)]
struct RegexTestFailure {
    /// A copy of the test that failed.
    test: RegexTest,
    /// Why the test failed.
    kind: RegexTestFailureKind,
}
971
/// Describes the nature of the failed test.
///
/// Variants that carry a `got` field hold what the regex implementation
/// under test actually reported.
#[derive(Debug)]
enum RegexTestFailureKind {
    /// UserFailure indicates that the test failed because the test function
    /// explicitly failed it for the reason in the message given.
    UserFailure { why: String },
    /// This occurs when the test expected a match (or didn't expect a match),
    /// but the actual regex implementation didn't match (or did match).
    IsMatch,
    /// This occurs when a set of regexes is tested, and the matching regexes
    /// returned by the regex implementation don't match the expected matching
    /// regexes. This error contains the indices of the regexes that matched.
    Many { got: Vec<usize> },
    /// This occurs when a single regex is used to find all non-overlapping
    /// matches in a haystack, where the result did not match what was
    /// expected. This reports the incorrect matches returned by the regex
    /// implementation under test.
    StartEnd { got: Vec<Match> },
    /// Like StartEnd, but for capturing groups.
    Captures { got: Vec<Captures> },
    /// This occurs when the test expected the regex to fail to compile, but it
    /// compiled successfully.
    NoCompileError,
    /// This occurs when the test expected the regex to compile successfully,
    /// but it failed to compile.
    CompileError { err: anyhow::Error },
    /// While compiling, a panic occurred. If possible, the panic message
    /// is captured.
    UnexpectedPanicCompile(String),
    /// While searching, a panic occurred. If possible, the panic message
    /// is captured.
    UnexpectedPanicSearch(String),
}
1005
1006impl RegexTestResults {
1007 fn new() -> RegexTestResults {
1008 RegexTestResults { pass: vec![], fail: vec![], skip: vec![] }
1009 }
1010
1011 fn pass(&mut self, test: &RegexTest) {
1012 self.pass.push(RegexTestResult { test: test.clone() });
1013 }
1014
1015 fn fail(&mut self, test: &RegexTest, kind: RegexTestFailureKind) {
1016 self.fail.push(RegexTestFailure { test: test.clone(), kind });
1017 }
1018
1019 fn skip(&mut self, test: &RegexTest) {
1020 self.skip.push(RegexTestResult { test: test.clone() });
1021 }
1022
1023 fn assert(&self) {
1024 if read_env(ENV_REGEX_TEST_VERBOSE).map_or(false, |s| s == "1") {
1025 self.verbose();
1026 }
1027 if self.fail.is_empty() {
1028 return;
1029 }
1030 let failures = self
1031 .fail
1032 .iter()
1033 .map(|f| f.to_string())
1034 .collect::<Vec<String>>()
1035 .join("\n\n");
1036 panic!(
1037 "found {} failures:\n{}\n{}\n{}\n\n\
1038 Set the REGEX_TEST environment variable to filter tests, \n\
1039 e.g., REGEX_TEST=foo,-foo2 runs every test whose name contains \n\
1040 foo but not foo2\n\n",
1041 self.fail.len(),
1042 "~".repeat(79),
1043 failures.trim(),
1044 "~".repeat(79),
1045 )
1046 }
1047
1048 fn verbose(&self) {
1049 println!("{}", "~".repeat(79));
1050 for t in &self.skip {
1051 println!("skip: {}", t.full_name());
1052 }
1053 for t in &self.pass {
1054 println!("pass: {}", t.full_name());
1055 }
1056 for t in &self.fail {
1057 println!("FAIL: {}", t.test.full_name());
1058 }
1059 println!(
1060 "\npassed: {}, skipped: {}, failed: {}",
1061 self.pass.len(),
1062 self.skip.len(),
1063 self.fail.len()
1064 );
1065 println!("{}", "~".repeat(79));
1066 }
1067}
1068
1069impl RegexTestResult {
1070 fn full_name(&self) -> String {
1071 self.test.full_name().to_string()
1072 }
1073}
1074
1075impl RegexTestFailure {
1076 fn full_name(&self) -> String {
1077 self.test.full_name().to_string()
1078 }
1079}
1080
1081impl std::fmt::Display for RegexTestFailure {
1082 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1083 write!(
1084 f,
1085 "{}: {}\n\
1086 pattern: {:?}\n\
1087 haystack: {:?}",
1088 self.full_name(),
1089 self.kind.fmt(&self.test)?,
1090 self.test.regexes(),
1091 self.test.haystack().as_bstr(),
1092 )?;
1093 Ok(())
1094 }
1095}
1096
1097impl RegexTestFailureKind {
1098 fn fmt(&self, test: &RegexTest) -> Result<String, std::fmt::Error> {
1099 use std::fmt::Write;
1100
1101 let mut buf = String::new();
1102 match *self {
1103 RegexTestFailureKind::UserFailure { ref why } => {
1104 write!(buf, "failed by implementor because: {}", why)?;
1105 }
1106 RegexTestFailureKind::IsMatch => {
1107 if test.is_match() {
1108 write!(buf, "expected match, but none found")?;
1109 } else {
1110 write!(buf, "expected no match, but found a match")?;
1111 }
1112 }
1113 RegexTestFailureKind::Many { ref got } => {
1114 write!(
1115 buf,
1116 "expected regexes {:?} to match, but found {:?}",
1117 test.which_matches(),
1118 got
1119 )?;
1120 }
1121 RegexTestFailureKind::StartEnd { ref got } => {
1122 write!(
1123 buf,
1124 "did not find expected matches\n\
1125 expected: {:?}\n \
1126 got: {:?}",
1127 test.matches(),
1128 got,
1129 )?;
1130 }
1131 RegexTestFailureKind::Captures { ref got } => {
1132 write!(
1133 buf,
1134 "expected to find {:?} captures, but got {:?}",
1135 test.captures(),
1136 got,
1137 )?;
1138 }
1139 RegexTestFailureKind::NoCompileError => {
1140 write!(buf, "expected regex to NOT compile, but it did")?;
1141 }
1142 RegexTestFailureKind::CompileError { ref err } => {
1143 write!(buf, "expected regex to compile, failed: {}", err)?;
1144 }
1145 RegexTestFailureKind::UnexpectedPanicCompile(ref msg) => {
1146 write!(buf, "got unexpected panic while compiling:\n{}", msg)?;
1147 }
1148 RegexTestFailureKind::UnexpectedPanicSearch(ref msg) => {
1149 write!(buf, "got unexpected panic while searching:\n{}", msg)?;
1150 }
1151 }
1152 Ok(buf)
1153 }
1154}
1155
/// An iterator over regex tests.
///
/// This iterator is created by the [`RegexTests::iter`] method.
///
/// It wraps a slice iterator and yields borrowed tests in slice order.
#[derive(Debug)]
pub struct RegexTestsIter<'a>(std::slice::Iter<'a, RegexTest>);
1161
1162impl<'a> Iterator for RegexTestsIter<'a> {
1163 type Item = &'a RegexTest;
1164
1165 fn next(&mut self) -> Option<&'a RegexTest> {
1166 self.0.next()
1167 }
1168}
1169
/// Represents either a single regex or a list of regexes in a TOML.
///
/// The representation is untagged, so the shape of the TOML value itself
/// (a string or an array of strings) selects the variant.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
#[serde(untagged)]
enum RegexesFormat {
    /// Exactly one pattern, written as a plain string.
    Single(String),
    /// A (possibly empty) list of patterns, written as an array of strings.
    Many(Vec<String>),
}
1177
1178impl RegexesFormat {
1179 fn patterns(&self) -> &[String] {
1180 match *self {
1181 RegexesFormat::Single(ref pat) => std::slice::from_ref(pat),
1182 RegexesFormat::Many(ref pats) => pats,
1183 }
1184 }
1185}
1186
/// Captures represents a single group of captured matches from a regex search.
///
/// There is always at least 1 group, and the first group is always present and
/// corresponds to the overall match.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
#[serde(try_from = "CapturesFormat")]
pub struct Captures {
    /// The ID of the regex that matched.
    ///
    /// The ID is the index of the regex provided to the regex compiler,
    /// starting from `0`. In the case of a single regex search, the only
    /// possible ID is `0`.
    id: usize,
    /// The capturing groups that matched, along with the match offsets for
    /// each. The first group should always be non-None, as it corresponds to
    /// the overall match.
    ///
    /// This should either have length 1 (when no capturing group offsets are
    /// included in the test result) or it should have length equal to the
    /// number of capturing groups in the regex pattern.
    groups: Vec<Option<Span>>,
}
1209
1210impl Captures {
1211 /// Create a new set of captures for a single match of a regex.
1212 ///
1213 /// If available, iterator should provide items for every capturing group
1214 /// in the regex, including the 0th capturing group corresponding to the
1215 /// entire match. At minimum, the 0th capturing group should be provided.
1216 ///
1217 /// If a capturing group did not participate in the match, then a `None`
1218 /// value should be used. (The 0th capturing group should never be `None`.)
1219 ///
1220 /// If the iterator yields no elements or the first group is `None`, then
1221 /// this returns an error.
1222 ///
1223 /// The `id` should be the ID of the pattern that matched. This is always
1224 /// `0` for single-pattern regexes. Otherwise, the ID of a pattern starts
1225 /// at `0` and is incremented by 1 for each subsequent pattern.
1226 ///
1227 /// Note that there are possibly more convenient and infallible `From`
1228 /// impls for converting a `Match` or a `Span` into a `Captures`.
1229 pub fn new<I: IntoIterator<Item = Option<Span>>>(
1230 id: usize,
1231 it: I,
1232 ) -> Result<Captures> {
1233 let groups: Vec<Option<Span>> = it.into_iter().collect();
1234 if groups.is_empty() {
1235 bail!("captures must contain at least one group");
1236 } else if groups[0].is_none() {
1237 bail!("first group (index 0) of captures must be non-None");
1238 }
1239 Ok(Captures { id, groups })
1240 }
1241
1242 /// Returns the ID of the pattern that matched.
1243 ///
1244 /// For any single pattern regexes, this should always be zero.
1245 pub fn id(&self) -> usize {
1246 self.id
1247 }
1248
1249 /// Returns a slice of the underlying spans, each group corresponding to
1250 /// the (possibly) matched span. The first group in the slice returned
1251 /// is guaranteed to correspond to the overall match span and is thus
1252 /// non-`None`. All other groups may be `None`. Similarly, the slice is
1253 /// guaranteed to have length at least 1.
1254 pub fn groups(&self) -> &[Option<Span>] {
1255 &self.groups
1256 }
1257
1258 /// Returns the number of groups (including the first) in these captures.
1259 ///
1260 /// The length returned is guaranteed to be greater than zero.
1261 pub fn len(&self) -> usize {
1262 self.groups.len()
1263 }
1264
1265 /// Returns the overall match, including the pattern ID, for this group
1266 /// of captures.
1267 pub fn to_match(&self) -> Match {
1268 Match { id: self.id(), span: self.to_span() }
1269 }
1270
1271 /// Returns the overall match span for this group of captures.
1272 pub fn to_span(&self) -> Span {
1273 // This is OK because a Captures value must always have at least one
1274 // group where the first group always corresponds to match offsets.
1275 self.groups[0].unwrap()
1276 }
1277}
1278
1279/// Converts a plain `Match` to a `Captures` value, where the match corresponds
1280/// to the first and only group in `Captures`.
1281impl From<Match> for Captures {
1282 fn from(m: Match) -> Captures {
1283 Captures { id: m.id, groups: vec![Some(m.span)] }
1284 }
1285}
1286
1287/// Converts a plain `Span` to a `Captures` value, where the span corresponds to
1288/// the first and only group in `Captures`. Since a `Span` does not contain a
1289/// pattern ID, the pattern ID used in this conversion is always `0`.
1290impl From<Span> for Captures {
1291 fn from(sp: Span) -> Captures {
1292 Captures { id: 0, groups: vec![Some(sp)] }
1293 }
1294}
1295
/// Represents the actual 'captures' key format more faithfully such that
/// Serde can deserialize it.
///
/// Namely, we need a way to represent a 'None' value inside a TOML array, and
/// TOML has no 'null' value. So we make '[]' be 'None', and we use 'MaybeSpan'
/// to recognize it.
///
/// The representation is untagged: Serde tries the variants in the order
/// they are declared and keeps the first one that deserializes.
#[derive(Deserialize)]
#[serde(untagged)]
enum CapturesFormat {
    /// A bare span, e.g., `[5, 12]`.
    Span([usize; 2]),
    /// A table with a pattern ID and a single span.
    Match { id: usize, span: [usize; 2] },
    /// A list of possibly absent spans, one per group.
    Spans(Vec<MaybeSpan>),
    /// A table with a pattern ID and a list of possibly absent spans.
    Captures { id: usize, spans: Vec<MaybeSpan> },
}
1310
1311impl TryFrom<CapturesFormat> for Captures {
1312 type Error = anyhow::Error;
1313
1314 fn try_from(data: CapturesFormat) -> Result<Captures> {
1315 match data {
1316 CapturesFormat::Span([start, end]) => {
1317 Ok(Captures { id: 0, groups: vec![Some(Span { start, end })] })
1318 }
1319 CapturesFormat::Match { id, span: [start, end] } => {
1320 Ok(Captures { id, groups: vec![Some(Span { start, end })] })
1321 }
1322 CapturesFormat::Spans(spans) => {
1323 Captures::new(0, spans.into_iter().map(|s| s.into_option()))
1324 }
1325 CapturesFormat::Captures { id, spans } => {
1326 Captures::new(id, spans.into_iter().map(|s| s.into_option()))
1327 }
1328 }
1329 }
1330}
1331
/// A single match, consisting of the pattern that matched and its span.
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct Match {
    /// The ID of the pattern that matched.
    ///
    /// This is always `0` for single-pattern regexes. Otherwise, patterns
    /// start at `0` and count upwards in increments of `1`.
    pub id: usize,
    /// The span of the overall match.
    pub span: Span,
}
1343
1344impl std::fmt::Debug for Match {
1345 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1346 write!(f, "Match({:?}: {:?})", self.id, self.span)
1347 }
1348}
1349
/// A span of contiguous bytes, from start to end, represented via byte
/// offsets.
///
/// The range is inclusive at the beginning and exclusive at the end.
#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
pub struct Span {
    /// The starting byte offset of the match.
    pub start: usize,
    /// The ending byte offset of the match.
    pub end: usize,
}
1361
1362impl std::fmt::Debug for Span {
1363 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1364 write!(f, "{:?}..{:?}", self.start, self.end)
1365 }
1366}
1367
/// Represents a single span, either present or empty.
///
/// An empty span is spelled `[]` in TOML, and a present span is spelled `[m,
/// n]`.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
#[serde(untagged)]
enum MaybeSpan {
    /// The empty array, standing for an absent span.
    None([usize; 0]),
    /// A two element array holding the start and end offsets, in that order.
    Some([usize; 2]),
}
1378
1379impl MaybeSpan {
1380 /// Converts this TOML representation of a possibly absent span to a proper
1381 /// `Option<Span>`.
1382 fn into_option(self) -> Option<Span> {
1383 match self {
1384 MaybeSpan::None(_) => None,
1385 MaybeSpan::Some([start, end]) => Some(Span { start, end }),
1386 }
1387 }
1388}
1389
/// The match semantics to use for a search.
///
/// When not specified in a test, the default is `MatchKind::LeftmostFirst`.
///
/// Variant names are spelled in kebab-case when deserialized.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum MatchKind {
    /// All possible matches should be reported.
    ///
    /// Usually this makes it impossible for non-greedy repetition operators
    /// to exist. That is, they behave as greedy repetition operators.
    All,
    /// Report only the leftmost match. When there are multiple leftmost
    /// matches that start at the same position, prefer the one that comes
    /// "first" in the pattern. For example, `sam|samwise` matches `sam` in
    /// `samwise`.
    ///
    /// This typically corresponds to the semantics implemented by backtracking
    /// engines.
    LeftmostFirst,
    /// Report only the leftmost match. When there are multiple leftmost
    /// matches that start at the same position, prefer the longest match.
    /// For example, `sam|samwise` matches `samwise` in `samwise`.
    ///
    /// This typically corresponds to the semantics implemented by POSIX
    /// engines.
    LeftmostLongest,
}
1417
1418impl Default for MatchKind {
1419 fn default() -> MatchKind {
1420 MatchKind::LeftmostFirst
1421 }
1422}
1423
/// Represents the type of search to perform.
///
/// When not specified in a test, the default is `SearchKind::Leftmost`.
///
/// Variant names are spelled in kebab-case when deserialized.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum SearchKind {
    /// Report matches as soon as they are found.
    ///
    /// This is somewhat tricky to test, as this semantic is specified in terms
    /// of whatever the regex engine can do. For example, an automata oriented
    /// engine might be able to report a match earlier than a backtracking
    /// engine.
    Earliest,
    /// A standard leftmost search, returning either the leftmost-first or
    /// leftmost-longest match. Generally speaking, it doesn't make sense to
    /// use this type of search in combination with [`MatchKind::All`].
    Leftmost,
    /// Return all possible matches, including ones that overlap. Typically
    /// this search kind is used in combination with [`MatchKind::All`].
    Overlapping,
}
1445
1446impl Default for SearchKind {
1447 fn default() -> SearchKind {
1448 SearchKind::Leftmost
1449 }
1450}
1451
1452/// Read the environment variable given. If it doesn't exist, then return an
1453/// empty string. Otherwise, check that it is valid UTF-8. If it isn't, return
1454/// a useful error message.
1455fn read_env(var: &str) -> Result<String> {
1456 let val = match std::env::var_os(var) {
1457 None => return Ok("".to_string()),
1458 Some(val) => val,
1459 };
1460 let val = val.into_string().map_err(|os| {
1461 anyhow::anyhow!(
1462 "invalid UTF-8 in env var {}={:?}",
1463 var,
1464 Vec::from_os_str_lossy(&os)
1465 )
1466 })?;
1467 Ok(val)
1468}
1469
/// Call `fun` and turn a panic raised by it into an `Err` holding the
/// panic message.
///
/// Panic payloads of type `&str` and `String` are returned as the message.
/// For any other payload a fixed, generic message is used instead.
///
/// The test runner relies on this so that a bug triggered by one test
/// does not prevent the remaining tests from running.
fn safe<T, F>(fun: F) -> Result<T, String>
where
    F: FnOnce() -> T,
{
    use std::panic::{catch_unwind, AssertUnwindSafe};

    let payload = match catch_unwind(AssertUnwindSafe(fun)) {
        Ok(value) => return Ok(value),
        Err(payload) => payload,
    };
    // panic and assert produce either a &str or a String payload.
    let msg = payload
        .downcast_ref::<&str>()
        .map(|s| (*s).to_owned())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "UNABLE TO SHOW RESULT OF PANIC.".to_owned());
    Err(msg)
}
1494
/// Always returns `true`.
///
/// Serde takes the default for a field as a path to a function, so this
/// exists to give boolean fields a default of 'true'.
fn default_true() -> bool {
    true
}
1500
#[cfg(test)]
mod tests {
    use super::*;

    /// Loads `data` as the TOML of a group called "test" and returns the
    /// resulting tests. Panics if loading fails.
    fn load(data: &str) -> RegexTests {
        let mut tests = RegexTests::new();
        tests.load_slice("test", data.as_bytes()).unwrap();
        tests
    }

    /// Reports whether loading `data` as the TOML of a group called "test"
    /// is rejected.
    fn fails_to_load(data: &str) -> bool {
        let mut tests = RegexTests::new();
        let rejected = tests.load_slice("test", data.as_bytes()).is_err();
        rejected
    }

    // A test without a 'regex' key is rejected.
    #[test]
    fn err_no_regexes() {
        let data = r#"
[[test]]
name = "foo"
haystack = "lib.rs"
matches = true
case-insensitive = true
"#;

        assert!(fails_to_load(data));
    }

    // A key that is not part of the format is rejected.
    #[test]
    fn err_unknown_field() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = true
something = 0
"#;

        assert!(fails_to_load(data));
    }

    // A test without a 'matches' key is rejected.
    #[test]
    fn err_no_matches() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
"#;

        assert!(fails_to_load(data));
    }

    // Every explicitly given option is visible through its accessor.
    #[test]
    fn load_match() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [[0, 6]]
compiles = false
anchored = true
case-insensitive = true
unicode = false
utf8 = false
"#;

        let tests = load(data);

        let t0 = &tests.tests[0];
        assert_eq!("test", t0.group());
        assert_eq!("foo", t0.name());
        assert_eq!("test/foo", t0.full_name());
        assert_eq!(&[".*.rs"], t0.regexes());
        assert!(t0.is_match());
        assert_eq!(vec![0], t0.which_matches());

        assert!(!t0.compiles());
        assert!(t0.anchored());
        assert!(t0.case_insensitive());
        assert!(!t0.unicode());
        assert!(!t0.utf8());
    }

    // Pattern IDs of table style matches are reported by 'which_matches',
    // and options that are left out take their defaults.
    #[test]
    fn load_which_matches() {
        let data = r#"
[[test]]
name = "foo"
regex = [".*.rs", ".*.toml"]
haystack = "lib.rs"
matches = [
 { id = 0, spans = [[0, 0]] },
 { id = 2, spans = [[0, 0]] },
 { id = 5, spans = [[0, 0]] },
]
"#;

        let tests = load(data);

        let t0 = &tests.tests[0];
        assert_eq!(&[".*.rs", ".*.toml"], t0.regexes());
        assert!(t0.is_match());
        assert_eq!(vec![0, 2, 5], t0.which_matches());

        assert!(t0.compiles());
        assert!(!t0.anchored());
        assert!(!t0.case_insensitive());
        assert!(t0.unicode());
        assert!(t0.utf8());
    }

    // Bare spans load as matches of pattern 0 with a single group each.
    #[test]
    fn load_spans() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [[0, 2], [5, 10]]
"#;

        let tests = load(data);

        let first = Span { start: 0, end: 2 };
        let second = Span { start: 5, end: 10 };
        let t0 = &tests.tests[0];
        assert_eq!(t0.regexes(), &[".*.rs"]);
        assert!(t0.is_match());
        assert_eq!(t0.which_matches(), &[0]);
        assert_eq!(
            t0.matches(),
            vec![Match { id: 0, span: first }, Match { id: 0, span: second }]
        );
        assert_eq!(
            t0.captures(),
            vec![
                Captures::new(0, vec![Some(first)]).unwrap(),
                Captures::new(0, vec![Some(second)]).unwrap(),
            ]
        );
    }

    // Lists of spans load as capture groups, where '[]' is an absent
    // group and the first span of each list is the overall match.
    #[test]
    fn load_capture_spans() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 [[0, 15], [5, 10], [], [13, 14]],
 [[20, 30], [22, 24], [25, 27], []],
]
"#;

        let tests = load(data);

        let span = |start: usize, end: usize| Some(Span { start, end });
        let t0 = &tests.tests[0];
        assert_eq!(t0.regexes(), &[".*.rs"]);
        assert!(t0.is_match());
        assert_eq!(t0.which_matches(), &[0]);
        assert_eq!(
            t0.matches(),
            vec![
                Match { id: 0, span: Span { start: 0, end: 15 } },
                Match { id: 0, span: Span { start: 20, end: 30 } },
            ]
        );
        assert_eq!(
            t0.captures(),
            vec![
                Captures::new(
                    0,
                    vec![span(0, 15), span(5, 10), None, span(13, 14)]
                )
                .unwrap(),
                Captures::new(
                    0,
                    vec![span(20, 30), span(22, 24), span(25, 27), None]
                )
                .unwrap(),
            ]
        );
    }

    // A match that is an empty list of spans is rejected.
    #[test]
    fn fail_spans_empty1() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 [],
]
"#;

        assert!(fails_to_load(data));
    }

    // A match whose only span is absent is rejected.
    #[test]
    fn fail_spans_empty2() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 [[]],
]
"#;

        assert!(fails_to_load(data));
    }

    // A match whose first span is absent is rejected, even when later
    // spans are present.
    #[test]
    fn fail_spans_empty3() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 [[], [0, 2]],
]
"#;

        assert!(fails_to_load(data));
    }

    // Same as 'fail_spans_empty1', but using the table form.
    #[test]
    fn fail_captures_empty1() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 { id = 0, spans = [] },
]
"#;

        assert!(fails_to_load(data));
    }

    // Same as 'fail_spans_empty2', but using the table form.
    #[test]
    fn fail_captures_empty2() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 { id = 0, spans = [[]] },
]
"#;

        assert!(fails_to_load(data));
    }

    // Same as 'fail_spans_empty3', but using the table form.
    #[test]
    fn fail_captures_empty3() {
        let data = r#"
[[test]]
name = "foo"
regex = ".*.rs"
haystack = "lib.rs"
matches = [
 { id = 0, spans = [[], [0, 2]] },
]
"#;

        assert!(fails_to_load(data));
    }
}