proptest 0.8.5

Hypothesis-like property-based testing and shrinking.
Documentation
//-
// Copyright 2017 Jason Lingle
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Strategies for generating strings and byte strings from regular
//! expressions.

use core::mem;
use core::fmt;
use core::ops::RangeInclusive;
use core::u32;
use std_facade::{Cow, Box, String, Vec, ToOwned};

use regex_syntax::{Parser, Error as ParseError};
use regex_syntax::hir::{
    self, Hir, HirKind::*, Literal::*,
    RepetitionKind::{self, *}, RepetitionRange::*
};

use bool;
use char;
use collection::{vec, size_range, SizeRange};
use strategy::*;
use test_runner::*;

/// Wraps the regex that forms the `Strategy` for `String` so that a sensible
/// `Default` can be given. The default is a string of non-control characters.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct StringParam(&'static str);

impl From<StringParam> for &'static str {
    fn from(x: StringParam) -> Self { x.0 }
}

impl From<&'static str> for StringParam {
    fn from(x: &'static str) -> Self { StringParam(x) }
}

impl Default for StringParam {
    fn default() -> Self {
        StringParam("\\PC*")
    }
}

// quick_error! uses bare trait objects, so we enclose its invocation here in a
// module so the lint can be disabled just for it.
#[allow(bare_trait_objects)]
mod error_container {
    use super::*;

    quick_error! {
        /// Errors which may occur when preparing a regular expression for use with
        /// string generation.
        #[derive(Debug)]
        pub enum Error {
            /// The string passed as the regex was not syntactically valid.
            RegexSyntax(err: ParseError) {
                from()
                    cause(err)
                    description(err.description())
                    display("{}", err)
            }
            /// The regex was syntactically valid, but contains elements not
            /// supported by proptest.
            UnsupportedRegex(message: &'static str) {
                description(message)
            }
        }
    }
}

pub use self::error_container::Error;

opaque_strategy_wrapper! {
    /// Strategy which generates values (i.e., `String` or `Vec<u8>`) matching
    /// a regular expression.
    ///
    /// Created by various functions in this module.
    #[derive(Debug)]
    pub struct RegexGeneratorStrategy[<T>][where T : fmt::Debug]
        (SBoxedStrategy<T>) -> RegexGeneratorValueTree<T>;
    /// `ValueTree` corresponding to `RegexGeneratorStrategy`.
    pub struct RegexGeneratorValueTree[<T>][where T : fmt::Debug]
        (Box<dyn ValueTree<Value = T>>) -> T;
}

impl Strategy for str {
    type Tree = RegexGeneratorValueTree<String>;
    type Value = String;

    fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
        string_regex(self).unwrap().new_tree(runner)
    }
}

type ParseResult<T> = Result<RegexGeneratorStrategy<T>, Error>;

/// Creates a strategy which generates strings matching the given regular
/// expression.
///
/// If you don't need error handling and aren't limited by setup time, it is
/// also possible to directly use a `&str` as a strategy with the same effect.
pub fn string_regex(regex: &str) -> ParseResult<String> {
    string_regex_parsed(&regex_to_hir(regex)?)
}

/// Like `string_regex()`, but allows providing a pre-parsed expression.
pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
    bytes_regex_parsed(expr).map(
        |v| v.prop_map(|bytes| String::from_utf8(bytes).expect(
            "non-utf8 string")).sboxed()).map(RegexGeneratorStrategy)
}

/// Creates a strategy which generates byte strings matching the given regular
/// expression.
pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
    bytes_regex_parsed(&regex_to_hir(regex)?)
}

/// Like `bytes_regex()`, but allows providing a pre-parsed expression.
pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> {
    match expr.kind() {
        Empty => Ok(Just(vec![]).sboxed()),

        Literal(lit) => Ok(Just(match lit {
            Unicode(scalar) => to_bytes(*scalar),
            Byte(byte) => vec![*byte],
        }).sboxed()),

        Class(class) => Ok(match class {
            hir::Class::Unicode(class) =>
                unicode_class_strategy(class).prop_map(to_bytes).sboxed(),
            hir::Class::Bytes(class) => {
                let subs = class.iter().map(|r| r.start() ..= r.end());
                Union::new(subs).prop_map(|b| vec![b]).sboxed()
            }
        }),

        Repetition(rep) => Ok(
            vec(bytes_regex_parsed(&rep.hir)?, to_range(rep.kind.clone())?)
                .prop_map(|parts| parts.into_iter().fold(
                   vec![], |mut acc, child| { acc.extend(child); acc }))
                .sboxed()
        ),

        Group(group) => bytes_regex_parsed(&group.hir).map(|v| v.0),

        Concat(subs) => {
            let subs = ConcatIter { iter: subs.iter(), buf: vec![], next: None };
            let ext = |(mut lhs, rhs): (Vec<_>, _)| {
                lhs.extend(rhs);
                lhs
            };
            Ok(subs.fold(Ok(None), |accum: Result<_, Error>, rhs| Ok(match accum? {
                None => Some(rhs?.sboxed()),
                Some(accum) => Some((accum, rhs?).prop_map(ext).sboxed()),
            }))?.unwrap_or_else(|| Just(vec![]).sboxed()))
        },

        Alternation(subs) =>
            Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed()),

        Anchor(_) =>
            unsupported("line/text anchors not supported for string generation"),

        WordBoundary(_) =>
            unsupported("word boundary tests not supported for string generation"),
    }.map(RegexGeneratorStrategy)
}

fn unicode_class_strategy(class: &hir::ClassUnicode) -> char::CharStrategy<'static> {
    static NONL_RANGES: &[RangeInclusive<char>] = &[
        '\x00'..='\x09',
        // Multiple instances of the latter range to partially make up
        // for the bias of having such a tiny range in the control
        // characters.
        '\x0B'..=::core::char::MAX,
        '\x0B'..=::core::char::MAX,
        '\x0B'..=::core::char::MAX,
        '\x0B'..=::core::char::MAX,
        '\x0B'..=::core::char::MAX,
    ];

    let dotnnl = |x: &hir::ClassUnicodeRange, y: &hir::ClassUnicodeRange|
        x.start() == '\0' && x.end() == '\x09' &&
        y.start() == '\x0B' && y.end() == '\u{10FFFF}';

    char::ranges(match class.ranges() {
        [x, y] if dotnnl(x, y) || dotnnl(y, x) => Cow::Borrowed(NONL_RANGES),
        _ => Cow::Owned(class.iter().map(|r| r.start() ..= r.end()).collect()),
    })
}

struct ConcatIter<'a, I> {
    buf: Vec<u8>,
    iter: I,
    next: Option<&'a Hir>,
}

fn flush_lit_buf<I>(it: &mut ConcatIter<'_, I>) -> Option<ParseResult<Vec<u8>>> {
    Some(Ok(RegexGeneratorStrategy(
        Just(mem::replace(&mut it.buf, vec![])).sboxed()
    )))
}

impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> {
    type Item = ParseResult<Vec<u8>>;

    fn next(&mut self) -> Option<Self::Item> {
        // A left-over node, process it first:
        if let Some(next) = self.next.take() {
            return Some(bytes_regex_parsed(next));
        }

        // Accumulate a literal sequence as long as we can:
        while let Some(next) = self.iter.next() {
            match next.kind() {
                // A literal. Accumulate:
                Literal(Unicode(scalar)) => self.buf.extend(to_bytes(*scalar)),
                Literal(Byte(byte)) => self.buf.push(*byte),
                // Ecountered a non-literal.
                _ => return if !self.buf.is_empty() {
                    // We've accumulated a literal from before, flush it out.
                    // Store this node so we deal with it the next call.
                    self.next = Some(next);
                    flush_lit_buf(self)
                } else {
                    // We didn't; just yield this node.
                    Some(bytes_regex_parsed(next))
                },
            }
        }

        // Flush out any accumulated literal from before.
        if !self.buf.is_empty() {
            flush_lit_buf(self)
        } else {
            self.next.take().map(bytes_regex_parsed)
        }
    }
}

fn to_range(kind: RepetitionKind) -> Result<SizeRange, Error> {
    Ok(match kind {
        ZeroOrOne => size_range(0..=1),
        ZeroOrMore => size_range(0..=32),
        OneOrMore => size_range(1..=32),
        Range(range) => match range {
            Exactly(count) if u32::MAX == count =>
                return unsupported("Cannot have repetition of exactly u32::MAX"),
            Exactly(count) => size_range(count as usize),
            AtLeast(min) => {
                let max = if min < u32::MAX as u32 / 2 {
                    min as usize * 2
                } else {
                    u32::MAX as usize
                };
                size_range((min as usize)..max)
            },
            Bounded(_, max) if u32::MAX == max =>
                return unsupported("Cannot have repetition max of u32::MAX"),
            Bounded(min, max) =>
                size_range((min as usize)..(max as usize + 1))
        }
    })
}

fn to_bytes(khar: char) -> Vec<u8> {
    let mut buf = [0u8; 4];
    khar.encode_utf8(&mut buf).as_bytes().to_owned()
}

fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
    Ok(Parser::new().parse(pattern)?)
}

fn unsupported<T>(error: &'static str) -> Result<T, Error> {
    Err(Error::UnsupportedRegex(error))
}

#[cfg(test)]
mod test {
    use std::collections::HashSet;

    use regex::Regex;

    use super::*;

    fn do_test(pattern: &str, min_distinct: usize, max_distinct: usize,
               iterations: usize) {
        let generated = generate_values_matching_regex(pattern, iterations);
        assert!(generated.len() >= min_distinct,
                "Expected to generate at least {} strings, but only \
                 generated {}", min_distinct, generated.len());
        assert!(generated.len() <= max_distinct,
                "Expected to generate at most {} strings, but \
                 generated {}", max_distinct, generated.len());
    }

    fn generate_values_matching_regex(pattern: &str, iterations: usize) -> HashSet<String> {
        let rx = Regex::new(pattern).unwrap();
        let mut generated = HashSet::new();

        let strategy = string_regex(pattern).unwrap();
        let mut runner = TestRunner::default();
        for _ in 0..iterations {
            let mut value = strategy.new_tree(&mut runner).unwrap();

            loop {
                let s = value.current();
                let ok = if let Some(matsch) = rx.find(&s) {
                    0 == matsch.start() && s.len() == matsch.end()
                } else {
                    false
                };
                if !ok {
                    panic!("Generated string {:?} which does not match {:?}",
                           s, pattern);
                }

                generated.insert(s);

                if !value.simplify() { break; }
            }
        }
        generated
    }

    #[test]
    fn test_case_insensitive_produces_all_available_values() {
        let mut expected: HashSet<String> = HashSet::new();
        expected.insert("a".into());
        expected.insert("b".into());
        expected.insert("A".into());
        expected.insert("B".into());
        assert_eq!(generate_values_matching_regex("(?i:a|B)", 64), expected);
    }

    #[test]
    fn test_literal() {
        do_test("foo", 1, 1, 8);
    }

    #[test]
    fn test_casei_literal() {
        do_test("(?i:fOo)", 8, 8, 64);
    }

    #[test]
    fn test_alternation() {
        do_test("foo|bar|baz", 3, 3, 16);
    }

    #[test]
    fn test_repitition() {
        do_test("a{0,8}", 9, 9, 64);
    }

    #[test]
    fn test_question() {
        do_test("a?", 2, 2, 16);
    }

    #[test]
    fn test_star() {
        do_test("a*", 33, 33, 256);
    }

    #[test]
    fn test_plus() {
        do_test("a+", 32, 32, 256);
    }

    #[test]
    fn test_n_to_range() {
        do_test("a{4,}", 4, 4, 64);
    }

    #[test]
    fn test_concatenation() {
        do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
    }

    #[test]
    fn test_ascii_class() {
        do_test("[[:digit:]]", 10, 10, 256);
    }

    #[test]
    fn test_unicode_class() {
        do_test("\\p{Greek}", 24, 512, 256);
    }

    #[test]
    fn test_dot() {
        do_test(".", 200, 65536, 256);
    }

    #[test]
    fn test_dot_s() {
        do_test("(?s).", 200, 65536, 256);
    }

    #[test]
    fn test_backslash_d_plus() {
        do_test("\\d+", 1, 65536, 256);
    }

    fn assert_send_and_sync<T : Send + Sync>(_: T) { }

    #[test]
    fn regex_strategy_is_send_and_sync() {
        assert_send_and_sync(string_regex(".").unwrap());
    }

    macro_rules! consistent {
        ($name:ident, $value:expr) => {
            #[test]
            fn $name() {
                test_generates_matching_strings($value);
            }
        }
    }

    fn test_generates_matching_strings(pattern: &str) {
        use std::time;

        let mut runner = TestRunner::default();
        let start = time::Instant::now();

        // If we don't support this regex, just move on quietly
        if let Ok(strategy) = string_regex(pattern) {
            let rx = Regex::new(pattern).unwrap();

            for _ in 0..1000 {
                let mut val = strategy.new_tree(&mut runner).unwrap();
                // No more than 1000 simplify steps to keep test time down
                for _ in 0..1000 {
                    let s = val.current();
                    assert!(rx.is_match(&s),
                            "Produced string {:?}, which does not match {:?}",
                            s, pattern);

                    if !val.simplify() { break; }
                }

                // Quietly stop testing if we've run for >10 s
                if start.elapsed().as_secs() > 10 { break; }
            }
        }
    }

    include!("regex-contrib/crates_regex.rs");
}