1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# These tests are specifically written to test the regex-lite crate. While it
# largely has the same semantics as the regex crate, there are some differences
# around Unicode support and UTF-8.
#
# To be clear, regex-lite supports far fewer patterns because of its lack of
# Unicode support, nested character classes and character class set operations.
# What we're talking about here are the patterns that both crates support but
# where the semantics might differ.
# regex-lite uses ASCII definitions for Perl character classes, so '\d' must
# not match a non-ASCII digit: the expected match list is empty.
[[test]]
name = "perl-class-decimal"
regex = '\d'
haystack = '᠕'
matches = []
unicode = true
# regex-lite uses ASCII definitions for Perl character classes, so '\s' must
# not match U+2000, a whitespace codepoint outside of ASCII: the expected
# match list is empty.
[[test]]
name = "perl-class-space"
regex = '\s'
haystack = "\u2000"
matches = []
unicode = true
# regex-lite uses ASCII definitions for Perl character classes, so '\w' must
# not match a non-ASCII letter: the expected match list is empty.
[[test]]
name = "perl-class-word"
regex = '\w'
haystack = 'δ'
matches = []
unicode = true
# regex-lite uses the ASCII definition of word for word boundary assertions.
# Since 'δ' is not an ASCII word character, the haystack contains no word
# boundary and the expected match list is empty.
[[test]]
name = "word-boundary"
regex = '\b'
haystack = 'δ'
matches = []
unicode = true
# regex-lite uses the ASCII definition of word for negated word boundary
# assertions. But note that it should still not split codepoints! The only
# expected matches are the empty spans at byte offsets 0 and 2, i.e. before
# and after the haystack's single codepoint, never in the middle of it.
[[test]]
name = "word-boundary-negated"
regex = '\B'
haystack = 'δ'
matches = [[0, 0], [2, 2]]
unicode = true
# While we're here, the empty regex---which matches at every
# position---shouldn't split a codepoint either. The only expected matches
# are the empty spans at byte offsets 0 and 4, on either side of the
# haystack's single codepoint.
[[test]]
name = "empty-no-split-codepoint"
regex = ''
haystack = '💩'
matches = [[0, 0], [4, 4]]
unicode = true
# A dot always matches a full codepoint: the single expected match spans
# byte offsets 0 to 4, covering the whole haystack rather than one byte.
[[test]]
name = "dot-always-matches-codepoint"
regex = '.'
haystack = '💩'
matches = [[0, 4]]
unicode = false
# A negated character class also always matches a full codepoint: the single
# expected match spans byte offsets 0 to 4, covering the whole haystack
# rather than one byte.
[[test]]
name = "negated-class-always-matches-codepoint"
regex = '[^a]'
haystack = '💩'
matches = [[0, 4]]
unicode = false
# regex-lite only supports ASCII-aware case insensitive matching, so 's' must
# not match 'ſ' (U+017F) and the expected match list is empty.
# NOTE(review): the two trailing booleans are assumed to be `unicode` and
# `case-insensitive`; confirm the key names against the test harness schema.
[[test]]
name = "case-insensitive-is-ascii-only"
regex = 's'
haystack = 'ſ'
matches = []
unicode = true
case-insensitive = true
# Negated word boundaries shouldn't split a codepoint, but they will match
# between invalid UTF-8.
#
# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
# regex-lite. This can't happen in the main API because &str can't contain
# invalid UTF-8.
# [[test]]
# name = "word-boundary-invalid-utf8"
# regex = '\B'
# haystack = '\xFF\xFF\xFF\xFF'
# unescape = true
# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
# unicode = true
# utf8 = false