1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
use icu_collections::codepointinvlist::CodePointInversionListBuilder;
use crate::{
character_class::CharacterClass,
op_nothing::Nothing,
op_unambiguous_repeat::UnambiguousRepeat,
operation::{
Operation, OperationControl, MATCHES_ZLS_ANYWHERE, MATCHES_ZLS_AT_END,
MATCHES_ZLS_AT_START, MATCHES_ZLS_NEVER,
},
re_compiler::ReCompiler,
re_flags::ReFlags,
re_matcher::{CaptureState, ReMatcher},
};
// A sequence of multiple pieces in a regular expression.
#[derive(Debug, Clone)]
pub(crate) struct Sequence {
pub(crate) operations: Vec<Operation>,
}
impl Sequence {
pub(crate) fn new(operations: Vec<Operation>) -> Self {
Self { operations }
}
}
impl OperationControl for Sequence {
fn get_match_length(&self) -> Option<usize> {
self.operations
.iter()
.try_fold(0, |acc, op| op.get_match_length().map(|len| acc + len))
}
fn get_minimum_match_length(&self) -> usize {
self.operations
.iter()
.fold(0, |acc, op| acc + op.get_minimum_match_length())
}
fn get_initial_character_class(&self, case_blind: bool) -> CharacterClass {
let mut builder = CodePointInversionListBuilder::new();
for o in &self.operations {
let cc = o.get_initial_character_class(case_blind);
builder.add_set(cc.as_code_point_inversion_list());
if o.matches_empty_string() == MATCHES_ZLS_NEVER {
break;
}
}
CharacterClass::new(builder.build())
}
fn optimize(self, flags: &ReFlags) -> Operation {
match self.operations.len() {
0 => Operation::from(Nothing),
1 => self.operations.into_iter().next().unwrap(),
_ => {
let l = self.operations.len();
// cannot avoid clone here as we are referring to the operations
let operations = self
.operations
.iter()
.cloned()
.enumerate()
.map(|(i, o)| {
let optimized = o.optimize(flags);
// we can never further optimize the last operation,
// as it has no adjacent operation
if i == l - 1 {
return optimized;
}
// now if this is a repeat operation, we may be able
// to optimize it further
if let Some(repeat_operation) = optimized.repeat_operation() {
// if the repeated operation is an atom or a charclass,
// we may be able to make an optimization
let repeated_operation = repeat_operation.child().clone();
if matches!(
repeated_operation,
Operation::Atom(_) | Operation::CharClass(_)
) {
if repeat_operation.min() == repeat_operation.max() {
// we can optimize this to an unambiguous repeat
return Operation::from(UnambiguousRepeat::new(
repeated_operation.clone(),
repeat_operation.min(),
repeat_operation.max(),
));
}
// get the adjacent operation
let next_operation = &self.operations[i + 1];
if ReCompiler::no_ambiguity(
&repeated_operation,
next_operation,
flags.is_case_independent(),
!repeat_operation.greedy(),
) {
return Operation::from(UnambiguousRepeat::new(
repeated_operation.clone(),
repeat_operation.min(),
repeat_operation.max(),
));
}
}
}
optimized
})
.collect();
Operation::from(Sequence { operations })
}
}
}
fn matches_empty_string(&self) -> u32 {
// The operation matches empty anywhere if every suboperation matches
// empty anywhere
let mut matches_empty_anywhere = true;
for operation in &self.operations {
let m = operation.matches_empty_string();
if m == MATCHES_ZLS_NEVER {
return MATCHES_ZLS_NEVER;
}
if m != MATCHES_ZLS_ANYWHERE {
matches_empty_anywhere = false;
break;
}
}
if matches_empty_anywhere {
return MATCHES_ZLS_ANYWHERE;
}
// The operation matches BOL if every suboperation matches BOL (which
// includes the case of matching empty anywhere)
let mut matches_bol = true;
for operation in &self.operations {
if (operation.matches_empty_string() & MATCHES_ZLS_AT_START) == 0 {
matches_bol = false;
break;
}
}
if matches_bol {
return MATCHES_ZLS_AT_START;
}
// The operation matches EOL if every suboperation matches EOL (which
// includes the case of matching empty anywhere)
let mut matches_eol = true;
for operation in &self.operations {
if (operation.matches_empty_string() & MATCHES_ZLS_AT_END) == 0 {
matches_eol = false;
break;
}
}
if matches_eol {
return MATCHES_ZLS_AT_END;
}
0
}
fn contains_capturing_expressions(&self) -> bool {
for o in &self.operations {
if (matches!(o, Operation::Capture(_)) || o.contains_capturing_expressions()) {
return true;
}
}
false
}
fn matches_iter<'a>(
&'a self,
matcher: &'a ReMatcher<'a>,
position: usize,
) -> Box<dyn Iterator<Item = usize> + 'a> {
Box::new(SequenceIterator::new(
matcher,
&self.operations,
position,
self.contains_capturing_expressions(),
))
}
fn children(&self) -> Vec<Operation> {
self.operations.clone()
}
}
struct SequenceIterator<'a> {
iterators: Vec<Box<dyn Iterator<Item = usize> + 'a>>,
operations: &'a [Operation],
backtracking_limit: Option<usize>,
matcher: &'a ReMatcher<'a>,
saved_state: Option<CaptureState>,
}
impl<'a> SequenceIterator<'a> {
fn new(
matcher: &'a ReMatcher<'a>,
operations: &'a [Operation],
position: usize,
contains_capturing_expressions: bool,
) -> Self {
let saved_state = if contains_capturing_expressions {
Some(matcher.capture_state())
} else {
None
};
Self {
iterators: vec![operations.first().unwrap().matches_iter(matcher, position)],
operations,
backtracking_limit: matcher.program.backtracking_limit,
matcher,
saved_state,
}
}
}
impl Iterator for SequenceIterator<'_> {
type Item = usize;
// Advance the current iterator if possible. Get the first match for all
// subsequent iterators in the sequence. If we get all the way to the end
// of the sequence, return the position in the input string that we have
// reached. If we don't get all the way to the end of the sequence, work
// backwards getting the next match for each term in the sequence until we
// find a route through.
fn next(&mut self) -> Option<Self::Item> {
let mut counter = 0;
// as long as there are iterators on the stack
while !self.iterators.is_empty() {
loop {
// take the top of the stack
let top = self.iterators.last_mut().unwrap();
// take the next item from the top iterator
if let Some(next) = top.next() {
self.matcher.clear_captured_groups_beyond(next);
// if the amount of iterators to process is equal or
// greater than the amount of operations in this sequence,
// then we return next
let i = self.iterators.len();
if i >= self.operations.len() {
return Some(next);
}
// otherwise we push a new iterator to the stack
let new_top = self.operations[i].matches_iter(self.matcher, next);
self.iterators.push(new_top);
} else {
// continue until we have no more next in the top
break;
}
}
// we are backtracking. pop the iterator from the stack
self.iterators.pop();
if let Some(backtracking_limit) = self.backtracking_limit {
if counter > backtracking_limit {
// TODO: error
// Regex backtracking exceeded processing
todo!()
}
}
counter += 1;
}
// restore saved state
if let Some(saved_state) = &self.saved_state {
self.matcher.reset_state(saved_state.clone());
}
None
}
}