1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
use super::{AcceptingState, StateIdx, NFA};
use crate::collections::Set;
use crate::dfa::simulate::simulate_right_ctx;
use crate::dfa::StateIdx as DfaStateIdx;
use crate::right_ctx::RightCtxDFAs;
pub type Matches<'input, A> = Vec<(&'input str, A)>;
pub type ErrorLoc = usize;
impl<A: std::fmt::Debug + Copy> NFA<A> {
pub fn simulate<'input>(
&self,
input: &'input str,
right_ctx_dfas: &RightCtxDFAs<DfaStateIdx>,
) -> (Matches<'input, A>, Option<ErrorLoc>) {
let mut values: Matches<'input, A> = vec![];
// If we skipped an accepting state because we were able to make progress with the next
// character, this state holds the previous match. If we get stuck we return this match.
//
// This implements backtracking in regexes like:
//
// - aaaaaab
// - a
//
// in an input like "aaaa".
let mut last_match: Option<(usize, A, usize)> = None;
let mut states: Set<StateIdx> = Default::default();
states.insert(StateIdx(0));
states = self.compute_state_closure(&states);
let mut char_indices = input.char_indices();
// Where the current match starts
let mut match_start: usize = 0;
// Index of current character in input string
let mut char_idx;
'outer: loop {
while let Some((char_idx_, char)) = char_indices.next() {
char_idx = match_start + char_idx_;
states = next(self, &states, char);
// When stuck check if we skipped an accepting state
if states.is_empty() {
match last_match.take() {
None => {
// We're stuck and can't backtrack, raise an error
return (values, Some(match_start));
}
Some((last_match_start, last_match_value, last_match_end)) => {
// Backtrack to the previous accepting state
match_start = last_match_end;
char_indices = input[match_start..].char_indices();
// Accept the previous match
values
.push((&input[last_match_start..last_match_end], last_match_value));
// Restart state machine
states.insert(StateIdx(0));
states = self.compute_state_closure(&states);
}
}
} else {
// Check for accepting states. Sort states to pick the one that comes first in
// the program.
let mut states_sorted: Vec<StateIdx> = states.iter().copied().collect();
states_sorted.sort();
for state in states_sorted {
if let Some(AcceptingState { value, right_ctx }) =
&self.states[state.0].accepting
{
match right_ctx {
None => {
last_match =
Some((match_start, *value, char_idx + char.len_utf8()));
break;
}
Some(right_ctx_idx) => {
let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx);
if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) {
last_match =
Some((match_start, *value, char_idx + char.len_utf8()));
break;
}
}
}
}
}
}
}
// Reached EOF, take EOF transitions, check for accepting states
states = next_end_of_input(self, &states);
{
let mut states_sorted: Vec<StateIdx> = states.iter().copied().collect();
states_sorted.sort();
for state in states_sorted {
if let Some(AcceptingState { value, right_ctx }) =
&self.states[state.0].accepting
{
match right_ctx {
None => {
values.push((&input[match_start..], *value));
break 'outer;
}
Some(right_ctx_idx) => {
let right_ctx_dfa = right_ctx_dfas.get(right_ctx_idx);
if simulate_right_ctx(right_ctx_dfa, char_indices.clone()) {
values.push((&input[match_start..], *value));
break 'outer;
}
}
}
}
}
}
// Reached EOF but cannot accept input, backtrack if possible, otherwise raise an error
match last_match.take() {
Some((last_match_start, last_match_value, last_match_end)) => {
values.push((&input[last_match_start..last_match_end], last_match_value));
if last_match_end == input.len() {
break 'outer;
} else {
// Backtrack
match_start = last_match_end;
char_indices = input[match_start..].char_indices();
// Restart state machine
states.insert(StateIdx(0));
states = self.compute_state_closure(&states);
}
}
None => {
// We're stuck and can't backtrack, raise an error
return (values, Some(match_start));
}
}
}
(values, None)
}
}
fn next<A>(nfa: &NFA<A>, states: &Set<StateIdx>, char: char) -> Set<StateIdx> {
let mut next_states: Set<StateIdx> = Default::default();
for state in states {
// Char transitions
if let Some(char_nexts) = nfa.states[state.0].char_transitions.get(&char) {
next_states.extend(char_nexts.iter());
}
// Range transitions
for range in nfa.states[state.0].range_transitions.iter() {
if char as u32 >= range.start && char as u32 <= range.end {
next_states.extend(range.value.clone());
}
}
// Any transitions
next_states.extend(nfa.states[state.0].any_transitions.iter().copied());
}
nfa.compute_state_closure(&next_states)
}
fn next_end_of_input<A>(nfa: &NFA<A>, states: &Set<StateIdx>) -> Set<StateIdx> {
let mut next_states: Set<StateIdx> = Default::default();
for state in states {
next_states.extend(nfa.states[state.0].end_of_input_transitions.iter().copied());
}
nfa.compute_state_closure(&next_states)
}