1use std::{fmt, iter::Peekable, str::Chars};
7
8mod ast;
9mod compile;
10mod haystack;
11mod matches;
12mod stream;
13mod vm;
14
15pub use haystack::Haystack;
16pub use matches::Match;
17pub use stream::{CachingStream, CachingStreamIter};
18pub use vm::{Regex, RevRegex};
19
20#[derive(Debug, Clone, PartialEq, Eq)]
22pub enum Error {
23 EmptyParens,
25 EmptyRegex,
27 InvalidClass,
29 InvalidEscape(char),
31 InvalidRepetition,
33 ReTooLong,
35 UnbalancedAlt,
37 UnbalancedParens,
39 UnclosedGroupName(String),
41 UnknownGroupQualifier(char),
43}
44
45impl std::error::Error for Error {}
46
47impl fmt::Display for Error {
48 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
49 match self {
50 Self::EmptyParens => write!(f, "empty parens"),
51 Self::EmptyRegex => write!(f, "empty regular expression"),
52 Self::InvalidClass => write!(f, "invalid class"),
53 Self::InvalidEscape(c) => write!(f, "invalid escaped character {c:?}"),
54 Self::InvalidRepetition => write!(f, "invalid repetition"),
55 Self::ReTooLong => write!(f, "regex too long"),
56 Self::UnbalancedAlt => write!(f, "alternation had no right hand side"),
57 Self::UnbalancedParens => write!(f, "unbalanced parens"),
58 Self::UnclosedGroupName(s) => write!(f, "unclosed group name {s:?}"),
59 Self::UnknownGroupQualifier(c) => write!(f, "unknown group qualifier {c:?}"),
60 }
61 }
62}
63
64const fn char_ix(ch: char) -> usize {
66 ((ch as u16) & 0xFF) as usize
67}
68
69const fn init_escapes() -> [Option<char>; 256] {
70 macro_rules! escape {
71 ($escapes:expr, $($ch:expr),+) => {
72 $($escapes[char_ix($ch)] = Some($ch);)+
73 };
74 ($escapes:expr, $($ch:expr => $esc:expr),+) => {
75 $($escapes[char_ix($ch)] = Some($esc);)+
76 };
77 }
78
79 let mut escapes = [None; 256];
80 escape!(
81 escapes, '*', '+', '?', '.', '@', '(', ')', '[', ']', '{', '}', '|'
82 );
83 escape!(escapes, '\\', '\'', '"', '^', '$', '-');
84 escape!(escapes, 'b', 'B', 'd', 'D', 'w', 'W', 's', 'S');
85 escape!(escapes, 'n'=>'\n', 'r'=>'\r', 't'=>'\t');
86
87 escapes
88}
89
90const ESCAPES: [Option<char>; 256] = init_escapes();
92
93#[derive(Debug, Clone, PartialEq, Eq)]
94struct CharClass {
95 pub(crate) negated: bool,
96 pub(crate) chars: Vec<char>,
97 pub(crate) ranges: Vec<(char, char)>,
98}
99
100impl CharClass {
101 fn try_parse(it: &mut Peekable<Chars<'_>>) -> Result<Self, Error> {
102 let mut next = || next_char(it)?.ok_or(Error::InvalidClass);
103 let (mut ch, _) = next()?;
104
105 let negated = ch == '^';
106 if negated {
107 (ch, _) = next()?
108 };
109 let mut chars = vec![ch];
110 let mut ranges = vec![];
111
112 loop {
113 let (ch, escaped) = next()?;
114 match ch {
115 ']' if !escaped => break,
116
117 '-' if !escaped => {
118 let start = chars.pop().ok_or(Error::InvalidClass)?;
119 let (end, _) = next()?;
120 if start as u32 >= end as u32 {
121 return Err(Error::InvalidClass);
122 }
123
124 ranges.push((start, end));
125 }
126
127 ch => chars.push(ch),
128 }
129 }
130
131 Ok(Self {
132 negated,
133 chars,
134 ranges,
135 })
136 }
137
138 #[inline]
140 fn matches(&self, ch: char) -> bool {
141 if self.negated && ch == '\n' {
142 return false;
143 }
144
145 let res = self.chars.contains(&ch)
146 || self
147 .ranges
148 .iter()
149 .any(|&(start, end)| ch >= start && ch <= end);
150
151 if self.negated { !res } else { res }
152 }
153
154 fn size(&self) -> usize {
155 self.chars.len()
156 + self
157 .ranges
158 .iter()
159 .map(|(s, e)| (*e as u32 - *s as u32 + 1) as usize)
160 .sum::<usize>()
161 }
162}
163
164fn next_char(it: &mut Peekable<Chars<'_>>) -> Result<Option<(char, bool)>, Error> {
165 match it.next() {
166 Some('\\') => (),
167 Some(ch) => return Ok(Some((ch, false))),
168 None => return Ok(None),
169 }
170
171 let ch = match it.next() {
172 Some(ch) => ch,
173 None => return Err(Error::InvalidEscape('\0')),
174 };
175
176 match ESCAPES[char_ix(ch)] {
177 Some(ch) => Ok(Some((ch, true))),
178 None => Err(Error::InvalidEscape(ch)),
179 }
180}
181
182mod impl_structex {
183 use super::*;
184 use crate::buffer::{Buffer, GapBuffer, Slice};
185 use std::{io, ops::Range};
186 use structex::re::{Haystack, RawCaptures, RegexEngine, Sliceable, Writable};
187
188 impl RegexEngine for Regex {
189 type CompileError = Error;
190
191 fn compile(re: &str) -> Result<Self, Self::CompileError> {
192 Regex::compile(re)
193 }
194 }
195
196 impl Haystack<Regex> for str {
197 fn is_match_between(&self, re: &Regex, from: usize, to: usize) -> bool {
198 re.matches_between(&self, from, to)
199 }
200
201 fn captures_between(&self, re: &Regex, from: usize, to: usize) -> Option<RawCaptures> {
202 let m = re.find_between(&self, from, to)?;
203
204 Some(RawCaptures::new(m.iter_locs()))
205 }
206 }
207
208 impl Haystack<Regex> for GapBuffer {
209 fn is_match_between(&self, re: &Regex, from: usize, to: usize) -> bool {
210 re.matches_between(self, from, to)
211 }
212
213 fn captures_between(&self, re: &Regex, from: usize, to: usize) -> Option<RawCaptures> {
214 let m = re.find_between(self, from, to)?;
215
216 Some(RawCaptures::new(m.iter_locs()))
217 }
218 }
219
220 impl Sliceable for GapBuffer {
221 type Slice<'h>
222 = Slice<'h>
223 where
224 Self: 'h;
225
226 fn char_at(&self, byte_offset: usize) -> Option<char> {
227 self.get_char_at(byte_offset)
228 }
229
230 fn slice(&self, range: Range<usize>) -> Self::Slice<'_> {
231 self.slice_from_byte_offsets(range.start, range.end)
232 }
233
234 fn max_len(&self) -> usize {
235 self.len()
236 }
237 }
238
239 impl Writable for GapBuffer {
240 fn write_to<W>(&self, w: &mut W) -> io::Result<usize>
241 where
242 W: io::Write,
243 {
244 let (l, r) = self.as_byte_slices();
245 w.write_all(l)?;
246 w.write_all(r)?;
247
248 Ok(self.len())
249 }
250 }
251
252 impl<'a> Writable for Slice<'a> {
253 fn write_to<W>(&self, w: &mut W) -> io::Result<usize>
254 where
255 W: std::io::Write,
256 {
257 let (l, r) = self.as_slices();
258 w.write_all(l)?;
259 w.write_all(r)?;
260
261 Ok(l.len() + r.len())
262 }
263 }
264
265 impl Haystack<Regex> for Buffer {
266 fn is_match_between(&self, re: &Regex, from: usize, to: usize) -> bool {
267 re.matches_between(self, from, to)
268 }
269
270 fn captures_between(&self, re: &Regex, from: usize, to: usize) -> Option<RawCaptures> {
271 let m = re.find_between(self, from, to)?;
272
273 Some(RawCaptures::new(m.iter_locs()))
274 }
275 }
276
277 impl Sliceable for Buffer {
278 type Slice<'h>
279 = Slice<'h>
280 where
281 Self: 'h;
282
283 fn char_at(&self, byte_offset: usize) -> Option<char> {
284 self.txt.get_char_at(byte_offset)
285 }
286
287 fn slice(&self, range: Range<usize>) -> Self::Slice<'_> {
288 self.txt.slice_from_byte_offsets(range.start, range.end)
289 }
290
291 fn max_len(&self) -> usize {
292 self.txt.len()
293 }
294 }
295
296 impl Writable for Buffer {
297 fn write_to<W>(&self, w: &mut W) -> io::Result<usize>
298 where
299 W: io::Write,
300 {
301 let (l, r) = self.txt.as_byte_slices();
302 w.write_all(l)?;
303 w.write_all(r)?;
304
305 Ok(self.txt.len())
306 }
307 }
308}
309
310#[cfg(test)]
311mod tests {
312 use super::*;
313 use simple_test_case::test_case;
314
315 #[test_case("_", &['_'], &[]; "single underscore")]
316 #[test_case("abc_", &['a', 'b', 'c', '_'], &[]; "chars")]
317 #[test_case("a-z", &[], &[('a', 'z')]; "single range")]
318 #[test_case("a-zA-Z", &[], &[('a', 'z'), ('A', 'Z')]; "multi range")]
319 #[test_case("a-z_./]", &['_', '.', '/'], &[('a', 'z')]; "compound")]
320 #[test_case("a-zA-Z_\\-.]", &['_', '-', '.'], &[('a', 'z'), ('A', 'Z')]; "compound escaped dash")]
321 #[test]
322 fn parsing_classes_works(raw: &str, chars: &[char], ranges: &[(char, char)]) {
323 for (s, negated) in [(format!("{raw}]"), false), (format!("^{raw}]"), true)] {
326 let cls = CharClass::try_parse(&mut s.chars().peekable()).unwrap();
327 let expected = CharClass {
328 negated,
329 chars: chars.to_vec(),
330 ranges: ranges.to_vec(),
331 };
332
333 assert_eq!(cls, expected, "negated={negated}");
334 }
335 }
336
337 #[test_case("z-a]"; "backwards alpha")]
340 #[test_case("Z-A]"; "backwards upper alpha")]
341 #[test_case("9-0]"; "backwards numeric")]
342 #[test_case("a-a]"; "equal alpha endpoints")]
343 #[test_case("5-5]"; "equal numeric endpoints")]
344 #[test_case("a-z9-0]"; "valid then invalid range")]
345 #[test_case("\u{00FF}-\u{0000}]"; "backwards unicode")]
346 #[test]
347 fn invalid_character_ranges_error(s: &str) {
348 assert!(CharClass::try_parse(&mut s.chars().peekable()).is_err());
349 }
350}