1use std::{borrow::Borrow, iter::Peekable, ops::Bound, str::FromStr};
2
3use iregex::automata::{AnyRange, RangeSet};
4
5use crate::{Ast, Atom, Charset, Class, Classes, Disjunction, Repeat, Sequence};
6
7#[derive(Debug, thiserror::Error)]
8pub enum Error {
9 #[error(transparent)]
10 Unexpected(Unexpected),
11
12 #[error("unexpected metacharacter `{0}`")]
13 UnexpectedMetacharacter(char),
14
15 #[error("invalid class name `{0}`")]
16 InvalidClassName(String),
17
18 #[error("overflow")]
19 Overflow,
20}
21
22#[derive(Debug, thiserror::Error)]
23pub enum Unexpected {
24 #[error("unexpected end of stream")]
25 EndOfStream,
26
27 #[error("unexpected character `{0}`")]
28 Char(char),
29}
30
31impl From<Option<char>> for Unexpected {
32 fn from(value: Option<char>) -> Self {
33 match value {
34 Some(c) => Self::Char(c),
35 None => Self::EndOfStream,
36 }
37 }
38}
39
40enum AtomOrRepeat {
41 Atom(Atom),
42 Repeat(Repeat),
43}
44
45impl Atom {
46 pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Option<Self>, Error> {
47 let result = match chars.peek().copied() {
48 None | Some(')' | '|' | '$') => return Ok(None),
49 Some(c @ ('^' | ']' | '}' | '?' | '*' | '+')) => {
50 return Err(Error::UnexpectedMetacharacter(c))
51 }
52 Some('.') => {
53 chars.next();
54 Self::Any
55 }
56 Some('[') => {
57 let charset = Charset::parse(chars)?;
58 Self::Set(charset)
59 }
60 Some('(') => {
61 chars.next();
62 let group = Disjunction::parse(chars)?;
63 match chars.next() {
64 Some(')') => Self::Group(group),
65 other => return Err(Error::Unexpected(other.into())),
66 }
67 }
68 Some('\\') => {
69 chars.next();
70 let c = parse_escaped_char(chars)?;
71 Self::Char(c)
72 }
73 Some(c) => {
74 chars.next();
75 Self::Char(c)
76 }
77 };
78
79 Ok(Some(result))
80 }
81}
82
83impl AtomOrRepeat {
84 pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Option<Self>, Error> {
85 let result = match chars.peek().copied() {
86 None | Some(')' | '|' | '$') => return Ok(None),
87 Some(c @ ('^' | ']' | '}')) => return Err(Error::UnexpectedMetacharacter(c)),
88 Some('.') => {
89 chars.next();
90 Self::Atom(Atom::Any)
91 }
92 Some('[') => {
93 let charset = Charset::parse(chars)?;
94 Self::Atom(Atom::Set(charset))
95 }
96 Some('(') => {
97 chars.next();
98 let group = Disjunction::parse(chars)?;
99 match chars.next() {
100 Some(')') => Self::Atom(Atom::Group(group)),
101 other => return Err(Error::Unexpected(other.into())),
102 }
103 }
104 Some('{') => Self::Repeat(Repeat::parse(chars)?),
105 Some('?') => {
106 chars.next();
107 Self::Repeat(Repeat {
108 min: 0,
109 max: Some(1),
110 })
111 }
112 Some('*') => {
113 chars.next();
114 Self::Repeat(Repeat { min: 0, max: None })
115 }
116 Some('+') => {
117 chars.next();
118 Self::Repeat(Repeat { min: 1, max: None })
119 }
120 Some('\\') => {
121 chars.next();
122 let c = parse_escaped_char(chars)?;
123 Self::Atom(Atom::Char(c))
124 }
125 Some(c) => {
126 chars.next();
127 Self::Atom(Atom::Char(c))
128 }
129 };
130
131 Ok(Some(result))
132 }
133}
134
135impl Sequence {
136 pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
137 match Atom::parse(chars)? {
138 Some(atom) => {
139 let mut result = vec![atom];
140
141 while let Some(atom_or_repeat) = AtomOrRepeat::parse(chars)? {
142 match atom_or_repeat {
143 AtomOrRepeat::Atom(atom) => result.push(atom),
144 AtomOrRepeat::Repeat(r) => result.last_mut().unwrap().repeat(r),
145 }
146 }
147
148 Ok(Self(result))
149 }
150 None => Ok(Self::new()),
151 }
152 }
153}
154
155impl Disjunction {
156 pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
157 let mut result = vec![Sequence::parse(chars)?];
158 while let Some(c) = chars.peek().copied() {
159 match c {
160 '|' => {
161 chars.next();
162 result.push(Sequence::parse(chars)?)
163 }
164 ')' | '$' => break,
165 c => return Err(Error::UnexpectedMetacharacter(c)),
166 }
167 }
168
169 Ok(Self(result))
170 }
171}
172
173impl FromStr for Disjunction {
174 type Err = Error;
175
176 fn from_str(s: &str) -> Result<Self, Self::Err> {
177 let mut chars = s.chars().peekable();
178 Self::parse(&mut chars)
179 }
180}
181
182impl Ast {
183 pub fn parse(chars: impl IntoIterator<Item = char>) -> Result<Self, Error> {
184 let mut chars = chars.into_iter().peekable();
185
186 let start_anchor = match chars.peek().copied() {
187 Some('^') => {
188 chars.next();
189 true
190 }
191 _ => false,
192 };
193
194 let inner = Disjunction::parse(&mut chars)?;
195
196 let end_anchor = match chars.next() {
197 Some('$') => true,
198 None => false,
199 Some(c) => return Err(Error::UnexpectedMetacharacter(c)),
200 };
201
202 Ok(Self {
203 start_anchor,
204 end_anchor,
205 disjunction: inner,
206 })
207 }
208}
209
210impl FromStr for Ast {
211 type Err = Error;
212
213 fn from_str(s: &str) -> Result<Self, Self::Err> {
214 Self::parse(s.chars())
215 }
216}
217
218impl<S: Borrow<str>> From<S> for Ast {
219 fn from(s: S) -> Self {
220 let mut seq = Sequence::new();
221
222 for c in s.borrow().chars() {
223 seq.push(Atom::Char(c))
224 }
225
226 Self {
227 start_anchor: true,
228 end_anchor: true,
229 disjunction: Disjunction(vec![seq]),
230 }
231 }
232}
233
234impl Class {
235 fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
236 match chars.next() {
237 Some(':') => {
238 let mut name = String::new();
239
240 loop {
241 match chars.next() {
242 Some(':') => break,
243 Some(
244 c @ ('(' | ')' | '{' | '}' | '[' | ']' | '|' | '?' | '*' | '+' | '^'),
245 ) => return Err(Error::UnexpectedMetacharacter(c)),
246 Some(c) => name.push(c),
247 None => return Err(Error::Unexpected(Unexpected::EndOfStream)),
248 }
249 }
250
251 match chars.next() {
252 Some(']') => match Class::from_name(&name) {
253 Some(class) => Ok(class),
254 None => Err(Error::InvalidClassName(name)),
255 },
256 other => Err(Error::Unexpected(other.into())),
257 }
258 }
259 other => Err(Error::Unexpected(other.into())),
260 }
261 }
262}
263
264enum RangeOrClass {
265 Range(AnyRange<char>, bool),
266 Class(Class),
267}
268
269impl RangeOrClass {
270 fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Option<Self>, Error> {
271 let start = match chars.next() {
272 Some(']') => return Ok(None),
273 Some('[') => {
274 return Ok(Some(Self::Class(Class::parse(chars)?)));
275 }
276 Some('\\') => parse_escaped_char(chars)?,
277 Some(c) => c,
278 None => return Err(Error::Unexpected(Unexpected::EndOfStream)),
279 };
280
281 let (end, minus) = match chars.peek().copied() {
282 Some('-') => {
283 chars.next();
284 match chars.peek().copied() {
285 Some(']') => (start, true),
286 Some('\\') => {
287 chars.next();
288 let c = parse_escaped_char(chars)?;
289 (c, false)
290 }
291 Some(c) => {
292 chars.next();
293 (c, false)
294 }
295 None => return Err(Error::Unexpected(Unexpected::EndOfStream)),
296 }
297 }
298 _ => (start, false),
299 };
300
301 Ok(Some(Self::Range(
302 AnyRange::new(Bound::Included(start), Bound::Included(end)),
303 minus,
304 )))
305 }
306}
307
308impl Charset {
309 fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
310 match chars.next() {
311 Some('[') => (),
312 other => return Err(Error::Unexpected(other.into())),
313 }
314
315 let negative = match chars.peek().copied() {
316 Some('^') => {
317 chars.next();
318 true
319 }
320 _ => false,
321 };
322
323 let mut classes = Classes::none();
324 let mut set = RangeSet::new();
325
326 while let Some(range_or_class) = RangeOrClass::parse(chars)? {
327 match range_or_class {
328 RangeOrClass::Range(range, and_minus) => {
329 set.insert(range);
330 if and_minus {
331 set.insert('-');
332 }
333 }
334 RangeOrClass::Class(class) => {
335 classes.insert(class);
336 }
337 }
338 }
339
340 Ok(Self {
341 negative,
342 classes,
343 set,
344 })
345 }
346}
347
348impl Repeat {
349 fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
350 match chars.next() {
351 Some('{') => (),
352 other => return Err(Error::Unexpected(other.into())),
353 }
354
355 fn parse_number<T, C: Iterator<Item = char>>(
356 chars: &mut Peekable<C>,
357 f: impl FnOnce(&mut Peekable<C>, Option<u32>, char) -> Result<T, Error>,
358 ) -> Result<T, Error> {
359 match chars.next() {
360 Some(c) => match c.to_digit(10) {
361 Some(mut value) => loop {
362 match chars.next() {
363 Some(c) => match c.to_digit(10) {
364 Some(d) => {
365 value = value.checked_mul(10).ok_or(Error::Overflow)?;
366 value = value.checked_add(d).ok_or(Error::Overflow)?;
367 }
368 None => break f(chars, Some(value), c),
369 },
370 None => break Err(Error::Unexpected(Unexpected::EndOfStream)),
371 }
372 },
373 None => f(chars, None, c),
374 },
375 None => Err(Error::Unexpected(Unexpected::EndOfStream)),
376 }
377 }
378
379 parse_number(chars, |chars, value, next| match value {
380 Some(min) => match next {
381 ',' => parse_number(chars, |_, max, next| {
382 if next == '}' {
383 Ok(Self { min, max })
384 } else {
385 Err(Error::Unexpected(Unexpected::Char(next)))
386 }
387 }),
388 '}' => Ok(Self {
389 min,
390 max: Some(min),
391 }),
392 c => Err(Error::Unexpected(Unexpected::Char(c))),
393 },
394 None => Err(Error::Unexpected(Unexpected::Char(next))),
395 })
396 }
397}
398
399fn parse_escaped_char(chars: &mut impl Iterator<Item = char>) -> Result<char, Error> {
400 match chars.next() {
401 Some(c) => match c {
402 '0' => Ok('\0'),
403 'a' => Ok('\x07'),
404 'b' => Ok('\x08'),
405 's' => Ok(' '),
406 't' => Ok('\t'),
407 'n' => Ok('\n'),
408 'v' => Ok('\x0b'),
409 'f' => Ok('\x0c'),
410 'r' => Ok('\r'),
411 'e' => Ok('\x1b'),
412 c => Ok(c),
413 },
414 None => Err(Error::Unexpected(Unexpected::EndOfStream)),
415 }
416}
417
418#[cfg(test)]
419mod tests {
420 use super::*;
421
422 #[test]
423 fn parse_success() {
424 const INPUTS: [&str; 19] = [
425 "",
426 "abc",
427 "(abc)",
428 "[abc]",
429 "[^abc]",
430 "[a^bc]",
431 "[abc-]",
432 "abc?",
433 "abc*",
434 "abc+",
435 "abc|def",
436 "(abc|def)",
437 "(abc|def)*",
438 "(abc|(def)?)*",
439 "[[:alpha:]]",
440 "(abc){12,}",
441 "(abc){12,34}",
442 "(abc){12}",
443 "(abc){4294967295}",
444 ];
445
446 for input in INPUTS {
447 if let Err(e) = Ast::parse(input.chars()) {
448 panic!("failed to parse `{input}` with {e:?}")
449 }
450 }
451 }
452
453 #[test]
454 fn parse_failure() {
455 const INPUTS: [&str; 13] = [
456 "?",
457 "(abc",
458 "[[:abc:]]",
459 "[abc",
460 "[^abc",
461 "abc)",
462 "abc]",
463 "(abc){,}",
464 "(abc){,12}",
465 "(abc){,12",
466 "(abc){12,34",
467 "(abc){12",
468 "(abc){4294967296}",
469 ];
470
471 for input in INPUTS {
472 if let Ok(ast) = Ast::parse(input.chars()) {
473 panic!("failed to reject `{input}`, parsed as {ast:?}")
474 }
475 }
476 }
477}