iregex_syntax/
parsing.rs

1use std::{borrow::Borrow, iter::Peekable, ops::Bound, str::FromStr};
2
3use iregex::automata::{AnyRange, RangeSet};
4
5use crate::{Ast, Atom, Charset, Class, Classes, Disjunction, Repeat, Sequence};
6
7#[derive(Debug, thiserror::Error)]
8pub enum Error {
9	#[error(transparent)]
10	Unexpected(Unexpected),
11
12	#[error("unexpected metacharacter `{0}`")]
13	UnexpectedMetacharacter(char),
14
15	#[error("invalid class name `{0}`")]
16	InvalidClassName(String),
17
18	#[error("overflow")]
19	Overflow,
20}
21
22#[derive(Debug, thiserror::Error)]
23pub enum Unexpected {
24	#[error("unexpected end of stream")]
25	EndOfStream,
26
27	#[error("unexpected character `{0}`")]
28	Char(char),
29}
30
31impl From<Option<char>> for Unexpected {
32	fn from(value: Option<char>) -> Self {
33		match value {
34			Some(c) => Self::Char(c),
35			None => Self::EndOfStream,
36		}
37	}
38}
39
40enum AtomOrRepeat {
41	Atom(Atom),
42	Repeat(Repeat),
43}
44
45impl Atom {
46	pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Option<Self>, Error> {
47		let result = match chars.peek().copied() {
48			None | Some(')' | '|' | '$') => return Ok(None),
49			Some(c @ ('^' | ']' | '}' | '?' | '*' | '+')) => {
50				return Err(Error::UnexpectedMetacharacter(c))
51			}
52			Some('.') => {
53				chars.next();
54				Self::Any
55			}
56			Some('[') => {
57				let charset = Charset::parse(chars)?;
58				Self::Set(charset)
59			}
60			Some('(') => {
61				chars.next();
62				let group = Disjunction::parse(chars)?;
63				match chars.next() {
64					Some(')') => Self::Group(group),
65					other => return Err(Error::Unexpected(other.into())),
66				}
67			}
68			Some('\\') => {
69				chars.next();
70				let c = parse_escaped_char(chars)?;
71				Self::Char(c)
72			}
73			Some(c) => {
74				chars.next();
75				Self::Char(c)
76			}
77		};
78
79		Ok(Some(result))
80	}
81}
82
83impl AtomOrRepeat {
84	pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Option<Self>, Error> {
85		let result = match chars.peek().copied() {
86			None | Some(')' | '|' | '$') => return Ok(None),
87			Some(c @ ('^' | ']' | '}')) => return Err(Error::UnexpectedMetacharacter(c)),
88			Some('.') => {
89				chars.next();
90				Self::Atom(Atom::Any)
91			}
92			Some('[') => {
93				let charset = Charset::parse(chars)?;
94				Self::Atom(Atom::Set(charset))
95			}
96			Some('(') => {
97				chars.next();
98				let group = Disjunction::parse(chars)?;
99				match chars.next() {
100					Some(')') => Self::Atom(Atom::Group(group)),
101					other => return Err(Error::Unexpected(other.into())),
102				}
103			}
104			Some('{') => Self::Repeat(Repeat::parse(chars)?),
105			Some('?') => {
106				chars.next();
107				Self::Repeat(Repeat {
108					min: 0,
109					max: Some(1),
110				})
111			}
112			Some('*') => {
113				chars.next();
114				Self::Repeat(Repeat { min: 0, max: None })
115			}
116			Some('+') => {
117				chars.next();
118				Self::Repeat(Repeat { min: 1, max: None })
119			}
120			Some('\\') => {
121				chars.next();
122				let c = parse_escaped_char(chars)?;
123				Self::Atom(Atom::Char(c))
124			}
125			Some(c) => {
126				chars.next();
127				Self::Atom(Atom::Char(c))
128			}
129		};
130
131		Ok(Some(result))
132	}
133}
134
135impl Sequence {
136	pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
137		match Atom::parse(chars)? {
138			Some(atom) => {
139				let mut result = vec![atom];
140
141				while let Some(atom_or_repeat) = AtomOrRepeat::parse(chars)? {
142					match atom_or_repeat {
143						AtomOrRepeat::Atom(atom) => result.push(atom),
144						AtomOrRepeat::Repeat(r) => result.last_mut().unwrap().repeat(r),
145					}
146				}
147
148				Ok(Self(result))
149			}
150			None => Ok(Self::new()),
151		}
152	}
153}
154
155impl Disjunction {
156	pub fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
157		let mut result = vec![Sequence::parse(chars)?];
158		while let Some(c) = chars.peek().copied() {
159			match c {
160				'|' => {
161					chars.next();
162					result.push(Sequence::parse(chars)?)
163				}
164				')' | '$' => break,
165				c => return Err(Error::UnexpectedMetacharacter(c)),
166			}
167		}
168
169		Ok(Self(result))
170	}
171}
172
173impl FromStr for Disjunction {
174	type Err = Error;
175
176	fn from_str(s: &str) -> Result<Self, Self::Err> {
177		let mut chars = s.chars().peekable();
178		Self::parse(&mut chars)
179	}
180}
181
182impl Ast {
183	pub fn parse(chars: impl IntoIterator<Item = char>) -> Result<Self, Error> {
184		let mut chars = chars.into_iter().peekable();
185
186		let start_anchor = match chars.peek().copied() {
187			Some('^') => {
188				chars.next();
189				true
190			}
191			_ => false,
192		};
193
194		let inner = Disjunction::parse(&mut chars)?;
195
196		let end_anchor = match chars.next() {
197			Some('$') => true,
198			None => false,
199			Some(c) => return Err(Error::UnexpectedMetacharacter(c)),
200		};
201
202		Ok(Self {
203			start_anchor,
204			end_anchor,
205			disjunction: inner,
206		})
207	}
208}
209
210impl FromStr for Ast {
211	type Err = Error;
212
213	fn from_str(s: &str) -> Result<Self, Self::Err> {
214		Self::parse(s.chars())
215	}
216}
217
218impl<S: Borrow<str>> From<S> for Ast {
219	fn from(s: S) -> Self {
220		let mut seq = Sequence::new();
221
222		for c in s.borrow().chars() {
223			seq.push(Atom::Char(c))
224		}
225
226		Self {
227			start_anchor: true,
228			end_anchor: true,
229			disjunction: Disjunction(vec![seq]),
230		}
231	}
232}
233
234impl Class {
235	fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
236		match chars.next() {
237			Some(':') => {
238				let mut name = String::new();
239
240				loop {
241					match chars.next() {
242						Some(':') => break,
243						Some(
244							c @ ('(' | ')' | '{' | '}' | '[' | ']' | '|' | '?' | '*' | '+' | '^'),
245						) => return Err(Error::UnexpectedMetacharacter(c)),
246						Some(c) => name.push(c),
247						None => return Err(Error::Unexpected(Unexpected::EndOfStream)),
248					}
249				}
250
251				match chars.next() {
252					Some(']') => match Class::from_name(&name) {
253						Some(class) => Ok(class),
254						None => Err(Error::InvalidClassName(name)),
255					},
256					other => Err(Error::Unexpected(other.into())),
257				}
258			}
259			other => Err(Error::Unexpected(other.into())),
260		}
261	}
262}
263
264enum RangeOrClass {
265	Range(AnyRange<char>, bool),
266	Class(Class),
267}
268
269impl RangeOrClass {
270	fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Option<Self>, Error> {
271		let start = match chars.next() {
272			Some(']') => return Ok(None),
273			Some('[') => {
274				return Ok(Some(Self::Class(Class::parse(chars)?)));
275			}
276			Some('\\') => parse_escaped_char(chars)?,
277			Some(c) => c,
278			None => return Err(Error::Unexpected(Unexpected::EndOfStream)),
279		};
280
281		let (end, minus) = match chars.peek().copied() {
282			Some('-') => {
283				chars.next();
284				match chars.peek().copied() {
285					Some(']') => (start, true),
286					Some('\\') => {
287						chars.next();
288						let c = parse_escaped_char(chars)?;
289						(c, false)
290					}
291					Some(c) => {
292						chars.next();
293						(c, false)
294					}
295					None => return Err(Error::Unexpected(Unexpected::EndOfStream)),
296				}
297			}
298			_ => (start, false),
299		};
300
301		Ok(Some(Self::Range(
302			AnyRange::new(Bound::Included(start), Bound::Included(end)),
303			minus,
304		)))
305	}
306}
307
308impl Charset {
309	fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
310		match chars.next() {
311			Some('[') => (),
312			other => return Err(Error::Unexpected(other.into())),
313		}
314
315		let negative = match chars.peek().copied() {
316			Some('^') => {
317				chars.next();
318				true
319			}
320			_ => false,
321		};
322
323		let mut classes = Classes::none();
324		let mut set = RangeSet::new();
325
326		while let Some(range_or_class) = RangeOrClass::parse(chars)? {
327			match range_or_class {
328				RangeOrClass::Range(range, and_minus) => {
329					set.insert(range);
330					if and_minus {
331						set.insert('-');
332					}
333				}
334				RangeOrClass::Class(class) => {
335					classes.insert(class);
336				}
337			}
338		}
339
340		Ok(Self {
341			negative,
342			classes,
343			set,
344		})
345	}
346}
347
348impl Repeat {
349	fn parse(chars: &mut Peekable<impl Iterator<Item = char>>) -> Result<Self, Error> {
350		match chars.next() {
351			Some('{') => (),
352			other => return Err(Error::Unexpected(other.into())),
353		}
354
355		fn parse_number<T, C: Iterator<Item = char>>(
356			chars: &mut Peekable<C>,
357			f: impl FnOnce(&mut Peekable<C>, Option<u32>, char) -> Result<T, Error>,
358		) -> Result<T, Error> {
359			match chars.next() {
360				Some(c) => match c.to_digit(10) {
361					Some(mut value) => loop {
362						match chars.next() {
363							Some(c) => match c.to_digit(10) {
364								Some(d) => {
365									value = value.checked_mul(10).ok_or(Error::Overflow)?;
366									value = value.checked_add(d).ok_or(Error::Overflow)?;
367								}
368								None => break f(chars, Some(value), c),
369							},
370							None => break Err(Error::Unexpected(Unexpected::EndOfStream)),
371						}
372					},
373					None => f(chars, None, c),
374				},
375				None => Err(Error::Unexpected(Unexpected::EndOfStream)),
376			}
377		}
378
379		parse_number(chars, |chars, value, next| match value {
380			Some(min) => match next {
381				',' => parse_number(chars, |_, max, next| {
382					if next == '}' {
383						Ok(Self { min, max })
384					} else {
385						Err(Error::Unexpected(Unexpected::Char(next)))
386					}
387				}),
388				'}' => Ok(Self {
389					min,
390					max: Some(min),
391				}),
392				c => Err(Error::Unexpected(Unexpected::Char(c))),
393			},
394			None => Err(Error::Unexpected(Unexpected::Char(next))),
395		})
396	}
397}
398
399fn parse_escaped_char(chars: &mut impl Iterator<Item = char>) -> Result<char, Error> {
400	match chars.next() {
401		Some(c) => match c {
402			'0' => Ok('\0'),
403			'a' => Ok('\x07'),
404			'b' => Ok('\x08'),
405			's' => Ok(' '),
406			't' => Ok('\t'),
407			'n' => Ok('\n'),
408			'v' => Ok('\x0b'),
409			'f' => Ok('\x0c'),
410			'r' => Ok('\r'),
411			'e' => Ok('\x1b'),
412			c => Ok(c),
413		},
414		None => Err(Error::Unexpected(Unexpected::EndOfStream)),
415	}
416}
417
418#[cfg(test)]
419mod tests {
420	use super::*;
421
422	#[test]
423	fn parse_success() {
424		const INPUTS: [&str; 19] = [
425			"",
426			"abc",
427			"(abc)",
428			"[abc]",
429			"[^abc]",
430			"[a^bc]",
431			"[abc-]",
432			"abc?",
433			"abc*",
434			"abc+",
435			"abc|def",
436			"(abc|def)",
437			"(abc|def)*",
438			"(abc|(def)?)*",
439			"[[:alpha:]]",
440			"(abc){12,}",
441			"(abc){12,34}",
442			"(abc){12}",
443			"(abc){4294967295}",
444		];
445
446		for input in INPUTS {
447			if let Err(e) = Ast::parse(input.chars()) {
448				panic!("failed to parse `{input}` with {e:?}")
449			}
450		}
451	}
452
453	#[test]
454	fn parse_failure() {
455		const INPUTS: [&str; 13] = [
456			"?",
457			"(abc",
458			"[[:abc:]]",
459			"[abc",
460			"[^abc",
461			"abc)",
462			"abc]",
463			"(abc){,}",
464			"(abc){,12}",
465			"(abc){,12",
466			"(abc){12,34",
467			"(abc){12",
468			"(abc){4294967296}",
469		];
470
471		for input in INPUTS {
472			if let Ok(ast) = Ast::parse(input.chars()) {
473				panic!("failed to reject `{input}`, parsed as {ast:?}")
474			}
475		}
476	}
477}