iregex_syntax/
lib.rs

1//! This library provides a parser for POSIX Extended Regular Expressions (iregex).
2//! Once parsed into an abstract syntax tree ([`Ast`]), regular expressions can
3//! then be compiled into a finite automaton running on Unicode scalar values
4//! ([`char`] type) using the [`iregex-automata`] library.
5//!
6//! [`iregex-automata`]: <https://crates.io/crates/iregex-automata>
7use iregex::automata::RangeSet;
8use replace_with::replace_with_or_abort;
9use std::ops::Deref;
10
11mod parsing;
12pub use parsing::*;
13
14mod display;
15pub use display::*;
16
17mod build;
18
19/// Abstract syntax tree of an Extended Regular Expression.
20#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
21pub struct Ast {
22	pub start_anchor: bool,
23	pub end_anchor: bool,
24	pub disjunction: Disjunction,
25}
26
27impl Ast {
28	pub fn empty() -> Self {
29		Self {
30			start_anchor: false,
31			end_anchor: false,
32			disjunction: Disjunction::new(),
33		}
34	}
35
36	pub fn is_empty(&self) -> bool {
37		self.disjunction.is_empty()
38	}
39
40	// /// Checks if this regular expression matches only one value.
41	// pub fn is_singleton(&self) -> bool {
42	// 	match self {
43	// 		Self::Any => false,
44	// 		Self::Set(charset) => charset.len() == 1,
45	// 		Self::Sequence(seq) => seq.iter().all(Self::is_singleton),
46	// 		Self::Repeat(e, min, max) => min == max && e.is_singleton(),
47	// 		Self::Union(items) => items.len() == 1 && items[0].is_singleton(),
48	// 	}
49	// }
50
51	// pub fn to_singleton(&self) -> Option<String> {
52	// 	if self.is_singleton() {
53	// 		let mut s = String::new();
54	// 		self.build_singleton(&mut s);
55	// 		Some(s)
56	// 	} else {
57	// 		None
58	// 	}
59	// }
60
61	// fn build_singleton(&self, s: &mut String) {
62	// 	match self {
63	// 		Self::Any => unreachable!(),
64	// 		Self::Set(charset) => s.push(charset.iter().next().unwrap().first().unwrap()),
65	// 		Self::Sequence(seq) => {
66	// 			for e in seq {
67	// 				e.build_singleton(s)
68	// 			}
69	// 		}
70	// 		Self::Repeat(e, _, _) => e.build_singleton(s),
71	// 		Self::Union(items) => items[0].build_singleton(s),
72	// 	}
73	// }
74}
75
76/// Regular expression sequence disjunction.
77#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
78pub struct Disjunction(Vec<Sequence>);
79
80impl Disjunction {
81	pub fn new() -> Self {
82		Self::default()
83	}
84}
85
86impl Deref for Disjunction {
87	type Target = [Sequence];
88
89	fn deref(&self) -> &Self::Target {
90		self.0.as_slice()
91	}
92}
93
94impl<'a> IntoIterator for &'a Disjunction {
95	type IntoIter = std::slice::Iter<'a, Sequence>;
96	type Item = &'a Sequence;
97
98	fn into_iter(self) -> Self::IntoIter {
99		self.0.iter()
100	}
101}
102
103impl IntoIterator for Disjunction {
104	type IntoIter = std::vec::IntoIter<Sequence>;
105	type Item = Sequence;
106
107	fn into_iter(self) -> Self::IntoIter {
108		self.0.into_iter()
109	}
110}
111
112impl From<Sequence> for Disjunction {
113	fn from(value: Sequence) -> Self {
114		Self(vec![value])
115	}
116}
117
118/// Regular expression atom sequence.
119#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
120pub struct Sequence(Vec<Atom>);
121
122impl Sequence {
123	pub fn new() -> Self {
124		Self::default()
125	}
126
127	pub fn push(&mut self, atom: Atom) {
128		self.0.push(atom)
129	}
130
131	pub fn into_disjunction(self) -> Disjunction {
132		self.into()
133	}
134}
135
136impl Deref for Sequence {
137	type Target = [Atom];
138
139	fn deref(&self) -> &Self::Target {
140		self.0.as_slice()
141	}
142}
143
144impl FromIterator<Atom> for Sequence {
145	fn from_iter<T: IntoIterator<Item = Atom>>(iter: T) -> Self {
146		Self(Vec::from_iter(iter))
147	}
148}
149
150impl<'a> IntoIterator for &'a Sequence {
151	type IntoIter = std::slice::Iter<'a, Atom>;
152	type Item = &'a Atom;
153
154	fn into_iter(self) -> Self::IntoIter {
155		self.0.iter()
156	}
157}
158
159impl IntoIterator for Sequence {
160	type IntoIter = std::vec::IntoIter<Atom>;
161	type Item = Atom;
162
163	fn into_iter(self) -> Self::IntoIter {
164		self.0.into_iter()
165	}
166}
167
168#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
169pub enum Atom {
170	/// Any character.
171	///
172	/// `.`
173	Any,
174
175	/// Single character.
176	Char(char),
177
178	/// Character set.
179	///
180	/// `[set]` or `[^set]`
181	Set(Charset),
182
183	/// Repetition.
184	Repeat(Box<Self>, Repeat),
185
186	/// Capture group.
187	Group(Disjunction),
188}
189
190impl Atom {
191	pub fn repeat(&mut self, r: Repeat) {
192		replace_with_or_abort(self, |this| Self::Repeat(Box::new(this), r))
193	}
194}
195
196#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
197pub struct Charset {
198	negative: bool,
199	classes: Classes,
200	set: RangeSet<char>,
201}
202
203impl From<RangeSet<char>> for Charset {
204	fn from(value: RangeSet<char>) -> Self {
205		Self {
206			negative: false,
207			classes: Classes::none(),
208			set: value,
209		}
210	}
211}
212
213macro_rules! classes {
214	($($id:ident: $name:literal ($flag:ident: $flag_value:literal)),*) => {
215		$(const $flag: u16 = $flag_value;)*
216
217		#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
218		pub enum Class {
219			$($id),*
220		}
221
222		impl Class {
223			pub fn from_name(name: &str) -> Option<Self> {
224				match name {
225					$($name => Some(Self::$id),)*
226					_ => None
227				}
228			}
229
230			pub fn name(&self) -> &'static str {
231				match self {
232					$(Self::$id => $name),*
233				}
234			}
235
236			fn flag(&self) -> u16 {
237				match self {
238					$(Self::$id => $flag),*
239				}
240			}
241		}
242
243		#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
244		pub struct Classes(u16);
245
246		impl Classes {
247			pub fn none() -> Self {
248				Self(0)
249			}
250
251			pub fn all() -> Self {
252				Self($($flag)|*)
253			}
254
255			pub fn contains(&self, c: Class) -> bool {
256				self.0 & c.flag() != 0
257			}
258
259			pub fn insert(&mut self, c: Class) {
260				self.0 |= c.flag()
261			}
262
263			pub fn iter(&self) -> ClassesIter {
264				ClassesIter(self.0)
265			}
266		}
267
268		pub struct ClassesIter(u16);
269
270		impl Iterator for ClassesIter {
271			type Item = Class;
272
273			fn next(&mut self) -> Option<Class> {
274				$(
275					if self.0 & $flag != 0 {
276						self.0 &= !$flag;
277						return Some(Class::$id)
278					}
279				)*
280
281				None
282			}
283		}
284	};
285}
286
287impl IntoIterator for Classes {
288	type IntoIter = ClassesIter;
289	type Item = Class;
290
291	fn into_iter(self) -> Self::IntoIter {
292		self.iter()
293	}
294}
295
296impl<'a> IntoIterator for &'a Classes {
297	type IntoIter = ClassesIter;
298	type Item = Class;
299
300	fn into_iter(self) -> Self::IntoIter {
301		self.iter()
302	}
303}
304
305classes! {
306	Upper:  "upper"  (CLASS_UPPER:  0b0000000000001),
307	Lower:  "lower"  (CLASS_LOWER:  0b0000000000010),
308	Alpha:  "alpha"  (CLASS_ALPHA:  0b0000000000100),
309	Alnum:  "alnum"  (CLASS_ALNUM:  0b0000000001000),
310	Digit:  "digit"  (CLASS_DIGIT:  0b0000000010000),
311	Xdigit: "xdigit" (CLASS_XDIGIT: 0b0000000100000),
312	Punct:  "punct"  (CLASS_PUNCT:  0b0000001000000),
313	Blank:  "blank"  (CLASS_BLANK:  0b0000100000000),
314	Space:  "space"  (CLASS_SPACE:  0b0001000000000),
315	Cntrl:  "cntrl"  (CLASS_CNTRL:  0b0010000000000),
316	Graph:  "graph"  (CLASS_GRAPH:  0b0100000000000),
317	Print:  "print"  (CLASS_PRINT:  0b1000000000000)
318}
319
320#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
321pub struct Repeat {
322	pub min: u32,
323	pub max: Option<u32>,
324}
325
326#[cfg(test)]
327mod tests {
328	use iregex::automata::nfa::U32StateBuilder;
329
330	use crate::Ast;
331
332	#[test]
333	fn test1() {
334		let ast = Ast::parse("^#([^\n#][^\n]*)?$".chars()).unwrap();
335		let exp = ast.build();
336		let aut = exp.compile(U32StateBuilder::new()).unwrap();
337
338		assert!(aut.matches_str("#").next().is_some());
339		assert!(aut.matches_str("#a").next().is_some());
340		assert!(aut.matches_str("##").next().is_none());
341
342		// let aut = aut
343		// 	.root
344		// 	.unwrap()
345		// 	.unwrap();
346
347		// for &q in aut.states() {
348		// 	eprintln!("#{q}:");
349		// 	for (label, r) in aut.successors(&q) {
350		// 		match label {
351		// 			Some(set) => {
352		// 				eprintln!("\t{set:?} => {r:?}");
353		// 			}
354		// 			None => {
355		// 				eprintln!("\telse {r:?}")
356		// 			}
357		// 		}
358		// 	}
359		// }
360	}
361}