pom/
utf8.rs

1// Variants of parser functions specialized for matching UTF-8 strings and returning chars
2
3use super::parser;
4use super::{Error, Result};
5use crate::range::RangeArgument;
6use crate::set::Set;
7use bstr::decode_utf8;
8use std::fmt::Debug;
9use std::ops::{Add, BitOr, Mul, Neg, Not, Shr, Sub};
10use std::str;
11
12// / Parser combinator.
13//type Parse<'a, O> = dyn Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a;
14
/// UTF-8 flavored wrapper around the byte-level `parser::Parser<'a, u8, O>`.
///
/// Being wrapped in this struct guarantees that the parser within will only match valid UTF-8 strings.
/// `collect` relies on that guarantee when it reinterprets the matched bytes as a `str` without re-validating them.
pub struct Parser<'a, O>(parser::Parser<'a, u8, O>);
17
18impl<'a, O> Parser<'a, O> {
19	/// Create new parser.
20	pub fn new<P>(parse: P) -> Self
21	where
22		P: Fn(&'a [u8], usize) -> Result<(O, usize)> + 'a,
23	{
24		Self(parser::Parser::new(parse))
25	}
26
27	/// Collect all matched input symbols.
28	// This method is the primary reason utf8::Parser exists at all.
29	pub fn collect(self) -> Parser<'a, &'a str>
30	where
31		O: 'a,
32	{
33		Parser(self.0.collect().map(
34			// UNSAFE: Because we only could have constructed this object from other utf8::Parser objects, the match space must be valid UTF-8
35			|s| unsafe { str::from_utf8_unchecked(s) },
36		))
37	}
38
39	// Remaining methods in impl only delegate to base parser::Parser
40
41	/// Apply the parser to parse input.
42	pub fn parse(&self, input: &'a [u8]) -> Result<O> {
43		self.0.parse(input)
44	}
45
46	/// Parse input at specified byte position.
47	pub fn parse_at(&self, input: &'a [u8], start: usize) -> Result<(O, usize)> {
48		self.0.parse_at(input, start)
49	}
50
51	/// Apply the parser to parse input.
52	pub fn parse_str(&self, input: &'a str) -> Result<O> {
53		self.0.parse(input.as_bytes())
54	}
55
56	/// Convert parser result to desired value.
57	pub fn map<U, F>(self, f: F) -> Parser<'a, U>
58	where
59		F: Fn(O) -> U + 'a,
60		O: 'a,
61		U: 'a,
62	{
63		Parser(self.0.map(f))
64	}
65
66	/// Convert parser result to desired value, fail in case of conversion error.
67	pub fn convert<U, E, F>(self, f: F) -> Parser<'a, U>
68	where
69		F: Fn(O) -> ::std::result::Result<U, E> + 'a,
70		E: Debug,
71		O: 'a,
72		U: 'a,
73	{
74		Parser(self.0.convert(f))
75	}
76
77	/// Cache parser output result to speed up backtracking.
78	pub fn cache(self) -> Self
79	where
80		O: Clone + 'a,
81	{
82		Self(self.0.cache())
83	}
84
85	/// Get input position after matching parser.
86	pub fn pos(self) -> Parser<'a, usize>
87	where
88		O: 'a,
89	{
90		Parser(self.0.pos())
91	}
92
93	/// Discard parser output.
94	pub fn discard(self) -> Parser<'a, ()>
95	where
96		O: 'a,
97	{
98		Parser(self.0.discard())
99	}
100
101	/// Make parser optional.
102	pub fn opt(self) -> Parser<'a, Option<O>>
103	where
104		O: 'a,
105	{
106		Parser(self.0.opt())
107	}
108
109	/// `p.repeat(5)` repeat p exactly 5 times
110	/// `p.repeat(0..)` repeat p zero or more times
111	/// `p.repeat(1..)` repeat p one or more times
112	/// `p.repeat(1..4)` match p at least 1 and at most 3 times
113	pub fn repeat<R>(self, range: R) -> Parser<'a, Vec<O>>
114	where
115		R: RangeArgument<usize> + Debug + 'a,
116		O: 'a,
117	{
118		Parser(self.0.repeat(range))
119	}
120
121	/// Give parser a name to identify parsing errors.
122	pub fn name(self, name: &'a str) -> Self
123	where
124		O: 'a,
125	{
126		Self(self.0.name(name))
127	}
128
129	/// Mark parser as expected, abort early when failed in ordered choice.
130	pub fn expect(self, name: &'a str) -> Self
131	where
132		O: 'a,
133	{
134		Self(self.0.expect(name))
135	}
136}
137
138impl<'a, O> From<Parser<'a, O>> for parser::Parser<'a, u8, O> {
139	fn from(parser: Parser<'a, O>) -> Self {
140		parser.0 // Simply unwrap
141	}
142}
143
144pub fn decode(slice: &[u8], start: usize) -> Result<(char, usize)> {
145	let (ch, size) = decode_utf8(&slice[start..]);
146	let Some(ch) = ch else {
147		return no_utf8(start, size);
148	};
149	Ok((ch, size))
150}
151
152// Helper for functions that decode_utf8 and fail
153fn no_utf8<T>(start: usize, size: usize) -> Result<T> {
154	Err(Error::Mismatch {
155		message: if size == 0 {
156			"end of input reached"
157		} else {
158			"not UTF-8"
159		}
160		.to_owned(),
161		position: start,
162	})
163}
164
165/// Match any UTF-8 character.
166pub fn any<'a>() -> Parser<'a, char> {
167	Parser::new(|input: &[u8], start: usize| {
168		let (ch, size) = decode(input, start)?;
169		let pos = start + size;
170		Ok((ch, pos))
171	})
172}
173
174/// Match specific UTF-8 character.
175pub fn sym<'a>(tag: char) -> Parser<'a, char> {
176	Parser::new(move |input: &[u8], start: usize| {
177		let (ch, size) = decode(input, start)?;
178		if ch != tag {
179			return Err(Error::Mismatch {
180				message: format!("expect: {}, found: {}", tag, ch),
181				position: start,
182			});
183		}
184		let pos = start + size;
185		Ok((ch, pos))
186	})
187}
188
189/// Success when sequence of chars matches current input.
190pub fn seq<'a, 'b: 'a>(tag_str: &'b str) -> Parser<'a, &'a str> {
191	let tag = tag_str.as_bytes();
192	Parser::new(move |input: &'a [u8], start: usize| {
193		let mut index = 0;
194		loop {
195			let pos = start + index;
196			if index == tag.len() {
197				let result = &input[start..pos];
198				// UNSAFE: Because slice is byte-identical to a str, it is known valid UTF-8
199				let result_str = unsafe { str::from_utf8_unchecked(result) };
200				return Ok((result_str, pos));
201			}
202			let Some(s) = input.get(pos) else {
203				return Err(Error::Incomplete);
204			};
205			if tag[index] != *s {
206				return Err(Error::Mismatch {
207					message: format!("seq {:?} at byte index: {}", tag, pos),
208					position: pos,
209				});
210			}
211			index += 1;
212		}
213	})
214}
215
216/// Success when current input symbol is one of the set.
217pub fn one_of<'a, S>(set: &'a S) -> Parser<'a, char>
218where
219	S: Set<char> + ?Sized,
220{
221	Parser::new(move |input: &'a [u8], start: usize| {
222		let (ch, size) = decode(input, start)?;
223		if !set.contains(&ch) {
224			return Err(Error::Mismatch {
225				message: format!("expect one of: {}, found: {}", set.to_str(), ch),
226				position: start,
227			});
228		}
229		let pos = start + size;
230		Ok((ch, pos))
231	})
232}
233
234/// Success when current input symbol is none of the set.
235pub fn none_of<'a, S>(set: &'a S) -> Parser<'a, char>
236where
237	S: Set<char> + ?Sized,
238{
239	Parser::new(move |input: &'a [u8], start: usize| {
240		let (ch, size) = decode(input, start)?;
241		if set.contains(&ch) {
242			return Err(Error::Mismatch {
243				message: format!("expect one of: {}, found: {}", set.to_str(), ch),
244				position: start,
245			});
246		}
247		let pos = start + size;
248		Ok((ch, pos))
249	})
250}
251
252/// Success when predicate returns true on current input symbol.
253pub fn is_a<'a, F>(predicate: F) -> Parser<'a, char>
254where
255	F: Fn(char) -> bool + 'a,
256{
257	Parser::new(move |input: &'a [u8], start: usize| {
258		let (ch, size) = decode(input, start)?;
259		if !predicate(ch) {
260			return Err(Error::Mismatch {
261				message: format!("is_a predicate failed on: {}", ch),
262				position: start,
263			});
264		}
265		let pos = start + size;
266		Ok((ch, pos))
267	})
268}
269
270/// Success when predicate returns false on current input symbol.
271pub fn not_a<'a, F>(predicate: F) -> Parser<'a, char>
272where
273	F: Fn(char) -> bool + 'a,
274{
275	Parser::new(move |input: &'a [u8], start: usize| {
276		let (ch, size) = decode(input, start)?;
277		if predicate(ch) {
278			return Err(Error::Mismatch {
279				message: format!("is_a predicate failed on: {}", ch),
280				position: start,
281			});
282		}
283		let pos = start + size;
284		Ok((ch, pos))
285	})
286}
287
288/// Read n chars.
289pub fn take<'a>(n: usize) -> Parser<'a, &'a str> {
290	Parser::new(move |input: &'a [u8], start: usize| {
291		let mut byte_pos = start;
292		for _ in 0..n {
293			let (ch, size) = decode_utf8(&input[start..]);
294			if ch.is_none() {
295				return no_utf8(byte_pos, size);
296			}
297			byte_pos += size;
298		}
299		let result = &input[start..byte_pos];
300		// UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8
301		let result_str = unsafe { str::from_utf8_unchecked(result) };
302		Ok((result_str, byte_pos))
303	})
304}
305
306/// Skip n symbols.
307pub fn skip<'a>(n: usize) -> Parser<'a, ()> {
308	Parser::new(move |input: &'a [u8], start: usize| {
309		let mut byte_pos = start;
310		for _ in 0..n {
311			let (ch, size) = decode_utf8(&input[start..]);
312			if ch.is_none() {
313				return no_utf8(byte_pos, size);
314			}
315			byte_pos += size;
316		}
317		Ok(((), byte_pos))
318	})
319}
320
321/// Read n bytes exactly.
322pub fn take_bytes<'a>(n: usize) -> Parser<'a, &'a str> {
323	Parser::new(move |input: &'a [u8], start: usize| {
324		// FIXME: This runs in linear time because it checks each character.
325		// If we could remember which inputs were passed in from parse_str() instead of parse(),
326		// we could assume the characters are valid utf8 and run this in constant time by only checking
327		// the final character using bstr::decode_last_utf8.
328		let mut byte_pos = start;
329		loop {
330			let (ch, size) = decode_utf8(&input[start..]);
331			if ch.is_none() {
332				return no_utf8(byte_pos, size);
333			}
334			byte_pos += size;
335			if byte_pos > n {
336				return Err(Error::Mismatch {
337					message: "range splits a UTF-8 character".to_owned(),
338					position: start,
339				});
340			}
341			if byte_pos == n {
342				let result = &input[start..byte_pos];
343				// UNSAFE: Because every char has been checked by decode_utf8, this string is known utf8
344				let result_str = unsafe { str::from_utf8_unchecked(result) };
345				return Ok((result_str, byte_pos));
346			}
347		}
348	})
349}
350
351/// Skip n bytes exactly.
352pub fn skip_bytes<'a>(n: usize) -> Parser<'a, ()> {
353	Parser::new(move |input: &'a [u8], start: usize| {
354		// FIXME: See note on take_bytes.
355		let mut byte_pos = start;
356		loop {
357			let (ch, size) = decode_utf8(&input[start..]);
358			if ch.is_none() {
359				return no_utf8(byte_pos, size);
360			}
361			byte_pos += size;
362			if byte_pos > n {
363				return Err(Error::Mismatch {
364					message: "range splits a UTF-8 character".to_owned(),
365					position: start,
366				});
367			}
368			if byte_pos == n {
369				return Ok(((), byte_pos));
370			}
371		}
372	})
373}
374
375/// Chain two parsers where the second parser depends on the first's result.
376impl<'a, O: 'a, U: 'a, F: Fn(O) -> Parser<'a, U> + 'a> Shr<F> for Parser<'a, O> {
377	type Output = Parser<'a, U>;
378
379	fn shr(self, other: F) -> Self::Output {
380		Parser::new(move |input: &'a [u8], start: usize| {
381			(self.0.method)(input, start).and_then(|(out, pos)| (other(out).0.method)(input, pos))
382		})
383	}
384}
385
386// Note: There are no "degrade to parser::Parser" implementations for >>
387// because Rust cannot tell the difference between an FN(O)->U and an FN(O)->V.
388
389// Remaining functions in file only delegate to base parser::Parser
390
391/// Always succeeds, consume no input.
392pub fn empty<'a>() -> Parser<'a, ()> {
393	Parser(parser::empty())
394}
395
396/// Parse separated list.
397pub fn list<'a, O, U>(item: Parser<'a, O>, separator: Parser<'a, U>) -> Parser<'a, Vec<O>>
398where
399	O: 'a,
400	U: 'a,
401{
402	Parser(parser::list(item.0, separator.0))
403}
404
405/// Call a parser factory, can be used to create recursive parsers.
406pub fn call<'a, O, F>(parser_factory: F) -> Parser<'a, O>
407where
408	O: 'a,
409	F: Fn() -> Parser<'a, O> + 'a,
410{
411	Parser(parser::call(move || parser_factory().0))
412}
413
414/// Success when end of input is reached.
415pub fn end<'a>() -> Parser<'a, ()> {
416	Parser(parser::end())
417}
418
419// And, Sub and Mul are similar enough we can implement them with macros
420
// Implements a binary operator trait for `utf8::Parser OP utf8::Parser`.
// Both operands are UTF-8 parsers, so the result stays wrapped as a UTF-8 parser.
// `$output` may refer to the impl's `Left` and `Right` type parameters.
macro_rules! utf_op {
	( $op_trait:ident, $method:ident, $operator:tt, $output:ty, $summary:expr ) => {
		#[doc=$summary]
		impl<'a, Left, Right> $op_trait<Parser<'a, Right>> for Parser<'a, Left>
		where
			Left: 'a,
			Right: 'a,
		{
			type Output = Parser<'a, $output>;

			fn $method(self, other: Parser<'a, Right>) -> Self::Output {
				let Parser(lhs) = self;
				let Parser(rhs) = other;
				Parser(lhs $operator rhs)
			}
		}
	};
}
433
// Implements a binary operator trait for `utf8::Parser OP parser::Parser<u8>`.
// The right-hand side carries no UTF-8 guarantee, so the result is a plain byte parser.
// `$output` may refer to the impl's `Left` and `Right` type parameters.
macro_rules! utf_u8_op {
	( $op_trait:ident, $method:ident, $operator:tt, $output:ty, $summary:expr ) => {
		#[doc=concat!($summary, " (but degrade to non-utf8 parser)")]
		impl<'a, Left, Right> $op_trait<parser::Parser<'a, u8, Right>> for Parser<'a, Left>
		where
			Left: 'a,
			Right: 'a,
		{
			type Output = parser::Parser<'a, u8, $output>;

			fn $method(self, other: parser::Parser<'a, u8, Right>) -> Self::Output {
				let Parser(lhs) = self;
				lhs $operator other
			}
		}
	};
}
446
// Implements a binary operator trait for `parser::Parser<u8> OP utf8::Parser`.
// The left-hand side carries no UTF-8 guarantee, so the result is a plain byte parser.
// `$output` may refer to the impl's `Left` and `Right` type parameters.
macro_rules! u8_utf_op {
	( $op_trait:ident, $method:ident, $operator:tt, $output:ty, $summary:expr ) => {
		#[doc=concat!($summary, " (but degrade to non-utf8 parser)")]
		impl<'a, Left, Right> $op_trait<Parser<'a, Right>> for parser::Parser<'a, u8, Left>
		where
			Left: 'a,
			Right: 'a,
		{
			type Output = parser::Parser<'a, u8, $output>;

			fn $method(self, other: Parser<'a, Right>) -> Self::Output {
				let Parser(rhs) = other;
				self $operator rhs
			}
		}
	};
}
459
460macro_rules! all_op {
461	( $impl_name:ident, $fn_name:ident, $op:tt, $return_type:ty, $doc:expr ) => {
462		utf_op!($impl_name, $fn_name, $op, $return_type, $doc);
463		utf_u8_op!($impl_name, $fn_name, $op, $return_type, $doc);
464		u8_utf_op!($impl_name, $fn_name, $op, $return_type, $doc);
465	};
466}
467
468all_op!(Add, add, +, (Left, Right), "Sequence reserve value");
469
470all_op!(Sub, sub, -, Left, "Sequence discard second value");
471
472all_op!(Mul, mul, *, Right, "Sequence discard first value");
473
474/// Ordered choice
475impl<'a, O: 'a> BitOr for Parser<'a, O> {
476	type Output = Self;
477
478	fn bitor(self, other: Self) -> Self {
479		Self(self.0 | other.0)
480	}
481}
482
483/// Ordered choice (but degrade to non-utf8 parser)
484impl<'a, O: 'a> BitOr<parser::Parser<'a, u8, O>> for Parser<'a, O> {
485	type Output = parser::Parser<'a, u8, O>;
486
487	fn bitor(self, other: parser::Parser<'a, u8, O>) -> Self::Output {
488		self.0 | other
489	}
490}
491
492/// Ordered choice (but degrade to non-utf8 parser)
493impl<'a, O: 'a> BitOr<Parser<'a, O>> for parser::Parser<'a, u8, O> {
494	type Output = parser::Parser<'a, u8, O>;
495
496	fn bitor(self, other: Parser<'a, O>) -> Self::Output {
497		self | other.0
498	}
499}
500
501/// And predicate
502impl<'a, O: 'a> Neg for Parser<'a, O> {
503	type Output = Parser<'a, bool>;
504
505	fn neg(self) -> Self::Output {
506		Parser(-self.0)
507	}
508}
509
510/// Not predicate
511impl<'a, O: 'a> Not for Parser<'a, O> {
512	type Output = Parser<'a, bool>;
513
514	fn not(self) -> Self::Output {
515		Parser(!self.0)
516	}
517}