nquads_syntax/
lexing.rs

1use super::BlankIdBuf;
2use decoded_char::DecodedChar;
3use iref::IriBuf;
4use langtag::LangTagBuf;
5use locspan::{ErrAt, Meta, Span};
6use std::{fmt, iter::Peekable};
7
/// Fallible tokens iterator with lookahead.
///
/// Every fallible method reports its failure as a `Self::Error` paired with
/// the [`Span`] at which it occurred.
pub trait Tokens {
	/// Error produced when a token cannot be read.
	type Error;

	/// Returns a reference to the next token without consuming it.
	///
	/// The token is `None` when there is nothing left to read.
	#[allow(clippy::type_complexity)]
	fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Self::Error, Span>>;

	/// Consumes and returns the next token.
	///
	/// The token is `None` when there is nothing left to read.
	#[allow(clippy::type_complexity)]
	fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Self::Error, Span>>;

	/// Begins a new span.
	///
	/// Skips white spaces and returns an empty span at the cursor position.
	fn begin(&mut self) -> Result<Span, Meta<Self::Error, Span>>;

	/// Returns the span of the last parsed token.
	fn last(&self) -> Span;
}
26
/// Lexing error.
///
/// `E` is the error type produced by the underlying character stream.
#[derive(Debug)]
pub enum Error<E = std::convert::Infallible> {
	/// The text read after `@` does not form a valid language tag.
	InvalidLangTag,
	/// A `\u`/`\U` escape sequence encodes a value that is not a valid `char`.
	InvalidCodepoint(u32),
	/// The text read between `<` and `>` was rejected as an IRI; the payload
	/// is the rejected text.
	InvalidIriRef(String),
	/// An unexpected character was found, or the end of the input (`None`).
	Unexpected(Option<char>),
	/// The underlying character stream failed.
	Stream(E),
}

impl<E: fmt::Display> fmt::Display for Error<E> {
	/// Writes a human readable description of the error.
	///
	/// Stream errors are displayed exactly like the wrapped error.
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
		match self {
			Self::Stream(e) => fmt::Display::fmt(e, f),
			Self::Unexpected(Some(c)) => write!(f, "unexpected character `{c}`"),
			Self::Unexpected(None) => f.write_str("unexpected end of file"),
			Self::InvalidIriRef(iri_ref) => write!(f, "invalid IRI reference <{iri_ref}>"),
			Self::InvalidCodepoint(c) => write!(f, "invalid character code point {c:x}"),
			Self::InvalidLangTag => f.write_str("invalid language tag"),
		}
	}
}

impl<E: 'static + std::error::Error> std::error::Error for Error<E> {
	/// Only stream errors have an underlying cause: the wrapped error itself.
	fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
		if let Self::Stream(e) = self {
			Some(e)
		} else {
			None
		}
	}
}
60
/// Token.
///
/// One lexical unit produced by the [`Lexer`].
#[derive(Debug)]
pub enum Token {
	/// Language tag, introduced by `@` in the input.
	LangTag(LangTagBuf),
	/// IRI, written between `<` and `>` in the input.
	Iri(IriBuf),
	/// String literal, written between double quotes in the input; escape
	/// sequences are already decoded.
	StringLiteral(String),
	/// Blank node label, including its `_:` prefix.
	BlankNodeLabel(BlankIdBuf),
	/// The `.` token.
	Dot,
	/// The `^^` token.
	Carets,
}
71
72impl fmt::Display for Token {
73	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
74		match self {
75			Self::LangTag(tag) => write!(f, "language tag `{tag}`"),
76			Self::Iri(iri) => write!(f, "IRI <{iri}>"),
77			Self::StringLiteral(string) => {
78				write!(f, "string literal \"{}\"", DisplayStringLiteral(string))
79			}
80			Self::BlankNodeLabel(label) => write!(f, "blank node label `{label}`"),
81			Self::Dot => write!(f, "dot `.`"),
82			Self::Carets => write!(f, "carets `^^`"),
83		}
84	}
85}
86
/// Wrapper to display string literals.
///
/// Displays the wrapped string with the characters `"`, `\`, line feed,
/// carriage return, tab, backspace and form feed replaced by an escape
/// sequence. Surrounding quotes are not written.
pub struct DisplayStringLiteral<'a>(pub &'a str);

impl<'a> fmt::Display for DisplayStringLiteral<'a> {
	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
		for c in self.0.chars() {
			let escape = match c {
				'"' => "\\u0022",
				'\\' => "\\u005c",
				'\n' => "\\n",
				'\r' => "\\r",
				'\t' => "\\t",
				'\u{08}' => "\\b",
				'\u{0c}' => "\\f",
				_ => {
					// Any other character is displayed as is.
					fmt::Display::fmt(&c, f)?;
					continue;
				}
			};

			f.write_str(escape)?;
		}

		Ok(())
	}
}
108
109/// Characters iterator.
110struct Chars<C: Iterator>(Peekable<C>);
111
112impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Chars<C> {
113	fn peek(&mut self) -> Result<Option<DecodedChar>, Error<E>> {
114		match self.0.peek() {
115			None => Ok(None),
116			Some(Ok(c)) => Ok(Some(*c)),
117			Some(Err(_)) => self.next(),
118		}
119	}
120
121	fn next(&mut self) -> Result<Option<DecodedChar>, Error<E>> {
122		self.0.next().transpose().map_err(Error::Stream)
123	}
124}
125
126/// Lexer position.
127#[derive(Default)]
128struct Position {
129	span: Span,
130	last_span: Span,
131}
132
133impl Position {
134	fn current(&self) -> Span {
135		self.span
136	}
137
138	fn end(&self) -> Span {
139		self.span.end().into()
140	}
141
142	fn last(&self) -> Span {
143		self.last_span
144	}
145}
146
/// Lexer.
///
/// Changes a character iterator into a `Token` iterator.
///
/// `C` is the character iterator and `E` the error it may yield for each
/// character.
pub struct Lexer<C: Iterator<Item = Result<DecodedChar, E>>, E> {
	/// Input characters, with one character of lookahead.
	chars: Chars<C>,
	/// Spans of the text read so far.
	pos: Position,
	/// Token already read by `peek` and not yet returned by `next`, if any.
	lookahead: Option<Meta<Token, Span>>,
}
155
156impl<C: Iterator<Item = Result<DecodedChar, E>>, E> Lexer<C, E> {
157	pub fn new(chars: C) -> Self {
158		Self {
159			chars: Chars(chars.peekable()),
160			pos: Position::default(),
161			lookahead: None,
162		}
163	}
164}
165
166impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Lexer<C, E> {
167	fn peek_decoded_char(&mut self) -> Result<Option<DecodedChar>, Meta<Error<E>, Span>> {
168		self.chars.peek().err_at(|| self.pos.end())
169	}
170
171	fn peek_char(&mut self) -> Result<Option<char>, Meta<Error<E>, Span>> {
172		self.peek_decoded_char()
173			.map(|c| c.map(DecodedChar::into_char))
174	}
175
176	fn next_char(&mut self) -> Result<Option<char>, Meta<Error<E>, Span>> {
177		match self.chars.next().err_at(|| self.pos.end())? {
178			Some(c) => {
179				self.pos.span.push(c.len());
180				self.pos.last_span.clear();
181				self.pos.last_span.push(c.len());
182				Ok(Some(*c))
183			}
184			None => Ok(None),
185		}
186	}
187
188	fn expect_char(&mut self) -> Result<char, Meta<Error<E>, Span>> {
189		self.next_char()?
190			.ok_or_else(|| Meta(Error::Unexpected(None), self.pos.end()))
191	}
192
193	fn skip_whitespaces(&mut self) -> Result<(), Meta<Error<E>, Span>> {
194		while let Some(c) = self.peek_char()? {
195			if c.is_whitespace() {
196				self.next_char()?;
197			} else if c == '#' {
198				self.next_comment()?;
199			} else {
200				break;
201			}
202		}
203
204		self.pos.span.clear();
205		Ok(())
206	}
207
208	/// Parses the rest of a comment, after the first `#` character.
209	///
210	/// Comments in N-Quads take the form of `#`,
211	/// outside an IRIREF or STRING_LITERAL_QUOTE,
212	/// and continue to the end of line (EOL) or end of file
213	/// if there is no end of line after the comment marker.
214	fn next_comment(&mut self) -> Result<(), Meta<Error<E>, Span>> {
215		loop {
216			if matches!(self.next_char()?, None | Some('\n')) {
217				break Ok(());
218			}
219		}
220	}
221
222	/// Parses the rest of a lang tag, after the first `@` character.
223	fn next_langtag(&mut self) -> Result<Meta<LangTagBuf, Span>, Meta<Error<E>, Span>> {
224		let mut tag = String::new();
225
226		loop {
227			match self.peek_char()? {
228				None => {
229					if tag.is_empty() {
230						return Err(Meta(Error::InvalidLangTag, self.pos.current()));
231					} else {
232						break;
233					}
234				}
235				Some(c) => {
236					if c.is_ascii_alphabetic() {
237						tag.push(self.expect_char()?);
238					} else if c.is_whitespace() || c == '-' {
239						if tag.is_empty() {
240							return Err(Meta(Error::InvalidLangTag, self.pos.current()));
241						} else {
242							break;
243						}
244					} else {
245						self.next_char()?;
246						return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
247					}
248				}
249			}
250		}
251
252		let mut empty_subtag = true;
253		if let Some('-') = self.peek_char()? {
254			tag.push(self.expect_char()?);
255			loop {
256				match self.peek_char()? {
257					Some('-') if !empty_subtag => tag.push(self.expect_char()?),
258					Some(c) if c.is_ascii_alphanumeric() => {
259						empty_subtag = false;
260						tag.push(self.expect_char()?)
261					}
262					Some(c) => {
263						if c.is_whitespace() {
264							if empty_subtag {
265								return Err(Meta(Error::InvalidLangTag, self.pos.current()));
266							} else {
267								break;
268							}
269						} else {
270							self.next_char()?;
271							return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
272						}
273					}
274					None => {
275						if empty_subtag {
276							return Err(Meta(Error::InvalidLangTag, self.pos.current()));
277						} else {
278							break;
279						}
280					}
281				}
282			}
283		}
284
285		match LangTagBuf::new(tag) {
286			Ok(tag) => Ok(Meta(tag, self.pos.current())),
287			Err(_) => Err(Meta(Error::InvalidLangTag, self.pos.current())),
288		}
289	}
290
291	/// Parses an IRI, starting after the first `<` until the closing `>`.
292	fn next_iri(&mut self) -> Result<Meta<IriBuf, Span>, Meta<Error<E>, Span>> {
293		let mut iri = String::new();
294
295		loop {
296			match self.next_char()? {
297				Some('>') => break,
298				Some('\\') => {
299					let span = self.pos.last();
300					let c = match self.next_char()? {
301						Some('u') => self.next_uchar(span, 4)?,
302						Some('U') => self.next_uchar(span, 8)?,
303						unexpected => {
304							return Err(Meta(Error::Unexpected(unexpected), self.pos.last()))
305						}
306					};
307
308					iri.push(c)
309				}
310				Some(c) => {
311					if matches!(
312						c,
313						'\u{00}'..='\u{20}' | '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\'
314					) {
315						return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
316					}
317
318					iri.push(c)
319				}
320				None => return Err(Meta(Error::Unexpected(None), self.pos.end())),
321			}
322		}
323
324		match IriBuf::new(iri) {
325			Ok(iri) => Ok(Meta(iri, self.pos.current())),
326			Err(e) => Err(Meta(Error::InvalidIriRef(e.0), self.pos.current())),
327		}
328	}
329
330	fn next_uchar(&mut self, mut span: Span, len: u8) -> Result<char, Meta<Error<E>, Span>> {
331		let mut codepoint = 0;
332
333		for _ in 0..len {
334			let c = self.expect_char()?;
335			match c.to_digit(16) {
336				Some(d) => codepoint = codepoint << 4 | d,
337				None => return Err(Meta(Error::Unexpected(Some(c)), self.pos.last())),
338			}
339		}
340
341		span.set_end(self.pos.current().end());
342		match char::try_from(codepoint) {
343			Ok(c) => Ok(c),
344			Err(_) => Err(Meta(Error::InvalidCodepoint(codepoint), span)),
345		}
346	}
347
348	/// Parses a string literal, starting after the first `"` until the closing `"`.
349	fn next_string_literal(&mut self) -> Result<Meta<String, Span>, Meta<Error<E>, Span>> {
350		let mut string = String::new();
351
352		loop {
353			match self.next_char()? {
354				Some('"') => break,
355				Some('\\') => {
356					let span = self.pos.last();
357					let c = match self.next_char()? {
358						Some('u') => self.next_uchar(span, 4)?,
359						Some('U') => self.next_uchar(span, 8)?,
360						Some('t') => '\t',
361						Some('b') => '\u{08}',
362						Some('n') => '\n',
363						Some('r') => '\r',
364						Some('f') => '\u{0c}',
365						Some('\'') => '\'',
366						Some('"') => '"',
367						Some('\\') => '\\',
368						unexpected => {
369							return Err(Meta(Error::Unexpected(unexpected), self.pos.last()))
370						}
371					};
372
373					string.push(c)
374				}
375				Some(c) => {
376					if matches!(c, '\n' | '\r') {
377						return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
378					}
379
380					string.push(c)
381				}
382				None => return Err(Meta(Error::Unexpected(None), self.pos.end())),
383			}
384		}
385
386		Ok(Meta(string, self.pos.current()))
387	}
388
389	/// Parses a blank node label, starting after the first `_`.
390	fn next_blank_node_label(&mut self) -> Result<Meta<BlankIdBuf, Span>, Meta<Error<E>, Span>> {
391		match self.next_char()? {
392			Some(':') => {
393				let mut label = String::new();
394				label.push('_');
395				label.push(':');
396				match self.next_char()? {
397					Some(c) if c.is_ascii_digit() || is_pn_chars_u(c) => {
398						label.push(c);
399						let mut last_is_pn_chars = true;
400						loop {
401							match self.peek_char()? {
402								Some(c) if is_pn_chars(c) => {
403									label.push(self.expect_char()?);
404									last_is_pn_chars = true
405								}
406								Some('.') => {
407									label.push(self.expect_char()?);
408									last_is_pn_chars = false;
409								}
410								_ if last_is_pn_chars => break,
411								unexpected => {
412									return Err(Meta(
413										Error::Unexpected(unexpected),
414										self.pos.last(),
415									))
416								}
417							}
418						}
419
420						Ok(Meta(
421							unsafe { BlankIdBuf::new_unchecked(label) },
422							self.pos.current(),
423						))
424					}
425					unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
426				}
427			}
428			unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
429		}
430	}
431
432	pub fn consume(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
433		self.skip_whitespaces()?;
434		match self.next_char()? {
435			Some('@') => Ok(self.next_langtag()?.map(|t| Some(Token::LangTag(t)))),
436			Some('<') => Ok(self.next_iri()?.map(|t| Some(Token::Iri(t)))),
437			Some('"') => Ok(self
438				.next_string_literal()?
439				.map(|t| Some(Token::StringLiteral(t)))),
440			Some('_') => Ok(self
441				.next_blank_node_label()?
442				.map(|t| Some(Token::BlankNodeLabel(t)))),
443			Some('.') => Ok(Meta(Some(Token::Dot), self.pos.current())),
444			Some('^') => match self.next_char()? {
445				Some('^') => Ok(Meta(Some(Token::Carets), self.pos.current())),
446				unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
447			},
448			None => Ok(Meta(None, self.pos.end())),
449			unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
450		}
451	}
452
453	#[allow(clippy::type_complexity)]
454	pub fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Error<E>, Span>> {
455		if self.lookahead.is_none() {
456			if let Meta(Some(token), span) = self.consume()? {
457				self.lookahead = Some(Meta::new(token, span));
458			}
459		}
460
461		match &self.lookahead {
462			Some(Meta(token, span)) => Ok(Meta::new(Some(token), *span)),
463			None => Ok(Meta::new(None, self.pos.end())),
464		}
465	}
466
467	#[allow(clippy::type_complexity, clippy::should_implement_trait)]
468	pub fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
469		match self.lookahead.take() {
470			Some(Meta(token, span)) => Ok(Meta::new(Some(token), span)),
471			None => self.consume(),
472		}
473	}
474}
475
476impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Tokens for Lexer<C, E> {
477	type Error = Error<E>;
478
479	fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Error<E>, Span>> {
480		self.peek()
481	}
482
483	fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
484		self.next()
485	}
486
487	fn begin(&mut self) -> Result<Span, Meta<Error<E>, Span>> {
488		self.skip_whitespaces()?;
489		Ok(self.pos.current())
490	}
491
492	fn last(&self) -> Span {
493		self.pos.last_span
494	}
495}
496
497impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Iterator for Lexer<C, E> {
498	type Item = Result<Meta<Token, Span>, Meta<Error<E>, Span>>;
499
500	fn next(&mut self) -> Option<Self::Item> {
501		match self.next() {
502			Ok(Meta(Some(token), loc)) => Some(Ok(Meta::new(token, loc))),
503			Ok(Meta(None, _)) => None,
504			Err(e) => Some(Err(e)),
505		}
506	}
507}
508
/// Checks if `c` matches the `PN_CHARS_BASE` production: an ASCII letter or a
/// character in one of the non-ASCII ranges listed below.
fn is_pn_chars_base(c: char) -> bool {
	/// Inclusive character ranges accepted by the production.
	const RANGES: [(char, char); 14] = [
		('A', 'Z'),
		('a', 'z'),
		('\u{00c0}', '\u{00d6}'),
		('\u{00d8}', '\u{00f6}'),
		('\u{00f8}', '\u{02ff}'),
		('\u{0370}', '\u{037d}'),
		('\u{037f}', '\u{1fff}'),
		('\u{200c}', '\u{200d}'),
		('\u{2070}', '\u{218f}'),
		('\u{2c00}', '\u{2fef}'),
		('\u{3001}', '\u{d7ff}'),
		('\u{f900}', '\u{fdcf}'),
		('\u{fdf0}', '\u{fffd}'),
		('\u{10000}', '\u{effff}'),
	];

	RANGES.iter().any(|&(first, last)| first <= c && c <= last)
}
512
513fn is_pn_chars_u(c: char) -> bool {
514	is_pn_chars_base(c) || matches!(c, '_' | ':')
515}
516
517fn is_pn_chars(c: char) -> bool {
518	is_pn_chars_u(c)
519		|| matches!(c, '-' | '0'..='9' | '\u{00b7}' | '\u{0300}'..='\u{036f}' | '\u{203f}'..='\u{2040}')
520}