1use super::BlankIdBuf;
2use decoded_char::DecodedChar;
3use iref::IriBuf;
4use langtag::LangTagBuf;
5use locspan::{ErrAt, Meta, Span};
6use std::{fmt, iter::Peekable};
7
8pub trait Tokens {
10 type Error;
11
12 #[allow(clippy::type_complexity)]
13 fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Self::Error, Span>>;
14
15 #[allow(clippy::type_complexity)]
16 fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Self::Error, Span>>;
17
18 fn begin(&mut self) -> Result<Span, Meta<Self::Error, Span>>;
22
23 fn last(&self) -> Span;
25}
26
/// Lexing error.
///
/// `E` is the error type of the underlying character stream.
#[derive(Debug)]
pub enum Error<E = std::convert::Infallible> {
    /// The text following `@` is not a valid language tag.
    InvalidLangTag,

    /// A `\u`/`\U` escape sequence denotes a value that is not a valid `char`.
    InvalidCodepoint(u32),

    /// The text between `<` and `>` was rejected as an IRI.
    InvalidIriRef(String),

    /// Unexpected character, or unexpected end of input when `None`.
    Unexpected(Option<char>),

    /// Error raised by the underlying character stream.
    Stream(E),
}

impl<E> fmt::Display for Error<E>
where
    E: fmt::Display,
{
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::Stream(inner) => inner.fmt(f),
            Self::Unexpected(Some(c)) => write!(f, "unexpected character `{c}`"),
            Self::Unexpected(None) => f.write_str("unexpected end of file"),
            Self::InvalidIriRef(iri_ref) => write!(f, "invalid IRI reference <{iri_ref}>"),
            // The offending value is printed in lowercase hexadecimal.
            Self::InvalidCodepoint(code) => write!(f, "invalid character code point {code:x}"),
            Self::InvalidLangTag => f.write_str("invalid language tag"),
        }
    }
}

impl<E> std::error::Error for Error<E>
where
    E: 'static + std::error::Error,
{
    /// Only stream errors have an underlying cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let Self::Stream(inner) = self {
            let cause: &(dyn std::error::Error + 'static) = inner;
            Some(cause)
        } else {
            None
        }
    }
}
60
61#[derive(Debug)]
63pub enum Token {
64 LangTag(LangTagBuf),
65 Iri(IriBuf),
66 StringLiteral(String),
67 BlankNodeLabel(BlankIdBuf),
68 Dot,
69 Carets,
70}
71
72impl fmt::Display for Token {
73 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
74 match self {
75 Self::LangTag(tag) => write!(f, "language tag `{tag}`"),
76 Self::Iri(iri) => write!(f, "IRI <{iri}>"),
77 Self::StringLiteral(string) => {
78 write!(f, "string literal \"{}\"", DisplayStringLiteral(string))
79 }
80 Self::BlankNodeLabel(label) => write!(f, "blank node label `{label}`"),
81 Self::Dot => write!(f, "dot `.`"),
82 Self::Carets => write!(f, "carets `^^`"),
83 }
84 }
85}
86
/// Wrapper that displays a string with string-literal escaping applied.
///
/// The surrounding double quotes are *not* written.
pub struct DisplayStringLiteral<'a>(pub &'a str);

impl fmt::Display for DisplayStringLiteral<'_> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.0.chars().try_for_each(|c| {
            // Escape sequence replacing `c`, if it needs one.
            let escape = match c {
                '"' => Some("\\u0022"),
                '\\' => Some("\\u005c"),
                '\n' => Some("\\n"),
                '\r' => Some("\\r"),
                '\t' => Some("\\t"),
                '\u{08}' => Some("\\b"),
                '\u{0c}' => Some("\\f"),
                _ => None,
            };

            match escape {
                Some(sequence) => f.write_str(sequence),
                None => fmt::Display::fmt(&c, f),
            }
        })
    }
}
108
/// Character stream with one item of lookahead.
struct Chars<C>(Peekable<C>)
where
    C: Iterator;
111
112impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Chars<C> {
113 fn peek(&mut self) -> Result<Option<DecodedChar>, Error<E>> {
114 match self.0.peek() {
115 None => Ok(None),
116 Some(Ok(c)) => Ok(Some(*c)),
117 Some(Err(_)) => self.next(),
118 }
119 }
120
121 fn next(&mut self) -> Result<Option<DecodedChar>, Error<E>> {
122 self.0.next().transpose().map_err(Error::Stream)
123 }
124}
125
126#[derive(Default)]
128struct Position {
129 span: Span,
130 last_span: Span,
131}
132
133impl Position {
134 fn current(&self) -> Span {
135 self.span
136 }
137
138 fn end(&self) -> Span {
139 self.span.end().into()
140 }
141
142 fn last(&self) -> Span {
143 self.last_span
144 }
145}
146
147pub struct Lexer<C: Iterator<Item = Result<DecodedChar, E>>, E> {
151 chars: Chars<C>,
152 pos: Position,
153 lookahead: Option<Meta<Token, Span>>,
154}
155
156impl<C: Iterator<Item = Result<DecodedChar, E>>, E> Lexer<C, E> {
157 pub fn new(chars: C) -> Self {
158 Self {
159 chars: Chars(chars.peekable()),
160 pos: Position::default(),
161 lookahead: None,
162 }
163 }
164}
165
166impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Lexer<C, E> {
167 fn peek_decoded_char(&mut self) -> Result<Option<DecodedChar>, Meta<Error<E>, Span>> {
168 self.chars.peek().err_at(|| self.pos.end())
169 }
170
171 fn peek_char(&mut self) -> Result<Option<char>, Meta<Error<E>, Span>> {
172 self.peek_decoded_char()
173 .map(|c| c.map(DecodedChar::into_char))
174 }
175
176 fn next_char(&mut self) -> Result<Option<char>, Meta<Error<E>, Span>> {
177 match self.chars.next().err_at(|| self.pos.end())? {
178 Some(c) => {
179 self.pos.span.push(c.len());
180 self.pos.last_span.clear();
181 self.pos.last_span.push(c.len());
182 Ok(Some(*c))
183 }
184 None => Ok(None),
185 }
186 }
187
188 fn expect_char(&mut self) -> Result<char, Meta<Error<E>, Span>> {
189 self.next_char()?
190 .ok_or_else(|| Meta(Error::Unexpected(None), self.pos.end()))
191 }
192
193 fn skip_whitespaces(&mut self) -> Result<(), Meta<Error<E>, Span>> {
194 while let Some(c) = self.peek_char()? {
195 if c.is_whitespace() {
196 self.next_char()?;
197 } else if c == '#' {
198 self.next_comment()?;
199 } else {
200 break;
201 }
202 }
203
204 self.pos.span.clear();
205 Ok(())
206 }
207
208 fn next_comment(&mut self) -> Result<(), Meta<Error<E>, Span>> {
215 loop {
216 if matches!(self.next_char()?, None | Some('\n')) {
217 break Ok(());
218 }
219 }
220 }
221
222 fn next_langtag(&mut self) -> Result<Meta<LangTagBuf, Span>, Meta<Error<E>, Span>> {
224 let mut tag = String::new();
225
226 loop {
227 match self.peek_char()? {
228 None => {
229 if tag.is_empty() {
230 return Err(Meta(Error::InvalidLangTag, self.pos.current()));
231 } else {
232 break;
233 }
234 }
235 Some(c) => {
236 if c.is_ascii_alphabetic() {
237 tag.push(self.expect_char()?);
238 } else if c.is_whitespace() || c == '-' {
239 if tag.is_empty() {
240 return Err(Meta(Error::InvalidLangTag, self.pos.current()));
241 } else {
242 break;
243 }
244 } else {
245 self.next_char()?;
246 return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
247 }
248 }
249 }
250 }
251
252 let mut empty_subtag = true;
253 if let Some('-') = self.peek_char()? {
254 tag.push(self.expect_char()?);
255 loop {
256 match self.peek_char()? {
257 Some('-') if !empty_subtag => tag.push(self.expect_char()?),
258 Some(c) if c.is_ascii_alphanumeric() => {
259 empty_subtag = false;
260 tag.push(self.expect_char()?)
261 }
262 Some(c) => {
263 if c.is_whitespace() {
264 if empty_subtag {
265 return Err(Meta(Error::InvalidLangTag, self.pos.current()));
266 } else {
267 break;
268 }
269 } else {
270 self.next_char()?;
271 return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
272 }
273 }
274 None => {
275 if empty_subtag {
276 return Err(Meta(Error::InvalidLangTag, self.pos.current()));
277 } else {
278 break;
279 }
280 }
281 }
282 }
283 }
284
285 match LangTagBuf::new(tag) {
286 Ok(tag) => Ok(Meta(tag, self.pos.current())),
287 Err(_) => Err(Meta(Error::InvalidLangTag, self.pos.current())),
288 }
289 }
290
291 fn next_iri(&mut self) -> Result<Meta<IriBuf, Span>, Meta<Error<E>, Span>> {
293 let mut iri = String::new();
294
295 loop {
296 match self.next_char()? {
297 Some('>') => break,
298 Some('\\') => {
299 let span = self.pos.last();
300 let c = match self.next_char()? {
301 Some('u') => self.next_uchar(span, 4)?,
302 Some('U') => self.next_uchar(span, 8)?,
303 unexpected => {
304 return Err(Meta(Error::Unexpected(unexpected), self.pos.last()))
305 }
306 };
307
308 iri.push(c)
309 }
310 Some(c) => {
311 if matches!(
312 c,
313 '\u{00}'..='\u{20}' | '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\'
314 ) {
315 return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
316 }
317
318 iri.push(c)
319 }
320 None => return Err(Meta(Error::Unexpected(None), self.pos.end())),
321 }
322 }
323
324 match IriBuf::new(iri) {
325 Ok(iri) => Ok(Meta(iri, self.pos.current())),
326 Err(e) => Err(Meta(Error::InvalidIriRef(e.0), self.pos.current())),
327 }
328 }
329
330 fn next_uchar(&mut self, mut span: Span, len: u8) -> Result<char, Meta<Error<E>, Span>> {
331 let mut codepoint = 0;
332
333 for _ in 0..len {
334 let c = self.expect_char()?;
335 match c.to_digit(16) {
336 Some(d) => codepoint = codepoint << 4 | d,
337 None => return Err(Meta(Error::Unexpected(Some(c)), self.pos.last())),
338 }
339 }
340
341 span.set_end(self.pos.current().end());
342 match char::try_from(codepoint) {
343 Ok(c) => Ok(c),
344 Err(_) => Err(Meta(Error::InvalidCodepoint(codepoint), span)),
345 }
346 }
347
348 fn next_string_literal(&mut self) -> Result<Meta<String, Span>, Meta<Error<E>, Span>> {
350 let mut string = String::new();
351
352 loop {
353 match self.next_char()? {
354 Some('"') => break,
355 Some('\\') => {
356 let span = self.pos.last();
357 let c = match self.next_char()? {
358 Some('u') => self.next_uchar(span, 4)?,
359 Some('U') => self.next_uchar(span, 8)?,
360 Some('t') => '\t',
361 Some('b') => '\u{08}',
362 Some('n') => '\n',
363 Some('r') => '\r',
364 Some('f') => '\u{0c}',
365 Some('\'') => '\'',
366 Some('"') => '"',
367 Some('\\') => '\\',
368 unexpected => {
369 return Err(Meta(Error::Unexpected(unexpected), self.pos.last()))
370 }
371 };
372
373 string.push(c)
374 }
375 Some(c) => {
376 if matches!(c, '\n' | '\r') {
377 return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
378 }
379
380 string.push(c)
381 }
382 None => return Err(Meta(Error::Unexpected(None), self.pos.end())),
383 }
384 }
385
386 Ok(Meta(string, self.pos.current()))
387 }
388
389 fn next_blank_node_label(&mut self) -> Result<Meta<BlankIdBuf, Span>, Meta<Error<E>, Span>> {
391 match self.next_char()? {
392 Some(':') => {
393 let mut label = String::new();
394 label.push('_');
395 label.push(':');
396 match self.next_char()? {
397 Some(c) if c.is_ascii_digit() || is_pn_chars_u(c) => {
398 label.push(c);
399 let mut last_is_pn_chars = true;
400 loop {
401 match self.peek_char()? {
402 Some(c) if is_pn_chars(c) => {
403 label.push(self.expect_char()?);
404 last_is_pn_chars = true
405 }
406 Some('.') => {
407 label.push(self.expect_char()?);
408 last_is_pn_chars = false;
409 }
410 _ if last_is_pn_chars => break,
411 unexpected => {
412 return Err(Meta(
413 Error::Unexpected(unexpected),
414 self.pos.last(),
415 ))
416 }
417 }
418 }
419
420 Ok(Meta(
421 unsafe { BlankIdBuf::new_unchecked(label) },
422 self.pos.current(),
423 ))
424 }
425 unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
426 }
427 }
428 unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
429 }
430 }
431
432 pub fn consume(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
433 self.skip_whitespaces()?;
434 match self.next_char()? {
435 Some('@') => Ok(self.next_langtag()?.map(|t| Some(Token::LangTag(t)))),
436 Some('<') => Ok(self.next_iri()?.map(|t| Some(Token::Iri(t)))),
437 Some('"') => Ok(self
438 .next_string_literal()?
439 .map(|t| Some(Token::StringLiteral(t)))),
440 Some('_') => Ok(self
441 .next_blank_node_label()?
442 .map(|t| Some(Token::BlankNodeLabel(t)))),
443 Some('.') => Ok(Meta(Some(Token::Dot), self.pos.current())),
444 Some('^') => match self.next_char()? {
445 Some('^') => Ok(Meta(Some(Token::Carets), self.pos.current())),
446 unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
447 },
448 None => Ok(Meta(None, self.pos.end())),
449 unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
450 }
451 }
452
453 #[allow(clippy::type_complexity)]
454 pub fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Error<E>, Span>> {
455 if self.lookahead.is_none() {
456 if let Meta(Some(token), span) = self.consume()? {
457 self.lookahead = Some(Meta::new(token, span));
458 }
459 }
460
461 match &self.lookahead {
462 Some(Meta(token, span)) => Ok(Meta::new(Some(token), *span)),
463 None => Ok(Meta::new(None, self.pos.end())),
464 }
465 }
466
467 #[allow(clippy::type_complexity, clippy::should_implement_trait)]
468 pub fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
469 match self.lookahead.take() {
470 Some(Meta(token, span)) => Ok(Meta::new(Some(token), span)),
471 None => self.consume(),
472 }
473 }
474}
475
476impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Tokens for Lexer<C, E> {
477 type Error = Error<E>;
478
479 fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Error<E>, Span>> {
480 self.peek()
481 }
482
483 fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
484 self.next()
485 }
486
487 fn begin(&mut self) -> Result<Span, Meta<Error<E>, Span>> {
488 self.skip_whitespaces()?;
489 Ok(self.pos.current())
490 }
491
492 fn last(&self) -> Span {
493 self.pos.last_span
494 }
495}
496
497impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Iterator for Lexer<C, E> {
498 type Item = Result<Meta<Token, Span>, Meta<Error<E>, Span>>;
499
500 fn next(&mut self) -> Option<Self::Item> {
501 match self.next() {
502 Ok(Meta(Some(token), loc)) => Some(Ok(Meta::new(token, loc))),
503 Ok(Meta(None, _)) => None,
504 Err(e) => Some(Err(e)),
505 }
506 }
507}
508
/// Checks if `c` belongs to the base set of name characters (letters and the
/// listed non-ASCII ranges).
fn is_pn_chars_base(c: char) -> bool {
    matches!(
        c,
        'A'..='Z'
            | 'a'..='z'
            | '\u{00c0}'..='\u{00d6}'
            | '\u{00d8}'..='\u{00f6}'
            | '\u{00f8}'..='\u{02ff}'
            | '\u{0370}'..='\u{037d}'
            | '\u{037f}'..='\u{1fff}'
            | '\u{200c}'..='\u{200d}'
            | '\u{2070}'..='\u{218f}'
            | '\u{2c00}'..='\u{2fef}'
            | '\u{3001}'..='\u{d7ff}'
            | '\u{f900}'..='\u{fdcf}'
            | '\u{fdf0}'..='\u{fffd}'
            | '\u{10000}'..='\u{effff}'
    )
}

/// Checks if `c` is a base name character, `_` or `:`.
fn is_pn_chars_u(c: char) -> bool {
    match c {
        '_' | ':' => true,
        other => is_pn_chars_base(other),
    }
}

/// Checks if `c` may appear inside a name: everything accepted by
/// [`is_pn_chars_u`] plus `-`, ASCII digits and a few combining/joining
/// characters.
fn is_pn_chars(c: char) -> bool {
    match c {
        '-' | '0'..='9' | '\u{00b7}' | '\u{0300}'..='\u{036f}' | '\u{203f}'..='\u{2040}' => true,
        other => is_pn_chars_u(other),
    }
}
520}