use super::{BlankIdBuf, StringLiteral};
use decoded_char::DecodedChar;
use iref::IriBuf;
use langtag::LanguageTagBuf;
use locspan::{ErrAt, Meta, Span};
use std::{fmt, iter::Peekable};
/// A source of [`Token`]s that supports looking at the upcoming token before
/// taking it.
///
/// Every result is wrapped in [`Meta`] so that it carries a [`Span`]; errors
/// are reported the same way. The behaviour described on each method is the
/// one of the [`Lexer`] implementation found in this file.
pub trait Tokens {
/// Error type produced while reading tokens.
type Error;
/// Returns a reference to the upcoming token without taking it out of the
/// stream, or `None` when no token is left.
#[allow(clippy::type_complexity)]
fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Self::Error, Span>>;
/// Takes the upcoming token out of the stream, or returns `None` when no
/// token is left.
#[allow(clippy::type_complexity)]
fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Self::Error, Span>>;
/// Skips whatever may precede the next token and returns the span at that
/// point (`Lexer` skips whitespace and `#` comments here).
fn begin(&mut self) -> Result<Span, Meta<Self::Error, Span>>;
/// Returns the span of the most recently read character (as tracked by
/// `Lexer`).
fn last(&self) -> Span;
}
/// Lexing error.
///
/// `E` is the error type of the underlying character stream; it defaults to
/// [`std::convert::Infallible`].
#[derive(Debug)]
pub enum Error<E = std::convert::Infallible> {
/// A language tag was empty, had an empty subtag, or was rejected by
/// `LanguageTagBuf::new`.
InvalidLangTag,
/// A `\u`/`\U` escape produced a value that is not a valid `char`.
InvalidCodepoint(u32),
/// The text between `<` and `>` was rejected by `IriBuf::from_string`;
/// carries the underlying error and the offending text.
InvalidIriRef(iref::Error, String),
/// An unexpected character was read; `None` stands for the end of input.
Unexpected(Option<char>),
/// The underlying character stream failed.
Stream(E),
}
/// Human-readable rendering of a lexing error.
///
/// Stream errors are delegated to the inner error's own `Display`
/// implementation (with the caller's formatter, flags included).
impl<E: fmt::Display> fmt::Display for Error<E> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::Stream(e) => e.fmt(f),
            Self::InvalidLangTag => f.write_str("invalid language tag"),
            Self::Unexpected(None) => f.write_str("unexpected end of file"),
            Self::Unexpected(Some(c)) => write!(f, "unexpected character `{}`", c),
            Self::InvalidCodepoint(c) => {
                write!(f, "invalid character code point {:x}", c)
            }
            Self::InvalidIriRef(e, iri_ref) => {
                write!(f, "invalid IRI reference <{}>: {}", iri_ref, e)
            }
        }
    }
}
/// Exposes the wrapped IRI or stream error as the source of a lexing error.
impl<E: 'static + std::error::Error> std::error::Error for Error<E> {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        // Only the variants that wrap another error have a source.
        let cause: &(dyn std::error::Error + 'static) = match self {
            Self::InvalidIriRef(e, _) => e,
            Self::Stream(e) => e,
            Self::InvalidLangTag | Self::InvalidCodepoint(_) | Self::Unexpected(_) => {
                return None
            }
        };
        Some(cause)
    }
}
/// Token produced by the lexer.
#[derive(Debug)]
pub enum Token {
/// Language tag, introduced by `@` in the input.
LangTag(LanguageTagBuf),
/// IRI, written between `<` and `>` in the input.
Iri(IriBuf),
/// String literal, written between double quotes in the input; escape
/// sequences are already decoded.
StringLiteral(StringLiteral),
/// Blank node label, including its `_:` prefix.
BlankNodeLabel(BlankIdBuf),
/// A single `.`.
Dot,
/// The `^^` marker.
Carets,
}
/// Describes a token for diagnostics: the kind of token followed by its
/// textual form.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            // Punctuation has a fixed description.
            Self::Dot => f.write_str("dot `.`"),
            Self::Carets => f.write_str("carets `^^`"),
            Self::Iri(iri) => write!(f, "IRI <{}>", iri),
            Self::LangTag(tag) => write!(f, "language tag `{}`", tag),
            Self::BlankNodeLabel(label) => write!(f, "blank node label `{}`", label),
            Self::StringLiteral(string) => {
                // The content is re-escaped so that it stays on one line.
                let escaped = DisplayStringLiteral(string);
                write!(f, "string literal \"{}\"", escaped)
            }
        }
    }
}
/// Wrapper that displays a string with the characters that cannot appear
/// verbatim inside a quoted literal replaced by escape sequences.
///
/// `"` and `\` are written as `\u0022` and `\u005c`; line feed, carriage
/// return, tab, backspace and form feed are written as `\n`, `\r`, `\t`, `\b`
/// and `\f`. Every other character is written through its own `Display`
/// implementation.
pub struct DisplayStringLiteral<'a>(pub &'a str);

impl<'a> fmt::Display for DisplayStringLiteral<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.0.chars().try_for_each(|c| {
            let escape = match c {
                '"' => "\\u0022",
                '\\' => "\\u005c",
                '\n' => "\\n",
                '\r' => "\\r",
                '\t' => "\\t",
                '\u{08}' => "\\b",
                '\u{0c}' => "\\f",
                // Not an escaped character: defer to `char`'s own formatting.
                plain => return fmt::Display::fmt(&plain, f),
            };
            f.write_str(escape)
        })
    }
}
struct Chars<C: Iterator>(Peekable<C>);
impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Chars<C> {
fn peek(&mut self) -> Result<Option<DecodedChar>, Error<E>> {
match self.0.peek() {
None => Ok(None),
Some(Ok(c)) => Ok(Some(*c)),
Some(Err(_)) => self.next(),
}
}
fn next(&mut self) -> Result<Option<DecodedChar>, Error<E>> {
self.0.next().transpose().map_err(Error::Stream)
}
}
/// Span bookkeeping of the lexer.
#[derive(Default)]
struct Position {
    /// Span of the token being read.
    span: Span,

    /// Span of the most recently read character.
    last_span: Span,
}

impl Position {
    /// Span of the token being read.
    fn current(&self) -> Span {
        self.span
    }

    /// Span built from the end offset of the current span.
    fn end(&self) -> Span {
        let offset = self.span.end();
        offset.into()
    }

    /// Span of the most recently read character.
    fn last(&self) -> Span {
        self.last_span
    }
}
/// Lexer turning a fallible stream of decoded characters into [`Token`]s.
///
/// `C` is the character iterator and `E` the error type it may yield.
pub struct Lexer<C: Iterator<Item = Result<DecodedChar, E>>, E> {
/// Input characters, with one character of lookahead.
chars: Chars<C>,
/// Spans of the token being read and of the last read character.
pos: Position,
/// Token already lexed by `peek` and not yet returned by `next`.
lookahead: Option<Meta<Token, Span>>,
}
impl<C: Iterator<Item = Result<DecodedChar, E>>, E> Lexer<C, E> {
    /// Creates a lexer reading from `chars`, with default (empty) spans and
    /// no token looked ahead.
    pub fn new(chars: C) -> Self {
        let chars = Chars(chars.peekable());
        Self {
            chars,
            pos: Default::default(),
            lookahead: None,
        }
    }
}
impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Lexer<C, E> {
/// Peeks the upcoming decoded character; a stream error is located at the
/// end of the current span.
fn peek_decoded_char(&mut self) -> Result<Option<DecodedChar>, Meta<Error<E>, Span>> {
    let peeked = self.chars.peek();
    peeked.err_at(|| self.pos.end())
}
/// Peeks the upcoming character as a plain `char`, without consuming it.
fn peek_char(&mut self) -> Result<Option<char>, Meta<Error<E>, Span>> {
    let decoded = self.peek_decoded_char()?;
    Ok(decoded.map(DecodedChar::into_char))
}
fn next_char(&mut self) -> Result<Option<char>, Meta<Error<E>, Span>> {
match self.chars.next().err_at(|| self.pos.end())? {
Some(c) => {
self.pos.span.push(c.len());
self.pos.last_span.clear();
self.pos.last_span.push(c.len());
Ok(Some(*c))
}
None => Ok(None),
}
}
fn expect_char(&mut self) -> Result<char, Meta<Error<E>, Span>> {
self.next_char()?
.ok_or_else(|| Meta(Error::Unexpected(None), self.pos.end()))
}
/// Skips whitespace characters and `#` comments, then clears the current
/// span so that the skipped text is not part of the next token's span.
fn skip_whitespaces(&mut self) -> Result<(), Meta<Error<E>, Span>> {
    loop {
        match self.peek_char()? {
            Some('#') => self.next_comment()?,
            Some(c) if c.is_whitespace() => {
                self.next_char()?;
            }
            // End of input or the first character of a token.
            _ => break,
        }
    }

    self.pos.span.clear();
    Ok(())
}
/// Consumes characters up to and including the next line feed, or up to the
/// end of input.
fn next_comment(&mut self) -> Result<(), Meta<Error<E>, Span>> {
    while let Some(c) = self.next_char()? {
        if c == '\n' {
            break;
        }
    }

    Ok(())
}
fn next_langtag(&mut self) -> Result<Meta<LanguageTagBuf, Span>, Meta<Error<E>, Span>> {
let mut tag = String::new();
loop {
match self.peek_char()? {
None => {
if tag.is_empty() {
return Err(Meta(Error::InvalidLangTag, self.pos.current()));
} else {
break;
}
}
Some(c) => {
if c.is_ascii_alphabetic() {
tag.push(self.expect_char()?);
} else if c.is_whitespace() || c == '-' {
if tag.is_empty() {
return Err(Meta(Error::InvalidLangTag, self.pos.current()));
} else {
break;
}
} else {
self.next_char()?;
return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
}
}
}
}
let mut empty_subtag = true;
if let Some('-') = self.peek_char()? {
tag.push(self.expect_char()?);
loop {
match self.peek_char()? {
Some('-') if !empty_subtag => tag.push(self.expect_char()?),
Some(c) if c.is_ascii_alphanumeric() => {
empty_subtag = false;
tag.push(self.expect_char()?)
}
Some(c) => {
if c.is_whitespace() {
if empty_subtag {
return Err(Meta(Error::InvalidLangTag, self.pos.current()));
} else {
break;
}
} else {
self.next_char()?;
return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
}
}
None => {
if empty_subtag {
return Err(Meta(Error::InvalidLangTag, self.pos.current()));
} else {
break;
}
}
}
}
}
match LanguageTagBuf::new(tag.into_bytes()) {
Ok(tag) => Ok(Meta(tag, self.pos.current())),
Err(_) => Err(Meta(Error::InvalidLangTag, self.pos.current())),
}
}
fn next_iri(&mut self) -> Result<Meta<IriBuf, Span>, Meta<Error<E>, Span>> {
let mut iri = String::new();
loop {
match self.next_char()? {
Some('>') => break,
Some('\\') => {
let span = self.pos.last();
let c = match self.next_char()? {
Some('u') => self.next_uchar(span, 4)?,
Some('U') => self.next_uchar(span, 8)?,
unexpected => {
return Err(Meta(Error::Unexpected(unexpected), self.pos.last()))
}
};
iri.push(c)
}
Some(c) => {
if matches!(
c,
'\u{00}'..='\u{20}' | '<' | '>' | '"' | '{' | '}' | '|' | '^' | '`' | '\\'
) {
return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
}
iri.push(c)
}
None => return Err(Meta(Error::Unexpected(None), self.pos.end())),
}
}
match IriBuf::from_string(iri) {
Ok(iri) => Ok(Meta(iri, self.pos.current())),
Err((e, string)) => Err(Meta(Error::InvalidIriRef(e, string), self.pos.current())),
}
}
fn next_uchar(&mut self, mut span: Span, len: u8) -> Result<char, Meta<Error<E>, Span>> {
let mut codepoint = 0;
for _ in 0..len {
let c = self.expect_char()?;
match c.to_digit(16) {
Some(d) => codepoint = codepoint << 4 | d,
None => return Err(Meta(Error::Unexpected(Some(c)), self.pos.last())),
}
}
span.set_end(self.pos.current().end());
match char::try_from(codepoint) {
Ok(c) => Ok(c),
Err(_) => Err(Meta(Error::InvalidCodepoint(codepoint), span)),
}
}
fn next_string_literal(&mut self) -> Result<Meta<StringLiteral, Span>, Meta<Error<E>, Span>> {
let mut string = String::new();
loop {
match self.next_char()? {
Some('"') => break,
Some('\\') => {
let span = self.pos.last();
let c = match self.next_char()? {
Some('u') => self.next_uchar(span, 4)?,
Some('U') => self.next_uchar(span, 8)?,
Some('t') => '\t',
Some('b') => '\u{08}',
Some('n') => '\n',
Some('r') => '\r',
Some('f') => '\u{0c}',
Some('\'') => '\'',
Some('"') => '"',
Some('\\') => '\\',
unexpected => {
return Err(Meta(Error::Unexpected(unexpected), self.pos.last()))
}
};
string.push(c)
}
Some(c) => {
if matches!(c, '\n' | '\r') {
return Err(Meta(Error::Unexpected(Some(c)), self.pos.last()));
}
string.push(c)
}
None => return Err(Meta(Error::Unexpected(None), self.pos.end())),
}
}
Ok(Meta(string.into(), self.pos.current()))
}
fn next_blank_node_label(&mut self) -> Result<Meta<BlankIdBuf, Span>, Meta<Error<E>, Span>> {
match self.next_char()? {
Some(':') => {
let mut label = String::new();
label.push('_');
label.push(':');
match self.next_char()? {
Some(c) if c.is_ascii_digit() || is_pn_chars_u(c) => {
label.push(c);
let mut last_is_pn_chars = true;
loop {
match self.peek_char()? {
Some(c) if is_pn_chars(c) => {
label.push(self.expect_char()?);
last_is_pn_chars = true
}
Some('.') => {
label.push(self.expect_char()?);
last_is_pn_chars = false;
}
_ if last_is_pn_chars => break,
unexpected => {
return Err(Meta(
Error::Unexpected(unexpected),
self.pos.last(),
))
}
}
}
Ok(Meta(
unsafe { BlankIdBuf::new_unchecked(label) },
self.pos.current(),
))
}
unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
}
}
unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
}
}
pub fn consume(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
self.skip_whitespaces()?;
match self.next_char()? {
Some('@') => Ok(self.next_langtag()?.map(|t| Some(Token::LangTag(t)))),
Some('<') => Ok(self.next_iri()?.map(|t| Some(Token::Iri(t)))),
Some('"') => Ok(self
.next_string_literal()?
.map(|t| Some(Token::StringLiteral(t)))),
Some('_') => Ok(self
.next_blank_node_label()?
.map(|t| Some(Token::BlankNodeLabel(t)))),
Some('.') => Ok(Meta(Some(Token::Dot), self.pos.current())),
Some('^') => match self.next_char()? {
Some('^') => Ok(Meta(Some(Token::Carets), self.pos.current())),
unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
},
None => Ok(Meta(None, self.pos.end())),
unexpected => Err(Meta(Error::Unexpected(unexpected), self.pos.last())),
}
}
/// Returns a reference to the next token without consuming it.
///
/// The token is lexed on the first call and stored until `next` takes it.
/// At the end of input nothing is stored and `None` is returned with the
/// span at the end of the current position.
#[allow(clippy::type_complexity)]
pub fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Error<E>, Span>> {
    if self.lookahead.is_none() {
        let Meta(token, span) = self.consume()?;
        self.lookahead = token.map(|token| Meta::new(token, span));
    }

    let peeked = match self.lookahead.as_ref() {
        Some(Meta(token, span)) => Meta::new(Some(token), *span),
        None => Meta::new(None, self.pos.end()),
    };
    Ok(peeked)
}
/// Returns the next token: the one stored by a previous `peek` if there is
/// one, a freshly lexed token otherwise. `None` marks the end of input.
#[allow(clippy::type_complexity, clippy::should_implement_trait)]
pub fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
    if let Some(Meta(token, span)) = self.lookahead.take() {
        return Ok(Meta::new(Some(token), span));
    }

    self.consume()
}
}
/// [`Tokens`] implementation backed by the lexer's inherent methods.
impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Tokens for Lexer<C, E> {
    type Error = Error<E>;

    /// Forwards to the inherent `Lexer::peek`.
    fn peek(&mut self) -> Result<Meta<Option<&Token>, Span>, Meta<Error<E>, Span>> {
        self.peek()
    }

    /// Forwards to the inherent `Lexer::next`.
    fn next(&mut self) -> Result<Meta<Option<Token>, Span>, Meta<Error<E>, Span>> {
        self.next()
    }

    /// Skips whitespace and comments, then returns the current span.
    fn begin(&mut self) -> Result<Span, Meta<Error<E>, Span>> {
        self.skip_whitespaces().map(|()| self.pos.current())
    }

    /// Span of the most recently read character.
    fn last(&self) -> Span {
        self.pos.last()
    }
}
/// Iterates over the tokens; the iteration stops at the end of input and
/// yields errors as items.
impl<E, C: Iterator<Item = Result<DecodedChar, E>>> Iterator for Lexer<C, E> {
    type Item = Result<Meta<Token, Span>, Meta<Error<E>, Span>>;

    fn next(&mut self) -> Option<Self::Item> {
        // `self.next()` is the inherent method, which reports the end of
        // input as `Ok(None)`; `transpose` turns that into the iterator's
        // `None`.
        self.next()
            .map(|Meta(token, span)| token.map(|token| Meta::new(token, span)))
            .transpose()
    }
}
/// Checks whether `c` belongs to the base set of name characters: ASCII
/// letters and the listed ranges of non-ASCII characters.
fn is_pn_chars_base(c: char) -> bool {
    // Inclusive bounds of the accepted ranges.
    const RANGES: [(char, char); 14] = [
        ('A', 'Z'),
        ('a', 'z'),
        ('\u{00c0}', '\u{00d6}'),
        ('\u{00d8}', '\u{00f6}'),
        ('\u{00f8}', '\u{02ff}'),
        ('\u{0370}', '\u{037d}'),
        ('\u{037f}', '\u{1fff}'),
        ('\u{200c}', '\u{200d}'),
        ('\u{2070}', '\u{218f}'),
        ('\u{2c00}', '\u{2fef}'),
        ('\u{3001}', '\u{d7ff}'),
        ('\u{f900}', '\u{fdcf}'),
        ('\u{fdf0}', '\u{fffd}'),
        ('\u{10000}', '\u{effff}'),
    ];

    RANGES.iter().any(|&(low, high)| low <= c && c <= high)
}
/// Checks whether `c` is `_`, `:` or a base name character.
fn is_pn_chars_u(c: char) -> bool {
    match c {
        '_' | ':' => true,
        other => is_pn_chars_base(other),
    }
}
/// Checks whether `c` may appear inside a name: `-`, an ASCII digit, the
/// middle dot, a character from the two listed ranges, or any character
/// accepted by `is_pn_chars_u`.
fn is_pn_chars(c: char) -> bool {
    let is_extra = matches!(
        c,
        '-' | '0'..='9' | '\u{00b7}' | '\u{0300}'..='\u{036f}' | '\u{203f}'..='\u{2040}'
    );

    is_extra || is_pn_chars_u(c)
}