asciidork-parser 0.37.0

Asciidork parser
Documentation
use std::borrow::Cow;
use std::cmp::Ordering;

use crate::internal::*;

/// A single lexed token: its kind, its source location, and its text.
#[derive(Clone, PartialEq, Eq)]
pub struct Token<'arena> {
  /// Category of this token.
  pub kind: TokenKind,
  /// Location of the token in the source; `loc.start` is advanced by
  /// `Token::drop_leading_bytes` when leading bytes are removed.
  pub loc: SourceLocation,
  /// The token's text, stored in a string tied to the `'arena` lifetime.
  pub lexeme: BumpString<'arena>,
  /// Flag exposed through `TokenIs::is_attr_replacement`; `Token::new` sets it to `false`.
  /// NOTE(review): presumably marks tokens produced by attribute-reference
  /// substitution -- confirm against the code that sets it.
  pub attr_replacement: bool,
}

/// The category assigned to a [`Token`].
///
/// `Eof` is the `Default` value.
#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
pub enum TokenKind {
  Ampersand,
  AttrDef,
  AttrPassDbl,
  /// `Token::attr_name` strips the first and last byte of the lexeme of these tokens.
  AttrRef,
  Backtick,
  Backslash,
  Bang,
  /// `Token::parse_callout_num` reads the ascii digits out of the lexeme.
  CalloutNumber,
  Caret,
  CloseBrace,
  CloseBracket,
  CloseParens,
  Colon,
  Comma,
  Dashes,
  DelimiterLine,
  Digits,
  Directive,
  /// Assigned by `Token::drop_leading_bytes` when a token is left with an empty lexeme.
  Discard,
  DoubleQuote,
  Dots,
  Email,
  Entity,
  EqualSigns,
  /// Also the kind produced by `Token::default_in`.
  #[default]
  Eof,
  ForwardSlashes,
  GreaterThan,
  Hash,
  LessThan,
  /// `TokenIs::can_start_block_macro` treats the lexeme minus its last byte as the macro name.
  MacroName,
  MaybeEmail,
  Newline,
  NoBreakSpace,
  OpenBrace,
  OpenBracket,
  OpenParens,
  Percent,
  Pipe,
  Plus,
  PreprocPassthru,
  Punctuation,
  QuestionMark,
  SemiColon,
  SingleQuote,
  Star,
  TermDelimiter,
  Tilde,
  Underscore,
  /// `Token::to_url_scheme` maps the lexeme of these tokens to a `UrlScheme`.
  UriScheme,
  Whitespace,
  /// Also assigned by `Token::drop_leading_bytes` to a token that still has text left.
  Word,
}

/// A predicate over tokens, evaluated by `TokenIs::satisfies`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum TokenSpec {
  /// Token has exactly this kind.
  Kind(TokenKind),
  /// Token has this kind and its lexeme is this many bytes long.
  Len(u8, TokenKind),
  /// Token has any kind other than this one.
  Not(TokenKind),
  /// Token's kind is one of the listed kinds.
  OneOf(&'static [TokenKind]),
  /// Token's kind is none of the listed kinds.
  NotOneOf(&'static [TokenKind]),
}

impl TokenSpec {
  pub const fn token_kind(&self) -> Option<TokenKind> {
    match self {
      TokenSpec::Kind(kind) => Some(*kind),
      TokenSpec::Not(_) => None,
      TokenSpec::Len(_, kind) => Some(*kind),
      TokenSpec::OneOf(_) => None,
      TokenSpec::NotOneOf(_) => None,
    }
  }
}

impl From<TokenKind> for TokenSpec {
  fn from(kind: TokenKind) -> Self {
    TokenSpec::Kind(kind)
  }
}

impl<'arena> Token<'arena> {
  /// Creates a token of `kind` at `loc` whose text is `lexeme`.
  ///
  /// The `attr_replacement` flag starts out `false`.
  pub fn new(kind: TokenKind, loc: impl Into<SourceLocation>, lexeme: BumpString<'arena>) -> Self {
    Self {
      kind,
      loc: loc.into(),
      lexeme,
      attr_replacement: false,
    }
  }

  /// Consumes the token, pairing its lexeme with its location.
  pub fn into_source_string(self) -> SourceString<'arena> {
    SourceString::new(self.lexeme, self.loc)
  }

  /// Maps a `UriScheme` token to the matching [`UrlScheme`].
  ///
  /// Returns `None` for every other token kind, and for a `UriScheme`
  /// token whose lexeme is not one of the schemes listed below.
  pub fn to_url_scheme(&self) -> Option<UrlScheme> {
    if self.kind != TokenKind::UriScheme {
      return None;
    }
    match self.lexeme.as_str() {
      "https://" => Some(UrlScheme::Https),
      "http://" => Some(UrlScheme::Http),
      "ftp://" => Some(UrlScheme::Ftp),
      "irc://" => Some(UrlScheme::Irc),
      "mailto:" => Some(UrlScheme::Mailto),
      "file:///" => Some(UrlScheme::File),
      _ => None,
    }
  }

  /// Length of the lexeme in bytes.
  pub fn len(&self) -> usize {
    self.lexeme.len()
  }

  /// Reads the decimal number formed by the ascii digits in the lexeme.
  ///
  /// Non-digit bytes are skipped. Returns `None` when the lexeme holds no
  /// digits at all, or when the number does not fit in a `u8`.
  pub fn parse_callout_num(&self) -> Option<u8> {
    let mut digits = self.lexeme.bytes().filter(u8::is_ascii_digit).peekable();
    // no digits at all -- autogenerated: <.>
    digits.peek()?;
    // TODO: would be better to emit an error on >255, but would require
    // reworking how Line relies on this method and can't emit
    //
    // checked arithmetic yields `None` on overflow, exactly as parsing
    // the digit string into a `u8` would
    digits.try_fold(0u8, |acc, digit| acc.checked_mul(10)?.checked_add(digit - b'0'))
  }

  /// Removes the first `n` bytes of the lexeme and advances `loc.start` by `n`.
  ///
  /// Does nothing when `n` is zero. Otherwise the token becomes a `Word`,
  /// or a `Discard` if no text is left.
  ///
  /// # Panics
  ///
  /// Panics if `n` exceeds the lexeme length, or if byte `n` is not on a
  /// char boundary.
  pub fn drop_leading_bytes(&mut self, n: u32) {
    if n == 0 {
      return;
    }
    let count = n as usize;
    match count.cmp(&self.lexeme.len()) {
      Ordering::Greater => panic!("Token::drop_leading_bytes() past end of lexeme"),
      Ordering::Equal => {
        self.lexeme.clear();
        self.kind = TokenKind::Discard;
      }
      Ordering::Less => {
        assert!(
          self.lexeme.is_char_boundary(count),
          "Token::drop_leading_bytes() mid-char boundary"
        );
        // one drain shifts the tail once, instead of once per removed char
        drop(self.lexeme.drain(..count));
        self.kind = TokenKind::Word;
      }
    }
    self.loc.start += n;
  }

  /// The attribute name of an `AttrRef` token: the lexeme minus its
  /// first and last byte.
  ///
  /// # Panics
  ///
  /// Panics if the token is not an `AttrRef`.
  pub fn attr_name(&self) -> &str {
    assert_eq!(self.kind, TokenKind::AttrRef);
    &self.lexeme[1..self.lexeme.len() - 1]
  }

  /// Like [`Token::attr_name`], but lowercased; only allocates when the
  /// name actually contains an uppercase char.
  pub fn lowercase_attr_name(&self) -> Cow<'_, str> {
    let attr_name = self.attr_name();
    if attr_name.chars().any(char::is_uppercase) {
      Cow::Owned(attr_name.to_lowercase())
    } else {
      Cow::Borrowed(attr_name)
    }
  }
}

/// Predicates shared by a token and an optional token reference.
pub trait TokenIs {
  /// Whether the lexeme is exactly `len` bytes long.
  fn is_len(&self, len: usize) -> bool;
  /// Whether the token is of `kind`.
  fn kind(&self, kind: TokenKind) -> bool;
  /// Whether the token is of `kind` and exactly `len` bytes long.
  fn is_kind_len(&self, kind: TokenKind, len: usize) -> bool;
  /// Whether the token is of `kind` and its lexeme equals `lexeme`.
  fn matches(&self, kind: TokenKind, lexeme: &'static str) -> bool;
  /// Whether the token fulfills `spec`.
  fn satisfies(&self, spec: TokenSpec) -> bool;
  /// Whether the token is a macro name that may begin a block macro.
  fn can_start_block_macro(&self) -> bool;
  /// Whether the token carries the attr-replacement flag.
  fn is_attr_replacement(&self) -> bool;

  /// Negation of [`TokenIs::kind`].
  fn not_kind(&self, kind: TokenKind) -> bool {
    !self.kind(kind)
  }

  /// Negation of [`TokenIs::is_kind_len`].
  fn is_not_kind_len(&self, kind: TokenKind, len: usize) -> bool {
    !self.is_kind_len(kind, len)
  }

  /// Whether the token is `Whitespace` or a `Newline`.
  fn is_whitespaceish(&self) -> bool {
    [TokenKind::Whitespace, TokenKind::Newline]
      .iter()
      .any(|&kind| self.kind(kind))
  }

  /// Whether the token fulfills at least one of `specs`.
  fn satisfies_any(&self, specs: &[TokenSpec]) -> bool {
    specs.iter().copied().any(|spec| self.satisfies(spec))
  }

  /// Whether the token is a `MacroName`.
  fn is_macro(&self) -> bool {
    self.kind(TokenKind::MacroName)
  }
}

impl<'arena> DefaultIn<'arena> for Token<'arena> {
  fn default_in(bump: &'arena Bump) -> Self {
    Self {
      kind: TokenKind::Eof,
      loc: SourceLocation::default(),
      lexeme: BumpString::from_str_in("", bump),
      attr_replacement: false,
    }
  }
}

impl TokenIs for Token<'_> {
  /// Whether the lexeme is exactly `len` bytes long.
  fn is_len(&self, len: usize) -> bool {
    len == self.len()
  }

  /// Whether this token is of `kind`.
  fn kind(&self, kind: TokenKind) -> bool {
    kind == self.kind
  }

  /// Whether this token is of `kind` and exactly `len` bytes long.
  fn is_kind_len(&self, kind: TokenKind, len: usize) -> bool {
    self.kind(kind) && self.is_len(len)
  }

  /// Checks this token's kind (and, for `Len`, its byte length) against `spec`.
  fn satisfies(&self, spec: TokenSpec) -> bool {
    let own_kind = self.kind;
    match spec {
      TokenSpec::Kind(wanted) => own_kind == wanted,
      TokenSpec::Len(len, wanted) => own_kind == wanted && self.is_len(usize::from(len)),
      TokenSpec::Not(unwanted) => own_kind != unwanted,
      TokenSpec::OneOf(allowed) => allowed.contains(&own_kind),
      TokenSpec::NotOneOf(rejected) => !rejected.contains(&own_kind),
    }
  }

  /// Whether this token is of `kind` and its text equals `lexeme`.
  fn matches(&self, kind: TokenKind, lexeme: &'static str) -> bool {
    self.kind(kind) && self.lexeme.as_str() == lexeme
  }

  /// Reads the `attr_replacement` field.
  fn is_attr_replacement(&self) -> bool {
    self.attr_replacement
  }

  /// Whether this is a macro-name token that may begin a block macro.
  ///
  /// The four names listed below always qualify; any other name qualifies
  /// only when the lexer does not report it as a builtin macro name.
  fn can_start_block_macro(&self) -> bool {
    if !self.is_macro() {
      return false;
    }
    match self.lexeme.as_str() {
      "image:" | "toc:" | "audio:" | "video:" => true,
      _ => {
        // the lookup is done on the lexeme without its final byte
        let lexeme_bytes = self.lexeme.as_bytes();
        let name = &lexeme_bytes[..lexeme_bytes.len() - 1];
        !Lexer::is_builtin_macro_name(name)
      }
    }
  }
}

/// Every predicate is `false` for `None`, and otherwise defers to the
/// wrapped token.
impl TokenIs for Option<&Token<'_>> {
  fn is_len(&self, len: usize) -> bool {
    matches!(self, Some(token) if token.is_len(len))
  }

  fn kind(&self, kind: TokenKind) -> bool {
    matches!(self, Some(token) if token.kind(kind))
  }

  fn is_kind_len(&self, kind: TokenKind, len: usize) -> bool {
    matches!(self, Some(token) if token.is_kind_len(kind, len))
  }

  fn matches(&self, kind: TokenKind, lexeme: &'static str) -> bool {
    matches!(self, Some(token) if token.matches(kind, lexeme))
  }

  fn satisfies(&self, spec: TokenSpec) -> bool {
    matches!(self, Some(token) if token.satisfies(spec))
  }

  fn can_start_block_macro(&self) -> bool {
    matches!(self, Some(token) if token.can_start_block_macro())
  }

  fn is_attr_replacement(&self) -> bool {
    matches!(self, Some(token) if token.is_attr_replacement())
  }
}

impl std::fmt::Debug for Token<'_> {
  /// Writes the token on one line: kind, quoted lexeme, location, and --
  /// only when the flag is set -- an `attr_repl=true` marker. A lexeme
  /// that is a lone newline is shown as the two characters `\n`.
  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    let shown_lexeme = iff!(&self.lexeme == "\n", "\\n", &self.lexeme);
    let attr_repl = iff!(self.attr_replacement, ", attr_repl=true", "");
    write!(
      f,
      "Token {{ {:?}, \"{}\", {:?}{} }}",
      self.kind, shown_lexeme, self.loc, attr_repl
    )
  }
}

/// Guards against accidental growth of `Token`.
#[test]
fn test_size_of_token() {
  // assert_eq! reports the actual size when the check fails
  assert_eq!(std::mem::size_of::<Token>(), 48);
}