use std::borrow::Cow;
use std::cmp::Ordering;
use crate::internal::*;
/// A single lexed token: its kind, where it sits in the source, and the text it covers.
///
/// `Debug` is implemented by hand further down in this file rather than derived.
#[derive(Clone, PartialEq, Eq)]
pub struct Token<'arena> {
/// Lexical category of the token.
pub kind: TokenKind,
/// Source location of the token; `drop_leading_bytes` advances `loc.start`.
pub loc: SourceLocation,
/// The text of the token. Its byte length is what `Token::len` reports.
pub lexeme: BumpString<'arena>,
/// Flag reported by `TokenIs::is_attr_replacement`; `Token::new` sets it to `false`.
pub attr_replacement: bool,
}
/// Lexical category assigned to a `Token`.
///
/// `Eof` is the `Default` variant. Only variants whose use is visible in this
/// file carry a note below.
#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
pub enum TokenKind {
Ampersand,
AttrDef,
AttrPassDbl,
/// `Token::attr_name` requires this kind and returns the lexeme minus its
/// first and last byte.
AttrRef,
Backtick,
Backslash,
Bang,
CalloutNumber,
Caret,
CloseBrace,
CloseBracket,
CloseParens,
Colon,
Comma,
Dashes,
DelimiterLine,
Digits,
Directive,
/// Assigned by `Token::drop_leading_bytes` when the lexeme ends up empty.
Discard,
DoubleQuote,
Dots,
Email,
Entity,
EqualSigns,
/// Default kind; also the kind produced by `Token::default_in`.
#[default]
Eof,
ForwardSlashes,
GreaterThan,
Hash,
LessThan,
/// Kind tested by `TokenIs::is_macro`; a precondition of `can_start_block_macro`.
MacroName,
MaybeEmail,
/// Counts as "whitespaceish" for `TokenIs::is_whitespaceish`.
Newline,
NoBreakSpace,
OpenBrace,
OpenBracket,
OpenParens,
Percent,
Pipe,
Plus,
PreprocPassthru,
Punctuation,
QuestionMark,
SemiColon,
SingleQuote,
Star,
TermDelimiter,
Tilde,
Underscore,
/// `Token::to_url_scheme` maps lexemes of this kind (e.g. `https://`) to a `UrlScheme`.
UriScheme,
/// Counts as "whitespaceish" for `TokenIs::is_whitespaceish`.
Whitespace,
/// Kind a token is re-tagged with by `Token::drop_leading_bytes`.
Word,
}
/// A predicate over a token, evaluated by `TokenIs::satisfies`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum TokenSpec {
/// Satisfied when the token has exactly this kind.
Kind(TokenKind),
/// Satisfied when the token has this kind and its lexeme is this many bytes long.
Len(u8, TokenKind),
/// Satisfied when the token's kind differs from this one.
Not(TokenKind),
/// Satisfied when the token's kind is contained in the slice.
OneOf(&'static [TokenKind]),
/// Satisfied when the token's kind is not contained in the slice.
NotOneOf(&'static [TokenKind]),
}
impl TokenSpec {
    /// Returns the one kind this spec positively names, if it names one.
    ///
    /// `Kind` and `Len` carry such a kind; `Not`, `OneOf` and `NotOneOf`
    /// do not, so they yield `None`.
    pub const fn token_kind(&self) -> Option<TokenKind> {
        match *self {
            TokenSpec::Kind(kind) | TokenSpec::Len(_, kind) => Some(kind),
            TokenSpec::Not(_) | TokenSpec::OneOf(_) | TokenSpec::NotOneOf(_) => None,
        }
    }
}
impl From<TokenKind> for TokenSpec {
fn from(kind: TokenKind) -> Self {
TokenSpec::Kind(kind)
}
}
impl<'arena> Token<'arena> {
    /// Creates a token of the given `kind` at `loc` covering `lexeme`.
    ///
    /// The `attr_replacement` flag starts out `false`.
    pub fn new(
        kind: TokenKind,
        loc: impl Into<SourceLocation>,
        lexeme: BumpString<'arena>,
    ) -> Self {
        Self {
            kind,
            loc: loc.into(),
            lexeme,
            attr_replacement: false,
        }
    }

    /// Consumes the token, pairing its lexeme with its location.
    pub fn into_source_string(self) -> SourceString<'arena> {
        SourceString::new(self.lexeme, self.loc)
    }

    /// Maps a `UriScheme` token to the matching `UrlScheme`.
    ///
    /// Returns `None` for any other kind, and for a lexeme that is not one
    /// of the schemes listed in the body.
    pub fn to_url_scheme(&self) -> Option<UrlScheme> {
        if self.kind != TokenKind::UriScheme {
            return None;
        }
        match self.lexeme.as_str() {
            "https://" => Some(UrlScheme::Https),
            "http://" => Some(UrlScheme::Http),
            "ftp://" => Some(UrlScheme::Ftp),
            "irc://" => Some(UrlScheme::Irc),
            "mailto:" => Some(UrlScheme::Mailto),
            "file:///" => Some(UrlScheme::File),
            _ => None,
        }
    }

    /// Length of the lexeme in bytes.
    pub fn len(&self) -> usize {
        self.lexeme.len()
    }

    /// Parses the ASCII digits contained in the lexeme as a `u8`, ignoring
    /// every other byte.
    ///
    /// Returns `None` when the lexeme holds no digit at all, or when the
    /// digits spell a number larger than `u8::MAX`.
    pub fn parse_callout_num(&self) -> Option<u8> {
        let mut digits = self.lexeme.bytes().filter(u8::is_ascii_digit);
        // No digit at all means there is no number to report.
        let first = digits.next()?;
        // Checked arithmetic yields `None` on overflow, exactly like
        // `str::parse::<u8>`, without a temporary buffer or `unsafe`.
        digits.try_fold(first - b'0', |acc, digit| {
            acc.checked_mul(10)
                .and_then(|tens| tens.checked_add(digit - b'0'))
        })
    }

    /// Removes the first `n` bytes of the lexeme and advances `loc.start` by `n`.
    ///
    /// A call with `n == 0` is a no-op. Otherwise the token is re-tagged as
    /// `TokenKind::Word`, or as `TokenKind::Discard` when nothing is left.
    ///
    /// # Panics
    ///
    /// Panics if `n` exceeds the lexeme length or does not fall on a char
    /// boundary. Both are checked before the token is modified.
    pub fn drop_leading_bytes(&mut self, n: u32) {
        if n == 0 {
            return;
        }
        let drop_len = n as usize;
        match drop_len.cmp(&self.lexeme.len()) {
            Ordering::Greater => panic!("Token::drop_leading_bytes() past end of lexeme"),
            Ordering::Equal => {
                self.lexeme.clear();
                self.kind = TokenKind::Discard;
            }
            Ordering::Less => {
                if !self.lexeme.is_char_boundary(drop_len) {
                    panic!("Token::drop_leading_bytes() mid-char boundary");
                }
                // One drain shifts the tail once, instead of once per removed char.
                drop(self.lexeme.drain(..drop_len));
                self.kind = TokenKind::Word;
            }
        }
        self.loc.start += n;
    }

    /// Returns the lexeme without its first and last byte.
    ///
    /// # Panics
    ///
    /// Panics if the token is not a `TokenKind::AttrRef`, or if the lexeme is
    /// too short (or not char-aligned) for the slice.
    pub fn attr_name(&self) -> &str {
        assert_eq!(self.kind, TokenKind::AttrRef);
        &self.lexeme[1..self.lexeme.len() - 1]
    }

    /// Returns `attr_name()` lowercased, allocating only when the name
    /// contains an uppercase character.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as `attr_name`.
    pub fn lowercase_attr_name(&self) -> Cow<'_, str> {
        let name = self.attr_name();
        if name.chars().any(char::is_uppercase) {
            Cow::Owned(name.to_lowercase())
        } else {
            Cow::Borrowed(name)
        }
    }
}
/// Boolean queries about a token.
///
/// Implementors supply the seven required predicates; the remaining methods
/// are derived from them.
pub trait TokenIs {
    /// Whether the lexeme is exactly `len` bytes long.
    fn is_len(&self, len: usize) -> bool;
    /// Whether the token has the given kind.
    fn kind(&self, kind: TokenKind) -> bool;
    /// Whether the token has the given kind and its lexeme is `len` bytes long.
    fn is_kind_len(&self, kind: TokenKind, len: usize) -> bool;
    /// Whether the token has the given kind and exactly this lexeme.
    fn matches(&self, kind: TokenKind, lexeme: &'static str) -> bool;
    /// Whether the token fulfils `spec`.
    fn satisfies(&self, spec: TokenSpec) -> bool;
    /// Whether the token could begin a block macro.
    fn can_start_block_macro(&self) -> bool;
    /// Whether the token is flagged as an attribute replacement.
    fn is_attr_replacement(&self) -> bool;

    /// Negation of [`TokenIs::kind`].
    fn not_kind(&self, kind: TokenKind) -> bool {
        let has_kind = self.kind(kind);
        !has_kind
    }

    /// Negation of [`TokenIs::is_kind_len`].
    fn is_not_kind_len(&self, kind: TokenKind, len: usize) -> bool {
        let has_kind_len = self.is_kind_len(kind, len);
        !has_kind_len
    }

    /// Whether the token is a `Whitespace` or a `Newline`.
    fn is_whitespaceish(&self) -> bool {
        [TokenKind::Whitespace, TokenKind::Newline]
            .iter()
            .any(|&blank| self.kind(blank))
    }

    /// Whether at least one of `specs` is satisfied; `false` for an empty slice.
    fn satisfies_any(&self, specs: &[TokenSpec]) -> bool {
        for spec in specs {
            if self.satisfies(*spec) {
                return true;
            }
        }
        false
    }

    /// Whether the token is a `MacroName`.
    fn is_macro(&self) -> bool {
        self.kind(TokenKind::MacroName)
    }
}
impl<'arena> DefaultIn<'arena> for Token<'arena> {
fn default_in(bump: &'arena Bump) -> Self {
Self {
kind: TokenKind::Eof,
loc: SourceLocation::default(),
lexeme: BumpString::from_str_in("", bump),
attr_replacement: false,
}
}
}
impl TokenIs for Token<'_> {
fn is_len(&self, len: usize) -> bool {
self.len() == len
}
fn kind(&self, kind: TokenKind) -> bool {
self.kind == kind
}
fn is_kind_len(&self, kind: TokenKind, len: usize) -> bool {
self.kind == kind && self.len() == len
}
fn satisfies(&self, spec: TokenSpec) -> bool {
match spec {
TokenSpec::Kind(kind) => self.kind == kind,
TokenSpec::Not(kind) => self.kind != kind,
TokenSpec::Len(len, kind) => self.kind == kind && self.len() == len as usize,
TokenSpec::OneOf(kinds) => kinds.contains(&self.kind),
TokenSpec::NotOneOf(kinds) => !kinds.contains(&self.kind),
}
}
fn matches(&self, kind: TokenKind, lexeme: &'static str) -> bool {
self.kind == kind && self.lexeme == lexeme
}
fn is_attr_replacement(&self) -> bool {
self.attr_replacement
}
fn can_start_block_macro(&self) -> bool {
if !self.is_macro() {
return false;
}
if matches!(self.lexeme.as_str(), "image:" | "toc:") {
return true;
}
let bytes = self.lexeme.as_bytes();
let bytes = &bytes[..bytes.len() - 1];
!Lexer::is_builtin_macro_name(bytes)
}
}
// Every predicate is `false` for `None`; for `Some(token)` it defers to the
// token's own answer.
impl TokenIs for Option<&Token<'_>> {
    fn is_len(&self, len: usize) -> bool {
        matches!(*self, Some(token) if token.is_len(len))
    }

    fn kind(&self, kind: TokenKind) -> bool {
        matches!(*self, Some(token) if token.kind(kind))
    }

    fn is_kind_len(&self, kind: TokenKind, len: usize) -> bool {
        matches!(*self, Some(token) if token.is_kind_len(kind, len))
    }

    fn matches(&self, kind: TokenKind, lexeme: &'static str) -> bool {
        matches!(*self, Some(token) if token.matches(kind, lexeme))
    }

    fn satisfies(&self, spec: TokenSpec) -> bool {
        matches!(*self, Some(token) if token.satisfies(spec))
    }

    fn can_start_block_macro(&self) -> bool {
        matches!(*self, Some(token) if token.can_start_block_macro())
    }

    fn is_attr_replacement(&self) -> bool {
        matches!(*self, Some(token) if token.is_attr_replacement())
    }
}
// Hand-written, single-line `Debug` rendering of a token: kind, quoted lexeme,
// location, and an `attr_repl=true` suffix.
impl std::fmt::Debug for Token<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"Token {{ {:?}, \"{}\", {:?}{} }}",
self.kind,
// A lexeme that is exactly one newline is printed as the two characters `\n`.
// NOTE(review): assumes `iff!(cond, a, b)` yields `a` when `cond` holds and
// `b` otherwise; the macro is defined outside this file, so confirm there.
iff!(&self.lexeme == "\n", "\\n", &self.lexeme),
self.loc,
// The suffix is presumably emitted only when the flag is set (same `iff!` assumption).
iff!(self.attr_replacement, ", attr_repl=true", "")
)
}
}
/// Pins the in-memory size of `Token` at 48 bytes so accidental growth is caught.
///
/// Uses `assert_eq!` so a failure reports the actual size rather than only
/// "assertion failed".
// NOTE(review): 48 presumably reflects a 64-bit target; confirm before running elsewhere.
#[test]
fn test_size_of_token() {
    assert_eq!(std::mem::size_of::<Token>(), 48);
}