use std::str::from_utf8;
use logos::Logos;
use crate::Span;
pub(crate) use tokens::Token;
pub(crate) use tokens::TokenId;
mod tokens;
#[cfg(test)]
mod tests;
/// A tokenizer that splits a byte slice into [`Token`]s, switching between
/// several lexing modes.
///
/// Lexing starts in normal mode; the caller (the parser) drives mode changes
/// through [`Tokenizer::enter_hex_pattern_mode`] and
/// [`Tokenizer::enter_hex_jump_mode`]. On a lexing error inside a hex mode,
/// the tokenizer falls back to the enclosing mode on its own (see
/// `next_token`).
pub struct Tokenizer<'src> {
    // The complete input being tokenized.
    source: &'src [u8],
    // The currently active mode, which owns the `logos` lexer for that mode.
    mode: Mode<'src>,
    // Absolute offset within `source` at which the current lexer was
    // started. Added to lexer-relative spans to produce absolute spans.
    lexer_starting_pos: usize,
}
impl<'src> Tokenizer<'src> {
    /// Creates a new tokenizer for the given source.
    ///
    /// # Panics
    ///
    /// Panics if the source is too large for spans to represent
    /// (`source.len() >= Span::MAX`).
    pub fn new(source: &'src [u8]) -> Self {
        assert!(source.len() < Span::MAX);
        Self {
            source,
            lexer_starting_pos: 0,
            mode: Mode::Normal(Logos::lexer(source)),
        }
    }

    /// Returns the source code being tokenized.
    #[inline]
    pub fn source(&self) -> &'src [u8] {
        self.source
    }

    /// Returns the next token, or `None` once the source is exhausted.
    ///
    /// In normal mode a lexing error is surfaced as an `UNKNOWN` or
    /// `INVALID_UTF8` token (see `unexpected_token`). In the hex modes a
    /// lexing error is not surfaced; instead the tokenizer falls back to the
    /// enclosing mode (`HexJump` -> `HexPattern`, `HexPattern` -> `Normal`)
    /// and resumes lexing at the position where the error occurred, looping
    /// until some mode produces a token.
    pub fn next_token(&mut self) -> Option<Token> {
        loop {
            match &mut self.mode {
                Mode::Normal(lexer) => match lexer.next()? {
                    Ok(token) => {
                        // Spans returned by the lexer are relative to the
                        // position where the lexer was created; shift them
                        // so they are absolute within `self.source`.
                        return Some(convert_normal_token(
                            token,
                            Span::from(lexer.span())
                                .offset(self.lexer_starting_pos as isize),
                        ));
                    }
                    Err(()) => return Some(self.unexpected_token()),
                },
                Mode::HexPattern(lexer) => match lexer.next()? {
                    Ok(token) => {
                        return Some(convert_hex_pattern_token(
                            token,
                            Span::from(lexer.span())
                                .offset(self.lexer_starting_pos as isize),
                        ))
                    }
                    Err(()) => {
                        // The hex pattern lexer can't make sense of the
                        // input; fall back to normal mode, restarting the
                        // lexer at the position of the failed token.
                        // Note: `lexer` (bound by the enclosing match arm)
                        // is used directly here; no need to re-match on
                        // `self.mode` to get at the span.
                        self.lexer_starting_pos += lexer.span().start;
                        self.mode = Mode::Normal(Logos::lexer(
                            &self.source[self.lexer_starting_pos..],
                        ));
                    }
                },
                Mode::HexJump(lexer) => match lexer.next()? {
                    Ok(token) => {
                        return Some(convert_hex_jump_token(
                            token,
                            Span::from(lexer.span())
                                .offset(self.lexer_starting_pos as isize),
                        ))
                    }
                    Err(()) => {
                        // Same fallback as above, but jumps occur inside hex
                        // patterns, so fall back to hex-pattern mode.
                        self.lexer_starting_pos += lexer.span().start;
                        self.mode = Mode::HexPattern(Logos::lexer(
                            &self.source[self.lexer_starting_pos..],
                        ));
                    }
                },
            }
        }
    }

    /// Switches the tokenizer into hex-pattern mode, starting a new lexer
    /// right after the last token produced in normal mode.
    ///
    /// # Panics
    ///
    /// Panics if the tokenizer is not currently in normal mode.
    pub fn enter_hex_pattern_mode(&mut self) {
        self.lexer_starting_pos += match &self.mode {
            Mode::Normal(lexer) => lexer.span().end,
            mode => {
                panic!(r"enter_hex_pattern_mode called from mode: {mode:?}")
            }
        };
        self.mode = Mode::HexPattern(Logos::lexer(
            &self.source[self.lexer_starting_pos..],
        ));
    }

    /// Switches the tokenizer into hex-jump mode, starting a new lexer right
    /// after the last token produced in hex-pattern mode.
    ///
    /// # Panics
    ///
    /// Panics if the tokenizer is not currently in hex-pattern mode.
    pub fn enter_hex_jump_mode(&mut self) {
        self.lexer_starting_pos += match &self.mode {
            Mode::HexPattern(lexer) => lexer.span().end,
            mode => {
                panic!(r"enter_hex_jump_mode called from mode: {mode:?}")
            }
        };
        self.mode = Mode::HexJump(Logos::lexer(
            &self.source[self.lexer_starting_pos..],
        ));
    }
}
impl Tokenizer<'_> {
    /// Builds the token returned when the normal-mode lexer hits input it
    /// cannot lex.
    ///
    /// Returns `Token::INVALID_UTF8` (with a 1-byte span) when the failing
    /// position does not start with valid UTF-8, and `Token::UNKNOWN`
    /// otherwise, growing the lexer's span so that it covers the whole
    /// unexpected word up to the next whitespace character.
    ///
    /// Must only be called while in [`Mode::Normal`]; any other mode hits
    /// the `unreachable!` below.
    fn unexpected_token(&mut self) -> Token {
        let lexer = match &mut self.mode {
            Mode::Normal(lexer) => lexer,
            _ => unreachable!(),
        };
        // Slice from the start of the failed token to the end of the input;
        // the first UTF-8 chunk of it tells us whether this is an encoding
        // problem or just an unrecognized token.
        let start = lexer.span().start;
        let end = lexer.source().len();
        let unexpected = lexer.source().get(start..end).unwrap();
        let chunk = unexpected.utf8_chunks().next().unwrap();
        if chunk.valid().is_empty() {
            // The very first byte is not valid UTF-8. Report a single-byte
            // span; `start` is lexer-relative, hence the offset.
            // NOTE(review): the lexer itself is not advanced here — the
            // next `next_token` call resumes from wherever logos left the
            // error span; presumably that skips the bad byte. Verify
            // against the tests for invalid-UTF-8 input.
            return Token::INVALID_UTF8(
                Span(start as u32..(start + 1) as u32)
                    .offset(self.lexer_starting_pos as isize),
            );
        }
        let unexpected = chunk.valid();
        // Cut the valid text at the first whitespace (the text cannot start
        // with whitespace, since whitespace lexes as its own token), then
        // extend the lexer's span past what it already consumed so the
        // UNKNOWN token covers the whole word. `saturating_sub` guards the
        // case where the lexer already consumed as much or more.
        let unexpected = unexpected.split(char::is_whitespace).next().unwrap();
        lexer.bump(unexpected.len().saturating_sub(lexer.span().len()));
        Token::UNKNOWN(
            Span::from(lexer.span()).offset(self.lexer_starting_pos as isize),
        )
    }
}
/// The tokenizer's current lexing mode. Each variant owns the `logos` lexer
/// for that mode.
#[derive(Debug)]
enum Mode<'src> {
    // Default mode, used for everything outside hex patterns.
    Normal(logos::Lexer<'src, NormalToken<'src>>),
    // Inside a hex pattern; entered via `Tokenizer::enter_hex_pattern_mode`.
    HexPattern(logos::Lexer<'src, HexPatternToken>),
    // Inside a jump within a hex pattern; entered via
    // `Tokenizer::enter_hex_jump_mode`.
    HexJump(logos::Lexer<'src, HexJumpToken<'src>>),
}
/// Raw tokens produced by the normal (top-level) lexing mode.
///
/// Variants that carry a `&[u8]` keep the matched slice so the caller can
/// inspect it; `convert_normal_token` uses it to double-check the slice is
/// valid UTF-8.
#[allow(clippy::upper_case_acronyms)]
#[derive(logos::Logos, Debug, PartialEq)]
#[logos(source = [u8])]
enum NormalToken<'src> {
    // Keywords.
    #[token("all")]
    All,
    #[token("and")]
    And,
    #[token("any")]
    Any,
    #[token("ascii")]
    Ascii,
    #[token("at")]
    At,
    #[token("base64")]
    Base64,
    #[token("base64wide")]
    Base64Wide,
    #[token("condition")]
    Condition,
    #[token("contains")]
    Contains,
    #[token("defined")]
    Defined,
    #[token("endswith")]
    EndsWith,
    #[token("entrypoint")]
    Entrypoint,
    #[token("false")]
    False,
    #[token("filesize")]
    Filesize,
    #[token("for")]
    For,
    #[token("fullword")]
    Fullword,
    #[token("global")]
    Global,
    #[token("icontains")]
    IContains,
    #[token("iendswith")]
    IEndsWith,
    #[token("iequals")]
    IEquals,
    #[token("import")]
    Import,
    #[token("in")]
    In,
    #[token("include")]
    Include,
    // NOTE(review): variant name is missing a "t" (should be
    // `IStartsWith`); renaming it requires updating `convert_normal_token`
    // in the same change.
    #[token("istartswith")]
    IStarsWith,
    #[token("matches")]
    Matches,
    #[token("meta")]
    Meta,
    #[token("nocase")]
    Nocase,
    #[token("none")]
    None,
    #[token("not")]
    Not,
    #[token("of")]
    Of,
    #[token("or")]
    Or,
    #[token("private")]
    Private,
    #[token("rule")]
    Rule,
    #[token("startswith")]
    StartsWith,
    #[token("strings")]
    Strings,
    #[token("them")]
    Them,
    #[token("true")]
    True,
    #[token("wide")]
    Wide,
    #[token("xor")]
    Xor,
    #[token("with")]
    With,
    // Operators and punctuation.
    #[token("<<")]
    Shl,
    #[token(">>")]
    Shr,
    #[token("==")]
    Eq,
    #[token("!=")]
    Ne,
    #[token("<=")]
    Le,
    #[token(">=")]
    Ge,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("&")]
    Ampersand,
    #[token("*")]
    Asterisk,
    #[token("\\")]
    Backslash,
    #[token(":")]
    Colon,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("=")]
    Equal,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("%")]
    Percent,
    #[token("|")]
    Pipe,
    #[token("^")]
    Caret,
    #[token("~")]
    Tilde,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    // Identifiers. The pattern-related ones differ only in their leading
    // sigil ($, #, @, !); the sigil alone (zero trailing characters) also
    // matches.
    /// Pattern identifier, e.g. `$a`.
    #[regex(
    r#"(?x) # allow comments in the regexp
    \$ # first character is $
    ([[:alpha:]]|\d|_)* # any number of letters, digits, or _
    "#,
    |token| token.slice())
    ]
    PatternIdent(&'src [u8]),
    /// Pattern match count, e.g. `#a`.
    #[regex(
    r#"(?x) # allow comments in the regexp
    \# # first character is #
    ([[:alpha:]]|\d|_)* # any number of letters, digits, or _
    "#,
    |token| token.slice())
    ]
    PatternCount(&'src [u8]),
    /// Pattern match offset, e.g. `@a`.
    #[regex(
    r#"(?x) # allow comments in the regexp
    @ # first character is @
    ([[:alpha:]]|\d|_)* # any number of letters, digits, or _
    "#,
    |token| token.slice())
    ]
    PatternOffset(&'src [u8]),
    /// Pattern match length, e.g. `!a`.
    #[regex(
    r#"(?x) # allow comments in the regexp
    ! # first character is !
    ([[:alpha:]]|\d|_)* # any number of letters, digits, or _
    "#,
    |token| token.slice())
    ]
    PatternLength(&'src [u8]),
    /// Plain identifier: letter or `_`, then letters, digits or `_`.
    #[regex(
    r#"(?x) # allow comments in the regexp
    ([[:alpha:]]|_) # first character is letter or _
    ([[:alpha:]]|\d|_)* # any number of letters, digits, or _
    "#,
    |token| token.slice())
    ]
    Ident(&'src [u8]),
    // Literals.
    #[regex(
    r#"(?x) # allow comments in the regexp
    ([0-9]+_*)+ # one or more digits or underscores
    \. # a dot
    ([0-9_]+_*)+ # one more digits or underscores
    "#,
    |token| token.slice())
    ]
    FloatLit(&'src [u8]),
    #[regex(
    r#"(?x)
    (
    0x([a-fA-F0-9]+_*)+ | # hexadecimal number
    0o([0-7]+_*)+ | # octal number
    ([0-9]+_*)+(KB|MB)? # decimal number followed by optional underscore and optional KB or MB
    )
    "#,
    |token| token.slice())
    ]
    IntegerLit(&'src [u8]),
    #[regex(
    r#"(?x) # allow comments in the regexp
    " # starts with double quotes
    ( # any number of
    \\. # escape sequence
    | # or ..
    [^"\n\\] # anything except quotes, newlines and backslashes
    )*
    " # ends with double quotes
    "#)
    ]
    StringLit,
    #[regex(
    r#"(?x) # allow comments in the regexp
    """ # starts with 3 double quotes
    ( # any number of
    \\. # escape sequence
    | # or ..
    [^"\\] # anything except quotes, newlines and backslashes
    )*
    """ # ends with 3 double quotes
    "#)
    ]
    MultiLineStringLit,
    #[regex(
    r#"(?x) # allow comments in the regexp
    / # starts with /
    (\\.|[^*/\\\n]) # followed by escape sequence or anything that is
    # not *, /, \, or newline. This prevents collision
    # with comments.
    ( # zero or more..
    \\. # escape sequence
    | # or ..
    [^\\/\n] # anything except \, / and newlines
    )*
    / # ends with /
    [[:alpha:]]{0,2} # up to 2 optional modifiers like "s" and "i"
    "#)
    ]
    Regexp,
    // Comments, whitespace and newlines. These are produced as tokens (not
    // skipped), so the caller decides what to do with trivia.
    #[regex(
    r#"(?x) # allow comments in the regexp
    /\* # starts with /*
    [^*]* # zero or more characters except *
    \*+ # one or more *
    ( # zero or more..
    [^/*] # anything except / and *
    [^*]* # zero or more characters except *
    \*+ # one or more *
    )*
    / # ends with /
    "#
    )]
    BlockComment,
    #[regex(r#"//[^\n]*"#)]
    Comment,
    // Space, tab, and an assortment of Unicode space characters.
    #[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
    Whitespace,
    #[token("\n")]
    LF,
    #[token("\r")]
    CR,
    #[token("\r\n")]
    CRLF,
}
/// Raw tokens produced while lexing the body of a hex pattern.
#[allow(clippy::upper_case_acronyms)]
#[derive(logos::Logos, Debug, PartialEq)]
#[logos(source = [u8])]
enum HexPatternToken {
    /// Two hex digits, each optionally replaced by the `?` wildcard, with
    /// an optional leading `~` (negation).
    #[regex("~?[?0-9a-fA-F]{2}")]
    Byte,
    #[token("|")]
    Pipe,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    // Same whitespace set as in normal mode.
    #[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
    Whitespace,
    #[token("\n")]
    LF,
    #[token("\r")]
    CR,
    #[token("\r\n")]
    CRLF,
    // Comments are allowed inside hex patterns too.
    #[regex(
    r#"(?x) # allow comments in the regexp
    /\* # starts with /*
    [^*]* # zero or more characters except *
    \*+ # one or more *
    ( # zero or more..
    [^/*] # anything except / and *
    [^*]* # zero or more characters except *
    \*+ # one or more *
    )*
    / # ends with /
    "#
    )]
    BlockComment,
    #[regex(r#"//[^\n]*"#)]
    Comment,
}
/// Raw tokens produced while lexing a jump inside a hex pattern
/// (presumably the `n-m` range between brackets — confirm against the
/// parser that drives `enter_hex_jump_mode`).
#[allow(clippy::upper_case_acronyms)]
#[derive(logos::Logos, Debug, PartialEq)]
#[logos(source = [u8])]
enum HexJumpToken<'src> {
    #[token("-")]
    Hyphen,
    // Unlike normal mode, integer literals here accept no underscores and
    // no KB/MB suffix.
    #[regex(
    r#"(?x)
    (
    0x[a-fA-F0-9]+ | # hexadecimal number
    0o[0-7]+ | # octal number
    [0-9]+ # decimal number
    )
    "#,
    |token| token.slice())
    ]
    IntegerLit(&'src [u8]),
    // Same whitespace set as in normal mode.
    #[regex("[ \t\u{a0}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{202f}\u{205f}]+")]
    Whitespace,
    #[token("\n")]
    LF,
    #[token("\r")]
    CR,
    #[token("\r\n")]
    CRLF,
}
fn convert_normal_token(token: NormalToken, span: Span) -> Token {
match token {
NormalToken::All => Token::ALL_KW(span),
NormalToken::And => Token::AND_KW(span),
NormalToken::Any => Token::ANY_KW(span),
NormalToken::Ascii => Token::ASCII_KW(span),
NormalToken::At => Token::AT_KW(span),
NormalToken::Base64 => Token::BASE64_KW(span),
NormalToken::Base64Wide => Token::BASE64WIDE_KW(span),
NormalToken::Condition => Token::CONDITION_KW(span),
NormalToken::Contains => Token::CONTAINS_KW(span),
NormalToken::Defined => Token::DEFINED_KW(span),
NormalToken::EndsWith => Token::ENDSWITH_KW(span),
NormalToken::Entrypoint => Token::ENTRYPOINT_KW(span),
NormalToken::False => Token::FALSE_KW(span),
NormalToken::Filesize => Token::FILESIZE_KW(span),
NormalToken::For => Token::FOR_KW(span),
NormalToken::Fullword => Token::FULLWORD_KW(span),
NormalToken::Global => Token::GLOBAL_KW(span),
NormalToken::IContains => Token::ICONTAINS_KW(span),
NormalToken::IEndsWith => Token::IENDSWITH_KW(span),
NormalToken::IEquals => Token::IEQUALS_KW(span),
NormalToken::Import => Token::IMPORT_KW(span),
NormalToken::In => Token::IN_KW(span),
NormalToken::Include => Token::INCLUDE_KW(span),
NormalToken::IStarsWith => Token::ISTARTSWITH_KW(span),
NormalToken::Matches => Token::MATCHES_KW(span),
NormalToken::Meta => Token::META_KW(span),
NormalToken::Nocase => Token::NOCASE_KW(span),
NormalToken::None => Token::NONE_KW(span),
NormalToken::Not => Token::NOT_KW(span),
NormalToken::Of => Token::OF_KW(span),
NormalToken::Or => Token::OR_KW(span),
NormalToken::Private => Token::PRIVATE_KW(span),
NormalToken::Rule => Token::RULE_KW(span),
NormalToken::StartsWith => Token::STARTSWITH_KW(span),
NormalToken::Strings => Token::STRINGS_KW(span),
NormalToken::Them => Token::THEM_KW(span),
NormalToken::True => Token::TRUE_KW(span),
NormalToken::Wide => Token::WIDE_KW(span),
NormalToken::Xor => Token::XOR_KW(span),
NormalToken::With => Token::WITH_KW(span),
NormalToken::Shl => Token::SHL(span),
NormalToken::Shr => Token::SHR(span),
NormalToken::Eq => Token::EQ(span),
NormalToken::Ne => Token::NE(span),
NormalToken::Lt => Token::LT(span),
NormalToken::Gt => Token::GT(span),
NormalToken::Le => Token::LE(span),
NormalToken::Ge => Token::GE(span),
NormalToken::Ampersand => Token::AMPERSAND(span),
NormalToken::Asterisk => Token::ASTERISK(span),
NormalToken::Backslash => Token::BACKSLASH(span),
NormalToken::Caret => Token::CARET(span),
NormalToken::Comma => Token::COMMA(span),
NormalToken::Colon => Token::COLON(span),
NormalToken::Dot => Token::DOT(span),
NormalToken::Equal => Token::EQUAL(span),
NormalToken::Minus => Token::HYPHEN(span),
NormalToken::Percent => Token::PERCENT(span),
NormalToken::Pipe => Token::PIPE(span),
NormalToken::Plus => Token::PLUS(span),
NormalToken::Tilde => Token::TILDE(span),
NormalToken::LBrace => Token::L_BRACE(span),
NormalToken::RBrace => Token::R_BRACE(span),
NormalToken::LParen => Token::L_PAREN(span),
NormalToken::RParen => Token::R_PAREN(span),
NormalToken::LBracket => Token::L_BRACKET(span),
NormalToken::RBracket => Token::R_BRACKET(span),
NormalToken::StringLit | NormalToken::MultiLineStringLit => {
Token::STRING_LIT(span)
}
NormalToken::Regexp => Token::REGEXP(span),
NormalToken::BlockComment | NormalToken::Comment => {
Token::COMMENT(span)
}
NormalToken::Whitespace => Token::WHITESPACE(span),
NormalToken::LF | NormalToken::CR | NormalToken::CRLF => {
Token::NEWLINE(span)
}
NormalToken::Ident(ident) => match from_utf8(ident) {
Ok(_) => Token::IDENT(span),
Err(_) => unreachable!(),
},
NormalToken::PatternIdent(ident) => match from_utf8(ident) {
Ok(_) => Token::PATTERN_IDENT(span),
Err(_) => unreachable!(),
},
NormalToken::PatternCount(ident) => match from_utf8(ident) {
Ok(_) => Token::PATTERN_COUNT(span),
Err(_) => unreachable!(),
},
NormalToken::PatternOffset(ident) => match from_utf8(ident) {
Ok(_) => Token::PATTERN_OFFSET(span),
Err(_) => unreachable!(),
},
NormalToken::PatternLength(ident) => match from_utf8(ident) {
Ok(_) => Token::PATTERN_LENGTH(span),
Err(_) => unreachable!(),
},
NormalToken::FloatLit(lit) => match from_utf8(lit) {
Ok(_) => Token::FLOAT_LIT(span),
Err(_) => unreachable!(),
},
NormalToken::IntegerLit(lit) => match from_utf8(lit) {
Ok(_) => Token::INTEGER_LIT(span),
Err(_) => unreachable!(),
},
}
}
fn convert_hex_pattern_token(token: HexPatternToken, span: Span) -> Token {
match token {
HexPatternToken::Byte => Token::HEX_BYTE(span),
HexPatternToken::Pipe => Token::PIPE(span),
HexPatternToken::LParen => Token::L_PAREN(span),
HexPatternToken::RParen => Token::R_PAREN(span),
HexPatternToken::LBracket => Token::L_BRACKET(span),
HexPatternToken::RBracket => Token::R_BRACKET(span),
HexPatternToken::Whitespace => Token::WHITESPACE(span),
HexPatternToken::LF | HexPatternToken::CR | HexPatternToken::CRLF => {
Token::NEWLINE(span)
}
HexPatternToken::BlockComment | HexPatternToken::Comment => {
Token::COMMENT(span)
}
}
}
fn convert_hex_jump_token(token: HexJumpToken, span: Span) -> Token {
match token {
HexJumpToken::Hyphen => Token::HYPHEN(span),
HexJumpToken::Whitespace => Token::WHITESPACE(span),
HexJumpToken::LF | HexJumpToken::CR | HexJumpToken::CRLF => {
Token::NEWLINE(span)
}
HexJumpToken::IntegerLit(lit) => match from_utf8(lit) {
Ok(_) => Token::INTEGER_LIT(span),
Err(_) => unreachable!(),
},
}
}