use ordered_float::NotNan;
#[cfg(feature = "quasi-quote")]
use crate::quasi_quote::Template;
use crate::{
ast::*,
span::{ContextMapping, SourceContext, Span, Spanned},
};
/// Lexes `source` into a balanced token sequence.
///
/// `filename`, when present, is registered as the initial source context of
/// the lexer. The result pairs the sequence produced by one
/// `balanced_token_sequence` call with the context mapping the lexer built
/// while scanning.
pub fn lex<'a>(source: &'a str, filename: Option<&str>) -> (BalancedTokenSequence, ContextMapping<'a>) {
    let mut lexer = Lexer::new(source, filename);
    let tokens = lexer.balanced_token_sequence();
    (tokens, lexer.ctx_map)
}
mod lexer_core {
use regex_automata::{Anchored, Input, meta::Regex};
use crate::span::{ContextId, ContextMapping, SourceContext, Span};
/// Something that can be matched against the *start* of a string.
pub trait Pattern {
    /// Returns the byte length of the match at the beginning of `string`,
    /// or `None` when `string` does not start with this pattern.
    fn matches(self, string: &str) -> Option<usize>;
}

/// A string slice matches when `string` starts with it.
impl Pattern for &'_ str {
    fn matches(self, string: &str) -> Option<usize> {
        string.strip_prefix(self).map(|_| self.len())
    }
}

/// A character matches when it is the first character of `string`; the
/// reported length is its UTF-8 width.
impl Pattern for char {
    fn matches(self, string: &str) -> Option<usize> {
        string.starts_with(self).then(|| self.len_utf8())
    }
}
/// A regex matches when an anchored search finds a match at the very start
/// of `string`; the reported length is the length of that match.
impl Pattern for &'_ Regex {
    fn matches(self, string: &str) -> Option<usize> {
        let anchored = Input::new(string).anchored(Anchored::Yes);
        let found = self.find(anchored)?;
        Some(found.len())
    }
}
/// Cursor-based scanner over a source string.
///
/// Tracks the byte position, the physical line number and the active source
/// context while characters are consumed.
pub struct Lexer<'a> {
// The complete text being lexed.
string: &'a str,
// Byte offset into `string` of the next unread character.
cursor: usize,
// True while only whitespace has been consumed since the last newline
// (and at the very start of the input).
line_begin: bool,
// Physical line number; starts at 1 and grows by one per consumed '\n'.
lineno: i32,
// Context id attached to spans created by `make_span`.
ctx_id: ContextId,
// Mapping that stores every source context registered during lexing.
pub(crate) ctx_map: ContextMapping<'a>,
}
/// Snapshot of the lexer's scanning state, produced by `Lexer::checkpoint`
/// and consumed by `Lexer::restore` to backtrack.
///
/// It does not capture `ctx_map`, so contexts inserted after the snapshot
/// remain in the mapping after a restore.
#[derive(Clone, Copy)]
pub struct LexerCheckpoint {
// Saved byte offset.
cursor: usize,
// Saved start-of-line flag.
line_begin: bool,
// Saved physical line number.
lineno: i32,
// Saved active context id.
ctx_id: ContextId,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `string`, on line 1.
    ///
    /// When `filename` is given, a context with that name and a line offset
    /// of 0 is registered and becomes the active context; otherwise the
    /// active context is `ContextId::none()`.
    pub fn new(string: &'a str, filename: Option<&str>) -> Self {
        let mut ctx_map = ContextMapping::new(string);
        let ctx_id = filename.map_or(ContextId::none(), |name| {
            ctx_map.insert_context(SourceContext {
                filename: name.into(),
                line_offset: 0,
            })
        });
        Self {
            string,
            cursor: 0,
            line_begin: true,
            lineno: 1,
            ctx_id,
            ctx_map,
        }
    }

    /// Captures the current scanning state so it can be restored later.
    pub fn checkpoint(&self) -> LexerCheckpoint {
        let Self { cursor, line_begin, lineno, ctx_id, .. } = *self;
        LexerCheckpoint { cursor, line_begin, lineno, ctx_id }
    }

    /// Rewinds (or fast-forwards) the scanning state to `checkpoint`.
    ///
    /// Only the cursor, line flag, line number and active context id are
    /// reset; the context mapping itself is left untouched.
    pub fn restore(&mut self, checkpoint: LexerCheckpoint) {
        let LexerCheckpoint { cursor, line_begin, lineno, ctx_id } = checkpoint;
        self.cursor = cursor;
        self.line_begin = line_begin;
        self.lineno = lineno;
        self.ctx_id = ctx_id;
    }

    /// Updates line bookkeeping for one consumed character.
    fn on_token(&mut self, ch: char) {
        match ch {
            // A newline starts a fresh line.
            '\n' => {
                self.line_begin = true;
                self.lineno += 1;
            }
            // The first non-whitespace character ends the "line begin" state.
            _ if !ch.is_whitespace() => self.line_begin = false,
            _ => {}
        }
    }

    /// The unread tail of the input.
    pub fn remaining(&self) -> &'a str {
        &self.string[self.cursor..]
    }

    /// Whether the cursor has reached the end of the input.
    pub fn is_eof(&self) -> bool {
        self.string.len() <= self.cursor
    }

    /// Current byte offset into the input.
    pub fn cursor(&self) -> usize {
        self.cursor
    }

    /// Whether only whitespace has been consumed on the current line so far.
    pub fn line_begin(&self) -> bool {
        self.line_begin
    }

    /// Current physical line number (1-based).
    pub fn lineno(&self) -> i32 {
        self.lineno
    }

    /// Id of the currently active source context.
    pub fn ctx_id(&self) -> ContextId {
        self.ctx_id
    }

    /// Registers `context` in the mapping and makes it the active context.
    pub fn set_context(&mut self, context: SourceContext) {
        let id = self.ctx_map.insert_context(context);
        self.ctx_id = id;
    }

    /// Returns the next character without consuming it.
    pub fn peek(&self) -> Option<char> {
        self.remaining().chars().next()
    }

    /// Consumes and returns the next character, or `None` at end of input.
    pub fn eat(&mut self) -> Option<char> {
        self.peek().map(|ch| {
            self.cursor += ch.len_utf8();
            self.on_token(ch);
            ch
        })
    }

    /// Consumes the text matched by `pat` at the cursor and returns it.
    ///
    /// Returns `None` and consumes nothing when `pat` does not match. Line
    /// bookkeeping is updated for every character of the matched text.
    pub fn eat_if(&mut self, pat: impl Pattern) -> Option<&'a str> {
        let rest = self.remaining();
        let (matched, _) = rest.split_at(pat.matches(rest)?);
        matched.chars().for_each(|ch| self.on_token(ch));
        self.cursor += matched.len();
        Some(matched)
    }

    /// Builds a span from `start` up to the cursor in the active context.
    pub fn make_span(&self, start: usize) -> Span {
        Span::new(start..self.cursor, self.ctx_id)
    }
}
}
use lexer_core::Lexer;
impl<'a> Lexer<'a> {
/// Lexes an identifier: `_` or an `XID_Start` character followed by any
/// number of `XID_Continue` characters. Consumes nothing on failure.
fn identifier(&mut self) -> Option<Identifier> {
    self.eat_if(re!(r"[_\p{XID_Start}]\p{XID_Continue}*"))
        .map(|text| Identifier(text.into()))
}
/// Lexes an integer constant with an optional suffix.
///
/// The radix forms are tried in the order hexadecimal, binary, octal,
/// decimal; the first one that matches supplies the value.
fn integer_constant(&mut self) -> Option<IntegerConstant> {
    let value = if let Some(hex) = self.hexadecimal_constant() {
        hex
    } else if let Some(bin) = self.binary_constant() {
        bin
    } else if let Some(oct) = self.octal_constant() {
        oct
    } else {
        self.decimal_constant()?
    };
    let suffix = self.integer_suffix();
    Some(IntegerConstant { value, suffix })
}
/// Lexes a decimal constant (non-zero leading digit, optional `'` digit
/// separators). Values that do not fit in `i128` become `i128::MAX`.
fn decimal_constant(&mut self) -> Option<i128> {
    let literal = self.eat_if(re!(r"[1-9](?:'?[0-9])*"))?;
    // Digit separators carry no value; drop them before parsing.
    let digits: String = literal.chars().filter(|&c| c != '\'').collect();
    Some(digits.parse::<i128>().unwrap_or(i128::MAX))
}
/// Lexes an octal constant, either `0o`/`0O`-prefixed or in the classic
/// leading-`0` form. Values that do not fit in `i128` become `i128::MAX`.
fn octal_constant(&mut self) -> Option<i128> {
    let digits = match self.eat_if(re!(r"0[oO][0-7](?:'?[0-7])*")) {
        // Skip the two-character `0o` prefix.
        Some(prefixed) => &prefixed[2..],
        None => self.eat_if(re!(r"0(?:'?[0-7])*"))?,
    };
    let digits: String = digits.chars().filter(|&c| c != '\'').collect();
    Some(i128::from_str_radix(&digits, 8).unwrap_or(i128::MAX))
}
/// Lexes a `0x`/`0X`-prefixed hexadecimal constant with optional `'` digit
/// separators. Values that do not fit in `i128` become `i128::MAX`.
fn hexadecimal_constant(&mut self) -> Option<i128> {
    let literal = self.eat_if(re!(r"0[xX][0-9a-fA-F](?:'?[0-9a-fA-F])*"))?;
    // Drop the two-character prefix and any digit separators.
    let digits: String = literal[2..].chars().filter(|&c| c != '\'').collect();
    Some(i128::from_str_radix(&digits, 16).unwrap_or(i128::MAX))
}
/// Lexes a `0b`/`0B`-prefixed binary constant with optional `'` digit
/// separators. Values that do not fit in `i128` become `i128::MAX`.
fn binary_constant(&mut self) -> Option<i128> {
    let literal = self.eat_if(re!(r"0[bB][01](?:'?[01])*"))?;
    // Drop the two-character prefix and any digit separators.
    let digits: String = literal[2..].chars().filter(|&c| c != '\'').collect();
    Some(i128::from_str_radix(&digits, 2).unwrap_or(i128::MAX))
}
/// Lexes an optional integer suffix.
///
/// Combined (unsigned + width) suffixes are listed before their single
/// components so that the longer spelling is tried first.
fn integer_suffix(&mut self) -> Option<IntegerSuffix> {
    let candidates = [
        (re!(r"(u|U)(ll|LL)|(ll|LL)(u|U)"), IntegerSuffix::UnsignedLongLong),
        (re!(r"(u|U)(l|L)|(l|L)(u|U)"), IntegerSuffix::UnsignedLong),
        (re!(r"(u|U)(wb|WB)|(wb|WB)(u|U)"), IntegerSuffix::UnsignedBitPrecise),
        (re!(r"u|U"), IntegerSuffix::Unsigned),
        (re!(r"ll|LL"), IntegerSuffix::LongLong),
        (re!(r"l|L"), IntegerSuffix::Long),
        (re!(r"wb|WB"), IntegerSuffix::BitPrecise),
    ];
    for (pattern, suffix) in candidates {
        if self.eat_if(pattern).is_some() {
            return Some(suffix);
        }
    }
    None
}
/// Lexes a floating constant with an optional suffix, trying the
/// hexadecimal form first and the decimal form second.
fn floating_constant(&mut self) -> Option<FloatingConstant> {
    let value = match self.hexadecimal_floating_constant() {
        Some(hex) => hex,
        None => self.decimal_floating_constant()?,
    };
    let suffix = self.floating_suffix();
    Some(FloatingConstant { value, suffix })
}
/// Lexes a decimal floating constant: digits with a fractional part and an
/// optional exponent, or digits with a mandatory exponent. `'` digit
/// separators are accepted between digits.
///
/// Returns `None` without consuming input when the text at the cursor is
/// not a decimal floating constant or its value cannot be represented.
fn decimal_floating_constant(&mut self) -> Option<NotNan<f64>> {
    let ckpt = self.checkpoint();
    // Digit classes are spelled `[0-9]` on purpose: `\d` is Unicode-aware and
    // would accept non-ASCII digits that `f64` parsing then rejects.
    let literal = self.eat_if(re!(r"(?:(?:[0-9]+(?:'?[0-9]+)*)?\.(?:[0-9]+(?:'?[0-9]+)*)|(?:[0-9]+(?:'?[0-9]+)*)\.)(?:[eE][+-]?(?:[0-9]+(?:'?[0-9]+)*))?|(?:[0-9]+(?:'?[0-9]+)*)(?:[eE][+-]?(?:[0-9]+(?:'?[0-9]+)*))"))?;
    let value = literal
        .replace('\'', "")
        .parse::<f64>()
        .ok()
        .and_then(|parsed| NotNan::new(parsed).ok());
    if value.is_none() {
        // Never report failure with the cursor advanced past the literal.
        self.restore(ckpt);
    }
    value
}
/// Lexes a hexadecimal floating constant: `0x`/`0X`, hexadecimal digits with
/// an optional fractional part, and a mandatory binary exponent (`p`/`P`).
/// `'` digit separators are accepted between digits.
///
/// Returns `None` without consuming input when the text at the cursor is
/// not a hexadecimal floating constant or `hexf_parse` rejects it. Restoring
/// the cursor matters because the caller falls back to the decimal form,
/// which must start at the beginning of the literal rather than after it.
fn hexadecimal_floating_constant(&mut self) -> Option<NotNan<f64>> {
    let ckpt = self.checkpoint();
    // Exponent digits are spelled `[0-9]`: `\d` is Unicode-aware and would
    // accept non-ASCII digits that the parser then rejects.
    let literal = self.eat_if(re!(r"(?:0[xX])(?:(?:[0-9a-fA-F]+(?:'?[0-9a-fA-F]+)*)?\.(?:[0-9a-fA-F]+(?:'?[0-9a-fA-F]+)*)|(?:[0-9a-fA-F]+(?:'?[0-9a-fA-F]+)*)\.?)(?:[pP][+-]?(?:[0-9]+(?:'?[0-9]+)*))"))?;
    let value = hexf_parse::parse_hexf64(&literal.replace('\'', ""), false)
        .ok()
        .and_then(|parsed| NotNan::new(parsed).ok());
    if value.is_none() {
        self.restore(ckpt);
    }
    value
}
/// Lexes an optional floating suffix. The two-letter decimal suffixes are
/// tried before the single-letter `f`/`l` forms.
fn floating_suffix(&mut self) -> Option<FloatingSuffix> {
    let candidates = [
        (re!(r"df|DF"), FloatingSuffix::DF),
        (re!(r"dd|DD"), FloatingSuffix::DD),
        (re!(r"dl|DL"), FloatingSuffix::DL),
        (re!(r"f|F"), FloatingSuffix::F),
        (re!(r"l|L"), FloatingSuffix::L),
    ];
    for (pattern, suffix) in candidates {
        if self.eat_if(pattern).is_some() {
            return Some(suffix);
        }
    }
    None
}
/// Consumes an encoding prefix (`u8`, `u`, `U` or `L`) if one is present.
///
/// `u8` is listed before `u` so the longer prefix wins.
fn encoding_prefix(&mut self) -> Option<EncodingPrefix> {
    [
        ("u8", EncodingPrefix::U8),
        ("u", EncodingPrefix::U),
        ("U", EncodingPrefix::CapitalU),
        ("L", EncodingPrefix::L),
    ]
    .into_iter()
    .find_map(|(spelling, prefix)| self.eat_if(spelling).map(|_| prefix))
}
/// Lexes one backslash escape and returns the character it denotes.
///
/// Handles the simple escapes, octal (`\` + 1-3 digits), `\x` hexadecimal,
/// and `\u`/`\U` with exactly four or eight hexadecimal digits. Any other
/// character after the backslash is consumed and returned as-is.
///
/// Returns `None` when nothing follows the backslash, when the required
/// digits are missing, or when the numeric value is not a valid `char`.
/// The backslash (and whatever was matched) stays consumed in those cases.
fn escape_sequence(&mut self) -> Option<char> {
    self.eat_if('\\')?;
    let lead = self.peek()?;

    // Single-character escapes map directly to a value.
    let simple = match lead {
        '\'' | '"' | '?' | '\\' => Some(lead),
        'a' => Some('\x07'),
        'b' => Some('\x08'),
        'f' => Some('\x0C'),
        'n' => Some('\n'),
        'r' => Some('\r'),
        't' => Some('\t'),
        'v' => Some('\x0B'),
        _ => None,
    };
    if simple.is_some() {
        self.eat();
        return simple;
    }

    // Numeric escapes: collect the digit run and remember its radix.
    let (digits, radix) = match lead {
        // Octal digits start right after the backslash; nothing to skip.
        '0'..='7' => (self.eat_if(re!(r"[0-7]{1,3}"))?, 8),
        'x' => {
            self.eat();
            (self.eat_if(re!(r"[0-9a-fA-F]+"))?, 16)
        }
        'u' => {
            self.eat();
            (self.eat_if(re!(r"[0-9a-fA-F]{4}"))?, 16)
        }
        'U' => {
            self.eat();
            (self.eat_if(re!(r"[0-9a-fA-F]{8}"))?, 16)
        }
        // Unknown escape: yield the escaped character itself.
        _ => return self.eat(),
    };
    let code = u32::from_str_radix(digits, radix).ok()?;
    char::from_u32(code)
}
/// Lexes a character constant with an optional encoding prefix.
///
/// Consumes nothing and returns `None` unless an opening `'` follows the
/// optional prefix. The body ends at the closing `'`, which is consumed, or
/// at a newline or end of input, which is left in place; an unterminated
/// constant is still returned. Escapes that fail to decode contribute no
/// character.
fn character_constant(&mut self) -> Option<CharacterConstant> {
    let ckpt = self.checkpoint();
    let encoding_prefix = self.encoding_prefix();
    if self.eat_if('\'').is_none() {
        // The prefix (if any) belonged to something else; undo it.
        self.restore(ckpt);
        return None;
    }

    let mut value = String::new();
    while let Some(ch) = self.peek() {
        match ch {
            '\'' => {
                self.eat();
                break;
            }
            '\\' => value.extend(self.escape_sequence()),
            '\n' => break,
            _ => {
                value.push(ch);
                self.eat();
            }
        }
    }
    Some(CharacterConstant { encoding_prefix, value })
}
/// Lexes `false`, `true` or `nullptr`.
///
/// A keyword only counts when it is not immediately followed by an
/// alphanumeric character or `_`; otherwise the cursor is rewound and the
/// next keyword is tried.
fn predefined_constant(&mut self) -> Option<PredefinedConstant> {
    let keywords = [
        ("false", PredefinedConstant::False),
        ("true", PredefinedConstant::True),
        ("nullptr", PredefinedConstant::Nullptr),
    ];
    for (keyword, constant) in keywords {
        let ckpt = self.checkpoint();
        if self.eat_if(keyword).is_none() {
            continue;
        }
        let at_boundary = match self.peek() {
            None => true,
            Some(next) => !(next.is_alphanumeric() || next == '_'),
        };
        if at_boundary {
            return Some(constant);
        }
        // The keyword was only a prefix of a longer word.
        self.restore(ckpt);
    }
    None
}
/// Lexes a constant, trying predefined, floating, character and integer
/// constants in that order.
///
/// The cursor is reset to the starting position before every attempt and
/// again when all attempts fail, so `None` means nothing was consumed.
fn constant(&mut self) -> Option<Constant> {
    let ckpt = self.checkpoint();
    let found = self
        .predefined_constant()
        .map(Constant::Predefined)
        .or_else(|| {
            self.restore(ckpt);
            self.floating_constant().map(Constant::Floating)
        })
        .or_else(|| {
            self.restore(ckpt);
            self.character_constant().map(Constant::Character)
        })
        .or_else(|| {
            self.restore(ckpt);
            self.integer_constant().map(Constant::Integer)
        });
    if found.is_none() {
        self.restore(ckpt);
    }
    found
}
/// Lexes one or more adjacent string literals, each with an optional
/// encoding prefix, and returns them as a group.
///
/// A literal body ends at the closing `"`, which is consumed, or at a
/// newline or end of input, which is left in place; an unterminated literal
/// is still recorded. Escapes that fail to decode contribute no character.
///
/// Whitespace (as defined by `skip_whitespace`) is skipped after every
/// literal, including the last one, so on success the cursor rests after
/// any trailing whitespace. Returns `None`, consuming nothing, when no
/// literal starts at the cursor.
fn string_literal(&mut self) -> Option<StringLiterals> {
    let mut literals = Vec::new();
    loop {
        let ckpt = self.checkpoint();
        let encoding_prefix = self.encoding_prefix();
        if self.eat_if('"').is_none() {
            // Not a string literal; give back any prefix that was taken.
            self.restore(ckpt);
            break;
        }

        let mut value = String::new();
        while let Some(ch) = self.peek() {
            match ch {
                '"' => {
                    self.eat();
                    break;
                }
                '\\' => value.extend(self.escape_sequence()),
                '\n' => break,
                _ => {
                    value.push(ch);
                    self.eat();
                }
            }
        }
        literals.push(StringLiteral { encoding_prefix, value });
        self.skip_whitespace();
    }
    (!literals.is_empty()).then(|| StringLiterals(literals))
}
/// Lexes a backtick-delimited string and returns its raw contents.
///
/// No escape processing is performed and newlines are allowed. A missing
/// closing backtick ends the string at end of input.
fn quoted_string(&mut self) -> Option<String> {
    self.eat_if('`')?;
    let mut content = String::new();
    while let Some(ch) = self.eat() {
        if ch == '`' {
            break;
        }
        content.push(ch);
    }
    Some(content)
}
/// Lexes a punctuator.
///
/// Spellings are tried in table order: three-character forms, then
/// two-character forms, then single characters, so the longest spelling at
/// the cursor is the one that matches.
fn punctuator(&mut self) -> Option<Punctuator> {
    let table = [
        ("<<=", Punctuator::LeftShiftAssign),
        (">>=", Punctuator::RightShiftAssign),
        ("...", Punctuator::Ellipsis),
        ("*=", Punctuator::MulAssign),
        ("/=", Punctuator::DivAssign),
        ("%=", Punctuator::ModAssign),
        ("+=", Punctuator::AddAssign),
        ("-=", Punctuator::SubAssign),
        ("&=", Punctuator::AndAssign),
        ("^=", Punctuator::XorAssign),
        ("|=", Punctuator::OrAssign),
        ("##", Punctuator::HashHash),
        ("++", Punctuator::Increment),
        ("--", Punctuator::Decrement),
        ("<<", Punctuator::LeftShift),
        (">>", Punctuator::RightShift),
        ("<=", Punctuator::LessEqual),
        (">=", Punctuator::GreaterEqual),
        ("==", Punctuator::Equal),
        ("!=", Punctuator::NotEqual),
        ("&&", Punctuator::LogicalAnd),
        ("||", Punctuator::LogicalOr),
        ("->", Punctuator::Arrow),
        ("::", Punctuator::Scope),
        (".", Punctuator::Dot),
        ("&", Punctuator::Ampersand),
        ("*", Punctuator::Star),
        ("+", Punctuator::Plus),
        ("-", Punctuator::Minus),
        ("~", Punctuator::Tilde),
        ("!", Punctuator::Bang),
        ("/", Punctuator::Slash),
        ("%", Punctuator::Percent),
        ("<", Punctuator::Less),
        (">", Punctuator::Greater),
        ("^", Punctuator::Caret),
        ("|", Punctuator::Pipe),
        ("?", Punctuator::Question),
        (":", Punctuator::Colon),
        (";", Punctuator::Semicolon),
        ("=", Punctuator::Assign),
        (",", Punctuator::Comma),
        ("#", Punctuator::Hash),
    ];
    table
        .into_iter()
        .find_map(|(spelling, punct)| self.eat_if(spelling).map(|_| punct))
}
/// Lexes a quasi-quote template reference: `@` immediately followed by an
/// identifier.
///
/// Returns `None` without consuming input when the cursor is not at `@` or
/// when no identifier follows it. Rewinding in the second case keeps a lone
/// `@` from being silently absorbed into whatever token is lexed next.
#[cfg(feature = "quasi-quote")]
fn template(&mut self) -> Option<Template> {
    let ckpt = self.checkpoint();
    self.eat_if('@')?;
    let Some(id) = self.identifier() else {
        self.restore(ckpt);
        return None;
    };
    Some(Template { name: id.0 })
}
/// Skips a `//` comment up to, but not including, the terminating newline.
/// Returns whether a comment was present.
fn skip_line_comment(&mut self) -> bool {
    if self.eat_if("//").is_none() {
        return false;
    }
    while self.peek().is_some_and(|ch| ch != '\n') {
        self.eat();
    }
    true
}
/// Skips a `/* ... */` comment, including its terminator. An unterminated
/// comment runs to end of input. Returns whether a comment was present.
fn skip_block_comment(&mut self) -> bool {
    if self.eat_if("/*").is_none() {
        return false;
    }
    // Advance one character at a time until the terminator or end of input.
    while self.eat_if("*/").is_none() && self.eat().is_some() {}
    true
}
/// Skips a `#` directive that starts the current line (optionally preceded
/// by spaces or tabs) and consumes it through the end of the line.
///
/// Line markers update the active source context. Both the `# N "file"`
/// form and the `#line N "file"` form are recognised, and a quoted file
/// name may contain whitespace. Directives starting with `#pragma` and any
/// other directive are consumed without further effect.
///
/// Returns `false`, consuming nothing, when the cursor is not at the start
/// of a line or no `#` follows the leading blanks.
fn skip_line_directive(&mut self) -> bool {
    // Extracts `(line, filename)` from the directive text that follows `#`.
    fn parse_line_marker(directive: &str) -> Option<(i32, &str)> {
        let body = directive.trim_start();
        // Accept an optional `line` keyword in front of the line number.
        let body = match body.strip_prefix("line") {
            Some(rest) if rest.starts_with(char::is_whitespace) => rest.trim_start(),
            _ => body,
        };
        let (line, rest) = body.split_once(char::is_whitespace)?;
        let line = line.parse::<i32>().ok()?;
        let rest = rest.trim_start();
        // A properly quoted name is taken verbatim, whitespace included.
        if let Some(after_quote) = rest.strip_prefix('"')
            && let Some((name, _)) = after_quote.split_once('"')
        {
            return Some((line, name));
        }
        // Otherwise fall back to the first whitespace-delimited token.
        let token = rest.split_whitespace().next()?;
        Some((line, token.trim_matches('"')))
    }

    if !self.line_begin() {
        return false;
    }
    let ckpt = self.checkpoint();
    while matches!(self.peek(), Some(' ' | '\t')) {
        self.eat();
    }
    if self.eat_if('#').is_none() {
        self.restore(ckpt);
        return false;
    }

    let is_pragma = self.eat_if("pragma").is_some();
    let mut directive = String::new();
    while let Some(ch) = self.eat() {
        if ch == '\n' {
            break;
        }
        directive.push(ch);
    }

    if !is_pragma
        && let Some((line_num, file)) = parse_line_marker(&directive)
    {
        // The newline has been consumed, so `lineno()` is already the
        // physical number of the line that the marker labels as `line_num`.
        self.set_context(SourceContext {
            filename: file.to_string(),
            line_offset: self.lineno() - line_num,
        });
    }
    true
}
/// Skips everything that carries no tokens: whitespace, `//` and `/* */`
/// comments, and `#` directives at the start of a line. Repeats until a
/// full pass makes no progress.
fn skip_whitespace(&mut self) {
    loop {
        let before = self.cursor();
        while self.peek().is_some_and(char::is_whitespace) {
            self.eat();
        }
        let skipped_construct =
            self.skip_line_comment() || self.skip_block_comment() || self.skip_line_directive();
        if !skipped_construct && self.cursor() == before {
            break;
        }
    }
}
/// Lexes a delimited group: `open`, a nested balanced token sequence, and
/// `close`.
///
/// Returns `None`, consuming nothing, when the cursor is not at `open`.
/// When `close` is missing the group is still returned, with its sequence
/// marked as not closed. `make_token` wraps the inner sequence into the
/// resulting token; the span covers the whole group.
fn parse_bracketed<F>(&mut self, open: char, close: char, make_token: F) -> Option<Spanned<BalancedToken>>
where
    F: Fn(BalancedTokenSequence) -> BalancedToken,
{
    let start = self.cursor();
    self.eat_if(open)?;
    let mut inner = self.balanced_token_sequence();
    let terminated = self.eat_if(close).is_some();
    if !terminated {
        inner.closed = false;
    }
    let span = self.make_span(start);
    Some(Spanned::new(make_token(inner), span))
}
/// Lexes the next token after skipping leading whitespace.
///
/// Candidates are tried in this order: parenthesised, bracketed and braced
/// groups, string literals, backtick-quoted strings, templates (only with
/// the `quasi-quote` feature), constants, identifiers and punctuators. Any
/// other non-whitespace character is consumed as an `Unknown` token.
/// Returns `None` when nothing is left to lex.
fn balanced_token(&mut self) -> Option<Spanned<BalancedToken>> {
    self.skip_whitespace();
    let start = self.cursor();

    let group = self
        .parse_bracketed('(', ')', BalancedToken::Parenthesized)
        .or_else(|| self.parse_bracketed('[', ']', BalancedToken::Bracketed))
        .or_else(|| self.parse_bracketed('{', '}', BalancedToken::Braced));
    if group.is_some() {
        return group;
    }

    if let Some(literals) = self.string_literal() {
        let token = BalancedToken::StringLiteral(literals);
        return Some(Spanned::new(token, self.make_span(start)));
    }
    if let Some(quoted) = self.quoted_string() {
        let token = BalancedToken::QuotedString(quoted);
        return Some(Spanned::new(token, self.make_span(start)));
    }
    #[cfg(feature = "quasi-quote")]
    if let Some(template) = self.template() {
        let token = BalancedToken::Template(template);
        return Some(Spanned::new(token, self.make_span(start)));
    }
    if let Some(constant) = self.constant() {
        let token = BalancedToken::Constant(constant);
        return Some(Spanned::new(token, self.make_span(start)));
    }
    if let Some(ident) = self.identifier() {
        let token = BalancedToken::Identifier(ident);
        return Some(Spanned::new(token, self.make_span(start)));
    }
    if let Some(punct) = self.punctuator() {
        let token = BalancedToken::Punctuator(punct);
        return Some(Spanned::new(token, self.make_span(start)));
    }

    // Nothing recognised: swallow one stray character so lexing can go on.
    match self.peek() {
        Some(stray) if !stray.is_whitespace() => {
            self.eat();
            Some(Spanned::new(BalancedToken::Unknown, self.make_span(start)))
        }
        _ => None,
    }
}
/// Lexes tokens until end of input, a closing `)`, `]` or `}`, or a point
/// where no token can be produced.
///
/// The closing delimiter itself is not consumed. The returned sequence is
/// marked as closed and carries an end-of-input span located at the cursor
/// after trailing whitespace has been skipped.
fn balanced_token_sequence(&mut self) -> BalancedTokenSequence {
    let mut tokens = Vec::new();
    loop {
        self.skip_whitespace();
        if matches!(self.peek(), None | Some(')' | ']' | '}')) {
            break;
        }
        let Some(token) = self.balanced_token() else {
            break;
        };
        tokens.push(token);
    }
    self.skip_whitespace();
    let eoi = Span::new_eoi(self.cursor(), self.ctx_id());
    BalancedTokenSequence { tokens, closed: true, eoi }
}
}