use crate::token::{Float, Integer, Location, PreprocessorError, Punct};
use std::str::Chars;
use unicode_xid::UnicodeXID;
/// A character paired with the 1-based number of the line it was read on.
type CharAndLine = (char, u32);

/// Character iterator that normalises line endings and tracks line numbers.
///
/// `\n`, `\r`, `\r\n` and `\n\r` are each yielded as a single `'\n'`; every other character is
/// yielded unchanged. Each item carries the number of the line it was read on, counting from 1
/// (a line ending is reported on the line it terminates).
#[derive(Clone)]
pub struct CharsAndLine<'a> {
    inner: Chars<'a>,
    line: u32,
}

impl<'a> CharsAndLine<'a> {
    /// Creates an iterator over `input`, starting at line 1.
    pub fn new(input: &'a str) -> Self {
        CharsAndLine {
            inner: input.chars(),
            line: 1,
        }
    }

    /// Returns a pointer to the first byte of the input that has not been consumed yet.
    pub fn get_current_ptr(&self) -> *const u8 {
        self.inner.as_str().as_ptr()
    }
}

impl<'a> Iterator for CharsAndLine<'a> {
    type Item = CharAndLine;

    fn next(&mut self) -> Option<Self::Item> {
        let current = self.inner.next()?;

        // A line ending is either `\n` or `\r`, optionally followed by the *other* one.
        let partner = match current {
            '\n' => '\r',
            '\r' => '\n',
            _ => return Some((current, self.line)),
        };

        // Swallow the second half of a two-character line ending, if present.
        let mut lookahead = self.inner.clone();
        if lookahead.next() == Some(partner) {
            self.inner = lookahead;
        }

        let line = self.line;
        self.line += 1;
        Some(('\n', line))
    }
}

/// Iterator adaptor that removes line continuations (a backslash immediately followed by a line
/// ending) from the character stream produced by [`CharsAndLine`].
#[derive(Clone)]
pub struct SkipBackslashNewline<'a> {
    inner: CharsAndLine<'a>,
}

impl<'a> SkipBackslashNewline<'a> {
    /// Creates an iterator over `input`.
    pub fn new(input: &'a str) -> Self {
        SkipBackslashNewline {
            inner: CharsAndLine::new(input),
        }
    }

    /// Returns a pointer to the first byte of the input that has not been consumed yet.
    pub fn get_current_ptr(&self) -> *const u8 {
        self.inner.get_current_ptr()
    }
}

impl<'a> Iterator for SkipBackslashNewline<'a> {
    type Item = CharAndLine;

    fn next(&mut self) -> Option<Self::Item> {
        let mut current = self.inner.next()?;

        // Deliberately iterative: input may contain arbitrarily many consecutive continuations,
        // and recursing once per continuation would overflow the stack.
        while current.0 == '\\' {
            let mut peek_inner = self.inner.clone();
            match peek_inner.next() {
                Some(('\n', _)) => {
                    // Drop the backslash and the line ending, then look at what follows.
                    self.inner = peek_inner;
                    current = self.inner.next()?;
                }
                // A backslash that is not followed by a line ending is an ordinary character.
                _ => break,
            }
        }

        Some(current)
    }
}
/// Iterator adaptor that collapses each `//` or `/* */` comment into a single
/// [`COMMENT_SENTINEL_VALUE`] character.
#[derive(Clone)]
pub struct ReplaceComments<'a> {
    inner: SkipBackslashNewline<'a>,
}

/// Character yielded in place of a whole comment.
pub const COMMENT_SENTINEL_VALUE: char = '\r';

impl<'a> ReplaceComments<'a> {
    /// Creates an iterator over `input`.
    pub fn new(input: &'a str) -> Self {
        let inner = SkipBackslashNewline::new(input);
        ReplaceComments { inner }
    }

    /// Forwards to the wrapped iterator's `get_current_ptr`.
    pub fn get_current_ptr(&self) -> *const u8 {
        self.inner.get_current_ptr()
    }
}

impl<'a> Iterator for ReplaceComments<'a> {
    type Item = CharAndLine;

    fn next(&mut self) -> Option<Self::Item> {
        let current = self.inner.next()?;

        // Only a '/' can open a comment; everything else passes straight through.
        if current.0 != '/' {
            debug_assert!(current.0 != COMMENT_SENTINEL_VALUE);
            return Some(current);
        }

        let mut lookahead = self.inner.clone();
        match lookahead.next() {
            // Line comment: swallow everything up to, but not including, the next '\n'.
            Some(('/', _)) => {
                self.inner = lookahead;
                loop {
                    let mut probe = self.inner.clone();
                    match probe.next() {
                        None | Some(('\n', _)) => break,
                        Some(_) => self.inner = probe,
                    }
                }
                Some((COMMENT_SENTINEL_VALUE, current.1))
            }

            // Block comment: swallow through the closing "*/", or to the end of the input when
            // the comment is never closed.
            Some(('*', _)) => {
                let mut previous_was_star = false;
                for (c, _) in lookahead.by_ref() {
                    if previous_was_star && c == '/' {
                        break;
                    }
                    previous_was_star = c == '*';
                }
                self.inner = lookahead;
                // The sentinel is reported on the line where the comment started.
                Some((COMMENT_SENTINEL_VALUE, current.1))
            }

            // A lone '/' is an ordinary character.
            _ => Some(current),
        }
    }
}
/// Character source for the lexer: wraps [`ReplaceComments`] with one character of lookahead and
/// remembers where in the input the peeked and the last consumed characters were read from.
#[derive(Clone)]
struct LexerCharIterator<'a> {
    inner: ReplaceComments<'a>,
    /// Character that has been read from `inner` but not consumed yet, together with the input
    /// pointer observed just before it was read.
    peeked: Option<(CharAndLine, *const u8)>,
    /// Most recently consumed character, together with the input pointer observed just before
    /// it was read.
    last_consumed: (CharAndLine, *const u8),
    /// Pointer to the first byte of the input; offsets are computed relative to it.
    input_start: *const u8,
}

/// Placeholder stored as the "last consumed" character before anything has been consumed.
pub const NONE_CONSUMED_SENTINEL_VALUE: char = '\r';

impl<'a> LexerCharIterator<'a> {
    /// Creates an iterator over `input` with nothing peeked and nothing consumed.
    pub fn new(input: &'a str) -> Self {
        let input_start = input.as_bytes().as_ptr();
        LexerCharIterator {
            inner: ReplaceComments::new(input),
            peeked: None,
            last_consumed: ((NONE_CONSUMED_SENTINEL_VALUE, 0), input_start),
            input_start,
        }
    }

    /// Consumes and returns the next character, or `None` at the end of the input.
    fn next_char(&mut self) -> Option<char> {
        let consumed = match self.peeked.take() {
            Some(entry) => entry,
            None => {
                // The pointer must be sampled before advancing so that it marks the start.
                let start = self.inner.get_current_ptr();
                let item = self.inner.next()?;
                (item, start)
            }
        };
        self.last_consumed = consumed;
        let ((c, _), _) = consumed;
        Some(c)
    }

    /// Returns the next character without consuming it, or `None` at the end of the input.
    fn peek_char(&mut self) -> Option<char> {
        if self.peeked.is_none() {
            let start = self.inner.get_current_ptr();
            let item = self.inner.next()?;
            self.peeked = Some((item, start));
        }
        self.peeked.map(|((c, _), _)| c)
    }

    /// Line number of the peeked character if there is one, otherwise of the last consumed one.
    fn get_last_seen_line(&self) -> u32 {
        let ((_, line), _) = self.peeked.unwrap_or(self.last_consumed);
        line
    }

    /// Byte offset recorded for the peeked character if there is one, otherwise for the last
    /// consumed one.
    fn get_last_seen_start_offset(&self) -> usize {
        let (_, start) = self.peeked.unwrap_or(self.last_consumed);
        start as usize - self.input_start as usize
    }

    /// Byte offset recorded for the last consumed character plus that character's UTF-8 length.
    fn get_last_consumed_end_offset(&self) -> usize {
        let ((c, _), start) = self.last_consumed;
        start as usize - self.input_start as usize + c.len_utf8()
    }
}
/// The payload of a lexed [`Token`].
#[derive(Clone, PartialEq, Debug)]
pub enum TokenValue {
/// A `#` character.
Hash,
/// A line ending; the lexer also emits one at the end of input when the last line is not
/// terminated.
NewLine,
/// An identifier.
Ident(String),
/// An integer literal.
Integer(Integer),
/// A floating-point literal.
Float(Float),
/// A punctuation token or operator.
Punct(Punct),
}
/// Wraps a [`Punct`] in [`TokenValue::Punct`].
impl From<Punct> for TokenValue {
fn from(punct: Punct) -> Self {
TokenValue::Punct(punct)
}
}
/// A token produced by the [`Lexer`].
#[derive(Clone, PartialEq, Debug)]
pub struct Token {
/// What was lexed.
pub value: TokenValue,
/// Line number and byte range recorded for the token.
pub location: Location,
/// `true` when whitespace, a comment or a line ending was skipped right before this token,
/// and for the first token of the input.
pub leading_whitespace: bool,
/// `true` when no other token precedes this one on its line.
pub start_of_line: bool,
}
/// Item type yielded by the [`Lexer`]: a token, or an error together with a location.
pub type LexerItem = Result<Token, (PreprocessorError, Location)>;
/// Tokenizer for preprocessor input; tokens are obtained through its `Iterator` implementation.
pub struct Lexer<'a> {
    /// Character source with one character of lookahead.
    inner: LexerCharIterator<'a>,
    /// Whether whitespace, a comment or a line ending was skipped since the previous token.
    leading_whitespace: bool,
    /// Whether no token has been produced yet on the current line.
    start_of_line: bool,
    /// Whether at least one comment has been skipped so far.
    had_comments: bool,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the beginning of `input`.
    pub fn new(input: &'a str) -> Self {
        Lexer {
            inner: LexerCharIterator::new(input),
            leading_whitespace: true,
            start_of_line: true,
            had_comments: false,
        }
    }

    /// Returns `true` once the lexer has skipped over at least one comment.
    pub fn had_comments(&self) -> bool {
        self.had_comments
    }

    /// Lexes an identifier: an XID-start character or `_`, followed by any number of
    /// XID-continue characters.
    fn parse_identifier(&mut self) -> Result<TokenValue, PreprocessorError> {
        let mut identifier = String::default();
        if let Some(c) = self.next_char_if(|c| c.is_xid_start() || c == '_') {
            identifier.push(c);
        }
        let rest = self.consume_chars(|c| c.is_xid_continue());
        identifier.push_str(&rest);
        Ok(TokenValue::Ident(identifier))
    }

    /// Consumes an optional `u`/`U` suffix and returns whether the literal is signed
    /// (`true` when no such suffix is present).
    fn parse_integer_signedness_suffix(&mut self) -> bool {
        self.next_char_if(|c| c == 'u' || c == 'U').is_none()
    }

    /// Returns the bit width of an integer literal.
    ///
    /// # Errors
    ///
    /// Fails when the next character is an `l`/`L` (64-bit) or `s`/`S` (16-bit) suffix; the
    /// suffix character is not consumed.
    fn parse_integer_width_suffix(&mut self) -> Result<i32, PreprocessorError> {
        match self.inner.peek_char() {
            Some('l') | Some('L') => Err(PreprocessorError::NotSupported64BitLiteral),
            Some('s') | Some('S') => Err(PreprocessorError::NotSupported16BitLiteral),
            _ => Ok(32),
        }
    }

    /// Returns the bit width of a float literal, consuming an `f`/`F` suffix if present.
    ///
    /// # Errors
    ///
    /// Fails when the next character is an `l`/`L` (64-bit) or `h`/`H` (16-bit) suffix; the
    /// suffix character is not consumed.
    fn parse_float_width_suffix(&mut self) -> Result<i32, PreprocessorError> {
        match self.inner.peek_char() {
            Some('l') | Some('L') => Err(PreprocessorError::NotSupported64BitLiteral),
            Some('h') | Some('H') => Err(PreprocessorError::NotSupported16BitLiteral),
            Some('f') | Some('F') => {
                self.inner.next_char();
                Ok(32)
            }
            _ => Ok(32),
        }
    }

    /// Consumes and returns the next character if it satisfies `predicate`.
    fn next_char_if(&mut self, predicate: impl FnOnce(char) -> bool) -> Option<char> {
        if let Some(c) = self.inner.peek_char() {
            if predicate(c) {
                return self.inner.next_char();
            }
        }
        None
    }

    /// Consumes characters for as long as they satisfy `filter` and returns them as a string.
    fn consume_chars(&mut self, filter: impl Fn(char) -> bool) -> String {
        let mut result: String = Default::default();
        while let Some(c) = self.next_char_if(&filter) {
            result.push(c);
        }
        result
    }

    /// Lexes an integer or float literal whose first character, `first_char` (a decimal digit or
    /// `.`), has already been consumed by the caller.
    ///
    /// # Errors
    ///
    /// Returns an error for unsupported width suffixes, for float text that `f32` parsing
    /// rejects, and for integer text that `u64::from_str_radix` rejects (reported as
    /// `IntegerOverflow` whatever the actual cause, e.g. also for `0x` without digits or for a
    /// digit `8`/`9` in an octal literal).
    fn parse_number(&mut self, first_char: char) -> Result<TokenValue, PreprocessorError> {
        let mut is_float = false;
        let mut integer_radix = 10;
        let mut raw: String = Default::default();
        raw.push(first_char);

        if first_char == '0' {
            match self.inner.peek_char() {
                // Hexadecimal: the 'x' is dropped from `raw`, the hex digits are kept.
                Some('x') | Some('X') => {
                    self.inner.next_char();
                    raw += &self.consume_chars(|c| c.is_ascii_hexdigit());
                    integer_radix = 16;
                }
                // Octal, unless it later turns out to be a float such as `009.5`; all decimal
                // digits are therefore consumed below and only the radix is remembered here.
                Some('0'..='9') => {
                    integer_radix = 8;
                }
                _ => {}
            };
        }

        if first_char == '.' {
            is_float = true;
        } else {
            // Remaining digits of an integer, or of the integral part of a float.
            raw += &self.consume_chars(|c| c.is_ascii_digit());

            // A hexadecimal literal never becomes a float: a '.' after it starts a new token.
            // Without this guard `0x1.5` would be lexed as the float `1.5`.
            if integer_radix != 16 && self.next_char_if(|c| c == '.').is_some() {
                raw.push('.');
                is_float = true;
            }
        }

        // Fractional digits.
        if is_float {
            raw += &self.consume_chars(|c| c.is_ascii_digit());
        }

        // Exponent: (e|E)(+|-)?digits. Not attempted for hexadecimal integers, where 'e' is a
        // digit (so `0x1E-1` is not a float).
        if (is_float || integer_radix == 8 || integer_radix == 10)
            && self.next_char_if(|c| c == 'e' || c == 'E').is_some()
        {
            raw.push('e');
            is_float = true;

            match self.inner.peek_char() {
                Some('+') => {
                    self.inner.next_char();
                    raw.push('+');
                }
                Some('-') => {
                    self.inner.next_char();
                    raw.push('-');
                }
                _ => {}
            }

            raw += &self.consume_chars(|c| c.is_ascii_digit());
        }

        if is_float {
            let width = self.parse_float_width_suffix()?;
            Ok(TokenValue::Float(Float {
                value: raw
                    .parse::<f32>()
                    .map_err(|_| PreprocessorError::FloatParsingError)?,
                width,
            }))
        } else {
            let signed = self.parse_integer_signedness_suffix();
            let width = self.parse_integer_width_suffix()?;

            // Strip the leading '0' of octal and hexadecimal literals.
            if integer_radix != 10 {
                raw = raw.split_off(1);
            }

            Ok(TokenValue::Integer(Integer {
                value: u64::from_str_radix(&raw, integer_radix)
                    .map_err(|_err| PreprocessorError::IntegerOverflow)?,
                signed,
                width,
            }))
        }
    }

    /// Lexes a punctuation token or `#`, preferring the longest operator that matches.
    ///
    /// # Errors
    ///
    /// Returns `UnexpectedCharacter` when the next character does not start any known token.
    /// Only that single character is consumed in this case.
    fn parse_punctuation(&mut self) -> Result<TokenValue, PreprocessorError> {
        // Read up to three characters of lookahead, then rewind and consume only what matched.
        let save_point = self.inner.clone();
        let char0 = self.inner.next_char().unwrap_or('\0');
        let char1 = self.inner.next_char().unwrap_or('\0');
        let char2 = self.inner.next_char().unwrap_or('\0');

        let maybe_punct = match (char0, char1, char2) {
            ('<', '<', '=') => Some((Punct::LeftShiftAssign, 3)),
            ('<', '<', _) => Some((Punct::LeftShift, 2)),
            ('<', '=', _) => Some((Punct::LessEqual, 2)),
            ('<', _, _) => Some((Punct::LeftAngle, 1)),
            ('>', '>', '=') => Some((Punct::RightShiftAssign, 3)),
            ('>', '>', _) => Some((Punct::RightShift, 2)),
            ('>', '=', _) => Some((Punct::GreaterEqual, 2)),
            ('>', _, _) => Some((Punct::RightAngle, 1)),
            ('+', '+', _) => Some((Punct::Increment, 2)),
            ('+', '=', _) => Some((Punct::AddAssign, 2)),
            ('+', _, _) => Some((Punct::Plus, 1)),
            ('-', '-', _) => Some((Punct::Decrement, 2)),
            ('-', '=', _) => Some((Punct::SubAssign, 2)),
            ('-', _, _) => Some((Punct::Minus, 1)),
            ('&', '&', _) => Some((Punct::LogicalAnd, 2)),
            ('&', '=', _) => Some((Punct::AndAssign, 2)),
            ('&', _, _) => Some((Punct::Ampersand, 1)),
            ('|', '|', _) => Some((Punct::LogicalOr, 2)),
            ('|', '=', _) => Some((Punct::OrAssign, 2)),
            ('|', _, _) => Some((Punct::Pipe, 1)),
            ('^', '^', _) => Some((Punct::LogicalXor, 2)),
            ('^', '=', _) => Some((Punct::XorAssign, 2)),
            ('^', _, _) => Some((Punct::Caret, 1)),
            ('=', '=', _) => Some((Punct::EqualEqual, 2)),
            ('=', _, _) => Some((Punct::Equal, 1)),
            ('!', '=', _) => Some((Punct::NotEqual, 2)),
            ('!', _, _) => Some((Punct::Bang, 1)),
            ('*', '=', _) => Some((Punct::MulAssign, 2)),
            ('*', _, _) => Some((Punct::Star, 1)),
            ('/', '=', _) => Some((Punct::DivAssign, 2)),
            ('/', _, _) => Some((Punct::Slash, 1)),
            ('%', '=', _) => Some((Punct::ModAssign, 2)),
            ('%', _, _) => Some((Punct::Percent, 1)),
            ('(', _, _) => Some((Punct::LeftParen, 1)),
            (')', _, _) => Some((Punct::RightParen, 1)),
            ('{', _, _) => Some((Punct::LeftBrace, 1)),
            ('}', _, _) => Some((Punct::RightBrace, 1)),
            ('[', _, _) => Some((Punct::LeftBracket, 1)),
            (']', _, _) => Some((Punct::RightBracket, 1)),
            (',', _, _) => Some((Punct::Comma, 1)),
            (';', _, _) => Some((Punct::Semicolon, 1)),
            (':', _, _) => Some((Punct::Colon, 1)),
            ('~', _, _) => Some((Punct::Tilde, 1)),
            ('?', _, _) => Some((Punct::Question, 1)),
            _ => None,
        };

        // Always rewind: the lookahead above must not swallow characters that belong to the
        // following tokens, including when the first character turns out to be invalid.
        self.inner = save_point;

        match maybe_punct {
            Some((punct, size)) => {
                for _ in 0..size {
                    self.inner.next_char();
                }
                Ok(punct.into())
            }
            None => {
                // Consume exactly the one character this call is responsible for.
                self.inner.next_char();
                if char0 == '#' {
                    Ok(TokenValue::Hash)
                } else {
                    Err(PreprocessorError::UnexpectedCharacter)
                }
            }
        }
    }
}
impl<'a> Iterator for Lexer<'a> {
    type Item = LexerItem;

    /// Produces the next token, skipping whitespace and comments.
    ///
    /// If the input does not end with a line ending, one extra `NewLine` token is emitted
    /// before the iterator finishes. Errors are returned together with the location of the
    /// text that caused them.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(current_char) = self.inner.peek_char() {
            let had_leading_whitespace = self.leading_whitespace;
            self.leading_whitespace = false;

            let mut location = Location {
                line: self.inner.get_last_seen_line(),
                start: self.inner.get_last_seen_start_offset() as u32,
                end: 0,
            };

            let was_start_of_line = self.start_of_line;
            self.start_of_line = false;

            let value = match current_char {
                // Whitespace and comments produce no token; they only set flags for the next one.
                ' ' | '\t' | '\x0b' | '\x0c' | COMMENT_SENTINEL_VALUE => {
                    if current_char == COMMENT_SENTINEL_VALUE {
                        self.had_comments = true;
                    }
                    // Skipping whitespace does not end the "start of line" state.
                    self.start_of_line = was_start_of_line;
                    self.leading_whitespace = true;
                    self.inner.next_char();
                    continue;
                }
                '\n' => {
                    self.leading_whitespace = true;
                    self.start_of_line = true;
                    self.inner.next_char();
                    Ok(TokenValue::NewLine)
                }
                c @ '0'..='9' => {
                    self.inner.next_char();
                    self.parse_number(c)
                }
                // '.' is handled here rather than as punctuation because it can start a float.
                '.' => {
                    self.inner.next_char();
                    match self.inner.peek_char() {
                        Some('0'..='9') => self.parse_number('.'),
                        _ => Ok(TokenValue::Punct(Punct::Dot)),
                    }
                }
                _ => {
                    if current_char.is_xid_start() || current_char == '_' {
                        self.parse_identifier()
                    } else {
                        self.parse_punctuation()
                    }
                }
            };

            location.end = self.inner.get_last_consumed_end_offset() as u32;

            // Attach the computed location to errors as well as to tokens, so that callers can
            // report where the problem is instead of always pointing at a default location.
            return Some(match value {
                Ok(value) => Ok(Token {
                    value,
                    location,
                    leading_whitespace: had_leading_whitespace,
                    start_of_line: was_start_of_line,
                }),
                Err(error) => Err((error, location)),
            });
        }

        // End of input: if the last line was not terminated, emit a final zero-width newline.
        if !self.start_of_line {
            self.start_of_line = true;
            let end_offset = self.inner.get_last_consumed_end_offset() as u32;
            Some(Ok(Token {
                value: TokenValue::NewLine,
                location: Location {
                    line: self.inner.get_last_seen_line(),
                    start: end_offset,
                    end: end_offset,
                },
                leading_whitespace: self.leading_whitespace,
                start_of_line: false,
            }))
        } else {
            None
        }
    }
}