use std::cmp::Ordering;
use unicode_ident::{is_xid_continue, is_xid_start};
use ruff_python_ast::StringFlags;
use ruff_python_ast::str_prefix::{AnyStringPrefix, StringLiteralPrefix};
use ruff_python_ast::token::{TokenFlags, TokenKind};
use ruff_python_trivia::is_python_whitespace;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::Mode;
use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
use crate::lexer::interpolated_string::{
InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
};
use crate::string::InterpolatedStringKind;
mod cursor;
mod indentation;
mod interpolated_string;
/// The Unicode byte-order mark. `Lexer::new` skips it when lexing starts at offset 0.
const BOM: char = '\u{feff}';
/// A lexer over Python source text that produces one token per call to `next_token`.
#[derive(Debug)]
pub struct Lexer<'src> {
/// The complete source text being lexed.
source: &'src str,
/// Cursor over `source` that tracks the current lexing position.
cursor: Cursor<'src>,
/// Kind of the most recently lexed token.
current_kind: TokenKind,
/// Source range of the most recently lexed token (or of the error for `Unknown` tokens).
current_range: TextRange,
/// Flags of the most recently lexed token; reset at the start of every `next_token` call.
current_flags: TokenFlags,
/// Where the lexer is relative to logical lines (see `State`).
state: State,
/// Bracket nesting depth: incremented on `(`, `[`, `{` and decremented (saturating) on
/// the matching closing characters.
nesting: u32,
/// The indentation stack used to decide when to emit `Indent` / `Dedent` tokens.
indentations: Indentations,
/// Target indentation of a dedent in progress. While set, `lex_token` keeps emitting
/// `Dedent` tokens until the indentation stack no longer exceeds it.
pending_indentation: Option<Indentation>,
/// The mode the lexer was created with.
mode: Mode,
/// Contexts of the interpolated strings (f-/t-strings) that are currently open.
interpolated_strings: InterpolatedStrings,
/// Lexical errors collected so far; handed out by `finish`.
errors: Vec<LexicalError>,
}
impl<'src> Lexer<'src> {
pub(crate) fn new(source: &'src str, mode: Mode, start_offset: TextSize) -> Self {
assert!(
u32::try_from(source.len()).is_ok(),
"Lexer only supports files with a size up to 4GB"
);
let (state, nesting) = if mode == Mode::ParenthesizedExpression {
(State::Other, 1)
} else {
(State::AfterNewline, 0)
};
let mut lexer = Lexer {
source,
cursor: Cursor::new(source),
state,
current_kind: TokenKind::EndOfFile,
current_range: TextRange::empty(start_offset),
current_flags: TokenFlags::empty(),
nesting,
indentations: Indentations::default(),
pending_indentation: None,
mode,
interpolated_strings: InterpolatedStrings::default(),
errors: Vec::new(),
};
if start_offset == TextSize::new(0) {
lexer.cursor.eat_char(BOM);
} else {
lexer.cursor.skip_bytes(start_offset.to_usize());
}
lexer
}
/// Returns the kind of the most recently lexed token.
pub(crate) const fn current_kind(&self) -> TokenKind {
self.current_kind
}
/// Returns the source range of the most recently lexed token.
pub(crate) const fn current_range(&self) -> TextRange {
self.current_range
}
/// Returns the current bracket nesting depth.
#[inline]
pub(crate) const fn nesting(&self) -> u32 {
self.nesting
}
/// Returns the flags of the most recently lexed token.
pub(crate) const fn current_flags(&self) -> TokenFlags {
self.current_flags
}
/// Records `error`, points the current token range at the error's location, and
/// returns [`TokenKind::Unknown`] so callers can `return self.push_error(..)`.
fn push_error(&mut self, error: LexicalError) -> TokenKind {
    let location = error.location();
    self.errors.push(error);
    self.current_range = location;
    TokenKind::Unknown
}
/// Lexes the next token, stores its kind, range and flags on the lexer, and returns
/// its kind.
pub fn next_token(&mut self) -> TokenKind {
    self.cursor.start_token();
    self.current_flags = TokenFlags::empty();

    let kind = self.lex_token();
    self.current_kind = kind;

    match kind {
        // `push_error` has already set the range to the error's location.
        TokenKind::Unknown => {}
        _ => self.current_range = self.token_range(),
    }

    kind
}
/// Lexes a single token and returns its kind.
///
/// In order: continues an open interpolated string, otherwise emits pending `Dedent`
/// tokens; then handles indentation or skips whitespace; and finally dispatches on the
/// next character (or handles the end of the input).
fn lex_token(&mut self) -> TokenKind {
// Inside an interpolated string but outside of a replacement field: try to lex the
// literal part or the closing quote.
if let Some(interpolated_string) = self.interpolated_strings.current() {
if !interpolated_string.is_in_interpolation(self.nesting) {
if let Some(token) = self.lex_interpolated_string_middle_or_end() {
if token.is_interpolated_string_end() {
self.interpolated_strings.pop();
}
return token;
}
}
}
// Only reached when no interpolated string is open: keep returning `Dedent` tokens
// while the indentation stack is deeper than the pending indentation.
else if let Some(indentation) = self.pending_indentation.take() {
match self.indentations.current().try_compare(indentation) {
Ok(Ordering::Greater) => {
// Still deeper than the target: re-arm the pending indentation for the next call.
self.pending_indentation = Some(indentation);
if self.indentations.dedent_one(indentation).is_err() {
return self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
));
}
return TokenKind::Dedent;
}
Ok(_) => {}
Err(_) => {
return self.push_error(LexicalError::new(
LexicalErrorType::IndentationError,
self.token_range(),
));
}
}
}
if self.state.is_after_newline() {
if let Some(indentation) = self.eat_indentation() {
return indentation;
}
} else {
if let Err(error) = self.skip_whitespace() {
return self.push_error(error);
}
}
// Whitespace may have been consumed above, so restart the token at the current position.
self.cursor.start_token();
if let Some(c) = self.cursor.bump() {
if c.is_ascii() {
self.consume_ascii_character(c)
} else if is_unicode_identifier_start(c) {
let identifier = self.lex_identifier(c);
self.state = State::Other;
identifier
} else {
self.push_error(LexicalError::new(
LexicalErrorType::UnrecognizedToken { tok: c },
self.token_range(),
))
}
} else {
// `bump` returned `None`: the input is exhausted.
self.consume_end()
}
}
/// Consumes the leading whitespace of a line and computes its indentation.
///
/// Returns `Some(token)` when an `Indent`, `Dedent` or error token must be emitted, and
/// `None` when the indentation is unchanged or the line has no content (it is followed by
/// a newline, a comment, or the end-of-file character).
fn eat_indentation(&mut self) -> Option<TokenKind> {
let mut indentation = Indentation::root();
loop {
match self.cursor.first() {
' ' => {
self.cursor.bump();
indentation = indentation.add_space();
}
'\t' => {
self.cursor.bump();
indentation = indentation.add_tab();
}
// A backslash must be followed by a line break (`\r`, `\r\n` or `\n`).
'\\' => {
self.cursor.bump();
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else if !self.cursor.eat_char('\n') {
// The error range covers just the backslash that was consumed.
return Some(self.push_error(LexicalError::new(
LexicalErrorType::LineContinuationError,
TextRange::at(self.offset() - '\\'.text_len(), '\\'.text_len()),
)));
}
if self.cursor.is_eof() {
return Some(self.push_error(LexicalError::new(
LexicalErrorType::Eof,
self.token_range(),
)));
}
// With a non-root indentation already measured, whitespace after the continuation
// is skipped without contributing to the indentation.
if indentation != Indentation::root() {
self.cursor.eat_while(is_python_whitespace);
}
}
// A form feed resets the indentation computed so far.
'\x0C' => {
self.cursor.bump();
indentation = Indentation::root();
}
_ => break,
}
}
// Only lines with actual content take part in indentation handling.
if !matches!(self.cursor.first(), '\n' | '\r' | '#' | EOF_CHAR) {
self.state = State::NonEmptyLogicalLine;
return self.handle_indentation(indentation);
}
None
}
/// Compares `indentation` against the top of the indentation stack and returns the
/// token to emit for it: `Indent`, `Dedent`, an error token, or `None` when the
/// indentation is unchanged.
fn handle_indentation(&mut self, indentation: Indentation) -> Option<TokenKind> {
    let comparison = self.indentations.current().try_compare(indentation);

    let token = match comparison {
        Ok(Ordering::Equal) => return None,
        Ok(Ordering::Less) => {
            self.indentations.indent(indentation);
            TokenKind::Indent
        }
        Ok(Ordering::Greater) => {
            // Remember the target so that `lex_token` can emit the remaining dedents.
            self.pending_indentation = Some(indentation);

            if self.indentations.dedent_one(indentation).is_ok() {
                // Exclude the consumed indentation whitespace from the `Dedent` range.
                self.cursor.start_token();
                TokenKind::Dedent
            } else {
                let range = self.token_range();
                self.push_error(LexicalError::new(
                    LexicalErrorType::IndentationError,
                    range,
                ))
            }
        }
        Err(_) => {
            let range = self.token_range();
            self.push_error(LexicalError::new(
                LexicalErrorType::IndentationError,
                range,
            ))
        }
    };

    Some(token)
}
/// Skips spaces, tabs, form feeds and backslash line continuations.
///
/// # Errors
///
/// Returns a `LineContinuationError` when a backslash is not followed by a line break,
/// and an `Eof` error when the input ends right after a line continuation.
fn skip_whitespace(&mut self) -> Result<(), LexicalError> {
    loop {
        match self.cursor.first() {
            ' ' | '\t' | '\x0C' => {
                self.cursor.bump();
            }
            '\\' => {
                self.cursor.bump();

                let ate_line_break = if self.cursor.eat_char('\r') {
                    self.cursor.eat_char('\n');
                    true
                } else {
                    self.cursor.eat_char('\n')
                };

                if !ate_line_break {
                    // The error range covers just the backslash.
                    let backslash_len = '\\'.text_len();
                    return Err(LexicalError::new(
                        LexicalErrorType::LineContinuationError,
                        TextRange::at(self.offset() - backslash_len, backslash_len),
                    ));
                }

                if self.cursor.is_eof() {
                    return Err(LexicalError::new(LexicalErrorType::Eof, self.token_range()));
                }
            }
            _ => return Ok(()),
        }
    }
}
/// Lexes the token that starts with the already consumed ASCII character `c`.
///
/// Arms that produce a value fall through to the end of the function, which sets the
/// state to `State::Other`. Arms that `return` early (comments, a single `=`, newlines
/// and errors) bypass that assignment and manage the state themselves.
fn consume_ascii_character(&mut self, c: char) -> TokenKind {
let token = match c {
c if is_ascii_identifier_start(c) => self.lex_identifier(c),
'0'..='9' => self.lex_number(c),
// Comments leave the lexer state untouched.
'#' => return self.lex_comment(),
'\'' | '"' => self.lex_string(c),
'=' => {
if self.cursor.eat_char('=') {
TokenKind::EqEqual
} else {
// Remembered so that `%` / `!` right after `=` can start an escape command in
// IPython mode (see the guarded arm below).
self.state = State::AfterEqual;
return TokenKind::Equal;
}
}
'+' => {
if self.cursor.eat_char('=') {
TokenKind::PlusEqual
} else {
TokenKind::Plus
}
}
'*' => {
if self.cursor.eat_char('=') {
TokenKind::StarEqual
} else if self.cursor.eat_char('*') {
if self.cursor.eat_char('=') {
TokenKind::DoubleStarEqual
} else {
TokenKind::DoubleStar
}
} else {
TokenKind::Star
}
}
// IPython mode: `%` or `!` directly after `=` outside of any brackets.
'%' | '!'
if self.mode == Mode::Ipython
&& self.state.is_after_equal()
&& self.nesting == 0 =>
{
self.lex_ipython_escape_command()
}
// IPython mode: an escape character at the start of a logical line. These guarded
// arms must stay above the plain operator arms for the same characters.
'%' | '!' | '?' | '/' | ';' | ','
if self.mode == Mode::Ipython && self.state.is_new_logical_line() =>
{
self.lex_ipython_escape_command()
}
'?' if self.mode == Mode::Ipython => TokenKind::Question,
'/' => {
if self.cursor.eat_char('=') {
TokenKind::SlashEqual
} else if self.cursor.eat_char('/') {
if self.cursor.eat_char('=') {
TokenKind::DoubleSlashEqual
} else {
TokenKind::DoubleSlash
}
} else {
TokenKind::Slash
}
}
'%' => {
if self.cursor.eat_char('=') {
TokenKind::PercentEqual
} else {
TokenKind::Percent
}
}
'|' => {
if self.cursor.eat_char('=') {
TokenKind::VbarEqual
} else {
TokenKind::Vbar
}
}
'^' => {
if self.cursor.eat_char('=') {
TokenKind::CircumflexEqual
} else {
TokenKind::CircumFlex
}
}
'&' => {
if self.cursor.eat_char('=') {
TokenKind::AmperEqual
} else {
TokenKind::Amper
}
}
'-' => {
if self.cursor.eat_char('=') {
TokenKind::MinusEqual
} else if self.cursor.eat_char('>') {
TokenKind::Rarrow
} else {
TokenKind::Minus
}
}
'@' => {
if self.cursor.eat_char('=') {
TokenKind::AtEqual
} else {
TokenKind::At
}
}
'!' => {
if self.cursor.eat_char('=') {
TokenKind::NotEqual
} else {
TokenKind::Exclamation
}
}
'~' => TokenKind::Tilde,
// Opening brackets increase the nesting; closing brackets decrease it without
// going below zero.
'(' => {
self.nesting += 1;
TokenKind::Lpar
}
')' => {
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rpar
}
'[' => {
self.nesting += 1;
TokenKind::Lsqb
}
']' => {
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rsqb
}
'{' => {
self.nesting += 1;
TokenKind::Lbrace
}
'}' => {
if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
// A `}` at the nesting level the interpolated string was opened at has no
// matching `{` inside the string.
if interpolated_string.nesting() == self.nesting {
let error_type = LexicalErrorType::from_interpolated_string_error(
InterpolatedStringErrorType::SingleRbrace,
interpolated_string.kind(),
);
return self.push_error(LexicalError::new(error_type, self.token_range()));
}
interpolated_string.try_end_format_spec(self.nesting);
}
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rbrace
}
':' => {
// When the colon starts a format spec of the current interpolated string, a
// following `=` is not consumed, so `:=` is not lexed as one token there.
if self
.interpolated_strings
.current_mut()
.is_some_and(|interpolated_string| {
interpolated_string.try_start_format_spec(self.nesting)
})
{
TokenKind::Colon
} else if self.cursor.eat_char('=') {
TokenKind::ColonEqual
} else {
TokenKind::Colon
}
}
';' => TokenKind::Semi,
'<' => {
if self.cursor.eat_char('<') {
if self.cursor.eat_char('=') {
TokenKind::LeftShiftEqual
} else {
TokenKind::LeftShift
}
} else if self.cursor.eat_char('=') {
TokenKind::LessEqual
} else {
TokenKind::Less
}
}
'>' => {
if self.cursor.eat_char('>') {
if self.cursor.eat_char('=') {
TokenKind::RightShiftEqual
} else {
TokenKind::RightShift
}
} else if self.cursor.eat_char('=') {
TokenKind::GreaterEqual
} else {
TokenKind::Greater
}
}
',' => TokenKind::Comma,
'.' => {
if self.cursor.first().is_ascii_digit() {
self.lex_decimal_number('.')
} else if self.cursor.eat_char2('.', '.') {
TokenKind::Ellipsis
} else {
TokenKind::Dot
}
}
// A newline ends the logical line only outside of brackets and when the line had
// content; otherwise it is a non-logical newline.
'\n' => {
return if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
interpolated_string.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
};
}
// Same as `\n`, but an immediately following `\n` is consumed as part of the token.
'\r' => {
self.cursor.eat_char('\n');
return if self.nesting == 0 && !self.state.is_new_logical_line() {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
interpolated_string.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
};
}
_ => {
self.state = State::Other;
return self.push_error(LexicalError::new(
LexicalErrorType::UnrecognizedToken { tok: c },
self.token_range(),
));
}
};
self.state = State::Other;
token
}
fn lex_identifier(&mut self, first: char) -> TokenKind {
let quote = match (first, self.cursor.first()) {
(_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
self.cursor.bump();
quote
}),
(_, second) if is_quote(self.cursor.second()) => {
self.try_double_char_prefix([first, second]).then(|| {
self.cursor.bump();
self.cursor.bump().unwrap()
})
}
_ => None,
};
if let Some(quote) = quote {
if self.current_flags.is_interpolated_string() {
if let Some(kind) = self.lex_interpolated_string_start(quote) {
return kind;
}
}
return self.lex_string(quote);
}
let mut is_ascii = first.is_ascii();
self.cursor
.eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
if !is_ascii {
self.current_flags |= TokenFlags::NON_ASCII_NAME;
return TokenKind::Name;
}
let text = self.token_text();
if text.len() > 8 {
return TokenKind::Name;
}
match text.as_bytes() {
b"False" => TokenKind::False,
b"None" => TokenKind::None,
b"True" => TokenKind::True,
b"and" => TokenKind::And,
b"as" => TokenKind::As,
b"assert" => TokenKind::Assert,
b"async" => TokenKind::Async,
b"await" => TokenKind::Await,
b"break" => TokenKind::Break,
b"case" => TokenKind::Case,
b"class" => TokenKind::Class,
b"continue" => TokenKind::Continue,
b"def" => TokenKind::Def,
b"del" => TokenKind::Del,
b"elif" => TokenKind::Elif,
b"else" => TokenKind::Else,
b"except" => TokenKind::Except,
b"finally" => TokenKind::Finally,
b"for" => TokenKind::For,
b"from" => TokenKind::From,
b"global" => TokenKind::Global,
b"if" => TokenKind::If,
b"import" => TokenKind::Import,
b"in" => TokenKind::In,
b"is" => TokenKind::Is,
b"lazy" => TokenKind::Lazy,
b"lambda" => TokenKind::Lambda,
b"match" => TokenKind::Match,
b"nonlocal" => TokenKind::Nonlocal,
b"not" => TokenKind::Not,
b"or" => TokenKind::Or,
b"pass" => TokenKind::Pass,
b"raise" => TokenKind::Raise,
b"return" => TokenKind::Return,
b"try" => TokenKind::Try,
b"type" => TokenKind::Type,
b"while" => TokenKind::While,
b"with" => TokenKind::With,
b"yield" => TokenKind::Yield,
_ => TokenKind::Name,
}
}
/// Adds the flag for a one-character string prefix to the current token flags.
///
/// Returns `false`, leaving the flags untouched, when `first` is not a string prefix.
fn try_single_char_prefix(&mut self, first: char) -> bool {
    let prefix_flag = match first {
        'f' | 'F' => TokenFlags::F_STRING,
        't' | 'T' => TokenFlags::T_STRING,
        'u' | 'U' => TokenFlags::UNICODE_STRING,
        'b' | 'B' => TokenFlags::BYTE_STRING,
        'r' => TokenFlags::RAW_STRING_LOWERCASE,
        'R' => TokenFlags::RAW_STRING_UPPERCASE,
        _ => return false,
    };
    self.current_flags |= prefix_flag;
    true
}
/// Adds the flags for a two-character string prefix (a raw marker combined with `f`,
/// `t` or `b`, in either order) to the current token flags.
///
/// Returns `false`, leaving the flags untouched, when `value` is not such a prefix.
fn try_double_char_prefix(&mut self, value: [char; 2]) -> bool {
    let prefix_flags = match value {
        ['r', 'f' | 'F'] | ['f' | 'F', 'r'] => {
            TokenFlags::F_STRING | TokenFlags::RAW_STRING_LOWERCASE
        }
        ['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
            TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE
        }
        ['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
            TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE
        }
        ['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
            TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE
        }
        ['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
            TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE
        }
        ['R', 'b' | 'B'] | ['b' | 'B', 'R'] => {
            TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_UPPERCASE
        }
        _ => return false,
    };
    self.current_flags |= prefix_flags;
    true
}
/// Lexes the start of an interpolated string whose opening `quote` has just been
/// consumed, and pushes a new context for it.
///
/// Returns `None` when no context can be created from the current flags.
fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
    #[cfg(debug_assertions)]
    debug_assert_eq!(self.cursor.previous(), quote);

    if quote == '"' {
        self.current_flags |= TokenFlags::DOUBLE_QUOTES;
    }

    // Two more quote characters make this a triple-quoted string.
    if self.cursor.eat_char2(quote, quote) {
        self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
    }

    let context = InterpolatedStringContext::new(self.current_flags, self.nesting)?;
    let start_token = context.kind().start_token();
    self.interpolated_strings.push(context);

    Some(start_token)
}
/// Lexes either the closing quote(s) or a literal part of the current interpolated string.
///
/// Returns the end token when the cursor is at the closing quote(s), the middle token
/// when at least one character of literal text was consumed, and `None` when nothing was
/// consumed.
///
/// # Panics
///
/// Panics when no interpolated string is currently open.
fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
let interpolated_string = self.interpolated_strings.current().unwrap();
let string_kind = interpolated_string.kind();
let interpolated_flags = interpolated_string.flags();
// Check for the end of the string first.
if interpolated_string.is_triple_quoted() {
let quote_char = interpolated_string.quote_char();
if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
self.current_flags = interpolated_string.flags();
return Some(string_kind.end_token());
}
} else if self.cursor.eat_char(interpolated_string.quote_char()) {
self.current_flags = interpolated_string.flags();
return Some(string_kind.end_token());
}
// Computed once before the loop; the loop itself does not change the nesting except
// on the error paths, which break out immediately.
let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);
// Set after `\N{` so that the matching `}` is consumed as part of the literal.
let mut in_named_unicode = false;
loop {
match self.cursor.first() {
// The guard distinguishes the real end of input from the same character
// appearing in the source.
EOF_CHAR if self.cursor.is_eof() => {
let error = if interpolated_string.is_triple_quoted() {
InterpolatedStringErrorType::UnterminatedTripleQuotedString
} else {
InterpolatedStringErrorType::UnterminatedString
};
// Abandon the string: restore the nesting it was opened at and drop its context.
self.nesting = interpolated_string.nesting();
self.interpolated_strings.pop();
self.current_flags |= TokenFlags::UNCLOSED_STRING;
self.push_error(LexicalError::new(
LexicalErrorType::from_interpolated_string_error(error, string_kind),
self.token_range(),
));
break;
}
// A line break ends a single-quoted interpolated string with an error.
'\n' | '\r' if !interpolated_string.is_triple_quoted() => {
let error_type = if in_format_spec {
InterpolatedStringErrorType::NewlineInFormatSpec
} else {
InterpolatedStringErrorType::UnterminatedString
};
self.nesting = interpolated_string.nesting();
self.interpolated_strings.pop();
self.current_flags |= TokenFlags::UNCLOSED_STRING;
self.push_error(LexicalError::new(
LexicalErrorType::from_interpolated_string_error(error_type, string_kind),
self.token_range(),
));
break;
}
'\\' => {
// Consume the backslash. A following `{` or `}` is left for the next iteration.
self.cursor.bump(); if matches!(self.cursor.first(), '{' | '}') {
continue;
} else if !interpolated_string.is_raw_string() {
if self.cursor.eat_char2('N', '{') {
in_named_unicode = true;
continue;
}
}
// Consume the escaped character, treating `\r\n` as a single one.
if self.cursor.eat_char('\r') {
self.cursor.eat_char('\n');
} else {
self.cursor.bump();
}
}
quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
if let Some(triple_quotes) = interpolated_string.triple_quotes() {
// In a triple-quoted string only the full closing sequence ends the literal.
if self.cursor.rest().starts_with(triple_quotes) {
break;
}
self.cursor.bump();
} else {
break;
}
}
'{' => {
// `{{` outside of a format spec is consumed as literal text.
if self.cursor.second() == '{' && !in_format_spec {
self.cursor.bump();
self.cursor.bump(); } else {
break;
}
}
'}' => {
if in_named_unicode {
in_named_unicode = false;
self.cursor.bump();
// `}}` outside of a format spec is consumed as literal text.
} else if self.cursor.second() == '}' && !in_format_spec {
self.cursor.bump();
self.cursor.bump(); } else {
break;
}
}
_ => {
self.cursor.bump();
}
}
}
let range = self.token_range();
// Nothing was consumed, so there is no middle token to emit.
if range.is_empty() {
return None;
}
self.current_flags = interpolated_flags;
Some(string_kind.middle_token())
}
/// Lexes a (non-interpolated) string literal whose opening `quote` has just been
/// consumed.
///
/// Always returns [`TokenKind::String`]. When the string is not terminated, the
/// `UNCLOSED_STRING` flag is set on the token and an `UnclosedStringError` is recorded.
///
/// # Panics
///
/// Panics if `quote` does not fit into a single byte.
fn lex_string(&mut self, quote: char) -> TokenKind {
    #[cfg(debug_assertions)]
    debug_assert_eq!(self.cursor.previous(), quote);

    if quote == '"' {
        self.current_flags |= TokenFlags::DOUBLE_QUOTES;
    }

    // Two more quote characters make this a triple-quoted string.
    if self.cursor.eat_char2(quote, quote) {
        self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
    }

    let quote_byte = u8::try_from(quote).expect("char that fits in u8");

    if self.current_flags.is_triple_quoted() {
        // Triple-quoted: scan for an unescaped quote that is followed by two more quotes,
        // or give up at the end of the input.
        loop {
            let Some(index) = memchr::memchr(quote_byte, self.cursor.rest().as_bytes()) else {
                self.cursor.skip_to_end();
                self.current_flags |= TokenFlags::UNCLOSED_STRING;
                self.push_error(LexicalError::new(
                    LexicalErrorType::UnclosedStringError,
                    self.token_range(),
                ));
                break;
            };

            // An odd number of backslashes directly before the quote means it is escaped.
            let num_backslashes = self.cursor.rest().as_bytes()[..index]
                .iter()
                .rev()
                .take_while(|&&c| c == b'\\')
                .count();

            // Advance past the quote that was found.
            self.cursor.skip_bytes(index + 1);

            if num_backslashes % 2 == 1 {
                continue;
            }

            if self.cursor.eat_char2(quote, quote) {
                break;
            }
        }
    } else {
        // Single-quoted: scan for the closing quote, stopping early at an unescaped line
        // break or at the end of the input.
        loop {
            let Some(index) =
                memchr::memchr3(quote_byte, b'\r', b'\n', self.cursor.rest().as_bytes())
            else {
                self.cursor.skip_to_end();
                self.current_flags |= TokenFlags::UNCLOSED_STRING;
                self.push_error(LexicalError::new(
                    LexicalErrorType::UnclosedStringError,
                    self.token_range(),
                ));
                break;
            };

            // An odd number of backslashes directly before the match means it is escaped.
            let num_backslashes = self.cursor.rest().as_bytes()[..index]
                .iter()
                .rev()
                .take_while(|&&c| c == b'\\')
                .count();

            // Advance up to, but not past, the character that was found.
            self.cursor.skip_bytes(index);
            let quote_or_newline = self.cursor.first();

            if num_backslashes % 2 == 1 {
                // Escaped quote or line continuation: consume it (`\r\n` as one unit).
                self.cursor.bump();
                if quote_or_newline == '\r' {
                    self.cursor.eat_char('\n');
                }
                continue;
            }

            match quote_or_newline {
                '\r' | '\n' => {
                    // The line break is not consumed; it is lexed as its own token.
                    self.current_flags |= TokenFlags::UNCLOSED_STRING;
                    self.push_error(LexicalError::new(
                        LexicalErrorType::UnclosedStringError,
                        self.token_range(),
                    ));
                    break;
                }
                ch if ch == quote => {
                    self.cursor.bump();
                    break;
                }
                // The search above uses `memchr3`, so the message names that function.
                _ => unreachable!("memchr3 returned an index that is not a quote or a newline"),
            }
        }
    }

    TokenKind::String
}
/// Lexes a number literal whose first digit `first` has already been consumed.
///
/// A leading `0` followed by `x`, `o` or `b` (in either case) selects the hexadecimal,
/// octal or binary lexer; everything else is lexed as a decimal number.
fn lex_number(&mut self, first: char) -> TokenKind {
    if first != '0' {
        return self.lex_decimal_number(first);
    }

    let radix = match self.cursor.first() {
        'x' | 'X' => Radix::Hex,
        'o' | 'O' => Radix::Octal,
        'b' | 'B' => Radix::Binary,
        _ => return self.lex_decimal_number(first),
    };

    // Consume the radix marker.
    self.cursor.bump();
    self.lex_number_radix(radix)
}
/// Lexes the digits of a hexadecimal, octal or binary integer after its radix marker.
///
/// Records an error and returns an `Unknown` token when no digit follows the marker.
fn lex_number_radix(&mut self, radix: Radix) -> TokenKind {
    #[cfg(debug_assertions)]
    debug_assert!(matches!(
        self.cursor.previous().to_ascii_lowercase(),
        'x' | 'o' | 'b'
    ));

    if self.radix_run(radix).has_digit {
        return TokenKind::Int;
    }

    // Parsing an empty string produces the standard library's "empty" integer parse
    // error, whose debug representation is used as the message.
    let err = u64::from_str_radix("", radix.as_u32()).unwrap_err();
    let message = format!("{err:?}").into_boxed_str();
    let range = self.token_range();
    self.push_error(LexicalError::new(
        LexicalErrorType::OtherError(message),
        range,
    ))
}
/// Lexes a decimal integer, float or imaginary literal whose first character (a digit
/// or `.`) has already been consumed.
fn lex_decimal_number(&mut self, first_digit_or_dot: char) -> TokenKind {
    #[cfg(debug_assertions)]
    debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.');

    let starts_with_dot = first_digit_or_dot == '.';
    let starts_with_zero = first_digit_or_dot == '0';

    // Integer part: remember whether it contains any digit other than `0`.
    let mut saw_nonzero_digit = !starts_with_dot && !starts_with_zero;
    if !starts_with_dot {
        saw_nonzero_digit |= self.radix_run(Radix::Decimal).has_nonzero_digit;
    }

    // Fractional part.
    let mut is_float = false;
    if starts_with_dot || self.cursor.eat_char('.') {
        // An underscore directly after the dot is rejected.
        if self.cursor.eat_char('_') {
            return self.push_error(LexicalError::new(
                LexicalErrorType::OtherError("Invalid Syntax".into()),
                TextRange::new(self.offset() - TextSize::new(1), self.offset()),
            ));
        }
        self.radix_run(Radix::Decimal);
        is_float = true;
    }

    // Exponent: only consumed when `e` / `E` is followed by a digit or a signed digit.
    let has_exponent = matches!(
        self.cursor.rest().as_bytes(),
        [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..]
    );
    if has_exponent {
        self.cursor.bump();
        self.cursor.eat_if(|c| matches!(c, '+' | '-'));
        self.radix_run(Radix::Decimal);
        is_float = true;
    }

    // A trailing `j` / `J` turns any of the forms above into an imaginary literal.
    if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() {
        return TokenKind::Complex;
    }
    if is_float {
        return TokenKind::Float;
    }

    // An integer that starts with `0` may only consist of zeros.
    if starts_with_zero && saw_nonzero_digit {
        let range = self.token_range();
        return self.push_error(LexicalError::new(
            LexicalErrorType::OtherError("Invalid decimal integer literal".into()),
            range,
        ));
    }

    TokenKind::Int
}
/// Consumes a run of digits valid for `radix`, allowing single `_` separators that are
/// directly followed by another digit, and reports which kinds of digits were seen.
fn radix_run(&mut self, radix: Radix) -> RadixRun {
    let mut run = RadixRun::default();

    loop {
        match self.cursor.eat_if(|c| radix.is_digit(c)) {
            Some(digit) => {
                run.has_digit = true;
                if digit != '0' {
                    run.has_nonzero_digit = true;
                }
            }
            // Skip an underscore only when a digit follows it.
            None if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) => {
                self.cursor.bump();
            }
            None => return run,
        }
    }
}
/// Lexes a comment whose `#` has already been consumed. The comment extends up to, but
/// does not include, the next line break or the end of the input.
fn lex_comment(&mut self) -> TokenKind {
    #[cfg(debug_assertions)]
    debug_assert_eq!(self.cursor.previous(), '#');

    let rest = self.cursor.rest().as_bytes();
    let comment_len = match memchr::memchr2(b'\n', b'\r', rest) {
        Some(line_break) => line_break,
        None => rest.len(),
    };
    self.cursor.skip_bytes(comment_len);

    TokenKind::Comment
}
/// Lexes an IPython escape command up to, but not including, the next line break that
/// is not preceded by a backslash, or the end-of-file character.
fn lex_ipython_escape_command(&mut self) -> TokenKind {
    loop {
        let next = self.cursor.first();
        if matches!(next, '\n' | '\r' | EOF_CHAR) {
            return TokenKind::IpyEscapeCommand;
        }

        self.cursor.bump();

        // A line break directly after a backslash is part of the command.
        if next == '\\' {
            self.cursor.eat_char('\r');
            self.cursor.eat_char('\n');
        }
    }
}
/// Handles the end of the input.
///
/// Successive calls report unterminated interpolated strings and unclosed brackets,
/// then emit a trailing `Newline` if needed, one `Dedent` per remaining indentation
/// level, and finally `EndOfFile`.
fn consume_end(&mut self) -> TokenKind {
    // Report every interpolated string that is still open.
    while let Some(unterminated) = self.interpolated_strings.pop() {
        self.nesting = unterminated.nesting();
        let error_type = LexicalErrorType::from_interpolated_string_error(
            InterpolatedStringErrorType::UnterminatedString,
            unterminated.kind(),
        );
        let range = self.token_range();
        self.push_error(LexicalError::new(error_type, range));
    }

    // `ParenthesizedExpression` mode starts with a nesting of one.
    let initial_nesting = u32::from(self.mode == Mode::ParenthesizedExpression);
    if self.nesting > initial_nesting {
        // Reset so that the next call does not report the same error again.
        self.nesting = 0;
        let range = self.token_range();
        return self.push_error(LexicalError::new(LexicalErrorType::Eof, range));
    }

    if self.state.is_new_logical_line() {
        match self.indentations.dedent() {
            Some(_) => TokenKind::Dedent,
            None => TokenKind::EndOfFile,
        }
    } else {
        self.state = State::AfterNewline;
        TokenKind::Newline
    }
}
/// Moves the lexer back to `non_logical_newline_start` and lexes the token there again
/// with the nesting reduced by one.
///
/// Returns `true` when the lexer was moved back and a token was re-lexed. Returns
/// `false` when the nesting is already zero, when the current token belongs to a
/// triple-quoted interpolated string, or when no position is given. In the latter two
/// cases the nesting has still been reduced.
pub(crate) fn re_lex_logical_token(
    &mut self,
    non_logical_newline_start: Option<TextSize>,
) -> bool {
    if self.nesting == 0 {
        return false;
    }
    self.nesting -= 1;

    let in_triple_quoted_interpolated_string =
        self.current_flags.is_triple_quoted_interpolated_string();
    let new_position = match non_logical_newline_start {
        Some(position) if !in_triple_quoted_interpolated_string => position,
        _ => return false,
    };

    // A closing bracket will be lexed again after moving back and will reduce the
    // nesting once more, so undo the reduction made above.
    let at_closing_bracket = matches!(
        self.current_kind,
        TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace
    );
    if at_closing_bracket {
        self.nesting += 1;
    }

    let mut cursor = Cursor::new(self.source);
    cursor.skip_bytes(new_position.to_usize());
    self.cursor = cursor;

    self.state = State::Other;
    self.next_token();
    true
}
/// Re-interprets the current unclosed string token as the end of the enclosing
/// interpolated string of the given `kind`.
///
/// Does nothing unless an interpolated string is open, the current token is an unclosed,
/// prefix-less string with the same quote character and quoting style as that
/// interpolated string, and the rest of the string's first line contains only whitespace
/// (optionally followed by a comment).
pub(crate) fn re_lex_string_token_in_interpolation_element(
&mut self,
kind: InterpolatedStringKind,
) {
let Some(interpolated_string) = self.interpolated_strings.current() else {
return;
};
let current_string_flags = self.current_flags().as_any_string_flags();
if !matches!(self.current_kind, TokenKind::String)
|| !self.current_flags.is_unclosed()
|| current_string_flags.prefix() != AnyStringPrefix::Regular(StringLiteralPrefix::Empty)
|| current_string_flags.quote_style().as_char() != interpolated_string.quote_char()
|| current_string_flags.is_triple_quoted() != interpolated_string.is_triple_quoted()
{
return;
}
// Everything after the opening quote(s) of the current token.
let first_line = &self.source
[(self.current_range.start() + current_string_flags.quote_len()).to_usize()..];
for c in first_line.chars() {
if matches!(c, '\n' | '\r' | '#') {
break;
}
// Any other content means this really is a string; leave the token alone.
if !is_python_whitespace(c) {
return;
}
}
// Drop the unclosed-string error that was recorded for the token being replaced.
if self.errors.last().is_some_and(|error| {
error.location() == self.current_range
&& matches!(error.error(), LexicalErrorType::UnclosedStringError)
}) {
self.errors.pop();
}
// The replacement token covers only the quote(s).
self.current_range =
TextRange::at(self.current_range.start(), self.current_flags.quote_len());
self.current_kind = kind.end_token();
self.current_flags = TokenFlags::empty();
self.nesting = interpolated_string.nesting();
self.interpolated_strings.pop();
// Continue lexing right after the quote(s).
self.cursor = Cursor::new(self.source);
self.cursor.skip_bytes(self.current_range.end().to_usize());
}
/// Re-interprets the current token as the one-character name `r` when it is an unclosed
/// string with a lowercase raw prefix, and moves the cursor to directly after that `r`.
///
/// Does nothing for any other token.
pub(crate) fn re_lex_raw_string_in_format_spec(&mut self) {
    let lowercase_raw = AnyStringPrefix::Regular(StringLiteralPrefix::Raw { uppercase: false });
    let is_unclosed_raw_string = matches!(self.current_kind, TokenKind::String)
        && self.current_flags.is_unclosed()
        && self.current_flags.prefix() == lowercase_raw;
    if !is_unclosed_raw_string {
        return;
    }

    // Drop the unclosed-string error that was recorded for the token being replaced.
    let has_stale_error = self.errors.last().is_some_and(|error| {
        error.location() == self.current_range
            && matches!(error.error(), LexicalErrorType::UnclosedStringError)
    });
    if has_stale_error {
        self.errors.pop();
    }

    let start = self.current_range.start();
    self.current_range = TextRange::at(start, 'r'.text_len());
    self.current_kind = TokenKind::Name;
    self.current_flags = TokenFlags::empty();

    let mut cursor = Cursor::new(self.source);
    cursor.skip_bytes(self.current_range.end().to_usize());
    self.cursor = cursor;
}
/// Returns the source range of the token currently being lexed: it ends at the
/// current offset and has the length reported by the cursor.
#[inline]
fn token_range(&self) -> TextRange {
    let len = self.cursor.token_len();
    let start = self.offset() - len;
    TextRange::at(start, len)
}
/// Returns the source text of the token currently being lexed.
#[inline]
fn token_text(&self) -> &'src str {
    let range = self.token_range();
    &self.source[range]
}
/// Returns the lexer's current offset, computed as the source length minus the cursor's
/// `text_len()` (presumably the length of the text that is still unconsumed — confirm
/// against `Cursor`).
#[expect(clippy::cast_possible_truncation)]
#[inline]
fn offset(&self) -> TextSize {
// The cast cannot truncate: `new` asserts that the source length fits into a `u32`.
TextSize::new(self.source.len() as u32) - self.cursor.text_len()
}
/// Captures the lexer's current state so that it can be restored later with `rewind`.
pub(crate) fn checkpoint(&self) -> LexerCheckpoint {
    LexerCheckpoint {
        // Position and pending errors.
        cursor_offset: self.offset(),
        errors_position: self.errors.len(),
        // The current token.
        current_kind: self.current_kind,
        current_range: self.current_range,
        current_flags: self.current_flags,
        // Lexer state.
        state: self.state,
        nesting: self.nesting,
        pending_indentation: self.pending_indentation,
        indentations_checkpoint: self.indentations.checkpoint(),
        interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
    }
}
/// Restores the lexer to the state captured in `checkpoint`, discarding any errors
/// recorded after the checkpoint was taken.
pub(crate) fn rewind(&mut self, checkpoint: LexerCheckpoint) {
    // Rebuild the cursor at the recorded offset.
    let mut cursor = Cursor::new(self.source);
    cursor.skip_bytes(checkpoint.cursor_offset.to_usize());
    self.cursor = cursor;

    self.current_kind = checkpoint.current_kind;
    self.current_range = checkpoint.current_range;
    self.current_flags = checkpoint.current_flags;

    self.state = checkpoint.state;
    self.nesting = checkpoint.nesting;
    self.pending_indentation = checkpoint.pending_indentation;
    self.indentations.rewind(checkpoint.indentations_checkpoint);
    self.interpolated_strings
        .rewind(checkpoint.interpolated_strings_checkpoint);

    self.errors.truncate(checkpoint.errors_position);
}
/// Consumes the lexer and returns the lexical errors it collected.
pub fn finish(self) -> Vec<LexicalError> {
self.errors
}
}
/// A snapshot of the lexer's state, created by `Lexer::checkpoint` and restored by
/// `Lexer::rewind`.
pub(crate) struct LexerCheckpoint {
// The current token at the time of the checkpoint.
current_kind: TokenKind,
current_range: TextRange,
current_flags: TokenFlags,
// Offset into the source at which the cursor is re-created on rewind.
cursor_offset: TextSize,
state: State,
nesting: u32,
indentations_checkpoint: IndentationsCheckpoint,
pending_indentation: Option<Indentation>,
interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
// Number of errors recorded at the time of the checkpoint; later errors are dropped on rewind.
errors_position: usize,
}
/// Tracks where the lexer is relative to logical lines.
#[derive(Copy, Clone, Debug)]
enum State {
/// Set after a `Newline` token is emitted, and the initial state outside of
/// parenthesized-expression mode.
AfterNewline,
/// Set once the indentation of a line that has content has been consumed.
NonEmptyLogicalLine,
/// Set after a single `=` token is emitted.
AfterEqual,
/// Set after any other token.
Other,
}
impl State {
/// Returns `true` only for `State::AfterNewline`.
const fn is_after_newline(self) -> bool {
matches!(self, State::AfterNewline)
}
/// Returns `true` when no token has been lexed on the current logical line yet
/// (`AfterNewline` or `NonEmptyLogicalLine`).
const fn is_new_logical_line(self) -> bool {
matches!(self, State::AfterNewline | State::NonEmptyLogicalLine)
}
/// Returns `true` only for `State::AfterEqual`.
const fn is_after_equal(self) -> bool {
matches!(self, State::AfterEqual)
}
}
/// The numeric bases in which integer literals can be written.
#[derive(Copy, Clone, Debug)]
enum Radix {
/// Base 2.
Binary,
/// Base 8.
Octal,
/// Base 10.
Decimal,
/// Base 16.
Hex,
}
impl Radix {
/// Returns the numeric base of this radix.
const fn as_u32(self) -> u32 {
match self {
Radix::Binary => 2,
Radix::Octal => 8,
Radix::Decimal => 10,
Radix::Hex => 16,
}
}
/// Returns `true` if `c` is an ASCII digit that is valid in this radix.
const fn is_digit(self, c: char) -> bool {
match self {
Radix::Binary => matches!(c, '0'..='1'),
Radix::Octal => matches!(c, '0'..='7'),
Radix::Decimal => c.is_ascii_digit(),
Radix::Hex => c.is_ascii_hexdigit(),
}
}
}
/// Summary of a run of digits consumed by `Lexer::radix_run`.
#[derive(Default)]
struct RadixRun {
/// Whether at least one digit was consumed.
has_digit: bool,
/// Whether at least one digit other than `0` was consumed.
has_nonzero_digit: bool,
}
/// Returns `true` if `c` is a single or a double quote character.
const fn is_quote(c: char) -> bool {
    c == '\'' || c == '"'
}
/// Returns `true` if `c` is an ASCII letter or an underscore.
const fn is_ascii_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
/// Returns `true` if `c` can start an identifier according to `unicode_ident::is_xid_start`.
fn is_unicode_identifier_start(c: char) -> bool {
is_xid_start(c)
}
/// Returns `true` if `c` can continue an identifier.
///
/// `identifier_is_ascii_only` is set to `false` when `c` is a non-ASCII continuation
/// character; it is never set back to `true`.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
    // ASCII characters are answered without consulting the Unicode tables.
    if c.is_ascii() {
        return c.is_ascii_alphanumeric() || c == '_';
    }

    if !is_xid_continue(c) {
        return false;
    }

    *identifier_is_ascii_only = false;
    true
}
/// Creates a lexer for `source` in the given `mode`, starting at the default
/// `TextSize` offset.
pub fn lex(source: &str, mode: Mode) -> Lexer<'_> {
Lexer::new(source, mode, TextSize::default())
}
#[cfg(test)]
mod tests {
use std::fmt::Write;
use insta::assert_snapshot;
use ruff_python_ast::token::Token;
use ruff_text_size::Ranged;
use super::*;
// Line endings used to run the same test source with each end-of-line convention.
const WINDOWS_EOL: &str = "\r\n";
const MAC_EOL: &str = "\r";
const UNIX_EOL: &str = "\n";
/// The tokens and errors produced by lexing a test source; its `Display` output is what
/// the snapshot tests record.
struct LexerOutput {
tokens: Vec<Token>,
errors: Vec<LexicalError>,
}
impl std::fmt::Display for LexerOutput {
    /// Writes the tokens, followed by the errors if there are any, as fenced
    /// pretty-debug blocks under Markdown headings.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { tokens, errors } = self;

        writeln!(f, "## Tokens")?;
        writeln!(f, "```\n{:#?}\n```", tokens)?;

        if errors.is_empty() {
            return Ok(());
        }

        writeln!(f, "## Errors")?;
        writeln!(f, "```\n{:#?}\n```", errors)
    }
}
/// Lexes `source` from `start_offset` until the end-of-file token and collects every
/// token before it together with the lexer's errors.
fn lex(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
    let mut lexer = Lexer::new(source, mode, start_offset);
    let mut tokens = Vec::new();

    let mut kind = lexer.next_token();
    while !kind.is_eof() {
        tokens.push(Token::new(
            kind,
            lexer.current_range(),
            lexer.current_flags(),
        ));
        kind = lexer.next_token();
    }

    let errors = lexer.finish();
    LexerOutput { tokens, errors }
}
/// Lexes `source` and panics, listing every error and the source, if the lexer
/// reported any error.
#[track_caller]
fn lex_valid(source: &str, mode: Mode, start_offset: TextSize) -> LexerOutput {
    let output = lex(source, mode, start_offset);
    if output.errors.is_empty() {
        return output;
    }

    let mut message = String::from("Unexpected lexical errors for a valid source:\n");
    for error in &output.errors {
        writeln!(&mut message, "{error:?}").unwrap();
    }
    writeln!(&mut message, "Source:\n{source}").unwrap();
    panic!("{message}");
}
/// Lexes `source` from the start and panics if the lexer reported no error.
#[track_caller]
fn lex_invalid(source: &str, mode: Mode) -> LexerOutput {
    let output = lex(source, mode, TextSize::default());
    let has_errors = !output.errors.is_empty();
    assert!(
        has_errors,
        "Expected lexer to generate at least one error for the following source:\n{source}"
    );
    output
}
/// Lexes `source` in module mode from the start, expecting no errors.
#[track_caller]
fn lex_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Module, TextSize::default())
}
/// Lexes `source` in module mode starting at `start_offset`, expecting no errors.
#[track_caller]
fn lex_source_with_offset(source: &str, start_offset: TextSize) -> LexerOutput {
lex_valid(source, Mode::Module, start_offset)
}
/// Lexes `source` in IPython mode from the start, expecting no errors.
#[track_caller]
fn lex_jupyter_source(source: &str) -> LexerOutput {
lex_valid(source, Mode::Ipython, TextSize::default())
}
// A source that starts with a byte-order mark, lexed from the beginning.
#[test]
fn bom() {
let source = "\u{feff}x = 1";
assert_snapshot!(lex_source(source));
}
// A source that starts with a byte-order mark, lexed from byte offset 7 (the `y`).
#[test]
fn bom_with_offset() {
let source = "\u{feff}x + y + z";
assert_snapshot!(lex_source_with_offset(source, TextSize::new(7)));
}
// A source that starts with a byte-order mark, lexed from byte offset 11 (the final `z`).
#[test]
fn bom_with_offset_edge() {
let source = "\u{feff}x + y + z";
assert_snapshot!(lex_source_with_offset(source, TextSize::new(11)));
}
/// Lexes, in IPython mode, an escape command that continues onto a second line using
/// `eol` as the line ending after the backslash.
fn ipython_escape_command_line_continuation_eol(eol: &str) -> LexerOutput {
    lex_jupyter_source(&format!("%matplotlib \\{eol} --inline"))
}
// Escape-command line continuation with `UNIX_EOL` as the line ending.
#[test]
fn test_ipython_escape_command_line_continuation_unix_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(UNIX_EOL));
}
// Escape-command line continuation with `MAC_EOL` as the line ending.
#[test]
fn test_ipython_escape_command_line_continuation_mac_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(MAC_EOL));
}
// Escape-command line continuation with `WINDOWS_EOL` as the line ending.
#[test]
fn test_ipython_escape_command_line_continuation_windows_eol() {
assert_snapshot!(ipython_escape_command_line_continuation_eol(WINDOWS_EOL));
}
/// Lexes, in IPython mode, a `%matplotlib` escape command whose trailing backslash is followed
/// by the line ending `eol` and then immediately by the end of the input.
fn ipython_escape_command_line_continuation_with_eol_and_eof(eol: &str) -> LexerOutput {
    lex_jupyter_source(&format!("%matplotlib \\{eol}"))
}
// Escape-command continuation that runs into end of input, using `UNIX_EOL`.
#[test]
fn test_ipython_escape_command_line_continuation_with_unix_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
UNIX_EOL
));
}
// Escape-command continuation that runs into end of input, using `MAC_EOL`.
#[test]
fn test_ipython_escape_command_line_continuation_with_mac_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
MAC_EOL
));
}
// Escape-command continuation that runs into end of input, using `WINDOWS_EOL`.
#[test]
fn test_ipython_escape_command_line_continuation_with_windows_eol_and_eof() {
assert_snapshot!(ipython_escape_command_line_continuation_with_eol_and_eof(
WINDOWS_EOL
));
}
// Each line consists solely of an escape prefix (`%`, `%%`, `!`, `!!`, `?`, `??`, `/`, `,`,
// `;`) with no command text after it; lexed in IPython mode and expected to be error free.
#[test]
fn test_empty_ipython_escape_command() {
let source = "%\n%%\n!\n!!\n?\n??\n/\n,\n;";
assert_snapshot!(lex_jupyter_source(source));
}
// One line per escape prefix with command text, including commands continued over a
// backslash-newline and commands containing backslashes and quotes. Lexed in IPython mode
// and expected to be error free. The surrounding blank lines are removed by `.trim()`.
#[test]
fn test_ipython_escape_command() {
let source = r"
?foo
??foo
%timeit a = b
%timeit a % 3
%matplotlib \
--inline
!pwd \
&& ls -a | sed 's/^/\\ /'
!!cd /Users/foo/Library/Application\ Support/
/foo 1 2
,foo 1 2
;foo 1 2
!ls
"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
// Escape commands that end in one or more `?`, combined with varying numbers of leading `?`,
// other prefixes (`%`, `%%`, `!`) and backslash continuations. Lexed in IPython mode and
// expected to be error free.
#[test]
fn test_ipython_help_end_escape_command() {
let source = r"
?foo?
?? foo?
?? foo ?
?foo??
??foo??
???foo?
???foo??
??foo???
???foo???
?? \
foo?
?? \
?
????
%foo?
%foo??
%%foo???
!pwd?"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
// A continued `%matplotlib` escape command placed after an `if True:` header, lexed in
// IPython mode and expected to be error free.
#[test]
fn test_ipython_escape_command_indentation() {
let source = r"
if True:
%matplotlib \
--inline"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
// `!` and `%` escape commands used as the right-hand side of a plain `=` assignment,
// including a continued command and help-end (`?`) forms. Lexed in IPython mode and expected
// to be error free.
#[test]
fn test_ipython_escape_command_assignment() {
let source = r"
pwd = !pwd
foo = %timeit a = b
bar = %timeit a % 3
baz = %matplotlib \
inline
qux = %foo?
quux = !pwd?"
.trim();
assert_snapshot!(lex_jupyter_source(source));
}
/// Asserts that none of `tokens` is an `IpyEscapeCommand`.
///
/// Marked `#[track_caller]`, like the other panicking helpers in this module, so that a
/// failure is reported at the calling test rather than inside this helper.
///
/// # Panics
///
/// Panics with the range of the first `IpyEscapeCommand` token encountered.
#[track_caller]
fn assert_no_ipython_escape_command(tokens: &[Token]) {
    for token in tokens {
        assert!(
            !matches!(token.kind(), TokenKind::IpyEscapeCommand),
            "Unexpected escape command token at {:?}",
            token.range()
        );
    }
}
// Contexts in which an escape character after `=`-like syntax must NOT start an escape
// command: the `/`, `;` and `,` prefixes after `=`, and `%` after `==`, `:=` or a default
// argument `=` inside brackets. Uses `lex` directly (IPython mode) and checks both that no
// error was reported and that no `IpyEscapeCommand` token was produced.
#[test]
fn test_ipython_escape_command_not_an_assignment() {
let source = r"
# Other escape kinds are not valid here (can't test `foo = ?str` because '?' is not a valid token)
foo = /func
foo = ;func
foo = ,func
(foo == %timeit a = b)
(foo := %timeit a = b)
def f(arg=%timeit a = b):
pass"
.trim();
let output = lex(source, Mode::Ipython, TextSize::default());
assert!(output.errors.is_empty());
assert_no_ipython_escape_command(&output.tokens);
}
// Numeric literals: hex, octal, binary, plain and underscore-separated decimals, floats,
// exponents, imaginary numbers, `000`, and hex values of 16 and 32 digits. All must lex
// without errors.
#[test]
fn test_numbers() {
let source = "0x2f 0o12 0b1101 0 123 123_45_67_890 0.2 1e+2 2.1e3 2j 2.2j 000 0x995DC9BBDF1939FA 0x995DC9BBDF1939FA995DC9BBDF1939FA";
assert_snapshot!(lex_source(source));
}
// A short decimal literal with a leading zero (`025`) must produce at least one lexical error.
#[test]
fn test_invalid_leading_zero_small() {
let source = "025";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
// A very long decimal literal with a leading zero must produce at least one lexical error.
#[test]
fn test_invalid_leading_zero_big() {
let source =
"0252222222222222522222222222225222222222222252222222222222522222222222225222222222222";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
// A number followed by a `#` comment that has text; must lex without errors.
#[test]
fn test_line_comment_long() {
let source = "99232 # foo".to_string();
assert_snapshot!(lex_source(&source));
}
// A number followed by a `#` comment that contains only whitespace; must lex without errors.
#[test]
fn test_line_comment_whitespace() {
let source = "99232 # ".to_string();
assert_snapshot!(lex_source(&source));
}
// A number followed by a `#` comment consisting of a single space; must lex without errors.
#[test]
fn test_line_comment_single_whitespace() {
let source = "99232 # ".to_string();
assert_snapshot!(lex_source(&source));
}
// A number followed by a bare `#` with nothing after it; must lex without errors.
#[test]
fn test_line_comment_empty() {
let source = "99232 #".to_string();
assert_snapshot!(lex_source(&source));
}
/// Lexes a number, a trailing `# Foo` comment, the line ending `eol`, and a second number on
/// the following line.
fn comment_until_eol(eol: &str) -> LexerOutput {
    lex_source(&format!("123 # Foo{eol}456"))
}
// Comment terminated by `UNIX_EOL`.
#[test]
fn test_comment_until_unix_eol() {
assert_snapshot!(comment_until_eol(UNIX_EOL));
}
// Comment terminated by `MAC_EOL`.
#[test]
fn test_comment_until_mac_eol() {
assert_snapshot!(comment_until_eol(MAC_EOL));
}
// Comment terminated by `WINDOWS_EOL`.
#[test]
fn test_comment_until_windows_eol() {
assert_snapshot!(comment_until_eol(WINDOWS_EOL));
}
// A simple assignment whose right-hand side mixes spaced and unspaced `+`/`-` operators.
#[test]
fn test_assignment() {
let source = r"a_variable = 99 + 2-0";
assert_snapshot!(lex_source(source));
}
/// Lexes a one-statement function definition whose lines are separated by `eol` and which
/// ends with an extra empty line.
fn indentation_with_eol(eol: &str) -> LexerOutput {
    lex_source(&format!("def foo():{eol} return 99{eol}{eol}"))
}
// Function-body indentation with `UNIX_EOL` line endings.
#[test]
fn test_indentation_with_unix_eol() {
assert_snapshot!(indentation_with_eol(UNIX_EOL));
}
// Function-body indentation with `MAC_EOL` line endings.
#[test]
fn test_indentation_with_mac_eol() {
assert_snapshot!(indentation_with_eol(MAC_EOL));
}
// Function-body indentation with `WINDOWS_EOL` line endings.
#[test]
fn test_indentation_with_windows_eol() {
assert_snapshot!(indentation_with_eol(WINDOWS_EOL));
}
/// Lexes a function containing an `if` block (with an empty line before its body), using `eol`
/// as the line ending throughout and finishing with an extra empty line.
fn double_dedent_with_eol(eol: &str) -> LexerOutput {
    lex_source(&format!(
        "def foo():{eol} if x:{eol}{eol} return 99{eol}{eol}"
    ))
}
// Nested-block dedent case with `UNIX_EOL` line endings.
#[test]
fn test_double_dedent_with_unix_eol() {
assert_snapshot!(double_dedent_with_eol(UNIX_EOL));
}
// Nested-block dedent case with `MAC_EOL` line endings.
#[test]
fn test_double_dedent_with_mac_eol() {
assert_snapshot!(double_dedent_with_eol(MAC_EOL));
}
// Nested-block dedent case with `WINDOWS_EOL` line endings.
#[test]
fn test_double_dedent_with_windows_eol() {
assert_snapshot!(double_dedent_with_eol(WINDOWS_EOL));
}
/// Same shape as `double_dedent_with_eol`, but the nested lines are indented with tab
/// characters (the innermost one with tabs followed by a space).
fn double_dedent_with_tabs_eol(eol: &str) -> LexerOutput {
    lex_source(&format!(
        "def foo():{eol}\tif x:{eol}{eol}\t\t return 99{eol}{eol}"
    ))
}
// Tab-indented nested-block dedent case with `UNIX_EOL` line endings.
#[test]
fn test_double_dedent_with_tabs_unix_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(UNIX_EOL));
}
// Tab-indented nested-block dedent case with `MAC_EOL` line endings.
#[test]
fn test_double_dedent_with_tabs_mac_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(MAC_EOL));
}
// Tab-indented nested-block dedent case with `WINDOWS_EOL` line endings.
#[test]
fn test_double_dedent_with_tabs_windows_eol() {
assert_snapshot!(double_dedent_with_tabs_eol(WINDOWS_EOL));
}
// Two nested `if` headers, a `pass`, and a following `foo` statement; must lex without
// errors. The leading `\` right after the opening quote suppresses the first newline of the
// literal.
#[test]
fn dedent_after_whitespace() {
let source = "\
if first:
if second:
pass
foo
";
assert_snapshot!(lex_source(source));
}
/// Lexes a list literal that spans several lines and nests a tuple and a set, including a
/// backslash continuation inside the brackets. Every `\n` of the template is replaced by `eol`
/// before lexing.
fn newline_in_brackets_eol(eol: &str) -> LexerOutput {
let source = r"x = [
1,2
,(3,
4,
), {
5,
6,\
7}]
"
.replace('\n', eol);
lex_source(&source)
}
// Newlines inside brackets with `UNIX_EOL` line endings.
#[test]
fn test_newline_in_brackets_unix_eol() {
assert_snapshot!(newline_in_brackets_eol(UNIX_EOL));
}
// Newlines inside brackets with `MAC_EOL` line endings.
#[test]
fn test_newline_in_brackets_mac_eol() {
assert_snapshot!(newline_in_brackets_eol(MAC_EOL));
}
// Newlines inside brackets with `WINDOWS_EOL` line endings.
#[test]
fn test_newline_in_brackets_windows_eol() {
assert_snapshot!(newline_in_brackets_eol(WINDOWS_EOL));
}
// Adjacent string literals on separate lines inside parentheses, one of them followed by a
// backslash continuation; must lex without errors.
#[test]
fn test_non_logical_newline_in_string_continuation() {
let source = r"(
'a'
'b'
'c' \
'd'
)";
assert_snapshot!(lex_source(source));
}
// Two consecutive comment-only lines; must lex without errors.
#[test]
fn test_logical_newline_line_comment() {
let source = "#Hello\n#World\n";
assert_snapshot!(lex_source(source));
}
// A run of slashes with one `=` in it, followed by a separated `/`; the snapshot records how
// the run is split into operator tokens.
#[test]
fn test_operators() {
let source = "//////=/ /";
assert_snapshot!(lex_source(source));
}
// String literals in both quote styles with escaped quotes, escaped backslashes, control
// escapes, an unknown escape (`\g`), a raw string ending in `\'`, and octal-style escapes.
#[test]
fn test_string() {
let source = r#""double" 'single' 'can\'t' "\\\"" '\t\r\n' '\g' r'raw\'' '\420' '\200\0a'"#;
assert_snapshot!(lex_source(source));
}
/// Lexes a double-quoted string whose content is split by a backslash followed by `eol`.
fn string_continuation_with_eol(eol: &str) -> LexerOutput {
    lex_source(&format!("\"abc\\{eol}def\""))
}
// In-string line continuation with `UNIX_EOL`.
#[test]
fn test_string_continuation_with_unix_eol() {
assert_snapshot!(string_continuation_with_eol(UNIX_EOL));
}
// In-string line continuation with `MAC_EOL`.
#[test]
fn test_string_continuation_with_mac_eol() {
assert_snapshot!(string_continuation_with_eol(MAC_EOL));
}
// In-string line continuation with `WINDOWS_EOL`.
#[test]
fn test_string_continuation_with_windows_eol() {
assert_snapshot!(string_continuation_with_eol(WINDOWS_EOL));
}
// A plain string containing a `\N{...}` named-character escape; must lex without errors.
#[test]
fn test_escape_unicode_name() {
let source = r#""\N{EN SPACE}""#;
assert_snapshot!(lex_source(source));
}
// Walks the token stream by hand and records, for every `Name` token, whether the lexer set
// the non-ASCII-name flag. The input is expected to yield exactly one lexical error and four
// names, of which only the two containing `β` carry the flag.
#[test]
fn test_non_ascii_name_flag() {
    let mut lexer = Lexer::new("a€\naβ = β\nascii", Mode::Module, TextSize::default());
    let mut name_flags = Vec::new();
    let mut kind = lexer.next_token();
    while !kind.is_eof() {
        if kind == TokenKind::Name {
            let is_non_ascii = lexer.current_flags().is_non_ascii_name();
            name_flags.push(is_non_ascii);
        }
        kind = lexer.next_token();
    }
    let errors = lexer.finish();
    assert_eq!(errors.len(), 1);
    assert_eq!(name_flags, [false, true, true, false]);
}
/// Lexes a triple-double-quoted string whose content spans lines separated by `eol`.
fn triple_quoted_eol(eol: &str) -> LexerOutput {
    lex_source(&format!("\"\"\"{eol} test string{eol} \"\"\""))
}
// Multi-line triple-quoted string with `UNIX_EOL`.
#[test]
fn test_triple_quoted_unix_eol() {
assert_snapshot!(triple_quoted_eol(UNIX_EOL));
}
// Multi-line triple-quoted string with `MAC_EOL`.
#[test]
fn test_triple_quoted_mac_eol() {
assert_snapshot!(triple_quoted_eol(MAC_EOL));
}
// Multi-line triple-quoted string with `WINDOWS_EOL`.
#[test]
fn test_triple_quoted_windows_eol() {
assert_snapshot!(triple_quoted_eol(WINDOWS_EOL));
}
/// Lexes an input made of nothing but a backslash followed by `eol`, and requires the lexer
/// to report at least one error for it.
fn line_continuation_at_eof_after_newline(eol: &str) -> LexerOutput {
    lex_invalid(&format!(r"\{eol}"), Mode::Module)
}
// Lone continuation at end of input with `UNIX_EOL`.
#[test]
fn test_line_continuation_at_eof_after_newline_unix_eol() {
assert_snapshot!(line_continuation_at_eof_after_newline(UNIX_EOL));
}
// Lone continuation at end of input with `MAC_EOL`.
#[test]
fn test_line_continuation_at_eof_after_newline_mac_eol() {
assert_snapshot!(line_continuation_at_eof_after_newline(MAC_EOL));
}
// Lone continuation at end of input with `WINDOWS_EOL`.
#[test]
fn test_line_continuation_at_eof_after_newline_windows_eol() {
assert_snapshot!(line_continuation_at_eof_after_newline(WINDOWS_EOL));
}
/// Lexes `1, ` followed by a backslash and `eol` with nothing after it, and requires the
/// lexer to report at least one error for it.
fn line_continuation_at_eof(eol: &str) -> LexerOutput {
    lex_invalid(&format!(r"1, \{eol}"), Mode::Module)
}
// Continuation after tokens at end of input with `UNIX_EOL`.
#[test]
fn test_line_continuation_at_eof_unix_eol() {
assert_snapshot!(line_continuation_at_eof(UNIX_EOL));
}
// Continuation after tokens at end of input with `MAC_EOL`.
#[test]
fn test_line_continuation_at_eof_mac_eol() {
assert_snapshot!(line_continuation_at_eof(MAC_EOL));
}
// Continuation after tokens at end of input with `WINDOWS_EOL`.
#[test]
fn test_line_continuation_at_eof_windows_eol() {
assert_snapshot!(line_continuation_at_eof(WINDOWS_EOL));
}
// An unclosed bracket must make the lexer terminate with at least one error. No snapshot is
// taken; judging by the name, the point is that lexing finishes at all.
#[test]
fn test_infinite_loop() {
    lex_invalid("[1", Mode::Module);
}
// An emoji on its own is not accepted by the lexer: at least one error is required.
#[test]
fn test_emoji_identifier() {
let source = "🐦";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
// An `if` block followed by a second `pass` line; the lexer must report at least one error.
// NOTE(review): the name looks like a typo for `test_too_low_dedent`. Presumably the stored
// snapshot is keyed on the function name, so renaming it would also require renaming the
// snapshot — confirm before changing.
#[test]
fn tet_too_low_dedent() {
let source = "if True:
pass
pass";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
// Empty f-strings in single, double and triple quotes (lower- and upper-case prefix),
// interleaved with empty plain strings.
#[test]
fn test_empty_fstrings() {
let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
assert_snapshot!(lex_source(source));
}
// Every case and order combination of the `f` and `r` prefixes on an empty string.
#[test]
fn test_fstring_prefix() {
let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
assert_snapshot!(lex_source(source));
}
// Literal text mixed with replacement fields, doubled-brace escapes, and a field wrapped in
// doubled braces (`{{{three}}}`).
#[test]
fn test_fstring() {
let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
assert_snapshot!(lex_source(source));
}
// Empty replacement fields and doubled braces at several nesting depths.
#[test]
fn test_fstring_parentheses() {
let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
assert_snapshot!(lex_source(source));
}
/// Lexes a single-quoted f-string whose text is split by a backslash followed by `eol`.
fn fstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
    lex_source(&format!(r"f'text \{eol} more text'"))
}
// Line continuation inside a single-quoted f-string with `UNIX_EOL`.
#[test]
fn test_fstring_single_quote_escape_unix_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(UNIX_EOL));
}
// Line continuation inside a single-quoted f-string with `MAC_EOL`.
#[test]
fn test_fstring_single_quote_escape_mac_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(MAC_EOL));
}
// Line continuation inside a single-quoted f-string with `WINDOWS_EOL`.
#[test]
fn test_fstring_single_quote_escape_windows_eol() {
assert_snapshot!(fstring_single_quote_escape_eol(WINDOWS_EOL));
}
// Backslashes in front of braces and quotes inside an f-string and its format spec, plus a
// backslash-newline continuation in the literal part.
#[test]
fn test_fstring_escape() {
let source = r#"f"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
// One or two backslashes directly before single and doubled opening braces.
#[test]
fn test_fstring_escape_braces() {
let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
assert_snapshot!(lex_source(source));
}
// Same input as `test_fstring_escape`, but with a raw (`rf`) prefix.
#[test]
fn test_fstring_escape_raw() {
let source = r#"rf"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
// A `\N{...}` named escape in an f-string, followed by `\N` sequences that are not followed
// by a brace (`\Nope` and a trailing `\N`).
#[test]
fn test_fstring_named_unicode() {
let source = r#"f"\N{BULLET} normal \Nope \N""#;
assert_snapshot!(lex_source(source));
}
// `\N{...}` inside a raw (`rf`) f-string.
#[test]
fn test_fstring_named_unicode_raw() {
let source = r#"rf"\N{BULLET} normal""#;
assert_snapshot!(lex_source(source));
}
// `:=` inside replacement fields: bare, parenthesized, inside a nested field, and inside
// square brackets.
#[test]
fn test_fstring_with_named_expression() {
let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
assert_snapshot!(lex_source(source));
}
// Format specs: empty, after `=` and a conversion, with nested replacement fields, and with
// doubled braces inside a nested field.
#[test]
fn test_fstring_with_format_spec() {
let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
assert_snapshot!(lex_source(source));
}
// Triple-quoted f-strings whose replacement field and format spec span several lines; must
// lex without errors.
#[test]
fn test_fstring_with_multiline_format_spec() {
let source = r"f'''__{
x:d
}__'''
f'''__{
x:a
b
c
}__'''
";
assert_snapshot!(lex_source(source));
}
// Single-quoted counterpart of the multi-line format-spec test; here the lexer must report
// at least one error.
#[test]
fn test_fstring_newline_format_spec() {
let source = r"
f'__{
x:d
}__'
f'__{
x:a
b
}__'
";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
// Conversion flags (`!s`, `!r`): plain, after `=`, after a format spec, and inside doubled
// braces.
#[test]
fn test_fstring_conversion() {
let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
assert_snapshot!(lex_source(source));
}
// f-strings nested inside replacement fields, both reusing the outer quote character and
// switching quote styles.
#[test]
fn test_fstring_nested() {
let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
assert_snapshot!(lex_source(source));
}
// A single-line-quoted f-string whose replacement-field expression spans several lines;
// must lex without errors.
#[test]
fn test_fstring_expression_multiline() {
let source = r#"f"first {
x
*
y
} second""#;
assert_snapshot!(lex_source(source));
}
// Triple-quoted f-strings with multi-line text, plus a triple-quoted f-string nested inside
// a single-line-quoted one.
#[test]
fn test_fstring_multiline() {
let source = r#"f"""
hello
world
""" f'''
world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
assert_snapshot!(lex_source(source));
}
// `#` inside a triple-quoted f-string: the literal text of the input itself marks which
// occurrences are expected to be comments (inside the replacement field) and which are not.
#[test]
fn test_fstring_comments() {
let source = r#"f"""
# not a comment { # comment {
x
} # not a comment
""""#;
assert_snapshot!(lex_source(source));
}
// `!pwd` inside an f-string replacement field must lex as ordinary tokens, not as an
// escape command. The source is lexed in IPython mode, the mode used by the other
// escape-command tests in this module; lexing it in module mode would not exercise the
// escape-command handling this test is named for.
#[test]
fn test_fstring_with_ipy_escape_command() {
    let source = r#"f"foo {!pwd} bar""#;
    assert_snapshot!(lex_jupyter_source(source));
}
// A `lambda` inside a replacement field, once bare and once parenthesized; the `:` of the
// lambda competes with the format-spec separator.
#[test]
fn test_fstring_with_lambda_expression() {
let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
.trim();
assert_snapshot!(lex_source(source));
}
// An f-string containing the two-character escape `\0`; must lex without errors.
#[test]
fn test_fstring_with_nul_char() {
let source = r"f'\0'";
assert_snapshot!(lex_source(source));
}
// Empty t-strings in single, double and triple quotes, interleaved with empty plain strings.
// NOTE(review): unlike `test_empty_fstrings` (which includes `F""`), no upper-case `T""`
// appears here — `t""` is listed twice. Confirm whether that is intended; changing the
// literal would require regenerating the snapshot.
#[test]
fn test_empty_tstrings() {
let source = r#"t"" "" t"" t'' '' t"""""" t''''''"#;
assert_snapshot!(lex_source(source));
}
// Combinations of the `t` and `r` prefixes on an empty string.
// NOTE(review): compared with `test_fstring_prefix`, the variants `T""`, `rT""` and `RT""`
// are missing (`t""`, `rt""` and `Rt""` each appear twice instead). Confirm whether
// upper-case `T` after nothing/`r`/`R` should be covered; changing the literal would require
// regenerating the snapshot.
#[test]
fn test_tstring_prefix() {
let source = r#"t"" t"" rt"" rt"" Rt"" Rt"" tr"" Tr"" tR"" TR"""#;
assert_snapshot!(lex_source(source));
}
// Literal text mixed with replacement fields, doubled-brace escapes, and a field wrapped in
// doubled braces (`{{{three}}}`), in a t-string.
#[test]
fn test_tstring() {
let source = r#"t"normal {foo} {{another}} {bar} {{{three}}}""#;
assert_snapshot!(lex_source(source));
}
// Empty replacement fields and doubled braces at several nesting depths, in t-strings.
#[test]
fn test_tstring_parentheses() {
let source = r#"t"{}" t"{{}}" t" {}" t"{{{}}}" t"{{{{}}}}" t" {} {{}} {{{}}} {{{{}}}} ""#;
assert_snapshot!(lex_source(source));
}
/// Lexes a single-quoted t-string whose text is split by a backslash followed by `eol`.
fn tstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
    lex_source(&format!(r"t'text \{eol} more text'"))
}
// Line continuation inside a single-quoted t-string with `UNIX_EOL`.
#[test]
fn test_tstring_single_quote_escape_unix_eol() {
assert_snapshot!(tstring_single_quote_escape_eol(UNIX_EOL));
}
// Line continuation inside a single-quoted t-string with `MAC_EOL`.
#[test]
fn test_tstring_single_quote_escape_mac_eol() {
assert_snapshot!(tstring_single_quote_escape_eol(MAC_EOL));
}
// Line continuation inside a single-quoted t-string with `WINDOWS_EOL`.
#[test]
fn test_tstring_single_quote_escape_windows_eol() {
assert_snapshot!(tstring_single_quote_escape_eol(WINDOWS_EOL));
}
// Backslashes in front of braces and quotes inside a t-string and its format spec, plus a
// backslash-newline continuation in the literal part.
#[test]
fn test_tstring_escape() {
let source = r#"t"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
// One or two backslashes directly before single and doubled opening braces, in t-strings.
#[test]
fn test_tstring_escape_braces() {
let source = r"t'\{foo}' t'\\{foo}' t'\{{foo}}' t'\\{{foo}}'";
assert_snapshot!(lex_source(source));
}
// Same input as `test_tstring_escape`, but with a raw (`rt`) prefix.
#[test]
fn test_tstring_escape_raw() {
let source = r#"rt"\{x:\"\{x}} \"\"\
end""#;
assert_snapshot!(lex_source(source));
}
// A `\N{...}` named escape in a t-string, followed by `\N` sequences that are not followed
// by a brace (`\Nope` and a trailing `\N`).
#[test]
fn test_tstring_named_unicode() {
let source = r#"t"\N{BULLET} normal \Nope \N""#;
assert_snapshot!(lex_source(source));
}
// `\N{...}` inside a raw (`rt`) t-string.
#[test]
fn test_tstring_named_unicode_raw() {
let source = r#"rt"\N{BULLET} normal""#;
assert_snapshot!(lex_source(source));
}
// `:=` inside t-string replacement fields: bare, parenthesized, inside a nested field, and
// inside square brackets.
#[test]
fn test_tstring_with_named_expression() {
let source = r#"t"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
assert_snapshot!(lex_source(source));
}
// t-string format specs: empty, after `=` and a conversion, with nested replacement fields,
// and with doubled braces inside a nested field.
#[test]
fn test_tstring_with_format_spec() {
let source = r#"t"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
assert_snapshot!(lex_source(source));
}
// Triple-quoted t-strings whose replacement field and format spec span several lines; must
// lex without errors.
#[test]
fn test_tstring_with_multiline_format_spec() {
let source = r"t'''__{
x:d
}__'''
t'''__{
x:a
b
c
}__'''
";
assert_snapshot!(lex_source(source));
}
// Single-quoted counterpart of the multi-line t-string format-spec test; here the lexer
// must report at least one error.
#[test]
fn test_tstring_newline_format_spec() {
let source = r"
t'__{
x:d
}__'
t'__{
x:a
b
}__'
";
assert_snapshot!(lex_invalid(source, Mode::Module));
}
// t-string conversion flags (`!s`, `!r`): plain, after `=`, after a format spec, and inside
// doubled braces.
#[test]
fn test_tstring_conversion() {
let source = r#"t"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
assert_snapshot!(lex_source(source));
}
// t-strings nested inside replacement fields, both reusing the outer quote character and
// switching quote styles.
#[test]
fn test_tstring_nested() {
let source = r#"t"foo {t"bar {x + t"{wow}"}"} baz" t'foo {t'bar'} some {t"another"}'"#;
assert_snapshot!(lex_source(source));
}
// A single-line-quoted t-string whose replacement-field expression spans several lines;
// must lex without errors.
#[test]
fn test_tstring_expression_multiline() {
let source = r#"t"first {
x
*
y
} second""#;
assert_snapshot!(lex_source(source));
}
// Triple-quoted t-strings with multi-line text, plus a triple-quoted t-string nested inside
// a single-line-quoted one.
#[test]
fn test_tstring_multiline() {
let source = r#"t"""
hello
world
""" t'''
world
hello
''' t"some {t"""multiline
allowed {x}"""} string""#;
assert_snapshot!(lex_source(source));
}
// `#` inside a triple-quoted t-string: the literal text of the input itself marks which
// occurrences are expected to be comments (inside the replacement field) and which are not.
#[test]
fn test_tstring_comments() {
let source = r#"t"""
# not a comment { # comment {
x
} # not a comment
""""#;
assert_snapshot!(lex_source(source));
}
// `!pwd` inside a t-string replacement field must lex as ordinary tokens, not as an
// escape command. The source is lexed in IPython mode, the mode used by the other
// escape-command tests in this module; lexing it in module mode would not exercise the
// escape-command handling this test is named for.
#[test]
fn test_tstring_with_ipy_escape_command() {
    let source = r#"t"foo {!pwd} bar""#;
    assert_snapshot!(lex_jupyter_source(source));
}
// A `lambda` inside a t-string replacement field, once bare and once parenthesized; the `:`
// of the lambda competes with the format-spec separator.
#[test]
fn test_tstring_with_lambda_expression() {
let source = r#"
t"{lambda x:{x}}"
t"{(lambda x:{x})}"
"#
.trim();
assert_snapshot!(lex_source(source));
}
// A t-string containing the two-character escape `\0`; must lex without errors.
#[test]
fn test_tstring_with_nul_char() {
let source = r"t'\0'";
assert_snapshot!(lex_source(source));
}
// f-strings nested inside t-strings and vice versa, including a conversion flag applied to a
// nested t-string.
#[test]
fn test_nested_t_and_fstring() {
let source = r#"t"foo {f"bar {x + t"{wow}"}"} baz" f'foo {t'bar'!r} some {f"another"}'"#;
assert_snapshot!(lex_source(source));
}
// A `match`/`case` statement lexed in IPython mode; must lex without errors.
#[test]
fn test_match_softkeyword_in_notebook() {
let source = r"match foo:
case bar:
pass";
assert_snapshot!(lex_jupyter_source(source));
}
/// Lexes `source` in module mode and returns the payload of the first reported error, which
/// must be an `FStringError`.
///
/// Marked `#[track_caller]`, like the other panicking helpers in this module, so that a
/// failure points at the specific call in the test rather than at this helper; the helper is
/// called many times per test and its panic messages do not name the offending source.
///
/// # Panics
///
/// Panics when the lexer reported no error, or when the first error is of another kind.
#[track_caller]
fn lex_fstring_error(source: &str) -> InterpolatedStringErrorType {
    let output = lex(source, Mode::Module, TextSize::default());
    let first_error = output
        .errors
        .into_iter()
        .next()
        .expect("lexer should give at least one error");
    match first_error.into_error() {
        LexicalErrorType::FStringError(error) => error,
        err => panic!("Expected FStringError: {err:?}"),
    }
}
// Checks the kind of the first error reported for malformed f-strings: stray or unbalanced
// `}` (including after escapes and inside format specs) must give `SingleRbrace`, an
// unterminated single-line-quoted f-string `UnterminatedString`, and an unterminated
// triple-quoted one (with 3 to 6 quote characters in total) `UnterminatedTripleQuotedString`.
#[test]
fn test_fstring_error() {
use InterpolatedStringErrorType::{
SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
};
assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
assert_eq!(lex_fstring_error(r"f'"), UnterminatedString);
assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
assert_eq!(lex_fstring_error(r"f'''"), UnterminatedTripleQuotedString);
assert_eq!(
lex_fstring_error(r#"f"""""#),
UnterminatedTripleQuotedString
);
assert_eq!(
lex_fstring_error(r#"f""""""#),
UnterminatedTripleQuotedString
);
}
/// Lexes `source` in module mode and returns the payload of the first reported error, which
/// must be a `TStringError`.
///
/// Marked `#[track_caller]`, like the other panicking helpers in this module, so that a
/// failure points at the specific call in the test rather than at this helper; the helper is
/// called many times per test and its panic messages do not name the offending source.
///
/// # Panics
///
/// Panics when the lexer reported no error, or when the first error is of another kind.
#[track_caller]
fn lex_tstring_error(source: &str) -> InterpolatedStringErrorType {
    let output = lex(source, Mode::Module, TextSize::default());
    let first_error = output
        .errors
        .into_iter()
        .next()
        .expect("lexer should give at least one error");
    match first_error.into_error() {
        LexicalErrorType::TStringError(error) => error,
        err => panic!("Expected TStringError: {err:?}"),
    }
}
// An f-string that is never closed. The inline snapshot pins both the tokens emitted before
// the end of input (start, middle, and a zero-width `Newline`) and the single
// `UnterminatedString` error covering the string's content.
#[test]
fn lex_fstring_unclosed() {
let source = r#"f"hello"#;
assert_snapshot!(lex_invalid(source, Mode::Module), @"
## Tokens
```
[
FStringStart 0..2 (flags = DOUBLE_QUOTES | F_STRING),
FStringMiddle 2..7 (flags = DOUBLE_QUOTES | F_STRING),
Newline 7..7,
]
```
## Errors
```
[
LexicalError {
error: FStringError(
UnterminatedString,
),
location: 2..7,
},
]
```
");
}
// An f-string whose replacement field is opened but never closed. Per the inline snapshot,
// the would-be closing quote is lexed as the start of an unclosed plain string inside the
// field, so two errors are reported: the unclosed string and the unterminated f-string.
#[test]
fn lex_fstring_missing_brace() {
let source = "f'{'";
assert_snapshot!(lex_invalid(source, Mode::Module), @"
## Tokens
```
[
FStringStart 0..2 (flags = F_STRING),
Lbrace 2..3,
String 3..4 (flags = UNCLOSED_STRING),
Newline 4..4,
]
```
## Errors
```
[
LexicalError {
error: UnclosedStringError,
location: 3..4,
},
LexicalError {
error: FStringError(
UnterminatedString,
),
location: 4..4,
},
]
```
");
}
// A replacement field left open after `!r`. Per the inline snapshot, the `r` and the
// would-be closing quote are lexed together as the start of an unclosed raw string, so two
// errors are reported: the unclosed string and the unterminated f-string.
#[test]
fn lex_fstring_missing_brace_after_format_spec() {
let source = r#"f"{foo!r""#;
assert_snapshot!(lex_invalid(source, Mode::Module), @"
## Tokens
```
[
FStringStart 0..2 (flags = DOUBLE_QUOTES | F_STRING),
Lbrace 2..3,
Name 3..6,
Exclamation 6..7,
String 7..9 (flags = DOUBLE_QUOTES | RAW_STRING_LOWERCASE | UNCLOSED_STRING),
Newline 9..9,
]
```
## Errors
```
[
LexicalError {
error: UnclosedStringError,
location: 7..9,
},
LexicalError {
error: FStringError(
UnterminatedString,
),
location: 9..9,
},
]
```
");
}
// Checks the kind of the first error reported for malformed t-strings: stray or unbalanced
// `}` (including after escapes and inside format specs) must give `SingleRbrace`, an
// unterminated single-line-quoted t-string `UnterminatedString`, and an unterminated
// triple-quoted one (with 3 to 6 quote characters in total) `UnterminatedTripleQuotedString`.
#[test]
fn test_tstring_error() {
use InterpolatedStringErrorType::{
SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
};
assert_eq!(lex_tstring_error("t'}'"), SingleRbrace);
assert_eq!(lex_tstring_error("t'{{}'"), SingleRbrace);
assert_eq!(lex_tstring_error("t'{{}}}'"), SingleRbrace);
assert_eq!(lex_tstring_error("t'foo}'"), SingleRbrace);
assert_eq!(lex_tstring_error(r"t'\u007b}'"), SingleRbrace);
assert_eq!(lex_tstring_error("t'{a:b}}'"), SingleRbrace);
assert_eq!(lex_tstring_error("t'{3:}}>10}'"), SingleRbrace);
assert_eq!(lex_tstring_error(r"t'\{foo}\}'"), SingleRbrace);
assert_eq!(lex_tstring_error(r#"t""#), UnterminatedString);
assert_eq!(lex_tstring_error(r"t'"), UnterminatedString);
assert_eq!(lex_tstring_error(r#"t""""#), UnterminatedTripleQuotedString);
assert_eq!(lex_tstring_error(r"t'''"), UnterminatedTripleQuotedString);
assert_eq!(
lex_tstring_error(r#"t"""""#),
UnterminatedTripleQuotedString
);
assert_eq!(
lex_tstring_error(r#"t""""""#),
UnterminatedTripleQuotedString
);
}
// Lines consisting only of a backslash continuation placed between statements of an `if`
// body, followed by an `else` branch; must lex without errors.
#[test]
fn backslash_continuation_indentation() {
let source = r"if True:
1
\
2
\
3
else:
pass
"
.to_string();
assert_snapshot!(lex_source(&source));
}
// Backslash continuations directly after a block header (on their own line and at the end of
// the `else:` line); must lex without errors.
#[test]
fn backslash_continuation_at_root() {
let source = r"if True:
\
1
if True:
\
2
else:\
3
"
.to_string();
assert_snapshot!(lex_source(&source));
}
// Several consecutive backslash-only lines between two statements of an `if` body; must lex
// without errors.
#[test]
fn multiple_backslash_continuation() {
let source = r"if True:
1
\
\
\
\
2
"
.to_string();
assert_snapshot!(lex_source(&source));
}
// A backslash continuation followed by a statement whose indentation, judging by the test
// name, does not match the enclosing block; the lexer must report at least one error.
#[test]
fn backslash_continuation_mismatch_indentation() {
let source = r"if True:
1
\
2
"
.to_string();
assert_snapshot!(lex_invalid(&source, Mode::Module));
}
}