use unicode_ident::{is_xid_continue, is_xid_start};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use crate::{Cursor, is_python_whitespace};
/// Returns the first token after `offset` that is not trivia (whitespace,
/// newline, comment, or continuation), or `None` if only trivia follows.
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
    let tokenizer = SimpleTokenizer::starts_at(offset, code);
    let mut significant = tokenizer.skip_trivia();
    significant.next()
}
/// Returns the only non-trivia, non-parenthesis token in `range`.
///
/// Skips any closing parentheses before the token and, in debug builds,
/// asserts that the found token has kind `token_kind` and that nothing but
/// opening parentheses follows it within the range.
pub fn find_only_token_in_range(
    range: TextRange,
    token_kind: SimpleTokenKind,
    code: &str,
) -> SimpleToken {
    let mut significant = SimpleTokenizer::new(code, range)
        .skip_trivia()
        .skip_while(|tok| matches!(tok.kind, SimpleTokenKind::RParen));
    let found = significant.next().expect("Expected a token");
    debug_assert_eq!(found.kind(), token_kind);
    let mut trailing = significant.skip_while(|tok| matches!(tok.kind, SimpleTokenKind::LParen));
    #[expect(clippy::debug_assert_with_mut_call)]
    {
        debug_assert_eq!(trailing.next(), None);
    }
    found
}
/// Counts the newlines between `offset` and the closest preceding
/// non-whitespace character, scanning backwards from `offset`.
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
    let head = &code[TextRange::up_to(offset)];
    let mut cursor = Cursor::new(head);
    let mut count = 0u32;
    loop {
        match cursor.bump_back() {
            Some('\n') => {
                // A `\r\n` pair counts as a single newline.
                cursor.eat_char_back('\r');
                count += 1;
            }
            Some('\r') => count += 1,
            // Skip over any other Python whitespace.
            Some(c) if is_python_whitespace(c) => {}
            // Non-whitespace character or start of the source: stop.
            _ => break,
        }
    }
    count
}
/// Counts the newlines between `offset` and the closest following
/// non-whitespace character, scanning forwards from `offset`.
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
    let tail = &code[offset.to_usize()..];
    let mut cursor = Cursor::new(tail);
    let mut count = 0u32;
    loop {
        match cursor.bump() {
            Some('\n') => count += 1,
            Some('\r') => {
                // A `\r\n` pair counts as a single newline.
                cursor.eat_char('\n');
                count += 1;
            }
            // Skip over any other Python whitespace.
            Some(c) if is_python_whitespace(c) => {}
            // Non-whitespace character or end of the source: stop.
            _ => break,
        }
    }
    count
}
/// Counts the newlines after `offset`, stopping at the first non-trivia
/// token.
///
/// Encountering a comment resets the count to zero, so only the newlines
/// after the last comment contribute to the result.
pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
    let mut count = 0u32;
    let tokenizer = SimpleTokenizer::starts_at(offset, code);
    for token in tokenizer {
        match token.kind() {
            SimpleTokenKind::Newline => count += 1,
            SimpleTokenKind::Comment => count = 0,
            SimpleTokenKind::Whitespace => {}
            _ => break,
        }
    }
    count
}
/// Counts the newlines after `offset`, ignoring any trivia (comments,
/// whitespace, continuations) that still sits on the line `offset` is on.
pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
    let mut count = 0u32;
    // While `true`, we are still consuming trailing trivia on the current
    // line; the first newline ends that phase.
    let mut skipping_line_trailer = true;
    for token in SimpleTokenizer::starts_at(offset, code) {
        match token.kind {
            SimpleTokenKind::Newline => {
                skipping_line_trailer = false;
                count += 1;
            }
            SimpleTokenKind::Whitespace => {}
            kind if skipping_line_trailer && kind.is_trivia() => {}
            _ => break,
        }
    }
    count
}
/// Returns `true` if `c` can begin an identifier.
fn is_identifier_start(c: char) -> bool {
    match c {
        // Fast path: ASCII letters and underscore.
        'a'..='z' | 'A'..='Z' | '_' => true,
        // Any other ASCII character cannot start an identifier.
        c if c.is_ascii() => false,
        // Non-ASCII: defer to the Unicode `XID_Start` property.
        c => is_xid_start(c),
    }
}
/// Returns `true` if `c` can appear in an identifier after its first
/// character.
fn is_identifier_continuation(c: char) -> bool {
    if c.is_ascii() {
        // Fast path: ASCII letters, digits, and underscore.
        c.is_ascii_alphanumeric() || c == '_'
    } else {
        // Non-ASCII: defer to the Unicode `XID_Continue` property.
        is_xid_continue(c)
    }
}
/// Maps an identifier's source text to its keyword kind, or
/// [`SimpleTokenKind::Name`] if the text is not a keyword.
///
/// Includes the soft keywords `match`, `case`, and `type`.
fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
    // Arms are kept in alphabetical order, one per line.
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "case" => SimpleTokenKind::Case,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lambda" => SimpleTokenKind::Lambda,
        "match" => SimpleTokenKind::Match,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "type" => SimpleTokenKind::Type,
        "while" => SimpleTokenKind::While,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        _ => SimpleTokenKind::Name,
    }
}
/// A token yielded by the [`SimpleTokenizer`] or [`BackwardsTokenizer`].
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    // The kind of this token.
    pub kind: SimpleTokenKind,
    // The token's range in the source text.
    pub range: TextRange,
}
impl SimpleToken {
    /// Returns the [`SimpleTokenKind`] of this token.
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
}
impl Ranged for SimpleToken {
    // The token's range in the source text.
    fn range(&self) -> TextRange {
        self.range
    }
}
/// The kind of a [`SimpleToken`].
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A `#` comment, running to the end of the line.
    Comment,

    /// A run of whitespace (space, tab, or form feed).
    Whitespace,

    /// Marker emitted once the tokenizer has consumed its entire range.
    EndOfFile,

    /// A `\` line continuation.
    Continuation,

    /// A line break: `\n`, `\r`, or `\r\n`.
    Newline,

    // Single-character punctuation and operators.
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semi,
    Slash,
    Star,
    Dot,
    Plus,
    Minus,
    Equals,
    Greater,
    Less,
    Percent,
    Ampersand,
    Circumflex,
    Vbar,
    At,
    Tilde,

    // Multi-character operators.
    EqEqual,
    NotEqual,
    LessEqual,
    GreaterEqual,
    LeftShift,
    RightShift,
    DoubleStar,
    DoubleStarEqual,
    PlusEqual,
    MinusEqual,
    StarEqual,
    SlashEqual,
    PercentEqual,
    AmperEqual,
    VbarEqual,
    CircumflexEqual,
    LeftShiftEqual,
    RightShiftEqual,
    DoubleSlash,
    DoubleSlashEqual,
    ColonEqual,
    Ellipsis,
    AtEqual,
    RArrow,

    // Keywords (including the soft keywords `match`, `type`, and `case`).
    And,
    As,
    Assert,
    Async,
    Await,
    Break,
    Class,
    Continue,
    Def,
    Del,
    Elif,
    Else,
    Except,
    Finally,
    For,
    From,
    Global,
    If,
    Import,
    In,
    Is,
    Lambda,
    Nonlocal,
    Not,
    Or,
    Pass,
    Raise,
    Return,
    Try,
    While,
    Match,
    Type,
    Case,
    With,
    Yield,

    /// An identifier that is not a keyword.
    Name,

    /// A character the tokenizer cannot classify. Once one is produced, all
    /// following tokens are [`SimpleTokenKind::Bogus`].
    Other,

    /// Emitted for the remainder of the range after the tokenizer encountered
    /// something it cannot lex (see [`SimpleTokenKind::Other`]).
    Bogus,
}
impl SimpleTokenKind {
pub const fn is_trivia(self) -> bool {
matches!(
self,
SimpleTokenKind::Whitespace
| SimpleTokenKind::Newline
| SimpleTokenKind::Comment
| SimpleTokenKind::Continuation
)
}
pub const fn is_comment(self) -> bool {
matches!(self, SimpleTokenKind::Comment)
}
}
/// A tokenizer that lexes a range of Python source forwards, producing
/// [`SimpleToken`]s.
///
/// Does not support string literals: when it reaches one it marks itself
/// bogus and emits [`SimpleTokenKind::Bogus`] for the rest of the range.
pub struct SimpleTokenizer<'a> {
    // Absolute offset (into `source`) of the next token to emit.
    offset: TextSize,
    // Set once the tokenizer encountered a token it cannot lex.
    bogus: bool,
    source: &'a str,
    // Cursor over the tokenized sub-range of `source`.
    cursor: Cursor<'a>,
}
impl<'a> SimpleTokenizer<'a> {
    /// Creates a tokenizer over the given `range` of `source`.
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    /// Creates a tokenizer that starts at `offset` and runs to the end of
    /// `source`.
    pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::new(offset, source.text_len());
        Self::new(source, range)
    }

    /// Lexes the next token.
    ///
    /// Returns an [`SimpleTokenKind::EndOfFile`] token once the range is
    /// exhausted. After the tokenizer has been poisoned (`self.bogus`),
    /// returns a single [`SimpleTokenKind::Bogus`] token covering the rest of
    /// the range and drains the cursor.
    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        let Some(first) = self.cursor.bump() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            };
        };
        if self.bogus {
            // Everything after an unsupported token is reported as one
            // `Bogus` token; further calls will yield `EndOfFile`.
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::new(self.offset, self.source.text_len()),
            };
            self.cursor = Cursor::new("");
            self.offset = self.source.text_len();
            return token;
        }
        let kind = self.next_token_inner(first);
        let token_len = self.cursor.token_len();
        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };
        self.offset += token_len;
        token
    }

    /// Classifies the token starting with `first` (already consumed from the
    /// cursor), consuming any remaining characters that belong to it.
    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.offset, token_len);
                let kind = to_keyword_or_other(&self.source[range]);
                // The tokenizer can't lex string literals: a name that is a
                // valid string-prefix and is immediately followed by a quote
                // starts a string, so poison the tokenizer instead.
                if kind == SimpleTokenKind::Name
                    && matches!(self.cursor.first(), '"' | '\'')
                    && matches!(
                        &self.source[range],
                        "B" | "BR"
                            | "Br"
                            | "F"
                            | "FR"
                            | "Fr"
                            | "R"
                            | "RB"
                            | "RF"
                            | "Rb"
                            | "Rf"
                            | "U"
                            | "b"
                            | "bR"
                            | "br"
                            | "f"
                            | "fR"
                            | "fr"
                            | "r"
                            | "rB"
                            | "rF"
                            | "rb"
                            | "rf"
                            | "u"
                            | "T"
                            | "TR"
                            | "Tr"
                            | "RT"
                            | "Rt"
                            | "t"
                            | "tR"
                            | "tr"
                            | "rT"
                            | "rt"
                    )
                {
                    self.bogus = true;
                    SimpleTokenKind::Other
                } else {
                    kind
                }
            }
            // Whitespace runs: space, tab, form feed.
            ' ' | '\t' | '\x0C' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }
            '\n' => SimpleTokenKind::Newline,
            // `\r\n` is a single newline token.
            '\r' => {
                self.cursor.eat_char('\n');
                SimpleTokenKind::Newline
            }
            // Comments run to (but don't include) the end of the line.
            '#' => {
                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
                SimpleTokenKind::Comment
            }
            '\\' => SimpleTokenKind::Continuation,
            '=' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::EqEqual
                } else {
                    SimpleTokenKind::Equals
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PlusEqual
                } else {
                    SimpleTokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleStarEqual
                    } else {
                        SimpleTokenKind::DoubleStar
                    }
                } else {
                    SimpleTokenKind::Star
                }
            }
            '/' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleSlashEqual
                    } else {
                        SimpleTokenKind::DoubleSlash
                    }
                } else {
                    SimpleTokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PercentEqual
                } else {
                    SimpleTokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::VbarEqual
                } else {
                    SimpleTokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::CircumflexEqual
                } else {
                    SimpleTokenKind::Circumflex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AmperEqual
                } else {
                    SimpleTokenKind::Ampersand
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    SimpleTokenKind::RArrow
                } else {
                    SimpleTokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AtEqual
                } else {
                    SimpleTokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::NotEqual
                } else {
                    // A lone `!` is not a valid token; poison the tokenizer.
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            '~' => SimpleTokenKind::Tilde,
            ':' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::ColonEqual
                } else {
                    SimpleTokenKind::Colon
                }
            }
            ';' => SimpleTokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::LeftShiftEqual
                    } else {
                        SimpleTokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::LessEqual
                } else {
                    SimpleTokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::RightShiftEqual
                    } else {
                        SimpleTokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::GreaterEqual
                } else {
                    SimpleTokenKind::Greater
                }
            }
            ',' => SimpleTokenKind::Comma,
            '.' => {
                // `...` lexes as a single `Ellipsis` token.
                if self.cursor.first() == '.' && self.cursor.second() == '.' {
                    self.cursor.bump();
                    self.cursor.bump();
                    SimpleTokenKind::Ellipsis
                } else {
                    SimpleTokenKind::Dot
                }
            }
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            _ => {
                // Unsupported character; poison the tokenizer.
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    /// Returns an iterator over the remaining tokens with all trivia
    /// (whitespace, newlines, comments, continuations) filtered out.
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}
impl Iterator for SimpleTokenizer<'_> {
type Item = SimpleToken;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
if token.kind == SimpleTokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
/// A tokenizer that lexes a range of Python source backwards, from the end of
/// the range towards its start.
///
/// Comments are not lexed; they are looked up in the caller-provided,
/// precomputed `comment_ranges`.
pub struct BackwardsTokenizer<'a> {
    // Absolute offset (into `source`) of the start of the tokenized range.
    offset: TextSize,
    // Absolute offset of the end of the not-yet-tokenized remainder.
    back_offset: TextSize,
    // Remaining comment ranges, in source order; consumed from the back.
    comment_ranges: &'a [TextRange],
    // Set once the tokenizer encountered a token it cannot lex.
    bogus: bool,
    source: &'a str,
    // Cursor over the tokenized sub-range of `source`.
    cursor: Cursor<'a>,
}
impl<'a> BackwardsTokenizer<'a> {
    /// Creates a backwards tokenizer over `range` of `source`.
    ///
    /// `comment_range` must contain the precomputed comment ranges for
    /// `source`, sorted by position; only the comments starting at or before
    /// `range.end()` are kept.
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // Discard comments that start after the tokenized range.
            comment_ranges: &comment_range
                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    /// Creates a backwards tokenizer over `source` up to `offset`.
    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    /// Returns an iterator over the remaining tokens with all trivia
    /// (whitespace, newlines, comments, continuations) filtered out.
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    /// Lexes the next token, scanning backwards from `back_offset`.
    ///
    /// Returns an [`SimpleTokenKind::EndOfFile`] token once the range is
    /// exhausted. After the tokenizer has been poisoned (`self.bogus`),
    /// returns a single [`SimpleTokenKind::Bogus`] token covering everything
    /// up to `back_offset` and drains the cursor.
    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;
        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            };
        };
        if self.bogus {
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::up_to(self.back_offset),
            };
            self.cursor = Cursor::new("");
            self.back_offset = TextSize::new(0);
            return token;
        }
        // Comments can't be recognized scanning backwards, so consult the
        // precomputed comment ranges instead of lexing.
        if let Some(comment) = self
            .comment_ranges
            .last()
            .filter(|comment| comment.contains_inclusive(self.back_offset))
        {
            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];
            // Reposition the cursor just before the comment.
            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
            return SimpleToken {
                kind: SimpleTokenKind::Comment,
                range: comment.range(),
            };
        }
        let kind = match last {
            // Whitespace runs: space, tab, form feed.
            ' ' | '\t' | '\x0C' => {
                self.cursor
                    .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }
            '\r' => SimpleTokenKind::Newline,
            // `\r\n` is a single newline token.
            '\n' => {
                self.cursor.eat_char_back('\r');
                SimpleTokenKind::Newline
            }
            _ => self.next_token_inner(last),
        };
        let token_len = self.cursor.token_len();
        let start = self.back_offset - token_len;
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        }
    }

    /// Classifies the token ending with `last` (already consumed from the
    /// cursor), consuming any preceding characters that belong to it.
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            c if is_identifier_continuation(c) => {
                // Keep a savepoint so the cursor can be restored if what we
                // consumed turns out not to be a valid identifier.
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);
                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,
            // Characters that can end a multi-character operator: consume the
            // longest run of operator characters backwards, then re-lex that
            // run forwards and take its last token.
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~'
                            | '%'
                            | '|'
                            | '&'
                            | '^'
                            | '+'
                            | '-'
                            | '='
                            | '*'
                            | '/'
                            | '@'
                            | '!'
                            | '<'
                            | '>'
                            | '.'
                    )
                });
                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);
                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // Advance the real cursor over the remaining characters
                    // of the forward-lexed token (one is already consumed).
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            _ => {
                // Unsupported character; poison the tokenizer.
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }
}
impl Iterator for BackwardsTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        // `EndOfFile` marks the start of the range when scanning backwards.
        (token.kind != SimpleTokenKind::EndOfFile).then_some(token)
    }
}