use bstr::ByteSlice;
use std::fmt;
use ruff_python_ast::token::TokenKind;
use ruff_python_ast::{self as ast, AnyStringFlags, AtomicNodeIndex, Expr, StringFlags};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::error::{LexicalError, LexicalErrorType};
/// The result of parsing a string-like literal: one variant per kind of AST
/// node that can be produced.
///
/// Implements [`Ranged`] (delegating to the wrapped node) and converts into
/// an [`Expr`].
#[derive(Debug)]
pub(crate) enum StringType {
/// A regular (text) string literal.
Str(ast::StringLiteral),
/// A bytes literal.
Bytes(ast::BytesLiteral),
/// An f-string node.
FString(ast::FString),
/// A t-string node.
TString(ast::TString),
}
impl Ranged for StringType {
fn range(&self) -> TextRange {
match self {
Self::Str(node) => node.range(),
Self::Bytes(node) => node.range(),
Self::FString(node) => node.range(),
Self::TString(node) => node.range(),
}
}
}
impl From<StringType> for Expr {
    /// Unwraps the literal node and converts it into the matching expression.
    fn from(string: StringType) -> Self {
        match string {
            StringType::Str(literal) => literal.into(),
            StringType::Bytes(literal) => literal.into(),
            StringType::FString(fstring) => fstring.into(),
            StringType::TString(tstring) => tstring.into(),
        }
    }
}
/// Distinguishes the two kinds of interpolated string handled by the parser.
///
/// The `Display` implementation renders these as `f-string` and `t-string`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum InterpolatedStringKind {
/// An f-string; maps to the `FString*` token kinds.
FString,
/// A t-string; maps to the `TString*` token kinds.
TString,
}
impl InterpolatedStringKind {
    /// Returns the "start" token kind for this kind of interpolated string.
    #[inline]
    pub(crate) const fn start_token(self) -> TokenKind {
        match self {
            Self::FString => TokenKind::FStringStart,
            Self::TString => TokenKind::TStringStart,
        }
    }

    /// Returns the "middle" token kind for this kind of interpolated string.
    #[inline]
    pub(crate) const fn middle_token(self) -> TokenKind {
        match self {
            Self::FString => TokenKind::FStringMiddle,
            Self::TString => TokenKind::TStringMiddle,
        }
    }

    /// Returns the "end" token kind for this kind of interpolated string.
    #[inline]
    pub(crate) const fn end_token(self) -> TokenKind {
        match self {
            Self::FString => TokenKind::FStringEnd,
            Self::TString => TokenKind::TStringEnd,
        }
    }
}
impl fmt::Display for InterpolatedStringKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
InterpolatedStringKind::FString => f.write_str("f-string"),
InterpolatedStringKind::TString => f.write_str("t-string"),
}
}
}
/// Outcome of decoding the character(s) that follow a backslash.
enum EscapedChar {
/// A recognised escape sequence; holds the decoded character, which callers
/// push on its own.
Literal(char),
/// An unrecognised escape; holds the character that followed the backslash.
/// Callers emit a backslash followed by this character.
Escape(char),
}
/// Cursor-based decoder for the textual content of a single string-like token.
struct StringParser {
/// The text being decoded; `cursor` is a byte index into it.
source: Box<str>,
/// Byte index into `source` of the next unread byte.
cursor: usize,
/// Flags queried to tell raw strings and byte strings apart.
flags: AnyStringFlags,
/// Position that byte 0 of `source` corresponds to. It is added to `cursor`
/// when computing error locations, and is bumped by one for every brace
/// encountered while decoding an interpolated-string element.
offset: TextSize,
/// Range copied unchanged into the node produced by the parser.
range: TextRange,
}
impl StringParser {
/// Creates a parser whose cursor sits on the first byte of `source`.
///
/// `offset` is the position that byte 0 of `source` maps to when error
/// locations are computed; `range` is stored and later copied into the
/// resulting node.
fn new(source: Box<str>, flags: AnyStringFlags, offset: TextSize, range: TextRange) -> Self {
    let cursor = 0;
    Self {
        source,
        cursor,
        flags,
        offset,
        range,
    }
}
/// Advances the cursor by `bytes` bytes and returns the text that was skipped.
///
/// # Panics
///
/// Panics if the resulting byte range is out of bounds for the source or
/// does not fall on character boundaries.
#[inline]
fn skip_bytes(&mut self, bytes: usize) -> &str {
    let start = self.cursor;
    let end = start + bytes;
    // Slice first so that a bad range panics before the cursor moves.
    let skipped = &self.source[start..end];
    self.cursor = end;
    skipped
}
#[inline]
fn position(&self) -> TextSize {
self.compute_position(self.cursor)
}
/// Converts a byte index into the source (`cursor`) into a position by
/// adding the parser's offset to it.
///
/// # Panics
///
/// Panics if `cursor` does not fit into a `TextSize`.
#[inline]
fn compute_position(&self, cursor: usize) -> TextSize {
    let relative = TextSize::try_from(cursor).unwrap();
    self.offset + relative
}
/// Consumes and returns the byte under the cursor, or returns `None`
/// (leaving the cursor untouched) when the source is exhausted.
#[inline]
fn next_byte(&mut self) -> Option<u8> {
    let byte = self.source[self.cursor..].bytes().next()?;
    self.cursor += 1;
    Some(byte)
}
/// Consumes and returns the character under the cursor, advancing by its
/// UTF-8 length, or returns `None` (leaving the cursor untouched) when the
/// source is exhausted.
#[inline]
fn next_char(&mut self) -> Option<char> {
    let ch = self.source[self.cursor..].chars().next()?;
    self.cursor += ch.len_utf8();
    Some(ch)
}
/// Returns the byte under the cursor without consuming it, or `None` when
/// the source is exhausted.
#[inline]
fn peek_byte(&self) -> Option<u8> {
    self.source[self.cursor..].bytes().next()
}
/// Reads exactly `literal_number` hexadecimal digits and returns the
/// character with that code point.
///
/// Code points in the surrogate range `0xD800..=0xDFFF` are returned as
/// `U+FFFD REPLACEMENT CHARACTER` instead of being rejected.
///
/// # Errors
///
/// Returns a `UnicodeError` when:
/// - a character is not a hexadecimal digit (the range covers that character),
/// - the source ends before all digits were read (empty range at the cursor),
/// - the value is not a valid `char` (empty range at the cursor).
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
    let mut code_point = 0u32;

    // `remaining` is the number of digits still to come after the current
    // one, i.e. how many nibbles the current digit must be shifted left by.
    for remaining in (0..literal_number).rev() {
        let digit_start = self.position();

        let Some(ch) = self.next_char() else {
            return Err(LexicalError::new(
                LexicalErrorType::UnicodeError,
                TextRange::empty(self.position()),
            ));
        };

        let Some(digit) = ch.to_digit(16) else {
            return Err(LexicalError::new(
                LexicalErrorType::UnicodeError,
                TextRange::at(digit_start, TextSize::try_from(ch.len_utf8()).unwrap()),
            ));
        };

        code_point += digit << (remaining * 4);
    }

    if (0xD800..=0xDFFF).contains(&code_point) {
        return Ok(std::char::REPLACEMENT_CHARACTER);
    }

    std::char::from_u32(code_point).ok_or_else(|| {
        LexicalError::new(
            LexicalErrorType::UnicodeError,
            TextRange::empty(self.position()),
        )
    })
}
/// Decodes an octal escape of one to three digits.
///
/// `o` is the first digit (already consumed by the caller); up to two
/// further bytes in `b'0'..=b'7'` are consumed from the source.
///
/// # Panics
///
/// Panics if `o` is not an ASCII octal digit.
fn parse_octet(&mut self, o: u8) -> char {
    let mut digits = [o, 0, 0];
    let mut count = 1;

    // Fill the two remaining slots for as long as octal digits keep coming.
    for slot in &mut digits[1..] {
        match self.peek_byte() {
            Some(b'0'..=b'7') => {
                *slot = self.next_byte().unwrap();
                count += 1;
            }
            _ => break,
        }
    }

    let text = std::str::from_utf8(&digits[..count]).expect("ASCII bytes");
    let code_point = u32::from_str_radix(text, 8).unwrap();
    char::from_u32(code_point).unwrap()
}
/// Decodes the `{NAME}` part of a named-character escape and returns the
/// character that `unicode_names2` associates with `NAME`.
///
/// The cursor ends up just past the closing `}`.
///
/// # Errors
///
/// - `MissingUnicodeLbrace` if the next character is not `{` (empty range at
///   the position where the brace was expected).
/// - `MissingUnicodeRbrace` if no `}` follows (empty range at the end of the
///   source).
/// - `UnicodeError` if the name is unknown (the range covers the name,
///   excluding both braces).
fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
    let brace_pos = self.position();
    match self.next_char() {
        Some('{') => {}
        _ => {
            return Err(LexicalError::new(
                LexicalErrorType::MissingUnicodeLbrace,
                TextRange::empty(brace_pos),
            ));
        }
    }

    let name_start = self.position();
    let Some(name_len) = self.source[self.cursor..].find('}') else {
        return Err(LexicalError::new(
            LexicalErrorType::MissingUnicodeRbrace,
            TextRange::empty(self.compute_position(self.source.len())),
        ));
    };

    // Consume the name together with its closing brace.
    let name_with_brace = self.skip_bytes(name_len + 1);
    let name = &name_with_brace[..name_len];

    match unicode_names2::character(name) {
        Some(ch) => Ok(ch),
        None => {
            // The cursor sits right after `}`; step back over it so that the
            // reported range ends where the name ends.
            let name_end = self.compute_position(self.cursor - '}'.len_utf8());
            Err(LexicalError::new(
                LexicalErrorType::UnicodeError,
                TextRange::new(name_start, name_end),
            ))
        }
    }
}
/// Decodes the escape sequence that follows an already-consumed backslash.
///
/// Returns:
/// - `Ok(Some(EscapedChar::Literal(c)))` for a recognised escape, where `c`
///   is the decoded character;
/// - `Ok(Some(EscapedChar::Escape(c)))` when `c` does not start a recognised
///   escape (this includes `u`, `U` and `N` in byte strings);
/// - `Ok(None)` when the backslash is followed by a line break (`\n`, `\r`
///   or `\r\n`), which contributes no character.
///
/// # Errors
///
/// Returns a `StringError` if the source ends right after the backslash, and
/// propagates any error from the hexadecimal and named-character helpers.
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
    let escape = match self.next_char() {
        Some(ch) => ch,
        None => {
            return Err(LexicalError::new(
                LexicalErrorType::StringError,
                TextRange::empty(self.position()),
            ));
        }
    };

    let decoded = match escape {
        // These escapes stand for the character itself.
        '\\' | '\'' | '"' => escape,
        'a' => '\u{07}',
        'b' => '\u{08}',
        't' => '\t',
        'n' => '\n',
        'v' => '\u{0b}',
        'f' => '\u{0c}',
        'r' => '\r',
        '0'..='7' => self.parse_octet(escape as u8),
        'x' => self.parse_unicode_literal(2)?,
        'u' if !self.flags.is_byte_string() => self.parse_unicode_literal(4)?,
        'U' if !self.flags.is_byte_string() => self.parse_unicode_literal(8)?,
        'N' if !self.flags.is_byte_string() => self.parse_unicode_name()?,
        // Backslash followed by a line break: nothing is produced.
        '\n' => return Ok(None),
        '\r' => {
            // Swallow the `\n` of a `\r\n` pair as well.
            if let Some(b'\n') = self.peek_byte() {
                self.next_byte();
            }
            return Ok(None);
        }
        unknown => return Ok(Some(EscapedChar::Escape(unknown))),
    };

    Ok(Some(EscapedChar::Literal(decoded)))
}
/// Decodes the literal part of an interpolated string into an
/// `InterpolatedStringLiteralElement`.
///
/// If the source contains no `{`, `}` or backslash it is returned unchanged.
/// Otherwise the value is rebuilt piece by piece:
/// - a `{` or `}` is copied to the output and the parser's offset is bumped
///   by one (presumably because the token holds a single brace where the
///   original text had a doubled one; confirm against the lexer);
/// - a backslash is decoded as an escape sequence, unless the string is raw
///   or the backslash is the last byte of the source, in which case the
///   backslash is kept as is.
///
/// # Errors
///
/// Propagates any error raised while decoding an escape sequence.
fn parse_interpolated_string_middle(
    mut self,
) -> Result<ast::InterpolatedStringLiteralElement, LexicalError> {
    let find_special = |haystack: &[u8]| memchr::memchr3(b'{', b'}', b'\\', haystack);

    // Fast path: nothing to decode, hand the source back untouched.
    let Some(first_special) = find_special(self.source.as_bytes()) else {
        return Ok(ast::InterpolatedStringLiteralElement {
            value: self.source,
            range: self.range,
            node_index: AtomicNodeIndex::NONE,
        });
    };

    let mut value = String::with_capacity(self.source.len());
    let mut pending = Some(first_special);

    while let Some(index) = pending {
        // `index` is relative to the cursor; consume everything up to and
        // including the special byte, then copy the plain text before it.
        let chunk = self.skip_bytes(index + 1);
        let special = chunk.as_bytes()[index];
        value.push_str(&chunk[..index]);

        match special {
            b'{' | b'}' => {
                self.offset += TextSize::from(1);
                value.push(char::from(special));
            }
            b'\\' => {
                let decode = !self.flags.is_raw_string() && self.peek_byte().is_some();
                if decode {
                    match self.parse_escaped_char()? {
                        Some(EscapedChar::Literal(c)) => value.push(c),
                        Some(EscapedChar::Escape(c)) => value.extend(['\\', c]),
                        None => {}
                    }
                } else {
                    value.push('\\');
                }
            }
            other => {
                unreachable!("Expected '{{', '}}', or '\\' but got {:?}", other);
            }
        }

        pending = find_special(self.source[self.cursor..].as_bytes());
    }

    // No special bytes left: the remainder is copied verbatim.
    value.push_str(&self.source[self.cursor..]);

    Ok(ast::InterpolatedStringLiteralElement {
        value: value.into_boxed_str(),
        range: self.range,
        node_index: AtomicNodeIndex::NONE,
    })
}
/// Decodes the source as the content of a bytes literal.
///
/// Raw literals and literals without a backslash reuse the source bytes
/// as is; otherwise every escape sequence is decoded and each resulting
/// character is narrowed to a single byte.
///
/// # Errors
///
/// Returns `InvalidByteLiteral` (covering the offending character) if the
/// source contains a non-ASCII character, and propagates any error raised
/// while decoding an escape sequence.
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
    if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
        // Everything before `index` is ASCII, so `index` is a character
        // boundary and the offending character starts exactly there.
        let ch = self.source[index..].chars().next().unwrap();
        return Err(LexicalError::new(
            LexicalErrorType::InvalidByteLiteral,
            TextRange::at(
                self.compute_position(index),
                TextSize::try_from(ch.len_utf8()).unwrap(),
            ),
        ));
    }

    // Raw literals never decode escapes, so do not even look for one.
    let first_escape = if self.flags.is_raw_string() {
        None
    } else {
        memchr::memchr(b'\\', self.source.as_bytes())
    };

    // Fast path: nothing to decode, reuse the source allocation.
    let Some(first_escape) = first_escape else {
        return Ok(StringType::Bytes(ast::BytesLiteral {
            value: self.source.into_boxed_bytes(),
            range: self.range,
            flags: self.flags.into(),
            node_index: AtomicNodeIndex::NONE,
        }));
    };

    let mut value = Vec::with_capacity(self.source.len());
    let mut pending = Some(first_escape);

    while let Some(escape) = pending {
        // `escape` is relative to the cursor; consume the text up to and
        // including the backslash and copy the part before it.
        let chunk = self.skip_bytes(escape + 1);
        value.extend_from_slice(&chunk.as_bytes()[..escape]);

        match self.parse_escaped_char()? {
            Some(EscapedChar::Literal(c)) => value.push(c as u8),
            Some(EscapedChar::Escape(c)) => value.extend_from_slice(&[b'\\', c as u8]),
            None => {}
        }

        pending = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes());
    }

    // No backslash left: the remainder is copied verbatim.
    value.extend_from_slice(self.source[self.cursor..].as_bytes());

    Ok(StringType::Bytes(ast::BytesLiteral {
        value: value.into_boxed_slice(),
        range: self.range,
        flags: self.flags.into(),
        node_index: AtomicNodeIndex::NONE,
    }))
}
/// Decodes the source as the content of a (text) string literal.
///
/// Raw literals and literals without a backslash reuse the source as is;
/// otherwise every escape sequence is decoded into the resulting value.
///
/// # Errors
///
/// Propagates any error raised while decoding an escape sequence.
fn parse_string(mut self) -> Result<StringType, LexicalError> {
    // Raw literals never decode escapes, so do not even look for one.
    let first_escape = if self.flags.is_raw_string() {
        None
    } else {
        memchr::memchr(b'\\', self.source.as_bytes())
    };

    // Fast path: nothing to decode, reuse the source allocation.
    let Some(first_escape) = first_escape else {
        return Ok(StringType::Str(ast::StringLiteral {
            value: self.source,
            range: self.range,
            flags: self.flags.into(),
            node_index: AtomicNodeIndex::NONE,
        }));
    };

    let mut value = String::with_capacity(self.source.len());
    let mut pending = Some(first_escape);

    while let Some(escape) = pending {
        // `escape` is relative to the cursor; consume the text up to and
        // including the backslash and copy the part before it.
        let chunk = self.skip_bytes(escape + 1);
        value.push_str(&chunk[..escape]);

        match self.parse_escaped_char()? {
            Some(EscapedChar::Literal(c)) => value.push(c),
            Some(EscapedChar::Escape(c)) => value.extend(['\\', c]),
            None => {}
        }

        pending = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes());
    }

    // No backslash left: the remainder is copied verbatim.
    value.push_str(&self.source[self.cursor..]);

    Ok(StringType::Str(ast::StringLiteral {
        value: value.into_boxed_str(),
        range: self.range,
        flags: self.flags.into(),
        node_index: AtomicNodeIndex::NONE,
    }))
}
/// Decodes the source as a bytes literal when the flags mark it as a byte
/// string, and as a text string literal otherwise.
///
/// # Errors
///
/// Propagates the error of whichever decoder was selected.
fn parse(self) -> Result<StringType, LexicalError> {
    let is_bytes = self.flags.is_byte_string();
    if !is_bytes {
        return self.parse_string();
    }
    self.parse_bytes()
}
}
/// Parses the content of a string or bytes literal into a [`StringType`].
///
/// `source` is the text to decode, `flags` select byte-string and raw-string
/// handling, and `range` is copied into the resulting node. Error locations
/// are computed relative to `range.start()` advanced by `flags.opener_len()`.
///
/// # Errors
///
/// Returns a [`LexicalError`] if the content cannot be decoded.
pub(crate) fn parse_string_literal(
    source: Box<str>,
    flags: AnyStringFlags,
    range: TextRange,
) -> Result<StringType, LexicalError> {
    let content_start = range.start() + flags.opener_len();
    let parser = StringParser::new(source, flags, content_start, range);
    parser.parse()
}
/// Parses the literal part of an interpolated string into an
/// `InterpolatedStringLiteralElement`.
///
/// `source` is the text to decode, `flags` select raw-string handling, and
/// `range` is copied into the resulting element. Error locations are computed
/// relative to `range.start()`.
///
/// # Errors
///
/// Returns a [`LexicalError`] if an escape sequence cannot be decoded.
pub(crate) fn parse_interpolated_string_literal_element(
    source: Box<str>,
    flags: AnyStringFlags,
    range: TextRange,
) -> Result<ast::InterpolatedStringLiteralElement, LexicalError> {
    let element_start = range.start();
    let parser = StringParser::new(source, flags, element_start, range);
    parser.parse_interpolated_string_middle()
}