mod string_types;
use regex::Regex;
use std::cell::RefCell;
use std::cmp::Ordering;
use std::convert::TryInto;
use std::fmt::Debug;
use std::fmt::Formatter;
use std::rc::Rc;
use crate::tokenizer::core::string_types::FTStringType;
use crate::tokenizer::{
core::string_types::{FTStringNode, StringQuoteChar, StringQuoteSize},
operators::OPERATOR_RE,
text_position::{TextPosition, TextPositionSnapshot},
whitespace_parser::State as WhitespaceState,
};
/// Upper bound on indentation depth: `consume_bol_whitespace` fails with `TokError::TooDeep`
/// when opening another level would make the indent stack reach this size.
const MAX_INDENT: usize = 100;
/// Largest Unicode scalar value; used as the upper end of the non-ASCII character range that
/// `next_inner` treats as a possible identifier start.
const MAX_CHAR: char = '\u{10ffff}';
// Regexes used by the tokenizer. Every pattern is anchored with `\A`, so it only matches at the
// position it is applied to. Compiling a constant pattern cannot fail, hence the `expect`.
thread_local! {
    // A run of spaces, form feeds and tabs (whitespace inside a line).
    static SPACE_TAB_FORMFEED_RE: Regex = Regex::new(r"\A[ \f\t]+").expect("regex");
    // Everything up to, but not including, the next line terminator (used to skip comments).
    static ANY_NON_NEWLINE_RE: Regex = Regex::new(r"\A[^\r\n]+").expect("regex");
    // String literal prefixes, case-insensitive. The raw marker may come before or after the
    // bytes / f-string / t-string marker, so `rb`/`br`, `rf`/`fr` and `rt`/`tr` are all accepted.
    static STRING_PREFIX_RE: Regex =
        Regex::new(r"\A(?i)(u|[bft]r|r[bft]|r|b|f|t)").expect("regex");
    // Characters that may continue an identifier: ASCII word characters or any non-ASCII
    // character. The result is validated afterwards by `verify_identifier`.
    static POTENTIAL_IDENTIFIER_TAIL_RE: Regex =
        Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex");
    // A dot immediately followed by a digit, i.e. the start of a number such as `.5`.
    static DECIMAL_DOT_DIGIT_RE: Regex = Regex::new(r"\A\.[0-9]").expect("regex");
    // Decimal digits with optional single underscores between digits.
    static DECIMAL_TAIL_RE: Regex =
        Regex::new(r"\A[0-9](_?[0-9])*").expect("regex");
    // Digits following `0x`, each optionally preceded by one underscore.
    static HEXADECIMAL_TAIL_RE: Regex =
        Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex");
    // Digits following `0o`, each optionally preceded by one underscore.
    static OCTAL_TAIL_RE: Regex = Regex::new(r"\A(_?[0-7])+").expect("regex");
    // Digits following `0b`, each optionally preceded by one underscore.
    static BINARY_TAIL_RE: Regex = Regex::new(r"\A(_?[01])+").expect("regex");
    // A complete identifier: XID_Start or underscore, followed by XID_Continue characters.
    static UNICODE_IDENTIFIER_RE: Regex =
        Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex");
}
/// The kind of a token produced by the tokenizer.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub enum TokType {
/// A complete string literal, including any prefix and its quotes.
String,
/// An identifier (also `async`/`await` when they are not treated as keywords).
Name,
/// A numeric literal.
Number,
/// An operator, bracket, dot, ellipsis or format-spec colon.
Op,
/// The end of a logical line.
Newline,
/// Indentation increased by one level.
Indent,
/// Indentation decreased by one level.
Dedent,
/// The `async` keyword.
Async,
/// The `await` keyword.
Await,
/// The prefix and opening quote of an f-string (only when f-/t-strings are split).
FStringStart,
/// A stretch of literal text inside a split f-string.
FStringString,
/// The closing quote of a split f-string.
FStringEnd,
/// The prefix and opening quote of a t-string (only when f-/t-strings are split).
TStringStart,
/// A stretch of literal text inside a split t-string.
TStringString,
/// The closing quote of a split t-string.
TStringEnd,
/// End of input; the `TokState` iterator yields nothing after this token.
EndMarker,
}
/// Errors reported by the tokenizer. The `TokState` iterator stops after yielding one of these.
#[derive(Debug, thiserror::Error, Eq, PartialEq)]
pub enum TokError<'t> {
/// The tab-expanded and the alternative column disagree about the indentation level.
#[error("inconsistent mixing of tabs and spaces")]
TabSpace,
/// Opening another indentation level would reach `MAX_INDENT`.
#[error("too many indentation levels")]
TooDeep,
/// A dedent does not line up with any enclosing indentation level.
#[error("no matching outer block for dedent")]
Dedent,
/// A backslash was followed by something other than a newline.
#[error("unexpected characters after a line continuation")]
LineContinuation,
/// The input ended right after a backslash-newline.
#[error("unexpected end of file after a line continuation")]
LineContinuationEof,
/// The scanned identifier text failed `verify_identifier`.
#[error("{0:?} is not a valid identifier")]
BadIdentifier(&'t str),
/// Misplaced underscore in a decimal literal, or an exponent sign without digits.
#[error("invalid decimal literal")]
BadDecimal,
/// A decimal integer with leading zeros that is not zero itself (for example `012`).
#[error(
"{}{}",
"leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal ",
"integers"
)]
BadDecimalLeadingZeros,
/// `0x` without valid hexadecimal digits, or followed by a trailing underscore.
#[error("invalid hexadecimal literal")]
BadHexadecimal,
/// `0o` without valid octal digits, or followed by a trailing underscore.
#[error("invalid octal literal")]
BadOctal,
/// A decimal digit outside `0..=7` directly follows an octal literal.
#[error("invalid digit {0:?} in octal literal")]
BadOctalDigit(char),
/// `0b` without valid binary digits, or followed by a trailing underscore.
#[error("invalid binary literal")]
BadBinary,
/// A decimal digit other than `0`/`1` directly follows a binary literal.
#[error("invalid digit {0:?} in binary literal")]
BadBinaryDigit(char),
/// A single-quoted string (or single-line f-/t-string) hit a newline or end of input.
#[error("unterminated string literal")]
UnterminatedString,
/// A triple-quoted string hit end of input.
#[error("unterminated triple-quoted string literal")]
UnterminatedTripleQuotedString,
/// A closing bracket without any open bracket, or a lone `}` in f-/t-string text.
#[error("unmatched {0:?}")]
UnmatchedClosingParen(char),
/// Closing bracket (second field) does not pair with the open bracket (first field) on the
/// same line.
#[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?}")]
MismatchedClosingParen(char, char),
/// Like `MismatchedClosingParen`, but the open bracket is on a different line (third field).
#[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?} on line {2:}")]
MismatchedClosingParenOnLine(char, char, usize),
/// A character that cannot start any token.
#[error("{0:?} is not a valid character in this position")]
BadCharacter(char),
}
/// Tokenizer state: a cursor over the source text plus the bookkeeping needed for indentation,
/// brackets, `async` handling and split f-/t-strings. Iterating yields `TokType` values; the
/// text of the latest token lies between `start_pos` and `text_pos`.
#[derive(Clone)]
pub struct TokState<'t> {
/// Cursor over the input; after a token is produced it sits just past that token.
pub text_pos: TextPosition<'t>,
/// Snapshot of the position where the most recent token started.
pub start_pos: TextPositionSnapshot,
/// Set once the iterator has yielded an error or `EndMarker`.
done: bool,
/// Tab stop width used for the primary indentation column (initialised to 8).
tab_size: usize,
/// Tab stop width used for the alternative column (initialised to 1); the two columns are
/// compared to detect inconsistent tab/space mixes.
alt_tab_size: usize,
/// Primary columns of the currently open indentation levels.
indent_stack: Vec<usize>,
/// Alternative columns of the currently open indentation levels (parallel to `indent_stack`).
alt_indent_stack: Vec<usize>,
/// True when the cursor is at the beginning of a line and indentation must be measured.
at_bol: bool,
/// Number of leading whitespace characters consumed on the current line.
pub bol_width: usize,
/// True when the current line holds only whitespace, a comment, a continuation or nothing.
blank_line: bool,
/// Indent (positive) or Dedent (negative) tokens still to be emitted.
pending_indents: i32,
/// Open brackets, each with the line number recorded when it was pushed.
paren_stack: Vec<(char, usize)>,
/// Set after a backslash line continuation and cleared when a `Newline` token is produced.
cont_line: bool,
/// Copied from `TokConfig::async_hacks`.
async_hacks: bool,
/// True after `async def` has been recognised, until `maybe_close_async_def` resets it.
async_def: bool,
/// Length of `indent_stack` at the point `async def` was recognised.
async_def_indent: usize,
/// True once a `Newline` token was produced while `async_def` was set.
async_def_nl: bool,
/// Copied from `TokConfig::split_ftstring`.
split_ftstring: bool,
/// Stack of f-/t-strings currently being tokenized piecewise (innermost last).
ftstring_stack: Vec<FTStringNode>,
/// True while the input does not end in `\n` and no compensating `Newline` was emitted yet.
missing_nl_before_eof: bool,
}
/// Options controlling tokenizer behaviour.
pub struct TokConfig {
/// When true, `async`/`await` are only keywords inside an `async def`; `async` directly
/// followed by the name `def` is still recognised as a keyword. When false they are always
/// keywords.
pub async_hacks: bool,
/// When true, f-strings and t-strings are emitted as start / string / end tokens with the
/// embedded expressions tokenized in between, instead of as a single `String` token.
pub split_ftstring: bool,
}
/// Returns true when `ch` is an ASCII decimal digit.
///
/// Accepts either a `char` or an `Option<char>`, so the result of a peek can be passed in
/// directly; `None` is never a digit.
fn is_digit<C: Into<Option<char>>>(ch: C) -> bool {
    let candidate: Option<char> = ch.into();
    candidate.map_or(false, |c| c.is_ascii_digit())
}
/// The part of a numeric literal that `consume_number` is about to read.
#[derive(Debug)]
enum NumberState {
/// The cursor is on the first digit of the literal.
StartDigit,
/// The cursor is on the `.` that starts the fractional part.
Fraction,
/// The cursor is on the `e`/`E` that starts the exponent.
Exponent,
/// The cursor is on the `j`/`J` imaginary suffix.
Imaginary,
}
impl<'t> TokState<'t> {
pub fn new(text: &'t str, config: &TokConfig) -> Self {
let text_pos = TextPosition::new(text);
let start_pos = (&text_pos).into();
Self {
text_pos,
start_pos,
done: false,
tab_size: 8,
alt_tab_size: 1,
indent_stack: Vec::new(),
alt_indent_stack: Vec::new(),
at_bol: true,
bol_width: 0,
blank_line: false,
pending_indents: 0,
paren_stack: Vec::new(),
cont_line: false,
async_hacks: config.async_hacks,
async_def: false,
async_def_indent: 0,
async_def_nl: false,
split_ftstring: config.split_ftstring,
ftstring_stack: Vec::new(),
missing_nl_before_eof: text.is_empty() || text.as_bytes()[text.len() - 1] != b'\n',
}
}
/// Returns true while at least one bracket opened earlier is still waiting for its closer.
pub fn is_parenthesized(&self) -> bool {
    let open_brackets = self.paren_stack.len();
    open_brackets != 0
}
/// Scans the next token and returns its type.
///
/// On return `start_pos` marks where the token begins and `text_pos` sits just past it.
/// Whitespace inside a line and comments are skipped; newlines on blank lines or inside
/// brackets produce no token.
///
/// # Errors
/// Returns a `TokError` describing the first lexical problem encountered.
fn next_inner(&mut self) -> Result<TokType, TokError<'t>> {
// In split mode, when the innermost f-/t-string reports that it is not inside an expression,
// the upcoming text is either literal string content or the closing quote.
if self.split_ftstring {
if let Some(tos) = self.ftstring_stack.last() {
if !tos.is_in_expr() {
self.start_pos = (&self.text_pos).into();
let is_in_format_spec = tos.is_in_format_spec();
let is_raw_string = tos.is_raw_string;
if let Some(tok) =
self.maybe_consume_ftstring_string(is_in_format_spec, is_raw_string)?
{
return Ok(tok);
}
if let Some(tok) = self.maybe_consume_ftstring_end() {
return Ok(tok);
}
}
}
}
// At the beginning of a line, measure indentation; this may queue Indent/Dedent tokens.
self.consume_bol_whitespace()?;
if let Some(t) = self.process_pending_indents() {
self.start_pos = (&self.text_pos).into();
return Ok(t);
}
self.maybe_close_async_def();
// The loop body runs again only after a backslash line continuation.
'again: loop {
// Skip whitespace inside the line, then a comment up to (not including) the line end.
SPACE_TAB_FORMFEED_RE.with(|v| self.text_pos.consume(v));
if self.text_pos.peek() == Some('#') {
ANY_NON_NEWLINE_RE.with(|v| self.text_pos.consume(v));
}
self.start_pos = (&self.text_pos).into();
return match self.text_pos.peek() {
// End of input.
None => {
// Input without a final newline: emit one Newline first, unless the line is blank.
if self.missing_nl_before_eof && !self.blank_line {
self.at_bol = true;
self.missing_nl_before_eof = false;
Ok(TokType::Newline)
} else {
// Close every indentation level that is still open, one Dedent per call.
let hanging_indents = self.indent_stack.len() as i32;
if self.pending_indents == 0 && hanging_indents != 0 {
self.pending_indents = -hanging_indents;
self.indent_stack.clear();
self.alt_indent_stack.clear();
self.missing_nl_before_eof = false;
}
if let Some(t) = self.process_pending_indents() {
Ok(t)
} else {
Ok(TokType::EndMarker)
}
}
}
// ASCII letter, underscore or any non-ASCII character: identifier, keyword or
// prefixed string.
Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR) => {
self.consume_identifier_or_prefixed_string()
}
Some('\n') => {
self.text_pos.next();
self.at_bol = true;
// A newline inside the expression part of an f-/t-string that does not allow
// multiple lines terminates the string prematurely.
if self.split_ftstring
&& self
.ftstring_stack
.last()
.map(|node| node.allow_multiline())
== Some(false)
{
Err(TokError::UnterminatedString)
} else if self.blank_line || !self.paren_stack.is_empty() {
// Not a logical line end: fetch the token that follows instead.
self.next_inner()
} else {
self.cont_line = false;
if self.async_def {
self.async_def_nl = true;
}
Ok(TokType::Newline)
}
}
// `...` is a single operator token.
Some('.') if self.text_pos.consume("...") => {
return Ok(TokType::Op);
}
// A dot followed by a digit starts a number such as `.5`.
Some('.') if DECIMAL_DOT_DIGIT_RE.with(|r| self.text_pos.matches(r)) => {
self.consume_number(NumberState::Fraction)
}
Some('.') => {
self.text_pos.next();
Ok(TokType::Op)
}
Some('0'..='9') => self.consume_number(NumberState::StartDigit),
Some('\'') | Some('"') => self.consume_string(),
// Line continuation: the backslash must be followed by a newline, and more input must
// follow that newline.
Some('\\') => {
self.text_pos.next();
if let Some('\n') = self.text_pos.next() {
if self.text_pos.peek() == None {
Err(TokError::LineContinuationEof)
} else {
self.cont_line = true;
continue 'again;
}
} else {
Err(TokError::LineContinuation)
}
}
// Opening bracket: remember it together with the current line number, and inform the
// innermost f-/t-string if there is one.
Some(ch @ '(') | Some(ch @ '[') | Some(ch @ '{') => {
self.text_pos.next();
if let Some(tos) = self.ftstring_stack.last_mut() {
tos.open_parentheses();
}
self.paren_stack.push((ch, self.text_pos.line_number()));
Ok(TokType::Op)
}
// Closing bracket: it must pair with the most recently opened bracket.
Some(closing @ ')') | Some(closing @ ']') | Some(closing @ '}') => {
self.text_pos.next();
if let Some(tos) = self.ftstring_stack.last_mut() {
tos.close_parentheses();
}
if let Some((opening, line_number)) = self.paren_stack.pop() {
match (opening, closing) {
('(', ')') | ('[', ']') | ('{', '}') => Ok(TokType::Op),
_ => {
// Mention the opening line only when it differs from the current one.
if line_number != self.text_pos.line_number() {
Err(TokError::MismatchedClosingParenOnLine(
opening,
closing,
line_number,
))
} else {
Err(TokError::MismatchedClosingParen(opening, closing))
}
}
}
} else {
Err(TokError::UnmatchedClosingParen(closing))
}
}
// A colon while the innermost f-/t-string has exactly one more open bracket than
// started format specs: count it as the start of a format spec.
Some(':')
if self
.ftstring_stack
.last()
.map(|tos| tos.parentheses_count - tos.format_spec_count == 1)
.unwrap_or(false) =>
{
let tos = self
.ftstring_stack
.last_mut()
.expect("ftstring_stack is not empty");
tos.format_spec_count += 1;
self.text_pos.next();
Ok(TokType::Op)
}
// Any other operator; anything that is still unmatched cannot start a token.
Some(_) if OPERATOR_RE.with(|r| self.text_pos.consume(r)) => Ok(TokType::Op),
Some(ch) => Err(TokError::BadCharacter(ch)),
};
}
}
/// Measures the indentation of a fresh line and queues the resulting Indent/Dedent tokens.
///
/// Does nothing beyond clearing `blank_line` unless the cursor is at the beginning of a line.
/// Two columns are tracked: one expanding tabs to `tab_size`, one to `alt_tab_size`; a
/// disagreement between them signals inconsistent use of tabs and spaces. Blank lines and
/// lines inside brackets never change the indentation.
///
/// # Errors
/// `TabSpace`, `TooDeep` or `Dedent` when the indentation is inconsistent, too deep, or does
/// not match an enclosing level.
fn consume_bol_whitespace(&mut self) -> Result<(), TokError<'t>> {
    self.blank_line = false;
    if !self.at_bol {
        return Ok(());
    }
    self.at_bol = false;
    self.bol_width = 0;

    let mut col = 0;
    let mut altcol = 0;
    while let Some(ch) = self.text_pos.peek() {
        match ch {
            ' ' => {
                col += 1;
                altcol += 1;
            }
            '\t' => {
                // Tabs advance each column to its next tab stop.
                col = (col / self.tab_size + 1) * self.tab_size;
                altcol = (altcol / self.alt_tab_size + 1) * self.alt_tab_size;
            }
            '\x0c' => {
                // A form feed resets both columns.
                col = 0;
                altcol = 0;
            }
            _ => break,
        }
        self.bol_width += 1;
        self.text_pos.next();
    }

    // Lines with nothing but whitespace, a comment or a continuation carry no indentation.
    self.blank_line = match self.text_pos.peek() {
        None => true,
        Some(ch) => matches!(ch, '#' | '\n' | '\\'),
    };
    if self.blank_line || !self.paren_stack.is_empty() {
        return Ok(());
    }

    let prev_col = self.indent_stack.last().copied().unwrap_or(0);
    let prev_altcol = self.alt_indent_stack.last().copied().unwrap_or(0);
    match col.cmp(&prev_col) {
        Ordering::Equal => {
            if altcol != prev_altcol {
                return Err(TokError::TabSpace);
            }
        }
        Ordering::Greater => {
            if self.indent_stack.len() + 1 >= MAX_INDENT {
                return Err(TokError::TooDeep);
            }
            if altcol <= prev_altcol {
                return Err(TokError::TabSpace);
            }
            if self.text_pos.peek().is_some() {
                self.pending_indents += 1;
                self.indent_stack.push(col);
                self.alt_indent_stack.push(altcol);
            }
        }
        Ordering::Less => {
            // Pop every level that is deeper than the new column.
            while let Some(&top) = self.indent_stack.last() {
                if col >= top {
                    break;
                }
                self.pending_indents -= 1;
                self.indent_stack.pop();
                self.alt_indent_stack.pop();
            }
            let enclosing_col = self.indent_stack.last().copied().unwrap_or(0);
            if col != enclosing_col {
                return Err(TokError::Dedent);
            }
            let enclosing_altcol = self.alt_indent_stack.last().copied().unwrap_or(0);
            if altcol != enclosing_altcol {
                return Err(TokError::TabSpace);
            }
        }
    }
    Ok(())
}
fn process_pending_indents(&mut self) -> Option<TokType> {
if self.pending_indents != 0 {
if self.pending_indents < 0 {
self.pending_indents += 1;
Some(TokType::Dedent)
} else {
self.pending_indents -= 1;
Some(TokType::Indent)
}
} else {
None
}
}
/// Leaves `async def` mode once the function body has ended.
///
/// That is the case when a newline has been seen since `async def`, the current line is
/// neither blank nor inside brackets, and the indentation depth is no greater than the depth
/// recorded when `async def` was recognised.
fn maybe_close_async_def(&mut self) {
    if !self.async_def || !self.async_def_nl {
        return;
    }
    if self.blank_line || !self.paren_stack.is_empty() {
        return;
    }
    if self.async_def_indent < self.indent_stack.len() {
        return;
    }
    self.async_def = false;
    self.async_def_indent = 0;
    self.async_def_nl = false;
}
/// Consumes an identifier, a keyword, or a string literal that carries a prefix.
///
/// If a string prefix is directly followed by a quote, the string is consumed: as a single
/// `String` token, or, in split mode with an `f`/`t` prefix, as the start token of an
/// f-/t-string. Otherwise the text is read as an identifier and validated. `async`/`await`
/// become keyword tokens when allowed; with `async_hacks` enabled, `async` outside an
/// `async def` is only a keyword when the next token is the name `def`.
///
/// # Errors
/// `BadIdentifier` when the scanned text is not a valid identifier, plus any error raised
/// while consuming a string.
fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
    let consumed_prefix = STRING_PREFIX_RE.with(|r| self.text_pos.consume(r));
    if !consumed_prefix {
        let first_ch = self.text_pos.next();
        debug_assert!(matches!(
            first_ch,
            Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
        ));
    } else if matches!(self.text_pos.peek(), Some('"' | '\'')) {
        if self.split_ftstring {
            // The first f/t marker in the prefix decides between f-string and t-string.
            let prefix = self.text_pos.slice_from_start_pos(&self.start_pos);
            let templated = prefix.chars().find_map(|c| match c {
                'f' | 'F' => Some(FTStringType::FString),
                't' | 'T' => Some(FTStringType::TString),
                _ => None,
            });
            if let Some(str_type) = templated {
                return self.consume_prefixed_string_start(str_type);
            }
        }
        return self.consume_string();
    }
    // Either no prefix matched, or the "prefix" was really the start of an identifier.
    POTENTIAL_IDENTIFIER_TAIL_RE.with(|r| self.text_pos.consume(r));
    let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
    if !verify_identifier(identifier_str) {
        return Err(TokError::BadIdentifier(identifier_str));
    }

    let allow_async = !self.async_hacks || self.async_def;
    match identifier_str {
        "async" if allow_async => Ok(TokType::Async),
        "await" if allow_async => Ok(TokType::Await),
        "async" => {
            // Peek at the following token on a copy of the state, leaving `self` untouched.
            let mut lookahead_state = self.clone();
            let followed_by_def = lookahead_state.next_inner() == Ok(TokType::Name)
                && lookahead_state
                    .text_pos
                    .slice_from_start_pos(&lookahead_state.start_pos)
                    == "def";
            if followed_by_def {
                self.async_def = true;
                self.async_def_indent = self.indent_stack.len();
                Ok(TokType::Async)
            } else {
                Ok(TokType::Name)
            }
        }
        _ => Ok(TokType::Name),
    }
}
/// Consumes a numeric literal, starting at the part named by `state`, and returns
/// `TokType::Number`.
///
/// The function calls itself to move from one part of the literal to the next (integer part,
/// fraction, exponent, imaginary suffix).
///
/// # Errors
/// Returns the `Bad*` variant of `TokError` matching the malformed literal.
fn consume_number(&mut self, state: NumberState) -> Result<TokType, TokError<'t>> {
match state {
NumberState::StartDigit => {
let start_digit_ch = self.text_pos.peek();
debug_assert!(is_digit(start_digit_ch));
if start_digit_ch == Some('0') {
// A leading zero may introduce a hexadecimal, octal or binary literal.
self.text_pos.next();
match self.text_pos.peek() {
Some('x') | Some('X') => {
self.text_pos.next();
// At least one digit is required, and a trailing underscore is rejected.
if !HEXADECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r))
|| self.text_pos.peek() == Some('_')
{
Err(TokError::BadHexadecimal)
} else {
Ok(TokType::Number)
}
}
Some('o') | Some('O') => {
self.text_pos.next();
if !OCTAL_TAIL_RE.with(|r| self.text_pos.consume(r))
|| self.text_pos.peek() == Some('_')
{
return Err(TokError::BadOctal);
}
// A decimal digit right after the octal digits is reported by name.
if let Some(next_ch) = self.text_pos.peek() {
if is_digit(next_ch) {
return Err(TokError::BadOctalDigit(next_ch));
}
}
Ok(TokType::Number)
}
Some('b') | Some('B') => {
self.text_pos.next();
if !BINARY_TAIL_RE.with(|r| self.text_pos.consume(r))
|| self.text_pos.peek() == Some('_')
{
return Err(TokError::BadBinary);
}
// A decimal digit right after the binary digits is reported by name.
if let Some(next_ch) = self.text_pos.peek() {
if is_digit(next_ch) {
return Err(TokError::BadBinaryDigit(next_ch));
}
}
Ok(TokType::Number)
}
_ => {
// Plain zero, possibly written with several zeros and underscores (`0_0`).
let mut nonzero = false;
loop {
if self.text_pos.peek() == Some('_') {
self.text_pos.next();
if !is_digit(self.text_pos.peek()) {
return Err(TokError::BadDecimal);
}
}
if self.text_pos.peek() != Some('0') {
break;
}
self.text_pos.next();
}
// Further digits after the zeros are only acceptable when the literal turns out
// to be a float or an imaginary number.
if is_digit(self.text_pos.peek()) {
nonzero = true;
self.consume_decimal_tail()?;
}
if self.text_pos.peek() == Some('.') {
self.consume_number(NumberState::Fraction)
} else if let Some('e') | Some('E') = self.text_pos.peek() {
self.consume_number(NumberState::Exponent)
} else if let Some('j') | Some('J') = self.text_pos.peek() {
self.consume_number(NumberState::Imaginary)
} else if nonzero {
Err(TokError::BadDecimalLeadingZeros)
} else {
Ok(TokType::Number)
}
}
}
} else {
// Decimal literal starting with a non-zero digit.
self.consume_decimal_tail()?;
if self.text_pos.peek() == Some('.') {
self.consume_number(NumberState::Fraction)
} else if let Some('e') | Some('E') = self.text_pos.peek() {
self.consume_number(NumberState::Exponent)
} else if let Some('j') | Some('J') = self.text_pos.peek() {
self.consume_number(NumberState::Imaginary)
} else {
Ok(TokType::Number)
}
}
}
NumberState::Fraction => {
let dot_ch = self.text_pos.next();
debug_assert!(dot_ch == Some('.'));
// The digits after the dot are optional (`1.` is accepted).
if is_digit(self.text_pos.peek()) {
self.consume_decimal_tail()?;
}
if let Some('e') | Some('E') = self.text_pos.peek() {
self.consume_number(NumberState::Exponent)
} else if let Some('j') | Some('J') = self.text_pos.peek() {
self.consume_number(NumberState::Imaginary)
} else {
Ok(TokType::Number)
}
}
NumberState::Exponent => {
let e_ch = self.text_pos.next();
debug_assert!(matches!(e_ch, Some('e') | Some('E')));
if let Some('+') | Some('-') = self.text_pos.peek() {
// A sign must be followed by at least one digit.
self.text_pos.next();
if !is_digit(self.text_pos.peek()) {
return Err(TokError::BadDecimal);
}
} else if !is_digit(self.text_pos.peek()) {
// Not an exponent after all: step back over the `e` and end the number before it.
self.text_pos.backup_no_newline();
return Ok(TokType::Number);
}
self.consume_decimal_tail()?;
if let Some('j') | Some('J') = self.text_pos.peek() {
self.consume_number(NumberState::Imaginary)
} else {
Ok(TokType::Number)
}
}
NumberState::Imaginary => {
// The suffix is the last character of the literal.
let j_ch = self.text_pos.next();
debug_assert!(matches!(j_ch, Some('j') | Some('J')));
Ok(TokType::Number)
}
}
}
/// Consumes a run of decimal digits in which single underscores may separate digits.
///
/// The caller must make sure the next character is a decimal digit; this is checked by a
/// debug assertion.
///
/// # Errors
/// `TokError::BadDecimal` when the digit run is directly followed by an underscore.
fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
    let consumed = DECIMAL_TAIL_RE.with(|r| self.text_pos.consume(r));
    debug_assert!(consumed, "consume_decimal_tail was called on a non-digit char");
    // The regex never ends on an underscore, so one left here is misplaced.
    if self.text_pos.peek() == Some('_') {
        return Err(TokError::BadDecimal);
    }
    Ok(())
}
/// Consumes the opening quote(s) of a string and reports which quote character was used and
/// whether it was tripled.
///
/// # Panics
/// Panics if the next character is not a quote character.
fn consume_open_quote(&mut self) -> (StringQuoteChar, StringQuoteSize) {
    let quote_char: StringQuoteChar = self
        .text_pos
        .peek()
        .try_into()
        .expect("the next character must be a quote when calling consume_open_quote");
    // Try the triple form first; otherwise exactly one quote character is consumed.
    if self.text_pos.consume(quote_char.triple_str()) {
        return (quote_char, StringQuoteSize::Triple);
    }
    self.text_pos.next();
    (quote_char, StringQuoteSize::Single)
}
/// Consumes a whole string literal, from its opening quote through its closing quote. Any
/// prefix must already have been consumed.
///
/// # Errors
/// `UnterminatedString` when a single-quoted string meets a newline or the end of input;
/// `UnterminatedTripleQuotedString` when a triple-quoted string meets the end of input.
fn consume_string(&mut self) -> Result<TokType, TokError<'t>> {
    let (quote_char, quote_size) = self.consume_open_quote();
    let quote_raw: char = quote_char.into();
    let closing_len: usize = quote_size.into();

    // Counts consecutive closing quote characters seen so far.
    let mut matched_quotes: usize = 0;
    while matched_quotes != closing_len {
        match self.text_pos.next() {
            None => {
                return Err(match quote_size {
                    StringQuoteSize::Triple => TokError::UnterminatedTripleQuotedString,
                    StringQuoteSize::Single => TokError::UnterminatedString,
                });
            }
            Some('\n') if quote_size == StringQuoteSize::Single => {
                return Err(TokError::UnterminatedString);
            }
            Some(ch @ ('\'' | '"')) if ch == quote_raw => {
                matched_quotes += 1;
            }
            Some(ch) => {
                matched_quotes = 0;
                if ch == '\\' {
                    // The character after a backslash never terminates the string.
                    self.text_pos.next();
                }
            }
        }
    }
    Ok(TokType::String)
}
/// Consumes the opening quote of an f-/t-string whose prefix has already been consumed, pushes
/// a node describing it onto `ftstring_stack`, and returns the matching start token.
fn consume_prefixed_string_start(
    &mut self,
    str_type: FTStringType,
) -> Result<TokType, TokError<'t>> {
    let (quote_char, quote_size) = self.consume_open_quote();
    // The token text so far is the prefix plus the quote(s); an `r`/`R` marks a raw string.
    let is_raw_string = self
        .text_pos
        .slice_from_start_pos(&self.start_pos)
        .chars()
        .any(|c| matches!(c, 'r' | 'R'));
    let start_token = match str_type {
        FTStringType::FString => TokType::FStringStart,
        FTStringType::TString => TokType::TStringStart,
    };
    let node = FTStringNode::new(quote_char, quote_size, is_raw_string, str_type);
    self.ftstring_stack.push(node);
    Ok(start_token)
}
/// Consumes literal text of the innermost f-/t-string, up to the next replacement field, the
/// closing quote, or (inside a format spec) the next brace.
///
/// Returns `Ok(None)` when no text was consumed, otherwise `FStringString` or `TStringString`
/// according to the type of the innermost string.
///
/// `is_in_format_spec` and `is_raw_string` describe the innermost string and are supplied by
/// the caller.
///
/// # Errors
/// `UnterminatedString` / `UnterminatedTripleQuotedString` when the text runs into a newline
/// or the end of input; `UnmatchedClosingParen('}')` for a lone `}` in literal text.
fn maybe_consume_ftstring_string(
&mut self,
is_in_format_spec: bool,
is_raw_string: bool,
) -> Result<Option<TokType>, TokError<'t>> {
let allow_multiline = self
.ftstring_stack
.last()
.map(|node| node.allow_multiline())
== Some(true);
let str_type = self
.ftstring_stack
.last()
.map(|node| node.string_type.clone());
// True while inside the braces of a `\N{...}` escape.
let mut in_named_unicode: bool = false;
// Stays `Ok(None)` until at least one loop iteration has consumed some text.
let mut ok_result = Ok(None); 'outer: loop {
match (self.text_pos.peek(), allow_multiline) {
(None, true) => {
return Err(TokError::UnterminatedTripleQuotedString);
}
(None, false) | (Some('\n'), false) => {
return Err(TokError::UnterminatedString);
}
// A quote ends the text only if it is this string's closing quote; for triple-quoted
// strings all three quote characters must follow. Any other quote is ordinary text.
(ch @ Some('\''), _) | (ch @ Some('"'), _) => {
if let Some(node) = self.ftstring_stack.last() {
if ch == Some(node.quote_char.into()) {
match node.quote_size {
StringQuoteSize::Single => {
break 'outer;
}
StringQuoteSize::Triple => {
if self.text_pos.matches(node.quote_char.triple_str()) {
break 'outer;
}
}
}
}
}
self.text_pos.next();
}
// Backslash in a non-raw string.
(Some('\\'), _) if !is_raw_string => {
self.text_pos.next();
if is_in_format_spec {
// Inside a format spec the escaped character is consumed unless it is a brace.
if let Some('{') | Some('}') = self.text_pos.peek() {
} else {
self.text_pos.next();
}
} else if let Some(
'\n'
| '\\'
| '\''
| '"'
| 'a'
| 'b'
| 'f'
| 'n'
| 'r'
| 't'
| 'v'
| 'x'
| '0'..='9'
| 'N'
| 'u'
| 'U',
) = self.text_pos.peek()
{
// One of the listed escape characters: consume it together with the backslash.
let next_ch = self.text_pos.next();
if let Some('N') = next_ch {
// `\N{` opens a named escape whose closing `}` is part of the text.
if let Some('{') = self.text_pos.peek() {
in_named_unicode = true;
self.text_pos.next();
}
}
}
}
// Backslash in a raw string: a following quote or backslash is consumed with it.
(Some('\\'), _) if is_raw_string => {
self.text_pos.next();
if let Some('"' | '\'' | '\\') = self.text_pos.peek() {
self.text_pos.next();
}
}
// `{{` is literal text; a single `{` ends the text. Inside a format spec any `{` ends
// the text.
(Some('{'), _) => {
if is_in_format_spec {
break 'outer;
}
let consumed_double = self.text_pos.consume("{{");
if !consumed_double {
break 'outer;
}
}
// `}` closes a `\N{...}` escape, ends a format spec, or must be doubled as `}}`.
(Some('}'), _) => {
if in_named_unicode {
in_named_unicode = false;
self.text_pos.next();
} else if is_in_format_spec {
break 'outer;
} else if !self.text_pos.consume("}}") {
return Err(TokError::UnmatchedClosingParen('}'));
}
}
_ => {
self.text_pos.next();
}
}
// Reached only when the iteration did not break or return: record that a text token
// exists.
ok_result = match str_type {
Some(FTStringType::FString) => Ok(Some(TokType::FStringString)),
Some(FTStringType::TString) => Ok(Some(TokType::TStringString)),
None => unreachable!("We should always have a string type"),
};
}
ok_result
}
/// Consumes the closing quote of the innermost f-/t-string if the cursor is on its quote
/// character, pops that string off `ftstring_stack` and returns the matching end token.
/// Returns `None` when there is no open f-/t-string or the next character is not its quote.
fn maybe_consume_ftstring_end(&mut self) -> Option<TokType> {
    let ch = self.text_pos.peek();
    let node = self.ftstring_stack.last()?;
    if ch != Some(node.quote_char.into()) {
        return None;
    }
    match node.quote_size {
        StringQuoteSize::Triple => {
            self.text_pos.consume(node.quote_char.triple_str());
        }
        StringQuoteSize::Single => {
            self.text_pos.next();
        }
    }
    let tok_type = match node.string_type {
        FTStringType::FString => TokType::FStringEnd,
        FTStringType::TString => TokType::TStringEnd,
    };
    self.ftstring_stack.pop();
    Some(tok_type)
}
}
impl<'t> Iterator for TokState<'t> {
    type Item = Result<TokType, TokError<'t>>;

    /// Yields token types until an error or `EndMarker` has been produced; that final item is
    /// still yielded, after which the iterator returns `None`.
    fn next(&mut self) -> Option<Result<TokType, TokError<'t>>> {
        if self.done {
            return None;
        }
        let outcome = self.next_inner();
        if matches!(outcome, Err(_) | Ok(TokType::EndMarker)) {
            self.done = true;
        }
        Some(outcome)
    }
}
/// Decides whether `name` is acceptable as an identifier. Pure-ASCII names are accepted
/// without further checks; anything else must match the Unicode identifier pattern in full.
fn verify_identifier(name: &str) -> bool {
    if name.is_ascii() {
        return true;
    }
    UNICODE_IDENTIFIER_RE.with(|re| re.is_match(name))
}
/// A token together with its source text, position and surrounding whitespace state, as
/// produced by `TokenIterator`.
#[derive(Clone)]
pub struct Token<'a> {
/// The kind of token.
pub r#type: TokType,
/// The slice of the source text covered by the token.
pub string: &'a str,
/// Position where the token starts.
pub start_pos: TextPositionSnapshot,
/// Position just past the token.
pub end_pos: TextPositionSnapshot,
/// Whitespace state shared with the preceding token's `whitespace_after`.
pub whitespace_before: Rc<RefCell<WhitespaceState<'a>>>,
/// Whitespace state located at the end of this token; `Indent`, `Dedent` and `EndMarker`
/// tokens reuse `whitespace_before` here.
pub whitespace_after: Rc<RefCell<WhitespaceState<'a>>>,
/// For `Indent` tokens, the indentation added relative to the enclosing level; `None` for
/// every other token type.
pub relative_indent: Option<&'a str>,
}
impl<'a> Debug for Token<'a> {
    /// Writes the token as `Token(type, text, start=.., end=.., relative_indent=..,
    /// ws_before=.., ws_after=..)`.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        // The closing parenthesis balances the opening `Token(`.
        write!(
            f,
            "Token({:?}, {}, start={:?}, end={:?}, relative_indent={:?}, ws_before={:?}, ws_after={:?})",
            self.r#type,
            self.string,
            self.start_pos,
            self.end_pos,
            self.relative_indent,
            self.whitespace_before,
            self.whitespace_after
        )
    }
}
// Equality that treats every pair of tokens as equal; no field is compared.
// NOTE(review): this looks like a placeholder that exists only to satisfy an `Eq`/`PartialEq`
// bound required elsewhere — confirm before relying on `==` to compare tokens.
impl<'a> PartialEq for Token<'a> {
fn eq(&self, _other: &Self) -> bool {
true
}
}
impl<'a> Eq for Token<'a> {}
/// Iterator that wraps `TokState` and yields full `Token` values, tracking the indentation
/// text and the whitespace state shared between neighbouring tokens.
pub struct TokenIterator<'a> {
/// The `whitespace_after` state of the most recently produced token, if any.
previous_whitespace: Option<Rc<RefCell<WhitespaceState<'a>>>>,
/// The underlying tokenizer.
core_state: TokState<'a>,
/// Indentation text of every currently open indentation level, outermost first.
absolute_indents: Vec<&'a str>,
}
impl<'a> TokenIterator<'a> {
    /// Creates an iterator over the tokens of `module_text`, tokenized according to `config`.
    pub fn new(module_text: &'a str, config: &TokConfig) -> Self {
        let core_state = TokState::new(module_text, config);
        Self {
            core_state,
            previous_whitespace: None,
            absolute_indents: Vec::new(),
        }
    }
}
impl<'a> Iterator for TokenIterator<'a> {
    type Item = Result<Token<'a>, TokError<'a>>;

    /// Produces the next token, or the tokenizer's error, or `None` once the underlying
    /// tokenizer is exhausted.
    ///
    /// `Indent` tokens carry the indentation added relative to the enclosing level, and fail
    /// with `TokError::Dedent` when the new indentation text does not extend the enclosing
    /// one. `Indent` and `Dedent` also update the absolute indent recorded in the previous
    /// token's trailing whitespace state.
    fn next(&mut self) -> Option<Self::Item> {
        let tok_type = match self.core_state.next()? {
            Ok(tok_type) => tok_type,
            Err(err) => return Some(Err(err)),
        };

        let relative_indent = match tok_type {
            TokType::Indent => {
                // The indentation text is the whitespace just consumed at the line start.
                let indent_end = self.core_state.text_pos.byte_idx();
                let indent_start = indent_end - self.core_state.bol_width;
                let absolute_indent =
                    &self.core_state.text_pos.text()[indent_start..indent_end];
                let relative_indent = match self.absolute_indents.last() {
                    None => absolute_indent,
                    Some(&enclosing) => match absolute_indent.strip_prefix(enclosing) {
                        Some(added) => added,
                        None => return Some(Err(TokError::Dedent)),
                    },
                };
                self.absolute_indents.push(absolute_indent);
                if let Some(ws) = &self.previous_whitespace {
                    ws.borrow_mut().absolute_indent = absolute_indent;
                }
                Some(relative_indent)
            }
            TokType::Dedent => {
                self.absolute_indents.pop();
                if let Some(ws) = &self.previous_whitespace {
                    ws.borrow_mut().absolute_indent =
                        self.absolute_indents.last().copied().unwrap_or("");
                }
                None
            }
            _ => None,
        };

        let text_pos = &self.core_state.text_pos;
        let whitespace_before = self.previous_whitespace.clone().unwrap_or_default();
        // Zero-width structural tokens share the whitespace state on both sides.
        let shares_whitespace = matches!(
            tok_type,
            TokType::Indent | TokType::Dedent | TokType::EndMarker
        );
        let whitespace_after = if shares_whitespace {
            Rc::clone(&whitespace_before)
        } else {
            Rc::new(RefCell::new(WhitespaceState {
                line: text_pos.line_number(),
                column: text_pos.char_column_number(),
                column_byte: text_pos.byte_column_number(),
                byte_offset: text_pos.byte_idx(),
                absolute_indent: self.absolute_indents.last().copied().unwrap_or(""),
                is_parenthesized: self.core_state.is_parenthesized(),
            }))
        };
        self.previous_whitespace = Some(Rc::clone(&whitespace_after));

        Some(Ok(Token {
            r#type: tok_type,
            string: text_pos.slice_from_start_pos(&self.core_state.start_pos),
            start_pos: self.core_state.start_pos.clone(),
            end_pos: text_pos.into(),
            whitespace_before,
            whitespace_after,
            relative_indent,
        }))
    }
}