use std::convert::TryFrom;
use std::error::Error;
use std::fmt::{self, Display};
use std::sync::Arc;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::types::FormulaDialect;
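/// Single-byte characters that terminate an in-progress operand token.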
const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%@";
/// Builds a 256-entry lookup table marking each token-ending byte.
const fn build_token_enders() -> [bool; 256] {
let mut tbl = [false; 256];
let bytes = TOKEN_ENDERS.as_bytes();
let mut i = 0;
while i < bytes.len() {
tbl[bytes[i] as usize] = true;
i += 1;
}
tbl
}
static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();
#[inline(always)]
fn is_token_ender(c: u8) -> bool {
TOKEN_ENDERS_TABLE[c as usize]
}
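/// Error literals recognized by the tokenizer, listed longest-first so the
/// longest candidate is tried before shorter ones.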
static ERROR_CODES: &[&str] = &[
"#GETTING_DATA",
"#DIV/0!",
"#VALUE!",
"#SPILL!",
"#NAME?",
"#NULL!",
"#CALC!",
"#NUM!",
"#REF!",
"#N/A",
];
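/// Operator associativity, as reported by `Token::get_precedence`.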
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
Left,
Right,
}
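/// Fatal tokenization failure, carrying a byte offset into the formula.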
#[derive(Debug)]
pub struct TokenizerError {
pub message: String,
pub pos: usize,
}
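/// How the best-effort tokenizer recovered from a malformed construct.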
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RecoveryAction {
    /// An unmatched or mismatched closer was kept as a plain operand span.
    SkippedUnmatchedCloser,
    /// A string ran to the end of the formula without a closing quote.
    UnterminatedString,
    /// A `[` was never closed before the end of the formula.
    UnmatchedBracket,
    /// A `#...` sequence did not match any known error code.
    InvalidErrorLiteral,
    /// An opener was still unmatched when the end of input was reached.
    UnmatchedOpener,
}
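/// A recoverable problem found during best-effort tokenization.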
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenDiagnostic {
pub span: TokenSpan,
pub message: String,
pub recovery: RecoveryAction,
}
impl TokenDiagnostic {
fn new(span: TokenSpan, message: String, recovery: RecoveryAction) -> Self {
Self {
span,
message,
recovery,
}
}
}
#[derive(Debug, Clone)]
struct SpanTokenizerError {
kind: SpanTokenizerErrorKind,
pos: usize,
message: String,
span_start: Option<usize>,
span_end: Option<usize>,
}
#[derive(Debug, Clone, Copy)]
enum SpanTokenizerErrorKind {
NoMatchingOpener,
UnmatchedOpening,
UnterminatedString,
UnmatchedBracket,
MismatchedPair,
InvalidErrorLiteral,
}
impl From<SpanTokenizerError> for TokenizerError {
fn from(value: SpanTokenizerError) -> Self {
TokenizerError {
message: value.message,
pos: value.pos,
}
}
}
impl fmt::Display for TokenizerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "TokenizerError: {}", self.message)
}
}
impl Error for TokenizerError {}
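/// Coarse category of a lexed token.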
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
Literal,
Operand,
Func,
Array,
Paren,
Sep,
OpPrefix,
OpInfix,
OpPostfix,
Whitespace,
}
impl Display for TokenType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{self:?}")
}
}
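/// Finer-grained classification within a `TokenType`.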
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
None,
Text,
Number,
Logical,
Error,
Range,
Open,
Close,
Arg,
Row,
}
impl Display for TokenSubType {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{self:?}")
}
}
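/// An owned token: its text plus its byte span within the source formula.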
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Token {
    pub value: String,
    pub token_type: TokenType,
pub subtype: TokenSubType,
pub start: usize,
pub end: usize,
}
impl Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"<{} subtype: {:?} value: {}>",
self.token_type, self.subtype, self.value
)
}
}
impl Token {
pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
Token {
value,
token_type,
subtype,
start: 0,
end: 0,
}
}
pub fn new_with_span(
value: String,
token_type: TokenType,
subtype: TokenSubType,
start: usize,
end: usize,
) -> Self {
Token {
value,
token_type,
subtype,
start,
end,
}
}
fn from_slice(
source: &str,
token_type: TokenType,
subtype: TokenSubType,
start: usize,
end: usize,
) -> Self {
Token {
value: source[start..end].to_string(),
token_type,
subtype,
start,
end,
}
}
pub fn is_operator(&self) -> bool {
matches!(
self.token_type,
TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
)
}
pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
let op = if self.token_type == TokenType::OpPrefix {
"u"
} else {
self.value.as_str()
};
        match op {
            "#" => Some((11, Associativity::Left)), // spilled-range postfix
            ":" => Some((10, Associativity::Left)), // range
            " " => Some((9, Associativity::Left)),  // intersection
            "," => Some((8, Associativity::Left)),  // union
            "%" => Some((7, Associativity::Left)),  // percent postfix
            "u" => Some((6, Associativity::Right)), // unary prefix
            "^" => Some((5, Associativity::Right)),
            "*" | "/" => Some((4, Associativity::Left)),
            "+" | "-" => Some((3, Associativity::Left)),
            "&" => Some((2, Associativity::Left)),  // concatenation
            "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
            _ => None,
        }
}
    pub fn make_operand(value: String) -> Self {
        let subtype = operand_subtype(&value);
        Token::new(value, TokenType::Operand, subtype)
    }
    pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
        let subtype = operand_subtype(&value);
        Token::new_with_span(value, TokenType::Operand, subtype, start, end)
    }
    fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
        let subtype = operand_subtype(&source[start..end]);
        Token::from_slice(source, TokenType::Operand, subtype, start, end)
    }
pub fn make_subexp(value: &str, func: bool) -> Self {
let last_char = value.chars().last().expect("Empty token value");
assert!(matches!(last_char, '{' | '}' | '(' | ')'));
        let token_type = if func {
            TokenType::Func
        } else if matches!(last_char, '{' | '}') {
            TokenType::Array
        } else {
            TokenType::Paren
        };
let subtype = if ")}".contains(last_char) {
TokenSubType::Close
} else {
TokenSubType::Open
};
Token::new(value.to_string(), token_type, subtype)
}
pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
let last_char = value.chars().last().expect("Empty token value");
assert!(matches!(last_char, '{' | '}' | '(' | ')'));
        let token_type = if func {
            TokenType::Func
        } else if matches!(last_char, '{' | '}') {
            TokenType::Array
        } else {
            TokenType::Paren
        };
let subtype = if ")}".contains(last_char) {
TokenSubType::Close
} else {
TokenSubType::Open
};
Token::new_with_span(value.to_string(), token_type, subtype, start, end)
}
fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
let value_str = &source[start..end];
let last_char = value_str.chars().last().expect("Empty token value");
let token_type = if func {
TokenType::Func
} else if "{}".contains(last_char) {
TokenType::Array
} else if "()".contains(last_char) {
TokenType::Paren
} else {
TokenType::Func
};
let subtype = if ")}".contains(last_char) {
TokenSubType::Close
} else {
TokenSubType::Open
};
Token::from_slice(source, token_type, subtype, start, end)
}
pub fn get_closer(&self) -> Result<Token, TokenizerError> {
if self.subtype != TokenSubType::Open {
return Err(TokenizerError {
message: "Token is not an opener".to_string(),
pos: 0,
});
}
let closer_value = if self.token_type == TokenType::Array {
"}"
} else {
")"
};
Ok(Token::make_subexp(
closer_value,
self.token_type == TokenType::Func,
))
}
pub fn make_separator(value: &str) -> Self {
assert!(value == "," || value == ";");
let subtype = if value == "," {
TokenSubType::Arg
} else {
TokenSubType::Row
};
Token::new(value.to_string(), TokenType::Sep, subtype)
}
pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
assert!(value == "," || value == ";");
let subtype = if value == "," {
TokenSubType::Arg
} else {
TokenSubType::Row
};
Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
}
}
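/// A token's classification and byte range into the source formula.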
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenSpan {
pub token_type: TokenType,
pub subtype: TokenSubType,
pub start: usize,
pub end: usize,
}
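/// A borrowed view pairing a `TokenSpan` with its source text.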
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenView<'a> {
pub span: &'a TokenSpan,
pub value: &'a str,
}
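/// A tokenized formula whose spans index into a shared `Arc<str>` source,
/// avoiding per-token `String` allocations.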
#[derive(Debug, Clone)]
pub struct TokenStream {
source: Arc<str>,
pub spans: Vec<TokenSpan>,
dialect: FormulaDialect,
diagnostics: Vec<TokenDiagnostic>,
}
impl TokenStream {
pub fn new(formula: &str) -> Result<Self, TokenizerError> {
Self::new_with_dialect(formula, FormulaDialect::Excel)
}
pub fn new_with_dialect(
formula: &str,
dialect: FormulaDialect,
) -> Result<Self, TokenizerError> {
let source: Arc<str> = Arc::from(formula);
let spans = tokenize_spans_with_dialect(source.as_ref(), dialect)?;
Ok(TokenStream {
source,
spans,
dialect,
diagnostics: Vec::new(),
})
}
pub fn new_best_effort(formula: &str) -> Self {
Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
}
pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
let source: Arc<str> = Arc::from(formula);
let mut tokenizer = SpanTokenizer::new(source.as_ref(), dialect);
let spans = tokenizer.parse_best_effort();
let diagnostics = tokenizer.diagnostics;
TokenStream {
source,
spans,
dialect,
diagnostics,
}
}
pub fn diagnostics(&self) -> Vec<TokenDiagnostic> {
self.diagnostics.clone()
}
pub fn diagnostics_ref(&self) -> &[TokenDiagnostic] {
&self.diagnostics
}
pub fn has_errors(&self) -> bool {
!self.diagnostics.is_empty()
}
pub fn invalid_spans_iter(&self) -> impl Iterator<Item = &TokenSpan> {
self.spans.iter().filter(|span| {
self.diagnostics.iter().any(|diag| {
diag.span.start == span.start
&& diag.span.end == span.end
&& diag.span.token_type == span.token_type
})
})
}
pub fn invalid_spans(&self) -> Vec<&TokenSpan> {
self.invalid_spans_iter().collect()
}
pub fn source(&self) -> &str {
&self.source
}
pub fn dialect(&self) -> FormulaDialect {
self.dialect
}
pub fn len(&self) -> usize {
self.spans.len()
}
pub fn is_empty(&self) -> bool {
self.spans.is_empty()
}
pub fn get(&self, index: usize) -> Option<TokenView<'_>> {
let span = self.spans.get(index)?;
let value = self.source.get(span.start..span.end)?;
Some(TokenView { span, value })
}
pub fn to_tokens(&self) -> Vec<Token> {
self.spans
.iter()
.map(|s| {
let value = self
.source
.get(s.start..s.end)
.unwrap_or_default()
.to_string();
Token::new_with_span(value, s.token_type, s.subtype, s.start, s.end)
})
.collect()
}
pub fn render(&self) -> String {
let mut out = String::with_capacity(self.source.len());
for span in &self.spans {
if let Some(s) = self.source.get(span.start..span.end) {
out.push_str(s);
}
}
out
}
pub fn render_formula(&self) -> String {
if self.source.as_bytes().first() == Some(&b'=') {
format!("={}", self.render())
} else {
self.render()
}
}
}
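/// Tokenizes `formula` into spans under `dialect`, failing on the first error.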
pub(crate) fn tokenize_spans_with_dialect(
formula: &str,
dialect: FormulaDialect,
) -> Result<Vec<TokenSpan>, TokenizerError> {
let mut tokenizer = SpanTokenizer::new(formula, dialect);
tokenizer.parse()?;
Ok(tokenizer.spans)
}
fn operand_subtype(value_str: &str) -> TokenSubType {
if value_str.starts_with('"') {
TokenSubType::Text
} else if value_str.starts_with('#') {
TokenSubType::Error
} else if value_str.eq_ignore_ascii_case("TRUE") || value_str.eq_ignore_ascii_case("FALSE") {
TokenSubType::Logical
} else if value_str.parse::<f64>().is_ok() {
TokenSubType::Number
} else {
TokenSubType::Range
}
}
fn is_cell_reference_like(value: &str) -> bool {
let bytes = value.as_bytes();
let mut i = 0;
if i < bytes.len() && bytes[i] == b'$' {
i += 1;
}
let col_start = i;
while i < bytes.len() && bytes[i].is_ascii_alphabetic() {
i += 1;
}
if i == col_start {
return false;
}
if i < bytes.len() && bytes[i] == b'$' {
i += 1;
}
let row_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
i == bytes.len() && i > row_start
}
fn reference_value_contains_range_colon(value: &str) -> bool {
let value_part = value
.rsplit_once('!')
.map_or(value, |(_, value_part)| value_part);
value_part.contains(':')
}
fn is_reference_operand_value(value: &str) -> bool {
operand_subtype(value) == TokenSubType::Range
&& (reference_value_contains_range_colon(value)
|| value.contains('!')
|| value.contains('[')
|| is_cell_reference_like(value))
}
fn next_starts_reference_expression(formula: &str, mut offset: usize) -> bool {
let bytes = formula.as_bytes();
while offset < bytes.len() && matches!(bytes[offset], b' ' | b'\n') {
offset += 1;
}
if offset >= bytes.len() {
return false;
}
matches!(bytes[offset], b'(' | b'[' | b'\'' | b'$') || bytes[offset].is_ascii_alphabetic()
}
fn next_reference_has_sheet_qualifier(formula: &str, mut offset: usize) -> bool {
let bytes = formula.as_bytes();
while offset < bytes.len() && matches!(bytes[offset], b' ' | b'\n') {
offset += 1;
}
let mut in_quote = false;
while offset < bytes.len() {
match bytes[offset] {
b'\'' => {
if in_quote && offset + 1 < bytes.len() && bytes[offset + 1] == b'\'' {
offset += 2;
continue;
}
in_quote = !in_quote;
}
b'!' => return true,
b':' if !in_quote => return false,
b',' | b';' | b'}' | b')' | b' ' | b'\n' | b'+' | b'-' | b'*' | b'/' | b'^' | b'&'
| b'=' | b'>' | b'<' | b'%' | b'@'
if !in_quote =>
{
return false;
}
_ => {}
}
offset += 1;
}
false
}
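/// Internal span-producing tokenizer. `token_start..token_end` tracks the
/// operand currently being accumulated.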
struct SpanTokenizer<'a> {
formula: &'a str,
spans: Vec<TokenSpan>,
token_stack: Vec<TokenSpan>,
offset: usize,
token_start: usize,
token_end: usize,
dialect: FormulaDialect,
diagnostics: Vec<TokenDiagnostic>,
}
impl<'a> SpanTokenizer<'a> {
fn new(formula: &'a str, dialect: FormulaDialect) -> Self {
SpanTokenizer {
formula,
spans: Vec::with_capacity(formula.len() / 2),
token_stack: Vec::with_capacity(16),
offset: 0,
token_start: 0,
token_end: 0,
dialect,
diagnostics: Vec::new(),
}
}
#[inline]
fn current_byte(&self) -> Option<u8> {
self.formula.as_bytes().get(self.offset).copied()
}
#[inline]
fn has_token(&self) -> bool {
self.token_end > self.token_start
}
#[inline]
fn start_token(&mut self) {
self.token_start = self.offset;
self.token_end = self.offset;
}
#[inline]
fn extend_token(&mut self) {
self.token_end = self.offset;
}
fn push_span(
&mut self,
token_type: TokenType,
subtype: TokenSubType,
start: usize,
end: usize,
) {
self.spans.push(TokenSpan {
token_type,
subtype,
start,
end,
});
}
fn save_token(&mut self) {
if self.has_token() {
let value_str = &self.formula[self.token_start..self.token_end];
let subtype = operand_subtype(value_str);
self.push_span(
TokenType::Operand,
subtype,
self.token_start,
self.token_end,
);
}
}
fn check_scientific_notation(&mut self) -> bool {
if let Some(curr_byte) = self.current_byte() {
if (curr_byte == b'+' || curr_byte == b'-')
&& self.has_token()
&& self.is_scientific_notation_base()
&& self
.formula
.as_bytes()
.get(self.offset + 1)
.is_some_and(|b| b.is_ascii_digit())
{
self.offset += 1;
self.extend_token();
return true;
}
}
false
}
    /// True when the pending token looks like the start of a number in
    /// scientific notation: a digit, then digits with at most one dot, then a
    /// trailing `E`/`e`.
    fn is_scientific_notation_base(&self) -> bool {
if !self.has_token() {
return false;
}
let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
if token_slice.len() < 2 {
return false;
}
let last = token_slice[token_slice.len() - 1];
if !(last == b'E' || last == b'e') {
return false;
}
let first = token_slice[0];
if !first.is_ascii_digit() {
return false;
}
let mut dot_seen = false;
for &ch in &token_slice[1..token_slice.len() - 1] {
match ch {
b'0'..=b'9' => {}
b'.' if !dot_seen => dot_seen = true,
_ => return false,
}
}
true
}
fn parse(&mut self) -> Result<(), TokenizerError> {
self.parse_with_recovery(false).map_err(Into::into)
}
pub(crate) fn parse_best_effort(&mut self) -> Vec<TokenSpan> {
let _ = self.parse_with_recovery(true);
self.spans.clone()
}
fn parse_with_recovery(&mut self, best_effort: bool) -> Result<(), SpanTokenizerError> {
if self.formula.is_empty() {
return Ok(());
}
        // Anything that does not start with `=` is not a formula; emit the
        // whole input as a single literal token.
        if self.formula.as_bytes()[0] != b'=' {
self.push_span(
TokenType::Literal,
TokenSubType::None,
0,
self.formula.len(),
);
return Ok(());
}
self.offset = 1;
self.start_token();
while self.offset < self.formula.len() {
if self.check_scientific_notation() {
continue;
}
let curr_byte = self.formula.as_bytes()[self.offset];
if is_token_ender(curr_byte) && self.has_token() {
self.save_token();
self.start_token();
}
let parse_result = match curr_byte {
b'"' | b'\'' => self.parse_string(),
b'[' => self.parse_brackets(),
b'#' => {
if self.should_emit_hash_postfix() {
self.emit_hash_postfix();
Ok(())
} else {
self.parse_error()
}
}
b' ' | b'\n' => self.parse_whitespace(),
b':' => {
if self.should_emit_colon_infix() {
self.emit_infix_operator(self.offset, self.offset + 1);
Ok(())
} else {
if !self.has_token() {
self.start_token();
}
self.offset += 1;
self.extend_token();
Ok(())
}
}
b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
self.parse_operator()
}
b'{' | b'(' => self.parse_opener(),
b')' | b'}' => self.parse_closer(),
b';' | b',' => self.parse_separator(),
_ => {
if !self.has_token() {
self.start_token();
}
self.offset += 1;
self.extend_token();
Ok(())
}
};
if let Err(err) = parse_result {
if best_effort {
self.recover_from_error(err);
} else {
return Err(err);
}
}
}
if self.has_token() {
self.save_token();
}
if !self.token_stack.is_empty() {
if best_effort {
while let Some(open_token) = self.token_stack.pop() {
if let Some(span) = self.spans.iter().find(|span| {
span.start == open_token.start
&& span.end == open_token.end
&& span.token_type == open_token.token_type
&& span.subtype == open_token.subtype
}) {
self.diagnostics.push(TokenDiagnostic::new(
*span,
"Unmatched opening parenthesis or bracket".to_string(),
RecoveryAction::UnmatchedOpener,
));
}
}
} else {
return Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::UnmatchedOpening,
pos: self.offset,
message: "Unmatched opening parenthesis or bracket".to_string(),
span_start: None,
span_end: None,
});
}
}
Ok(())
}
fn recover_from_error(&mut self, error: SpanTokenizerError) {
match error.kind {
SpanTokenizerErrorKind::NoMatchingOpener => {
let span = TokenSpan {
token_type: TokenType::Operand,
subtype: TokenSubType::None,
start: error.pos,
end: error.pos + 1,
};
self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
self.offset = span.end;
self.start_token();
self.diagnostics.push(TokenDiagnostic::new(
span,
format!("No matching opener for closer at position {}", error.pos),
RecoveryAction::SkippedUnmatchedCloser,
));
}
SpanTokenizerErrorKind::UnmatchedOpening => {
debug_assert!(
false,
"UnmatchedOpening is handled at end-of-input and should not be routed through recover_from_error"
);
}
SpanTokenizerErrorKind::UnterminatedString => {
let start = error.span_start.unwrap_or(error.pos);
let span = TokenSpan {
token_type: TokenType::Operand,
subtype: TokenSubType::None,
start,
end: self.formula.len(),
};
self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
self.offset = span.end;
self.start_token();
self.diagnostics.push(TokenDiagnostic::new(
span,
"Reached end of formula while parsing string".to_string(),
RecoveryAction::UnterminatedString,
));
}
SpanTokenizerErrorKind::UnmatchedBracket => {
let start = error.span_start.unwrap_or(error.pos);
let end = error.span_end.unwrap_or(self.formula.len());
let span = TokenSpan {
token_type: TokenType::Operand,
subtype: TokenSubType::None,
start,
end,
};
self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
self.offset = span.end;
self.start_token();
self.diagnostics.push(TokenDiagnostic::new(
span,
"Encountered unmatched '['".to_string(),
RecoveryAction::UnmatchedBracket,
));
}
SpanTokenizerErrorKind::MismatchedPair => {
let span = TokenSpan {
token_type: TokenType::Operand,
subtype: TokenSubType::None,
start: error.pos,
end: error.pos + 1,
};
self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
self.offset = span.end;
self.start_token();
self.diagnostics.push(TokenDiagnostic::new(
span,
"Mismatched ( and { pair".to_string(),
RecoveryAction::SkippedUnmatchedCloser,
));
}
SpanTokenizerErrorKind::InvalidErrorLiteral => {
let start = error.span_start.unwrap_or(error.pos);
let end = error.span_end.unwrap_or(error.pos + 1);
let span = TokenSpan {
token_type: TokenType::Operand,
subtype: TokenSubType::None,
start,
end,
};
self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
self.offset = span.end;
self.start_token();
self.diagnostics.push(TokenDiagnostic::new(
span,
"Invalid error code".to_string(),
RecoveryAction::InvalidErrorLiteral,
));
}
}
}
fn parse_string(&mut self) -> Result<(), SpanTokenizerError> {
let delim = self.formula.as_bytes()[self.offset];
assert!(delim == b'"' || delim == b'\'');
        // A `'` right after a lone `$` token, or right after a trailing `:`,
        // begins a quoted sheet name that belongs to the reference being
        // built, so it is glued to the current token instead of becoming a
        // string operand.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';
        let glue_to_token = delim == b'\''
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] == b':';
if !is_dollar_ref && !glue_to_token && self.has_token() {
self.save_token();
self.start_token();
}
let string_start = if is_dollar_ref {
self.token_start
} else {
self.offset
};
self.offset += 1;
while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                // A doubled delimiter (`""` or `''`) is an escaped quote; the
                // string continues.
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    self.offset += 1;
} else {
if delim == b'"' {
let value_str = &self.formula[string_start..self.offset];
let subtype = operand_subtype(value_str);
self.push_span(TokenType::Operand, subtype, string_start, self.offset);
self.start_token();
} else {
self.token_end = self.offset;
}
return Ok(());
}
} else {
self.offset += 1;
}
}
Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::UnterminatedString,
pos: self.offset,
message: "Reached end of formula while parsing string".to_string(),
span_start: Some(string_start),
span_end: Some(self.formula.len()),
})
}
fn parse_brackets(&mut self) -> Result<(), SpanTokenizerError> {
assert_eq!(self.formula.as_bytes()[self.offset], b'[');
if !self.has_token() {
self.start_token();
}
let bracket_start = self.offset;
let mut open_count = 1;
self.offset += 1;
while self.offset < self.formula.len() {
match self.formula.as_bytes()[self.offset] {
                b'\'' => {
                    // `'` escapes the following character inside brackets
                    // (e.g. `'[` or `']` in structured-reference names).
                    if self.offset + 1 < self.formula.len() {
                        self.offset += 2;
                        continue;
                    }
                    self.offset += 1;
                    continue;
                }
b'[' => open_count += 1,
b']' => {
open_count -= 1;
if open_count == 0 {
self.offset += 1;
self.extend_token();
return Ok(());
}
}
_ => {}
}
self.offset += 1;
}
Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::UnmatchedBracket,
pos: self.offset,
message: "Encountered unmatched '['".to_string(),
span_start: Some(bracket_start),
span_end: Some(self.formula.len()),
})
}
fn parse_error(&mut self) -> Result<(), SpanTokenizerError> {
        // A trailing `!` means the pending token is a sheet qualifier for this
        // error literal; an empty qualifier (a bare `!`) is malformed.
        let has_sheet_prefix = self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] == b'!';
if has_sheet_prefix {
if self.token_end - self.token_start <= 1 {
return Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::InvalidErrorLiteral,
pos: self.offset,
message: format!(
"Empty sheet qualifier before error literal at position {}",
self.offset
),
span_start: Some(self.token_start),
span_end: Some(self.offset),
});
}
self.start_token();
} else if self.has_token() {
self.save_token();
self.start_token();
}
let error_start = self.offset;
for &err_code in ERROR_CODES {
let err_bytes = err_code.as_bytes();
if self.offset + err_bytes.len() <= self.formula.len() {
let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
if slice.eq_ignore_ascii_case(err_bytes) {
self.push_span(
TokenType::Operand,
TokenSubType::Error,
error_start,
self.offset + err_bytes.len(),
);
self.offset += err_bytes.len();
self.start_token();
return Ok(());
}
}
}
        // Scan to the end of the malformed literal so the diagnostic span
        // covers all of it.
        let mut end = self.offset + 1;
while end < self.formula.len() {
let ch = self.formula.as_bytes()[end];
if is_token_ender(ch)
|| ch == b' '
|| ch == b'\n'
|| ch == b'('
|| ch == b'{'
|| ch == b'['
|| ch == b'"'
|| ch == b'\''
{
break;
}
end += 1;
}
Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::InvalidErrorLiteral,
pos: self.offset,
message: format!("Invalid error code at position {}", self.offset),
span_start: Some(error_start),
span_end: Some(end),
})
}
fn parse_whitespace(&mut self) -> Result<(), SpanTokenizerError> {
self.save_token();
let ws_start = self.offset;
while self.offset < self.formula.len() {
match self.formula.as_bytes()[self.offset] {
b' ' | b'\n' => self.offset += 1,
_ => break,
}
}
        // Whitespace between two reference expressions is Excel's implicit
        // intersection operator; otherwise it is insignificant.
        let token_type = if self.prev_is_reference_producing()
            && next_starts_reference_expression(self.formula, self.offset)
        {
            TokenType::OpInfix
        } else {
            TokenType::Whitespace
        };
self.push_span(token_type, TokenSubType::None, ws_start, self.offset);
self.start_token();
Ok(())
}
fn prev_is_reference_producing(&self) -> bool {
match self.prev_non_whitespace() {
Some(prev) => match prev.token_type {
TokenType::OpPostfix => true,
TokenType::Paren | TokenType::Func | TokenType::Array
if prev.subtype == TokenSubType::Close =>
{
true
}
TokenType::Operand if prev.subtype == TokenSubType::Range => self
.formula
.get(prev.start..prev.end)
.is_some_and(is_reference_operand_value),
_ => false,
},
None => false,
}
}
    fn should_emit_colon_infix(&self) -> bool {
        // Emit `:` as an explicit infix range operator when it joins two
        // already-complete references (e.g. `Sheet1!A1:Sheet2!B2`); otherwise
        // fold it into the reference token being built (e.g. `A1:B2`).
        if self.has_token() {
let value = &self.formula[self.token_start..self.token_end];
if value.ends_with('!') {
return false;
}
return reference_value_contains_range_colon(value)
|| value.contains('[')
|| (value.contains('!')
&& next_reference_has_sheet_qualifier(self.formula, self.offset + 1));
}
self.prev_is_reference_producing()
}
fn emit_infix_operator(&mut self, start: usize, end: usize) {
self.save_token();
self.start_token();
self.push_span(TokenType::OpInfix, TokenSubType::None, start, end);
self.offset = end;
self.start_token();
}
fn prev_non_whitespace(&self) -> Option<&TokenSpan> {
self.spans
.iter()
.rev()
.find(|t| t.token_type != TokenType::Whitespace)
}
fn should_emit_hash_postfix(&self) -> bool {
if self.has_token() {
if self.formula.as_bytes()[self.token_end - 1] == b'!' {
return false;
}
let value = &self.formula[self.token_start..self.token_end];
return operand_subtype(value) == TokenSubType::Range;
}
match self.prev_non_whitespace() {
Some(prev) => match prev.token_type {
TokenType::OpPostfix => true,
TokenType::Paren | TokenType::Func | TokenType::Array
if prev.subtype == TokenSubType::Close =>
{
true
}
TokenType::Operand if prev.subtype == TokenSubType::Range => true,
_ => false,
},
None => false,
}
}
fn emit_hash_postfix(&mut self) {
self.save_token();
self.start_token();
self.push_span(
TokenType::OpPostfix,
TokenSubType::None,
self.offset,
self.offset + 1,
);
self.offset += 1;
self.start_token();
}
fn parse_operator(&mut self) -> Result<(), SpanTokenizerError> {
self.save_token();
if self.offset + 1 < self.formula.len() {
let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
self.push_span(
TokenType::OpInfix,
TokenSubType::None,
self.offset,
self.offset + 2,
);
self.offset += 2;
self.start_token();
return Ok(());
}
}
let curr_byte = self.formula.as_bytes()[self.offset];
let token_type = match curr_byte {
b'@' => TokenType::OpPrefix,
b'%' => TokenType::OpPostfix,
b'+' | b'-' => {
if self.spans.is_empty() {
TokenType::OpPrefix
} else {
let prev = self.prev_non_whitespace();
if let Some(p) = prev {
if p.subtype == TokenSubType::Close
|| p.token_type == TokenType::OpPostfix
|| p.token_type == TokenType::Operand
{
TokenType::OpInfix
} else {
TokenType::OpPrefix
}
} else {
TokenType::OpPrefix
}
}
}
_ => TokenType::OpInfix,
};
self.push_span(token_type, TokenSubType::None, self.offset, self.offset + 1);
self.offset += 1;
self.start_token();
Ok(())
}
fn parse_opener(&mut self) -> Result<(), SpanTokenizerError> {
let curr_byte = self.formula.as_bytes()[self.offset];
assert!(curr_byte == b'(' || curr_byte == b'{');
let token = if curr_byte == b'{' {
self.save_token();
TokenSpan {
token_type: TokenType::Array,
subtype: TokenSubType::Open,
start: self.offset,
end: self.offset + 1,
}
} else if self.has_token() {
let token = TokenSpan {
token_type: TokenType::Func,
subtype: TokenSubType::Open,
start: self.token_start,
end: self.offset + 1,
};
self.token_start = self.offset + 1;
self.token_end = self.offset + 1;
token
} else {
TokenSpan {
token_type: TokenType::Paren,
subtype: TokenSubType::Open,
start: self.offset,
end: self.offset + 1,
}
};
self.spans.push(token);
self.token_stack.push(token);
self.offset += 1;
self.start_token();
Ok(())
}
fn parse_closer(&mut self) -> Result<(), SpanTokenizerError> {
self.save_token();
let curr_byte = self.formula.as_bytes()[self.offset];
assert!(curr_byte == b')' || curr_byte == b'}');
if let Some(open_token) = self.token_stack.last().copied() {
let expected = if open_token.token_type == TokenType::Array {
b'}'
} else {
b')'
};
if curr_byte != expected {
return Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::MismatchedPair,
pos: self.offset,
message: "Mismatched ( and { pair".to_string(),
span_start: Some(self.offset),
span_end: Some(self.offset + 1),
});
}
self.token_stack.pop();
self.push_span(
open_token.token_type,
TokenSubType::Close,
self.offset,
self.offset + 1,
);
} else {
return Err(SpanTokenizerError {
kind: SpanTokenizerErrorKind::NoMatchingOpener,
pos: self.offset,
message: format!("No matching opener for closer at position {}", self.offset),
span_start: Some(self.offset),
span_end: Some(self.offset + 1),
});
}
self.offset += 1;
self.start_token();
Ok(())
}
fn parse_separator(&mut self) -> Result<(), SpanTokenizerError> {
self.save_token();
let curr_byte = self.formula.as_bytes()[self.offset];
assert!(curr_byte == b';' || curr_byte == b',');
let top_token = self.token_stack.last();
let in_function_or_array = matches!(
top_token.map(|t| t.token_type),
Some(TokenType::Func | TokenType::Array)
);
let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));
let (token_type, subtype) = match curr_byte {
b',' => {
if in_function_or_array {
(TokenType::Sep, TokenSubType::Arg)
} else {
(TokenType::OpInfix, TokenSubType::None)
}
}
b';' => {
if in_array {
(TokenType::Sep, TokenSubType::Row)
} else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
(TokenType::Sep, TokenSubType::Arg)
} else if self.dialect == FormulaDialect::OpenFormula {
(TokenType::OpInfix, TokenSubType::None)
} else {
(TokenType::Sep, TokenSubType::Row)
}
}
_ => (TokenType::OpInfix, TokenSubType::None),
};
self.push_span(token_type, subtype, self.offset, self.offset + 1);
self.offset += 1;
self.start_token();
Ok(())
}
}
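/// Owned-`Token` tokenizer mirroring `SpanTokenizer`; each token carries its
/// own `String` value.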
pub struct Tokenizer {
    formula: String,
    pub items: Vec<Token>,
    token_stack: Vec<Token>,
    offset: usize,
    token_start: usize,
    token_end: usize,
    dialect: FormulaDialect,
}
impl Tokenizer {
pub fn new(formula: &str) -> Result<Self, TokenizerError> {
Self::new_with_dialect(formula, FormulaDialect::Excel)
}
pub fn new_best_effort(formula: &str) -> Self {
Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
}
pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
let stream = TokenStream::new_best_effort_with_dialect(formula, dialect);
Self::from_token_stream(&stream)
}
pub fn new_with_dialect(
formula: &str,
dialect: FormulaDialect,
) -> Result<Self, TokenizerError> {
let mut tokenizer = Tokenizer {
formula: formula.to_string(),
            items: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
offset: 0,
token_start: 0,
token_end: 0,
dialect,
};
tokenizer.parse()?;
Ok(tokenizer)
}
pub fn from_token_stream(stream: &TokenStream) -> Self {
Tokenizer {
formula: stream.source.to_string(),
items: stream.to_tokens(),
token_stack: Vec::with_capacity(16),
offset: 0,
token_start: 0,
token_end: 0,
dialect: stream.dialect,
}
}
#[inline]
fn current_byte(&self) -> Option<u8> {
self.formula.as_bytes().get(self.offset).copied()
}
#[inline]
fn has_token(&self) -> bool {
self.token_end > self.token_start
}
#[inline]
fn start_token(&mut self) {
self.token_start = self.offset;
self.token_end = self.offset;
}
#[inline]
fn extend_token(&mut self) {
self.token_end = self.offset;
}
fn parse(&mut self) -> Result<(), TokenizerError> {
if self.formula.is_empty() {
return Ok(());
}
if self.formula.as_bytes()[0] != b'=' {
self.items.push(Token::new_with_span(
self.formula.clone(),
TokenType::Literal,
TokenSubType::None,
0,
self.formula.len(),
));
return Ok(());
}
self.offset = 1;
self.start_token();
while self.offset < self.formula.len() {
            if self.check_scientific_notation() {
continue;
}
let curr_byte = self.formula.as_bytes()[self.offset];
if is_token_ender(curr_byte) && self.has_token() {
self.save_token();
self.start_token();
}
match curr_byte {
b'"' | b'\'' => self.parse_string()?,
b'[' => self.parse_brackets()?,
b'#' => {
if self.should_emit_hash_postfix() {
self.emit_hash_postfix();
} else {
self.parse_error()?
}
}
b' ' | b'\n' => self.parse_whitespace()?,
b':' => {
if self.should_emit_colon_infix() {
self.emit_infix_operator(self.offset, self.offset + 1);
} else {
if !self.has_token() {
self.start_token();
}
self.offset += 1;
self.extend_token();
}
}
b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
self.parse_operator()?
}
b'{' | b'(' => self.parse_opener()?,
b')' | b'}' => self.parse_closer()?,
b';' | b',' => self.parse_separator()?,
_ => {
if !self.has_token() {
self.start_token();
}
self.offset += 1;
self.extend_token();
}
}
}
if self.has_token() {
self.save_token();
}
if !self.token_stack.is_empty() {
return Err(TokenizerError {
message: "Unmatched opening parenthesis or bracket".to_string(),
pos: self.offset,
});
}
Ok(())
}
    fn check_scientific_notation(&mut self) -> bool {
        if let Some(curr_byte) = self.current_byte() {
            if (curr_byte == b'+' || curr_byte == b'-')
                && self.has_token()
                && self.is_scientific_notation_base()
                && self
                    .formula
                    .as_bytes()
                    .get(self.offset + 1)
                    .is_some_and(|b| b.is_ascii_digit())
            {
                self.offset += 1;
                self.extend_token();
                return true;
            }
        }
        false
    }
fn is_scientific_notation_base(&self) -> bool {
if !self.has_token() {
return false;
}
let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
if token_slice.len() < 2 {
return false;
}
let last = token_slice[token_slice.len() - 1];
if !(last == b'E' || last == b'e') {
return false;
}
let first = token_slice[0];
if !first.is_ascii_digit() {
return false;
}
let mut dot_seen = false;
for &ch in &token_slice[1..token_slice.len() - 1] {
match ch {
b'0'..=b'9' => {}
b'.' if !dot_seen => dot_seen = true,
_ => return false,
}
}
true
}
fn save_token(&mut self) {
if self.has_token() {
let token =
Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
self.items.push(token);
}
}
fn parse_string(&mut self) -> Result<(), TokenizerError> {
let delim = self.formula.as_bytes()[self.offset];
assert!(delim == b'"' || delim == b'\'');
        // As in SpanTokenizer::parse_string: a `'` after a lone `$` or a
        // trailing `:` starts a quoted sheet name that is glued to the
        // current reference token.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';
        let glue_to_token = delim == b'\''
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] == b':';
if !is_dollar_ref && !glue_to_token && self.has_token() {
self.save_token();
self.start_token();
}
let string_start = if is_dollar_ref {
self.token_start
} else {
self.offset
};
self.offset += 1;
while self.offset < self.formula.len() {
if self.formula.as_bytes()[self.offset] == delim {
self.offset += 1;
if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
{
                    self.offset += 1;
                } else {
if delim == b'"' {
let token = Token::make_operand_from_slice(
&self.formula,
string_start,
self.offset,
);
self.items.push(token);
self.start_token();
} else {
self.token_end = self.offset;
}
return Ok(());
}
} else {
self.offset += 1;
}
}
Err(TokenizerError {
message: "Reached end of formula while parsing string".to_string(),
pos: self.offset,
})
}
fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
assert_eq!(self.formula.as_bytes()[self.offset], b'[');
if !self.has_token() {
self.start_token();
}
let mut open_count = 1;
self.offset += 1;
while self.offset < self.formula.len() {
match self.formula.as_bytes()[self.offset] {
                b'\'' => {
                    // `'` escapes the following character inside brackets.
                    if self.offset + 1 < self.formula.len() {
                        self.offset += 2;
                        continue;
                    }
                    self.offset += 1;
                    continue;
                }
b'[' => open_count += 1,
b']' => {
open_count -= 1;
if open_count == 0 {
self.offset += 1;
self.extend_token();
return Ok(());
}
}
_ => {}
}
self.offset += 1;
}
Err(TokenizerError {
message: "Encountered unmatched '['".to_string(),
pos: self.offset,
})
}
fn should_emit_hash_postfix(&self) -> bool {
        if self.has_token() {
            if self.formula.as_bytes()[self.token_end - 1] == b'!' {
                return false;
            }
            // Use the shared classifier so TRUE/FALSE match case-insensitively,
            // mirroring SpanTokenizer::should_emit_hash_postfix.
            let value = &self.formula[self.token_start..self.token_end];
            return operand_subtype(value) == TokenSubType::Range;
}
let prev = self
.items
.iter()
.rev()
.find(|t| t.token_type != TokenType::Whitespace);
match prev {
Some(p) => match p.token_type {
TokenType::OpPostfix => true,
TokenType::Paren | TokenType::Func | TokenType::Array
if p.subtype == TokenSubType::Close =>
{
true
}
TokenType::Operand if p.subtype == TokenSubType::Range => true,
_ => false,
},
None => false,
}
}
fn emit_hash_postfix(&mut self) {
self.save_token();
self.start_token();
self.items.push(Token::from_slice(
&self.formula,
TokenType::OpPostfix,
TokenSubType::None,
self.offset,
self.offset + 1,
));
self.offset += 1;
self.start_token();
}
fn parse_error(&mut self) -> Result<(), TokenizerError> {
let has_sheet_prefix = self.has_token()
&& self.token_end > 0
&& self.formula.as_bytes()[self.token_end - 1] == b'!';
if has_sheet_prefix {
if self.token_end - self.token_start <= 1 {
return Err(TokenizerError {
message: format!(
"Empty sheet qualifier before error literal at position {}",
self.offset
),
pos: self.offset,
});
}
self.start_token();
} else if self.has_token() {
self.save_token();
self.start_token();
}
let error_start = self.offset;
for &err_code in ERROR_CODES {
let err_bytes = err_code.as_bytes();
if self.offset + err_bytes.len() <= self.formula.len() {
let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
if slice.eq_ignore_ascii_case(err_bytes) {
let token = Token::make_operand_from_slice(
&self.formula,
error_start,
self.offset + err_bytes.len(),
);
self.items.push(token);
self.offset += err_bytes.len();
self.start_token();
return Ok(());
}
}
}
Err(TokenizerError {
message: format!("Invalid error code at position {}", self.offset),
pos: self.offset,
})
}
fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
self.save_token();
let ws_start = self.offset;
while self.offset < self.formula.len() {
match self.formula.as_bytes()[self.offset] {
b' ' | b'\n' => self.offset += 1,
_ => break,
}
}
        // As above: whitespace between two reference expressions acts as the
        // implicit intersection operator.
        let token_type = if self.prev_is_reference_producing()
            && next_starts_reference_expression(&self.formula, self.offset)
        {
            TokenType::OpInfix
        } else {
            TokenType::Whitespace
        };
self.items.push(Token::from_slice(
&self.formula,
token_type,
TokenSubType::None,
ws_start,
self.offset,
));
self.start_token();
Ok(())
}
fn prev_non_whitespace(&self) -> Option<&Token> {
self.items
.iter()
.rev()
.find(|t| t.token_type != TokenType::Whitespace)
}
fn prev_is_reference_producing(&self) -> bool {
match self.prev_non_whitespace() {
Some(prev) => match prev.token_type {
TokenType::OpPostfix => true,
TokenType::Paren | TokenType::Func | TokenType::Array
if prev.subtype == TokenSubType::Close =>
{
true
}
TokenType::Operand if prev.subtype == TokenSubType::Range => {
is_reference_operand_value(&prev.value)
}
_ => false,
},
None => false,
}
}
    fn should_emit_colon_infix(&self) -> bool {
        // See SpanTokenizer::should_emit_colon_infix.
        if self.has_token() {
let value = &self.formula[self.token_start..self.token_end];
if value.ends_with('!') {
return false;
}
return reference_value_contains_range_colon(value)
|| value.contains('[')
|| (value.contains('!')
&& next_reference_has_sheet_qualifier(&self.formula, self.offset + 1));
}
self.prev_is_reference_producing()
}
fn emit_infix_operator(&mut self, start: usize, end: usize) {
self.save_token();
self.start_token();
self.items.push(Token::from_slice(
&self.formula,
TokenType::OpInfix,
TokenSubType::None,
start,
end,
));
self.offset = end;
self.start_token();
}
fn parse_operator(&mut self) -> Result<(), TokenizerError> {
self.save_token();
if self.offset + 1 < self.formula.len() {
let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
self.items.push(Token::from_slice(
&self.formula,
TokenType::OpInfix,
TokenSubType::None,
self.offset,
self.offset + 2,
));
self.offset += 2;
self.start_token();
return Ok(());
}
}
let curr_byte = self.formula.as_bytes()[self.offset];
let token_type = match curr_byte {
b'@' => TokenType::OpPrefix,
b'%' => TokenType::OpPostfix,
b'+' | b'-' => {
if self.items.is_empty() {
TokenType::OpPrefix
} else {
let prev = self
.items
.iter()
.rev()
.find(|t| t.token_type != TokenType::Whitespace);
if let Some(p) = prev {
if p.subtype == TokenSubType::Close
|| p.token_type == TokenType::OpPostfix
|| p.token_type == TokenType::Operand
{
TokenType::OpInfix
} else {
TokenType::OpPrefix
}
} else {
TokenType::OpPrefix
}
}
}
_ => TokenType::OpInfix,
};
self.items.push(Token::from_slice(
&self.formula,
token_type,
TokenSubType::None,
self.offset,
self.offset + 1,
));
self.offset += 1;
self.start_token();
Ok(())
}
fn parse_opener(&mut self) -> Result<(), TokenizerError> {
let curr_byte = self.formula.as_bytes()[self.offset];
assert!(curr_byte == b'(' || curr_byte == b'{');
let token = if curr_byte == b'{' {
self.save_token();
Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
} else if self.has_token() {
let token = Token::make_subexp_from_slice(
&self.formula,
true,
self.token_start,
self.offset + 1,
);
self.token_start = self.offset + 1;
self.token_end = self.offset + 1;
token
} else {
Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
};
self.items.push(token.clone());
self.token_stack.push(token);
self.offset += 1;
self.start_token();
Ok(())
}
fn parse_closer(&mut self) -> Result<(), TokenizerError> {
self.save_token();
let curr_byte = self.formula.as_bytes()[self.offset];
assert!(curr_byte == b')' || curr_byte == b'}');
if let Some(open_token) = self.token_stack.pop() {
let closer = open_token.get_closer()?;
if (curr_byte == b'}' && closer.value != "}")
|| (curr_byte == b')' && closer.value != ")")
{
return Err(TokenizerError {
message: "Mismatched ( and { pair".to_string(),
pos: self.offset,
});
}
self.items.push(Token::from_slice(
&self.formula,
closer.token_type,
TokenSubType::Close,
self.offset,
self.offset + 1,
));
} else {
return Err(TokenizerError {
message: format!("No matching opener for closer at position {}", self.offset),
pos: self.offset,
});
}
self.offset += 1;
self.start_token();
Ok(())
}
fn parse_separator(&mut self) -> Result<(), TokenizerError> {
self.save_token();
let curr_byte = self.formula.as_bytes()[self.offset];
assert!(curr_byte == b';' || curr_byte == b',');
let top_token = self.token_stack.last();
let in_function_or_array = matches!(
top_token.map(|t| t.token_type),
Some(TokenType::Func | TokenType::Array)
);
let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));
let (token_type, subtype) = match curr_byte {
b',' => {
if in_function_or_array {
(TokenType::Sep, TokenSubType::Arg)
} else {
(TokenType::OpInfix, TokenSubType::None)
}
}
b';' => {
if in_array {
(TokenType::Sep, TokenSubType::Row)
} else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
(TokenType::Sep, TokenSubType::Arg)
} else if self.dialect == FormulaDialect::OpenFormula {
(TokenType::OpInfix, TokenSubType::None)
} else {
(TokenType::Sep, TokenSubType::Row)
}
}
_ => (TokenType::OpInfix, TokenSubType::None),
};
self.items.push(Token::from_slice(
&self.formula,
token_type,
subtype,
self.offset,
self.offset + 1,
));
self.offset += 1;
self.start_token();
Ok(())
}
pub fn render(&self) -> String {
if self.items.is_empty() {
"".to_string()
} else if self.items[0].token_type == TokenType::Literal {
self.items[0].value.clone()
} else {
let concatenated: String = self.items.iter().map(|t| t.value.clone()).collect();
format!("={concatenated}")
}
}
pub fn dialect(&self) -> FormulaDialect {
self.dialect
}
}
impl TryFrom<&str> for Tokenizer {
type Error = TokenizerError;
fn try_from(value: &str) -> Result<Self, Self::Error> {
Tokenizer::new(value)
}
}
impl TryFrom<String> for Tokenizer {
type Error = TokenizerError;
fn try_from(value: String) -> Result<Self, Self::Error> {
Tokenizer::new(&value)
}
}
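#[cfg(test)]
mod tests {
    // Illustrative sketches of tokenizer behavior. The expected span counts
    // and subtypes are derived from the parsing logic above, not from any
    // external specification.
    use super::*;
    use crate::types::FormulaDialect;

    #[test]
    fn simple_function_call_round_trips() {
        // `=SUM(A1:B2)` lexes as `SUM(` (Func/Open), `A1:B2` (Operand/Range),
        // and `)` (Func/Close); `render_formula` re-adds the leading `=`.
        let stream = TokenStream::new("=SUM(A1:B2)").unwrap();
        assert_eq!(stream.len(), 3);
        assert_eq!(stream.render_formula(), "=SUM(A1:B2)");
    }

    #[test]
    fn operand_subtypes_are_classified() {
        assert_eq!(operand_subtype("\"hi\""), TokenSubType::Text);
        assert_eq!(operand_subtype("#REF!"), TokenSubType::Error);
        assert_eq!(operand_subtype("true"), TokenSubType::Logical);
        assert_eq!(operand_subtype("1.5"), TokenSubType::Number);
        assert_eq!(operand_subtype("A1"), TokenSubType::Range);
    }

    #[test]
    fn scientific_notation_sign_stays_in_the_number() {
        // The `+` in `1E+5` is consumed by check_scientific_notation rather
        // than being emitted as an infix operator.
        let stream = TokenStream::new("=1E+5").unwrap();
        assert_eq!(stream.len(), 1);
        assert_eq!(stream.get(0).unwrap().span.subtype, TokenSubType::Number);
    }

    #[test]
    fn best_effort_reports_unmatched_opener() {
        // Strict parsing fails on `=SUM(A1`; best-effort keeps the spans and
        // records a diagnostic instead.
        assert!(TokenStream::new("=SUM(A1").is_err());
        let stream = TokenStream::new_best_effort("=SUM(A1");
        assert!(stream.has_errors());
        assert_eq!(
            stream.diagnostics_ref()[0].recovery,
            RecoveryAction::UnmatchedOpener
        );
    }

    #[test]
    fn semicolon_separator_depends_on_dialect() {
        // Inside a function call, `;` separates arguments in OpenFormula but
        // is a row separator in the Excel dialect.
        let sep_subtype = |dialect| {
            TokenStream::new_with_dialect("=SUM(1;2)", dialect)
                .unwrap()
                .spans
                .iter()
                .find(|t| t.token_type == TokenType::Sep)
                .map(|t| t.subtype)
        };
        assert_eq!(
            sep_subtype(FormulaDialect::OpenFormula),
            Some(TokenSubType::Arg)
        );
        assert_eq!(sep_subtype(FormulaDialect::Excel), Some(TokenSubType::Row));
    }
}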