1use std::convert::TryFrom;
2use std::error::Error;
3use std::fmt::{self, Display};
4use std::sync::Arc;
5
6#[cfg(feature = "serde")]
7use serde::{Deserialize, Serialize};
8
9use crate::types::FormulaDialect;
10
/// Bytes that terminate an in-progress operand token; expanded into a
/// 256-entry lookup table by `build_token_enders`.
const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%@";
12
13const fn build_token_enders() -> [bool; 256] {
14 let mut tbl = [false; 256];
15 let bytes = TOKEN_ENDERS.as_bytes();
16 let mut i = 0;
17 while i < bytes.len() {
18 tbl[bytes[i] as usize] = true;
19 i += 1;
20 }
21 tbl
22}
/// Compile-time lookup table: `true` for every byte in `TOKEN_ENDERS`.
static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();
24
25#[inline(always)]
26fn is_token_ender(c: u8) -> bool {
27 TOKEN_ENDERS_TABLE[c as usize]
28}
29
/// Error literals recognized by `parse_error`, which scans this list in
/// order and takes the first case-insensitive match (entries are ordered by
/// descending length).
static ERROR_CODES: &[&str] = &[
    "#GETTING_DATA",
    "#DIV/0!",
    "#VALUE!",
    "#SPILL!",
    "#NAME?",
    "#NULL!",
    "#CALC!",
    "#NUM!",
    "#REF!",
    "#N/A",
];
58
/// Operator associativity, paired with a precedence by
/// `Token::get_precedence`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
    Left,
    Right,
}
65
/// Error returned by the strict tokenizer entry points.
#[derive(Debug)]
pub struct TokenizerError {
    /// Human-readable description (also used by the `Display` impl).
    pub message: String,
    /// Byte offset in the formula where the error was detected.
    pub pos: usize,
}
72
/// How best-effort tokenization recovered from a malformed region.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RecoveryAction {
    /// A closer with no matching opener was consumed as a placeholder span.
    SkippedUnmatchedCloser,
    /// A string ran to end-of-formula without its closing delimiter.
    UnterminatedString,
    /// A '[' was never closed.
    UnmatchedBracket,
    /// A '#...' sequence matched no known error code.
    InvalidErrorLiteral,
    /// An opening paren/brace was never closed by end-of-formula.
    UnmatchedOpener,
}
88
/// A recovery record: the placeholder span that was emitted, a message, and
/// the recovery strategy that produced it.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenDiagnostic {
    /// The span covering the offending source bytes.
    pub span: TokenSpan,
    /// Human-readable description of what went wrong.
    pub message: String,
    /// The recovery action that was applied.
    pub recovery: RecoveryAction,
}
97
98impl TokenDiagnostic {
99 fn new(span: TokenSpan, message: String, recovery: RecoveryAction) -> Self {
100 Self {
101 span,
102 message,
103 recovery,
104 }
105 }
106}
107
/// Internal tokenizer error carrying enough span information for
/// best-effort recovery (see `SpanTokenizer::recover_from_error`).
#[derive(Debug, Clone)]
struct SpanTokenizerError {
    /// Which recovery strategy applies.
    kind: SpanTokenizerErrorKind,
    /// Byte offset where the error was detected.
    pos: usize,
    /// Human-readable description.
    message: String,
    /// Start of the offending region, when known.
    span_start: Option<usize>,
    /// End of the offending region, when known.
    span_end: Option<usize>,
}
116
/// Categories of internal tokenizer errors; each maps to a recovery arm in
/// `recover_from_error`.
#[derive(Debug, Clone, Copy)]
enum SpanTokenizerErrorKind {
    /// A ')' or '}' appeared with nothing on the opener stack.
    NoMatchingOpener,
    /// An opener was still unclosed at end-of-input.
    UnmatchedOpening,
    /// A string literal ran to end-of-input.
    UnterminatedString,
    /// A '[' was never closed.
    UnmatchedBracket,
    /// A closer did not match the kind of the most recent opener.
    MismatchedPair,
    /// A '#...' sequence matched no known error code.
    InvalidErrorLiteral,
}
126
127impl From<SpanTokenizerError> for TokenizerError {
128 fn from(value: SpanTokenizerError) -> Self {
129 TokenizerError {
130 message: value.message,
131 pos: value.pos,
132 }
133 }
134}
135
136impl fmt::Display for TokenizerError {
137 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138 write!(f, "TokenizerError: {}", self.message)
139 }
140}
141
// Marker impl: the Debug and Display impls above satisfy Error's supertraits.
impl Error for TokenizerError {}
143
/// Broad token categories produced by the tokenizer.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Non-formula input (text without a leading '=').
    Literal,
    /// Values and references: numbers, strings, booleans, errors, ranges.
    Operand,
    /// Function-call delimiter (name plus '(' on open, ')' on close).
    Func,
    /// Array-literal delimiter '{' / '}'.
    Array,
    /// Grouping parenthesis '(' / ')'.
    Paren,
    /// Argument or array-row separator (',' / ';').
    Sep,
    /// Prefix operator (unary '+'/'-', '@').
    OpPrefix,
    /// Infix (binary) operator.
    OpInfix,
    /// Postfix operator ('%', spilled-range '#').
    OpPostfix,
    /// Run of spaces/newlines that is not an intersection operator.
    Whitespace,
}
159impl Display for TokenType {
160 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
161 write!(f, "{self:?}")
162 }
163}
164
/// Finer-grained classification within a `TokenType`.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
    /// No extra classification (operators, whitespace, literals).
    None,
    /// Double-quoted string operand.
    Text,
    /// Numeric operand (text parses as `f64`).
    Number,
    /// TRUE/FALSE operand (case-insensitive).
    Logical,
    /// Error-literal operand such as "#REF!".
    Error,
    /// Cell/range reference or name operand (the fallback classification).
    Range,
    /// Opening delimiter of a paren/array/function group.
    Open,
    /// Closing delimiter of a paren/array/function group.
    Close,
    /// Argument separator (',').
    Arg,
    /// Array row separator (';').
    Row,
}
180impl Display for TokenSubType {
181 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
182 write!(f, "{self:?}")
183 }
184}
185
186#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
188#[derive(Debug, Clone, PartialEq, Hash)]
189pub struct Token {
190 pub value: String, pub token_type: TokenType,
192 pub subtype: TokenSubType,
193 pub start: usize,
194 pub end: usize,
195}
196
197impl Display for Token {
198 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
199 write!(
200 f,
201 "<{} subtype: {:?} value: {}>",
202 self.token_type, self.subtype, self.value
203 )
204 }
205}
206
207impl Token {
208 pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
209 Token {
210 value,
211 token_type,
212 subtype,
213 start: 0,
214 end: 0,
215 }
216 }
217
218 pub fn new_with_span(
219 value: String,
220 token_type: TokenType,
221 subtype: TokenSubType,
222 start: usize,
223 end: usize,
224 ) -> Self {
225 Token {
226 value,
227 token_type,
228 subtype,
229 start,
230 end,
231 }
232 }
233
234 fn from_slice(
235 source: &str,
236 token_type: TokenType,
237 subtype: TokenSubType,
238 start: usize,
239 end: usize,
240 ) -> Self {
241 Token {
242 value: source[start..end].to_string(),
243 token_type,
244 subtype,
245 start,
246 end,
247 }
248 }
249
250 pub fn is_operator(&self) -> bool {
251 matches!(
252 self.token_type,
253 TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
254 )
255 }
256
257 pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
258 let op = if self.token_type == TokenType::OpPrefix {
260 "u"
261 } else {
262 self.value.as_str()
263 };
264
265 match op {
276 "#" => Some((11, Associativity::Left)),
277 ":" => Some((10, Associativity::Left)),
278 " " => Some((9, Associativity::Left)),
279 "," => Some((8, Associativity::Left)),
280 "%" => Some((7, Associativity::Left)),
281 "u" => Some((6, Associativity::Right)),
282 "^" => Some((5, Associativity::Right)),
283 "*" | "/" => Some((4, Associativity::Left)),
284 "+" | "-" => Some((3, Associativity::Left)),
285 "&" => Some((2, Associativity::Left)),
286 "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
287 _ => None,
288 }
289 }
290
291 pub fn make_operand(value: String) -> Self {
293 let subtype = if value.starts_with('"') {
294 TokenSubType::Text
295 } else if value.starts_with('#') {
296 TokenSubType::Error
297 } else if value.eq_ignore_ascii_case("TRUE") || value.eq_ignore_ascii_case("FALSE") {
298 TokenSubType::Logical
299 } else if value.parse::<f64>().is_ok() {
300 TokenSubType::Number
301 } else {
302 TokenSubType::Range
303 };
304 Token::new(value, TokenType::Operand, subtype)
305 }
306
307 pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
309 let subtype = if value.starts_with('"') {
310 TokenSubType::Text
311 } else if value.starts_with('#') {
312 TokenSubType::Error
313 } else if value.eq_ignore_ascii_case("TRUE") || value.eq_ignore_ascii_case("FALSE") {
314 TokenSubType::Logical
315 } else if value.parse::<f64>().is_ok() {
316 TokenSubType::Number
317 } else {
318 TokenSubType::Range
319 };
320 Token::new_with_span(value, TokenType::Operand, subtype, start, end)
321 }
322
323 fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
324 let value_str = &source[start..end];
325 let subtype = if value_str.starts_with('"') {
326 TokenSubType::Text
327 } else if value_str.starts_with('#') {
328 TokenSubType::Error
329 } else if value_str.eq_ignore_ascii_case("TRUE") || value_str.eq_ignore_ascii_case("FALSE")
330 {
331 TokenSubType::Logical
332 } else if value_str.parse::<f64>().is_ok() {
333 TokenSubType::Number
334 } else {
335 TokenSubType::Range
336 };
337 Token::from_slice(source, TokenType::Operand, subtype, start, end)
338 }
339
340 pub fn make_subexp(value: &str, func: bool) -> Self {
345 let last_char = value.chars().last().expect("Empty token value");
346 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
347 let token_type = if func {
348 TokenType::Func
349 } else if "{}".contains(last_char) {
350 TokenType::Array
351 } else if "()".contains(last_char) {
352 TokenType::Paren
353 } else {
354 TokenType::Func
355 };
356 let subtype = if ")}".contains(last_char) {
357 TokenSubType::Close
358 } else {
359 TokenSubType::Open
360 };
361 Token::new(value.to_string(), token_type, subtype)
362 }
363
364 pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
366 let last_char = value.chars().last().expect("Empty token value");
367 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
368 let token_type = if func {
369 TokenType::Func
370 } else if "{}".contains(last_char) {
371 TokenType::Array
372 } else if "()".contains(last_char) {
373 TokenType::Paren
374 } else {
375 TokenType::Func
376 };
377 let subtype = if ")}".contains(last_char) {
378 TokenSubType::Close
379 } else {
380 TokenSubType::Open
381 };
382 Token::new_with_span(value.to_string(), token_type, subtype, start, end)
383 }
384
385 fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
386 let value_str = &source[start..end];
387 let last_char = value_str.chars().last().expect("Empty token value");
388 let token_type = if func {
389 TokenType::Func
390 } else if "{}".contains(last_char) {
391 TokenType::Array
392 } else if "()".contains(last_char) {
393 TokenType::Paren
394 } else {
395 TokenType::Func
396 };
397 let subtype = if ")}".contains(last_char) {
398 TokenSubType::Close
399 } else {
400 TokenSubType::Open
401 };
402 Token::from_slice(source, token_type, subtype, start, end)
403 }
404
405 pub fn get_closer(&self) -> Result<Token, TokenizerError> {
407 if self.subtype != TokenSubType::Open {
408 return Err(TokenizerError {
409 message: "Token is not an opener".to_string(),
410 pos: 0,
411 });
412 }
413 let closer_value = if self.token_type == TokenType::Array {
414 "}"
415 } else {
416 ")"
417 };
418 Ok(Token::make_subexp(
419 closer_value,
420 self.token_type == TokenType::Func,
421 ))
422 }
423
424 pub fn make_separator(value: &str) -> Self {
426 assert!(value == "," || value == ";");
427 let subtype = if value == "," {
428 TokenSubType::Arg
429 } else {
430 TokenSubType::Row
431 };
432 Token::new(value.to_string(), TokenType::Sep, subtype)
433 }
434
435 pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
437 assert!(value == "," || value == ";");
438 let subtype = if value == "," {
439 TokenSubType::Arg
440 } else {
441 TokenSubType::Row
442 };
443 Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
444 }
445}
446
/// A token's classification plus its byte range in the source formula.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenSpan {
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    /// Byte offset where the token starts.
    pub start: usize,
    /// Byte offset one past the token's last byte.
    pub end: usize,
}
455
/// A borrowed token: a span plus the slice of source text it covers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenView<'a> {
    pub span: &'a TokenSpan,
    pub value: &'a str,
}
461
/// A tokenized formula: the shared source text plus spans indexing into it.
#[derive(Debug, Clone)]
pub struct TokenStream {
    // Shared source text; spans hold byte indices into this.
    source: Arc<str>,
    /// Token spans in source order.
    pub spans: Vec<TokenSpan>,
    // Dialect the formula was tokenized under.
    dialect: FormulaDialect,
    // Recovery diagnostics; non-empty only for best-effort parses.
    diagnostics: Vec<TokenDiagnostic>,
}
474
impl TokenStream {
    /// Tokenizes `formula` strictly with the default Excel dialect.
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    /// Tokenizes `formula` strictly: the first tokenizer error aborts and is
    /// returned. Successful streams carry no diagnostics.
    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let source: Arc<str> = Arc::from(formula);
        let spans = tokenize_spans_with_dialect(source.as_ref(), dialect)?;
        Ok(TokenStream {
            source,
            spans,
            dialect,
            diagnostics: Vec::new(),
        })
    }

    /// Best-effort tokenization with the default Excel dialect; never fails.
    pub fn new_best_effort(formula: &str) -> Self {
        Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
    }

    /// Best-effort tokenization: errors are recovered in place and reported
    /// through `diagnostics()` instead of aborting.
    pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
        let source: Arc<str> = Arc::from(formula);
        let mut tokenizer = SpanTokenizer::new(source.as_ref(), dialect);
        let spans = tokenizer.parse_best_effort();
        let diagnostics = tokenizer.diagnostics;
        TokenStream {
            source,
            spans,
            dialect,
            diagnostics,
        }
    }

    /// Owned copy of the diagnostics; see `diagnostics_ref` for a
    /// non-allocating view.
    pub fn diagnostics(&self) -> Vec<TokenDiagnostic> {
        self.diagnostics.clone()
    }

    /// Borrowed view of the recovery diagnostics.
    pub fn diagnostics_ref(&self) -> &[TokenDiagnostic] {
        &self.diagnostics
    }

    /// True when best-effort parsing had to recover from at least one error.
    pub fn has_errors(&self) -> bool {
        !self.diagnostics.is_empty()
    }

    /// Iterates over spans that have a matching diagnostic (matched by
    /// start, end, and token type).
    pub fn invalid_spans_iter(&self) -> impl Iterator<Item = &TokenSpan> {
        self.spans.iter().filter(|span| {
            self.diagnostics.iter().any(|diag| {
                diag.span.start == span.start
                    && diag.span.end == span.end
                    && diag.span.token_type == span.token_type
            })
        })
    }

    /// Collected form of `invalid_spans_iter`.
    pub fn invalid_spans(&self) -> Vec<&TokenSpan> {
        self.invalid_spans_iter().collect()
    }

    /// The original formula text backing every span.
    pub fn source(&self) -> &str {
        &self.source
    }

    /// The dialect this stream was tokenized with.
    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }

    /// Number of token spans.
    pub fn len(&self) -> usize {
        self.spans.len()
    }

    /// True when the stream produced no spans.
    pub fn is_empty(&self) -> bool {
        self.spans.is_empty()
    }

    /// Zero-copy view of the token at `index`; `None` when the index is out
    /// of bounds or the span does not lie on a char boundary of the source.
    pub fn get(&self, index: usize) -> Option<TokenView<'_>> {
        let span = self.spans.get(index)?;
        let value = self.source.get(span.start..span.end)?;
        Some(TokenView { span, value })
    }

    /// Materializes every span into an owned `Token`.
    pub fn to_tokens(&self) -> Vec<Token> {
        self.spans
            .iter()
            .map(|s| {
                let value = self
                    .source
                    .get(s.start..s.end)
                    .unwrap_or_default()
                    .to_string();
                Token::new_with_span(value, s.token_type, s.subtype, s.start, s.end)
            })
            .collect()
    }

    /// Concatenates the text of all spans. The leading '=' of a formula is
    /// not part of any span, so it is absent here; see `render_formula`.
    pub fn render(&self) -> String {
        let mut out = String::with_capacity(self.source.len());
        for span in &self.spans {
            if let Some(s) = self.source.get(span.start..span.end) {
                out.push_str(s);
            }
        }
        out
    }

    /// Like `render`, but re-prepends the '=' when the source started with
    /// one.
    pub fn render_formula(&self) -> String {
        if self.source.as_bytes().first() == Some(&b'=') {
            format!("={}", self.render())
        } else {
            self.render()
        }
    }
}
596
597pub(crate) fn tokenize_spans_with_dialect(
598 formula: &str,
599 dialect: FormulaDialect,
600) -> Result<Vec<TokenSpan>, TokenizerError> {
601 let mut tokenizer = SpanTokenizer::new(formula, dialect);
602 tokenizer.parse()?;
603 Ok(tokenizer.spans)
604}
605
606fn operand_subtype(value_str: &str) -> TokenSubType {
607 if value_str.starts_with('"') {
608 TokenSubType::Text
609 } else if value_str.starts_with('#') {
610 TokenSubType::Error
611 } else if value_str.eq_ignore_ascii_case("TRUE") || value_str.eq_ignore_ascii_case("FALSE") {
612 TokenSubType::Logical
613 } else if value_str.parse::<f64>().is_ok() {
614 TokenSubType::Number
615 } else {
616 TokenSubType::Range
617 }
618}
619
/// Heuristically recognizes a bare A1-style cell reference, with optional
/// '$' before the column letters and/or the row digits ("A1", "$A$1").
/// Column-only ("A") or row-only ("$1") strings are rejected.
fn is_cell_reference_like(value: &str) -> bool {
    let bytes = value.as_bytes();
    let mut pos = 0;

    // Optional absolute-column marker.
    if bytes.first() == Some(&b'$') {
        pos = 1;
    }

    // One or more column letters.
    let col_len = bytes[pos..]
        .iter()
        .take_while(|b| b.is_ascii_alphabetic())
        .count();
    if col_len == 0 {
        return false;
    }
    pos += col_len;

    // Optional absolute-row marker.
    if bytes.get(pos) == Some(&b'$') {
        pos += 1;
    }

    // One or more row digits, which must run to the end of the string.
    let row_len = bytes[pos..]
        .iter()
        .take_while(|b| b.is_ascii_digit())
        .count();
    row_len > 0 && pos + row_len == bytes.len()
}
647
/// True when the part of `value` after the last '!' (or the whole string,
/// when there is no sheet qualifier) contains a range ':'.
fn reference_value_contains_range_colon(value: &str) -> bool {
    match value.rsplit_once('!') {
        Some((_, tail)) => tail.contains(':'),
        None => value.contains(':'),
    }
}
654
655fn is_reference_operand_value(value: &str) -> bool {
656 operand_subtype(value) == TokenSubType::Range
657 && (reference_value_contains_range_colon(value)
658 || value.contains('!')
659 || value.contains('[')
660 || is_cell_reference_like(value))
661}
662
/// After skipping spaces/newlines from `offset`, reports whether the next
/// byte can begin a reference expression: '(', '[', a quoted sheet name,
/// '$', or a letter.
fn next_starts_reference_expression(formula: &str, offset: usize) -> bool {
    let Some(rest) = formula.as_bytes().get(offset..) else {
        return false;
    };
    match rest.iter().find(|b| !matches!(**b, b' ' | b'\n')) {
        Some(&b) => matches!(b, b'(' | b'[' | b'\'' | b'$') || b.is_ascii_alphabetic(),
        None => false,
    }
}
674
/// Looks ahead from `offset` to decide whether the upcoming reference text
/// carries a sheet qualifier ('!') before the reference ends.
///
/// The scan gives up (returns false) at an unquoted ':' or any unquoted
/// operator/separator byte; a doubled quote inside a quoted name is the
/// escaped-quote form and does not toggle the quote state.
fn next_reference_has_sheet_qualifier(formula: &str, mut offset: usize) -> bool {
    let bytes = formula.as_bytes();
    // Skip leading whitespace before the reference starts.
    while offset < bytes.len() && matches!(bytes[offset], b' ' | b'\n') {
        offset += 1;
    }

    let mut in_quote = false;
    while offset < bytes.len() {
        match bytes[offset] {
            b'\'' => {
                // '' inside a quoted name is an escaped quote: consume both.
                if in_quote && offset + 1 < bytes.len() && bytes[offset + 1] == b'\'' {
                    offset += 2;
                    continue;
                }
                in_quote = !in_quote;
            }
            // NOTE(review): unlike the ':' and delimiter arms below, '!'
            // matches even while in_quote is true, so a '!' inside a quoted
            // sheet name counts as a qualifier -- confirm this is intended.
            b'!' => return true,
            b':' if !in_quote => return false,
            b',' | b';' | b'}' | b')' | b' ' | b'\n' | b'+' | b'-' | b'*' | b'/' | b'^' | b'&'
            | b'=' | b'>' | b'<' | b'%' | b'@'
                if !in_quote =>
            {
                return false;
            }
            _ => {}
        }
        offset += 1;
    }

    false
}
706
/// Internal single-pass tokenizer producing `TokenSpan`s over `formula`.
struct SpanTokenizer<'a> {
    // The formula text being scanned; all positions are byte offsets.
    formula: &'a str,
    // Completed spans, in source order.
    spans: Vec<TokenSpan>,
    // Spans of currently open '('/'{' delimiters awaiting their closers.
    token_stack: Vec<TokenSpan>,
    // Cursor: byte position of the next unread byte.
    offset: usize,
    // Start of the operand text currently being accumulated.
    token_start: usize,
    // End (exclusive) of the operand text currently being accumulated;
    // equal to `token_start` when no token is in progress.
    token_end: usize,
    // Dialect the formula is parsed under.
    dialect: FormulaDialect,
    // Recovery notes gathered during best-effort parsing.
    diagnostics: Vec<TokenDiagnostic>,
}
717
718impl<'a> SpanTokenizer<'a> {
719 fn new(formula: &'a str, dialect: FormulaDialect) -> Self {
720 SpanTokenizer {
721 formula,
722 spans: Vec::with_capacity(formula.len() / 2),
723 token_stack: Vec::with_capacity(16),
724 offset: 0,
725 token_start: 0,
726 token_end: 0,
727 dialect,
728 diagnostics: Vec::new(),
729 }
730 }
731
732 #[inline]
733 fn current_byte(&self) -> Option<u8> {
734 self.formula.as_bytes().get(self.offset).copied()
735 }
736
737 #[inline]
738 fn has_token(&self) -> bool {
739 self.token_end > self.token_start
740 }
741
    /// Resets the in-progress token to an empty range at the cursor.
    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }
747
    /// Extends the in-progress token's end to the cursor; callers advance
    /// `offset` past the consumed byte(s) before calling this.
    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }
752
753 fn push_span(
754 &mut self,
755 token_type: TokenType,
756 subtype: TokenSubType,
757 start: usize,
758 end: usize,
759 ) {
760 self.spans.push(TokenSpan {
761 token_type,
762 subtype,
763 start,
764 end,
765 });
766 }
767
768 fn save_token(&mut self) {
769 if self.has_token() {
770 let value_str = &self.formula[self.token_start..self.token_end];
771 let subtype = operand_subtype(value_str);
772 self.push_span(
773 TokenType::Operand,
774 subtype,
775 self.token_start,
776 self.token_end,
777 );
778 }
779 }
780
781 fn check_scientific_notation(&mut self) -> bool {
782 if let Some(curr_byte) = self.current_byte() {
783 if (curr_byte == b'+' || curr_byte == b'-')
784 && self.has_token()
785 && self.is_scientific_notation_base()
786 && self
787 .formula
788 .as_bytes()
789 .get(self.offset + 1)
790 .is_some_and(|b| b.is_ascii_digit())
791 {
792 self.offset += 1;
793 self.extend_token();
794 return true;
795 }
796 }
797 false
798 }
799
800 fn is_scientific_notation_base(&self) -> bool {
801 if !self.has_token() {
802 return false;
803 }
804
805 let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
806 if token_slice.len() < 2 {
807 return false;
808 }
809
810 let last = token_slice[token_slice.len() - 1];
811 if !(last == b'E' || last == b'e') {
812 return false;
813 }
814
815 let first = token_slice[0];
816 if !first.is_ascii_digit() {
817 return false;
818 }
819
820 let mut dot_seen = false;
821 for &ch in &token_slice[1..token_slice.len() - 1] {
822 match ch {
823 b'0'..=b'9' => {}
824 b'.' if !dot_seen => dot_seen = true,
825 _ => return false,
826 }
827 }
828 true
829 }
830
831 fn parse(&mut self) -> Result<(), TokenizerError> {
832 self.parse_with_recovery(false).map_err(Into::into)
833 }
834
    /// Tokenizes with error recovery enabled; never fails. Recovery details
    /// accumulate in `self.diagnostics`.
    pub(crate) fn parse_best_effort(&mut self) -> Vec<TokenSpan> {
        // In best-effort mode errors are recovered in place, so the result
        // is ignored.
        let _ = self.parse_with_recovery(true);
        self.spans.clone()
    }
839
    /// Core tokenizer loop shared by strict and best-effort parsing.
    ///
    /// With `best_effort == false` the first error aborts and is returned;
    /// with `best_effort == true` errors are routed through
    /// `recover_from_error` and scanning continues, accumulating
    /// diagnostics instead.
    fn parse_with_recovery(&mut self, best_effort: bool) -> Result<(), SpanTokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        // Input without a leading '=' is not a formula: emit the whole text
        // as a single Literal span.
        if self.formula.as_bytes()[0] != b'=' {
            self.push_span(
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            );
            return Ok(());
        }

        // Skip the '='; spans therefore never cover byte 0 of a formula.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            // Absorb a '+'/'-' that is part of a scientific-notation
            // exponent before it can be read as an operator.
            if self.check_scientific_notation() {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // A token-ending byte flushes the operand accumulated so far.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            let parse_result = match curr_byte {
                b'"' | b'\'' => self.parse_string(),
                b'[' => self.parse_brackets(),
                b'#' => {
                    // '#' is either the spilled-range postfix (A1#) or the
                    // start of an error literal (#REF!).
                    if self.should_emit_hash_postfix() {
                        self.emit_hash_postfix();
                        Ok(())
                    } else {
                        self.parse_error()
                    }
                }
                b' ' | b'\n' => self.parse_whitespace(),
                b':' => {
                    // ':' is a standalone range operator only in specific
                    // contexts; otherwise it stays inside the operand text.
                    if self.should_emit_colon_infix() {
                        self.emit_infix_operator(self.offset, self.offset + 1);
                        Ok(())
                    } else {
                        if !self.has_token() {
                            self.start_token();
                        }
                        self.offset += 1;
                        self.extend_token();
                        Ok(())
                    }
                }
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
                    self.parse_operator()
                }
                b'{' | b'(' => self.parse_opener(),
                b')' | b'}' => self.parse_closer(),
                b';' | b',' => self.parse_separator(),
                _ => {
                    // Ordinary operand byte: extend the current token.
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                    Ok(())
                }
            };

            if let Err(err) = parse_result {
                if best_effort {
                    self.recover_from_error(err);
                } else {
                    return Err(err);
                }
            }
        }

        // Flush any trailing operand.
        if self.has_token() {
            self.save_token();
        }

        // Openers left on the stack were never closed.
        if !self.token_stack.is_empty() {
            if best_effort {
                // Report a diagnostic for each unclosed opener; the opener's
                // span is looked up in the output so the diagnostic carries
                // the same span value that was emitted.
                while let Some(open_token) = self.token_stack.pop() {
                    if let Some(span) = self.spans.iter().find(|span| {
                        span.start == open_token.start
                            && span.end == open_token.end
                            && span.token_type == open_token.token_type
                            && span.subtype == open_token.subtype
                    }) {
                        self.diagnostics.push(TokenDiagnostic::new(
                            *span,
                            "Unmatched opening parenthesis or bracket".to_string(),
                            RecoveryAction::UnmatchedOpener,
                        ));
                    }
                }
            } else {
                return Err(SpanTokenizerError {
                    kind: SpanTokenizerErrorKind::UnmatchedOpening,
                    pos: self.offset,
                    message: "Unmatched opening parenthesis or bracket".to_string(),
                    span_start: None,
                    span_end: None,
                });
            }
        }

        Ok(())
    }
953
    /// Converts a tokenizer error into a placeholder span plus a diagnostic
    /// so best-effort parsing can continue past the bad region.
    ///
    /// Every arm follows the same recipe: synthesize an `Operand`/`None`
    /// span covering the offending bytes, advance the cursor past it, reset
    /// the in-progress token, and record what was done.
    fn recover_from_error(&mut self, error: SpanTokenizerError) {
        match error.kind {
            SpanTokenizerErrorKind::NoMatchingOpener => {
                // Swallow the stray closer as a one-byte placeholder.
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start: error.pos,
                    end: error.pos + 1,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    format!("No matching opener for closer at position {}", error.pos),
                    RecoveryAction::SkippedUnmatchedCloser,
                ));
            }
            SpanTokenizerErrorKind::UnmatchedOpening => {
                // This kind is produced only at end-of-input by strict
                // parsing; best-effort handles unclosed openers directly.
                debug_assert!(
                    false,
                    "UnmatchedOpening is handled at end-of-input and should not be routed through recover_from_error"
                );
            }
            SpanTokenizerErrorKind::UnterminatedString => {
                // Cover everything from the opening quote to end-of-input.
                let start = error.span_start.unwrap_or(error.pos);
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start,
                    end: self.formula.len(),
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Reached end of formula while parsing string".to_string(),
                    RecoveryAction::UnterminatedString,
                ));
            }
            SpanTokenizerErrorKind::UnmatchedBracket => {
                // Cover the bracketed region as reported by parse_brackets.
                let start = error.span_start.unwrap_or(error.pos);
                let end = error.span_end.unwrap_or(self.formula.len());
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start,
                    end,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Encountered unmatched '['".to_string(),
                    RecoveryAction::UnmatchedBracket,
                ));
            }
            SpanTokenizerErrorKind::MismatchedPair => {
                // Swallow the mismatched closer as a one-byte placeholder.
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start: error.pos,
                    end: error.pos + 1,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Mismatched ( and { pair".to_string(),
                    RecoveryAction::SkippedUnmatchedCloser,
                ));
            }
            SpanTokenizerErrorKind::InvalidErrorLiteral => {
                // Cover the bogus '#...' run found by parse_error.
                let start = error.span_start.unwrap_or(error.pos);
                let end = error.span_end.unwrap_or(error.pos + 1);
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start,
                    end,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Invalid error code".to_string(),
                    RecoveryAction::InvalidErrorLiteral,
                ));
            }
        }
    }
1049
    /// Parses a double-quoted string literal or a single-quoted sheet name
    /// starting at the cursor.
    ///
    /// A doubled delimiter ("" or '') is the escape form and stays inside
    /// the token. Double-quoted text becomes its own operand span; a
    /// single-quoted sheet name is kept in the in-progress token so the
    /// following '!' and cell part can be appended to it.
    fn parse_string(&mut self) -> Result<(), SpanTokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // A lone '$' directly before a quoted name stays glued to it
        // (presumably for "$'Sheet'"-style references -- TODO confirm).
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        // A quoted name right after ':' continues the current reference
        // token instead of starting a new operand.
        let glue_to_token = delim == b'\''
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] == b':';

        // Otherwise any pending operand is flushed before the string.
        if !is_dollar_ref && !glue_to_token && self.has_token() {
            self.save_token();
            self.start_token();
        }

        // For the "$'..." case the span starts at the '$', not the quote.
        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    // Doubled delimiter: escaped quote, keep scanning.
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        // Text literal: emit immediately as an operand span.
                        let value_str = &self.formula[string_start..self.offset];
                        let subtype = operand_subtype(value_str);
                        self.push_span(TokenType::Operand, subtype, string_start, self.offset);
                        self.start_token();
                    } else {
                        // Quoted sheet name: leave it in the in-progress
                        // token for the rest of the reference.
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        // Ran off the end without a closing delimiter.
        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::UnterminatedString,
            pos: self.offset,
            message: "Reached end of formula while parsing string".to_string(),
            span_start: Some(string_start),
            span_end: Some(self.formula.len()),
        })
    }
1110
    /// Consumes a bracketed section "[...]" (including nested brackets) as
    /// part of the current reference token.
    fn parse_brackets(&mut self) -> Result<(), SpanTokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let bracket_start = self.offset;
        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'\'' => {
                    // NOTE(review): a quote skips the following byte
                    // unconditionally (treated as an escape), so a '[' or
                    // ']' right after ' is not counted -- confirm this
                    // matches the intended escaping rule for bracketed
                    // names.
                    if self.offset + 1 < self.formula.len() {
                        self.offset += 2;
                        continue;
                    }
                    self.offset += 1;
                    continue;
                }
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        // Include the closing ']' in the token.
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        // End of input with brackets still open.
        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::UnmatchedBracket,
            pos: self.offset,
            message: "Encountered unmatched '['".to_string(),
            span_start: Some(bracket_start),
            span_end: Some(self.formula.len()),
        })
    }
1160
    /// Parses an error literal (e.g. "#REF!") at the cursor, matching
    /// against `ERROR_CODES` case-insensitively.
    fn parse_error(&mut self) -> Result<(), SpanTokenizerError> {
        // A pending token ending in '!' is a sheet qualifier, as in
        // "Sheet1!#REF!".
        let has_sheet_prefix = self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] == b'!';
        if has_sheet_prefix {
            if self.token_end - self.token_start <= 1 {
                // The pending token is just "!": nothing qualifies it.
                return Err(SpanTokenizerError {
                    kind: SpanTokenizerErrorKind::InvalidErrorLiteral,
                    pos: self.offset,
                    message: format!(
                        "Empty sheet qualifier before error literal at position {}",
                        self.offset
                    ),
                    span_start: Some(self.token_start),
                    span_end: Some(self.offset),
                });
            }
            // NOTE(review): the qualifier text is reset here without being
            // saved as a span, so it is absent from the output -- confirm
            // this is intended.
            self.start_token();
        } else if self.has_token() {
            self.save_token();
            self.start_token();
        }

        let error_start = self.offset;

        // First match wins; ERROR_CODES is ordered by descending length.
        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice.eq_ignore_ascii_case(err_bytes) {
                    self.push_span(
                        TokenType::Operand,
                        TokenSubType::Error,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        // No known code matched: scan forward to the next delimiter so the
        // whole bogus literal can be reported as a single span.
        let mut end = self.offset + 1;
        while end < self.formula.len() {
            let ch = self.formula.as_bytes()[end];
            if is_token_ender(ch)
                || ch == b' '
                || ch == b'\n'
                || ch == b'('
                || ch == b'{'
                || ch == b'['
                || ch == b'"'
                || ch == b'\''
            {
                break;
            }
            end += 1;
        }

        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::InvalidErrorLiteral,
            pos: self.offset,
            message: format!("Invalid error code at position {}", self.offset),
            span_start: Some(error_start),
            span_end: Some(end),
        })
    }
1235
1236 fn parse_whitespace(&mut self) -> Result<(), SpanTokenizerError> {
1237 self.save_token();
1238
1239 let ws_start = self.offset;
1240 while self.offset < self.formula.len() {
1241 match self.formula.as_bytes()[self.offset] {
1242 b' ' | b'\n' => self.offset += 1,
1243 _ => break,
1244 }
1245 }
1246
1247 let token_type = if self.prev_is_reference_producing()
1248 && next_starts_reference_expression(self.formula, self.offset)
1249 {
1250 TokenType::OpInfix
1251 } else {
1252 TokenType::Whitespace
1253 };
1254 self.push_span(token_type, TokenSubType::None, ws_start, self.offset);
1255 self.start_token();
1256 Ok(())
1257 }
1258
1259 fn prev_is_reference_producing(&self) -> bool {
1260 match self.prev_non_whitespace() {
1261 Some(prev) => match prev.token_type {
1262 TokenType::OpPostfix => true,
1263 TokenType::Paren | TokenType::Func | TokenType::Array
1264 if prev.subtype == TokenSubType::Close =>
1265 {
1266 true
1267 }
1268 TokenType::Operand if prev.subtype == TokenSubType::Range => self
1269 .formula
1270 .get(prev.start..prev.end)
1271 .is_some_and(is_reference_operand_value),
1272 _ => false,
1273 },
1274 None => false,
1275 }
1276 }
1277
1278 fn should_emit_colon_infix(&self) -> bool {
1279 if self.has_token() {
1280 let value = &self.formula[self.token_start..self.token_end];
1281 if value.ends_with('!') {
1282 return false;
1283 }
1284 return reference_value_contains_range_colon(value)
1285 || value.contains('[')
1286 || (value.contains('!')
1287 && next_reference_has_sheet_qualifier(self.formula, self.offset + 1));
1288 }
1289 self.prev_is_reference_producing()
1290 }
1291
1292 fn emit_infix_operator(&mut self, start: usize, end: usize) {
1293 self.save_token();
1294 self.start_token();
1295 self.push_span(TokenType::OpInfix, TokenSubType::None, start, end);
1296 self.offset = end;
1297 self.start_token();
1298 }
1299
1300 fn prev_non_whitespace(&self) -> Option<&TokenSpan> {
1301 self.spans
1302 .iter()
1303 .rev()
1304 .find(|t| t.token_type != TokenType::Whitespace)
1305 }
1306
1307 fn should_emit_hash_postfix(&self) -> bool {
1312 if self.has_token() {
1313 if self.formula.as_bytes()[self.token_end - 1] == b'!' {
1317 return false;
1318 }
1319 let value = &self.formula[self.token_start..self.token_end];
1320 return operand_subtype(value) == TokenSubType::Range;
1321 }
1322 match self.prev_non_whitespace() {
1323 Some(prev) => match prev.token_type {
1324 TokenType::OpPostfix => true,
1325 TokenType::Paren | TokenType::Func | TokenType::Array
1326 if prev.subtype == TokenSubType::Close =>
1327 {
1328 true
1329 }
1330 TokenType::Operand if prev.subtype == TokenSubType::Range => true,
1331 _ => false,
1332 },
1333 None => false,
1334 }
1335 }
1336
1337 fn emit_hash_postfix(&mut self) {
1338 self.save_token();
1339 self.start_token();
1340 self.push_span(
1341 TokenType::OpPostfix,
1342 TokenSubType::None,
1343 self.offset,
1344 self.offset + 1,
1345 );
1346 self.offset += 1;
1347 self.start_token();
1348 }
1349
1350 fn parse_operator(&mut self) -> Result<(), SpanTokenizerError> {
1351 self.save_token();
1352
1353 if self.offset + 1 < self.formula.len() {
1354 let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
1355 if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
1356 self.push_span(
1357 TokenType::OpInfix,
1358 TokenSubType::None,
1359 self.offset,
1360 self.offset + 2,
1361 );
1362 self.offset += 2;
1363 self.start_token();
1364 return Ok(());
1365 }
1366 }
1367
1368 let curr_byte = self.formula.as_bytes()[self.offset];
1369 let token_type = match curr_byte {
1370 b'@' => TokenType::OpPrefix,
1371 b'%' => TokenType::OpPostfix,
1372 b'+' | b'-' => {
1373 if self.spans.is_empty() {
1374 TokenType::OpPrefix
1375 } else {
1376 let prev = self.prev_non_whitespace();
1377 if let Some(p) = prev {
1378 if p.subtype == TokenSubType::Close
1379 || p.token_type == TokenType::OpPostfix
1380 || p.token_type == TokenType::Operand
1381 {
1382 TokenType::OpInfix
1383 } else {
1384 TokenType::OpPrefix
1385 }
1386 } else {
1387 TokenType::OpPrefix
1388 }
1389 }
1390 }
1391 _ => TokenType::OpInfix,
1392 };
1393
1394 self.push_span(token_type, TokenSubType::None, self.offset, self.offset + 1);
1395 self.offset += 1;
1396 self.start_token();
1397 Ok(())
1398 }
1399
1400 fn parse_opener(&mut self) -> Result<(), SpanTokenizerError> {
1401 let curr_byte = self.formula.as_bytes()[self.offset];
1402 assert!(curr_byte == b'(' || curr_byte == b'{');
1403
1404 let token = if curr_byte == b'{' {
1405 self.save_token();
1406 TokenSpan {
1407 token_type: TokenType::Array,
1408 subtype: TokenSubType::Open,
1409 start: self.offset,
1410 end: self.offset + 1,
1411 }
1412 } else if self.has_token() {
1413 let token = TokenSpan {
1414 token_type: TokenType::Func,
1415 subtype: TokenSubType::Open,
1416 start: self.token_start,
1417 end: self.offset + 1,
1418 };
1419 self.token_start = self.offset + 1;
1420 self.token_end = self.offset + 1;
1421 token
1422 } else {
1423 TokenSpan {
1424 token_type: TokenType::Paren,
1425 subtype: TokenSubType::Open,
1426 start: self.offset,
1427 end: self.offset + 1,
1428 }
1429 };
1430
1431 self.spans.push(token);
1432 self.token_stack.push(token);
1433 self.offset += 1;
1434 self.start_token();
1435 Ok(())
1436 }
1437
1438 fn parse_closer(&mut self) -> Result<(), SpanTokenizerError> {
1439 self.save_token();
1440
1441 let curr_byte = self.formula.as_bytes()[self.offset];
1442 assert!(curr_byte == b')' || curr_byte == b'}');
1443
1444 if let Some(open_token) = self.token_stack.last().copied() {
1445 let expected = if open_token.token_type == TokenType::Array {
1446 b'}'
1447 } else {
1448 b')'
1449 };
1450 if curr_byte != expected {
1451 return Err(SpanTokenizerError {
1452 kind: SpanTokenizerErrorKind::MismatchedPair,
1453 pos: self.offset,
1454 message: "Mismatched ( and { pair".to_string(),
1455 span_start: Some(self.offset),
1456 span_end: Some(self.offset + 1),
1457 });
1458 }
1459
1460 self.token_stack.pop();
1461 self.push_span(
1462 open_token.token_type,
1463 TokenSubType::Close,
1464 self.offset,
1465 self.offset + 1,
1466 );
1467 } else {
1468 return Err(SpanTokenizerError {
1469 kind: SpanTokenizerErrorKind::NoMatchingOpener,
1470 pos: self.offset,
1471 message: format!("No matching opener for closer at position {}", self.offset),
1472 span_start: Some(self.offset),
1473 span_end: Some(self.offset + 1),
1474 });
1475 }
1476
1477 self.offset += 1;
1478 self.start_token();
1479 Ok(())
1480 }
1481
    /// Tokenize ',' or ';' according to the surrounding context and dialect.
    ///
    /// Inside a function call or array literal, ',' separates arguments; at
    /// the top level it becomes an infix operator. ';' is an array row
    /// separator, except in the OpenFormula dialect where it is the argument
    /// separator inside calls and an infix operator elsewhere.
    fn parse_separator(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        // Context is determined by the innermost unclosed opener, if any.
        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    // NOTE(review): in the Excel dialect a ';' outside any
                    // array still becomes a row separator — presumably for
                    // lenient handling; confirm this is intended.
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            // Unreachable given the assert above; kept for exhaustiveness.
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.push_span(token_type, subtype, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
1522}
1523
1524pub struct Tokenizer {
1526 formula: String, pub items: Vec<Token>,
1528 token_stack: Vec<Token>,
1529 offset: usize, token_start: usize, token_end: usize, dialect: FormulaDialect,
1533}
1534
1535impl Tokenizer {
    /// Strictly tokenize `formula` using the default Excel dialect.
    ///
    /// # Errors
    /// Returns a `TokenizerError` on malformed input (unterminated string,
    /// unmatched bracket/paren, invalid error literal, ...).
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }
1540
    /// Leniently tokenize `formula` (never fails) using the Excel dialect.
    pub fn new_best_effort(formula: &str) -> Self {
        Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
    }
1545
    /// Leniently tokenize for the given dialect by delegating to the
    /// recovering `TokenStream` tokenizer and converting its spans to tokens.
    pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
        let stream = TokenStream::new_best_effort_with_dialect(formula, dialect);
        Self::from_token_stream(&stream)
    }
1551
1552 pub fn new_with_dialect(
1554 formula: &str,
1555 dialect: FormulaDialect,
1556 ) -> Result<Self, TokenizerError> {
1557 let mut tokenizer = Tokenizer {
1558 formula: formula.to_string(),
1559 items: Vec::with_capacity(formula.len() / 2), token_stack: Vec::with_capacity(16),
1561 offset: 0,
1562 token_start: 0,
1563 token_end: 0,
1564 dialect,
1565 };
1566 tokenizer.parse()?;
1567 Ok(tokenizer)
1568 }
1569
    /// Build a `Tokenizer` from an already-tokenized [`TokenStream`],
    /// materializing its spans into owned tokens. Cursor fields are reset
    /// because no further parsing will occur on this instance.
    pub fn from_token_stream(stream: &TokenStream) -> Self {
        Tokenizer {
            formula: stream.source.to_string(),
            items: stream.to_tokens(),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect: stream.dialect,
        }
    }
1581
1582 #[inline]
1584 fn current_byte(&self) -> Option<u8> {
1585 self.formula.as_bytes().get(self.offset).copied()
1586 }
1587
    /// True when bytes have accumulated in the current (unsaved) token.
    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }
1593
    /// Begin accumulating a new token at the cursor (empty until extended).
    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }
1600
    /// Grow the current token so it ends at the cursor.
    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }
1606
    /// Main tokenization loop: walk the formula byte by byte and dispatch on
    /// the current character.
    ///
    /// A formula must start with '=' to be parsed; anything else becomes a
    /// single `Literal` token.
    ///
    /// # Errors
    /// Propagates errors from the sub-parsers and fails if any '('/'{' is
    /// left unmatched at end of input.
    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        // Without a leading '=', the whole input is a plain literal.
        if self.formula.as_bytes()[0] != b'=' {
            self.items.push(Token::new_with_span(
                self.formula.clone(),
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            ));
            return Ok(());
        }

        // Skip the '='; `render` re-adds it.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            // "1.5E+3"-style exponents: absorb the sign into the number
            // token instead of treating it as an operator.
            if self.check_scientific_notation()? {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // A token-ending character flushes the accumulated operand.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => {
                    // '#' is either the spilled-range postfix (A1#) or the
                    // start of an error literal (#REF!).
                    if self.should_emit_hash_postfix() {
                        self.emit_hash_postfix();
                    } else {
                        self.parse_error()?
                    }
                }
                b' ' | b'\n' => self.parse_whitespace()?,
                b':' => {
                    // ':' is a range operator only in specific reference
                    // contexts; otherwise it stays inside the token (A1:B2).
                    if self.should_emit_colon_infix() {
                        self.emit_infix_operator(self.offset, self.offset + 1);
                    } else {
                        if !self.has_token() {
                            self.start_token();
                        }
                        self.offset += 1;
                        self.extend_token();
                    }
                }
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    // Any other byte accumulates into the current token.
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        // Flush a trailing operand.
        if self.has_token() {
            self.save_token();
        }

        // Strict mode: every opener must have been closed.
        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }
1698
1699 fn check_scientific_notation(&mut self) -> Result<bool, TokenizerError> {
1707 if let Some(curr_byte) = self.current_byte() {
1708 if (curr_byte == b'+' || curr_byte == b'-')
1709 && self.has_token()
1710 && self.is_scientific_notation_base()
1711 && self
1712 .formula
1713 .as_bytes()
1714 .get(self.offset + 1)
1715 .is_some_and(|b| b.is_ascii_digit())
1716 {
1717 self.offset += 1;
1718 self.extend_token();
1719 return Ok(true);
1720 }
1721 }
1722 Ok(false)
1723 }
1724
1725 fn is_scientific_notation_base(&self) -> bool {
1728 if !self.has_token() {
1729 return false;
1730 }
1731
1732 let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
1733 if token_slice.len() < 2 {
1734 return false;
1735 }
1736
1737 let last = token_slice[token_slice.len() - 1];
1738 if !(last == b'E' || last == b'e') {
1739 return false;
1740 }
1741
1742 let first = token_slice[0];
1743 if !first.is_ascii_digit() {
1744 return false;
1745 }
1746
1747 let mut dot_seen = false;
1748 for &ch in &token_slice[1..token_slice.len() - 1] {
1750 match ch {
1751 b'0'..=b'9' => {}
1752 b'.' if !dot_seen => dot_seen = true,
1753 _ => return false,
1754 }
1755 }
1756 true
1757 }
1758
    /// Push the accumulated bytes (if any) as an operand token; the
    /// operand's subtype is derived from its text by `make_operand_from_slice`.
    fn save_token(&mut self) {
        if self.has_token() {
            let token =
                Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
            self.items.push(token);
        }
    }
1767
    /// Parse a double-quoted string literal or a single-quoted sheet name.
    ///
    /// Doubled delimiters ("" or '') are escapes and stay inside the token.
    /// A '"' string becomes its own operand token, while a '\'' quoted name
    /// is kept in the accumulating token because it is part of a larger
    /// reference (e.g. `'My Sheet'!A1`).
    ///
    /// # Errors
    /// Fails when the input ends before the closing delimiter.
    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // "$'Sheet'!A1"-style: a lone '$' pending before a quote belongs to
        // the quoted reference that follows.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        // "A1:'Sheet two'!B2"-style: a quote right after ':' continues the
        // pending range reference rather than starting a fresh token.
        let glue_to_token = delim == b'\''
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] == b':';

        if !is_dollar_ref && !glue_to_token && self.has_token() {
            self.save_token();
            self.start_token();
        }

        // For the '$' case the string starts at the '$', not at the quote.
        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;
        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    // Doubled delimiter: escaped quote, keep scanning.
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        // Completed string literal -> standalone operand.
                        let token = Token::make_operand_from_slice(
                            &self.formula,
                            string_start,
                            self.offset,
                        );
                        self.items.push(token);
                        self.start_token();
                    } else {
                        // Quoted sheet name: extend the pending token so the
                        // following "!A1" part can attach to it.
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }
1833
    /// Consume a bracketed section `[ ... ]` (structured/table or external
    /// reference) into the current token, tracking nesting depth.
    ///
    /// # Errors
    /// Fails when the input ends before all brackets are closed.
    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'\'' => {
                    // A '\'' inside brackets skips the byte after it —
                    // presumably an escape for special characters like
                    // brackets (e.g. Table[['#Header]]) so the skipped byte
                    // cannot affect the depth count. TODO confirm semantics.
                    if self.offset + 1 < self.formula.len() {
                        self.offset += 2;
                        continue;
                    }
                    self.offset += 1;
                    continue;
                }
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        // Include the final ']' in the token.
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }
1878
1879 fn should_emit_hash_postfix(&self) -> bool {
1881 if self.has_token() {
1882 if self.formula.as_bytes()[self.token_end - 1] == b'!' {
1883 return false;
1884 }
1885 let value = &self.formula[self.token_start..self.token_end];
1886 let is_range = !value.starts_with('"')
1890 && !value.starts_with('#')
1891 && value != "TRUE"
1892 && value != "FALSE"
1893 && value.parse::<f64>().is_err();
1894 return is_range;
1895 }
1896 let prev = self
1897 .items
1898 .iter()
1899 .rev()
1900 .find(|t| t.token_type != TokenType::Whitespace);
1901 match prev {
1902 Some(p) => match p.token_type {
1903 TokenType::OpPostfix => true,
1904 TokenType::Paren | TokenType::Func | TokenType::Array
1905 if p.subtype == TokenSubType::Close =>
1906 {
1907 true
1908 }
1909 TokenType::Operand if p.subtype == TokenSubType::Range => true,
1910 _ => false,
1911 },
1912 None => false,
1913 }
1914 }
1915
1916 fn emit_hash_postfix(&mut self) {
1917 self.save_token();
1918 self.start_token();
1919 self.items.push(Token::from_slice(
1920 &self.formula,
1921 TokenType::OpPostfix,
1922 TokenSubType::None,
1923 self.offset,
1924 self.offset + 1,
1925 ));
1926 self.offset += 1;
1927 self.start_token();
1928 }
1929
1930 fn parse_error(&mut self) -> Result<(), TokenizerError> {
1932 let has_sheet_prefix = self.has_token()
1937 && self.token_end > 0
1938 && self.formula.as_bytes()[self.token_end - 1] == b'!';
1939 if has_sheet_prefix {
1940 if self.token_end - self.token_start <= 1 {
1941 return Err(TokenizerError {
1942 message: format!(
1943 "Empty sheet qualifier before error literal at position {}",
1944 self.offset
1945 ),
1946 pos: self.offset,
1947 });
1948 }
1949 self.start_token();
1951 } else if self.has_token() {
1952 self.save_token();
1953 self.start_token();
1954 }
1955
1956 let error_start = self.offset;
1957
1958 for &err_code in ERROR_CODES {
1960 let err_bytes = err_code.as_bytes();
1961 if self.offset + err_bytes.len() <= self.formula.len() {
1962 let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
1963 if slice.eq_ignore_ascii_case(err_bytes) {
1964 let token = Token::make_operand_from_slice(
1965 &self.formula,
1966 error_start,
1967 self.offset + err_bytes.len(),
1968 );
1969 self.items.push(token);
1970 self.offset += err_bytes.len();
1971 self.start_token();
1972 return Ok(());
1973 }
1974 }
1975 }
1976
1977 Err(TokenizerError {
1978 message: format!("Invalid error code at position {}", self.offset),
1979 pos: self.offset,
1980 })
1981 }
1982
1983 fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
1985 self.save_token();
1986
1987 let ws_start = self.offset;
1988 while self.offset < self.formula.len() {
1989 match self.formula.as_bytes()[self.offset] {
1990 b' ' | b'\n' => self.offset += 1,
1991 _ => break,
1992 }
1993 }
1994
1995 let token_type = if self.prev_is_reference_producing()
1996 && next_starts_reference_expression(&self.formula, self.offset)
1997 {
1998 TokenType::OpInfix
1999 } else {
2000 TokenType::Whitespace
2001 };
2002
2003 self.items.push(Token::from_slice(
2004 &self.formula,
2005 token_type,
2006 TokenSubType::None,
2007 ws_start,
2008 self.offset,
2009 ));
2010 self.start_token();
2011 Ok(())
2012 }
2013
2014 fn prev_non_whitespace(&self) -> Option<&Token> {
2015 self.items
2016 .iter()
2017 .rev()
2018 .find(|t| t.token_type != TokenType::Whitespace)
2019 }
2020
2021 fn prev_is_reference_producing(&self) -> bool {
2022 match self.prev_non_whitespace() {
2023 Some(prev) => match prev.token_type {
2024 TokenType::OpPostfix => true,
2025 TokenType::Paren | TokenType::Func | TokenType::Array
2026 if prev.subtype == TokenSubType::Close =>
2027 {
2028 true
2029 }
2030 TokenType::Operand if prev.subtype == TokenSubType::Range => {
2031 is_reference_operand_value(&prev.value)
2032 }
2033 _ => false,
2034 },
2035 None => false,
2036 }
2037 }
2038
2039 fn should_emit_colon_infix(&self) -> bool {
2040 if self.has_token() {
2041 let value = &self.formula[self.token_start..self.token_end];
2042 if value.ends_with('!') {
2043 return false;
2044 }
2045 return reference_value_contains_range_colon(value)
2046 || value.contains('[')
2047 || (value.contains('!')
2048 && next_reference_has_sheet_qualifier(&self.formula, self.offset + 1));
2049 }
2050 self.prev_is_reference_producing()
2051 }
2052
2053 fn emit_infix_operator(&mut self, start: usize, end: usize) {
2054 self.save_token();
2055 self.start_token();
2056 self.items.push(Token::from_slice(
2057 &self.formula,
2058 TokenType::OpInfix,
2059 TokenSubType::None,
2060 start,
2061 end,
2062 ));
2063 self.offset = end;
2064 self.start_token();
2065 }
2066
2067 fn parse_operator(&mut self) -> Result<(), TokenizerError> {
2069 self.save_token();
2070
2071 if self.offset + 1 < self.formula.len() {
2073 let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
2074 if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
2075 self.items.push(Token::from_slice(
2076 &self.formula,
2077 TokenType::OpInfix,
2078 TokenSubType::None,
2079 self.offset,
2080 self.offset + 2,
2081 ));
2082 self.offset += 2;
2083 self.start_token();
2084 return Ok(());
2085 }
2086 }
2087
2088 let curr_byte = self.formula.as_bytes()[self.offset];
2089 let token_type = match curr_byte {
2090 b'@' => TokenType::OpPrefix,
2091 b'%' => TokenType::OpPostfix,
2092 b'+' | b'-' => {
2093 if self.items.is_empty() {
2095 TokenType::OpPrefix
2096 } else {
2097 let prev = self
2098 .items
2099 .iter()
2100 .rev()
2101 .find(|t| t.token_type != TokenType::Whitespace);
2102 if let Some(p) = prev {
2103 if p.subtype == TokenSubType::Close
2104 || p.token_type == TokenType::OpPostfix
2105 || p.token_type == TokenType::Operand
2106 {
2107 TokenType::OpInfix
2108 } else {
2109 TokenType::OpPrefix
2110 }
2111 } else {
2112 TokenType::OpPrefix
2113 }
2114 }
2115 }
2116 _ => TokenType::OpInfix,
2117 };
2118
2119 self.items.push(Token::from_slice(
2120 &self.formula,
2121 token_type,
2122 TokenSubType::None,
2123 self.offset,
2124 self.offset + 1,
2125 ));
2126 self.offset += 1;
2127 self.start_token();
2128 Ok(())
2129 }
2130
2131 fn parse_opener(&mut self) -> Result<(), TokenizerError> {
2133 let curr_byte = self.formula.as_bytes()[self.offset];
2134 assert!(curr_byte == b'(' || curr_byte == b'{');
2135
2136 let token = if curr_byte == b'{' {
2137 self.save_token();
2138 Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
2139 } else if self.has_token() {
2140 let token = Token::make_subexp_from_slice(
2142 &self.formula,
2143 true,
2144 self.token_start,
2145 self.offset + 1,
2146 );
2147 self.token_start = self.offset + 1;
2148 self.token_end = self.offset + 1;
2149 token
2150 } else {
2151 Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
2152 };
2153
2154 self.items.push(token.clone());
2155 self.token_stack.push(token);
2156 self.offset += 1;
2157 self.start_token();
2158 Ok(())
2159 }
2160
    /// Tokenize ')' or '}', pairing it with the innermost unclosed opener.
    ///
    /// # Errors
    /// Fails when there is no open '('/'{' or when the closer does not match
    /// the opener's kind. (The opener is popped before validation; on the
    /// mismatch path the error aborts parsing, so the lost stack entry is
    /// never observed.)
    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            // `get_closer` yields the ")"/"}" token matching the opener.
            let closer = open_token.get_closer()?;
            if (curr_byte == b'}' && closer.value != "}")
                || (curr_byte == b')' && closer.value != ")")
            {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.items.push(Token::from_slice(
                &self.formula,
                closer.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            ));
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }
2197
    /// Tokenize ',' or ';' according to the surrounding context and dialect.
    ///
    /// Inside a function call or array literal, ',' separates arguments; at
    /// the top level it becomes an infix operator. ';' is an array row
    /// separator, except in the OpenFormula dialect where it is the argument
    /// separator inside calls and an infix operator elsewhere.
    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        // Context is determined by the innermost unclosed opener, if any.
        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    // NOTE(review): in the Excel dialect a ';' outside any
                    // array still becomes a row separator — presumably for
                    // lenient handling; confirm this is intended.
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            // Unreachable given the assert above; kept for exhaustiveness.
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            subtype,
            self.offset,
            self.offset + 1,
        ));

        self.offset += 1;
        self.start_token();
        Ok(())
    }
2248
2249 pub fn render(&self) -> String {
2251 if self.items.is_empty() {
2252 "".to_string()
2253 } else if self.items[0].token_type == TokenType::Literal {
2254 self.items[0].value.clone()
2255 } else {
2256 let concatenated: String = self.items.iter().map(|t| t.value.clone()).collect();
2257 format!("={concatenated}")
2258 }
2259 }
2260
    /// The formula dialect this tokenizer was constructed with.
    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }
2265}
2266
/// Strict tokenization via `Tokenizer::new`, enabling `"=A1+1".try_into()`.
impl TryFrom<&str> for Tokenizer {
    type Error = TokenizerError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Tokenizer::new(value)
    }
}
2274
/// Strict tokenization from an owned `String` (borrows it; the string is
/// copied internally by `Tokenizer::new`).
impl TryFrom<String> for Tokenizer {
    type Error = TokenizerError;

    fn try_from(value: String) -> Result<Self, Self::Error> {
        Tokenizer::new(&value)
    }
}