1use std::convert::TryFrom;
2use std::error::Error;
3use std::fmt::{self, Display};
4use std::sync::Arc;
5
6#[cfg(feature = "serde")]
7use serde::{Deserialize, Serialize};
8
9use crate::types::FormulaDialect;
10
/// Characters that terminate an in-progress operand token.
const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%@";

/// Build a 256-entry lookup table marking each token-ender byte.
/// Written as a `const fn` (with a `while` loop, since iterators are not
/// available in const context) so the table is computed at compile time.
const fn build_token_enders() -> [bool; 256] {
    let mut table = [false; 256];
    let enders = TOKEN_ENDERS.as_bytes();
    let mut idx = 0;
    while idx < enders.len() {
        table[enders[idx] as usize] = true;
        idx += 1;
    }
    table
}

/// Precomputed byte-classification table; indexing by any `u8` is in bounds.
static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();
24
25#[inline(always)]
26fn is_token_ender(c: u8) -> bool {
27 TOKEN_ENDERS_TABLE[c as usize]
28}
29
/// Excel error literals recognized by the tokenizer; `parse_error` compares
/// the formula bytes at the current offset against each entry in order.
/// NOTE(review): newer Excel codes such as #SPILL! and #CALC! are absent —
/// confirm whether they should be accepted here.
static ERROR_CODES: &[&str] = &[
    "#NULL!",
    "#DIV/0!",
    "#VALUE!",
    "#REF!",
    "#NAME?",
    "#NUM!",
    "#N/A",
    "#GETTING_DATA",
];
40
/// Operator associativity, paired with a precedence level by
/// `Token::get_precedence`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
    Left,
    Right,
}
47
/// Public tokenizer error: a human-readable message plus the byte offset in
/// the formula at which scanning stopped.
#[derive(Debug)]
pub struct TokenizerError {
    pub message: String,
    pub pos: usize,
}
54
/// How the best-effort tokenizer recovered from a malformed construct.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RecoveryAction {
    /// A ')'/'}' with no opener (or a mismatched pair) was emitted as a bare operand.
    SkippedUnmatchedCloser,
    /// Input ended inside a quoted string; the rest became one operand span.
    UnterminatedString,
    /// Input ended inside a '[' ... ']' region.
    UnmatchedBracket,
    /// A '#' sequence matched no known error code.
    InvalidErrorLiteral,
    /// An opener was still unclosed when input ended.
    UnmatchedOpener,
}
70
/// A problem recorded during best-effort tokenization: where it happened,
/// what went wrong, and how the tokenizer recovered.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenDiagnostic {
    pub span: TokenSpan,
    pub message: String,
    pub recovery: RecoveryAction,
}
79
80impl TokenDiagnostic {
81 fn new(span: TokenSpan, message: String, recovery: RecoveryAction) -> Self {
82 Self {
83 span,
84 message,
85 recovery,
86 }
87 }
88}
89
/// Internal scanner error carrying enough span information to drive
/// best-effort recovery; converted to the public `TokenizerError`
/// (dropping kind and span) when strict parsing fails.
#[derive(Debug, Clone)]
struct SpanTokenizerError {
    // Selects the recovery path in `recover_from_error`.
    kind: SpanTokenizerErrorKind,
    // Byte offset where the error was detected.
    pos: usize,
    // Human-readable description, reused as the diagnostic message.
    message: String,
    // Optional start of the source range to replace during recovery.
    span_start: Option<usize>,
    // Optional end of that range.
    span_end: Option<usize>,
}
98
/// Kinds of scan failure the tokenizer can hit, one per recovery strategy.
#[derive(Debug, Clone, Copy)]
enum SpanTokenizerErrorKind {
    /// ')' or '}' with nothing on the bracket stack.
    NoMatchingOpener,
    /// Bracket stack non-empty at end of input (strict mode only).
    UnmatchedOpening,
    /// End of input reached inside a quoted string.
    UnterminatedString,
    /// End of input reached inside '[' ... ']'.
    UnmatchedBracket,
    /// Closer does not match the opener on top of the stack.
    MismatchedPair,
    /// '#' sequence that is not a known error code.
    InvalidErrorLiteral,
}
108
109impl From<SpanTokenizerError> for TokenizerError {
110 fn from(value: SpanTokenizerError) -> Self {
111 TokenizerError {
112 message: value.message,
113 pos: value.pos,
114 }
115 }
116}
117
118impl fmt::Display for TokenizerError {
119 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
120 write!(f, "TokenizerError: {}", self.message)
121 }
122}
123
124impl Error for TokenizerError {}
125
/// Coarse token classification.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Entire input, emitted when the formula does not begin with '='.
    Literal,
    /// A value: number, text, logical, error code, or reference/name.
    Operand,
    /// Function-call boundary (name plus '(' on open, ')' on close).
    Func,
    /// Array-literal boundary ('{' / '}').
    Array,
    /// Grouping parenthesis.
    Paren,
    /// Argument (',') or row (';') separator.
    Sep,
    /// Prefix operator (unary +/-, '@').
    OpPrefix,
    /// Infix (binary) operator.
    OpInfix,
    /// Postfix operator ('%').
    OpPostfix,
    /// A run of spaces and/or newlines, kept as its own token.
    Whitespace,
}
141impl Display for TokenType {
142 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
143 write!(f, "{self:?}")
144 }
145}
146
/// Finer classification within a `TokenType`.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
    /// No finer classification applies.
    None,
    /// Quoted string operand.
    Text,
    /// Numeric operand (value parses as f64).
    Number,
    /// TRUE/FALSE operand.
    Logical,
    /// Error-code operand such as #REF!.
    Error,
    /// Reference or name operand (fallback classification).
    Range,
    /// Opening '(' or '{'.
    Open,
    /// Closing ')' or '}'.
    Close,
    /// Argument separator.
    Arg,
    /// Row separator.
    Row,
}
162impl Display for TokenSubType {
163 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164 write!(f, "{self:?}")
165 }
166}
167
168#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
170#[derive(Debug, Clone, PartialEq, Hash)]
171pub struct Token {
172 pub value: String, pub token_type: TokenType,
174 pub subtype: TokenSubType,
175 pub start: usize,
176 pub end: usize,
177}
178
179impl Display for Token {
180 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
181 write!(
182 f,
183 "<{} subtype: {:?} value: {}>",
184 self.token_type, self.subtype, self.value
185 )
186 }
187}
188
189impl Token {
190 pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
191 Token {
192 value,
193 token_type,
194 subtype,
195 start: 0,
196 end: 0,
197 }
198 }
199
200 pub fn new_with_span(
201 value: String,
202 token_type: TokenType,
203 subtype: TokenSubType,
204 start: usize,
205 end: usize,
206 ) -> Self {
207 Token {
208 value,
209 token_type,
210 subtype,
211 start,
212 end,
213 }
214 }
215
216 fn from_slice(
217 source: &str,
218 token_type: TokenType,
219 subtype: TokenSubType,
220 start: usize,
221 end: usize,
222 ) -> Self {
223 Token {
224 value: source[start..end].to_string(),
225 token_type,
226 subtype,
227 start,
228 end,
229 }
230 }
231
232 pub fn is_operator(&self) -> bool {
233 matches!(
234 self.token_type,
235 TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
236 )
237 }
238
239 pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
240 let op = if self.token_type == TokenType::OpPrefix {
242 "u"
243 } else {
244 self.value.as_str()
245 };
246
247 match op {
258 ":" | " " | "," => Some((8, Associativity::Left)),
259 "%" => Some((7, Associativity::Left)),
260 "^" => Some((6, Associativity::Right)),
261 "u" => Some((5, Associativity::Right)),
262 "*" | "/" => Some((4, Associativity::Left)),
263 "+" | "-" => Some((3, Associativity::Left)),
264 "&" => Some((2, Associativity::Left)),
265 "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
266 _ => None,
267 }
268 }
269
270 pub fn make_operand(value: String) -> Self {
272 let subtype = if value.starts_with('"') {
273 TokenSubType::Text
274 } else if value.starts_with('#') {
275 TokenSubType::Error
276 } else if value == "TRUE" || value == "FALSE" {
277 TokenSubType::Logical
278 } else if value.parse::<f64>().is_ok() {
279 TokenSubType::Number
280 } else {
281 TokenSubType::Range
282 };
283 Token::new(value, TokenType::Operand, subtype)
284 }
285
286 pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
288 let subtype = if value.starts_with('"') {
289 TokenSubType::Text
290 } else if value.starts_with('#') {
291 TokenSubType::Error
292 } else if value == "TRUE" || value == "FALSE" {
293 TokenSubType::Logical
294 } else if value.parse::<f64>().is_ok() {
295 TokenSubType::Number
296 } else {
297 TokenSubType::Range
298 };
299 Token::new_with_span(value, TokenType::Operand, subtype, start, end)
300 }
301
302 fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
303 let value_str = &source[start..end];
304 let subtype = if value_str.starts_with('"') {
305 TokenSubType::Text
306 } else if value_str.starts_with('#') {
307 TokenSubType::Error
308 } else if value_str == "TRUE" || value_str == "FALSE" {
309 TokenSubType::Logical
310 } else if value_str.parse::<f64>().is_ok() {
311 TokenSubType::Number
312 } else {
313 TokenSubType::Range
314 };
315 Token::from_slice(source, TokenType::Operand, subtype, start, end)
316 }
317
318 pub fn make_subexp(value: &str, func: bool) -> Self {
323 let last_char = value.chars().last().expect("Empty token value");
324 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
325 let token_type = if func {
326 TokenType::Func
327 } else if "{}".contains(last_char) {
328 TokenType::Array
329 } else if "()".contains(last_char) {
330 TokenType::Paren
331 } else {
332 TokenType::Func
333 };
334 let subtype = if ")}".contains(last_char) {
335 TokenSubType::Close
336 } else {
337 TokenSubType::Open
338 };
339 Token::new(value.to_string(), token_type, subtype)
340 }
341
342 pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
344 let last_char = value.chars().last().expect("Empty token value");
345 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
346 let token_type = if func {
347 TokenType::Func
348 } else if "{}".contains(last_char) {
349 TokenType::Array
350 } else if "()".contains(last_char) {
351 TokenType::Paren
352 } else {
353 TokenType::Func
354 };
355 let subtype = if ")}".contains(last_char) {
356 TokenSubType::Close
357 } else {
358 TokenSubType::Open
359 };
360 Token::new_with_span(value.to_string(), token_type, subtype, start, end)
361 }
362
363 fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
364 let value_str = &source[start..end];
365 let last_char = value_str.chars().last().expect("Empty token value");
366 let token_type = if func {
367 TokenType::Func
368 } else if "{}".contains(last_char) {
369 TokenType::Array
370 } else if "()".contains(last_char) {
371 TokenType::Paren
372 } else {
373 TokenType::Func
374 };
375 let subtype = if ")}".contains(last_char) {
376 TokenSubType::Close
377 } else {
378 TokenSubType::Open
379 };
380 Token::from_slice(source, token_type, subtype, start, end)
381 }
382
383 pub fn get_closer(&self) -> Result<Token, TokenizerError> {
385 if self.subtype != TokenSubType::Open {
386 return Err(TokenizerError {
387 message: "Token is not an opener".to_string(),
388 pos: 0,
389 });
390 }
391 let closer_value = if self.token_type == TokenType::Array {
392 "}"
393 } else {
394 ")"
395 };
396 Ok(Token::make_subexp(
397 closer_value,
398 self.token_type == TokenType::Func,
399 ))
400 }
401
402 pub fn make_separator(value: &str) -> Self {
404 assert!(value == "," || value == ";");
405 let subtype = if value == "," {
406 TokenSubType::Arg
407 } else {
408 TokenSubType::Row
409 };
410 Token::new(value.to_string(), TokenType::Sep, subtype)
411 }
412
413 pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
415 assert!(value == "," || value == ";");
416 let subtype = if value == "," {
417 TokenSubType::Arg
418 } else {
419 TokenSubType::Row
420 };
421 Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
422 }
423}
424
/// A token's classification plus its half-open byte range `[start, end)`
/// into the source formula; the text itself is not stored.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenSpan {
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    pub start: usize,
    pub end: usize,
}
433
/// A borrowed pairing of a span with the source text it covers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenView<'a> {
    pub span: &'a TokenSpan,
    pub value: &'a str,
}
439
/// Tokenized formula stored as a shared source string plus byte-range spans,
/// avoiding a per-token `String`. Cloning is cheap: the `Arc<str>` source is
/// shared, not copied.
#[derive(Debug, Clone)]
pub struct TokenStream {
    // Original formula text, shared between clones.
    source: Arc<str>,
    // Tokens as byte ranges into `source`.
    pub spans: Vec<TokenSpan>,
    // Dialect used during tokenization (affects separator handling).
    dialect: FormulaDialect,
    // Problems recorded by best-effort parsing; empty for strict parsing.
    diagnostics: Vec<TokenDiagnostic>,
}
452
453impl TokenStream {
454 pub fn new(formula: &str) -> Result<Self, TokenizerError> {
455 Self::new_with_dialect(formula, FormulaDialect::Excel)
456 }
457
458 pub fn new_with_dialect(
459 formula: &str,
460 dialect: FormulaDialect,
461 ) -> Result<Self, TokenizerError> {
462 let source: Arc<str> = Arc::from(formula);
463 let spans = tokenize_spans_with_dialect(source.as_ref(), dialect)?;
464 Ok(TokenStream {
465 source,
466 spans,
467 dialect,
468 diagnostics: Vec::new(),
469 })
470 }
471
472 pub fn new_best_effort(formula: &str) -> Self {
473 Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
474 }
475
476 pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
477 let source: Arc<str> = Arc::from(formula);
478 let mut tokenizer = SpanTokenizer::new(source.as_ref(), dialect);
479 let spans = tokenizer.parse_best_effort();
480 let diagnostics = tokenizer.diagnostics;
481 TokenStream {
482 source,
483 spans,
484 dialect,
485 diagnostics,
486 }
487 }
488
489 pub fn diagnostics(&self) -> Vec<TokenDiagnostic> {
490 self.diagnostics.clone()
491 }
492
493 pub fn diagnostics_ref(&self) -> &[TokenDiagnostic] {
494 &self.diagnostics
495 }
496
497 pub fn has_errors(&self) -> bool {
498 !self.diagnostics.is_empty()
499 }
500
501 pub fn invalid_spans_iter(&self) -> impl Iterator<Item = &TokenSpan> {
502 self.spans.iter().filter(|span| {
503 self.diagnostics.iter().any(|diag| {
504 diag.span.start == span.start
505 && diag.span.end == span.end
506 && diag.span.token_type == span.token_type
507 })
508 })
509 }
510
511 pub fn invalid_spans(&self) -> Vec<&TokenSpan> {
512 self.invalid_spans_iter().collect()
513 }
514
515 pub fn source(&self) -> &str {
516 &self.source
517 }
518
519 pub fn dialect(&self) -> FormulaDialect {
520 self.dialect
521 }
522
523 pub fn len(&self) -> usize {
524 self.spans.len()
525 }
526
527 pub fn is_empty(&self) -> bool {
528 self.spans.is_empty()
529 }
530
531 pub fn get(&self, index: usize) -> Option<TokenView<'_>> {
532 let span = self.spans.get(index)?;
533 let value = self.source.get(span.start..span.end)?;
534 Some(TokenView { span, value })
535 }
536
537 pub fn to_tokens(&self) -> Vec<Token> {
538 self.spans
539 .iter()
540 .map(|s| {
541 let value = self
542 .source
543 .get(s.start..s.end)
544 .unwrap_or_default()
545 .to_string();
546 Token::new_with_span(value, s.token_type, s.subtype, s.start, s.end)
547 })
548 .collect()
549 }
550
551 pub fn render(&self) -> String {
556 let mut out = String::with_capacity(self.source.len());
557 for span in &self.spans {
558 if let Some(s) = self.source.get(span.start..span.end) {
559 out.push_str(s);
560 }
561 }
562 out
563 }
564
565 pub fn render_formula(&self) -> String {
567 if self.source.as_bytes().first() == Some(&b'=') {
568 format!("={}", self.render())
569 } else {
570 self.render()
571 }
572 }
573}
574
575pub(crate) fn tokenize_spans_with_dialect(
576 formula: &str,
577 dialect: FormulaDialect,
578) -> Result<Vec<TokenSpan>, TokenizerError> {
579 let mut tokenizer = SpanTokenizer::new(formula, dialect);
580 tokenizer.parse()?;
581 Ok(tokenizer.spans)
582}
583
584fn operand_subtype(value_str: &str) -> TokenSubType {
585 if value_str.starts_with('"') {
586 TokenSubType::Text
587 } else if value_str.starts_with('#') {
588 TokenSubType::Error
589 } else if value_str == "TRUE" || value_str == "FALSE" {
590 TokenSubType::Logical
591 } else if value_str.parse::<f64>().is_ok() {
592 TokenSubType::Number
593 } else {
594 TokenSubType::Range
595 }
596}
597
/// Single-pass, byte-oriented scanner that turns a formula into `TokenSpan`s.
struct SpanTokenizer<'a> {
    // The formula being scanned.
    formula: &'a str,
    // Completed spans, in source order.
    spans: Vec<TokenSpan>,
    // Stack of currently open '(' / '{' / function-call spans.
    token_stack: Vec<TokenSpan>,
    // Current byte offset into `formula`.
    offset: usize,
    // Start of the pending (not yet emitted) operand token.
    token_start: usize,
    // End (exclusive) of the pending operand token.
    token_end: usize,
    // Dialect controlling separator semantics.
    dialect: FormulaDialect,
    // Problems recorded while parsing in best-effort mode.
    diagnostics: Vec<TokenDiagnostic>,
}
608
// Scanner implementation. State machine: a "pending token" accumulates
// operand bytes between `token_start..token_end`; dedicated parse_* methods
// handle strings, brackets, error literals, whitespace, operators, openers,
// closers, and separators.
impl<'a> SpanTokenizer<'a> {
    /// Create a scanner over `formula`; capacities are heuristics sized
    /// from the input length.
    fn new(formula: &'a str, dialect: FormulaDialect) -> Self {
        SpanTokenizer {
            formula,
            spans: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
            diagnostics: Vec::new(),
        }
    }

    /// Byte at the current offset, or `None` at end of input.
    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    /// True when operand text is currently being accumulated.
    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    /// Reset the pending token to an empty range at the current offset.
    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    /// Grow the pending token's end to the current offset.
    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    /// Append a finished span to the output.
    fn push_span(
        &mut self,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) {
        self.spans.push(TokenSpan {
            token_type,
            subtype,
            start,
            end,
        });
    }

    /// Flush the pending text, if any, as an Operand span.
    fn save_token(&mut self) {
        if self.has_token() {
            let value_str = &self.formula[self.token_start..self.token_end];
            let subtype = operand_subtype(value_str);
            self.push_span(
                TokenType::Operand,
                subtype,
                self.token_start,
                self.token_end,
            );
        }
    }

    /// Absorb a '+'/'-' that is the exponent sign of a number like `1e+5`
    /// into the pending token, returning true if it was consumed.
    fn check_scientific_notation(&mut self) -> bool {
        if let Some(curr_byte) = self.current_byte()
            && (curr_byte == b'+' || curr_byte == b'-')
            && self.has_token()
            && self.is_scientific_notation_base()
        {
            self.offset += 1;
            self.extend_token();
            return true;
        }
        false
    }

    /// Whether the pending token looks like the mantissa-plus-exponent-marker
    /// prefix of scientific notation: a digit, then digits with at most one
    /// '.', ending in 'e'/'E'.
    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        let mut dot_seen = false;
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }

    /// Strict parse: stop and fail on the first error.
    fn parse(&mut self) -> Result<(), TokenizerError> {
        self.parse_with_recovery(false).map_err(Into::into)
    }

    /// Lenient parse: record diagnostics, recover, and return whatever
    /// spans were produced.
    pub(crate) fn parse_best_effort(&mut self) -> Vec<TokenSpan> {
        let _ = self.parse_with_recovery(true);
        self.spans.clone()
    }

    /// Main scanning loop. Input not starting with '=' becomes a single
    /// Literal span. In best-effort mode, errors from the per-construct
    /// parsers are routed through `recover_from_error`; unclosed openers at
    /// end of input become diagnostics instead of an error.
    fn parse_with_recovery(&mut self, best_effort: bool) -> Result<(), SpanTokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        if self.formula.as_bytes()[0] != b'=' {
            self.push_span(
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            );
            return Ok(());
        }

        // Skip the leading '='; spans never include it.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            if self.check_scientific_notation() {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // A delimiter byte finishes any operand accumulated so far.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            let parse_result = match curr_byte {
                b'"' | b'\'' => self.parse_string(),
                b'[' => self.parse_brackets(),
                b'#' => self.parse_error(),
                b' ' | b'\n' => self.parse_whitespace(),
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
                    self.parse_operator()
                }
                b'{' | b'(' => self.parse_opener(),
                b')' | b'}' => self.parse_closer(),
                b';' | b',' => self.parse_separator(),
                _ => {
                    // Ordinary operand byte: accumulate it.
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                    Ok(())
                }
            };

            if let Err(err) = parse_result {
                if best_effort {
                    self.recover_from_error(err);
                } else {
                    return Err(err);
                }
            }
        }

        if self.has_token() {
            self.save_token();
        }

        if !self.token_stack.is_empty() {
            if best_effort {
                // Report each still-open opener against its emitted span.
                while let Some(open_token) = self.token_stack.pop() {
                    if let Some(span) = self.spans.iter().find(|span| {
                        span.start == open_token.start
                            && span.end == open_token.end
                            && span.token_type == open_token.token_type
                            && span.subtype == open_token.subtype
                    }) {
                        self.diagnostics.push(TokenDiagnostic::new(
                            *span,
                            "Unmatched opening parenthesis or bracket".to_string(),
                            RecoveryAction::UnmatchedOpener,
                        ));
                    }
                }
            } else {
                return Err(SpanTokenizerError {
                    kind: SpanTokenizerErrorKind::UnmatchedOpening,
                    pos: self.offset,
                    message: "Unmatched opening parenthesis or bracket".to_string(),
                    span_start: None,
                    span_end: None,
                });
            }
        }

        Ok(())
    }

    /// Best-effort recovery: emit the offending range as an Operand/None
    /// span, record a diagnostic, and resume scanning after it.
    fn recover_from_error(&mut self, error: SpanTokenizerError) {
        match error.kind {
            SpanTokenizerErrorKind::NoMatchingOpener => {
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start: error.pos,
                    end: error.pos + 1,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    format!("No matching opener for closer at position {}", error.pos),
                    RecoveryAction::SkippedUnmatchedCloser,
                ));
            }
            SpanTokenizerErrorKind::UnmatchedOpening => {
                debug_assert!(
                    false,
                    "UnmatchedOpening is handled at end-of-input and should not be routed through recover_from_error"
                );
            }
            SpanTokenizerErrorKind::UnterminatedString => {
                // Consume everything to end of input as one opaque operand.
                let start = error.span_start.unwrap_or(error.pos);
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start,
                    end: self.formula.len(),
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Reached end of formula while parsing string".to_string(),
                    RecoveryAction::UnterminatedString,
                ));
            }
            SpanTokenizerErrorKind::UnmatchedBracket => {
                let start = error.span_start.unwrap_or(error.pos);
                let end = error.span_end.unwrap_or(self.formula.len());
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start,
                    end,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Encountered unmatched '['".to_string(),
                    RecoveryAction::UnmatchedBracket,
                ));
            }
            SpanTokenizerErrorKind::MismatchedPair => {
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start: error.pos,
                    end: error.pos + 1,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Mismatched ( and { pair".to_string(),
                    RecoveryAction::SkippedUnmatchedCloser,
                ));
            }
            SpanTokenizerErrorKind::InvalidErrorLiteral => {
                let start = error.span_start.unwrap_or(error.pos);
                let end = error.span_end.unwrap_or(error.pos + 1);
                let span = TokenSpan {
                    token_type: TokenType::Operand,
                    subtype: TokenSubType::None,
                    start,
                    end,
                };
                self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
                self.offset = span.end;
                self.start_token();
                self.diagnostics.push(TokenDiagnostic::new(
                    span,
                    "Invalid error code".to_string(),
                    RecoveryAction::InvalidErrorLiteral,
                ));
            }
        }
    }

    /// Scan a quoted region. A doubled delimiter is an escape. '"'-strings
    /// are emitted as Text operands immediately; '\''-quoted text stays part
    /// of the pending token (e.g. quoted sheet names inside references).
    fn parse_string(&mut self) -> Result<(), SpanTokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // A lone '$' before a quote is kept attached to the quoted reference.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        // Flush pending text unless the quote continues a reference
        // (':'-joined ranges or the '$' case above).
        if !is_dollar_ref
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b':'
        {
            self.save_token();
            self.start_token();
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    // Doubled delimiter: escaped quote, keep scanning.
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        let value_str = &self.formula[string_start..self.offset];
                        let subtype = operand_subtype(value_str);
                        self.push_span(TokenType::Operand, subtype, string_start, self.offset);
                        self.start_token();
                    } else {
                        // Single-quoted: extend the pending token instead.
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::UnterminatedString,
            pos: self.offset,
            message: "Reached end of formula while parsing string".to_string(),
            span_start: Some(string_start),
            span_end: Some(self.formula.len()),
        })
    }

    /// Consume a (possibly nested) '[' ... ']' region as part of the pending
    /// token.
    fn parse_brackets(&mut self) -> Result<(), SpanTokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let bracket_start = self.offset;
        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::UnmatchedBracket,
            pos: self.offset,
            message: "Encountered unmatched '['".to_string(),
            span_start: Some(bracket_start),
            span_end: Some(self.formula.len()),
        })
    }

    /// Scan an error literal starting at '#'. Known codes (see ERROR_CODES)
    /// become Error operands; anything else is reported up to the next
    /// delimiter as InvalidErrorLiteral.
    fn parse_error(&mut self) -> Result<(), SpanTokenizerError> {
        // Flush pending text unless it ends in '!' (sheet-qualified refs
        // like `Sheet1!#REF!` keep the prefix attached).
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice == err_bytes {
                    self.push_span(
                        TokenType::Operand,
                        TokenSubType::Error,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        // Unknown code: extend the reported span to the next delimiter.
        let mut end = self.offset + 1;
        while end < self.formula.len() {
            let ch = self.formula.as_bytes()[end];
            if is_token_ender(ch)
                || ch == b' '
                || ch == b'\n'
                || ch == b'('
                || ch == b'{'
                || ch == b'['
                || ch == b'"'
                || ch == b'\''
            {
                break;
            }
            end += 1;
        }

        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::InvalidErrorLiteral,
            pos: self.offset,
            message: format!("Invalid error code at position {}", self.offset),
            span_start: Some(error_start),
            span_end: Some(end),
        })
    }

    /// Collapse a run of spaces/newlines into a single Whitespace span.
    fn parse_whitespace(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.push_span(
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        );
        self.start_token();
        Ok(())
    }

    /// Most recent non-whitespace span; used to decide whether '+'/'-' is
    /// prefix or infix.
    fn prev_non_whitespace(&self) -> Option<&TokenSpan> {
        self.spans
            .iter()
            .rev()
            .find(|t| t.token_type != TokenType::Whitespace)
    }

    /// Emit an operator span. Handles the two-byte comparisons (>=, <=, <>)
    /// first, then classifies single bytes; '+'/'-' are prefix unless the
    /// previous token can end an expression.
    fn parse_operator(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.push_span(
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                );
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'@' => TokenType::OpPrefix,
            b'%' => TokenType::OpPostfix,
            b'+' | b'-' => {
                if self.spans.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self.prev_non_whitespace();
                    if let Some(p) = prev {
                        // Infix only after something that can end an
                        // expression: a closer, a postfix op, or an operand.
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.push_span(token_type, TokenSubType::None, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Handle '(' or '{'. '{' opens an array; '(' with pending text opens a
    /// function call (name included in the span); a bare '(' is a grouping
    /// paren. The opener is pushed on the bracket stack.
    fn parse_opener(&mut self) -> Result<(), SpanTokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            TokenSpan {
                token_type: TokenType::Array,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        } else if self.has_token() {
            // Pending text is the function name; the span covers name + '('.
            let token = TokenSpan {
                token_type: TokenType::Func,
                subtype: TokenSubType::Open,
                start: self.token_start,
                end: self.offset + 1,
            };
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            TokenSpan {
                token_type: TokenType::Paren,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        };

        self.spans.push(token);
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Handle ')' or '}'. The closer must match the opener on top of the
    /// bracket stack; the emitted span reuses the opener's token type.
    fn parse_closer(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.last().copied() {
            let expected = if open_token.token_type == TokenType::Array {
                b'}'
            } else {
                b')'
            };
            if curr_byte != expected {
                return Err(SpanTokenizerError {
                    kind: SpanTokenizerErrorKind::MismatchedPair,
                    pos: self.offset,
                    message: "Mismatched ( and { pair".to_string(),
                    span_start: Some(self.offset),
                    span_end: Some(self.offset + 1),
                });
            }

            self.token_stack.pop();
            self.push_span(
                open_token.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            );
        } else {
            return Err(SpanTokenizerError {
                kind: SpanTokenizerErrorKind::NoMatchingOpener,
                pos: self.offset,
                message: format!("No matching opener for closer at position {}", self.offset),
                span_start: Some(self.offset),
                span_end: Some(self.offset + 1),
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Handle ',' and ';'. Their meaning depends on the innermost open
    /// construct and the dialect: inside a function/array they are
    /// separators; at top level ',' (and ';' in OpenFormula) is an infix
    /// operator.
    fn parse_separator(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    // OpenFormula uses ';' as the argument separator.
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.push_span(token_type, subtype, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
}
1267
1268pub struct Tokenizer {
1270 formula: String, pub items: Vec<Token>,
1272 token_stack: Vec<Token>,
1273 offset: usize, token_start: usize, token_end: usize, dialect: FormulaDialect,
1277}
1278
1279impl Tokenizer {
1280 pub fn new(formula: &str) -> Result<Self, TokenizerError> {
1282 Self::new_with_dialect(formula, FormulaDialect::Excel)
1283 }
1284
1285 pub fn new_best_effort(formula: &str) -> Self {
1287 Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
1288 }
1289
1290 pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
1292 let stream = TokenStream::new_best_effort_with_dialect(formula, dialect);
1293 Self::from_token_stream(&stream)
1294 }
1295
1296 pub fn new_with_dialect(
1298 formula: &str,
1299 dialect: FormulaDialect,
1300 ) -> Result<Self, TokenizerError> {
1301 let mut tokenizer = Tokenizer {
1302 formula: formula.to_string(),
1303 items: Vec::with_capacity(formula.len() / 2), token_stack: Vec::with_capacity(16),
1305 offset: 0,
1306 token_start: 0,
1307 token_end: 0,
1308 dialect,
1309 };
1310 tokenizer.parse()?;
1311 Ok(tokenizer)
1312 }
1313
1314 pub fn from_token_stream(stream: &TokenStream) -> Self {
1315 Tokenizer {
1316 formula: stream.source.to_string(),
1317 items: stream.to_tokens(),
1318 token_stack: Vec::with_capacity(16),
1319 offset: 0,
1320 token_start: 0,
1321 token_end: 0,
1322 dialect: stream.dialect,
1323 }
1324 }
1325
    /// Returns the byte at the scan cursor, or `None` at end of input.
    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }
1331
    /// True when the pending-token span (`token_start..token_end`) is non-empty.
    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }
1337
    /// Resets the pending-token span to empty, starting at the current cursor.
    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }
1344
    /// Extends the pending-token span to end at the current cursor.
    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }
1350
    /// Runs the tokenizer over `self.formula`, filling `self.items`.
    ///
    /// A formula that does not start with `=` is emitted as a single
    /// `Literal` token. Returns an error on unterminated strings/brackets,
    /// invalid error literals, or unbalanced parentheses/braces.
    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        // Anything that does not start with '=' is a plain literal value.
        if self.formula.as_bytes()[0] != b'=' {
            self.items.push(Token::new_with_span(
                self.formula.clone(),
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            ));
            return Ok(());
        }

        // Skip the leading '='.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            // Consume exponent signs (e.g. the '-' in 1E-5) before the
            // operator dispatch below can misread them as operators.
            if self.check_scientific_notation()? {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // A structural byte terminates any operand being accumulated.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    // Ordinary operand byte: accumulate it into the pending token.
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        // Flush a trailing operand, if any.
        if self.has_token() {
            self.save_token();
        }

        // Every '(' / '{' must have been closed by now.
        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }
1425
1426 fn check_scientific_notation(&mut self) -> Result<bool, TokenizerError> {
1429 if let Some(curr_byte) = self.current_byte()
1430 && (curr_byte == b'+' || curr_byte == b'-')
1431 && self.has_token()
1432 && self.is_scientific_notation_base()
1433 {
1434 self.offset += 1;
1435 self.extend_token();
1436 return Ok(true);
1437 }
1438 Ok(false)
1439 }
1440
1441 fn is_scientific_notation_base(&self) -> bool {
1444 if !self.has_token() {
1445 return false;
1446 }
1447
1448 let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
1449 if token_slice.len() < 2 {
1450 return false;
1451 }
1452
1453 let last = token_slice[token_slice.len() - 1];
1454 if !(last == b'E' || last == b'e') {
1455 return false;
1456 }
1457
1458 let first = token_slice[0];
1459 if !first.is_ascii_digit() {
1460 return false;
1461 }
1462
1463 let mut dot_seen = false;
1464 for &ch in &token_slice[1..token_slice.len() - 1] {
1466 match ch {
1467 b'0'..=b'9' => {}
1468 b'.' if !dot_seen => dot_seen = true,
1469 _ => return false,
1470 }
1471 }
1472 true
1473 }
1474
1475 fn save_token(&mut self) {
1477 if self.has_token() {
1478 let token =
1479 Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
1480 self.items.push(token);
1481 }
1482 }
1483
1484 fn parse_string(&mut self) -> Result<(), TokenizerError> {
1486 let delim = self.formula.as_bytes()[self.offset];
1487 assert!(delim == b'"' || delim == b'\'');
1488
1489 let is_dollar_ref = delim == b'\''
1491 && self.has_token()
1492 && self.token_end - self.token_start == 1
1493 && self.formula.as_bytes()[self.token_start] == b'$';
1494
1495 if !is_dollar_ref && self.has_token() {
1496 if self.token_end > 0 && self.formula.as_bytes()[self.token_end - 1] != b':' {
1498 self.save_token();
1499 self.start_token();
1500 }
1501 }
1502
1503 let string_start = if is_dollar_ref {
1504 self.token_start
1505 } else {
1506 self.offset
1507 };
1508 self.offset += 1; while self.offset < self.formula.len() {
1511 if self.formula.as_bytes()[self.offset] == delim {
1512 self.offset += 1;
1513 if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
1515 {
1516 self.offset += 1; } else {
1518 if delim == b'"' {
1520 let token = Token::make_operand_from_slice(
1521 &self.formula,
1522 string_start,
1523 self.offset,
1524 );
1525 self.items.push(token);
1526 self.start_token();
1527 } else {
1528 self.token_end = self.offset;
1530 }
1531 return Ok(());
1532 }
1533 } else {
1534 self.offset += 1;
1535 }
1536 }
1537
1538 Err(TokenizerError {
1539 message: "Reached end of formula while parsing string".to_string(),
1540 pos: self.offset,
1541 })
1542 }
1543
1544 fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
1546 assert_eq!(self.formula.as_bytes()[self.offset], b'[');
1547
1548 if !self.has_token() {
1549 self.start_token();
1550 }
1551
1552 let mut open_count = 1;
1553 self.offset += 1;
1554
1555 while self.offset < self.formula.len() {
1556 match self.formula.as_bytes()[self.offset] {
1557 b'[' => open_count += 1,
1558 b']' => {
1559 open_count -= 1;
1560 if open_count == 0 {
1561 self.offset += 1;
1562 self.extend_token();
1563 return Ok(());
1564 }
1565 }
1566 _ => {}
1567 }
1568 self.offset += 1;
1569 }
1570
1571 Err(TokenizerError {
1572 message: "Encountered unmatched '['".to_string(),
1573 pos: self.offset,
1574 })
1575 }
1576
1577 fn parse_error(&mut self) -> Result<(), TokenizerError> {
1579 if self.has_token()
1581 && self.token_end > 0
1582 && self.formula.as_bytes()[self.token_end - 1] != b'!'
1583 {
1584 self.save_token();
1585 self.start_token();
1586 }
1587
1588 let error_start = if self.has_token() {
1589 self.token_start
1590 } else {
1591 self.offset
1592 };
1593
1594 for &err_code in ERROR_CODES {
1596 let err_bytes = err_code.as_bytes();
1597 if self.offset + err_bytes.len() <= self.formula.len() {
1598 let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
1599 if slice == err_bytes {
1600 let token = Token::make_operand_from_slice(
1601 &self.formula,
1602 error_start,
1603 self.offset + err_bytes.len(),
1604 );
1605 self.items.push(token);
1606 self.offset += err_bytes.len();
1607 self.start_token();
1608 return Ok(());
1609 }
1610 }
1611 }
1612
1613 Err(TokenizerError {
1614 message: format!("Invalid error code at position {}", self.offset),
1615 pos: self.offset,
1616 })
1617 }
1618
1619 fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
1621 self.save_token();
1622
1623 let ws_start = self.offset;
1624 while self.offset < self.formula.len() {
1625 match self.formula.as_bytes()[self.offset] {
1626 b' ' | b'\n' => self.offset += 1,
1627 _ => break,
1628 }
1629 }
1630
1631 self.items.push(Token::from_slice(
1632 &self.formula,
1633 TokenType::Whitespace,
1634 TokenSubType::None,
1635 ws_start,
1636 self.offset,
1637 ));
1638 self.start_token();
1639 Ok(())
1640 }
1641
1642 fn parse_operator(&mut self) -> Result<(), TokenizerError> {
1644 self.save_token();
1645
1646 if self.offset + 1 < self.formula.len() {
1648 let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
1649 if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
1650 self.items.push(Token::from_slice(
1651 &self.formula,
1652 TokenType::OpInfix,
1653 TokenSubType::None,
1654 self.offset,
1655 self.offset + 2,
1656 ));
1657 self.offset += 2;
1658 self.start_token();
1659 return Ok(());
1660 }
1661 }
1662
1663 let curr_byte = self.formula.as_bytes()[self.offset];
1664 let token_type = match curr_byte {
1665 b'@' => TokenType::OpPrefix,
1666 b'%' => TokenType::OpPostfix,
1667 b'+' | b'-' => {
1668 if self.items.is_empty() {
1670 TokenType::OpPrefix
1671 } else {
1672 let prev = self
1673 .items
1674 .iter()
1675 .rev()
1676 .find(|t| t.token_type != TokenType::Whitespace);
1677 if let Some(p) = prev {
1678 if p.subtype == TokenSubType::Close
1679 || p.token_type == TokenType::OpPostfix
1680 || p.token_type == TokenType::Operand
1681 {
1682 TokenType::OpInfix
1683 } else {
1684 TokenType::OpPrefix
1685 }
1686 } else {
1687 TokenType::OpPrefix
1688 }
1689 }
1690 }
1691 _ => TokenType::OpInfix,
1692 };
1693
1694 self.items.push(Token::from_slice(
1695 &self.formula,
1696 token_type,
1697 TokenSubType::None,
1698 self.offset,
1699 self.offset + 1,
1700 ));
1701 self.offset += 1;
1702 self.start_token();
1703 Ok(())
1704 }
1705
1706 fn parse_opener(&mut self) -> Result<(), TokenizerError> {
1708 let curr_byte = self.formula.as_bytes()[self.offset];
1709 assert!(curr_byte == b'(' || curr_byte == b'{');
1710
1711 let token = if curr_byte == b'{' {
1712 self.save_token();
1713 Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
1714 } else if self.has_token() {
1715 let token = Token::make_subexp_from_slice(
1717 &self.formula,
1718 true,
1719 self.token_start,
1720 self.offset + 1,
1721 );
1722 self.token_start = self.offset + 1;
1723 self.token_end = self.offset + 1;
1724 token
1725 } else {
1726 Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
1727 };
1728
1729 self.items.push(token.clone());
1730 self.token_stack.push(token);
1731 self.offset += 1;
1732 self.start_token();
1733 Ok(())
1734 }
1735
1736 fn parse_closer(&mut self) -> Result<(), TokenizerError> {
1738 self.save_token();
1739
1740 let curr_byte = self.formula.as_bytes()[self.offset];
1741 assert!(curr_byte == b')' || curr_byte == b'}');
1742
1743 if let Some(open_token) = self.token_stack.pop() {
1744 let closer = open_token.get_closer()?;
1745 if (curr_byte == b'}' && closer.value != "}")
1746 || (curr_byte == b')' && closer.value != ")")
1747 {
1748 return Err(TokenizerError {
1749 message: "Mismatched ( and { pair".to_string(),
1750 pos: self.offset,
1751 });
1752 }
1753
1754 self.items.push(Token::from_slice(
1755 &self.formula,
1756 closer.token_type,
1757 TokenSubType::Close,
1758 self.offset,
1759 self.offset + 1,
1760 ));
1761 } else {
1762 return Err(TokenizerError {
1763 message: format!("No matching opener for closer at position {}", self.offset),
1764 pos: self.offset,
1765 });
1766 }
1767
1768 self.offset += 1;
1769 self.start_token();
1770 Ok(())
1771 }
1772
1773 fn parse_separator(&mut self) -> Result<(), TokenizerError> {
1775 self.save_token();
1776
1777 let curr_byte = self.formula.as_bytes()[self.offset];
1778 assert!(curr_byte == b';' || curr_byte == b',');
1779
1780 let top_token = self.token_stack.last();
1781 let in_function_or_array = matches!(
1782 top_token.map(|t| t.token_type),
1783 Some(TokenType::Func | TokenType::Array)
1784 );
1785 let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));
1786
1787 let (token_type, subtype) = match curr_byte {
1788 b',' => {
1789 if in_function_or_array {
1790 (TokenType::Sep, TokenSubType::Arg)
1791 } else {
1792 (TokenType::OpInfix, TokenSubType::None)
1793 }
1794 }
1795 b';' => {
1796 if in_array {
1797 (TokenType::Sep, TokenSubType::Row)
1799 } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
1800 (TokenType::Sep, TokenSubType::Arg)
1802 } else if self.dialect == FormulaDialect::OpenFormula {
1803 (TokenType::OpInfix, TokenSubType::None)
1804 } else {
1805 (TokenType::Sep, TokenSubType::Row)
1806 }
1807 }
1808 _ => (TokenType::OpInfix, TokenSubType::None),
1809 };
1810
1811 self.items.push(Token::from_slice(
1812 &self.formula,
1813 token_type,
1814 subtype,
1815 self.offset,
1816 self.offset + 1,
1817 ));
1818
1819 self.offset += 1;
1820 self.start_token();
1821 Ok(())
1822 }
1823
1824 pub fn render(&self) -> String {
1826 if self.items.is_empty() {
1827 "".to_string()
1828 } else if self.items[0].token_type == TokenType::Literal {
1829 self.items[0].value.clone()
1830 } else {
1831 let concatenated: String = self.items.iter().map(|t| t.value.clone()).collect();
1832 format!("={concatenated}")
1833 }
1834 }
1835
    /// Returns the formula dialect this tokenizer was configured with.
    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }
1840}
1841
impl TryFrom<&str> for Tokenizer {
    type Error = TokenizerError;

    /// Tokenizes the string with the default Excel dialect.
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Tokenizer::new(value)
    }
}
1849
impl TryFrom<String> for Tokenizer {
    type Error = TokenizerError;

    /// Tokenizes the owned string with the default Excel dialect.
    fn try_from(value: String) -> Result<Self, Self::Error> {
        Tokenizer::new(&value)
    }
}