1use std::convert::TryFrom;
2use std::error::Error;
3use std::fmt::{self, Display};
4use std::sync::Arc;
5
6#[cfg(feature = "serde")]
7use serde::{Deserialize, Serialize};
8
9use crate::types::FormulaDialect;
10
/// Bytes that terminate an in-progress operand token.
const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%@";

/// Build a 256-entry lookup table marking every byte in [`TOKEN_ENDERS`].
const fn build_token_enders() -> [bool; 256] {
    let enders = TOKEN_ENDERS.as_bytes();
    let mut table = [false; 256];
    // `const fn` cannot use `for`, so walk the bytes with a manual index.
    let mut idx = enders.len();
    while idx > 0 {
        idx -= 1;
        table[enders[idx] as usize] = true;
    }
    table
}

/// Precomputed token-ender lookup table, built at compile time.
static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();

/// O(1) test: does byte `c` end the current operand token?
#[inline(always)]
fn is_token_ender(c: u8) -> bool {
    TOKEN_ENDERS_TABLE[usize::from(c)]
}
29
/// Excel error literals recognized by `parse_error`; compared
/// case-insensitively against the bytes at the cursor.
static ERROR_CODES: &[&str] = &[
    "#NULL!",
    "#DIV/0!",
    "#VALUE!",
    "#REF!",
    "#NAME?",
    "#NUM!",
    "#N/A",
    "#GETTING_DATA",
];
40
/// Operator associativity, reported by `Token::get_precedence`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
    Left,
    Right,
}
47
/// Error returned by the strict (non-best-effort) tokenizer entry points.
#[derive(Debug)]
pub struct TokenizerError {
    /// Human-readable description of the failure.
    pub message: String,
    /// Byte offset in the formula where the failure was detected.
    pub pos: usize,
}
54
/// How the best-effort tokenizer resynchronized after a malformed region;
/// attached to each `TokenDiagnostic`.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RecoveryAction {
    /// A closer with no matching opener (or a `(`/`{` mismatch) was
    /// consumed as a plain operand span.
    SkippedUnmatchedCloser,
    /// End of input was reached inside a quoted string.
    UnterminatedString,
    /// End of input was reached inside a `[...]` region.
    UnmatchedBracket,
    /// A `#...` sequence matched no known error code.
    InvalidErrorLiteral,
    /// An opener was still unclosed when the formula ended.
    UnmatchedOpener,
}
70
/// A problem found — and recovered from — during best-effort tokenization.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenDiagnostic {
    /// The span emitted for the recovered region.
    pub span: TokenSpan,
    /// Human-readable description of the problem.
    pub message: String,
    /// How the tokenizer resynchronized.
    pub recovery: RecoveryAction,
}
79
80impl TokenDiagnostic {
81 fn new(span: TokenSpan, message: String, recovery: RecoveryAction) -> Self {
82 Self {
83 span,
84 message,
85 recovery,
86 }
87 }
88}
89
/// Internal tokenizer error, carrying enough position information for
/// `recover_from_error` to emit a span and resynchronize.
#[derive(Debug, Clone)]
struct SpanTokenizerError {
    /// Which failure occurred; selects the recovery strategy.
    kind: SpanTokenizerErrorKind,
    /// Byte offset where the failure was detected.
    pos: usize,
    /// Preformatted message, reused when converting to `TokenizerError`.
    message: String,
    /// Optional start of the span to emit during recovery.
    span_start: Option<usize>,
    /// Optional end of the span to emit during recovery.
    span_end: Option<usize>,
}
98
/// Failure categories for `SpanTokenizerError`; each maps to one arm of
/// `recover_from_error`.
#[derive(Debug, Clone, Copy)]
enum SpanTokenizerErrorKind {
    /// `)` or `}` encountered with an empty opener stack.
    NoMatchingOpener,
    /// Opener(s) left unclosed at end of input (strict mode only).
    UnmatchedOpening,
    /// End of input reached inside a quoted string.
    UnterminatedString,
    /// End of input reached inside `[...]`.
    UnmatchedBracket,
    /// Closer does not match the innermost opener's kind.
    MismatchedPair,
    /// Unrecognized `#...` error literal.
    InvalidErrorLiteral,
}
108
109impl From<SpanTokenizerError> for TokenizerError {
110 fn from(value: SpanTokenizerError) -> Self {
111 TokenizerError {
112 message: value.message,
113 pos: value.pos,
114 }
115 }
116}
117
118impl fmt::Display for TokenizerError {
119 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
120 write!(f, "TokenizerError: {}", self.message)
121 }
122}
123
124impl Error for TokenizerError {}
125
/// Broad token category.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Whole-input literal: a formula that does not begin with '='.
    Literal,
    /// Value-producing token: text, number, boolean, error, or reference.
    Operand,
    /// Function-call delimiter (identifier plus '(' or its closer).
    Func,
    /// Array-literal delimiter ('{' / '}').
    Array,
    /// Grouping parenthesis.
    Paren,
    /// Argument or row separator.
    Sep,
    /// Unary prefix operator (e.g. '@', leading '+'/'-').
    OpPrefix,
    /// Binary infix operator.
    OpInfix,
    /// Postfix operator ('%').
    OpPostfix,
    /// Run of spaces and/or newlines.
    Whitespace,
}
141impl Display for TokenType {
142 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
143 write!(f, "{self:?}")
144 }
145}
146
/// Finer-grained classification within a `TokenType`.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
    /// No extra classification.
    None,
    /// Quoted string operand.
    Text,
    /// Numeric operand (text that parses as `f64`).
    Number,
    /// TRUE/FALSE operand.
    Logical,
    /// Error-literal operand (e.g. `#REF!`).
    Error,
    /// Reference/range operand (the fallback classification).
    Range,
    /// Opening delimiter.
    Open,
    /// Closing delimiter.
    Close,
    /// Argument separator.
    Arg,
    /// Row separator.
    Row,
}
162impl Display for TokenSubType {
163 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164 write!(f, "{self:?}")
165 }
166}
167
/// A fully materialized token: owned text plus its classification and the
/// byte span it was cut from.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Token {
    // Owned token text and its broad category.
    pub value: String, pub token_type: TokenType,
    /// Finer classification within `token_type`.
    pub subtype: TokenSubType,
    /// Start byte offset in the source (0 when built without a span).
    pub start: usize,
    /// End byte offset, exclusive (0 when built without a span).
    pub end: usize,
}
178
179impl Display for Token {
180 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
181 write!(
182 f,
183 "<{} subtype: {:?} value: {}>",
184 self.token_type, self.subtype, self.value
185 )
186 }
187}
188
189impl Token {
190 pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
191 Token {
192 value,
193 token_type,
194 subtype,
195 start: 0,
196 end: 0,
197 }
198 }
199
200 pub fn new_with_span(
201 value: String,
202 token_type: TokenType,
203 subtype: TokenSubType,
204 start: usize,
205 end: usize,
206 ) -> Self {
207 Token {
208 value,
209 token_type,
210 subtype,
211 start,
212 end,
213 }
214 }
215
216 fn from_slice(
217 source: &str,
218 token_type: TokenType,
219 subtype: TokenSubType,
220 start: usize,
221 end: usize,
222 ) -> Self {
223 Token {
224 value: source[start..end].to_string(),
225 token_type,
226 subtype,
227 start,
228 end,
229 }
230 }
231
232 pub fn is_operator(&self) -> bool {
233 matches!(
234 self.token_type,
235 TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
236 )
237 }
238
239 pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
240 let op = if self.token_type == TokenType::OpPrefix {
242 "u"
243 } else {
244 self.value.as_str()
245 };
246
247 match op {
258 ":" | " " | "," => Some((8, Associativity::Left)),
259 "%" => Some((7, Associativity::Left)),
260 "u" => Some((6, Associativity::Right)),
261 "^" => Some((5, Associativity::Right)),
262 "*" | "/" => Some((4, Associativity::Left)),
263 "+" | "-" => Some((3, Associativity::Left)),
264 "&" => Some((2, Associativity::Left)),
265 "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
266 _ => None,
267 }
268 }
269
270 pub fn make_operand(value: String) -> Self {
272 let subtype = if value.starts_with('"') {
273 TokenSubType::Text
274 } else if value.starts_with('#') {
275 TokenSubType::Error
276 } else if value == "TRUE" || value == "FALSE" {
277 TokenSubType::Logical
278 } else if value.parse::<f64>().is_ok() {
279 TokenSubType::Number
280 } else {
281 TokenSubType::Range
282 };
283 Token::new(value, TokenType::Operand, subtype)
284 }
285
286 pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
288 let subtype = if value.starts_with('"') {
289 TokenSubType::Text
290 } else if value.starts_with('#') {
291 TokenSubType::Error
292 } else if value == "TRUE" || value == "FALSE" {
293 TokenSubType::Logical
294 } else if value.parse::<f64>().is_ok() {
295 TokenSubType::Number
296 } else {
297 TokenSubType::Range
298 };
299 Token::new_with_span(value, TokenType::Operand, subtype, start, end)
300 }
301
302 fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
303 let value_str = &source[start..end];
304 let subtype = if value_str.starts_with('"') {
305 TokenSubType::Text
306 } else if value_str.starts_with('#') {
307 TokenSubType::Error
308 } else if value_str == "TRUE" || value_str == "FALSE" {
309 TokenSubType::Logical
310 } else if value_str.parse::<f64>().is_ok() {
311 TokenSubType::Number
312 } else {
313 TokenSubType::Range
314 };
315 Token::from_slice(source, TokenType::Operand, subtype, start, end)
316 }
317
318 pub fn make_subexp(value: &str, func: bool) -> Self {
323 let last_char = value.chars().last().expect("Empty token value");
324 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
325 let token_type = if func {
326 TokenType::Func
327 } else if "{}".contains(last_char) {
328 TokenType::Array
329 } else if "()".contains(last_char) {
330 TokenType::Paren
331 } else {
332 TokenType::Func
333 };
334 let subtype = if ")}".contains(last_char) {
335 TokenSubType::Close
336 } else {
337 TokenSubType::Open
338 };
339 Token::new(value.to_string(), token_type, subtype)
340 }
341
342 pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
344 let last_char = value.chars().last().expect("Empty token value");
345 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
346 let token_type = if func {
347 TokenType::Func
348 } else if "{}".contains(last_char) {
349 TokenType::Array
350 } else if "()".contains(last_char) {
351 TokenType::Paren
352 } else {
353 TokenType::Func
354 };
355 let subtype = if ")}".contains(last_char) {
356 TokenSubType::Close
357 } else {
358 TokenSubType::Open
359 };
360 Token::new_with_span(value.to_string(), token_type, subtype, start, end)
361 }
362
363 fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
364 let value_str = &source[start..end];
365 let last_char = value_str.chars().last().expect("Empty token value");
366 let token_type = if func {
367 TokenType::Func
368 } else if "{}".contains(last_char) {
369 TokenType::Array
370 } else if "()".contains(last_char) {
371 TokenType::Paren
372 } else {
373 TokenType::Func
374 };
375 let subtype = if ")}".contains(last_char) {
376 TokenSubType::Close
377 } else {
378 TokenSubType::Open
379 };
380 Token::from_slice(source, token_type, subtype, start, end)
381 }
382
383 pub fn get_closer(&self) -> Result<Token, TokenizerError> {
385 if self.subtype != TokenSubType::Open {
386 return Err(TokenizerError {
387 message: "Token is not an opener".to_string(),
388 pos: 0,
389 });
390 }
391 let closer_value = if self.token_type == TokenType::Array {
392 "}"
393 } else {
394 ")"
395 };
396 Ok(Token::make_subexp(
397 closer_value,
398 self.token_type == TokenType::Func,
399 ))
400 }
401
402 pub fn make_separator(value: &str) -> Self {
404 assert!(value == "," || value == ";");
405 let subtype = if value == "," {
406 TokenSubType::Arg
407 } else {
408 TokenSubType::Row
409 };
410 Token::new(value.to_string(), TokenType::Sep, subtype)
411 }
412
413 pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
415 assert!(value == "," || value == ";");
416 let subtype = if value == "," {
417 TokenSubType::Arg
418 } else {
419 TokenSubType::Row
420 };
421 Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
422 }
423}
424
/// Lightweight token record: classification plus the byte range it covers
/// in the source formula; the text itself is not stored.
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenSpan {
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    /// Start byte offset into the formula.
    pub start: usize,
    /// End byte offset, exclusive.
    pub end: usize,
}
433
/// A `TokenSpan` paired with the slice of source text it covers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenView<'a> {
    pub span: &'a TokenSpan,
    pub value: &'a str,
}
439
/// A tokenized formula: the source text (shared via `Arc`, so clones are
/// cheap) plus the spans produced from it and any recovery diagnostics.
#[derive(Debug, Clone)]
pub struct TokenStream {
    /// The formula text every span indexes into.
    source: Arc<str>,
    /// Token spans in source order.
    pub spans: Vec<TokenSpan>,
    /// Dialect the formula was tokenized under.
    dialect: FormulaDialect,
    /// Diagnostics from best-effort parsing (empty for strict constructors).
    diagnostics: Vec<TokenDiagnostic>,
}
452
453impl TokenStream {
454 pub fn new(formula: &str) -> Result<Self, TokenizerError> {
455 Self::new_with_dialect(formula, FormulaDialect::Excel)
456 }
457
458 pub fn new_with_dialect(
459 formula: &str,
460 dialect: FormulaDialect,
461 ) -> Result<Self, TokenizerError> {
462 let source: Arc<str> = Arc::from(formula);
463 let spans = tokenize_spans_with_dialect(source.as_ref(), dialect)?;
464 Ok(TokenStream {
465 source,
466 spans,
467 dialect,
468 diagnostics: Vec::new(),
469 })
470 }
471
472 pub fn new_best_effort(formula: &str) -> Self {
473 Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
474 }
475
476 pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
477 let source: Arc<str> = Arc::from(formula);
478 let mut tokenizer = SpanTokenizer::new(source.as_ref(), dialect);
479 let spans = tokenizer.parse_best_effort();
480 let diagnostics = tokenizer.diagnostics;
481 TokenStream {
482 source,
483 spans,
484 dialect,
485 diagnostics,
486 }
487 }
488
489 pub fn diagnostics(&self) -> Vec<TokenDiagnostic> {
490 self.diagnostics.clone()
491 }
492
493 pub fn diagnostics_ref(&self) -> &[TokenDiagnostic] {
494 &self.diagnostics
495 }
496
497 pub fn has_errors(&self) -> bool {
498 !self.diagnostics.is_empty()
499 }
500
501 pub fn invalid_spans_iter(&self) -> impl Iterator<Item = &TokenSpan> {
502 self.spans.iter().filter(|span| {
503 self.diagnostics.iter().any(|diag| {
504 diag.span.start == span.start
505 && diag.span.end == span.end
506 && diag.span.token_type == span.token_type
507 })
508 })
509 }
510
511 pub fn invalid_spans(&self) -> Vec<&TokenSpan> {
512 self.invalid_spans_iter().collect()
513 }
514
515 pub fn source(&self) -> &str {
516 &self.source
517 }
518
519 pub fn dialect(&self) -> FormulaDialect {
520 self.dialect
521 }
522
523 pub fn len(&self) -> usize {
524 self.spans.len()
525 }
526
527 pub fn is_empty(&self) -> bool {
528 self.spans.is_empty()
529 }
530
531 pub fn get(&self, index: usize) -> Option<TokenView<'_>> {
532 let span = self.spans.get(index)?;
533 let value = self.source.get(span.start..span.end)?;
534 Some(TokenView { span, value })
535 }
536
537 pub fn to_tokens(&self) -> Vec<Token> {
538 self.spans
539 .iter()
540 .map(|s| {
541 let value = self
542 .source
543 .get(s.start..s.end)
544 .unwrap_or_default()
545 .to_string();
546 Token::new_with_span(value, s.token_type, s.subtype, s.start, s.end)
547 })
548 .collect()
549 }
550
551 pub fn render(&self) -> String {
556 let mut out = String::with_capacity(self.source.len());
557 for span in &self.spans {
558 if let Some(s) = self.source.get(span.start..span.end) {
559 out.push_str(s);
560 }
561 }
562 out
563 }
564
565 pub fn render_formula(&self) -> String {
567 if self.source.as_bytes().first() == Some(&b'=') {
568 format!("={}", self.render())
569 } else {
570 self.render()
571 }
572 }
573}
574
575pub(crate) fn tokenize_spans_with_dialect(
576 formula: &str,
577 dialect: FormulaDialect,
578) -> Result<Vec<TokenSpan>, TokenizerError> {
579 let mut tokenizer = SpanTokenizer::new(formula, dialect);
580 tokenizer.parse()?;
581 Ok(tokenizer.spans)
582}
583
584fn operand_subtype(value_str: &str) -> TokenSubType {
585 if value_str.starts_with('"') {
586 TokenSubType::Text
587 } else if value_str.starts_with('#') {
588 TokenSubType::Error
589 } else if value_str == "TRUE" || value_str == "FALSE" {
590 TokenSubType::Logical
591 } else if value_str.parse::<f64>().is_ok() {
592 TokenSubType::Number
593 } else {
594 TokenSubType::Range
595 }
596}
597
/// Zero-copy tokenizer: walks `formula` byte by byte and records
/// `TokenSpan`s rather than owned strings.
struct SpanTokenizer<'a> {
    /// The formula text being scanned.
    formula: &'a str,
    /// Completed spans, in source order.
    spans: Vec<TokenSpan>,
    /// Openers ('(', '{', function opens) awaiting their closer.
    token_stack: Vec<TokenSpan>,
    /// Cursor: byte offset of the next unread byte.
    offset: usize,
    /// Start of the operand token currently being accumulated.
    token_start: usize,
    /// End (exclusive) of the accumulating token; equals start when empty.
    token_end: usize,
    /// Separator dialect (affects ';' handling in `parse_separator`).
    dialect: FormulaDialect,
    /// Diagnostics recorded during best-effort recovery.
    diagnostics: Vec<TokenDiagnostic>,
}
608
609impl<'a> SpanTokenizer<'a> {
610 fn new(formula: &'a str, dialect: FormulaDialect) -> Self {
611 SpanTokenizer {
612 formula,
613 spans: Vec::with_capacity(formula.len() / 2),
614 token_stack: Vec::with_capacity(16),
615 offset: 0,
616 token_start: 0,
617 token_end: 0,
618 dialect,
619 diagnostics: Vec::new(),
620 }
621 }
622
623 #[inline]
624 fn current_byte(&self) -> Option<u8> {
625 self.formula.as_bytes().get(self.offset).copied()
626 }
627
628 #[inline]
629 fn has_token(&self) -> bool {
630 self.token_end > self.token_start
631 }
632
633 #[inline]
634 fn start_token(&mut self) {
635 self.token_start = self.offset;
636 self.token_end = self.offset;
637 }
638
639 #[inline]
640 fn extend_token(&mut self) {
641 self.token_end = self.offset;
642 }
643
644 fn push_span(
645 &mut self,
646 token_type: TokenType,
647 subtype: TokenSubType,
648 start: usize,
649 end: usize,
650 ) {
651 self.spans.push(TokenSpan {
652 token_type,
653 subtype,
654 start,
655 end,
656 });
657 }
658
659 fn save_token(&mut self) {
660 if self.has_token() {
661 let value_str = &self.formula[self.token_start..self.token_end];
662 let subtype = operand_subtype(value_str);
663 self.push_span(
664 TokenType::Operand,
665 subtype,
666 self.token_start,
667 self.token_end,
668 );
669 }
670 }
671
672 fn check_scientific_notation(&mut self) -> bool {
673 if let Some(curr_byte) = self.current_byte() {
674 if (curr_byte == b'+' || curr_byte == b'-')
675 && self.has_token()
676 && self.is_scientific_notation_base()
677 {
678 self.offset += 1;
679 self.extend_token();
680 return true;
681 }
682 }
683 false
684 }
685
686 fn is_scientific_notation_base(&self) -> bool {
687 if !self.has_token() {
688 return false;
689 }
690
691 let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
692 if token_slice.len() < 2 {
693 return false;
694 }
695
696 let last = token_slice[token_slice.len() - 1];
697 if !(last == b'E' || last == b'e') {
698 return false;
699 }
700
701 let first = token_slice[0];
702 if !first.is_ascii_digit() {
703 return false;
704 }
705
706 let mut dot_seen = false;
707 for &ch in &token_slice[1..token_slice.len() - 1] {
708 match ch {
709 b'0'..=b'9' => {}
710 b'.' if !dot_seen => dot_seen = true,
711 _ => return false,
712 }
713 }
714 true
715 }
716
717 fn parse(&mut self) -> Result<(), TokenizerError> {
718 self.parse_with_recovery(false).map_err(Into::into)
719 }
720
721 pub(crate) fn parse_best_effort(&mut self) -> Vec<TokenSpan> {
722 let _ = self.parse_with_recovery(true);
723 self.spans.clone()
724 }
725
    /// Core tokenization loop over the formula bytes.
    ///
    /// With `best_effort` set, parse errors are routed through
    /// `recover_from_error` (recording diagnostics) and scanning continues;
    /// otherwise the first error aborts the parse.
    fn parse_with_recovery(&mut self, best_effort: bool) -> Result<(), SpanTokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        // A formula must start with '='; anything else is one Literal span.
        if self.formula.as_bytes()[0] != b'=' {
            self.push_span(
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            );
            return Ok(());
        }

        // Skip the leading '='.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            // Consume a '+'/'-' that belongs to a scientific-notation
            // exponent before it can be misread as an operator.
            if self.check_scientific_notation() {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // A token-ending byte flushes the accumulated operand first.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            // Dispatch on the current byte; any other byte just grows the
            // in-progress operand token.
            let parse_result = match curr_byte {
                b'"' | b'\'' => self.parse_string(),
                b'[' => self.parse_brackets(),
                b'#' => self.parse_error(),
                b' ' | b'\n' => self.parse_whitespace(),
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
                    self.parse_operator()
                }
                b'{' | b'(' => self.parse_opener(),
                b')' | b'}' => self.parse_closer(),
                b';' | b',' => self.parse_separator(),
                _ => {
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                    Ok(())
                }
            };

            if let Err(err) = parse_result {
                if best_effort {
                    self.recover_from_error(err);
                } else {
                    return Err(err);
                }
            }
        }

        // Flush any trailing operand.
        if self.has_token() {
            self.save_token();
        }

        // Anything left on the opener stack is unmatched.
        if !self.token_stack.is_empty() {
            if best_effort {
                while let Some(open_token) = self.token_stack.pop() {
                    // Attach the diagnostic to the span that was emitted
                    // for this opener.
                    if let Some(span) = self.spans.iter().find(|span| {
                        span.start == open_token.start
                            && span.end == open_token.end
                            && span.token_type == open_token.token_type
                            && span.subtype == open_token.subtype
                    }) {
                        self.diagnostics.push(TokenDiagnostic::new(
                            *span,
                            "Unmatched opening parenthesis or bracket".to_string(),
                            RecoveryAction::UnmatchedOpener,
                        ));
                    }
                }
            } else {
                return Err(SpanTokenizerError {
                    kind: SpanTokenizerErrorKind::UnmatchedOpening,
                    pos: self.offset,
                    message: "Unmatched opening parenthesis or bracket".to_string(),
                    span_start: None,
                    span_end: None,
                });
            }
        }

        Ok(())
    }
819
820 fn recover_from_error(&mut self, error: SpanTokenizerError) {
821 match error.kind {
822 SpanTokenizerErrorKind::NoMatchingOpener => {
823 let span = TokenSpan {
824 token_type: TokenType::Operand,
825 subtype: TokenSubType::None,
826 start: error.pos,
827 end: error.pos + 1,
828 };
829 self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
830 self.offset = span.end;
831 self.start_token();
832 self.diagnostics.push(TokenDiagnostic::new(
833 span,
834 format!("No matching opener for closer at position {}", error.pos),
835 RecoveryAction::SkippedUnmatchedCloser,
836 ));
837 }
838 SpanTokenizerErrorKind::UnmatchedOpening => {
839 debug_assert!(
840 false,
841 "UnmatchedOpening is handled at end-of-input and should not be routed through recover_from_error"
842 );
843 }
844 SpanTokenizerErrorKind::UnterminatedString => {
845 let start = error.span_start.unwrap_or(error.pos);
846 let span = TokenSpan {
847 token_type: TokenType::Operand,
848 subtype: TokenSubType::None,
849 start,
850 end: self.formula.len(),
851 };
852 self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
853 self.offset = span.end;
854 self.start_token();
855 self.diagnostics.push(TokenDiagnostic::new(
856 span,
857 "Reached end of formula while parsing string".to_string(),
858 RecoveryAction::UnterminatedString,
859 ));
860 }
861 SpanTokenizerErrorKind::UnmatchedBracket => {
862 let start = error.span_start.unwrap_or(error.pos);
863 let end = error.span_end.unwrap_or(self.formula.len());
864 let span = TokenSpan {
865 token_type: TokenType::Operand,
866 subtype: TokenSubType::None,
867 start,
868 end,
869 };
870 self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
871 self.offset = span.end;
872 self.start_token();
873 self.diagnostics.push(TokenDiagnostic::new(
874 span,
875 "Encountered unmatched '['".to_string(),
876 RecoveryAction::UnmatchedBracket,
877 ));
878 }
879 SpanTokenizerErrorKind::MismatchedPair => {
880 let span = TokenSpan {
881 token_type: TokenType::Operand,
882 subtype: TokenSubType::None,
883 start: error.pos,
884 end: error.pos + 1,
885 };
886 self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
887 self.offset = span.end;
888 self.start_token();
889 self.diagnostics.push(TokenDiagnostic::new(
890 span,
891 "Mismatched ( and { pair".to_string(),
892 RecoveryAction::SkippedUnmatchedCloser,
893 ));
894 }
895 SpanTokenizerErrorKind::InvalidErrorLiteral => {
896 let start = error.span_start.unwrap_or(error.pos);
897 let end = error.span_end.unwrap_or(error.pos + 1);
898 let span = TokenSpan {
899 token_type: TokenType::Operand,
900 subtype: TokenSubType::None,
901 start,
902 end,
903 };
904 self.push_span(TokenType::Operand, TokenSubType::None, span.start, span.end);
905 self.offset = span.end;
906 self.start_token();
907 self.diagnostics.push(TokenDiagnostic::new(
908 span,
909 "Invalid error code".to_string(),
910 RecoveryAction::InvalidErrorLiteral,
911 ));
912 }
913 }
914 }
915
    /// Consume a double- or single-quoted string starting at the cursor.
    ///
    /// Doubled delimiters (`""` / `''`) act as escapes. A completed `"..."`
    /// is emitted immediately as a Text operand; a completed `'...'`
    /// (quoted sheet name) is instead folded into the in-progress
    /// reference token.
    fn parse_string(&mut self) -> Result<(), SpanTokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // `$'...'`: a lone '$' immediately before the quote belongs to the
        // quoted reference rather than being a separate token.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        // Flush any pending token, except when it ends with ':' (the quote
        // continues a range reference) or is the '$' prefix handled above.
        if !is_dollar_ref
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b':'
        {
            self.save_token();
            self.start_token();
        }

        // For the '$' case the string span starts at the '$'.
        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                // A doubled delimiter is an escaped quote: keep scanning.
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        // Emit the completed text literal as an operand.
                        let value_str = &self.formula[string_start..self.offset];
                        let subtype = operand_subtype(value_str);
                        self.push_span(TokenType::Operand, subtype, string_start, self.offset);
                        self.start_token();
                    } else {
                        // Quoted sheet name: extend the reference token
                        // instead of emitting a span.
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        // Ran off the end without a closing delimiter.
        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::UnterminatedString,
            pos: self.offset,
            message: "Reached end of formula while parsing string".to_string(),
            span_start: Some(string_start),
            span_end: Some(self.formula.len()),
        })
    }
971
972 fn parse_brackets(&mut self) -> Result<(), SpanTokenizerError> {
973 assert_eq!(self.formula.as_bytes()[self.offset], b'[');
974
975 if !self.has_token() {
976 self.start_token();
977 }
978
979 let bracket_start = self.offset;
980 let mut open_count = 1;
981 self.offset += 1;
982
983 while self.offset < self.formula.len() {
984 match self.formula.as_bytes()[self.offset] {
985 b'[' => open_count += 1,
986 b']' => {
987 open_count -= 1;
988 if open_count == 0 {
989 self.offset += 1;
990 self.extend_token();
991 return Ok(());
992 }
993 }
994 _ => {}
995 }
996 self.offset += 1;
997 }
998
999 Err(SpanTokenizerError {
1000 kind: SpanTokenizerErrorKind::UnmatchedBracket,
1001 pos: self.offset,
1002 message: "Encountered unmatched '['".to_string(),
1003 span_start: Some(bracket_start),
1004 span_end: Some(self.formula.len()),
1005 })
1006 }
1007
    /// Consume a `#...` sequence, expected to be one of the known error
    /// literals in `ERROR_CODES`.
    fn parse_error(&mut self) -> Result<(), SpanTokenizerError> {
        // Flush a pending token unless it ends with '!' — in that case the
        // '#' continues the accumulating (sheet-qualified) token.
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        // The emitted span includes any retained prefix token.
        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        // Try each known error code (case-insensitive) at the cursor.
        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice.eq_ignore_ascii_case(err_bytes) {
                    self.push_span(
                        TokenType::Operand,
                        TokenSubType::Error,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        // No code matched: scan ahead to the next delimiter so recovery
        // can skip over the whole invalid literal in one step.
        let mut end = self.offset + 1;
        while end < self.formula.len() {
            let ch = self.formula.as_bytes()[end];
            if is_token_ender(ch)
                || ch == b' '
                || ch == b'\n'
                || ch == b'('
                || ch == b'{'
                || ch == b'['
                || ch == b'"'
                || ch == b'\''
            {
                break;
            }
            end += 1;
        }

        Err(SpanTokenizerError {
            kind: SpanTokenizerErrorKind::InvalidErrorLiteral,
            pos: self.offset,
            message: format!("Invalid error code at position {}", self.offset),
            span_start: Some(error_start),
            span_end: Some(end),
        })
    }
1066
1067 fn parse_whitespace(&mut self) -> Result<(), SpanTokenizerError> {
1068 self.save_token();
1069
1070 let ws_start = self.offset;
1071 while self.offset < self.formula.len() {
1072 match self.formula.as_bytes()[self.offset] {
1073 b' ' | b'\n' => self.offset += 1,
1074 _ => break,
1075 }
1076 }
1077
1078 self.push_span(
1079 TokenType::Whitespace,
1080 TokenSubType::None,
1081 ws_start,
1082 self.offset,
1083 );
1084 self.start_token();
1085 Ok(())
1086 }
1087
1088 fn prev_non_whitespace(&self) -> Option<&TokenSpan> {
1089 self.spans
1090 .iter()
1091 .rev()
1092 .find(|t| t.token_type != TokenType::Whitespace)
1093 }
1094
    /// Consume one operator, classifying it as prefix, infix, or postfix.
    fn parse_operator(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        // Two-byte comparison operators first: >=, <=, <>.
        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.push_span(
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                );
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'@' => TokenType::OpPrefix,
            b'%' => TokenType::OpPostfix,
            // '+'/'-' are infix only when a completed value precedes them
            // (a closer, a postfix operator, or an operand); otherwise
            // they are unary prefix operators.
            b'+' | b'-' => {
                if self.spans.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self.prev_non_whitespace();
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.push_span(token_type, TokenSubType::None, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
1144
    /// Consume '(' or '{', pushing the opener onto the match stack.
    fn parse_opener(&mut self) -> Result<(), SpanTokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            // Array-literal opener; flush any pending operand first.
            self.save_token();
            TokenSpan {
                token_type: TokenType::Array,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        } else if self.has_token() {
            // '(' right after an in-progress token: the identifier plus
            // '(' together form a function-open span (e.g. `SUM(`).
            let token = TokenSpan {
                token_type: TokenType::Func,
                subtype: TokenSubType::Open,
                start: self.token_start,
                end: self.offset + 1,
            };
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            // Bare '(' is a grouping paren.
            TokenSpan {
                token_type: TokenType::Paren,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        };

        self.spans.push(token);
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
1182
    /// Consume ')' or '}', matching it against the most recent opener.
    fn parse_closer(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.last().copied() {
            // Arrays close with '}'; funcs and parens close with ')'.
            let expected = if open_token.token_type == TokenType::Array {
                b'}'
            } else {
                b')'
            };
            if curr_byte != expected {
                return Err(SpanTokenizerError {
                    kind: SpanTokenizerErrorKind::MismatchedPair,
                    pos: self.offset,
                    message: "Mismatched ( and { pair".to_string(),
                    span_start: Some(self.offset),
                    span_end: Some(self.offset + 1),
                });
            }

            self.token_stack.pop();
            // The closer span reuses the opener's token type.
            self.push_span(
                open_token.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            );
        } else {
            // Closer with nothing on the stack.
            return Err(SpanTokenizerError {
                kind: SpanTokenizerErrorKind::NoMatchingOpener,
                pos: self.offset,
                message: format!("No matching opener for closer at position {}", self.offset),
                span_start: Some(self.offset),
                span_end: Some(self.offset + 1),
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }
1226
    /// Consume ';' or ',', classifying it by context and dialect.
    fn parse_separator(&mut self) -> Result<(), SpanTokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        // Context comes from the innermost unclosed opener.
        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            // ',' separates arguments inside a call/array; elsewhere it is
            // an infix operator (see `Token::get_precedence`).
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            // ';' is a row separator inside arrays. In the OpenFormula
            // dialect it is the argument separator inside calls and an
            // infix operator elsewhere; in Excel it stays a row separator.
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.push_span(token_type, subtype, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
1267}
1268
/// Owned-token tokenizer: mirrors `SpanTokenizer`'s state but materializes
/// `Token` values (see `from_token_stream` and the constructors below).
pub struct Tokenizer {
    // The formula text and the tokens produced from it.
    formula: String, pub items: Vec<Token>,
    /// Openers awaiting their matching closer.
    token_stack: Vec<Token>,
    // Cursor, in-progress token bounds, and separator dialect.
    offset: usize, token_start: usize, token_end: usize, dialect: FormulaDialect,
}
1279
impl Tokenizer {
    /// Tokenizes `formula` using the default Excel dialect.
    ///
    /// # Errors
    /// Fails on malformed input: an unterminated string, an unmatched
    /// `[`, an unmatched or mismatched `(`/`{` pair, a closer with no
    /// opener, or an unrecognized `#...` error literal.
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    /// Best-effort tokenization with the Excel dialect; never fails.
    /// Recovery is delegated to `TokenStream` (NOTE(review): presumably
    /// recorded as `TokenDiagnostic`s — confirm against the stream API).
    pub fn new_best_effort(formula: &str) -> Self {
        Self::new_best_effort_with_dialect(formula, FormulaDialect::Excel)
    }

    /// Best-effort tokenization with an explicit dialect; never fails.
    pub fn new_best_effort_with_dialect(formula: &str, dialect: FormulaDialect) -> Self {
        let stream = TokenStream::new_best_effort_with_dialect(formula, dialect);
        Self::from_token_stream(&stream)
    }

    /// Tokenizes `formula` under the given `dialect`.
    ///
    /// # Errors
    /// Same conditions as [`Tokenizer::new`].
    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let mut tokenizer = Tokenizer {
            formula: formula.to_string(),
            // Capacity heuristic: tokens are rarely shorter than two
            // bytes, so half the input length avoids most regrowth.
            items: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
        };
        tokenizer.parse()?;
        Ok(tokenizer)
    }

    /// Builds a `Tokenizer` from an already-parsed `TokenStream`,
    /// materializing its tokens without re-scanning the source.
    pub fn from_token_stream(stream: &TokenStream) -> Self {
        Tokenizer {
            formula: stream.source.to_string(),
            items: stream.to_tokens(),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect: stream.dialect,
        }
    }

    // Byte at the current offset, or `None` at end of input.
    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    // True when the accumulator span is non-empty.
    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    // Resets the accumulator to an empty span at the current offset.
    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    // Grows the accumulator so it ends at the current offset.
    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    /// Main scanning loop: dispatches on the current byte and fills
    /// `items`, using `token_start..token_end` to accumulate operands.
    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        // Input not starting with '=' is not a formula: emit the whole
        // text as a single Literal token and stop.
        if self.formula.as_bytes()[0] != b'=' {
            self.items.push(Token::new_with_span(
                self.formula.clone(),
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            ));
            return Ok(());
        }

        // Skip the leading '='.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            // '+'/'-' right after e.g. "1.5E" is an exponent sign, not
            // an operator: absorb it into the number before dispatching.
            if self.check_scientific_notation()? {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // Any delimiter byte terminates a pending operand.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' | b'@' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    // Ordinary operand byte: accumulate it.
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        // Flush a trailing operand.
        if self.has_token() {
            self.save_token();
        }

        // Any opener still on the stack was never closed.
        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }

    // Consumes a '+'/'-' that is the exponent sign of a number such as
    // `1.5E+10`; returns Ok(true) when the byte was absorbed into the
    // pending token.
    fn check_scientific_notation(&mut self) -> Result<bool, TokenizerError> {
        if let Some(curr_byte) = self.current_byte() {
            if (curr_byte == b'+' || curr_byte == b'-')
                && self.has_token()
                && self.is_scientific_notation_base()
            {
                self.offset += 1;
                self.extend_token();
                return Ok(true);
            }
        }
        Ok(false)
    }

    // True when the pending token looks like the mantissa-plus-'E'
    // prefix of scientific notation: a leading digit, then digits with
    // at most one '.', ending in 'E' or 'e'.
    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        let mut dot_seen = false;
        // Interior bytes: digits with at most one decimal point.
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }

    // Pushes the accumulated span as an operand token, if non-empty.
    fn save_token(&mut self) {
        if self.has_token() {
            let token =
                Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
            self.items.push(token);
        }
    }

    /// Consumes a quoted region starting at the current offset.
    ///
    /// `"` delimits a string operand (emitted immediately); `'` delimits
    /// a quoted name that stays part of the surrounding reference token.
    /// A doubled delimiter inside the region escapes a literal delimiter.
    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // A lone '$' pending before a `'` keeps the quote in the same
        // token (NOTE(review): presumably `$'Sheet'`-style absolute
        // references — confirm against callers).
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        if !is_dollar_ref && self.has_token() {
            // A pending token ending in ':' is a range in progress
            // (e.g. `A1:'Sheet 2'!B2`): keep accumulating into it.
            if self.token_end > 0 && self.formula.as_bytes()[self.token_end - 1] != b':' {
                self.save_token();
                self.start_token();
            }
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        // Step past the opening delimiter, then scan for the closer.
        self.offset += 1;
        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    // Doubled delimiter: escaped literal; keep scanning.
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        // Double-quoted text is a complete operand.
                        let token = Token::make_operand_from_slice(
                            &self.formula,
                            string_start,
                            self.offset,
                        );
                        self.items.push(token);
                        self.start_token();
                    } else {
                        // Quoted name: extend the pending token so any
                        // following `!cell` part attaches to it.
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }

    // Consumes a bracketed `[...]` region, honoring nesting, and folds
    // it into the current token rather than emitting a separate one.
    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        // Include the closing ']' in the token.
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }

    // Consumes a `#...` error literal (e.g. `#DIV/0!`), matched
    // case-insensitively against ERROR_CODES.
    fn parse_error(&mut self) -> Result<(), TokenizerError> {
        // A pending token ending in '!' is a sheet prefix (`Sheet1!#REF!`)
        // and the error literal is folded into it; any other pending
        // token is flushed first.
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice.eq_ignore_ascii_case(err_bytes) {
                    let token = Token::make_operand_from_slice(
                        &self.formula,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.items.push(token);
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        Err(TokenizerError {
            message: format!("Invalid error code at position {}", self.offset),
            pos: self.offset,
        })
    }

    // Consumes a run of spaces/newlines as a single Whitespace token.
    fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.items.push(Token::from_slice(
            &self.formula,
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        ));
        self.start_token();
        Ok(())
    }

    // Consumes one operator, preferring the two-byte comparison forms
    // (`>=`, `<=`, `<>`) over their one-byte prefixes.
    fn parse_operator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.items.push(Token::from_slice(
                    &self.formula,
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                ));
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'@' => TokenType::OpPrefix,
            b'%' => TokenType::OpPostfix,
            b'+' | b'-' => {
                // '+'/'-' is infix only when the last non-whitespace
                // token can end an expression (closer, postfix op, or
                // operand); otherwise it is unary.
                if self.items.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self
                        .items
                        .iter()
                        .rev()
                        .find(|t| t.token_type != TokenType::Whitespace);
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            TokenSubType::None,
            self.offset,
            self.offset + 1,
        ));
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    // Consumes '(' or '{'. A '(' directly after a pending token starts
    // a function call: the opener token spans the name plus '('. A '{'
    // (or a bare '(') opens an array/subexpression.
    fn parse_opener(&mut self) -> Result<(), TokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        } else if self.has_token() {
            // Function opener: span covers `NAME(`.
            let token = Token::make_subexp_from_slice(
                &self.formula,
                true,
                self.token_start,
                self.offset + 1,
            );
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        };

        // The opener is both emitted and stacked for closer matching.
        self.items.push(token.clone());
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    // Consumes ')' or '}', matching it against the most recent opener.
    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            let closer = open_token.get_closer()?;
            // Reject `(` closed by `}` and `{` closed by `)`.
            if (curr_byte == b'}' && closer.value != "}")
                || (curr_byte == b')' && closer.value != ")")
            {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.items.push(Token::from_slice(
                &self.formula,
                closer.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            ));
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    // Consumes ',' or ';'. Meaning depends on context and dialect:
    // ',' is an argument separator inside a function/array, otherwise
    // an infix operator; ';' is a row separator inside an array, an
    // argument separator (OpenFormula) inside a function, an infix
    // operator (OpenFormula) elsewhere, and a row separator otherwise.
    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            subtype,
            self.offset,
            self.offset + 1,
        ));

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Reconstructs formula text from the token list: a Literal is
    /// returned verbatim, otherwise '=' plus the concatenated token
    /// values.
    pub fn render(&self) -> String {
        if self.items.is_empty() {
            "".to_string()
        } else if self.items[0].token_type == TokenType::Literal {
            self.items[0].value.clone()
        } else {
            let concatenated: String = self.items.iter().map(|t| t.value.clone()).collect();
            format!("={concatenated}")
        }
    }

    /// The dialect this tokenizer was constructed with.
    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }
}
1843
1844impl TryFrom<&str> for Tokenizer {
1845 type Error = TokenizerError;
1846
1847 fn try_from(value: &str) -> Result<Self, Self::Error> {
1848 Tokenizer::new(value)
1849 }
1850}
1851
1852impl TryFrom<String> for Tokenizer {
1853 type Error = TokenizerError;
1854
1855 fn try_from(value: String) -> Result<Self, Self::Error> {
1856 Tokenizer::new(&value)
1857 }
1858}