use std::convert::TryFrom;
use std::error::Error;
use std::fmt::{self, Display};
use std::sync::Arc;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::types::FormulaDialect;

const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%";

const fn build_token_enders() -> [bool; 256] {
    let mut tbl = [false; 256];
    let bytes = TOKEN_ENDERS.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        tbl[bytes[i] as usize] = true;
        i += 1;
    }
    tbl
}

static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();

#[inline(always)]
fn is_token_ender(c: u8) -> bool {
    TOKEN_ENDERS_TABLE[c as usize]
}
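
// A test-only sketch: the precomputed lookup table should agree with a
// naive `contains` scan over TOKEN_ENDERS for every possible byte value.
#[cfg(test)]
mod token_ender_table_tests {
    use super::*;

    #[test]
    fn table_matches_naive_contains() {
        for b in 0u8..=255 {
            assert_eq!(is_token_ender(b), TOKEN_ENDERS.as_bytes().contains(&b));
        }
    }
}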

static ERROR_CODES: &[&str] = &[
    "#NULL!",
    "#DIV/0!",
    "#VALUE!",
    "#REF!",
    "#NAME?",
    "#NUM!",
    "#N/A",
    "#GETTING_DATA",
];
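
// Sketch: each entry in ERROR_CODES should tokenize to a single Error
// operand when it forms the whole formula body.
#[cfg(test)]
mod error_code_tests {
    use super::*;

    #[test]
    fn each_error_code_is_one_operand() {
        for code in ERROR_CODES {
            let formula = format!("={code}");
            let spans =
                tokenize_spans_with_dialect(&formula, FormulaDialect::Excel).unwrap();
            assert_eq!(spans.len(), 1, "unexpected split for {code}");
            assert_eq!(spans[0].subtype, TokenSubType::Error);
        }
    }
}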

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
    Left,
    Right,
}

#[derive(Debug)]
pub struct TokenizerError {
    pub message: String,
    pub pos: usize,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "TokenizerError: {}", self.message)
    }
}

impl Error for TokenizerError {}

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    Literal,
    Operand,
    Func,
    Array,
    Paren,
    Sep,
    OpPrefix,
    OpInfix,
    OpPostfix,
    Whitespace,
}

impl Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{self:?}")
    }
}

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
    None,
    Text,
    Number,
    Logical,
    Error,
    Range,
    Open,
    Close,
    Arg,
    Row,
}

impl Display for TokenSubType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{self:?}")
    }
}

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Token {
    pub value: String,
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    pub start: usize,
    pub end: usize,
}

impl Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "<{} subtype: {:?} value: {}>",
            self.token_type, self.subtype, self.value
        )
    }
}

impl Token {
    pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
        Token {
            value,
            token_type,
            subtype,
            start: 0,
            end: 0,
        }
    }

    pub fn new_with_span(
        value: String,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) -> Self {
        Token {
            value,
            token_type,
            subtype,
            start,
            end,
        }
    }

    fn from_slice(
        source: &str,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) -> Self {
        Token {
            value: source[start..end].to_string(),
            token_type,
            subtype,
            start,
            end,
        }
    }

    pub fn is_operator(&self) -> bool {
        matches!(
            self.token_type,
            TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
        )
    }

    pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
        // "u" stands in for unary +/- so prefix operators share one entry.
        let op = if self.token_type == TokenType::OpPrefix {
            "u"
        } else {
            self.value.as_str()
        };

        match op {
            // ':' (range), ' ' (intersection), and ',' (union) are the
            // reference operators and bind tightest.
            ":" | " " | "," => Some((8, Associativity::Left)),
            "%" => Some((7, Associativity::Left)),
            "^" => Some((6, Associativity::Right)),
            "u" => Some((5, Associativity::Right)),
            "*" | "/" => Some((4, Associativity::Left)),
            "+" | "-" => Some((3, Associativity::Left)),
            "&" => Some((2, Associativity::Left)),
            "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
            _ => None,
        }
    }

    pub fn make_operand(value: String) -> Self {
        let subtype = operand_subtype(&value);
        Token::new(value, TokenType::Operand, subtype)
    }

    pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
        let subtype = operand_subtype(&value);
        Token::new_with_span(value, TokenType::Operand, subtype, start, end)
    }

    fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
        let subtype = operand_subtype(&source[start..end]);
        Token::from_slice(source, TokenType::Operand, subtype, start, end)
    }

    pub fn make_subexp(value: &str, func: bool) -> Self {
        let last_char = value.chars().last().expect("Empty token value");
        assert!(matches!(last_char, '{' | '}' | '(' | ')'));
        let token_type = if func {
            TokenType::Func
        } else if "{}".contains(last_char) {
            TokenType::Array
        } else if "()".contains(last_char) {
            TokenType::Paren
        } else {
            TokenType::Func
        };
        let subtype = if ")}".contains(last_char) {
            TokenSubType::Close
        } else {
            TokenSubType::Open
        };
        Token::new(value.to_string(), token_type, subtype)
    }

    pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
        let last_char = value.chars().last().expect("Empty token value");
        assert!(matches!(last_char, '{' | '}' | '(' | ')'));
        let token_type = if func {
            TokenType::Func
        } else if "{}".contains(last_char) {
            TokenType::Array
        } else if "()".contains(last_char) {
            TokenType::Paren
        } else {
            TokenType::Func
        };
        let subtype = if ")}".contains(last_char) {
            TokenSubType::Close
        } else {
            TokenSubType::Open
        };
        Token::new_with_span(value.to_string(), token_type, subtype, start, end)
    }

    fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
        let value_str = &source[start..end];
        let last_char = value_str.chars().last().expect("Empty token value");
        let token_type = if func {
            TokenType::Func
        } else if "{}".contains(last_char) {
            TokenType::Array
        } else if "()".contains(last_char) {
            TokenType::Paren
        } else {
            TokenType::Func
        };
        let subtype = if ")}".contains(last_char) {
            TokenSubType::Close
        } else {
            TokenSubType::Open
        };
        Token::from_slice(source, token_type, subtype, start, end)
    }

    pub fn get_closer(&self) -> Result<Token, TokenizerError> {
        if self.subtype != TokenSubType::Open {
            return Err(TokenizerError {
                message: "Token is not an opener".to_string(),
                pos: 0,
            });
        }
        let closer_value = if self.token_type == TokenType::Array {
            "}"
        } else {
            ")"
        };
        Ok(Token::make_subexp(
            closer_value,
            self.token_type == TokenType::Func,
        ))
    }

    pub fn make_separator(value: &str) -> Self {
        assert!(value == "," || value == ";");
        let subtype = if value == "," {
            TokenSubType::Arg
        } else {
            TokenSubType::Row
        };
        Token::new(value.to_string(), TokenType::Sep, subtype)
    }

    pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
        assert!(value == "," || value == ";");
        let subtype = if value == "," {
            TokenSubType::Arg
        } else {
            TokenSubType::Row
        };
        Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
    }
}
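
// Illustrative sketches for operand classification and the precedence
// table: '^' binds tighter than '*' and associates to the right.
#[cfg(test)]
mod token_tests {
    use super::*;

    #[test]
    fn operand_subtypes() {
        assert_eq!(Token::make_operand("\"hi\"".into()).subtype, TokenSubType::Text);
        assert_eq!(Token::make_operand("#REF!".into()).subtype, TokenSubType::Error);
        assert_eq!(Token::make_operand("TRUE".into()).subtype, TokenSubType::Logical);
        assert_eq!(Token::make_operand("1.5E+3".into()).subtype, TokenSubType::Number);
        assert_eq!(Token::make_operand("A1".into()).subtype, TokenSubType::Range);
    }

    #[test]
    fn caret_outranks_star_and_is_right_associative() {
        let caret = Token::new("^".into(), TokenType::OpInfix, TokenSubType::None);
        let star = Token::new("*".into(), TokenType::OpInfix, TokenSubType::None);
        let (p_caret, assoc) = caret.get_precedence().unwrap();
        let (p_star, _) = star.get_precedence().unwrap();
        assert!(p_caret > p_star);
        assert_eq!(assoc, Associativity::Right);
    }
}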

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenSpan {
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    pub start: usize,
    pub end: usize,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenView<'a> {
    pub span: &'a TokenSpan,
    pub value: &'a str,
}

#[derive(Debug, Clone)]
pub struct TokenStream {
    source: Arc<str>,
    pub spans: Vec<TokenSpan>,
    dialect: FormulaDialect,
}

impl TokenStream {
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let source: Arc<str> = Arc::from(formula);
        let spans = tokenize_spans_with_dialect(source.as_ref(), dialect)?;
        Ok(TokenStream {
            source,
            spans,
            dialect,
        })
    }

    pub fn source(&self) -> &str {
        &self.source
    }

    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }

    pub fn len(&self) -> usize {
        self.spans.len()
    }

    pub fn is_empty(&self) -> bool {
        self.spans.is_empty()
    }

    pub fn get(&self, index: usize) -> Option<TokenView<'_>> {
        let span = self.spans.get(index)?;
        let value = self.source.get(span.start..span.end)?;
        Some(TokenView { span, value })
    }

    pub fn to_tokens(&self) -> Vec<Token> {
        self.spans
            .iter()
            .map(|s| {
                let value = self
                    .source
                    .get(s.start..s.end)
                    .unwrap_or_default()
                    .to_string();
                Token::new_with_span(value, s.token_type, s.subtype, s.start, s.end)
            })
            .collect()
    }

    // Concatenates the stored slices; the leading '=' is not covered by any
    // span, so the result omits it.
    pub fn render(&self) -> String {
        let mut out = String::with_capacity(self.source.len());
        for span in &self.spans {
            if let Some(s) = self.source.get(span.start..span.end) {
                out.push_str(s);
            }
        }
        out
    }
}
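
// End-to-end sketch for the zero-copy API: tokenize, inspect a view, and
// note that `render` concatenates the token slices, which begin after the
// leading '=' consumed by the parser.
#[cfg(test)]
mod token_stream_tests {
    use super::*;

    #[test]
    fn views_and_render() {
        let src = "=SUM(A1:B2)+1";
        let stream = TokenStream::new(src).expect("tokenizes");
        let first = stream.get(0).expect("has a first token");
        assert_eq!(first.span.token_type, TokenType::Func);
        assert_eq!(first.value, "SUM(");
        assert_eq!(stream.render(), &src[1..]);
    }
}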

pub(crate) fn tokenize_spans_with_dialect(
    formula: &str,
    dialect: FormulaDialect,
) -> Result<Vec<TokenSpan>, TokenizerError> {
    let mut tokenizer = SpanTokenizer::new(formula, dialect);
    tokenizer.parse()?;
    Ok(tokenizer.spans)
}
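
// Dialect sketch: inside a function call, ';' is an argument separator in
// the OpenFormula dialect but a row separator in the Excel dialect.
#[cfg(test)]
mod dialect_tests {
    use super::*;

    #[test]
    fn semicolon_separator_depends_on_dialect() {
        let sep = |dialect| {
            tokenize_spans_with_dialect("=SUM(1;2)", dialect)
                .unwrap()
                .into_iter()
                .find(|s| s.token_type == TokenType::Sep)
                .map(|s| s.subtype)
        };
        assert_eq!(sep(FormulaDialect::Excel), Some(TokenSubType::Row));
        assert_eq!(sep(FormulaDialect::OpenFormula), Some(TokenSubType::Arg));
    }
}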

fn operand_subtype(value_str: &str) -> TokenSubType {
    if value_str.starts_with('"') {
        TokenSubType::Text
    } else if value_str.starts_with('#') {
        TokenSubType::Error
    } else if value_str == "TRUE" || value_str == "FALSE" {
        TokenSubType::Logical
    } else if value_str.parse::<f64>().is_ok() {
        TokenSubType::Number
    } else {
        TokenSubType::Range
    }
}

struct SpanTokenizer<'a> {
    formula: &'a str,
    spans: Vec<TokenSpan>,
    token_stack: Vec<TokenSpan>,
    offset: usize,
    token_start: usize,
    token_end: usize,
    dialect: FormulaDialect,
}

impl<'a> SpanTokenizer<'a> {
    fn new(formula: &'a str, dialect: FormulaDialect) -> Self {
        SpanTokenizer {
            formula,
            spans: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
        }
    }

    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    fn push_span(
        &mut self,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) {
        self.spans.push(TokenSpan {
            token_type,
            subtype,
            start,
            end,
        });
    }

    fn save_token(&mut self) {
        if self.has_token() {
            let value_str = &self.formula[self.token_start..self.token_end];
            let subtype = operand_subtype(value_str);
            self.push_span(
                TokenType::Operand,
                subtype,
                self.token_start,
                self.token_end,
            );
        }
    }

    // Treat '+'/'-' as part of the current token when it follows an exponent
    // marker (e.g. "1.5E+2"), instead of splitting at the sign.
    fn check_scientific_notation(&mut self) -> bool {
        if let Some(curr_byte) = self.current_byte()
            && (curr_byte == b'+' || curr_byte == b'-')
            && self.has_token()
            && self.is_scientific_notation_base()
        {
            self.offset += 1;
            self.extend_token();
            return true;
        }
        false
    }

    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        let mut dot_seen = false;
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }

    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        if self.formula.as_bytes()[0] != b'=' {
            self.push_span(
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            );
            return Ok(());
        }

        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            if self.check_scientific_notation() {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        if self.has_token() {
            self.save_token();
        }

        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }

    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // A lone '$' immediately before a quote belongs to the quoted
        // reference that follows, so keep it in the current token.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        if !is_dollar_ref
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b':'
        {
            self.save_token();
            self.start_token();
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                // A doubled delimiter is an escaped quote; keep scanning.
                if self.offset < self.formula.len()
                    && self.formula.as_bytes()[self.offset] == delim
                {
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        let value_str = &self.formula[string_start..self.offset];
                        let subtype = operand_subtype(value_str);
                        self.push_span(TokenType::Operand, subtype, string_start, self.offset);
                        self.start_token();
                    } else {
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }

    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }

    fn parse_error(&mut self) -> Result<(), TokenizerError> {
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice == err_bytes {
                    self.push_span(
                        TokenType::Operand,
                        TokenSubType::Error,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        Err(TokenizerError {
            message: format!("Invalid error code at position {}", self.offset),
            pos: self.offset,
        })
    }

    fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.push_span(
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        );
        self.start_token();
        Ok(())
    }

    fn prev_non_whitespace(&self) -> Option<&TokenSpan> {
        self.spans
            .iter()
            .rev()
            .find(|t| t.token_type != TokenType::Whitespace)
    }

    fn parse_operator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.push_span(
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                );
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'%' => TokenType::OpPostfix,
            // '+' and '-' are prefix unless they follow something that can
            // end an expression (a closer, a postfix operator, or an operand).
            b'+' | b'-' => {
                if self.spans.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self.prev_non_whitespace();
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.push_span(token_type, TokenSubType::None, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_opener(&mut self) -> Result<(), TokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            TokenSpan {
                token_type: TokenType::Array,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        } else if self.has_token() {
            let token = TokenSpan {
                token_type: TokenType::Func,
                subtype: TokenSubType::Open,
                start: self.token_start,
                end: self.offset + 1,
            };
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            TokenSpan {
                token_type: TokenType::Paren,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        };

        self.spans.push(token);
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            let expected = if open_token.token_type == TokenType::Array {
                b'}'
            } else {
                b')'
            };
            if curr_byte != expected {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.push_span(
                open_token.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            );
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.push_span(token_type, subtype, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
}
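
// Scientific-notation sketch: the sign inside "1.5E+2" must not split the
// number, even though '+' is otherwise a token ender.
#[cfg(test)]
mod scientific_notation_tests {
    use super::*;

    #[test]
    fn exponent_sign_does_not_split_number() {
        let spans = tokenize_spans_with_dialect("=1.5E+2", FormulaDialect::Excel).unwrap();
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].subtype, TokenSubType::Number);
    }
}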

// Owned-token counterpart of `SpanTokenizer`: materializes `Token` values
// instead of spans into the source string.
pub struct Tokenizer {
    formula: String,
    pub items: Vec<Token>,
    token_stack: Vec<Token>,
    offset: usize,
    token_start: usize,
    token_end: usize,
    dialect: FormulaDialect,
}

impl Tokenizer {
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let mut tokenizer = Tokenizer {
            formula: formula.to_string(),
            items: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
        };
        tokenizer.parse()?;
        Ok(tokenizer)
    }

    pub fn from_token_stream(stream: &TokenStream) -> Self {
        Tokenizer {
            formula: stream.source.to_string(),
            items: stream.to_tokens(),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect: stream.dialect,
        }
    }

    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        if self.formula.as_bytes()[0] != b'=' {
            self.items.push(Token::new_with_span(
                self.formula.clone(),
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            ));
            return Ok(());
        }

        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            if self.check_scientific_notation()? {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        if self.has_token() {
            self.save_token();
        }

        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }

    fn check_scientific_notation(&mut self) -> Result<bool, TokenizerError> {
        if let Some(curr_byte) = self.current_byte()
            && (curr_byte == b'+' || curr_byte == b'-')
            && self.has_token()
            && self.is_scientific_notation_base()
        {
            self.offset += 1;
            self.extend_token();
            return Ok(true);
        }
        Ok(false)
    }

    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        let mut dot_seen = false;
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }

    fn save_token(&mut self) {
        if self.has_token() {
            let token =
                Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
            self.items.push(token);
        }
    }

    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // A lone '$' immediately before a quote belongs to the quoted
        // reference that follows, so keep it in the current token.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        if !is_dollar_ref
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b':'
        {
            self.save_token();
            self.start_token();
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                // A doubled delimiter is an escaped quote; keep scanning.
                if self.offset < self.formula.len()
                    && self.formula.as_bytes()[self.offset] == delim
                {
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        let token = Token::make_operand_from_slice(
                            &self.formula,
                            string_start,
                            self.offset,
                        );
                        self.items.push(token);
                        self.start_token();
                    } else {
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }

    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }

    fn parse_error(&mut self) -> Result<(), TokenizerError> {
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice == err_bytes {
                    let token = Token::make_operand_from_slice(
                        &self.formula,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.items.push(token);
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        Err(TokenizerError {
            message: format!("Invalid error code at position {}", self.offset),
            pos: self.offset,
        })
    }

    fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.items.push(Token::from_slice(
            &self.formula,
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        ));
        self.start_token();
        Ok(())
    }

    fn parse_operator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.items.push(Token::from_slice(
                    &self.formula,
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                ));
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'%' => TokenType::OpPostfix,
            // '+' and '-' are prefix unless they follow something that can
            // end an expression (a closer, a postfix operator, or an operand).
            b'+' | b'-' => {
                if self.items.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self
                        .items
                        .iter()
                        .rev()
                        .find(|t| t.token_type != TokenType::Whitespace);
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            TokenSubType::None,
            self.offset,
            self.offset + 1,
        ));
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_opener(&mut self) -> Result<(), TokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        } else if self.has_token() {
            let token = Token::make_subexp_from_slice(
                &self.formula,
                true,
                self.token_start,
                self.offset + 1,
            );
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        };

        self.items.push(token.clone());
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            let closer = open_token.get_closer()?;
            if (curr_byte == b'}' && closer.value != "}")
                || (curr_byte == b')' && closer.value != ")")
            {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.items.push(Token::from_slice(
                &self.formula,
                closer.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            ));
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            subtype,
            self.offset,
            self.offset + 1,
        ));

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    // Re-prepends the leading '=' that parsing consumed; literals are
    // returned verbatim.
    pub fn render(&self) -> String {
        if self.items.is_empty() {
            String::new()
        } else if self.items[0].token_type == TokenType::Literal {
            self.items[0].value.clone()
        } else {
            let concatenated: String = self.items.iter().map(|t| t.value.as_str()).collect();
            format!("={concatenated}")
        }
    }

    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }
}
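
// End-to-end sketches for the owned-token API: `render` reinstates the
// leading '=', and non-formula input stays a single Literal token.
#[cfg(test)]
mod tokenizer_tests {
    use super::*;

    #[test]
    fn render_round_trips_formula() {
        let tok = Tokenizer::new("=A1+ B2%").expect("tokenizes");
        assert_eq!(tok.render(), "=A1+ B2%");
        assert!(tok.items.iter().any(|t| t.token_type == TokenType::OpPostfix));
    }

    #[test]
    fn non_formula_input_is_a_single_literal() {
        let tok = Tokenizer::new("plain text").expect("tokenizes");
        assert_eq!(tok.items.len(), 1);
        assert_eq!(tok.items[0].token_type, TokenType::Literal);
    }
}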

impl TryFrom<&str> for Tokenizer {
    type Error = TokenizerError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Tokenizer::new(value)
    }
}

impl TryFrom<String> for Tokenizer {
    type Error = TokenizerError;

    fn try_from(value: String) -> Result<Self, Self::Error> {
        Tokenizer::new(&value)
    }
}
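
// Conversion sketch: the `TryFrom` impls above let call sites write
// `.try_into()` instead of naming `Tokenizer::new` directly.
#[cfg(test)]
mod try_from_tests {
    use super::*;

    #[test]
    fn str_try_into_tokenizer() {
        let tok: Tokenizer = "=1+2".try_into().expect("valid formula");
        assert_eq!(tok.render(), "=1+2");
    }
}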