use std::convert::TryFrom;
use std::error::Error;
use std::fmt::{self, Display};
use std::sync::Arc;

use crate::types::FormulaDialect;

const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%";

const fn build_token_enders() -> [bool; 256] {
    let mut tbl = [false; 256];
    let bytes = TOKEN_ENDERS.as_bytes();
    let mut i = 0;
    while i < bytes.len() {
        tbl[bytes[i] as usize] = true;
        i += 1;
    }
    tbl
}

static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();

#[inline(always)]
fn is_token_ender(c: u8) -> bool {
    TOKEN_ENDERS_TABLE[c as usize]
}
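
// Editor's illustration (not in the original source): in `=A1+B2` the `+`
// byte is in TOKEN_ENDERS, so the scanners below flush the pending `A1`
// operand before handing `+` to `parse_operator`.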

static ERROR_CODES: &[&str] = &[
    "#NULL!",
    "#DIV/0!",
    "#VALUE!",
    "#REF!",
    "#NAME?",
    "#NUM!",
    "#N/A",
    "#GETTING_DATA",
];

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
    Left,
    Right,
}

#[derive(Debug)]
pub struct TokenizerError {
    pub message: String,
    pub pos: usize,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "TokenizerError at position {}: {}", self.pos, self.message)
    }
}

impl Error for TokenizerError {}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    Literal,
    Operand,
    Func,
    Array,
    Paren,
    Sep,
    OpPrefix,
    OpInfix,
    OpPostfix,
    Whitespace,
}

impl Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{self:?}")
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
    None,
    Text,
    Number,
    Logical,
    Error,
    Range,
    Open,
    Close,
    Arg,
    Row,
}

impl Display for TokenSubType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{self:?}")
    }
}

#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Token {
    pub value: String,
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    pub start: usize,
    pub end: usize,
}

impl Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "<{} subtype: {:?} value: {}>",
            self.token_type, self.subtype, self.value
        )
    }
}

impl Token {
    pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
        Token {
            value,
            token_type,
            subtype,
            start: 0,
            end: 0,
        }
    }

    pub fn new_with_span(
        value: String,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) -> Self {
        Token {
            value,
            token_type,
            subtype,
            start,
            end,
        }
    }

    fn from_slice(
        source: &str,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) -> Self {
        Token {
            value: source[start..end].to_string(),
            token_type,
            subtype,
            start,
            end,
        }
    }

    pub fn is_operator(&self) -> bool {
        matches!(
            self.token_type,
            TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
        )
    }

    pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
        let op = if self.token_type == TokenType::OpPrefix {
            "u"
        } else {
            self.value.as_str()
        };

        match op {
            ":" | " " | "," => Some((8, Associativity::Left)),
            "%" => Some((7, Associativity::Left)),
            "^" => Some((6, Associativity::Right)),
            "u" => Some((5, Associativity::Right)),
            "*" | "/" => Some((4, Associativity::Left)),
            "+" | "-" => Some((3, Associativity::Left)),
            "&" => Some((2, Associativity::Left)),
            "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
            _ => None,
        }
    }
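
    // Editor's note (illustrative, not from the original source): with this
    // table, `^` (level 6, right-associative) binds tighter than the unary
    // prefix marker `u` (level 5), so `2^3^2` groups as `2^(3^2)` and `-2^2`
    // as `-(2^2)`. The range/union operators (`:`, ` `, `,`) bind tightest
    // of all at level 8.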

    pub fn make_operand(value: String) -> Self {
        let subtype = operand_subtype(&value);
        Token::new(value, TokenType::Operand, subtype)
    }
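
    // Classification sketch (editor's example, mirroring `operand_subtype`):
    //   make_operand("\"hi\"".into()) -> TokenSubType::Text
    //   make_operand("#REF!".into())  -> TokenSubType::Error
    //   make_operand("TRUE".into())   -> TokenSubType::Logical
    //   make_operand("1.5e3".into())  -> TokenSubType::Number
    //   make_operand("A1:B2".into())  -> TokenSubType::Range (the fallback)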

    pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
        let subtype = operand_subtype(&value);
        Token::new_with_span(value, TokenType::Operand, subtype, start, end)
    }

    fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
        let subtype = operand_subtype(&source[start..end]);
        Token::from_slice(source, TokenType::Operand, subtype, start, end)
    }

    pub fn make_subexp(value: &str, func: bool) -> Self {
        let last_char = value.chars().last().expect("Empty token value");
        assert!(matches!(last_char, '{' | '}' | '(' | ')'));
        let token_type = if func {
            TokenType::Func
        } else if "{}".contains(last_char) {
            TokenType::Array
        } else {
            TokenType::Paren
        };
        let subtype = if ")}".contains(last_char) {
            TokenSubType::Close
        } else {
            TokenSubType::Open
        };
        Token::new(value.to_string(), token_type, subtype)
    }

    pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
        let last_char = value.chars().last().expect("Empty token value");
        assert!(matches!(last_char, '{' | '}' | '(' | ')'));
        let token_type = if func {
            TokenType::Func
        } else if "{}".contains(last_char) {
            TokenType::Array
        } else {
            TokenType::Paren
        };
        let subtype = if ")}".contains(last_char) {
            TokenSubType::Close
        } else {
            TokenSubType::Open
        };
        Token::new_with_span(value.to_string(), token_type, subtype, start, end)
    }

    fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
        let value_str = &source[start..end];
        let last_char = value_str.chars().last().expect("Empty token value");
        let token_type = if func {
            TokenType::Func
        } else if "{}".contains(last_char) {
            TokenType::Array
        } else if "()".contains(last_char) {
            TokenType::Paren
        } else {
            TokenType::Func
        };
        let subtype = if ")}".contains(last_char) {
            TokenSubType::Close
        } else {
            TokenSubType::Open
        };
        Token::from_slice(source, token_type, subtype, start, end)
    }

    pub fn get_closer(&self) -> Result<Token, TokenizerError> {
        if self.subtype != TokenSubType::Open {
            return Err(TokenizerError {
                message: "Token is not an opener".to_string(),
                pos: 0,
            });
        }
        let closer_value = if self.token_type == TokenType::Array {
            "}"
        } else {
            ")"
        };
        Ok(Token::make_subexp(
            closer_value,
            self.token_type == TokenType::Func,
        ))
    }
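
    // Editor's sketch: an Array opener (`{`) closes with `}` while Func and
    // Paren openers close with `)`; asking a non-opener for its closer is an
    // error. For example, `Token::make_subexp("SUM(", true).get_closer()`
    // yields a `)` token of type Func.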

    pub fn make_separator(value: &str) -> Self {
        assert!(value == "," || value == ";");
        let subtype = if value == "," {
            TokenSubType::Arg
        } else {
            TokenSubType::Row
        };
        Token::new(value.to_string(), TokenType::Sep, subtype)
    }

    pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
        assert!(value == "," || value == ";");
        let subtype = if value == "," {
            TokenSubType::Arg
        } else {
            TokenSubType::Row
        };
        Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct TokenSpan {
    pub token_type: TokenType,
    pub subtype: TokenSubType,
    pub start: usize,
    pub end: usize,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenView<'a> {
    pub span: &'a TokenSpan,
    pub value: &'a str,
}

#[derive(Debug, Clone)]
pub struct TokenStream {
    source: Arc<str>,
    pub spans: Vec<TokenSpan>,
    dialect: FormulaDialect,
}

impl TokenStream {
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let source: Arc<str> = Arc::from(formula);
        let spans = tokenize_spans_with_dialect(source.as_ref(), dialect)?;
        Ok(TokenStream {
            source,
            spans,
            dialect,
        })
    }

    pub fn source(&self) -> &str {
        &self.source
    }

    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }

    pub fn len(&self) -> usize {
        self.spans.len()
    }

    pub fn is_empty(&self) -> bool {
        self.spans.is_empty()
    }

    pub fn get(&self, index: usize) -> Option<TokenView<'_>> {
        let span = self.spans.get(index)?;
        let value = self.source.get(span.start..span.end)?;
        Some(TokenView { span, value })
    }

    pub fn to_tokens(&self) -> Vec<Token> {
        self.spans
            .iter()
            .map(|s| {
                let value = self
                    .source
                    .get(s.start..s.end)
                    .unwrap_or_default()
                    .to_string();
                Token::new_with_span(value, s.token_type, s.subtype, s.start, s.end)
            })
            .collect()
    }

    pub fn render(&self) -> String {
        let mut out = String::with_capacity(self.source.len());
        for span in &self.spans {
            if let Some(s) = self.source.get(span.start..span.end) {
                out.push_str(s);
            }
        }
        out
    }
}
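
// Usage sketch (editor's example; `FormulaDialect::Excel` is the default):
//
//     let stream = TokenStream::new("=SUM(A1,2)")?;
//     assert_eq!(stream.len(), 5); // "SUM(", "A1", ",", "2", ")"
//     // Spans borrow from the shared source; render() re-concatenates
//     // them without the leading '=':
//     assert_eq!(stream.render(), "SUM(A1,2)");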

pub(crate) fn tokenize_spans_with_dialect(
    formula: &str,
    dialect: FormulaDialect,
) -> Result<Vec<TokenSpan>, TokenizerError> {
    let mut tokenizer = SpanTokenizer::new(formula, dialect);
    tokenizer.parse()?;
    Ok(tokenizer.spans)
}

fn operand_subtype(value_str: &str) -> TokenSubType {
    if value_str.starts_with('"') {
        TokenSubType::Text
    } else if value_str.starts_with('#') {
        TokenSubType::Error
    } else if value_str == "TRUE" || value_str == "FALSE" {
        TokenSubType::Logical
    } else if value_str.parse::<f64>().is_ok() {
        TokenSubType::Number
    } else {
        TokenSubType::Range
    }
}

struct SpanTokenizer<'a> {
    formula: &'a str,
    spans: Vec<TokenSpan>,
    token_stack: Vec<TokenSpan>,
    offset: usize,
    token_start: usize,
    token_end: usize,
    dialect: FormulaDialect,
}

impl<'a> SpanTokenizer<'a> {
    fn new(formula: &'a str, dialect: FormulaDialect) -> Self {
        SpanTokenizer {
            formula,
            spans: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
        }
    }

    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    fn push_span(
        &mut self,
        token_type: TokenType,
        subtype: TokenSubType,
        start: usize,
        end: usize,
    ) {
        self.spans.push(TokenSpan {
            token_type,
            subtype,
            start,
            end,
        });
    }

    fn save_token(&mut self) {
        if self.has_token() {
            let value_str = &self.formula[self.token_start..self.token_end];
            let subtype = operand_subtype(value_str);
            self.push_span(
                TokenType::Operand,
                subtype,
                self.token_start,
                self.token_end,
            );
        }
    }

    fn check_scientific_notation(&mut self) -> bool {
        if let Some(curr_byte) = self.current_byte()
            && (curr_byte == b'+' || curr_byte == b'-')
            && self.has_token()
            && self.is_scientific_notation_base()
        {
            self.offset += 1;
            self.extend_token();
            return true;
        }
        false
    }

    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        let mut dot_seen = false;
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }
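
    // Editor's walk-through: when scanning `=1.5E+10`, the pending token is
    // `1.5E` as the scanner reaches `+`. The base check accepts it (digit
    // first, `E`/`e` last, digits and at most one dot in between), so the
    // sign is folded into the number instead of being emitted as an infix
    // operator.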

    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        if self.formula.as_bytes()[0] != b'=' {
            self.push_span(
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            );
            return Ok(());
        }

        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            if self.check_scientific_notation() {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        if self.has_token() {
            self.save_token();
        }

        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }

    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        if !is_dollar_ref
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b':'
        {
            self.save_token();
            self.start_token();
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        let value_str = &self.formula[string_start..self.offset];
                        let subtype = operand_subtype(value_str);
                        self.push_span(TokenType::Operand, subtype, string_start, self.offset);
                        self.start_token();
                    } else {
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }
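
    // Editor's note: a doubled delimiter is an escape, so `="a""b"` yields a
    // single Text operand `"a""b"`. For single quotes the matched span is
    // folded into the pending token (quoted sheet-name references) rather
    // than emitted as a string operand.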

    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }

    fn parse_error(&mut self) -> Result<(), TokenizerError> {
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice == err_bytes {
                    self.push_span(
                        TokenType::Operand,
                        TokenSubType::Error,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        Err(TokenizerError {
            message: format!("Invalid error code at position {}", self.offset),
            pos: self.offset,
        })
    }
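
    // Editor's note: a pending token ending in `!` (a sheet prefix such as
    // `Sheet1!`) is deliberately not flushed, so `=Sheet1!#REF!` produces one
    // Error operand spanning `Sheet1!#REF!`.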

    fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.push_span(
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        );
        self.start_token();
        Ok(())
    }

    fn prev_non_whitespace(&self) -> Option<&TokenSpan> {
        self.spans
            .iter()
            .rev()
            .find(|t| t.token_type != TokenType::Whitespace)
    }

    fn parse_operator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.push_span(
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                );
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'%' => TokenType::OpPostfix,
            b'+' | b'-' => {
                if self.spans.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self.prev_non_whitespace();
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.push_span(token_type, TokenSubType::None, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_opener(&mut self) -> Result<(), TokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            TokenSpan {
                token_type: TokenType::Array,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        } else if self.has_token() {
            let token = TokenSpan {
                token_type: TokenType::Func,
                subtype: TokenSubType::Open,
                start: self.token_start,
                end: self.offset + 1,
            };
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            TokenSpan {
                token_type: TokenType::Paren,
                subtype: TokenSubType::Open,
                start: self.offset,
                end: self.offset + 1,
            }
        };

        self.spans.push(token);
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            let expected = if open_token.token_type == TokenType::Array {
                b'}'
            } else {
                b')'
            };
            if curr_byte != expected {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.push_span(
                open_token.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            );
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.push_span(token_type, subtype, self.offset, self.offset + 1);
        self.offset += 1;
        self.start_token();
        Ok(())
    }
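
    // Dialect sketch (editor's example): in the Excel dialect `;` is a row
    // separator even inside a function, so `=SUM(1;2)` tokenizes `;` as
    // Sep/Row; in FormulaDialect::OpenFormula the same `;` becomes the
    // argument separator (Sep/Arg), and a bare `;` outside any call is
    // treated as an infix operator.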
}

pub struct Tokenizer {
    formula: String,
    pub items: Vec<Token>,
    token_stack: Vec<Token>,
    offset: usize,
    token_start: usize,
    token_end: usize,
    dialect: FormulaDialect,
}

impl Tokenizer {
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let mut tokenizer = Tokenizer {
            formula: formula.to_string(),
            items: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
        };
        tokenizer.parse()?;
        Ok(tokenizer)
    }

    pub fn from_token_stream(stream: &TokenStream) -> Self {
        Tokenizer {
            formula: stream.source.to_string(),
            items: stream.to_tokens(),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect: stream.dialect,
        }
    }

    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        if self.formula.as_bytes()[0] != b'=' {
            self.items.push(Token::new_with_span(
                self.formula.clone(),
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            ));
            return Ok(());
        }

        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            if self.check_scientific_notation()? {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        if self.has_token() {
            self.save_token();
        }

        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }

    fn check_scientific_notation(&mut self) -> Result<bool, TokenizerError> {
        if let Some(curr_byte) = self.current_byte()
            && (curr_byte == b'+' || curr_byte == b'-')
            && self.has_token()
            && self.is_scientific_notation_base()
        {
            self.offset += 1;
            self.extend_token();
            return Ok(true);
        }
        Ok(false)
    }

    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        let mut dot_seen = false;
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }

    fn save_token(&mut self) {
        if self.has_token() {
            let token =
                Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
            self.items.push(token);
        }
    }

    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        if !is_dollar_ref
            && self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b':'
        {
            self.save_token();
            self.start_token();
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        self.offset += 1;

        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        let token = Token::make_operand_from_slice(
                            &self.formula,
                            string_start,
                            self.offset,
                        );
                        self.items.push(token);
                        self.start_token();
                    } else {
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }

    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }

    fn parse_error(&mut self) -> Result<(), TokenizerError> {
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice == err_bytes {
                    let token = Token::make_operand_from_slice(
                        &self.formula,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.items.push(token);
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        Err(TokenizerError {
            message: format!("Invalid error code at position {}", self.offset),
            pos: self.offset,
        })
    }

    fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.items.push(Token::from_slice(
            &self.formula,
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        ));
        self.start_token();
        Ok(())
    }

    fn parse_operator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.items.push(Token::from_slice(
                    &self.formula,
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                ));
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'%' => TokenType::OpPostfix,
            b'+' | b'-' => {
                if self.items.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self
                        .items
                        .iter()
                        .rev()
                        .find(|t| t.token_type != TokenType::Whitespace);
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            TokenSubType::None,
            self.offset,
            self.offset + 1,
        ));
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_opener(&mut self) -> Result<(), TokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        } else if self.has_token() {
            let token = Token::make_subexp_from_slice(
                &self.formula,
                true,
                self.token_start,
                self.offset + 1,
            );
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        };

        self.items.push(token.clone());
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            let closer = open_token.get_closer()?;
            if (curr_byte == b'}' && closer.value != "}")
                || (curr_byte == b')' && closer.value != ")")
            {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.items.push(Token::from_slice(
                &self.formula,
                closer.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            ));
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            subtype,
            self.offset,
            self.offset + 1,
        ));

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    pub fn render(&self) -> String {
        if self.items.is_empty() {
            String::new()
        } else if self.items[0].token_type == TokenType::Literal {
            self.items[0].value.clone()
        } else {
            let concatenated: String = self.items.iter().map(|t| t.value.as_str()).collect();
            format!("={concatenated}")
        }
    }
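
    // Round-trip sketch (editor's example): `Tokenizer::new("=1+2")?.render()`
    // returns "=1+2", while a non-formula input such as "hello" is kept as a
    // single Literal token and rendered back unchanged.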

    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }
}

impl TryFrom<&str> for Tokenizer {
    type Error = TokenizerError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        Tokenizer::new(value)
    }
}

impl TryFrom<String> for Tokenizer {
    type Error = TokenizerError;

    fn try_from(value: String) -> Result<Self, Self::Error> {
        Tokenizer::new(&value)
    }
}
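
// A minimal sanity-check sketch added by the editor (not part of the original
// source). It exercises only behavior visible in this file.
#[cfg(test)]
mod editor_sketch_tests {
    use super::*;

    #[test]
    fn tokenizes_a_simple_function_call() {
        let tok = Tokenizer::new("=SUM(A1,2)").expect("tokenizer should succeed");
        let kinds: Vec<_> = tok.items.iter().map(|t| t.token_type).collect();
        assert_eq!(
            kinds,
            vec![
                TokenType::Func,    // "SUM("
                TokenType::Operand, // "A1"
                TokenType::Sep,     // ","
                TokenType::Operand, // "2"
                TokenType::Func,    // ")"
            ]
        );
        assert_eq!(tok.render(), "=SUM(A1,2)");
    }

    #[test]
    fn span_stream_folds_scientific_notation() {
        let stream = TokenStream::new("=1.5E+3*2").expect("stream should succeed");
        // The folded sign keeps "1.5E+3" as a single Number operand.
        let first = stream.get(0).expect("first token");
        assert_eq!(first.value, "1.5E+3");
        assert_eq!(first.span.subtype, TokenSubType::Number);
        assert_eq!(stream.render(), "1.5E+3*2");
    }
}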