1use std::convert::TryFrom;
2use std::error::Error;
3use std::fmt::{self, Display};
4
5use crate::types::FormulaDialect;
6
/// Single-byte characters that terminate an operand token.
const TOKEN_ENDERS: &str = ",;}) +-*/^&=><%";

/// Builds a 256-entry lookup table with `true` at every byte value that
/// appears in [`TOKEN_ENDERS`]. Evaluated at compile time, so the runtime
/// check in [`is_token_ender`] is a single array load.
const fn build_token_enders() -> [bool; 256] {
    let mut table = [false; 256];
    let enders = TOKEN_ENDERS.as_bytes();
    // `const fn` cannot use iterators, so walk the bytes manually.
    let mut idx = enders.len();
    while idx > 0 {
        idx -= 1;
        table[enders[idx] as usize] = true;
    }
    table
}

/// Compile-time token-ender membership table, indexed by byte value.
static TOKEN_ENDERS_TABLE: [bool; 256] = build_token_enders();

/// Returns `true` if byte `c` terminates the operand token being scanned.
#[inline(always)]
fn is_token_ender(c: u8) -> bool {
    TOKEN_ENDERS_TABLE[c as usize]
}
25
/// Spreadsheet error literals recognized by the tokenizer's `parse_error`.
/// Matching is an exact byte-prefix comparison at the cursor; no code here
/// is a prefix of another, so the scan order is not significant.
static ERROR_CODES: &[&str] = &[
    "#NULL!",
    "#DIV/0!",
    "#VALUE!",
    "#REF!",
    "#NAME?",
    "#NUM!",
    "#N/A",
    "#GETTING_DATA",
];
36
/// Operator associativity used when comparing precedences.
///
/// Derives `Clone`/`Copy` (it is a two-variant fieldless enum) so callers
/// can pass it by value without explicit clones.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Associativity {
    /// Groups left-to-right, e.g. `a - b - c` is `(a - b) - c`.
    Left,
    /// Groups right-to-left, e.g. unary minus.
    Right,
}
43
/// Error produced when a formula cannot be tokenized.
#[derive(Debug)]
pub struct TokenizerError {
    /// Human-readable description of the failure.
    pub message: String,
    /// Byte offset into the formula where the error was detected.
    pub pos: usize,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Surface `pos` so the error is actionable; previously the position
        // was stored but never shown to the caller.
        write!(f, "TokenizerError at position {}: {}", self.pos, self.message)
    }
}

impl Error for TokenizerError {}
58
/// Coarse classification of a formula token.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// Whole-input literal (formula did not start with '=').
    Literal,
    /// Value-producing token: number, text, logical, error, or range.
    Operand,
    /// Function call opener/closer (e.g. "SUM(").
    Func,
    /// Array literal delimiter ('{' / '}').
    Array,
    /// Plain grouping parenthesis.
    Paren,
    /// Argument or row separator.
    Sep,
    /// Unary prefix operator.
    OpPrefix,
    /// Binary infix operator.
    OpInfix,
    /// Postfix operator ('%').
    OpPostfix,
    /// Run of spaces/newlines.
    Whitespace,
}

impl Display for TokenType {
    /// Renders the bare variant name, reusing the `Debug` spelling.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}
79
/// Fine-grained classification refining [`TokenType`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenSubType {
    /// No extra classification (operators, literals, whitespace).
    None,
    /// Quoted text operand.
    Text,
    /// Numeric operand.
    Number,
    /// TRUE / FALSE operand.
    Logical,
    /// Error literal operand (e.g. "#REF!").
    Error,
    /// Cell/range reference operand.
    Range,
    /// Opening delimiter of a function/array/paren group.
    Open,
    /// Closing delimiter of a function/array/paren group.
    Close,
    /// Argument separator.
    Arg,
    /// Row separator inside an array.
    Row,
}

impl Display for TokenSubType {
    /// Renders the bare variant name, reusing the `Debug` spelling.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}
100
/// A single lexical token of a formula, with its source span.
#[derive(Debug, Clone, PartialEq, Hash)]
pub struct Token {
    /// The token's text as it appeared in (or was derived from) the formula.
    pub value: String,
    /// Coarse classification (operand, operator, delimiter, ...).
    pub token_type: TokenType,
    /// Fine-grained classification refining `token_type`.
    pub subtype: TokenSubType,
    /// Start byte offset in the source formula (0 when unknown).
    pub start: usize,
    /// End byte offset (exclusive) in the source formula (0 when unknown).
    pub end: usize,
}

impl Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "<{} subtype: {:?} value: {}>",
            self.token_type, self.subtype, self.value
        )
    }
}
120
121impl Token {
122 pub fn new(value: String, token_type: TokenType, subtype: TokenSubType) -> Self {
123 Token {
124 value,
125 token_type,
126 subtype,
127 start: 0,
128 end: 0,
129 }
130 }
131
132 pub fn new_with_span(
133 value: String,
134 token_type: TokenType,
135 subtype: TokenSubType,
136 start: usize,
137 end: usize,
138 ) -> Self {
139 Token {
140 value,
141 token_type,
142 subtype,
143 start,
144 end,
145 }
146 }
147
148 fn from_slice(
149 source: &str,
150 token_type: TokenType,
151 subtype: TokenSubType,
152 start: usize,
153 end: usize,
154 ) -> Self {
155 Token {
156 value: source[start..end].to_string(),
157 token_type,
158 subtype,
159 start,
160 end,
161 }
162 }
163
164 pub fn is_operator(&self) -> bool {
165 matches!(
166 self.token_type,
167 TokenType::OpPrefix | TokenType::OpInfix | TokenType::OpPostfix
168 )
169 }
170
171 pub fn get_precedence(&self) -> Option<(u8, Associativity)> {
172 let op = if self.token_type == TokenType::OpPrefix {
174 "u"
175 } else {
176 self.value.as_str()
177 };
178
179 match op {
180 ":" | " " | "," => Some((8, Associativity::Left)),
181 "u" => Some((7, Associativity::Right)),
182 "%" => Some((6, Associativity::Left)),
183 "^" => Some((5, Associativity::Left)),
184 "*" | "/" => Some((4, Associativity::Left)),
185 "+" | "-" => Some((3, Associativity::Left)),
186 "&" => Some((2, Associativity::Left)),
187 "=" | "<" | ">" | "<=" | ">=" | "<>" => Some((1, Associativity::Left)),
188 _ => None,
189 }
190 }
191
192 pub fn make_operand(value: String) -> Self {
194 let subtype = if value.starts_with('"') {
195 TokenSubType::Text
196 } else if value.starts_with('#') {
197 TokenSubType::Error
198 } else if value == "TRUE" || value == "FALSE" {
199 TokenSubType::Logical
200 } else if value.parse::<f64>().is_ok() {
201 TokenSubType::Number
202 } else {
203 TokenSubType::Range
204 };
205 Token::new(value, TokenType::Operand, subtype)
206 }
207
208 pub fn make_operand_with_span(value: String, start: usize, end: usize) -> Self {
210 let subtype = if value.starts_with('"') {
211 TokenSubType::Text
212 } else if value.starts_with('#') {
213 TokenSubType::Error
214 } else if value == "TRUE" || value == "FALSE" {
215 TokenSubType::Logical
216 } else if value.parse::<f64>().is_ok() {
217 TokenSubType::Number
218 } else {
219 TokenSubType::Range
220 };
221 Token::new_with_span(value, TokenType::Operand, subtype, start, end)
222 }
223
224 fn make_operand_from_slice(source: &str, start: usize, end: usize) -> Self {
225 let value_str = &source[start..end];
226 let subtype = if value_str.starts_with('"') {
227 TokenSubType::Text
228 } else if value_str.starts_with('#') {
229 TokenSubType::Error
230 } else if value_str == "TRUE" || value_str == "FALSE" {
231 TokenSubType::Logical
232 } else if value_str.parse::<f64>().is_ok() {
233 TokenSubType::Number
234 } else {
235 TokenSubType::Range
236 };
237 Token::from_slice(source, TokenType::Operand, subtype, start, end)
238 }
239
240 pub fn make_subexp(value: &str, func: bool) -> Self {
245 let last_char = value.chars().last().expect("Empty token value");
246 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
247 let token_type = if func {
248 TokenType::Func
249 } else if "{}".contains(last_char) {
250 TokenType::Array
251 } else if "()".contains(last_char) {
252 TokenType::Paren
253 } else {
254 TokenType::Func
255 };
256 let subtype = if ")}".contains(last_char) {
257 TokenSubType::Close
258 } else {
259 TokenSubType::Open
260 };
261 Token::new(value.to_string(), token_type, subtype)
262 }
263
264 pub fn make_subexp_with_span(value: &str, func: bool, start: usize, end: usize) -> Self {
266 let last_char = value.chars().last().expect("Empty token value");
267 assert!(matches!(last_char, '{' | '}' | '(' | ')'));
268 let token_type = if func {
269 TokenType::Func
270 } else if "{}".contains(last_char) {
271 TokenType::Array
272 } else if "()".contains(last_char) {
273 TokenType::Paren
274 } else {
275 TokenType::Func
276 };
277 let subtype = if ")}".contains(last_char) {
278 TokenSubType::Close
279 } else {
280 TokenSubType::Open
281 };
282 Token::new_with_span(value.to_string(), token_type, subtype, start, end)
283 }
284
285 fn make_subexp_from_slice(source: &str, func: bool, start: usize, end: usize) -> Self {
286 let value_str = &source[start..end];
287 let last_char = value_str.chars().last().expect("Empty token value");
288 let token_type = if func {
289 TokenType::Func
290 } else if "{}".contains(last_char) {
291 TokenType::Array
292 } else if "()".contains(last_char) {
293 TokenType::Paren
294 } else {
295 TokenType::Func
296 };
297 let subtype = if ")}".contains(last_char) {
298 TokenSubType::Close
299 } else {
300 TokenSubType::Open
301 };
302 Token::from_slice(source, token_type, subtype, start, end)
303 }
304
305 pub fn get_closer(&self) -> Result<Token, TokenizerError> {
307 if self.subtype != TokenSubType::Open {
308 return Err(TokenizerError {
309 message: "Token is not an opener".to_string(),
310 pos: 0,
311 });
312 }
313 let closer_value = if self.token_type == TokenType::Array {
314 "}"
315 } else {
316 ")"
317 };
318 Ok(Token::make_subexp(
319 closer_value,
320 self.token_type == TokenType::Func,
321 ))
322 }
323
324 pub fn make_separator(value: &str) -> Self {
326 assert!(value == "," || value == ";");
327 let subtype = if value == "," {
328 TokenSubType::Arg
329 } else {
330 TokenSubType::Row
331 };
332 Token::new(value.to_string(), TokenType::Sep, subtype)
333 }
334
335 pub fn make_separator_with_span(value: &str, start: usize, end: usize) -> Self {
337 assert!(value == "," || value == ";");
338 let subtype = if value == "," {
339 TokenSubType::Arg
340 } else {
341 TokenSubType::Row
342 };
343 Token::new_with_span(value.to_string(), TokenType::Sep, subtype, start, end)
344 }
345}
346
/// Eager tokenizer for spreadsheet formulas: construction parses the whole
/// formula and stores the resulting token list.
pub struct Tokenizer {
    // Owned copy of the formula text; all token spans index into it.
    formula: String,
    /// Tokens produced by parsing, in source order.
    pub items: Vec<Token>,
    // Stack of currently-open '('/'{'/function openers, popped to validate
    // each matching closer.
    token_stack: Vec<Token>,
    // Cursor: current byte offset into `formula`.
    offset: usize,
    // Start byte offset of the operand token being accumulated.
    token_start: usize,
    // End byte offset (exclusive) of the operand token being accumulated.
    token_end: usize,
    // Separator semantics (Excel vs OpenFormula).
    dialect: FormulaDialect,
}
357
impl Tokenizer {
    /// Tokenizes `formula` using the default Excel dialect.
    pub fn new(formula: &str) -> Result<Self, TokenizerError> {
        Self::new_with_dialect(formula, FormulaDialect::Excel)
    }

    /// Tokenizes `formula` under the given dialect. Parsing happens eagerly,
    /// so a successful return carries the complete token list in `items`.
    pub fn new_with_dialect(
        formula: &str,
        dialect: FormulaDialect,
    ) -> Result<Self, TokenizerError> {
        let mut tokenizer = Tokenizer {
            formula: formula.to_string(),
            // Rough pre-sizing to avoid repeated growth.
            items: Vec::with_capacity(formula.len() / 2),
            token_stack: Vec::with_capacity(16),
            offset: 0,
            token_start: 0,
            token_end: 0,
            dialect,
        };
        tokenizer.parse()?;
        Ok(tokenizer)
    }

    /// Byte under the cursor, or `None` at end of input.
    #[inline]
    fn current_byte(&self) -> Option<u8> {
        self.formula.as_bytes().get(self.offset).copied()
    }

    /// True when an operand token is currently being accumulated.
    #[inline]
    fn has_token(&self) -> bool {
        self.token_end > self.token_start
    }

    /// Resets the accumulating token to start (empty) at the cursor.
    #[inline]
    fn start_token(&mut self) {
        self.token_start = self.offset;
        self.token_end = self.offset;
    }

    /// Extends the accumulating token up to the cursor.
    #[inline]
    fn extend_token(&mut self) {
        self.token_end = self.offset;
    }

    /// Top-level driver: splits `self.formula` into tokens.
    ///
    /// A formula that does not start with '=' becomes one Literal token.
    /// Errors on unterminated strings/brackets, unknown error codes, and
    /// unbalanced parentheses.
    fn parse(&mut self) -> Result<(), TokenizerError> {
        if self.formula.is_empty() {
            return Ok(());
        }

        // Non-formula input: emit the whole text as a single Literal token.
        if self.formula.as_bytes()[0] != b'=' {
            self.items.push(Token::new_with_span(
                self.formula.clone(),
                TokenType::Literal,
                TokenSubType::None,
                0,
                self.formula.len(),
            ));
            return Ok(());
        }

        // Skip the leading '='.
        self.offset = 1;
        self.start_token();

        while self.offset < self.formula.len() {
            // A '+'/'-' right after a mantissa like "1.5E" is an exponent
            // sign, not an operator; absorb it and continue.
            if self.check_scientific_notation()? {
                continue;
            }

            let curr_byte = self.formula.as_bytes()[self.offset];

            // A delimiter byte terminates any operand being accumulated.
            if is_token_ender(curr_byte) && self.has_token() {
                self.save_token();
                self.start_token();
            }

            match curr_byte {
                b'"' | b'\'' => self.parse_string()?,
                b'[' => self.parse_brackets()?,
                b'#' => self.parse_error()?,
                b' ' | b'\n' => self.parse_whitespace()?,
                b'+' | b'-' | b'*' | b'/' | b'^' | b'&' | b'=' | b'>' | b'<' | b'%' => {
                    self.parse_operator()?
                }
                b'{' | b'(' => self.parse_opener()?,
                b')' | b'}' => self.parse_closer()?,
                b';' | b',' => self.parse_separator()?,
                _ => {
                    // Ordinary operand byte: extend the current token.
                    // UTF-8 continuation bytes also fall through here, so
                    // token boundaries always land on char boundaries.
                    if !self.has_token() {
                        self.start_token();
                    }
                    self.offset += 1;
                    self.extend_token();
                }
            }
        }

        // Flush a trailing operand.
        if self.has_token() {
            self.save_token();
        }

        // Every pushed opener must have been matched by a closer.
        if !self.token_stack.is_empty() {
            return Err(TokenizerError {
                message: "Unmatched opening parenthesis or bracket".to_string(),
                pos: self.offset,
            });
        }

        Ok(())
    }

    /// If the cursor sits on a '+'/'-' that continues a scientific-notation
    /// number, absorbs the sign into the current token and returns
    /// `Ok(true)`; otherwise leaves all state untouched.
    fn check_scientific_notation(&mut self) -> Result<bool, TokenizerError> {
        if let Some(curr_byte) = self.current_byte() {
            if (curr_byte == b'+' || curr_byte == b'-')
                && self.has_token()
                && self.is_scientific_notation_base()
            {
                self.offset += 1;
                self.extend_token();
                return Ok(true);
            }
        }
        Ok(false)
    }

    /// True when the accumulated token looks like the mantissa of a
    /// scientific-notation literal: a leading digit, then digits with at
    /// most one '.', ending in 'E'/'e'.
    fn is_scientific_notation_base(&self) -> bool {
        if !self.has_token() {
            return false;
        }

        let token_slice = &self.formula.as_bytes()[self.token_start..self.token_end];
        if token_slice.len() < 2 {
            return false;
        }

        let last = token_slice[token_slice.len() - 1];
        if !(last == b'E' || last == b'e') {
            return false;
        }

        let first = token_slice[0];
        if !first.is_ascii_digit() {
            return false;
        }

        // Everything between the first digit and the trailing 'E' must be
        // digits with at most a single decimal point.
        let mut dot_seen = false;
        for &ch in &token_slice[1..token_slice.len() - 1] {
            match ch {
                b'0'..=b'9' => {}
                b'.' if !dot_seen => dot_seen = true,
                _ => return false,
            }
        }
        true
    }

    /// Emits the accumulated slice (if any) as an operand token.
    fn save_token(&mut self) {
        if self.has_token() {
            let token =
                Token::make_operand_from_slice(&self.formula, self.token_start, self.token_end);
            self.items.push(token);
        }
    }

    /// Consumes a quoted region at the cursor: '"' delimits a text operand,
    /// '\'' delimits a quoted sheet name that stays part of the surrounding
    /// reference token. A doubled delimiter is an escaped quote.
    fn parse_string(&mut self) -> Result<(), TokenizerError> {
        let delim = self.formula.as_bytes()[self.offset];
        assert!(delim == b'"' || delim == b'\'');

        // A current token consisting of exactly "$" before a quoted sheet
        // name is kept as part of the reference that follows.
        let is_dollar_ref = delim == b'\''
            && self.has_token()
            && self.token_end - self.token_start == 1
            && self.formula.as_bytes()[self.token_start] == b'$';

        if !is_dollar_ref && self.has_token() {
            // Flush a pending operand unless it ends in ':' — a range
            // reference continuing into a quoted sheet name keeps growing.
            if self.token_end > 0 && self.formula.as_bytes()[self.token_end - 1] != b':' {
                self.save_token();
                self.start_token();
            }
        }

        let string_start = if is_dollar_ref {
            self.token_start
        } else {
            self.offset
        };
        // Step past the opening delimiter.
        self.offset += 1;
        while self.offset < self.formula.len() {
            if self.formula.as_bytes()[self.offset] == delim {
                self.offset += 1;
                if self.offset < self.formula.len() && self.formula.as_bytes()[self.offset] == delim
                {
                    // Doubled delimiter: escaped quote, keep scanning.
                    self.offset += 1;
                } else {
                    if delim == b'"' {
                        // Closing '"': emit the quoted text as one operand.
                        let token = Token::make_operand_from_slice(
                            &self.formula,
                            string_start,
                            self.offset,
                        );
                        self.items.push(token);
                        self.start_token();
                    } else {
                        // Closing '\'': the sheet name continues the current
                        // reference token, so only its end moves forward.
                        self.token_end = self.offset;
                    }
                    return Ok(());
                }
            } else {
                self.offset += 1;
            }
        }

        Err(TokenizerError {
            message: "Reached end of formula while parsing string".to_string(),
            pos: self.offset,
        })
    }

    /// Consumes a '['..']' region, honoring nested bracket pairs; the whole
    /// region extends the current (reference) token.
    fn parse_brackets(&mut self) -> Result<(), TokenizerError> {
        assert_eq!(self.formula.as_bytes()[self.offset], b'[');

        if !self.has_token() {
            self.start_token();
        }

        // Track nesting depth so inner brackets don't close the region.
        let mut open_count = 1;
        self.offset += 1;

        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b'[' => open_count += 1,
                b']' => {
                    open_count -= 1;
                    if open_count == 0 {
                        self.offset += 1;
                        self.extend_token();
                        return Ok(());
                    }
                }
                _ => {}
            }
            self.offset += 1;
        }

        Err(TokenizerError {
            message: "Encountered unmatched '['".to_string(),
            pos: self.offset,
        })
    }

    /// Consumes an error literal such as "#REF!". A pending token ending in
    /// '!' (a sheet prefix like "Sheet1!") is kept as part of the operand.
    fn parse_error(&mut self) -> Result<(), TokenizerError> {
        if self.has_token()
            && self.token_end > 0
            && self.formula.as_bytes()[self.token_end - 1] != b'!'
        {
            self.save_token();
            self.start_token();
        }

        // After a possible flush, any remaining token is the '!'-terminated
        // prefix, so the operand starts there; otherwise at the cursor.
        let error_start = if self.has_token() {
            self.token_start
        } else {
            self.offset
        };

        for &err_code in ERROR_CODES {
            let err_bytes = err_code.as_bytes();
            if self.offset + err_bytes.len() <= self.formula.len() {
                let slice = &self.formula.as_bytes()[self.offset..self.offset + err_bytes.len()];
                if slice == err_bytes {
                    let token = Token::make_operand_from_slice(
                        &self.formula,
                        error_start,
                        self.offset + err_bytes.len(),
                    );
                    self.items.push(token);
                    self.offset += err_bytes.len();
                    self.start_token();
                    return Ok(());
                }
            }
        }

        Err(TokenizerError {
            message: format!("Invalid error code at position {}", self.offset),
            pos: self.offset,
        })
    }

    /// Consumes a run of spaces/newlines into one Whitespace token.
    /// Whitespace is preserved because it can be significant (it carries a
    /// precedence entry in `Token::get_precedence`).
    fn parse_whitespace(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let ws_start = self.offset;
        while self.offset < self.formula.len() {
            match self.formula.as_bytes()[self.offset] {
                b' ' | b'\n' => self.offset += 1,
                _ => break,
            }
        }

        self.items.push(Token::from_slice(
            &self.formula,
            TokenType::Whitespace,
            TokenSubType::None,
            ws_start,
            self.offset,
        ));
        self.start_token();
        Ok(())
    }

    /// Emits an operator token at the cursor, recognizing the two-character
    /// comparisons and disambiguating unary vs binary '+'/'-'.
    fn parse_operator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        // ">=", "<=", "<>" form single infix tokens.
        if self.offset + 1 < self.formula.len() {
            let two_char = &self.formula.as_bytes()[self.offset..self.offset + 2];
            if two_char == b">=" || two_char == b"<=" || two_char == b"<>" {
                self.items.push(Token::from_slice(
                    &self.formula,
                    TokenType::OpInfix,
                    TokenSubType::None,
                    self.offset,
                    self.offset + 2,
                ));
                self.offset += 2;
                self.start_token();
                return Ok(());
            }
        }

        let curr_byte = self.formula.as_bytes()[self.offset];
        let token_type = match curr_byte {
            b'%' => TokenType::OpPostfix,
            b'+' | b'-' => {
                // '+'/'-' is infix only when the previous non-whitespace
                // token can end an expression (operand, closer, or postfix
                // operator); otherwise it's a unary prefix.
                if self.items.is_empty() {
                    TokenType::OpPrefix
                } else {
                    let prev = self
                        .items
                        .iter()
                        .rev()
                        .find(|t| t.token_type != TokenType::Whitespace);
                    if let Some(p) = prev {
                        if p.subtype == TokenSubType::Close
                            || p.token_type == TokenType::OpPostfix
                            || p.token_type == TokenType::Operand
                        {
                            TokenType::OpInfix
                        } else {
                            TokenType::OpPrefix
                        }
                    } else {
                        TokenType::OpPrefix
                    }
                }
            }
            _ => TokenType::OpInfix,
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            TokenSubType::None,
            self.offset,
            self.offset + 1,
        ));
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Emits '(' or '{'. A '(' immediately after a pending token turns that
    /// token into a function opener spanning "NAME("; otherwise the byte is
    /// a plain paren or array opener.
    fn parse_opener(&mut self) -> Result<(), TokenizerError> {
        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b'(' || curr_byte == b'{');

        let token = if curr_byte == b'{' {
            self.save_token();
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        } else if self.has_token() {
            // Function call: the opener token covers name + '('.
            let token = Token::make_subexp_from_slice(
                &self.formula,
                true,
                self.token_start,
                self.offset + 1,
            );
            self.token_start = self.offset + 1;
            self.token_end = self.offset + 1;
            token
        } else {
            Token::make_subexp_from_slice(&self.formula, false, self.offset, self.offset + 1)
        };

        // Emit the opener and remember it so the matching closer can be
        // validated later.
        self.items.push(token.clone());
        self.token_stack.push(token);
        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Emits ')' or '}', validating it against the most recent opener.
    fn parse_closer(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b')' || curr_byte == b'}');

        if let Some(open_token) = self.token_stack.pop() {
            let closer = open_token.get_closer()?;
            // The byte found must agree with the closer the opener expects.
            if (curr_byte == b'}' && closer.value != "}")
                || (curr_byte == b')' && closer.value != ")")
            {
                return Err(TokenizerError {
                    message: "Mismatched ( and { pair".to_string(),
                    pos: self.offset,
                });
            }

            self.items.push(Token::from_slice(
                &self.formula,
                closer.token_type,
                TokenSubType::Close,
                self.offset,
                self.offset + 1,
            ));
        } else {
            return Err(TokenizerError {
                message: format!("No matching opener for closer at position {}", self.offset),
                pos: self.offset,
            });
        }

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Emits ',' or ';'. Classification depends on enclosing context (inside
    /// a function/array or not) and on the dialect: in OpenFormula ';'
    /// separates arguments, while in Excel it separates array rows.
    fn parse_separator(&mut self) -> Result<(), TokenizerError> {
        self.save_token();

        let curr_byte = self.formula.as_bytes()[self.offset];
        assert!(curr_byte == b';' || curr_byte == b',');

        let top_token = self.token_stack.last();
        let in_function_or_array = matches!(
            top_token.map(|t| t.token_type),
            Some(TokenType::Func | TokenType::Array)
        );
        let in_array = matches!(top_token.map(|t| t.token_type), Some(TokenType::Array));

        let (token_type, subtype) = match curr_byte {
            b',' => {
                if in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else {
                    // Outside any call: ',' is the union operator.
                    (TokenType::OpInfix, TokenSubType::None)
                }
            }
            b';' => {
                if in_array {
                    (TokenType::Sep, TokenSubType::Row)
                } else if self.dialect == FormulaDialect::OpenFormula && in_function_or_array {
                    (TokenType::Sep, TokenSubType::Arg)
                } else if self.dialect == FormulaDialect::OpenFormula {
                    (TokenType::OpInfix, TokenSubType::None)
                } else {
                    (TokenType::Sep, TokenSubType::Row)
                }
            }
            _ => (TokenType::OpInfix, TokenSubType::None),
        };

        self.items.push(Token::from_slice(
            &self.formula,
            token_type,
            subtype,
            self.offset,
            self.offset + 1,
        ));

        self.offset += 1;
        self.start_token();
        Ok(())
    }

    /// Reconstructs the formula text from the token stream: the literal
    /// as-is, or '=' followed by the concatenated token values.
    pub fn render(&self) -> String {
        if self.items.is_empty() {
            "".to_string()
        } else if self.items[0].token_type == TokenType::Literal {
            self.items[0].value.clone()
        } else {
            let concatenated: String = self.items.iter().map(|t| t.value.clone()).collect();
            format!("={concatenated}")
        }
    }

    /// The dialect this tokenizer was constructed with.
    pub fn dialect(&self) -> FormulaDialect {
        self.dialect
    }
}
897
898impl TryFrom<&str> for Tokenizer {
899 type Error = TokenizerError;
900
901 fn try_from(value: &str) -> Result<Self, Self::Error> {
902 Tokenizer::new(value)
903 }
904}
905
906impl TryFrom<String> for Tokenizer {
907 type Error = TokenizerError;
908
909 fn try_from(value: String) -> Result<Self, Self::Error> {
910 Tokenizer::new(&value)
911 }
912}