1use ratex_lexer::token::{SourceLocation, Token};
2use unicode_normalization::UnicodeNormalization;
3
4use crate::error::{ParseError, ParseResult};
5use crate::functions::{self, ArgType, FunctionContext, FUNCTIONS};
6use crate::macro_expander::{MacroExpander, IMPLICIT_COMMANDS};
7use crate::parse_node::{AtomFamily, Mode, ParseNode};
8
9static END_OF_EXPRESSION: &[&str] = &["}", "\\endgroup", "\\end", "\\right", "&"];
11
12pub struct Parser<'a> {
22 pub mode: Mode,
23 pub gullet: MacroExpander<'a>,
24 pub leftright_depth: i32,
25 next_token: Option<Token>,
26}
27
28impl<'a> Parser<'a> {
29 pub fn new(input: &'a str) -> Self {
30 Self {
31 mode: Mode::Math,
32 gullet: MacroExpander::new(input, Mode::Math),
33 leftright_depth: 0,
34 next_token: None,
35 }
36 }
37
38 pub fn fetch(&mut self) -> ParseResult<Token> {
42 if self.next_token.is_none() {
43 self.next_token = Some(self.gullet.expand_next_token()?);
44 }
45 Ok(self.next_token.clone().unwrap())
46 }
47
48 pub fn consume(&mut self) {
50 self.next_token = None;
51 }
52
53 pub fn expect(&mut self, text: &str, do_consume: bool) -> ParseResult<()> {
55 let tok = self.fetch()?;
56 if tok.text != text {
57 return Err(ParseError::new(
58 format!("Expected '{}', got '{}'", text, tok.text),
59 Some(&tok),
60 ));
61 }
62 if do_consume {
63 self.consume();
64 }
65 Ok(())
66 }
67
68 pub fn consume_spaces(&mut self) -> ParseResult<()> {
70 loop {
71 let tok = self.fetch()?;
72 if tok.text == " " {
73 self.consume();
74 } else {
75 break;
76 }
77 }
78 Ok(())
79 }
80
81 pub fn switch_mode(&mut self, new_mode: Mode) {
83 self.mode = new_mode;
84 self.gullet.switch_mode(new_mode);
85 }
86
87 pub fn parse(&mut self) -> ParseResult<Vec<ParseNode>> {
91 self.gullet.begin_group();
92
93 let result = self.parse_expression(false, None);
94
95 match result {
96 Ok(parse) => {
97 self.expect("EOF", true)?;
98 self.gullet.end_group();
99 Ok(parse)
100 }
101 Err(e) => {
102 self.gullet.end_groups();
103 Err(e)
104 }
105 }
106 }
107
108 pub fn parse_expression(
112 &mut self,
113 break_on_infix: bool,
114 break_on_token_text: Option<&str>,
115 ) -> ParseResult<Vec<ParseNode>> {
116 let mut body = Vec::new();
117
118 loop {
119 if self.mode == Mode::Math {
120 self.consume_spaces()?;
121 }
122
123 let lex = self.fetch()?;
124
125 if END_OF_EXPRESSION.contains(&lex.text.as_str()) {
126 break;
127 }
128 if let Some(break_text) = break_on_token_text {
129 if lex.text == break_text {
130 break;
131 }
132 }
133 if break_on_infix {
134 if let Some(func) = FUNCTIONS.get(lex.text.as_str()) {
135 if func.infix {
136 break;
137 }
138 }
139 }
140
141 let atom = self.parse_atom(break_on_token_text)?;
142
143 match atom {
144 None => break,
145 Some(node) if node.type_name() == "internal" => continue,
146 Some(node) => body.push(node),
147 }
148 }
149
150 if self.mode == Mode::Text {
151 self.form_ligatures(&mut body);
152 }
153
154 self.handle_infix_nodes(body)
155 }
156
157 fn handle_infix_nodes(&mut self, body: Vec<ParseNode>) -> ParseResult<Vec<ParseNode>> {
159 let mut over_index: Option<usize> = None;
160 let mut func_name: Option<String> = None;
161
162 for (i, node) in body.iter().enumerate() {
163 if let ParseNode::Infix { replace_with, .. } = node {
164 if over_index.is_some() {
165 return Err(ParseError::msg("only one infix operator per group"));
166 }
167 over_index = Some(i);
168 func_name = Some(replace_with.clone());
169 }
170 }
171
172 if let (Some(idx), Some(fname)) = (over_index, func_name) {
173 let numer_body: Vec<ParseNode> = body[..idx].to_vec();
174 let denom_body: Vec<ParseNode> = body[idx + 1..].to_vec();
175
176 let numer = if numer_body.len() == 1 {
177 if let ParseNode::OrdGroup { .. } = &numer_body[0] {
178 numer_body.into_iter().next().unwrap()
179 } else {
180 ParseNode::OrdGroup {
181 mode: self.mode,
182 body: numer_body,
183 semisimple: None,
184 loc: None,
185 }
186 }
187 } else {
188 ParseNode::OrdGroup {
189 mode: self.mode,
190 body: numer_body,
191 semisimple: None,
192 loc: None,
193 }
194 };
195
196 let denom = if denom_body.len() == 1 {
197 if let ParseNode::OrdGroup { .. } = &denom_body[0] {
198 denom_body.into_iter().next().unwrap()
199 } else {
200 ParseNode::OrdGroup {
201 mode: self.mode,
202 body: denom_body,
203 semisimple: None,
204 loc: None,
205 }
206 }
207 } else {
208 ParseNode::OrdGroup {
209 mode: self.mode,
210 body: denom_body,
211 semisimple: None,
212 loc: None,
213 }
214 };
215
216 let node = if fname == "\\\\abovefrac" {
217 let infix_node = body[idx].clone();
219 self.call_function(&fname, vec![numer, infix_node, denom], vec![], None, None)?
220 } else {
221 self.call_function(&fname, vec![numer, denom], vec![], None, None)?
222 };
223 Ok(vec![node])
224 } else {
225 Ok(body)
226 }
227 }
228
229 fn form_ligatures(&self, group: &mut Vec<ParseNode>) {
231 let mut i = 0;
232 while i + 1 < group.len() {
233 let a_text = group[i].symbol_text().map(|s| s.to_string());
234 let b_text = group[i + 1].symbol_text().map(|s| s.to_string());
235
236 if let (Some(a), Some(b)) = (a_text, b_text) {
237 if group[i].type_name() == "textord" && group[i + 1].type_name() == "textord" {
238 if a == "-" && b == "-" {
239 if i + 2 < group.len() {
240 if let Some(c) = group[i + 2].symbol_text() {
241 if c == "-" && group[i + 2].type_name() == "textord" {
242 group[i] = ParseNode::TextOrd {
243 mode: Mode::Text,
244 text: "---".to_string(),
245 loc: None,
246 };
247 group.remove(i + 2);
248 group.remove(i + 1);
249 continue;
250 }
251 }
252 }
253 group[i] = ParseNode::TextOrd {
254 mode: Mode::Text,
255 text: "--".to_string(),
256 loc: None,
257 };
258 group.remove(i + 1);
259 continue;
260 }
261 if (a == "'" || a == "`") && b == a {
262 group[i] = ParseNode::TextOrd {
263 mode: Mode::Text,
264 text: format!("{}{}", a, a),
265 loc: None,
266 };
267 group.remove(i + 1);
268 continue;
269 }
270 }
271 }
272 i += 1;
273 }
274 }
275
276 pub fn parse_atom(
280 &mut self,
281 break_on_token_text: Option<&str>,
282 ) -> ParseResult<Option<ParseNode>> {
283 let mut base = self.parse_group("atom", break_on_token_text)?;
284
285 if let Some(ref b) = base {
286 if b.type_name() == "internal" {
287 return Ok(base);
288 }
289 }
290
291 if self.mode == Mode::Text {
292 return Ok(base);
293 }
294
295 let mut superscript: Option<ParseNode> = None;
296 let mut subscript: Option<ParseNode> = None;
297
298 loop {
299 self.consume_spaces()?;
300 let lex = self.fetch()?;
301
302 if lex.text == "\\limits" || lex.text == "\\nolimits" {
303 let is_limits = lex.text == "\\limits";
304 self.consume();
305 if let Some(
306 ParseNode::Op { limits, .. }
307 | ParseNode::OperatorName { limits, .. },
308 ) = base.as_mut()
309 {
310 *limits = is_limits;
311 }
312 } else if lex.text == "^" {
313 if superscript.is_some() {
314 return Err(ParseError::new("Double superscript", Some(&lex)));
315 }
316 superscript = Some(self.handle_sup_subscript("superscript")?);
317 } else if lex.text == "_" {
318 if subscript.is_some() {
319 return Err(ParseError::new("Double subscript", Some(&lex)));
320 }
321 subscript = Some(self.handle_sup_subscript("subscript")?);
322 } else if lex.text == "'" {
323 if superscript.is_some() {
324 return Err(ParseError::new("Double superscript", Some(&lex)));
325 }
326 let prime = ParseNode::TextOrd {
327 mode: self.mode,
328 text: "\\prime".to_string(),
329 loc: None,
330 };
331 let mut primes = vec![prime.clone()];
332 self.consume();
333 while self.fetch()?.text == "'" {
334 primes.push(prime.clone());
335 self.consume();
336 }
337 if self.fetch()?.text == "^" {
338 primes.push(self.handle_sup_subscript("superscript")?);
339 }
340 superscript = Some(ParseNode::OrdGroup {
341 mode: self.mode,
342 body: primes,
343 semisimple: None,
344 loc: None,
345 });
346 } else if let Some((mapped, is_sub)) = lex
347 .text
348 .chars()
349 .next()
350 .and_then(crate::unicode_sup_sub::unicode_sub_sup)
351 {
352 if is_sub && subscript.is_some() {
353 return Err(ParseError::new("Double subscript", Some(&lex)));
354 }
355 if !is_sub && superscript.is_some() {
356 return Err(ParseError::new("Double superscript", Some(&lex)));
357 }
358 let mut subsup_tokens = vec![Token::new(mapped, 0, 0)];
360 self.consume();
361 loop {
362 let tok = self.fetch()?;
363 match tok
364 .text
365 .chars()
366 .next()
367 .and_then(crate::unicode_sup_sub::unicode_sub_sup)
368 {
369 Some((m, sub)) if sub == is_sub => {
370 subsup_tokens.insert(0, Token::new(m, 0, 0));
371 self.consume();
372 }
373 _ => break,
374 }
375 }
376 let body = self.subparse(subsup_tokens)?;
377 let group = ParseNode::OrdGroup {
378 mode: Mode::Math,
379 body,
380 semisimple: None,
381 loc: None,
382 };
383 if is_sub {
384 subscript = Some(group);
385 } else {
386 superscript = Some(group);
387 }
388 } else {
389 break;
390 }
391 }
392
393 if superscript.is_some() || subscript.is_some() {
394 Ok(Some(ParseNode::SupSub {
395 mode: self.mode,
396 base: base.map(Box::new),
397 sup: superscript.map(Box::new),
398 sub: subscript.map(Box::new),
399 loc: None,
400 }))
401 } else {
402 Ok(base)
403 }
404 }
405
406 fn handle_sup_subscript(&mut self, name: &str) -> ParseResult<ParseNode> {
408 let symbol_token = self.fetch()?;
409 self.consume();
410 self.consume_spaces()?;
411
412 let group = self.parse_group(name, None)?;
413 match group {
414 Some(g) if g.type_name() != "internal" => Ok(g),
415 Some(_) => {
416 let g2 = self.parse_group(name, None)?;
418 g2.ok_or_else(|| {
419 ParseError::new(
420 format!("Expected group after '{}'", symbol_token.text),
421 Some(&symbol_token),
422 )
423 })
424 }
425 None => Err(ParseError::new(
426 format!("Expected group after '{}'", symbol_token.text),
427 Some(&symbol_token),
428 )),
429 }
430 }
431
432 pub fn parse_group(
436 &mut self,
437 name: &str,
438 break_on_token_text: Option<&str>,
439 ) -> ParseResult<Option<ParseNode>> {
440 let first_token = self.fetch()?;
441 let text = first_token.text.clone();
442
443 if text == "{" || text == "\\begingroup" {
444 self.consume();
445 let group_end = if text == "{" { "}" } else { "\\endgroup" };
446
447 self.gullet.begin_group();
448 let expression = self.parse_expression(false, Some(group_end))?;
449 let last_token = self.fetch()?;
450 self.expect(group_end, true)?;
451 self.gullet.end_group();
452
453 let loc = Some(SourceLocation::range(&first_token.loc, &last_token.loc));
454 let semisimple = if text == "\\begingroup" {
455 Some(true)
456 } else {
457 None
458 };
459
460 Ok(Some(ParseNode::OrdGroup {
461 mode: self.mode,
462 body: expression,
463 semisimple,
464 loc,
465 }))
466 } else {
467 let result = self
468 .parse_function(break_on_token_text, Some(name))?
469 .or_else(|| self.parse_symbol_inner().ok().flatten());
470
471 if result.is_none()
472 && text.starts_with('\\')
473 && !IMPLICIT_COMMANDS.contains(&text.as_str())
474 {
475 return Err(ParseError::new(
476 format!("Undefined control sequence: {}", text),
477 Some(&first_token),
478 ));
479 }
480
481 Ok(result)
482 }
483 }
484
485 pub fn parse_function(
489 &mut self,
490 break_on_token_text: Option<&str>,
491 name: Option<&str>,
492 ) -> ParseResult<Option<ParseNode>> {
493 let token = self.fetch()?;
494 let func = token.text.clone();
495
496 let func_data = match FUNCTIONS.get(func.as_str()) {
497 Some(f) => f,
498 None => return Ok(None),
499 };
500
501 self.consume();
502
503 if let Some(n) = name {
504 if n != "atom" && !func_data.allowed_in_argument {
505 return Err(ParseError::new(
506 format!("Got function '{}' with no arguments as {}", func, n),
507 Some(&token),
508 ));
509 }
510 }
511
512 functions::check_mode_compatibility(func_data, self.mode, &func, Some(&token))?;
513
514 if func == "\\hspace" {
518 self.gullet.consume_spaces();
519 if self.gullet.future().text == "*" {
520 self.gullet.pop_token();
521 }
522 }
523
524 let (args, opt_args) = self.parse_arguments(&func, func_data)?;
525
526 self.call_function(
527 &func,
528 args,
529 opt_args,
530 Some(token),
531 break_on_token_text.map(|s| s.to_string()).as_deref(),
532 )
533 .map(Some)
534 }
535
536 pub fn call_function(
538 &mut self,
539 name: &str,
540 args: Vec<ParseNode>,
541 opt_args: Vec<Option<ParseNode>>,
542 token: Option<Token>,
543 break_on_token_text: Option<&str>,
544 ) -> ParseResult<ParseNode> {
545 let func = FUNCTIONS.get(name).ok_or_else(|| {
546 ParseError::msg(format!("No function handler for {}", name))
547 })?;
548
549 let mut ctx = FunctionContext {
550 func_name: name.to_string(),
551 parser: self,
552 token: token.clone(),
553 break_on_token_text: break_on_token_text.map(|s| s.to_string()),
554 };
555
556 (func.handler)(&mut ctx, args, opt_args)
557 }
558
559 pub fn parse_arguments(
561 &mut self,
562 func: &str,
563 func_data: &functions::FunctionSpec,
564 ) -> ParseResult<(Vec<ParseNode>, Vec<Option<ParseNode>>)> {
565 let total_args = func_data.num_args + func_data.num_optional_args;
566 if total_args == 0 {
567 return Ok((Vec::new(), Vec::new()));
568 }
569
570 let mut args = Vec::new();
571 let mut opt_args = Vec::new();
572
573 for i in 0..total_args {
574 let arg_type = func_data
575 .arg_types
576 .as_ref()
577 .and_then(|types| types.get(i).copied());
578 let is_optional = i < func_data.num_optional_args;
579
580 let effective_type = if (func_data.primitive && arg_type.is_none())
581 || (func_data.node_type == "sqrt" && i == 1
582 && opt_args.first().is_some_and(|o: &Option<ParseNode>| o.is_none()))
583 {
584 Some(ArgType::Primitive)
585 } else {
586 arg_type
587 };
588
589 let arg = self.parse_group_of_type(
590 &format!("argument to '{}'", func),
591 effective_type,
592 is_optional,
593 )?;
594
595 if is_optional {
596 opt_args.push(arg);
597 } else if let Some(a) = arg {
598 args.push(a);
599 } else {
600 return Err(ParseError::msg("Null argument, please report this as a bug"));
601 }
602 }
603
604 Ok((args, opt_args))
605 }
606
607 fn parse_group_of_type(
609 &mut self,
610 name: &str,
611 arg_type: Option<ArgType>,
612 optional: bool,
613 ) -> ParseResult<Option<ParseNode>> {
614 match arg_type {
615 Some(ArgType::Color) => self.parse_color_group(optional),
616 Some(ArgType::Size) => self.parse_size_group(optional),
617 Some(ArgType::Primitive) => {
618 if optional {
619 return Err(ParseError::msg("A primitive argument cannot be optional"));
620 }
621 let group = self.parse_group(name, None)?;
622 match group {
623 Some(g) => Ok(Some(g)),
624 None => Err(ParseError::new(
625 format!("Expected group as {}", name),
626 None,
627 )),
628 }
629 }
630 Some(ArgType::Math) | Some(ArgType::Text) => {
631 let mode = match arg_type {
632 Some(ArgType::Math) => Some(Mode::Math),
633 Some(ArgType::Text) => Some(Mode::Text),
634 _ => None,
635 };
636 self.parse_argument_group(optional, mode)
637 }
638 Some(ArgType::HBox) => {
639 let group = self.parse_argument_group(optional, Some(Mode::Text))?;
640 match group {
641 Some(g) => Ok(Some(ParseNode::Styling {
642 mode: g.mode(),
643 style: crate::parse_node::StyleStr::Text,
644 body: vec![g],
645 loc: None,
646 })),
647 None => Ok(None),
648 }
649 }
650 Some(ArgType::Raw) => {
651 let token = self.parse_string_group("raw", optional)?;
652 match token {
653 Some(t) => Ok(Some(ParseNode::Raw {
654 mode: Mode::Text,
655 string: t.text,
656 loc: None,
657 })),
658 None => Ok(None),
659 }
660 }
661 Some(ArgType::Url) => self.parse_url_group(optional),
662 None | Some(ArgType::Original) => self.parse_argument_group(optional, None),
663 }
664 }
665
666 fn parse_color_group(&mut self, optional: bool) -> ParseResult<Option<ParseNode>> {
668 let res = self.parse_string_group("color", optional)?;
669 match res {
670 None => Ok(None),
671 Some(token) => {
672 let text = token.text.trim().to_string();
673 let re = regex_lite::Regex::new(
674 r"^(#[a-fA-F0-9]{3,4}|#[a-fA-F0-9]{6}|#[a-fA-F0-9]{8}|[a-fA-F0-9]{6}|[a-zA-Z]+|\d+(\.\d+)?(,\d+(\.\d+)?)*)$",
675 )
676 .unwrap();
677
678 if !re.is_match(&text) {
679 return Err(ParseError::new(
680 format!("Invalid color: '{}'", text),
681 Some(&token),
682 ));
683 }
684 let mut color = text;
685 if regex_lite::Regex::new(r"^[0-9a-fA-F]{6}$")
686 .unwrap()
687 .is_match(&color)
688 {
689 color = format!("#{}", color);
690 }
691
692 Ok(Some(ParseNode::ColorToken {
693 mode: self.mode,
694 color,
695 loc: None,
696 }))
697 }
698 }
699 }
700
701 pub fn parse_size_group(&mut self, optional: bool) -> ParseResult<Option<ParseNode>> {
703 let mut is_blank = false;
704
705 self.gullet.consume_spaces();
706 let res = if !optional && self.gullet.future().text != "{" {
707 Some(self.parse_regex_group(
708 ®ex_lite::Regex::new(r"^[-+]? *(?:$|\d+|\d+\.\d*|\.\d*) *[a-z]{0,2} *$")
709 .unwrap(),
710 "size",
711 )?)
712 } else {
713 self.parse_string_group("size", optional)?
714 };
715
716 let res = match res {
717 Some(r) => r,
718 None => return Ok(None),
719 };
720
721 let mut text = res.text.clone();
722 if !optional && text.is_empty() {
723 text = "0pt".to_string();
724 is_blank = true;
725 }
726
727 let size_re =
728 regex_lite::Regex::new(r"([-+]?) *(\d+(?:\.\d*)?|\.\d+) *([a-z]{2})").unwrap();
729 let m = size_re.captures(&text).ok_or_else(|| {
730 ParseError::new(format!("Invalid size: '{}'", text), Some(&res))
731 })?;
732
733 let sign = m.get(1).map_or("", |m| m.as_str());
734 let magnitude = m.get(2).map_or("", |m| m.as_str());
735 let unit = m.get(3).map_or("", |m| m.as_str());
736
737 let number: f64 = format!("{}{}", sign, magnitude).parse().unwrap_or(0.0);
738
739 if !is_valid_unit(unit) {
740 return Err(ParseError::new(
741 format!("Invalid unit: '{}'", unit),
742 Some(&res),
743 ));
744 }
745
746 Ok(Some(ParseNode::Size {
747 mode: self.mode,
748 value: crate::parse_node::Measurement {
749 number,
750 unit: unit.to_string(),
751 },
752 is_blank,
753 loc: None,
754 }))
755 }
756
757 fn parse_url_group(&mut self, optional: bool) -> ParseResult<Option<ParseNode>> {
760 self.gullet.lexer.set_catcode('%', 13);
761 self.gullet.lexer.set_catcode('~', 12);
762 let res = self.parse_string_group("url", optional);
763 self.gullet.lexer.set_catcode('%', 14);
764 self.gullet.lexer.set_catcode('~', 13);
765 let res = res?;
766 match res {
767 None => Ok(None),
768 Some(token) => {
769 let url = token.text;
770 Ok(Some(ParseNode::Url {
771 mode: self.mode,
772 url,
773 loc: None,
774 }))
775 }
776 }
777 }
778
779 fn parse_string_group(
781 &mut self,
782 _mode_name: &str,
783 optional: bool,
784 ) -> ParseResult<Option<Token>> {
785 let arg_token = self.gullet.scan_argument(optional)?;
786 let arg_token = match arg_token {
787 Some(t) => t,
788 None => return Ok(None),
789 };
790
791 let mut s = String::new();
792 loop {
793 let next = self.fetch()?;
794 if next.text == "EOF" {
795 break;
796 }
797 s.push_str(&next.text);
798 self.consume();
799 }
800 self.consume(); let mut result = arg_token;
803 result.text = s;
804 Ok(Some(result))
805 }
806
807 fn parse_regex_group(
809 &mut self,
810 regex: ®ex_lite::Regex,
811 mode_name: &str,
812 ) -> ParseResult<Token> {
813 let first_token = self.fetch()?;
814 let mut last_token = first_token.clone();
815 let mut s = String::new();
816
817 loop {
818 let next = self.fetch()?;
819 if next.text == "EOF" {
820 break;
821 }
822 let candidate = format!("{}{}", s, next.text);
823 if regex.is_match(&candidate) {
824 last_token = next;
825 s = candidate;
826 self.consume();
827 } else {
828 break;
829 }
830 }
831
832 if s.is_empty() {
833 return Err(ParseError::new(
834 format!("Invalid {}: '{}'", mode_name, first_token.text),
835 Some(&first_token),
836 ));
837 }
838
839 Ok(first_token.range(&last_token, s))
840 }
841
842 pub fn parse_argument_group(
844 &mut self,
845 optional: bool,
846 mode: Option<Mode>,
847 ) -> ParseResult<Option<ParseNode>> {
848 let arg_token = self.gullet.scan_argument(optional)?;
849 let arg_token = match arg_token {
850 Some(t) => t,
851 None => return Ok(None),
852 };
853
854 let outer_mode = self.mode;
855 if let Some(m) = mode {
856 self.switch_mode(m);
857 }
858
859 self.gullet.begin_group();
860 let expression = self.parse_expression(false, Some("EOF"))?;
861 self.expect("EOF", true)?;
862 self.gullet.end_group();
863
864 let result = ParseNode::OrdGroup {
865 mode: self.mode,
866 loc: Some(arg_token.loc.clone()),
867 body: expression,
868 semisimple: None,
869 };
870
871 if mode.is_some() {
872 self.switch_mode(outer_mode);
873 }
874
875 Ok(Some(result))
876 }
877
878 fn parse_symbol_inner(&mut self) -> ParseResult<Option<ParseNode>> {
882 let nucleus = self.fetch()?;
883 let text = nucleus.text.clone();
884
885 if let Some(stripped) = text.strip_prefix("\\verb") {
886 self.consume();
887 let arg = stripped.to_string();
888 let star = arg.starts_with('*');
889 let arg = if star { &arg[1..] } else { &arg };
890
891 if arg.len() < 2 {
892 return Err(ParseError::new("\\verb assertion failed", Some(&nucleus)));
893 }
894 let body = arg[1..arg.len() - 1].to_string();
895 return Ok(Some(ParseNode::Verb {
896 mode: Mode::Text,
897 body,
898 star,
899 loc: Some(nucleus.loc.clone()),
900 }));
901 }
902
903 let font_mode = match self.mode {
904 Mode::Math => ratex_font::symbols::Mode::Math,
905 Mode::Text => ratex_font::symbols::Mode::Text,
906 };
907
908 if text == "^" || text == "_" {
910 return Ok(None);
911 }
912
913 if text == "\\" {
915 return Ok(None);
916 }
917
918 if let Some(sym_info) = ratex_font::symbols::get_symbol(&text, font_mode) {
919 let loc = Some(SourceLocation::range(&nucleus.loc, &nucleus.loc));
920 let group = sym_info.group;
921
922 let node = if group.is_atom() {
923 let family = match group {
924 ratex_font::symbols::Group::Bin => AtomFamily::Bin,
925 ratex_font::symbols::Group::Close => AtomFamily::Close,
926 ratex_font::symbols::Group::Inner => AtomFamily::Inner,
927 ratex_font::symbols::Group::Open => AtomFamily::Open,
928 ratex_font::symbols::Group::Punct => AtomFamily::Punct,
929 ratex_font::symbols::Group::Rel => AtomFamily::Rel,
930 _ => unreachable!(),
931 };
932 ParseNode::Atom {
933 mode: self.mode,
934 family,
935 text: text.clone(),
936 loc,
937 }
938 } else {
939 match group {
940 ratex_font::symbols::Group::MathOrd => ParseNode::MathOrd {
941 mode: self.mode,
942 text: text.clone(),
943 loc,
944 },
945 ratex_font::symbols::Group::TextOrd => ParseNode::TextOrd {
946 mode: self.mode,
947 text: text.clone(),
948 loc,
949 },
950 ratex_font::symbols::Group::OpToken => ParseNode::OpToken {
951 mode: self.mode,
952 text: text.clone(),
953 loc,
954 },
955 ratex_font::symbols::Group::AccentToken => ParseNode::AccentToken {
956 mode: self.mode,
957 text: text.clone(),
958 loc,
959 },
960 ratex_font::symbols::Group::Spacing => ParseNode::SpacingNode {
961 mode: self.mode,
962 text: text.clone(),
963 loc,
964 },
965 _ => ParseNode::MathOrd {
966 mode: self.mode,
967 text: text.clone(),
968 loc,
969 },
970 }
971 };
972
973 self.consume();
974 return Ok(Some(node));
975 }
976
977 if let Some(node) = self.try_parse_unicode_accent(&text, &nucleus)? {
980 self.consume();
981 return Ok(Some(node));
982 }
983
984 let first_char = text.chars().next();
987 if let Some(ch) = first_char {
988 if ch as u32 >= 0x80 {
989 let node = ParseNode::TextOrd {
990 mode: Mode::Text,
991 text: text.clone(),
992 loc: Some(SourceLocation::range(&nucleus.loc, &nucleus.loc)),
993 };
994 self.consume();
995 return Ok(Some(node));
996 }
997 }
998
999 Ok(None)
1000 }
1001
1002 fn try_parse_unicode_accent(
1006 &self,
1007 text: &str,
1008 nucleus: &Token,
1009 ) -> ParseResult<Option<ParseNode>> {
1010 let nfd: String = text.nfd().collect();
1011 let chars: Vec<char> = nfd.chars().collect();
1012
1013 if chars.len() < 2 {
1014 return Ok(None);
1015 }
1016
1017 let mut split_idx = chars.len() - 1;
1019 while split_idx > 0 && is_supported_combining_accent(chars[split_idx]) {
1020 split_idx -= 1;
1021 }
1022
1023 if split_idx == chars.len() - 1 {
1025 return Ok(None);
1026 }
1027
1028 let base_char = chars[0];
1030 if !is_latin_base_char(base_char) {
1031 return Ok(None);
1032 }
1033
1034 let loc = Some(SourceLocation::range(&nucleus.loc, &nucleus.loc));
1035
1036 let mut base_str: String = chars[..split_idx + 1].iter().collect();
1038
1039 if base_str.len() == 1 {
1041 match base_str.as_str() {
1042 "i" => base_str = "\u{0131}".to_string(), "j" => base_str = "\u{0237}".to_string(), _ => {}
1045 }
1046 }
1047
1048 let font_mode = match self.mode {
1049 Mode::Math => ratex_font::symbols::Mode::Math,
1050 Mode::Text => ratex_font::symbols::Mode::Text,
1051 };
1052
1053 let mut node = if base_str.chars().count() == 1 {
1054 let ch = base_str.chars().next().unwrap();
1055 if let Some(sym) = ratex_font::symbols::get_symbol(&base_str, font_mode) {
1056 match sym.group {
1057 ratex_font::symbols::Group::TextOrd => ParseNode::TextOrd {
1058 mode: self.mode,
1059 text: base_str.clone(),
1060 loc: loc.clone(),
1061 },
1062 _ => ParseNode::MathOrd {
1063 mode: self.mode,
1064 text: base_str.clone(),
1065 loc: loc.clone(),
1066 },
1067 }
1068 } else if (ch as u32) >= 0x80 {
1069 ParseNode::TextOrd {
1071 mode: Mode::Text,
1072 text: base_str.clone(),
1073 loc: loc.clone(),
1074 }
1075 } else {
1076 ParseNode::MathOrd {
1077 mode: self.mode,
1078 text: base_str.clone(),
1079 loc: loc.clone(),
1080 }
1081 }
1082 } else {
1083 return self.try_parse_unicode_accent(&base_str, nucleus).map(|opt| {
1084 opt.or_else(|| {
1085 Some(ParseNode::TextOrd {
1086 mode: Mode::Text,
1087 text: base_str.clone(),
1088 loc: loc.clone(),
1089 })
1090 })
1091 });
1092 };
1093
1094 for &combining in &chars[split_idx + 1..] {
1096 let label = combining_to_accent_label(combining, self.mode);
1097 node = ParseNode::Accent {
1098 mode: self.mode,
1099 label,
1100 is_stretchy: Some(false),
1101 is_shifty: Some(true),
1102 base: Box::new(node),
1103 loc: loc.clone(),
1104 };
1105 }
1106
1107 Ok(Some(node))
1108 }
1109
1110 pub fn subparse(&mut self, tokens: Vec<Token>) -> ParseResult<Vec<ParseNode>> {
1112 let old_token = self.next_token.take();
1113
1114 self.gullet
1115 .push_token(Token::new("}", 0, 0));
1116 self.gullet.push_tokens(tokens);
1117 let parse = self.parse_expression(false, None)?;
1118 self.expect("}", true)?;
1119
1120 self.next_token = old_token;
1121 Ok(parse)
1122 }
1123}
1124
1125fn is_latin_base_char(ch: char) -> bool {
1126 matches!(ch,
1127 'A'..='Z' | 'a'..='z'
1128 | '\u{0131}' | '\u{0237}' | '\u{00C6}' | '\u{00D0}' | '\u{00D8}' | '\u{00DE}' | '\u{00DF}' | '\u{00E6}' | '\u{00F0}' | '\u{00F8}' | '\u{00FE}' )
1140}
1141
1142fn is_supported_combining_accent(ch: char) -> bool {
1143 matches!(
1144 ch,
1145 '\u{0300}' | '\u{0301}' | '\u{0302}' | '\u{0303}' | '\u{0304}'
1146 | '\u{0306}' | '\u{0307}' | '\u{0308}' | '\u{030A}' | '\u{030B}' | '\u{030C}'
1147 | '\u{0327}'
1148 )
1149}
1150
1151fn combining_to_accent_label(ch: char, mode: Mode) -> String {
1152 match mode {
1153 Mode::Math => match ch {
1154 '\u{0300}' => "\\grave".to_string(),
1155 '\u{0301}' => "\\acute".to_string(),
1156 '\u{0302}' => "\\hat".to_string(),
1157 '\u{0303}' => "\\tilde".to_string(),
1158 '\u{0304}' => "\\bar".to_string(),
1159 '\u{0306}' => "\\breve".to_string(),
1160 '\u{0307}' => "\\dot".to_string(),
1161 '\u{0308}' => "\\ddot".to_string(),
1162 '\u{030A}' => "\\mathring".to_string(),
1163 '\u{030B}' => "\\H".to_string(),
1164 '\u{030C}' => "\\check".to_string(),
1165 '\u{0327}' => "\\c".to_string(),
1166 _ => format!("\\char\"{:X}", ch as u32),
1167 },
1168 Mode::Text => match ch {
1169 '\u{0300}' => "\\`".to_string(),
1170 '\u{0301}' => "\\'".to_string(),
1171 '\u{0302}' => "\\^".to_string(),
1172 '\u{0303}' => "\\~".to_string(),
1173 '\u{0304}' => "\\=".to_string(),
1174 '\u{0306}' => "\\u".to_string(),
1175 '\u{0307}' => "\\.".to_string(),
1176 '\u{0308}' => "\\\"".to_string(),
1177 '\u{030A}' => "\\r".to_string(),
1178 '\u{030B}' => "\\H".to_string(),
1179 '\u{030C}' => "\\v".to_string(),
1180 '\u{0327}' => "\\c".to_string(),
1181 _ => format!("\\char\"{:X}", ch as u32),
1182 },
1183 }
1184}
1185
1186fn is_valid_unit(unit: &str) -> bool {
1187 matches!(
1188 unit,
1189 "pt" | "mm" | "cm" | "in" | "bp" | "pc" | "dd" | "cc" | "nd" | "nc" | "sp" | "px"
1190 | "ex" | "em" | "mu"
1191 )
1192}
1193
1194fn strip_outer_math_delimiters(input: &str) -> &str {
1198 let s = input.trim();
1199 if s.len() >= 4 && s.starts_with("$$") && s.ends_with("$$") {
1200 return s[2..s.len() - 2].trim();
1201 }
1202 if s.len() >= 2 && s.starts_with('$') && s.ends_with('$') {
1203 return s[1..s.len() - 1].trim();
1204 }
1205 s
1206}
1207
1208pub fn parse(input: &str) -> ParseResult<Vec<ParseNode>> {
1210 Parser::new(strip_outer_math_delimiters(input)).parse()
1211}
1212
1213#[cfg(test)]
1214mod tests {
1215 use super::*;
1216
1217 #[test]
1218 fn test_parse_single_char() {
1219 let result = parse("x").unwrap();
1220 assert_eq!(result.len(), 1);
1221 assert_eq!(result[0].type_name(), "mathord");
1222 }
1223
1224 #[test]
1225 fn test_parse_strips_outer_dollar_inline_math() {
1226 let inner = r"C_p[\ce{H2O(l)}] = \pu{75.3 J // mol K}";
1227 let wrapped = format!("${inner}$");
1228 let a = parse(&wrapped).expect("wrapped");
1229 let b = parse(inner).expect("inner");
1230 assert_eq!(a.len(), b.len());
1231 for (x, y) in a.iter().zip(b.iter()) {
1232 assert_eq!(x.type_name(), y.type_name());
1233 }
1234 }
1235
1236 #[test]
1237 fn test_parse_addition() {
1238 let result = parse("a+b").unwrap();
1239 assert_eq!(result.len(), 3);
1240 assert_eq!(result[0].type_name(), "mathord"); assert_eq!(result[1].type_name(), "atom"); assert_eq!(result[2].type_name(), "mathord"); }
1244
1245 #[test]
1246 fn test_parse_superscript() {
1247 let result = parse("x^2").unwrap();
1248 assert_eq!(result.len(), 1);
1249 assert_eq!(result[0].type_name(), "supsub");
1250 }
1251
1252 #[test]
1253 fn test_parse_subscript() {
1254 let result = parse("a_i").unwrap();
1255 assert_eq!(result.len(), 1);
1256 assert_eq!(result[0].type_name(), "supsub");
1257 }
1258
1259 #[test]
1260 fn test_parse_supsub() {
1261 let result = parse("x^2_i").unwrap();
1262 assert_eq!(result.len(), 1);
1263 assert_eq!(result[0].type_name(), "supsub");
1264 if let ParseNode::SupSub { sup, sub, .. } = &result[0] {
1265 assert!(sup.is_some());
1266 assert!(sub.is_some());
1267 } else {
1268 panic!("Expected SupSub");
1269 }
1270 }
1271
1272 #[test]
1273 fn test_parse_group() {
1274 let result = parse("{a+b}").unwrap();
1275 assert_eq!(result.len(), 1);
1276 assert_eq!(result[0].type_name(), "ordgroup");
1277 }
1278
1279 #[test]
1280 fn test_parse_frac() {
1281 let result = parse("\\frac{a}{b}").unwrap();
1282 assert_eq!(result.len(), 1);
1283 assert_eq!(result[0].type_name(), "genfrac");
1284 }
1285
1286 #[test]
1287 fn test_parse_sqrt() {
1288 let result = parse("\\sqrt{x}").unwrap();
1289 assert_eq!(result.len(), 1);
1290 assert_eq!(result[0].type_name(), "sqrt");
1291 }
1292
1293 #[test]
1294 fn test_parse_sqrt_optional() {
1295 let result = parse("\\sqrt[3]{x}").unwrap();
1296 assert_eq!(result.len(), 1);
1297 if let ParseNode::Sqrt { index, .. } = &result[0] {
1298 assert!(index.is_some());
1299 } else {
1300 panic!("Expected Sqrt");
1301 }
1302 }
1303
1304 #[test]
1305 fn test_parse_nested() {
1306 let result = parse("\\frac{\\sqrt{a^2+b^2}}{c}").unwrap();
1307 assert_eq!(result.len(), 1);
1308 assert_eq!(result[0].type_name(), "genfrac");
1309 }
1310
1311 #[test]
1312 fn test_parse_empty() {
1313 let result = parse("").unwrap();
1314 assert_eq!(result.len(), 0);
1315 }
1316
1317 #[test]
1318 fn test_parse_double_superscript_error() {
1319 let result = parse("x^2^3");
1320 assert!(result.is_err());
1321 }
1322
1323 #[test]
1324 fn test_parse_unclosed_brace_error() {
1325 let result = parse("{x");
1326 assert!(result.is_err());
1327 }
1328
1329 #[test]
1330 fn test_parse_json_output() {
1331 let result = parse("x^2").unwrap();
1332 let json = serde_json::to_string_pretty(&result).unwrap();
1333 assert!(json.contains("supsub"));
1334 }
1335}