ratex_parser/
parser.rs

1use ratex_lexer::token::{SourceLocation, Token};
2use unicode_normalization::UnicodeNormalization;
3
4use crate::error::{ParseError, ParseResult};
5use crate::functions::{self, ArgType, FunctionContext, FUNCTIONS};
6use crate::macro_expander::{MacroExpander, IMPLICIT_COMMANDS};
7use crate::parse_node::{AtomFamily, Mode, ParseNode};
8
/// End-of-expression tokens.
///
/// `Parser::parse_expression` stops (without consuming the lookahead) as soon
/// as the next token's text equals one of these entries.
static END_OF_EXPRESSION: &[&str] = &["}", "\\endgroup", "\\end", "\\right", "&"];
11
/// The LaTeX parser. Converts a token stream into a ParseNode AST.
///
/// Follows KaTeX's Parser.ts closely:
/// - `parse()` → parse full expression
/// - `parseExpression()` → parse a list of atoms
/// - `parseAtom()` → parse one atom with optional super/subscripts
/// - `parseGroup()` → parse a group (braced or single token)
/// - `parseFunction()` → parse a function call with arguments
/// - `parseSymbol()` → parse a single symbol
pub struct Parser<'a> {
    /// Current parsing mode; starts as `Mode::Math` and is changed through
    /// `switch_mode`, which keeps the gullet in sync.
    pub mode: Mode,
    /// Macro expander that supplies the (expanded) tokens read by `fetch`.
    pub gullet: MacroExpander<'a>,
    /// Starts at 0. NOTE(review): presumably the `\left`/`\right` nesting
    /// depth maintained by function handlers — confirm against its users.
    pub leftright_depth: i32,
    /// One-token lookahead cache: `None` means the next `fetch` must pull a
    /// fresh token from the gullet; `consume` resets it to `None`.
    next_token: Option<Token>,
    /// Starts at 0. NOTE(review): presumably counts numbered equations —
    /// confirm against its users.
    pub equation_counter: usize,
}
28
29impl<'a> Parser<'a> {
30    pub fn new(input: &'a str) -> Self {
31        Self {
32            mode: Mode::Math,
33            gullet: MacroExpander::new(input, Mode::Math),
34            leftright_depth: 0,
35            next_token: None,
36            equation_counter: 0,
37        }
38    }
39
40    // ── Token management ────────────────────────────────────────────────
41
42    /// Return the current lookahead token (fetching from gullet if needed).
43    pub fn fetch(&mut self) -> ParseResult<Token> {
44        if self.next_token.is_none() {
45            self.next_token = Some(self.gullet.expand_next_token()?);
46        }
47        Ok(self.next_token.clone().unwrap())
48    }
49
50    /// Discard the current lookahead token.
51    pub fn consume(&mut self) {
52        self.next_token = None;
53    }
54
55    /// Expect the next token to have the given text, consuming it.
56    pub fn expect(&mut self, text: &str, do_consume: bool) -> ParseResult<()> {
57        let tok = self.fetch()?;
58        if tok.text != text {
59            return Err(ParseError::new(
60                format!("Expected '{}', got '{}'", text, tok.text),
61                Some(&tok),
62            ));
63        }
64        if do_consume {
65            self.consume();
66        }
67        Ok(())
68    }
69
70    /// Consume spaces in math mode.
71    pub fn consume_spaces(&mut self) -> ParseResult<()> {
72        loop {
73            let tok = self.fetch()?;
74            if tok.text == " " {
75                self.consume();
76            } else {
77                break;
78            }
79        }
80        Ok(())
81    }
82
83    /// Switch between "math" and "text" modes.
84    pub fn switch_mode(&mut self, new_mode: Mode) {
85        self.mode = new_mode;
86        self.gullet.switch_mode(new_mode);
87    }
88
89    // ── Main parse entry ────────────────────────────────────────────────
90
91    /// Parse the entire input and return the AST.
92    pub fn parse(&mut self) -> ParseResult<Vec<ParseNode>> {
93        self.gullet.begin_group();
94
95        let result = self.parse_expression(false, None);
96
97        match result {
98            Ok(parse) => {
99                self.expect("EOF", true)?;
100                self.gullet.end_group();
101                Ok(parse)
102            }
103            Err(e) => {
104                self.gullet.end_groups();
105                Err(e)
106            }
107        }
108    }
109
110    // ── Expression parsing ──────────────────────────────────────────────
111
112    /// Parse an expression: a list of atoms.
113    pub fn parse_expression(
114        &mut self,
115        break_on_infix: bool,
116        break_on_token_text: Option<&str>,
117    ) -> ParseResult<Vec<ParseNode>> {
118        let mut body = Vec::new();
119
120        loop {
121            if self.mode == Mode::Math {
122                self.consume_spaces()?;
123            }
124
125            let lex = self.fetch()?;
126
127            if END_OF_EXPRESSION.contains(&lex.text.as_str()) {
128                break;
129            }
130            if let Some(break_text) = break_on_token_text {
131                if lex.text == break_text {
132                    break;
133                }
134            }
135            if break_on_infix {
136                if let Some(func) = FUNCTIONS.get(lex.text.as_str()) {
137                    if func.infix {
138                        break;
139                    }
140                }
141            }
142
143            let atom = self.parse_atom(break_on_token_text)?;
144
145            match atom {
146                None => break,
147                Some(node) if node.type_name() == "internal" => continue,
148                Some(node) => body.push(node),
149            }
150        }
151
152        if self.mode == Mode::Text {
153            self.form_ligatures(&mut body);
154        }
155
156        self.handle_infix_nodes(body)
157    }
158
159    /// Rewrite infix operators (e.g. \over → \frac).
160    fn handle_infix_nodes(&mut self, body: Vec<ParseNode>) -> ParseResult<Vec<ParseNode>> {
161        let mut over_index: Option<usize> = None;
162        let mut func_name: Option<String> = None;
163
164        for (i, node) in body.iter().enumerate() {
165            if let ParseNode::Infix { replace_with, .. } = node {
166                if over_index.is_some() {
167                    return Err(ParseError::msg("only one infix operator per group"));
168                }
169                over_index = Some(i);
170                func_name = Some(replace_with.clone());
171            }
172        }
173
174        if let (Some(idx), Some(fname)) = (over_index, func_name) {
175            let numer_body: Vec<ParseNode> = body[..idx].to_vec();
176            let denom_body: Vec<ParseNode> = body[idx + 1..].to_vec();
177
178            let numer = if numer_body.len() == 1 {
179                if let ParseNode::OrdGroup { .. } = &numer_body[0] {
180                    numer_body.into_iter().next().unwrap()
181                } else {
182                    ParseNode::OrdGroup {
183                        mode: self.mode,
184                        body: numer_body,
185                        semisimple: None,
186                        loc: None,
187                    }
188                }
189            } else {
190                ParseNode::OrdGroup {
191                    mode: self.mode,
192                    body: numer_body,
193                    semisimple: None,
194                    loc: None,
195                }
196            };
197
198            let denom = if denom_body.len() == 1 {
199                if let ParseNode::OrdGroup { .. } = &denom_body[0] {
200                    denom_body.into_iter().next().unwrap()
201                } else {
202                    ParseNode::OrdGroup {
203                        mode: self.mode,
204                        body: denom_body,
205                        semisimple: None,
206                        loc: None,
207                    }
208                }
209            } else {
210                ParseNode::OrdGroup {
211                    mode: self.mode,
212                    body: denom_body,
213                    semisimple: None,
214                    loc: None,
215                }
216            };
217
218            let node = if fname == "\\\\abovefrac" {
219                // \above passes the infix node (with bar size) as the middle argument
220                let infix_node = body[idx].clone();
221                self.call_function(&fname, vec![numer, infix_node, denom], vec![], None, None)?
222            } else {
223                self.call_function(&fname, vec![numer, denom], vec![], None, None)?
224            };
225            Ok(vec![node])
226        } else {
227            Ok(body)
228        }
229    }
230
231    /// Form ligatures in text mode (e.g. -- → –, --- → —).
232    fn form_ligatures(&self, group: &mut Vec<ParseNode>) {
233        let mut i = 0;
234        while i + 1 < group.len() {
235            let a_text = group[i].symbol_text().map(|s| s.to_string());
236            let b_text = group[i + 1].symbol_text().map(|s| s.to_string());
237
238            if let (Some(a), Some(b)) = (a_text, b_text) {
239                if group[i].type_name() == "textord" && group[i + 1].type_name() == "textord" {
240                    if a == "-" && b == "-" {
241                        if i + 2 < group.len() {
242                            if let Some(c) = group[i + 2].symbol_text() {
243                                if c == "-" && group[i + 2].type_name() == "textord" {
244                                    group[i] = ParseNode::TextOrd {
245                                        mode: Mode::Text,
246                                        text: "---".to_string(),
247                                        loc: None,
248                                    };
249                                    group.remove(i + 2);
250                                    group.remove(i + 1);
251                                    continue;
252                                }
253                            }
254                        }
255                        group[i] = ParseNode::TextOrd {
256                            mode: Mode::Text,
257                            text: "--".to_string(),
258                            loc: None,
259                        };
260                        group.remove(i + 1);
261                        continue;
262                    }
263                    if (a == "'" || a == "`") && b == a {
264                        group[i] = ParseNode::TextOrd {
265                            mode: Mode::Text,
266                            text: format!("{}{}", a, a),
267                            loc: None,
268                        };
269                        group.remove(i + 1);
270                        continue;
271                    }
272                }
273            }
274            i += 1;
275        }
276    }
277
278    // ── Atom parsing ────────────────────────────────────────────────────
279
280    /// Parse a single atom with optional super/subscripts.
281    pub fn parse_atom(
282        &mut self,
283        break_on_token_text: Option<&str>,
284    ) -> ParseResult<Option<ParseNode>> {
285        let mut base = self.parse_group("atom", break_on_token_text)?;
286
287        if let Some(ref b) = base {
288            if b.type_name() == "internal" {
289                return Ok(base);
290            }
291        }
292
293        if self.mode == Mode::Text {
294            return Ok(base);
295        }
296
297        let mut superscript: Option<ParseNode> = None;
298        let mut subscript: Option<ParseNode> = None;
299
300        loop {
301            self.consume_spaces()?;
302            let lex = self.fetch()?;
303
304            if lex.text == "\\limits" || lex.text == "\\nolimits" {
305                let is_limits = lex.text == "\\limits";
306                self.consume();
307                if let Some(base_node) = base.as_mut() {
308                    match base_node {
309                        ParseNode::Op {
310                            limits,
311                            always_handle_sup_sub,
312                            ..
313                        } => {
314                            *limits = is_limits;
315                            *always_handle_sup_sub = Some(is_limits);
316                        }
317                        ParseNode::OperatorName {
318                            limits,
319                            always_handle_sup_sub,
320                            ..
321                        } => {
322                            *limits = is_limits;
323                            *always_handle_sup_sub = is_limits;
324                        }
325                        _ => {}
326                    }
327                }
328            } else if lex.text == "^" {
329                if superscript.is_some() {
330                    return Err(ParseError::new("Double superscript", Some(&lex)));
331                }
332                superscript = Some(self.handle_sup_subscript("superscript")?);
333            } else if lex.text == "_" {
334                if subscript.is_some() {
335                    return Err(ParseError::new("Double subscript", Some(&lex)));
336                }
337                subscript = Some(self.handle_sup_subscript("subscript")?);
338            } else if lex.text == "'" {
339                if superscript.is_some() {
340                    return Err(ParseError::new("Double superscript", Some(&lex)));
341                }
342                let prime = ParseNode::TextOrd {
343                    mode: self.mode,
344                    text: "\\prime".to_string(),
345                    loc: None,
346                };
347                let mut primes = vec![prime.clone()];
348                self.consume();
349                while self.fetch()?.text == "'" {
350                    primes.push(prime.clone());
351                    self.consume();
352                }
353                if self.fetch()?.text == "^" {
354                    primes.push(self.handle_sup_subscript("superscript")?);
355                }
356                superscript = Some(ParseNode::OrdGroup {
357                    mode: self.mode,
358                    body: primes,
359                    semisimple: None,
360                    loc: None,
361                });
362            } else if let Some((mapped, is_sub)) = lex
363                .text
364                .chars()
365                .next()
366                .and_then(crate::unicode_sup_sub::unicode_sub_sup)
367            {
368                if is_sub && subscript.is_some() {
369                    return Err(ParseError::new("Double subscript", Some(&lex)));
370                }
371                if !is_sub && superscript.is_some() {
372                    return Err(ParseError::new("Double superscript", Some(&lex)));
373                }
374                // Collect consecutive Unicode sup/sub chars of the same kind
375                let mut subsup_tokens = vec![Token::new(mapped, 0, 0)];
376                self.consume();
377                loop {
378                    let tok = self.fetch()?;
379                    match tok
380                        .text
381                        .chars()
382                        .next()
383                        .and_then(crate::unicode_sup_sub::unicode_sub_sup)
384                    {
385                        Some((m, sub)) if sub == is_sub => {
386                            subsup_tokens.insert(0, Token::new(m, 0, 0));
387                            self.consume();
388                        }
389                        _ => break,
390                    }
391                }
392                let body = self.subparse(subsup_tokens)?;
393                let group = ParseNode::OrdGroup {
394                    mode: Mode::Math,
395                    body,
396                    semisimple: None,
397                    loc: None,
398                };
399                if is_sub {
400                    subscript = Some(group);
401                } else {
402                    superscript = Some(group);
403                }
404            } else {
405                break;
406            }
407        }
408
409        if superscript.is_some() || subscript.is_some() {
410            Ok(Some(ParseNode::SupSub {
411                mode: self.mode,
412                base: base.map(Box::new),
413                sup: superscript.map(Box::new),
414                sub: subscript.map(Box::new),
415                loc: None,
416            }))
417        } else {
418            Ok(base)
419        }
420    }
421
422    /// Handle a subscript or superscript.
423    fn handle_sup_subscript(&mut self, name: &str) -> ParseResult<ParseNode> {
424        let symbol_token = self.fetch()?;
425        self.consume();
426        self.consume_spaces()?;
427
428        let group = self.parse_group(name, None)?;
429        match group {
430            Some(g) if g.type_name() != "internal" => Ok(g),
431            Some(_) => {
432                // Skip internal nodes, try again
433                let g2 = self.parse_group(name, None)?;
434                g2.ok_or_else(|| {
435                    ParseError::new(
436                        format!("Expected group after '{}'", symbol_token.text),
437                        Some(&symbol_token),
438                    )
439                })
440            }
441            None => Err(ParseError::new(
442                format!("Expected group after '{}'", symbol_token.text),
443                Some(&symbol_token),
444            )),
445        }
446    }
447
448    // ── Group parsing ───────────────────────────────────────────────────
449
450    /// Parse a group: braced expression, function call, or single symbol.
451    pub fn parse_group(
452        &mut self,
453        name: &str,
454        break_on_token_text: Option<&str>,
455    ) -> ParseResult<Option<ParseNode>> {
456        let first_token = self.fetch()?;
457        let text = first_token.text.clone();
458
459        if text == "{" || text == "\\begingroup" {
460            self.consume();
461            let group_end = if text == "{" { "}" } else { "\\endgroup" };
462
463            self.gullet.begin_group();
464            let expression = self.parse_expression(false, Some(group_end))?;
465            let last_token = self.fetch()?;
466            self.expect(group_end, true)?;
467            self.gullet.end_group();
468
469            let loc = Some(SourceLocation::range(&first_token.loc, &last_token.loc));
470            let semisimple = if text == "\\begingroup" {
471                Some(true)
472            } else {
473                None
474            };
475
476            Ok(Some(ParseNode::OrdGroup {
477                mode: self.mode,
478                body: expression,
479                semisimple,
480                loc,
481            }))
482        } else {
483            let result = self
484                .parse_function(break_on_token_text, Some(name))?
485                .or_else(|| self.parse_symbol_inner().ok().flatten());
486
487            if result.is_none()
488                && text.starts_with('\\')
489                && !IMPLICIT_COMMANDS.contains(&text.as_str())
490            {
491                return Err(ParseError::new(
492                    format!("Undefined control sequence: {}", text),
493                    Some(&first_token),
494                ));
495            }
496
497            Ok(result)
498        }
499    }
500
501    // ── Function parsing ────────────────────────────────────────────────
502
503    /// Try to parse a function call. Returns None if not a function.
504    pub fn parse_function(
505        &mut self,
506        break_on_token_text: Option<&str>,
507        name: Option<&str>,
508    ) -> ParseResult<Option<ParseNode>> {
509        let token = self.fetch()?;
510        let func = token.text.clone();
511
512        let func_data = match FUNCTIONS.get(func.as_str()) {
513            Some(f) => f,
514            None => return Ok(None),
515        };
516
517        self.consume();
518
519        if let Some(n) = name {
520            if n != "atom" && !func_data.allowed_in_argument {
521                return Err(ParseError::new(
522                    format!("Got function '{}' with no arguments as {}", func, n),
523                    Some(&token),
524                ));
525            }
526        }
527
528        functions::check_mode_compatibility(func_data, self.mode, &func, Some(&token))?;
529
530        // `\hspace*{len}` — `*` is a separate token (not part of the control word); consume it here.
531        // Must use gullet peek/pop only: `parser.fetch()` without `consume()` advances the lexer and
532        // leaves `{` only in `next_token`, so `parse_size_group`'s `gullet.future()` would miss the brace.
533        if func == "\\hspace" {
534            self.gullet.consume_spaces();
535            if self.gullet.future().text == "*" {
536                self.gullet.pop_token();
537            }
538        }
539
540        let (args, opt_args) = self.parse_arguments(&func, func_data)?;
541
542        self.call_function(
543            &func,
544            args,
545            opt_args,
546            Some(token),
547            break_on_token_text.map(|s| s.to_string()).as_deref(),
548        )
549        .map(Some)
550    }
551
552    /// Call a function handler.
553    pub fn call_function(
554        &mut self,
555        name: &str,
556        args: Vec<ParseNode>,
557        opt_args: Vec<Option<ParseNode>>,
558        token: Option<Token>,
559        break_on_token_text: Option<&str>,
560    ) -> ParseResult<ParseNode> {
561        let func = FUNCTIONS.get(name).ok_or_else(|| {
562            ParseError::msg(format!("No function handler for {}", name))
563        })?;
564
565        let mut ctx = FunctionContext {
566            func_name: name.to_string(),
567            parser: self,
568            token: token.clone(),
569            break_on_token_text: break_on_token_text.map(|s| s.to_string()),
570        };
571
572        (func.handler)(&mut ctx, args, opt_args)
573    }
574
575    /// Parse the arguments for a function.
576    pub fn parse_arguments(
577        &mut self,
578        func: &str,
579        func_data: &functions::FunctionSpec,
580    ) -> ParseResult<(Vec<ParseNode>, Vec<Option<ParseNode>>)> {
581        let total_args = func_data.num_args + func_data.num_optional_args;
582        if total_args == 0 {
583            return Ok((Vec::new(), Vec::new()));
584        }
585
586        let mut args = Vec::new();
587        let mut opt_args = Vec::new();
588
589        for i in 0..total_args {
590            let arg_type = func_data
591                .arg_types
592                .as_ref()
593                .and_then(|types| types.get(i).copied());
594            let is_optional = i < func_data.num_optional_args;
595
596            let effective_type = if (func_data.primitive && arg_type.is_none())
597                || (func_data.node_type == "sqrt" && i == 1
598                    && opt_args.first().is_some_and(|o: &Option<ParseNode>| o.is_none()))
599            {
600                Some(ArgType::Primitive)
601            } else {
602                arg_type
603            };
604
605            let arg = self.parse_group_of_type(
606                &format!("argument to '{}'", func),
607                effective_type,
608                is_optional,
609            )?;
610
611            if is_optional {
612                opt_args.push(arg);
613            } else if let Some(a) = arg {
614                args.push(a);
615            } else {
616                return Err(ParseError::msg("Null argument, please report this as a bug"));
617            }
618        }
619
620        Ok((args, opt_args))
621    }
622
623    /// Parse a group with a specific type.
624    fn parse_group_of_type(
625        &mut self,
626        name: &str,
627        arg_type: Option<ArgType>,
628        optional: bool,
629    ) -> ParseResult<Option<ParseNode>> {
630        match arg_type {
631            Some(ArgType::Color) => self.parse_color_group(optional),
632            Some(ArgType::Size) => self.parse_size_group(optional),
633            Some(ArgType::Primitive) => {
634                if optional {
635                    return Err(ParseError::msg("A primitive argument cannot be optional"));
636                }
637                let group = self.parse_group(name, None)?;
638                match group {
639                    Some(g) => Ok(Some(g)),
640                    None => Err(ParseError::new(
641                        format!("Expected group as {}", name),
642                        None,
643                    )),
644                }
645            }
646            Some(ArgType::Math) | Some(ArgType::Text) => {
647                let mode = match arg_type {
648                    Some(ArgType::Math) => Some(Mode::Math),
649                    Some(ArgType::Text) => Some(Mode::Text),
650                    _ => None,
651                };
652                self.parse_argument_group(optional, mode)
653            }
654            Some(ArgType::HBox) => {
655                let group = self.parse_argument_group(optional, Some(Mode::Text))?;
656                match group {
657                    Some(g) => Ok(Some(ParseNode::Styling {
658                        mode: g.mode(),
659                        style: crate::parse_node::StyleStr::Text,
660                        body: vec![g],
661                        loc: None,
662                    })),
663                    None => Ok(None),
664                }
665            }
666            Some(ArgType::Raw) => {
667                let token = self.parse_string_group("raw", optional)?;
668                match token {
669                    Some(t) => Ok(Some(ParseNode::Raw {
670                        mode: Mode::Text,
671                        string: t.text,
672                        loc: None,
673                    })),
674                    None => Ok(None),
675                }
676            }
677            Some(ArgType::Url) => self.parse_url_group(optional),
678            None | Some(ArgType::Original) => self.parse_argument_group(optional, None),
679        }
680    }
681
682    /// Parse a color group.
683    fn parse_color_group(&mut self, optional: bool) -> ParseResult<Option<ParseNode>> {
684        let res = self.parse_string_group("color", optional)?;
685        match res {
686            None => Ok(None),
687            Some(token) => {
688                let text = token.text.trim().to_string();
689                let re = regex_lite::Regex::new(
690                    r"^(#[a-fA-F0-9]{3,4}|#[a-fA-F0-9]{6}|#[a-fA-F0-9]{8}|[a-fA-F0-9]{6}|[a-zA-Z]+|\d+(\.\d+)?(,\d+(\.\d+)?)*)$",
691                )
692                .unwrap();
693
694                if !re.is_match(&text) {
695                    return Err(ParseError::new(
696                        format!("Invalid color: '{}'", text),
697                        Some(&token),
698                    ));
699                }
700                let mut color = text;
701                if regex_lite::Regex::new(r"^[0-9a-fA-F]{6}$")
702                    .unwrap()
703                    .is_match(&color)
704                {
705                    color = format!("#{}", color);
706                }
707
708                Ok(Some(ParseNode::ColorToken {
709                    mode: self.mode,
710                    color,
711                    loc: None,
712                }))
713            }
714        }
715    }
716
717    /// Parse a size group (e.g., "3pt", "1em").
718    pub fn parse_size_group(&mut self, optional: bool) -> ParseResult<Option<ParseNode>> {
719        let mut is_blank = false;
720
721        self.gullet.consume_spaces();
722        let res = if !optional && self.gullet.future().text != "{" {
723            Some(self.parse_regex_group(
724                &regex_lite::Regex::new(r"^[-+]? *(?:$|\d+|\d+\.\d*|\.\d*) *[a-z]{0,2} *$")
725                    .unwrap(),
726                "size",
727            )?)
728        } else {
729            self.parse_string_group("size", optional)?
730        };
731
732        let res = match res {
733            Some(r) => r,
734            None => return Ok(None),
735        };
736
737        let mut text = res.text.clone();
738        if !optional && text.is_empty() {
739            text = "0pt".to_string();
740            is_blank = true;
741        }
742
743        let size_re =
744            regex_lite::Regex::new(r"([-+]?) *(\d+(?:\.\d*)?|\.\d+) *([a-z]{2})").unwrap();
745        let m = size_re.captures(&text).ok_or_else(|| {
746            ParseError::new(format!("Invalid size: '{}'", text), Some(&res))
747        })?;
748
749        let sign = m.get(1).map_or("", |m| m.as_str());
750        let magnitude = m.get(2).map_or("", |m| m.as_str());
751        let unit = m.get(3).map_or("", |m| m.as_str());
752
753        let number: f64 = format!("{}{}", sign, magnitude).parse().unwrap_or(0.0);
754
755        if !is_valid_unit(unit) {
756            return Err(ParseError::new(
757                format!("Invalid unit: '{}'", unit),
758                Some(&res),
759            ));
760        }
761
762        Ok(Some(ParseNode::Size {
763            mode: self.mode,
764            value: crate::parse_node::Measurement {
765                number,
766                unit: unit.to_string(),
767            },
768            is_blank,
769            loc: None,
770        }))
771    }
772
773    /// Parse a URL group.
774    /// Temporarily disables `%` as comment character to allow `%20` etc. in URLs.
775    fn parse_url_group(&mut self, optional: bool) -> ParseResult<Option<ParseNode>> {
776        self.gullet.lexer.set_catcode('%', 13);
777        self.gullet.lexer.set_catcode('~', 12);
778        let res = self.parse_string_group("url", optional);
779        self.gullet.lexer.set_catcode('%', 14);
780        self.gullet.lexer.set_catcode('~', 13);
781        let res = res?;
782        match res {
783            None => Ok(None),
784            Some(token) => {
785                let url = token.text;
786                Ok(Some(ParseNode::Url {
787                    mode: self.mode,
788                    url,
789                    loc: None,
790                }))
791            }
792        }
793    }
794
795    /// Parse a string group (brace-enclosed string).
796    fn parse_string_group(
797        &mut self,
798        _mode_name: &str,
799        optional: bool,
800    ) -> ParseResult<Option<Token>> {
801        let arg_token = self.gullet.scan_argument(optional)?;
802        let arg_token = match arg_token {
803            Some(t) => t,
804            None => return Ok(None),
805        };
806
807        let mut s = String::new();
808        loop {
809            let next = self.fetch()?;
810            if next.text == "EOF" {
811                break;
812            }
813            s.push_str(&next.text);
814            self.consume();
815        }
816        self.consume(); // consume EOF
817
818        let mut result = arg_token;
819        result.text = s;
820        Ok(Some(result))
821    }
822
823    /// Parse a regex-delimited group.
824    fn parse_regex_group(
825        &mut self,
826        regex: &regex_lite::Regex,
827        mode_name: &str,
828    ) -> ParseResult<Token> {
829        let first_token = self.fetch()?;
830        let mut last_token = first_token.clone();
831        let mut s = String::new();
832
833        loop {
834            let next = self.fetch()?;
835            if next.text == "EOF" {
836                break;
837            }
838            let candidate = format!("{}{}", s, next.text);
839            if regex.is_match(&candidate) {
840                last_token = next;
841                s = candidate;
842                self.consume();
843            } else {
844                break;
845            }
846        }
847
848        if s.is_empty() {
849            return Err(ParseError::new(
850                format!("Invalid {}: '{}'", mode_name, first_token.text),
851                Some(&first_token),
852            ));
853        }
854
855        Ok(first_token.range(&last_token, s))
856    }
857
858    /// Parse an argument group (with optional mode switch).
859    pub fn parse_argument_group(
860        &mut self,
861        optional: bool,
862        mode: Option<Mode>,
863    ) -> ParseResult<Option<ParseNode>> {
864        let arg_token = self.gullet.scan_argument(optional)?;
865        let arg_token = match arg_token {
866            Some(t) => t,
867            None => return Ok(None),
868        };
869
870        let outer_mode = self.mode;
871        if let Some(m) = mode {
872            self.switch_mode(m);
873        }
874
875        self.gullet.begin_group();
876        let expression = self.parse_expression(false, Some("EOF"))?;
877        self.expect("EOF", true)?;
878        self.gullet.end_group();
879
880        let result = ParseNode::OrdGroup {
881            mode: self.mode,
882            loc: Some(arg_token.loc.clone()),
883            body: expression,
884            semisimple: None,
885        };
886
887        if mode.is_some() {
888            self.switch_mode(outer_mode);
889        }
890
891        Ok(Some(result))
892    }
893
894    // ── Symbol parsing ──────────────────────────────────────────────────
895
896    /// Parse a single symbol (internal version that returns Result).
897    fn parse_symbol_inner(&mut self) -> ParseResult<Option<ParseNode>> {
898        let nucleus = self.fetch()?;
899        let text = nucleus.text.clone();
900
901        if let Some(stripped) = text.strip_prefix("\\verb") {
902            self.consume();
903            let arg = stripped.to_string();
904            let star = arg.starts_with('*');
905            let arg = if star { &arg[1..] } else { &arg };
906
907            if arg.len() < 2 {
908                return Err(ParseError::new("\\verb assertion failed", Some(&nucleus)));
909            }
910            let body = arg[1..arg.len() - 1].to_string();
911            return Ok(Some(ParseNode::Verb {
912                mode: Mode::Text,
913                body,
914                star,
915                loc: Some(nucleus.loc.clone()),
916            }));
917        }
918
919        let font_mode = match self.mode {
920            Mode::Math => ratex_font::symbols::Mode::Math,
921            Mode::Text => ratex_font::symbols::Mode::Text,
922        };
923
924        // ^ and _ are handled by parse_atom for sup/sub, not as symbol nodes
925        if text == "^" || text == "_" {
926            return Ok(None);
927        }
928
929        // Bare backslash (incomplete control sequence) → not a valid symbol
930        if text == "\\" {
931            return Ok(None);
932        }
933
934        if let Some(sym_info) = ratex_font::symbols::get_symbol(&text, font_mode) {
935            let loc = Some(SourceLocation::range(&nucleus.loc, &nucleus.loc));
936            let group = sym_info.group;
937
938            let node = if group.is_atom() {
939                let family = match group {
940                    ratex_font::symbols::Group::Bin => AtomFamily::Bin,
941                    ratex_font::symbols::Group::Close => AtomFamily::Close,
942                    ratex_font::symbols::Group::Inner => AtomFamily::Inner,
943                    ratex_font::symbols::Group::Open => AtomFamily::Open,
944                    ratex_font::symbols::Group::Punct => AtomFamily::Punct,
945                    ratex_font::symbols::Group::Rel => AtomFamily::Rel,
946                    _ => unreachable!(),
947                };
948                ParseNode::Atom {
949                    mode: self.mode,
950                    family,
951                    text: text.clone(),
952                    loc,
953                }
954            } else {
955                match group {
956                    ratex_font::symbols::Group::MathOrd => ParseNode::MathOrd {
957                        mode: self.mode,
958                        text: text.clone(),
959                        loc,
960                    },
961                    ratex_font::symbols::Group::TextOrd => ParseNode::TextOrd {
962                        mode: self.mode,
963                        text: text.clone(),
964                        loc,
965                    },
966                    ratex_font::symbols::Group::OpToken => ParseNode::OpToken {
967                        mode: self.mode,
968                        text: text.clone(),
969                        loc,
970                    },
971                    ratex_font::symbols::Group::AccentToken => ParseNode::AccentToken {
972                        mode: self.mode,
973                        text: text.clone(),
974                        loc,
975                    },
976                    ratex_font::symbols::Group::Spacing => ParseNode::SpacingNode {
977                        mode: self.mode,
978                        text: text.clone(),
979                        loc,
980                    },
981                    _ => ParseNode::MathOrd {
982                        mode: self.mode,
983                        text: text.clone(),
984                        loc,
985                    },
986                }
987            };
988
989            self.consume();
990            return Ok(Some(node));
991        }
992
993        // Unicode accented characters → decompose into accent nodes
994        // Handles both precomposed (á U+00E1) and combining forms (a + U+0301)
995        if let Some(node) = self.try_parse_unicode_accent(&text, &nucleus)? {
996            self.consume();
997            return Ok(Some(node));
998        }
999
1000        // Non-ASCII characters without accent decomposition → treat as textord
1001        // KaTeX always uses mode="text" for these, regardless of current mode
1002        let first_char = text.chars().next();
1003        if let Some(ch) = first_char {
1004            if ch as u32 >= 0x80 {
1005                let node = ParseNode::TextOrd {
1006                    mode: Mode::Text,
1007                    text: text.clone(),
1008                    loc: Some(SourceLocation::range(&nucleus.loc, &nucleus.loc)),
1009                };
1010                self.consume();
1011                return Ok(Some(node));
1012            }
1013        }
1014
1015        Ok(None)
1016    }
1017
1018    /// Try to decompose a Unicode accented character into accent nodes.
1019    /// Returns None if no decomposition is available.
1020    /// Only decomposes Latin-script characters, matching KaTeX behavior.
1021    fn try_parse_unicode_accent(
1022        &self,
1023        text: &str,
1024        nucleus: &Token,
1025    ) -> ParseResult<Option<ParseNode>> {
1026        let nfd: String = text.nfd().collect();
1027        let chars: Vec<char> = nfd.chars().collect();
1028
1029        if chars.len() < 2 {
1030            return Ok(None);
1031        }
1032
1033        // Build from the base up through each combining mark
1034        let mut split_idx = chars.len() - 1;
1035        while split_idx > 0 && is_supported_combining_accent(chars[split_idx]) {
1036            split_idx -= 1;
1037        }
1038
1039        // Verify ALL trailing chars are supported combining accents
1040        if split_idx == chars.len() - 1 {
1041            return Ok(None);
1042        }
1043
1044        // Only decompose Latin-script base characters
1045        let base_char = chars[0];
1046        if !is_latin_base_char(base_char) {
1047            return Ok(None);
1048        }
1049
1050        let loc = Some(SourceLocation::range(&nucleus.loc, &nucleus.loc));
1051
1052        // Base: everything before the combining marks
1053        let mut base_str: String = chars[..split_idx + 1].iter().collect();
1054
1055        // Accented i→ı and j→ȷ (dotless variants), matching KaTeX behavior
1056        if base_str.len() == 1 {
1057            match base_str.as_str() {
1058                "i" => base_str = "\u{0131}".to_string(), // ı
1059                "j" => base_str = "\u{0237}".to_string(), // ȷ
1060                _ => {}
1061            }
1062        }
1063
1064        let font_mode = match self.mode {
1065            Mode::Math => ratex_font::symbols::Mode::Math,
1066            Mode::Text => ratex_font::symbols::Mode::Text,
1067        };
1068
1069        let mut node = if base_str.chars().count() == 1 {
1070            let ch = base_str.chars().next().unwrap();
1071            if let Some(sym) = ratex_font::symbols::get_symbol(&base_str, font_mode) {
1072                match sym.group {
1073                    ratex_font::symbols::Group::TextOrd => ParseNode::TextOrd {
1074                        mode: self.mode,
1075                        text: base_str.clone(),
1076                        loc: loc.clone(),
1077                    },
1078                    _ => ParseNode::MathOrd {
1079                        mode: self.mode,
1080                        text: base_str.clone(),
1081                        loc: loc.clone(),
1082                    },
1083                }
1084            } else if (ch as u32) >= 0x80 {
1085                // Non-ASCII base chars always text mode (KaTeX compat)
1086                ParseNode::TextOrd {
1087                    mode: Mode::Text,
1088                    text: base_str.clone(),
1089                    loc: loc.clone(),
1090                }
1091            } else {
1092                ParseNode::MathOrd {
1093                    mode: self.mode,
1094                    text: base_str.clone(),
1095                    loc: loc.clone(),
1096                }
1097            }
1098        } else {
1099            return self.try_parse_unicode_accent(&base_str, nucleus).map(|opt| {
1100                opt.or_else(|| {
1101                    Some(ParseNode::TextOrd {
1102                        mode: Mode::Text,
1103                        text: base_str.clone(),
1104                        loc: loc.clone(),
1105                    })
1106                })
1107            });
1108        };
1109
1110        // Wrap in accent nodes from innermost to outermost
1111        for &combining in &chars[split_idx + 1..] {
1112            let label = combining_to_accent_label(combining, self.mode);
1113            node = ParseNode::Accent {
1114                mode: self.mode,
1115                label,
1116                is_stretchy: Some(false),
1117                is_shifty: Some(true),
1118                base: Box::new(node),
1119                loc: loc.clone(),
1120            };
1121        }
1122
1123        Ok(Some(node))
1124    }
1125
1126    /// Parse a sub-expression from the given tokens.
1127    pub fn subparse(&mut self, tokens: Vec<Token>) -> ParseResult<Vec<ParseNode>> {
1128        let old_token = self.next_token.take();
1129
1130        self.gullet
1131            .push_token(Token::new("}", 0, 0));
1132        self.gullet.push_tokens(tokens);
1133        let parse = self.parse_expression(false, None)?;
1134        self.expect("}", true)?;
1135
1136        self.next_token = old_token;
1137        Ok(parse)
1138    }
1139}
1140
/// Whether `ch` may serve as the base of a decomposed Latin accented character.
///
/// Accepts ASCII letters plus a fixed set of non-ASCII Latin letters.
fn is_latin_base_char(ch: char) -> bool {
    /// Non-ASCII Latin letters accepted as accent bases.
    const EXTRA_LATIN: [char; 11] = [
        '\u{0131}', // ı (dotless i)
        '\u{0237}', // ȷ (dotless j)
        '\u{00C6}', // Æ
        '\u{00D0}', // Ð
        '\u{00D8}', // Ø
        '\u{00DE}', // Þ
        '\u{00DF}', // ß
        '\u{00E6}', // æ
        '\u{00F0}', // ð
        '\u{00F8}', // ø
        '\u{00FE}', // þ
    ];

    ch.is_ascii_alphabetic() || EXTRA_LATIN.contains(&ch)
}
1157
/// Whether `ch` is a combining mark that has an accent command in
/// `combining_to_accent_label`.
fn is_supported_combining_accent(ch: char) -> bool {
    matches!(
        ch,
        '\u{0300}'..='\u{0304}'     // grave, acute, circumflex, tilde, macron
            | '\u{0306}'..='\u{0308}' // breve, dot above, diaeresis
            | '\u{030A}'..='\u{030C}' // ring above, double acute, caron
            | '\u{0327}' // cedilla
    )
}
1166
1167fn combining_to_accent_label(ch: char, mode: Mode) -> String {
1168    match mode {
1169        Mode::Math => match ch {
1170            '\u{0300}' => "\\grave".to_string(),
1171            '\u{0301}' => "\\acute".to_string(),
1172            '\u{0302}' => "\\hat".to_string(),
1173            '\u{0303}' => "\\tilde".to_string(),
1174            '\u{0304}' => "\\bar".to_string(),
1175            '\u{0306}' => "\\breve".to_string(),
1176            '\u{0307}' => "\\dot".to_string(),
1177            '\u{0308}' => "\\ddot".to_string(),
1178            '\u{030A}' => "\\mathring".to_string(),
1179            '\u{030B}' => "\\H".to_string(),
1180            '\u{030C}' => "\\check".to_string(),
1181            '\u{0327}' => "\\c".to_string(),
1182            _ => format!("\\char\"{:X}", ch as u32),
1183        },
1184        Mode::Text => match ch {
1185            '\u{0300}' => "\\`".to_string(),
1186            '\u{0301}' => "\\'".to_string(),
1187            '\u{0302}' => "\\^".to_string(),
1188            '\u{0303}' => "\\~".to_string(),
1189            '\u{0304}' => "\\=".to_string(),
1190            '\u{0306}' => "\\u".to_string(),
1191            '\u{0307}' => "\\.".to_string(),
1192            '\u{0308}' => "\\\"".to_string(),
1193            '\u{030A}' => "\\r".to_string(),
1194            '\u{030B}' => "\\H".to_string(),
1195            '\u{030C}' => "\\v".to_string(),
1196            '\u{0327}' => "\\c".to_string(),
1197            _ => format!("\\char\"{:X}", ch as u32),
1198        },
1199    }
1200}
1201
/// Whether `unit` is one of the recognised length-unit names.
///
/// The comparison is exact (case-sensitive, no surrounding whitespace).
fn is_valid_unit(unit: &str) -> bool {
    const UNITS: [&str; 15] = [
        "pt", "mm", "cm", "in", "bp", "pc", "dd", "cc", "nd", "nc", "sp", "px", "ex", "em",
        "mu",
    ];

    UNITS.contains(&unit)
}
1209
/// If the whole expression is wrapped in TeX inline/display math delimiters, parse the inside only.
/// The parser already runs in math mode; a leading `$` would otherwise hit the `$` / `\\(` "switch to math"
/// handler, which is disallowed in math mode (see `functions::math`).
///
/// The input is trimmed first. `$$…$$` is tried before `$…$`, and the content
/// between the delimiters is trimmed again. A trailing `$` that is escaped
/// (preceded by an odd number of backslashes, as in `\$`) is a literal dollar
/// sign rather than a closing delimiter, so such input is not unwrapped by that
/// delimiter. When nothing is unwrapped the trimmed input is returned.
fn strip_outer_math_delimiters(input: &str) -> &str {
    /// True when `prefix` ends in an odd number of backslashes, i.e. whatever
    /// follows it is escaped.
    fn ends_with_escape(prefix: &str) -> bool {
        prefix.bytes().rev().take_while(|&b| b == b'\\').count() % 2 == 1
    }

    let s = input.trim();
    for delim in ["$$", "$"] {
        let n = delim.len();
        if s.len() >= 2 * n && s.starts_with(delim) && s.ends_with(delim) {
            let close_at = s.len() - n;
            if !ends_with_escape(&s[..close_at]) {
                return s[n..close_at].trim();
            }
        }
    }
    s
}
1223
1224/// Convenience function: parse a LaTeX string and return the AST.
1225pub fn parse(input: &str) -> ParseResult<Vec<ParseNode>> {
1226    Parser::new(strip_outer_math_delimiters(input)).parse()
1227}
1228
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `src`, panicking if the parser reports an error.
    fn ast(src: &str) -> Vec<ParseNode> {
        parse(src).unwrap()
    }

    /// Assert that `src` parses to exactly one node whose type is `expected`.
    fn assert_single(src: &str, expected: &str) {
        let nodes = ast(src);
        assert_eq!(nodes.len(), 1);
        assert_eq!(nodes[0].type_name(), expected);
    }

    #[test]
    fn test_parse_single_char() {
        assert_single("x", "mathord");
    }

    #[test]
    fn test_parse_strips_outer_dollar_inline_math() {
        // Wrapping in `$…$` must not change the shape of the parsed AST.
        let inner = r"C_p[\ce{H2O(l)}] = \pu{75.3 J // mol K}";
        let wrapped = format!("${inner}$");
        let from_wrapped = parse(&wrapped).expect("wrapped");
        let from_inner = parse(inner).expect("inner");
        assert_eq!(from_wrapped.len(), from_inner.len());
        for (lhs, rhs) in from_wrapped.iter().zip(from_inner.iter()) {
            assert_eq!(lhs.type_name(), rhs.type_name());
        }
    }

    #[test]
    fn test_parse_addition() {
        let nodes = ast("a+b");
        assert_eq!(nodes.len(), 3);
        assert_eq!(nodes[0].type_name(), "mathord"); // a
        assert_eq!(nodes[1].type_name(), "atom"); // +
        assert_eq!(nodes[2].type_name(), "mathord"); // b
    }

    #[test]
    fn test_parse_superscript() {
        assert_single("x^2", "supsub");
    }

    #[test]
    fn test_parse_subscript() {
        assert_single("a_i", "supsub");
    }

    #[test]
    fn test_parse_supsub() {
        let nodes = ast("x^2_i");
        assert_eq!(nodes.len(), 1);
        assert_eq!(nodes[0].type_name(), "supsub");
        // Both scripts must end up on the same node.
        match &nodes[0] {
            ParseNode::SupSub { sup, sub, .. } => {
                assert!(sup.is_some());
                assert!(sub.is_some());
            }
            _ => panic!("Expected SupSub"),
        }
    }

    #[test]
    fn test_parse_group() {
        assert_single("{a+b}", "ordgroup");
    }

    #[test]
    fn test_parse_frac() {
        assert_single("\\frac{a}{b}", "genfrac");
    }

    #[test]
    fn test_parse_sqrt() {
        assert_single("\\sqrt{x}", "sqrt");
    }

    #[test]
    fn test_parse_sqrt_optional() {
        let nodes = ast("\\sqrt[3]{x}");
        assert_eq!(nodes.len(), 1);
        // The optional `[3]` argument must be captured as the root index.
        match &nodes[0] {
            ParseNode::Sqrt { index, .. } => assert!(index.is_some()),
            _ => panic!("Expected Sqrt"),
        }
    }

    #[test]
    fn test_parse_nested() {
        assert_single("\\frac{\\sqrt{a^2+b^2}}{c}", "genfrac");
    }

    #[test]
    fn test_parse_empty() {
        let nodes = ast("");
        assert_eq!(nodes.len(), 0);
    }

    #[test]
    fn test_parse_double_superscript_error() {
        let outcome = parse("x^2^3");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_parse_unclosed_brace_error() {
        let outcome = parse("{x");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_parse_json_output() {
        let nodes = ast("x^2");
        let json = serde_json::to_string_pretty(&nodes).unwrap();
        assert!(json.contains("supsub"));
    }
}
ratex_parser/parser.rs

ratex_parser/
parser.rs