Skip to main content

oxyl_parser/
lib.rs

1// oxyl-parser
2// 
3// Builds a Document of Nodes from the lexer's token stream.
4//
5// - Commands greedily pick up [...] and {...} until the next token 
6// is neither of those.
7// - A pair of $ tokens wraps a math node.
8// - A \[...\] pair wraps a display math node. Inline and display math children 
9// are parsed with the same machinery as ordinairy text;
10// will do atoms and scripts later - TODO
11//  - Every error carries a DiagSpan poiting at the token that triggered it 
12//  (the unmatched bracket or dollar sign) so the cli can render source 
13//  context without having to extract it from the message string :D
14
15
16use oxyl_diagnostics::{DiagSpan, Diagnostic};
17use oxyl_lexer::{Span, Token, TokenKind};
18
19fn diag_span(s: Span) -> DiagSpan {
20    DiagSpan::new(s.start, s.end)
21}
22
23/// Stop predicate for `parse_nodes` when scanning the body of `\[ ... \]`.
24fn is_display_math_close(k: &TokenKind) -> bool {
25    matches!(k, TokenKind::ControlSeq(s) if s == "]")
26}
27
28// --- 
29// AST Types 
30//
31
32/// The root of a parsed LaTeX document.
33///
34/// For now we do not distinguish preamble from body - everything lands in 
35/// `body`. Will add that split when handling for `\begin{document}` is done.
36#[derive(Debug, Clone)]
37pub struct Document {
38    pub body: Vec<Node>,
39}
40
41/// A single node in the LaTeX AST.
42#[derive(Debug, Clone)]
43pub enum Node {
44    /// A run of plain text characters
45    Text(String, Span),
46
47    /// A blank line in the source - signals a paragraph break.
48    ParagraphBreak(Span),
49
50    /// A LaTeX command and its arguments, e.g. `\textbf{hello}`.
51    Command {
52        /// Name without the leading backslash, e.g. `"textbf"`.
53        name: String ,
54        args: Vec<Arg>,
55        span: Span,
56    },
57
58    /// A braced group `{...}`.
59    Group(Vec<Node>, Span),
60    
61    /// Inline match: `$ ... $`. The span covers both `$` delimiters.
62    Math(Vec<Node>, Span),
63
64    /// Display math: `\[ ... \]`. The span covers both delimiters.
65    DisplayMath(Vec<Node>, Span),
66}
67
68impl Node {
69    pub fn span(&self) -> Span {
70        match self {
71            Node::Text(_, s) => *s,
72            Node::ParagraphBreak(s) => *s,
73            Node::Command { span, .. } => *span,
74            Node::Group(_, s) => *s,
75            Node::Math(_, s) => *s,
76            Node::DisplayMath(_, s) => *s,
77        }
78    }
79}
80
81/// A single argument to a command or environment 
82#[derive(Debug, Clone)]
83pub enum Arg {
84    Mandatory(Vec<Node>),
85    Optional(Vec<Node>),
86}
87
88// --- 
89// Parser Result 
90// --- 
91
92/// Returned by [`Parser::parse`]. The document is always produced; errors 
93/// are collected alongside it so the caller sees everything at once.
94#[derive(Debug)]
95pub struct ParseResult {
96    pub document: Document,
97    pub errors: Vec<Diagnostic>,
98}
99
100// --- 
101// Parser 
102// --- 
103
104pub struct Parser {
105    tokens: Vec<Token>,
106    pos: usize,
107    errors: Vec<Diagnostic>,
108}
109
110impl Parser {
111    pub fn new(tokens: Vec<Token>) -> Self {
112        Self { tokens, pos: 0, errors: Vec::new() }
113    }
114    
115    /// Parse the token stream.
116    pub fn parse(mut self) -> ParseResult {
117        let body = self.parse_nodes(|_| false);
118        ParseResult { document: Document { body }, errors: self.errors }
119    }
120
121    fn peek(&self) -> Option<&Token> {
122        self.tokens.get(self.pos)
123    }
124
125    fn peek_kind(&self) -> Option<&TokenKind> {
126        self.peek().map(|t| &t.kind)
127    }
128
129    fn bump(&mut self) -> Option<Token> {
130        if self.pos < self.tokens.len() {
131            let tok = self.tokens[self.pos].clone();
132            self.pos += 1;
133            Some(tok)
134        } else {
135            None
136        }
137    }
138
139    /// Parse a run of nodes until the token stream is exhausted or 
140    /// `stop` returns true for the next token's kind. The stopping token is 
141    /// left unconsumed so it can be examined and bumped by the caller !
142    ///
143    /// `stop` is used by the group parser to halt at `}` - it is a function pointer 
144    /// rather than an `impl Fn` so the recursive calls don't blow up the parser.
145    fn parse_nodes(&mut self, stop: fn(&TokenKind) -> bool) -> Vec<Node> {
146        let mut nodes: Vec<Node> = Vec::new();
147        
148        loop {
149            match self.peek() {
150                None => break,
151                Some(tok) if stop(&tok.kind) => break,
152                _ => {}
153            }
154
155            let tok = self.bump().unwrap();
156
157            match tok.kind {
158                TokenKind::Char(c) => self.push_char(&mut nodes, c, tok.span),
159                TokenKind::Space => self.push_char(&mut nodes, ' ', tok.span),
160
161                TokenKind::ParagraphBreak => {
162                    nodes.push(Node::ParagraphBreak(tok.span));
163                }
164
165                // `\[` opens display math. 
166                TokenKind::ControlSeq(ref name) if name == "[" => {
167                    let open_span = tok.span;
168                    let children = self.parse_nodes(is_display_math_close);
169                    if matches!(self.peek_kind(), Some(TokenKind::ControlSeq(s)) if s == "]") {
170                        let close = self.bump().unwrap();
171                        nodes.push(Node::DisplayMath(children, open_span.merge(close.span)));
172                    } else {
173                        self.errors.push(
174                            Diagnostic::error("E031", "unclosed '\\[' (display math)")
175                            .with_span(diag_span(open_span)),
176                        );
177                        nodes.push(Node::DisplayMath(children, open_span));
178                    }
179                }
180
181                // A bare `\]` outside display math is a stray closer.
182                TokenKind::ControlSeq(ref name) if name == "]" => {
183                    self.errors.push(
184                        Diagnostic::error("E032", "stray '\\]' (no matching '\\[')")
185                        .with_span(diag_span(tok.span)),
186                    );
187                }
188
189                TokenKind::ControlSeq(name) => {
190                    let cmd_span = tok.span; 
191                    let args = self.parse_args();
192                    // Extend the span to cover the last argument. 
193                    let full_span = args.last()
194                        .and_then(|a| match a {
195                            Arg::Mandatory(children) => children.last().map(|n| n.span()),
196                            Arg::Optional(children) => children.last().map(|n| n.span()), 
197                        })
198                        .map(|s| cmd_span.merge(s))
199                        .unwrap_or(cmd_span);
200                    nodes.push(Node::Command { name, args, span: full_span });
201                }
202
203                TokenKind::BeginGroup => {
204                    let open_span = tok.span;
205                    let children = self.parse_nodes(|k| matches!(k, TokenKind::EndGroup));
206                    if self.peek_kind() == Some(&TokenKind::EndGroup) {
207                        let close = self.bump().unwrap();
208                        nodes.push(Node::Group(children, open_span.merge(close.span)));
209                    } else {
210                        // Unclosed group - record the error, keep what we parsed.
211                        self.errors.push(
212                            Diagnostic::error("E020", "unclosed '{'")
213                                .with_span(diag_span(open_span)),
214                        );
215                        nodes.push(Node::Group(children, open_span));
216                    }
217                }
218                
219                TokenKind::MathShift => {
220                    let open_span = tok.span;
221                    let children = self.parse_nodes(|k| matches!(k, TokenKind::MathShift));
222                    if self.peek_kind() == Some(&TokenKind::MathShift) {
223                        let close = self.bump().unwrap();
224                        nodes.push(Node::Math(children, open_span.merge(close.span)));
225                    } else {
226                        self.errors.push(
227                            Diagnostic::error("E030", "unclosed '$' (math mode)")
228                                .with_span(diag_span(open_span)),
229                        );
230                        nodes.push(Node::Math(children, open_span));
231                    }
232                }
233                // Everything else is left unhandled for now so skip it.
234                _ => {}
235            }
236        }
237
238        nodes
239    }
240    /// Consume all immediately following `[...] and `{ ... }` groups as args.
241    ///
242    /// TeX commands pick up their arguments greedily; we skip spaces between
243    /// the command name and each argument to match TeX's behaviour. The loop
244    /// stops at the first token that is neither `[` nor `{`.
245    fn parse_args(&mut self) -> Vec<Arg> {
246        let mut args = Vec::new();
247        
248        loop {
249            // Skip spaces between the command and its next argument.
250            if self.peek_kind() == Some(&TokenKind::Space) {
251                self.bump();
252            }
253
254            match self.peek_kind() {
255                Some(&TokenKind::BeginGroup) => args.push(self.parse_mandatory_arg()),
256                Some(&TokenKind::Char('[')) => args.push(self.parse_optional_arg()),
257                _ => break,
258            }
259        }
260        args
261    }    
262
263    fn parse_mandatory_arg(&mut self) -> Arg {
264        // Consume the opening brace, remembering its span for diagnostics.
265        let open_span = self.bump().unwrap().span;
266        let children = self.parse_nodes(|k| matches!(k, TokenKind::EndGroup));
267        if self.peek_kind() == Some(&TokenKind::EndGroup) {
268            self.bump();
269        } else {
270            self.errors.push(
271                Diagnostic::error("E021","unclosed mandatory argument")
272                    .with_span(diag_span(open_span)),
273            );
274        }
275        Arg::Mandatory(children)
276    }
277
278    fn parse_optional_arg(&mut self) -> Arg {
279        // Consume the opening `[`, remembering its span for diagnostics.
280        let open_span = self.bump().unwrap().span;
281        let children = self.parse_nodes(|k| matches!(k, TokenKind::Char(']')));
282        if self.peek_kind() == Some(&TokenKind::Char(']')) {
283            self.bump();
284        } else {
285            self.errors.push(
286                Diagnostic::error("E022","unclosed optional argument")
287                    .with_span(diag_span(open_span)),
288            );
289        }
290        Arg::Optional(children)
291    }
292    
293    /// Append a character to the last `Text` node, or start a new one.
294    fn push_char(&self, nodes: &mut Vec<Node>, c: char, span: Span) {
295        match nodes.last_mut() {
296            Some(Node::Text(s, existing)) => {
297                s.push(c);
298                *existing = existing.merge(span);
299            }
300            _ => nodes.push(Node::Text(c.to_string(), span)),
301        }
302    }
303}
304
305
306
307// Tests
308
309#[cfg(test)]
310mod tests {
311    use super::*;
312    use oxyl_lexer::Lexer;
313
314    fn parse(src: &str) -> ParseResult {
315        let tokens = Lexer::new(src).tokenise().tokens;
316        Parser::new(tokens).parse()
317    }
318
319    fn first_command(src: &str) -> (String, Vec<Arg>) {
320        let r = parse(src);
321        for node in &r.document.body {
322            if let Node::Command { name, args, .. } = node {
323                return (name.clone(), args.clone());
324            }
325        }
326        panic!("no command found in: {src}");
327    }
328
329    #[test]
330    fn command_no_args() {
331        let (name, args) = first_command("\\LaTeX");
332        assert_eq!(name, "LaTeX");
333        assert!(args.is_empty());
334    }
335
336    #[test]
337    fn command_one_mandatory_arg() {
338        let (name, args) = first_command("\\textbf{hello}");
339        assert_eq!(name, "textbf");
340        assert_eq!(args.len(), 1);
341        assert!(matches!(&args[0], Arg::Mandatory(children)
342            if matches!(&children[0], Node::Text(s, _) if s == "hello")));
343    }
344
345    #[test]
346    fn command_two_mandatory_args() {
347        let (name, args) = first_command("\\frac{a}{b}");
348        assert_eq!(name, "frac");
349        assert_eq!(args.len(), 2);
350    }
351    
352    #[test]
353    fn unclosed_arg_produces_error() {
354        let r = parse("\\cmd{oops");
355        assert!(!r.errors.is_empty());
356    }
357
358    #[test]
359    fn paragraph_break_still_works() {
360        let r = parse("line one\n\nline two");
361        let has_par = r.document.body.iter().any(|n| matches!(n, Node::ParagraphBreak(_)));
362        assert!(has_par);
363    }
364
365    #[test]
366    fn nested_command_in_arg() {
367        let r = parse("\\outer{\\inner{x}}");
368        assert!(r.errors.is_empty());
369        if let Node::Command { args, .. } = &r.document.body[0] {
370            if let Arg::Mandatory(inner) = &args[0] {
371                assert!(matches!(&inner[0], Node::Command { name, .. } if name == "inner"));
372            } else { panic!("expected mandatory arg"); }
373        } else { panic!("expected command"); }
374    }
375
376    #[test]
377    fn command_with_optional_arg() {
378        let (name, args) = first_command("\\sqrt[3]{27}");
379        assert_eq!(name, "sqrt");
380        assert_eq!(args.len(), 2);
381        assert!(matches!(&args[0], Arg::Optional(children)
382            if matches!(&children[0], Node::Text(s, _) if s == "3")));
383        assert!(matches!(&args[1], Arg::Mandatory(children)
384            if matches!(&children[0], Node::Text(s, _) if s == "27")));
385    }
386
387    #[test]
388    fn command_with_only_optional_arg() {
389        let (name, args) = first_command("\\foo[opt]");
390        assert_eq!(name, "foo");
391        assert_eq!(args.len(), 1);
392        assert!(matches!(&args[0], Arg::Optional(_)));
393    }
394
395    #[test]
396    fn optional_then_two_mandatory() {
397        // two diff types of option + ordering 
398        let (_, args) = first_command("\\section[short]{long}{extra}");
399        assert_eq!(args.len(), 3);
400        assert!(matches!(&args[0], Arg::Optional(_)));
401        assert!(matches!(&args[1], Arg::Mandatory(_)));
402        assert!(matches!(&args[2], Arg::Mandatory(_)));
403    }
404
405    #[test]
406    fn unclosed_optional_arg_produces_error() {
407        let r = parse("\\cmd[oops");
408        assert!(!r.errors.is_empty());
409    }
410
411    #[test]
412    fn bracket_outside_command_is_text() {
413        // A `'[` not directly after a control sequence is just ordinary text.
414        let r = parse("hello [world]");
415        assert!(r.errors.is_empty());
416        assert!(matches!(&r.document.body[0], Node::Text(s, _) if s == "hello [world]"));
417    }
418
419    #[test]
420    fn inline_math_simple() {
421        let r = parse("$x+1$");
422        assert!(r.errors.is_empty());
423        assert_eq!(r.document.body.len(), 1);
424        assert!(matches!(&r.document.body[0], Node::Math(children, _)
425            if matches!(&children[0], Node::Text(s, _) if s == "x+1")));
426    }
427
428    #[test]
429    fn inline_math_with_command() {
430        let r = parse("$\\alpha + \\beta$");
431        assert!(r.errors.is_empty());
432        if let Node::Math(children, _) = &r.document.body[0] {
433            let names: Vec<_> = children.iter().filter_map(|n| match n {
434                Node::Command { name, .. } => Some(name.as_str()),
435                _ => None, 
436            }).collect();
437            assert_eq!(names, vec!["alpha", "beta"]);
438        } else {
439            panic!("expected math node");
440        }
441    }
442
443    #[test]
444    fn unclosed_math_produces_error() {
445        let r = parse("text $oops");
446        assert!(!r.errors.is_empty());
447    }
448    
449    #[test]
450    fn parser_errors_carry_spans() {
451        // Every parser error must point at the offending opener so the CLI 
452        // can render the location from the diagnostic span instead of
453        // picking it ouf the message text.
454        let cases = [
455            "\\cmd{oops", // E021
456            "\\cmd[oops", // E022
457            "{", // E020
458            "$oops", // E030
459        ];
460        for src in cases {
461            let r = parse(src);
462            assert!(!r.errors.is_empty(), "expected error for {src:?}");
463            for e in &r.errors {
464                assert!(e.span.is_some(), "error for {src:?} has no span: {e:?}");
465            }
466        }
467    }
468
469    #[test]
470    fn math_after_text() {
471        let r = parse("hello $x$");
472        assert!(r.errors.is_empty());
473        assert_eq!(r.document.body.len(), 2);
474        assert!(matches!(&r.document.body[0], Node::Text(s, _) if s == "hello "));
475        assert!(matches!(&r.document.body[1], Node::Math(_, _)));
476    }
477
478    #[test]
479    fn display_math_simple() {
480        let r = parse("\\[x+1\\]");
481        assert!(r.errors.is_empty(), "{:?}", r.errors);
482        assert_eq!(r.document.body.len(), 1);
483        assert!(matches!(&r.document.body[0], Node::DisplayMath(children, _)
484            if matches!(&children[0], Node::Text(s, _) if s == "x+1")));
485    }
486
487    #[test]
488    fn display_math_with_command() {
489        let r = parse("\\[ \\sum_{i=0}^n i \\]");
490        assert!(r.errors.is_empty(), "{:?}", r.errors);
491        assert!(matches!(&r.document.body[0], Node::DisplayMath(_, _)));
492    }
493
494    #[test]
495    fn unclosed_display_math_produces_error() {
496        let r = parse("\\[ a + b");
497        assert!(r.errors.iter().any(|e| e.code == "E031"));
498    }
499
500    #[test]
501    fn stray_close_display_math_produces_error() {
502        let r = parse("oops \\] more");
503        assert!(r.errors.iter().any(|e| e.code == "E032"));
504    }
505}