Skip to main content

oxyl_parser/
lib.rs

1// oxyl-parser
2// TODO - put all this in docs
3// Builds a Document of Nodes from the lexer's token stream.
4//
5// - Commands greedily pick up [...] and {...} until the next token 
6// is neither of those.
7// - A pair of $ tokens wraps a math node.
8// - A \[...\] pair wraps a display math node. Inline and display math children 
9// are parsed with the same machinery as ordinairy text;
10// will do atoms and scripts later - TODO
11// - Comments are preserved so source fidelity tools ie formatters and
12// linters can round-trip them!!
13//  - Every error carries a DiagSpan poiting at the token that triggered it 
14//  (the unmatched bracket or dollar sign) so the cli can render source 
15//  context without having to extract it from the message string :D
16
17
18use oxyl_diagnostics::{DiagSpan, Diagnostic};
19use oxyl_lexer::{Span, Token, TokenKind};
20
21fn diag_span(s: Span) -> DiagSpan {
22    DiagSpan::new(s.start, s.end)
23}
24
25/// Stop predicate for `parse_nodes` when scanning the body of `\[ ... \]`.
26fn is_display_math_close(k: &TokenKind) -> bool {
27    matches!(k, TokenKind::ControlSeq(s) if s == "]")
28}
29
30// --- 
31// AST Types 
32//
33
34/// The root of a parsed LaTeX document.
35///
36/// For now we do not distinguish preamble from body - everything lands in 
37/// `body`. Will add that split when handling for `\begin{document}` is done.
38#[derive(Debug, Clone)]
39pub struct Document {
40    pub body: Vec<Node>,
41}
42
43/// A single node in the LaTeX AST.
44#[derive(Debug, Clone)]
45pub enum Node {
46    /// A run of plain text characters
47    Text(String, Span),
48
49    /// A blank line in the source - signals a paragraph break.
50    ParagraphBreak(Span),
51
52    /// A LaTeX command and its arguments, e.g. `\textbf{hello}`.
53    Command {
54        /// Name without the leading backslash, e.g. `"textbf"`.
55        name: String ,
56        args: Vec<Arg>,
57        span: Span,
58    },
59
60    /// A braced group `{...}`.
61    Group(Vec<Node>, Span),
62    
63    /// Inline match: `$ ... $`. The span covers both `$` delimiters.
64    Math(Vec<Node>, Span),
65
66    /// Display math: `\[ ... \]`. The span covers both delimiters.
67    DisplayMath(Vec<Node>, Span),
68
69    /// A `% ...` line comment. THe string is the body without the leading 
70    /// `%` and without the trailing newline - the span covers the whole 
71    /// run, including both. Comments in AST since they can actually affect produced PDF.
72    Comment(String, Span),
73}
74
75impl Node {
76    pub fn span(&self) -> Span {
77        match self {
78            Node::Text(_, s) => *s,
79            Node::ParagraphBreak(s) => *s,
80            Node::Command { span, .. } => *span,
81            Node::Group(_, s) => *s,
82            Node::Math(_, s) => *s,
83            Node::DisplayMath(_, s) => *s,
84            Node::Comment(_, s) => *s,
85        }
86    }
87}
88
89/// A single argument to a command or environment 
90#[derive(Debug, Clone)]
91pub enum Arg {
92    Mandatory(Vec<Node>),
93    Optional(Vec<Node>),
94}
95
96// --- 
97// Parser Result 
98// --- 
99
100/// Returned by [`Parser::parse`]. The document is always produced; errors 
101/// are collected alongside it so the caller sees everything at once.
102#[derive(Debug)]
103pub struct ParseResult {
104    pub document: Document,
105    pub errors: Vec<Diagnostic>,
106}
107
108// --- 
109// Parser 
110// --- 
111
112pub struct Parser {
113    tokens: Vec<Token>,
114    pos: usize,
115    errors: Vec<Diagnostic>,
116}
117
118impl Parser {
119    pub fn new(tokens: Vec<Token>) -> Self {
120        Self { tokens, pos: 0, errors: Vec::new() }
121    }
122    
123    /// Parse the token stream.
124    pub fn parse(mut self) -> ParseResult {
125        let body = self.parse_nodes(|_| false);
126        ParseResult { document: Document { body }, errors: self.errors }
127    }
128
129    fn peek(&self) -> Option<&Token> {
130        self.tokens.get(self.pos)
131    }
132
133    fn peek_kind(&self) -> Option<&TokenKind> {
134        self.peek().map(|t| &t.kind)
135    }
136
137    fn bump(&mut self) -> Option<Token> {
138        if self.pos < self.tokens.len() {
139            let tok = self.tokens[self.pos].clone();
140            self.pos += 1;
141            Some(tok)
142        } else {
143            None
144        }
145    }
146
147    /// Parse a run of nodes until the token stream is exhausted or 
148    /// `stop` returns true for the next token's kind. The stopping token is 
149    /// left unconsumed so it can be examined and bumped by the caller !
150    ///
151    /// `stop` is used by the group parser to halt at `}` - it is a function pointer 
152    /// rather than an `impl Fn` so the recursive calls don't blow up the parser.
153    fn parse_nodes(&mut self, stop: fn(&TokenKind) -> bool) -> Vec<Node> {
154        let mut nodes: Vec<Node> = Vec::new();
155        
156        loop {
157            match self.peek() {
158                None => break,
159                Some(tok) if stop(&tok.kind) => break,
160                _ => {}
161            }
162
163            let tok = self.bump().unwrap();
164
165            match tok.kind {
166                TokenKind::Char(c) => self.push_char(&mut nodes, c, tok.span),
167                TokenKind::Space => self.push_char(&mut nodes, ' ', tok.span),
168
169                TokenKind::ParagraphBreak => {
170                    nodes.push(Node::ParagraphBreak(tok.span));
171                }
172                
173                TokenKind::Comment(body) => {
174                    nodes.push(Node::Comment(body, tok.span));
175                }
176
177                // `\[` opens display math. 
178                TokenKind::ControlSeq(ref name) if name == "[" => {
179                    let open_span = tok.span;
180                    let children = self.parse_nodes(is_display_math_close);
181                    if matches!(self.peek_kind(), Some(TokenKind::ControlSeq(s)) if s == "]") {
182                        let close = self.bump().unwrap();
183                        nodes.push(Node::DisplayMath(children, open_span.merge(close.span)));
184                    } else {
185                        self.errors.push(
186                            Diagnostic::error("E031", "unclosed '\\[' (display math)")
187                                .with_span(diag_span(open_span)),
188                        );
189                        nodes.push(Node::DisplayMath(children, open_span));
190                    }
191                }
192
193                // A bare `\]` outside display math is a stray closer.
194                TokenKind::ControlSeq(ref name) if name == "]" => {
195                    self.errors.push(
196                        Diagnostic::error("E032", "stray '\\]' (no matching '\\[')")
197                            .with_span(diag_span(tok.span)),
198                    );
199                }
200
201                TokenKind::ControlSeq(name) => {
202                    let cmd_span = tok.span; 
203                    let args = self.parse_args();
204                    // Extend the span to cover the last argument. 
205                    let full_span = args.last()
206                        .and_then(|a| match a {
207                            Arg::Mandatory(children) => children.last().map(|n| n.span()),
208                            Arg::Optional(children) => children.last().map(|n| n.span()), 
209                        })
210                        .map(|s| cmd_span.merge(s))
211                        .unwrap_or(cmd_span);
212                    nodes.push(Node::Command { name, args, span: full_span });
213                }
214
215                TokenKind::BeginGroup => {
216                    let open_span = tok.span;
217                    let children = self.parse_nodes(|k| matches!(k, TokenKind::EndGroup));
218                    if self.peek_kind() == Some(&TokenKind::EndGroup) {
219                        let close = self.bump().unwrap();
220                        nodes.push(Node::Group(children, open_span.merge(close.span)));
221                    } else {
222                        // Unclosed group - record the error, keep what we parsed.
223                        self.errors.push(
224                            Diagnostic::error("E020", "unclosed '{'")
225                                .with_span(diag_span(open_span)),
226                        );
227                        nodes.push(Node::Group(children, open_span));
228                    }
229                }
230                
231                TokenKind::MathShift => {
232                    let open_span = tok.span;
233                    let children = self.parse_nodes(|k| matches!(k, TokenKind::MathShift));
234                    if self.peek_kind() == Some(&TokenKind::MathShift) {
235                        let close = self.bump().unwrap();
236                        nodes.push(Node::Math(children, open_span.merge(close.span)));
237                    } else {
238                        self.errors.push(
239                            Diagnostic::error("E030", "unclosed '$' (math mode)")
240                                .with_span(diag_span(open_span)),
241                        );
242                        nodes.push(Node::Math(children, open_span));
243                    }
244                }
245                // Everything else is left unhandled for now so skip it.
246                _ => {}
247            }
248        }
249
250        nodes
251    }
252    /// Consume all immediately following `[...] and `{ ... }` groups as args.
253    ///
254    /// TeX commands pick up their arguments greedily; we skip spaces between
255    /// the command name and each argument to match TeX's behaviour. The loop
256    /// stops at the first token that is neither `[` nor `{`.
257    fn parse_args(&mut self) -> Vec<Arg> {
258        let mut args = Vec::new();
259        
260        loop {
261            // Skip spaces between the command and its next argument.
262            if self.peek_kind() == Some(&TokenKind::Space) {
263                self.bump();
264            }
265
266            match self.peek_kind() {
267                Some(&TokenKind::BeginGroup) => args.push(self.parse_mandatory_arg()),
268                Some(&TokenKind::Char('[')) => args.push(self.parse_optional_arg()),
269                _ => break,
270            }
271        }
272        args
273    }    
274
275    fn parse_mandatory_arg(&mut self) -> Arg {
276        // Consume the opening brace, remembering its span for diagnostics.
277        let open_span = self.bump().unwrap().span;
278        let children = self.parse_nodes(|k| matches!(k, TokenKind::EndGroup));
279        if self.peek_kind() == Some(&TokenKind::EndGroup) {
280            self.bump();
281        } else {
282            self.errors.push(
283                Diagnostic::error("E021","unclosed mandatory argument")
284                    .with_span(diag_span(open_span)),
285            );
286        }
287        Arg::Mandatory(children)
288    }
289
290    fn parse_optional_arg(&mut self) -> Arg {
291        // Consume the opening `[`, remembering its span for diagnostics.
292        let open_span = self.bump().unwrap().span;
293        let children = self.parse_nodes(|k| matches!(k, TokenKind::Char(']')));
294        if self.peek_kind() == Some(&TokenKind::Char(']')) {
295            self.bump();
296        } else {
297            self.errors.push(
298                Diagnostic::error("E022","unclosed optional argument")
299                    .with_span(diag_span(open_span)),
300            );
301        }
302        Arg::Optional(children)
303    }
304    
305    /// Append a character to the last `Text` node, or start a new one.
306    fn push_char(&self, nodes: &mut Vec<Node>, c: char, span: Span) {
307        match nodes.last_mut() {
308            Some(Node::Text(s, existing)) => {
309                s.push(c);
310                *existing = existing.merge(span);
311            }
312            _ => nodes.push(Node::Text(c.to_string(), span)),
313        }
314    }
315}
316
317
318
319// Tests
320
321#[cfg(test)]
322mod tests {
323    use super::*;
324    use oxyl_lexer::Lexer;
325
326    fn parse(src: &str) -> ParseResult {
327        let tokens = Lexer::new(src).tokenise().tokens;
328        Parser::new(tokens).parse()
329    }
330
331    fn first_command(src: &str) -> (String, Vec<Arg>) {
332        let r = parse(src);
333        for node in &r.document.body {
334            if let Node::Command { name, args, .. } = node {
335                return (name.clone(), args.clone());
336            }
337        }
338        panic!("no command found in: {src}");
339    }
340
341    #[test]
342    fn command_no_args() {
343        let (name, args) = first_command("\\LaTeX");
344        assert_eq!(name, "LaTeX");
345        assert!(args.is_empty());
346    }
347
348    #[test]
349    fn command_one_mandatory_arg() {
350        let (name, args) = first_command("\\textbf{hello}");
351        assert_eq!(name, "textbf");
352        assert_eq!(args.len(), 1);
353        assert!(matches!(&args[0], Arg::Mandatory(children)
354            if matches!(&children[0], Node::Text(s, _) if s == "hello")));
355    }
356
357    #[test]
358    fn command_two_mandatory_args() {
359        let (name, args) = first_command("\\frac{a}{b}");
360        assert_eq!(name, "frac");
361        assert_eq!(args.len(), 2);
362    }
363    
364    #[test]
365    fn unclosed_arg_produces_error() {
366        let r = parse("\\cmd{oops");
367        assert!(!r.errors.is_empty());
368    }
369
370    #[test]
371    fn paragraph_break_still_works() {
372        let r = parse("line one\n\nline two");
373        let has_par = r.document.body.iter().any(|n| matches!(n, Node::ParagraphBreak(_)));
374        assert!(has_par);
375    }
376
377    #[test]
378    fn nested_command_in_arg() {
379        let r = parse("\\outer{\\inner{x}}");
380        assert!(r.errors.is_empty());
381        if let Node::Command { args, .. } = &r.document.body[0] {
382            if let Arg::Mandatory(inner) = &args[0] {
383                assert!(matches!(&inner[0], Node::Command { name, .. } if name == "inner"));
384            } else { panic!("expected mandatory arg"); }
385        } else { panic!("expected command"); }
386    }
387
388    #[test]
389    fn command_with_optional_arg() {
390        let (name, args) = first_command("\\sqrt[3]{27}");
391        assert_eq!(name, "sqrt");
392        assert_eq!(args.len(), 2);
393        assert!(matches!(&args[0], Arg::Optional(children)
394            if matches!(&children[0], Node::Text(s, _) if s == "3")));
395        assert!(matches!(&args[1], Arg::Mandatory(children)
396            if matches!(&children[0], Node::Text(s, _) if s == "27")));
397    }
398
399    #[test]
400    fn command_with_only_optional_arg() {
401        let (name, args) = first_command("\\foo[opt]");
402        assert_eq!(name, "foo");
403        assert_eq!(args.len(), 1);
404        assert!(matches!(&args[0], Arg::Optional(_)));
405    }
406
407    #[test]
408    fn optional_then_two_mandatory() {
409        // two diff types of option + ordering 
410        let (_, args) = first_command("\\section[short]{long}{extra}");
411        assert_eq!(args.len(), 3);
412        assert!(matches!(&args[0], Arg::Optional(_)));
413        assert!(matches!(&args[1], Arg::Mandatory(_)));
414        assert!(matches!(&args[2], Arg::Mandatory(_)));
415    }
416
417    #[test]
418    fn unclosed_optional_arg_produces_error() {
419        let r = parse("\\cmd[oops");
420        assert!(!r.errors.is_empty());
421    }
422
423    #[test]
424    fn bracket_outside_command_is_text() {
425        // A `'[` not directly after a control sequence is just ordinary text.
426        let r = parse("hello [world]");
427        assert!(r.errors.is_empty());
428        assert!(matches!(&r.document.body[0], Node::Text(s, _) if s == "hello [world]"));
429    }
430
431    #[test]
432    fn inline_math_simple() {
433        let r = parse("$x+1$");
434        assert!(r.errors.is_empty());
435        assert_eq!(r.document.body.len(), 1);
436        assert!(matches!(&r.document.body[0], Node::Math(children, _)
437            if matches!(&children[0], Node::Text(s, _) if s == "x+1")));
438    }
439
440    #[test]
441    fn inline_math_with_command() {
442        let r = parse("$\\alpha + \\beta$");
443        assert!(r.errors.is_empty());
444        if let Node::Math(children, _) = &r.document.body[0] {
445            let names: Vec<_> = children.iter().filter_map(|n| match n {
446                Node::Command { name, .. } => Some(name.as_str()),
447                _ => None, 
448            }).collect();
449            assert_eq!(names, vec!["alpha", "beta"]);
450        } else {
451            panic!("expected math node");
452        }
453    }
454
455    #[test]
456    fn unclosed_math_produces_error() {
457        let r = parse("text $oops");
458        assert!(!r.errors.is_empty());
459    }
460    
461    #[test]
462    fn parser_errors_carry_spans() {
463        // Every parser error must point at the offending opener so the CLI 
464        // can render the location from the diagnostic span instead of
465        // picking it ouf the message text.
466        let cases = [
467            "\\cmd{oops", // E021
468            "\\cmd[oops", // E022
469            "{", // E020
470            "$oops", // E030
471        ];
472        for src in cases {
473            let r = parse(src);
474            assert!(!r.errors.is_empty(), "expected error for {src:?}");
475            for e in &r.errors {
476                assert!(e.span.is_some(), "error for {src:?} has no span: {e:?}");
477            }
478        }
479    }
480
481    #[test]
482    fn math_after_text() {
483        let r = parse("hello $x$");
484        assert!(r.errors.is_empty());
485        assert_eq!(r.document.body.len(), 2);
486        assert!(matches!(&r.document.body[0], Node::Text(s, _) if s == "hello "));
487        assert!(matches!(&r.document.body[1], Node::Math(_, _)));
488    }
489
490    #[test]
491    fn display_math_simple() {
492        let r = parse("\\[x+1\\]");
493        assert!(r.errors.is_empty(), "{:?}", r.errors);
494        assert_eq!(r.document.body.len(), 1);
495        assert!(matches!(&r.document.body[0], Node::DisplayMath(children, _)
496            if matches!(&children[0], Node::Text(s, _) if s == "x+1")));
497    }
498
499    #[test]
500    fn display_math_with_command() {
501        let r = parse("\\[ \\sum_{i=0}^n i \\]");
502        assert!(r.errors.is_empty(), "{:?}", r.errors);
503        assert!(matches!(&r.document.body[0], Node::DisplayMath(_, _)));
504    }
505
506    #[test]
507    fn unclosed_display_math_produces_error() {
508        let r = parse("\\[ a + b");
509        assert!(r.errors.iter().any(|e| e.code == "E031"));
510    }
511
512    #[test]
513    fn stray_close_display_math_produces_error() {
514        let r = parse("oops \\] more");
515        assert!(r.errors.iter().any(|e| e.code == "E032"));
516    }
517
518    #[test]
519    fn comment_preserved() {
520        let r = parse("% hello\nworld");
521        assert!(r.errors.is_empty());
522        assert!(matches!(&r.document.body[0], Node::Comment(s, _) if s == " hello"));
523        assert!(matches!(&r.document.body[1], Node::Text(s, _) if s == "world"));
524    }
525
526    #[test]
527    fn comment_inside_command_arg() {
528        let r = parse("\\textbf{foo % drop?\nbar}");
529        assert!(r.errors.is_empty(), "{:?}", r.errors);
530        if let Node::Command { args, .. } = &r.document.body[0] {
531            if let Arg::Mandatory(children) = &args[0] {
532                assert!(children.iter().any(|n| matches!(n, Node::Comment(_, _))));
533            } else { panic!("expected mandatory arg"); }
534        } else { panic!("expected command"); }
535    }
536}