Skip to main content

oxyl_parser/
lib.rs

// oxyl-parser
//
// Builds a Document of Nodes from the lexer's token stream.
//
// - Commands greedily pick up [...] and {...} until the next token
//   is neither of those.
// - A pair of $ tokens wraps a math node, whose children are parsed
//   with the same function as ordinary text.
// - Every error carries a DiagSpan pointing at the token that triggered it
//   (the unmatched bracket or dollar sign) so the CLI can render source
//   context without having to extract it from the message string :D
// TODO: math-specific structure (atoms, scripts, etc.) and display math (\[ \]).
13
14
15use oxyl_diagnostics::{DiagSpan, Diagnostic};
16use oxyl_lexer::{Span, Token, TokenKind};
17
18fn diag_span(s: Span) -> DiagSpan {
19    DiagSpan::new(s.start, s.end)
20}
21// --- 
22// AST Types 
23//
24
25/// The root of a parsed LaTeX document.
26///
27/// For now we do not distinguish preamble from body - everything lands in 
28/// `body`. Will add that split when handling for `\begin{document}` is done.
29#[derive(Debug, Clone)]
30pub struct Document {
31    pub body: Vec<Node>,
32}
33
34/// A single node in the LaTeX AST.
35#[derive(Debug, Clone)]
36pub enum Node {
37    /// A run of plain text characters
38    Text(String, Span),
39
40    /// A blank line in the source - signals a paragraph break.
41    ParagraphBreak(Span),
42
43    /// A LaTeX command and its arguments, e.g. `\textbf{hello}`.
44    Command {
45        /// Name without the leading backslash, e.g. `"textbf"`.
46        name: String ,
47        args: Vec<Arg>,
48        span: Span,
49    },
50
51    /// A braced group `{...}`.
52    Group(Vec<Node>, Span),
53    
54    /// Inline match: `$ ... $`. The span covers both `$` delimiters.
55    Math(Vec<Node>, Span),
56}
57
58impl Node {
59    pub fn span(&self) -> Span {
60        match self {
61            Node::Text(_,s) => *s,
62            Node::ParagraphBreak(s) => *s,
63            Node::Command { span, .. } => *span,
64            Node::Group(_, s) => *s,
65            Node::Math(_, s) => *s,
66        }
67    }
68}
69
70/// A single argument to a command or environment 
71#[derive(Debug, Clone)]
72pub enum Arg {
73    Mandatory(Vec<Node>),
74    Optional(Vec<Node>),
75}
76
77// --- 
78// Parser Result 
79// --- 
80
81/// Returned by [`Parser::parse`]. The document is always produced; errors 
82/// are collected alongside it so the caller sees everything at once.
83#[derive(Debug)]
84pub struct ParseResult {
85    pub document: Document,
86    pub errors: Vec<Diagnostic>,
87}
88
89// --- 
90// Parser 
91// --- 
92
93pub struct Parser {
94    tokens: Vec<Token>,
95    pos: usize,
96    errors: Vec<Diagnostic>,
97}
98
99impl Parser {
100    pub fn new(tokens: Vec<Token>) -> Self {
101        Self { tokens, pos: 0, errors: Vec::new() }
102    }
103    
104    /// Parse the token stream.
105    pub fn parse(mut self) -> ParseResult {
106        let body = self.parse_nodes(None);
107        ParseResult { document: Document { body }, errors: self.errors }
108    }
109
110    fn peek(&self) -> Option<&Token> {
111        self.tokens.get(self.pos)
112    }
113
114    fn peek_kind(&self) -> Option<&TokenKind> {
115        self.peek().map(|t| &t.kind)
116    }
117
118    fn bump(&mut self) -> Option<Token> {
119        if self.pos < self.tokens.len() {
120            let tok = self.tokens[self.pos].clone();
121            self.pos += 1;
122            Some(tok)
123        } else {
124            None
125        }
126    }
127
128    /// Parse nodes until the token stream ends or `stop` matches. 
129    ///
130    /// `stop` is used by the group parser to halt at `}`.
131    fn parse_nodes(&mut self, stop: Option<&TokenKind>) -> Vec<Node> {
132        let mut nodes: Vec<Node> = Vec::new();
133        
134        loop {
135            match self.peek() {
136                None => break,
137                Some(tok) if stop.map_or(false, |s| &tok.kind == s) => break,
138                _ => {}
139            }
140
141            let tok = self.bump().unwrap();
142
143            match tok.kind {
144                TokenKind::Char(c) => self.push_char(&mut nodes, c, tok.span),
145                TokenKind::Space => self.push_char(&mut nodes, ' ', tok.span),
146
147                TokenKind::ParagraphBreak => nodes.push(Node::ParagraphBreak(tok.span)),
148
149                TokenKind::ControlSeq(name) => {
150                    let cmd_span = tok.span; 
151                    let args = self.parse_args();
152                    // Extend the span to cover the last argument. 
153                    let full_span = args.last()
154                        .and_then(|a| match a {
155                            Arg::Mandatory(children) => children.last().map(|n| n.span()),
156                            Arg::Optional(children) => children.last().map(|n| n.span()), 
157                        })
158                        .map(|s| cmd_span.merge(s))
159                        .unwrap_or(cmd_span);
160                    nodes.push(Node::Command { name, args, span: full_span });
161                }
162
163                TokenKind::BeginGroup => {
164                    let open_span = tok.span;
165                    let children = self.parse_nodes(Some(&TokenKind::EndGroup));
166                    if self.peek_kind() == Some(&TokenKind::EndGroup) {
167                        let close = self.bump().unwrap();
168                        nodes.push(Node::Group(children, open_span.merge(close.span)));
169                    } else {
170                        // Unclosed group - record the error, keep what we parsed.
171                        self.errors.push(
172                            Diagnostic::error("E020", "unclosed '{'")
173                                .with_span(diag_span(open_span)),
174                        );
175                        nodes.push(Node::Group(children, open_span));
176                    }
177                }
178                
179                TokenKind::MathShift => {
180                    let open_span = tok.span;
181                    let children = self.parse_nodes(Some(&TokenKind::MathShift));
182                    if self.peek_kind() == Some(&TokenKind::MathShift) {
183                        let close = self.bump().unwrap();
184                        nodes.push(Node::Math(children, open_span.merge(close.span)));
185                    } else {
186                        self.errors.push(
187                            Diagnostic::error("E030", "unclosed '$' (math mode)")
188                                .with_span(diag_span(open_span)),
189                        );
190                    }
191                }
192                // Everything else is left unhandled for now so skip it.
193                _ => {}
194            }
195        }
196
197        nodes
198    }
199    /// Consume all immediately following `[...] and `{ ... }` groups as args.
200    ///
201    /// TeX commands pick up their arguments greedily; we skip spaces between
202    /// the command name and each argument to match TeX's behaviour. The loop
203    /// stops at the first token that is neither `[` nor `{`.
204    fn parse_args(&mut self) -> Vec<Arg> {
205        let mut args = Vec::new();
206        
207        loop {
208            // Skip spaces between the command and its next argument.
209            if self.peek_kind() == Some(&TokenKind::Space) {
210                self.bump();
211            }
212
213            match self.peek_kind() {
214                Some(&TokenKind::BeginGroup) => args.push(self.parse_mandatory_arg()),
215                Some(&TokenKind::Char('[')) => args.push(self.parse_optional_arg()),
216                _ => break,
217            }
218        }
219        args
220    }    
221
222    fn parse_mandatory_arg(&mut self) -> Arg {
223        // Consume the opening brace, remembering its span for diagnostics.
224        let open_span = self.bump().unwrap().span;
225        let children = self.parse_nodes(Some(&TokenKind::EndGroup));
226        if self.peek_kind() == Some(&TokenKind::EndGroup) {
227            self.bump();
228        } else {
229            self.errors.push(
230                Diagnostic::error("E021","unclosed mandatory argument")
231                    .with_span(diag_span(open_span)),
232            );
233        }
234        Arg::Mandatory(children)
235    }
236
237    fn parse_optional_arg(&mut self) -> Arg {
238        // Consume the opening `[`, remembering its span for diagnostics.
239        let open_span = self.bump().unwrap().span;
240        let children = self.parse_nodes(Some(&TokenKind::Char(']')));
241        if self.peek_kind() == Some(&TokenKind::Char(']')) {
242            self.bump();
243        } else {
244            self.errors.push(
245                Diagnostic::error("E022","unclosed optional argument")
246                    .with_span(diag_span(open_span)),
247            );
248        }
249        Arg::Optional(children)
250    }
251    
252    /// Append a character to the last `Text` node, or start a new one.
253    fn push_char(&self, nodes: &mut Vec<Node>, c: char, span: Span) {
254        match nodes.last_mut() {
255            Some(Node::Text(s, existing)) => {
256                s.push(c);
257                *existing = existing.merge(span);
258            }
259            _ => nodes.push(Node::Text(c.to_string(), span)),
260        }
261    }
262}
263
264
265
266// Tests
267
#[cfg(test)]
mod tests {
    use super::*;
    use oxyl_lexer::Lexer;

    /// Lex `src` and run the parser over the resulting tokens.
    fn parse(src: &str) -> ParseResult {
        let lexed = Lexer::new(src).tokenise();
        Parser::new(lexed.tokens).parse()
    }

    /// Name and arguments of the first top-level command in `src`.
    ///
    /// Panics when the parsed body contains no command.
    fn first_command(src: &str) -> (String, Vec<Arg>) {
        parse(src)
            .document
            .body
            .into_iter()
            .find_map(|node| match node {
                Node::Command { name, args, .. } => Some((name, args)),
                _ => None,
            })
            .unwrap_or_else(|| panic!("no command found in: {src}"))
    }

    /// True when the first node of `nodes` is text equal to `expected`.
    fn starts_with_text(nodes: &[Node], expected: &str) -> bool {
        matches!(&nodes[0], Node::Text(s, _) if s == expected)
    }

    #[test]
    fn command_no_args() {
        let (name, args) = first_command("\\LaTeX");
        assert_eq!(name, "LaTeX");
        assert!(args.is_empty());
    }

    #[test]
    fn command_one_mandatory_arg() {
        let (name, args) = first_command("\\textbf{hello}");
        assert_eq!(name, "textbf");
        assert_eq!(args.len(), 1);
        assert!(matches!(&args[0], Arg::Mandatory(children)
                if starts_with_text(children, "hello")));
    }

    #[test]
    fn command_two_mandatory_args() {
        let (name, args) = first_command("\\frac{a}{b}");
        assert_eq!(name, "frac");
        assert_eq!(args.len(), 2);
    }

    #[test]
    fn unclosed_arg_produces_error() {
        let result = parse("\\cmd{oops");
        assert!(!result.errors.is_empty());
    }

    #[test]
    fn paragraph_break_still_works() {
        let result = parse("line one\n\nline two");
        let has_par = result
            .document
            .body
            .iter()
            .any(|node| matches!(node, Node::ParagraphBreak(_)));
        assert!(has_par);
    }

    #[test]
    fn nested_command_in_arg() {
        let result = parse("\\outer{\\inner{x}}");
        assert!(result.errors.is_empty());
        let args = match &result.document.body[0] {
            Node::Command { args, .. } => args,
            _ => panic!("expected command"),
        };
        let inner = match &args[0] {
            Arg::Mandatory(inner) => inner,
            _ => panic!("expected mandatory arg"),
        };
        assert!(matches!(&inner[0], Node::Command { name, .. } if name == "inner"));
    }

    #[test]
    fn command_with_optional_arg() {
        let (name, args) = first_command("\\sqrt[3]{27}");
        assert_eq!(name, "sqrt");
        assert_eq!(args.len(), 2);
        assert!(matches!(&args[0], Arg::Optional(children)
                if starts_with_text(children, "3")));
        assert!(matches!(&args[1], Arg::Mandatory(children)
                if starts_with_text(children, "27")));
    }

    #[test]
    fn command_with_only_optional_arg() {
        let (name, args) = first_command("\\foo[opt]");
        assert_eq!(name, "foo");
        assert_eq!(args.len(), 1);
        assert!(matches!(&args[0], Arg::Optional(_)));
    }

    #[test]
    fn optional_then_two_mandatory() {
        // Both argument kinds on one command, checked in order.
        let (_, args) = first_command("\\section[short]{long}{extra}");
        assert_eq!(args.len(), 3);
        assert!(matches!(&args[0], Arg::Optional(_)));
        assert!(matches!(&args[1], Arg::Mandatory(_)));
        assert!(matches!(&args[2], Arg::Mandatory(_)));
    }

    #[test]
    fn unclosed_optional_arg_produces_error() {
        let result = parse("\\cmd[oops");
        assert!(!result.errors.is_empty());
    }

    #[test]
    fn bracket_outside_command_is_text() {
        // A `[` not directly after a control sequence is just ordinary text.
        let result = parse("hello [world]");
        assert!(result.errors.is_empty());
        assert!(matches!(&result.document.body[0],
                Node::Text(s, _) if s == "hello [world]"));
    }

    #[test]
    fn inline_math_simple() {
        let result = parse("$x+1$");
        assert!(result.errors.is_empty());
        assert_eq!(result.document.body.len(), 1);
        assert!(matches!(&result.document.body[0], Node::Math(children, _)
                if starts_with_text(children, "x+1")));
    }

    #[test]
    fn inline_math_with_command() {
        let result = parse("$\\alpha + \\beta$");
        assert!(result.errors.is_empty());
        let children = match &result.document.body[0] {
            Node::Math(children, _) => children,
            _ => panic!("expected math node"),
        };
        let mut names: Vec<&str> = Vec::new();
        for child in children {
            if let Node::Command { name, .. } = child {
                names.push(name.as_str());
            }
        }
        assert_eq!(names, vec!["alpha", "beta"]);
    }

    #[test]
    fn unclosed_math_produces_error() {
        let result = parse("text $oops");
        assert!(!result.errors.is_empty());
    }

    #[test]
    fn parser_errors_carry_spans() {
        let cases = [
            "\\cmd{oops", // E021
            "\\cmd[oops", // E022
            "{",          // E020
            "$oops",      // E030
        ];
        for src in cases {
            let result = parse(src);
            assert!(!result.errors.is_empty(), "expected error for {src:?}");
            for e in &result.errors {
                assert!(e.span.is_some(), "error for {src:?} has no span: {e:?}");
            }
        }
    }

    #[test]
    fn math_after_text() {
        let result = parse("hello $x$");
        assert!(result.errors.is_empty());
        let body = &result.document.body;
        assert_eq!(body.len(), 2);
        assert!(matches!(&body[0], Node::Text(s, _) if s == "hello "));
        assert!(matches!(&body[1], Node::Math(_, _)));
    }
}
433}