Skip to main content

oxyl_parser/parser/
mod.rs

// The parser turns a flat token stream into a tree of Nodes.
// TODO - formalise all below and put in docs
// - commands greedily pick up [] (optional args) and also {}
//   (mandatory args) until they find a token that isn't either of these
// - a pair of $ tokens wraps a math node
// - a \[...\] pair wraps a display math node. inline and display math
//   children are parsed like ordinary text
//   TODO - atoms + scripts + operators !!
// - \begin{name} ... \end{name} produce an env node whose body is parsed
//   recursively so nested envs work. the first mandatory arg
//   after \begin is treated as the name, everything else
//   stays in args
// - comments are preserved as comment nodes for any source-fidelity
//   tools to utilise :)
// - active specials, i.e. & and ~, are align tab and tilde nodes. they have no
//   children of their own - downstream passes that care about
//   tabular layout or whatever can read them off the
//   node sequence directly
// - every error carries a diag span pointing at the token
//   that triggered the error so that the cli can render src
//   context directly from the span !!!!!!
22
23use oxyl_diagnostics::Diagnostic;
24use oxyl_lexer::{Span, Token, TokenKind};
25
26use crate::ast::{Arg, Document, Node};
27
28mod helpers;
29use helpers::{diag_span, find_env_name, is_display_math_close, is_end_control_seq};
30
31#[cfg(test)]
32mod tests;
33
34
35/// Returned by [`Parser::parse`]. The document is always produced; errors 
36/// are collected alongside it so the caller sees everything at once.
37#[derive(Debug)]
38pub struct ParseResult {
39    pub document: Document,
40    pub errors: Vec<Diagnostic>,
41}
42
43
44pub struct Parser {
45    tokens: Vec<Token>,
46    pos: usize,
47    errors: Vec<Diagnostic>,
48}
49
50impl Parser {
51    pub fn new(tokens: Vec<Token>) -> Self {
52        Self { tokens, pos: 0, errors: Vec::new() }
53    }
54    
55    /// Parse the token stream.
56    pub fn parse(mut self) -> ParseResult {
57        let body = self.parse_nodes(|_| false);
58        ParseResult { document: Document { body }, errors: self.errors }
59    }
60
61    fn peek(&self) -> Option<&Token> {
62        self.tokens.get(self.pos)
63    }
64
65    fn peek_kind(&self) -> Option<&TokenKind> {
66        self.peek().map(|t| &t.kind)
67    }
68
69    fn bump(&mut self) -> Option<Token> {
70        if self.pos < self.tokens.len() {
71            let tok = self.tokens[self.pos].clone();
72            self.pos += 1;
73            Some(tok)
74        } else {
75            None
76        }
77    }
78
79    /// Parse a run of nodes until the token stream is exhausted or 
80    /// `stop` returns true for the next token's kind. The stopping token is 
81    /// left unconsumed so it can be examined and bumped by the caller !
82    ///
83    /// `stop` is used by the group parser to halt at `}` - it is a function pointer 
84    /// rather than an `impl Fn` so the recursive calls don't blow up the parser.
85    fn parse_nodes(&mut self, stop: fn(&TokenKind) -> bool) -> Vec<Node> {
86        let mut nodes: Vec<Node> = Vec::new();
87        
88        loop {
89            match self.peek() {
90                None => break,
91                Some(tok) if stop(&tok.kind) => break,
92                _ => {}
93            }
94
95            let tok = self.bump().unwrap();
96
97            match tok.kind {
98                TokenKind::Char(c) => self.push_char(&mut nodes, c, tok.span),
99                TokenKind::Space => self.push_char(&mut nodes, ' ', tok.span),
100
101                TokenKind::ParagraphBreak => {
102                    nodes.push(Node::ParagraphBreak(tok.span));
103                }
104                
105                TokenKind::Comment(body) => {
106                    nodes.push(Node::Comment(body, tok.span));
107                }
108               
109                // begin{name} opens an environment.
110                TokenKind::ControlSeq(ref name) if name == "begin" => {
111                    let env = self.parse_environment(tok.span);
112                    nodes.push(env);
113                }
114
115                // A bare \end outside an environment is a stray closer. :)
116                TokenKind::ControlSeq(ref name) if name == "end" => {
117                    self.errors.push(
118                        Diagnostic::error("E043", "stray '\\end' (no matching '\\begin')")
119                            .with_span(diag_span(tok.span)),
120                    );
121                    // Eat its name arg so we don't cause a slippery slope of errors lol.
122                    let _ = self.parse_args();
123                }
124
125                // `\[` opens display math. 
126                TokenKind::ControlSeq(ref name) if name == "[" => {
127                    let open_span = tok.span;
128                    let children = self.parse_nodes(is_display_math_close);
129                    if matches!(self.peek_kind(), Some(TokenKind::ControlSeq(s)) if s == "]") {
130                        let close = self.bump().unwrap();
131                        nodes.push(Node::DisplayMath(children, open_span.merge(close.span)));
132                    } else {
133                        self.errors.push(
134                            Diagnostic::error("E031", "unclosed '\\[' (display math)")
135                                .with_span(diag_span(open_span)),
136                        );
137                        nodes.push(Node::DisplayMath(children, open_span));
138                    }
139                }
140
141                // A bare `\]` outside display math is a stray closer.
142                TokenKind::ControlSeq(ref name) if name == "]" => {
143                    self.errors.push(
144                        Diagnostic::error("E032", "stray '\\]' (no matching '\\[')")
145                            .with_span(diag_span(tok.span)),
146                    );
147                }
148
149                TokenKind::ControlSeq(name) => {
150                    let cmd_span = tok.span; 
151                    let args = self.parse_args();
152                    // Extend the span to cover the last argument. 
153                    let full_span = args.last()
154                        .and_then(|a| match a {
155                            Arg::Mandatory(children) => children.last().map(|n| n.span()),
156                            Arg::Optional(children) => children.last().map(|n| n.span()), 
157                        })
158                        .map(|s| cmd_span.merge(s))
159                        .unwrap_or(cmd_span);
160                    nodes.push(Node::Command { name, args, span: full_span });
161                }
162
163                TokenKind::BeginGroup => {
164                    let open_span = tok.span;
165                    let children = self.parse_nodes(|k| matches!(k, TokenKind::EndGroup));
166                    if self.peek_kind() == Some(&TokenKind::EndGroup) {
167                        let close = self.bump().unwrap();
168                        nodes.push(Node::Group(children, open_span.merge(close.span)));
169                    } else {
170                        // Unclosed group - record the error, keep what we parsed.
171                        self.errors.push(
172                            Diagnostic::error("E020", "unclosed '{'")
173                                .with_span(diag_span(open_span)),
174                        );
175                        nodes.push(Node::Group(children, open_span));
176                    }
177                }
178                
179                TokenKind::MathShift => {
180                    let open_span = tok.span;
181                    let children = self.parse_nodes(|k| matches!(k, TokenKind::MathShift));
182                    if self.peek_kind() == Some(&TokenKind::MathShift) {
183                        let close = self.bump().unwrap();
184                        nodes.push(Node::Math(children, open_span.merge(close.span)));
185                    } else {
186                        self.errors.push(
187                            Diagnostic::error("E030", "unclosed '$' (math mode)")
188                                .with_span(diag_span(open_span)),
189                        );
190                        nodes.push(Node::Math(children, open_span));
191                    }
192                }
193
194                TokenKind::AlignTab => nodes.push(Node::AlignTab(tok.span)),
195                TokenKind::Tilde => nodes.push(Node::Tilde(tok.span)),
196
197                // Everything else is left unhandled for now so skip it.
198                _ => {}
199            }
200        }
201
202        nodes
203    }
204    /// Consume all immediately following `[...] and `{ ... }` groups as args.
205    ///
206    /// TeX commands pick up their arguments greedily; we skip spaces between
207    /// the command name and each argument to match TeX's behaviour. The loop
208    /// stops at the first token that is neither `[` nor `{`.
209    fn parse_args(&mut self) -> Vec<Arg> {
210        let mut args = Vec::new();
211        
212        loop {
213            // Skip spaces between the command and its next argument.
214            if self.peek_kind() == Some(&TokenKind::Space) {
215                self.bump();
216            }
217
218            match self.peek_kind() {
219                Some(&TokenKind::BeginGroup) => args.push(self.parse_mandatory_arg()),
220                Some(&TokenKind::Char('[')) => args.push(self.parse_optional_arg()),
221                _ => break,
222            }
223        }
224        args
225
226    }    
227
228    fn parse_mandatory_arg(&mut self) -> Arg {
229        // Consume the opening brace, remembering its span for diagnostics.
230        let open_span = self.bump().unwrap().span;
231        let children = self.parse_nodes(|k| matches!(k, TokenKind::EndGroup));
232        if self.peek_kind() == Some(&TokenKind::EndGroup) {
233            self.bump();
234        } else {
235            self.errors.push(
236                Diagnostic::error("E021","unclosed mandatory argument")
237                    .with_span(diag_span(open_span)),
238            );
239        }
240        Arg::Mandatory(children)
241    }
242
243    /// Parse `\begin{name} body \end{name}`. The opening `\begin` token has
244    /// already been consumed; `begin_span` is its span.
245    fn parse_environment(&mut self, begin_span: Span) -> Node {
246        let mut args = self.parse_args();
247
248        // First mandatory arg is the environment name. Without one we
249        // record the error and fall back to a plain cmd so the AST 
250        // still contains atleast something useful
251        let (name_idx, env_name) = match find_env_name(&args) {
252            Some(x) => x,
253            None => {
254                self.errors.push(
255                    Diagnostic::error("E040", "'\\begin' missing environment name")
256                        .with_span(diag_span(begin_span)),
257                );
258                return Node::Command {
259                    name: "begin".to_owned(),
260                    args,
261                    span: begin_span,
262                };
263            }
264        };
265        args.remove(name_idx);
266
267        let body = self.parse_nodes(is_end_control_seq);
268
269        // Try consume the matching \end
270        let close_span = if matches!(self.peek_kind(), Some(TokenKind::ControlSeq(s)) if s == "end") {
271            let end_tok = self.bump().unwrap();
272            let end_args = self.parse_args();
273            let close_name = find_env_name(&end_args).map(|(_, n)| n);
274
275            if close_name.as_deref() != Some(env_name.as_str()) {
276                self.errors.push(
277                    Diagnostic::error("E042", format!(
278                        "'\\end{{{}}}' does not match '\\begin{{{}}}'",
279                        close_name.as_deref().unwrap_or(""), env_name,
280                    ))
281                    .with_span(diag_span(end_tok.span))
282                    .with_note(format!("the matching '\\begin' opened the '{env_name}' environment")),
283                );
284            }
285
286            // Stretch the span to the last argument of \end (if any)
287            end_args.last()
288                .and_then(|a| match a {
289                    Arg::Mandatory(c) | Arg::Optional(c) => c.last().map(|n| n.span()),
290                })
291                .map(|s| end_tok.span.merge(s))
292                .unwrap_or(end_tok.span)
293        } else {
294            self.errors.push(
295                Diagnostic::error("E041", format!("unclosed '\\begin{{{}}}'", env_name))
296                    .with_span(diag_span(begin_span)),
297            );
298            body.last().map(|n| n.span()).unwrap_or(begin_span)
299        };
300
301        Node::Environment {
302            name: env_name, 
303            args,
304            body,
305            span: begin_span.merge(close_span),
306        }
307    }
308
309    fn parse_optional_arg(&mut self) -> Arg {
310        // Consume the opening `[`, remembering its span for diagnostics.
311        let open_span = self.bump().unwrap().span;
312        let children = self.parse_nodes(|k| matches!(k, TokenKind::Char(']')));
313        if self.peek_kind() == Some(&TokenKind::Char(']')) {
314            self.bump();
315        } else {
316            self.errors.push(
317                Diagnostic::error("E022","unclosed optional argument")
318                    .with_span(diag_span(open_span)),
319            );
320        }
321        Arg::Optional(children)
322    }
323    
324    /// Append a character to the last `Text` node, or start a new one.
325    fn push_char(&self, nodes: &mut Vec<Node>, c: char, span: Span) {
326        match nodes.last_mut() {
327            Some(Node::Text(s, existing)) => {
328                s.push(c);
329                *existing = existing.merge(span);
330            }
331            _ => nodes.push(Node::Text(c.to_string(), span)),
332        }
333    }
334}
335