tex_parser/
lib.rs

1// SPDX-License-Identifier: LGPL-2.1-or-later
2// See Notices.txt for copyright information
3use peg::{error::ParseError, str::LineCol};
4
5pub mod ast;
6
7trait IsStartOfLine {
8    fn is_start_of_line(&self, index: usize) -> peg::RuleResult<()>;
9}
10
11impl IsStartOfLine for str {
12    fn is_start_of_line(&self, index: usize) -> peg::RuleResult<()> {
13        if index == 0 {
14            return peg::RuleResult::Matched(index, ());
15        }
16        match self.as_bytes().get(index - 1) {
17            Some(b'\r') | Some(b'\n') => peg::RuleResult::Matched(index, ()),
18            _ => peg::RuleResult::Failed,
19        }
20    }
21}
22
23peg::parser! {
24    // based on:
25    // https://github.com/siefkenj/latex-parser/blob/96f9bfe405008a0fd1da51dbba476e0353675090/src/grammars/latex.pegjs
26    grammar parser() for str {
27        use crate::ast::*;
28
29        pub(crate) rule document() -> Document =
30            content:(token()*) { Document { content } }
31
32        rule token() -> Token =
33            t:special_macro() { Token::SpecialMacro(t) }
34            / t:macro_() { Token::Macro(t) }
35            / t:full_comment() { Token::FullComment(t) }
36            / t:group() { Token::Group(t) }
37            / t:dollar_inline_math() { Token::DollarInlineMath(t) }
38            / t:alignment_tab() { Token::AlignmentTab(t) }
39            / t:par_break() { Token::ParBreak(t) }
40            / t:macro_parameter() { Token::MacroParameter(t) }
41            / t:ignore() { Token::Ignore(t) }
42            / t:number() { Token::Number(t) }
43            / t:whitespace() { Token::Whitespace(t) }
44            / t:punctuation() { Token::Punctuation(t) }
45            / t:char_tokens() { Token::CharTokens(t) }
46            / t:begin_group() { Token::BeginGroup(t) }
47            / t:end_group() { Token::EndGroup(t) }
48            / t:math_shift() { Token::MathShift(t) }
49
50        rule par_break() -> ParBreak =
51            pos:pos() (space()* new_line())*<2,>
52                // Comments eat the whitespace in front of them, so if a
53                // par_break is followed by a comment, we don't want to eat that
54                // whitespace.
55                (space()* !comment_start())?
56                { ParBreak { pos } }
57
58        rule math_token() -> MathToken =
59            t:special_macro() { MathToken::SpecialMacro(t) }
60            / t:macro_() { MathToken::Macro(t) }
61            / t:full_comment() { MathToken::FullComment(t) }
62            / whitespace()* t:math_group() whitespace()* { MathToken::MathGroup(t) }
63            / whitespace()* t:alignment_tab() whitespace()* { MathToken::AlignmentTab(t) }
64            / whitespace()* t:macro_parameter() whitespace()* { MathToken::MacroParameter(t) }
65            / whitespace()* t:superscript() whitespace()* { MathToken::Superscript(t) }
66            / whitespace()* t:subscript() whitespace()* { MathToken::Subscript(t) }
67            / t:ignore() { MathToken::Ignore(t) }
68            / t:whitespace() { MathToken::Whitespace(t) }
69            / t:number() { MathToken::Number(t) }
70            / t:any_char() { MathToken::AnyChar(t) }
71
72        rule char_tokens() -> CharTokens =
73            pos:pos() content:$(char_token()+) {CharTokens { pos, content: content.into() }}
74
75        rule char_token() -> CharToken =
76            pos:pos()
77            !(
78                escape()
79                / comment_start()
80                / begin_group()
81                / end_group()
82                / math_shift()
83                / alignment_tab()
84                / new_line()
85                / macro_parameter()
86                / ignore()
87                / space()
88                / punctuation()
89            )
90            any_char() { CharToken { pos } }
91
92        rule whitespace() -> Whitespace =
93            pos:pos() (
94                new_line() space()*
95                / space()+ new_line() !comment_start() space()* !new_line()
96                / space()+
97            ) { Whitespace { pos } }
98
99        rule number() -> Number =
100            pos:pos() content:$(
101                digit()+ ("." digit()*)?
102                / "." digit()+
103            ) { Number { pos, content: content.into() } }
104
105        rule special_macro() -> SpecialMacro =
106            v:verb() { SpecialMacro::Verb(v) }
107            / v:verbatim_environment() { SpecialMacro::VerbatimEnvironment(v) }
108            / v:display_math() { SpecialMacro::DisplayMath(v) }
109            / v:parenthesized_inline_math() { SpecialMacro::ParenthesizedInlineMath(v) }
110            / v:math_environment() { SpecialMacro::MathEnvironment(v) }
111            / v:environment() { SpecialMacro::Environment(v) }
112
113        rule verb() -> Verb =
114            escape:escape()
115            env:$("verb*" / "verb")
116            delimiter:$([_])
117            content:$(
118                (
119                    ch:$([_])
120                    {?
121                        if ch == delimiter {
122                            Err("")
123                        } else {
124                            Ok(())
125                        }
126                    }
127                )*
128            )
129            [_] {
130                Verb {
131                    escape,
132                    env: env.into(),
133                    delimiter: delimiter.chars().next().unwrap(),
134                    content: content.into(),
135                }
136            }
137
138
139        rule verbatim_environment() -> VerbatimEnvironment =
140            begin:begin_environment()
141            begin_group()
142            name:verbatim_environment_name()
143            end_group()
144            body:$(
145                (
146                    !(
147                        end_environment()
148                        begin_group()
149                        end_name:verbatim_environment_name()
150                        end_group()
151                        {?
152                            if name.kind == end_name.kind {
153                                Ok(())
154                            } else {
155                                Err("")
156                            }
157                        }
158                    )
159                    [_]
160                )*
161            )
162            end:end_environment()
163            begin_group()
164            verbatim_environment_name()
165            end_group()
166            {
167                VerbatimEnvironment {
168                    begin,
169                    name,
170                    body: body.into(),
171                    end,
172                }
173            }
174
175        rule verbatim_environment_name() -> VerbatimEnvironmentName =
176            // standard verbatim enviroments. `verbatim*` must be listed first
177            pos:pos() kind:(
178                "verbatim*" { VerbatimEnvironmentNameKind::VerbatimStar }
179                / "verbatim" { VerbatimEnvironmentNameKind::Verbatim }
180                / "filecontents*" { VerbatimEnvironmentNameKind::FileContentsStar }
181                / "filecontents" { VerbatimEnvironmentNameKind::FileContents }
182                // comment environment provided by \usepackage{verbatim}
183                / "comment" { VerbatimEnvironmentNameKind::Comment }
184                // lstlisting environment provided by \usepackage{listings}
185                / "lstlisting" { VerbatimEnvironmentNameKind::ListListing }
186            ) { VerbatimEnvironmentName { pos, kind } }
187
188        rule display_math() -> DisplayMath =
189            // display math with \[...\]
190            pos:pos()
191            begin_display_math()
192            content:(!end_display_math() t:math_token() { t })*
193            end_display_math() { DisplayMath { pos, content } }
194            // display math with $$...$$
195            / pos:pos()
196            math_shift()
197            math_shift()
198            content:(!(math_shift() math_shift()) t:math_token() { t })*
199            math_shift()
200            math_shift() { DisplayMath { pos, content } }
201
202        rule parenthesized_inline_math() -> ParenthesizedInlineMath =
203            // inline math with \(...\)
204            begin:begin_inline_math()
205            content:(!end_inline_math() t:math_token() { t })*
206            end:end_inline_math() { ParenthesizedInlineMath { begin, content, end } }
207
208        rule dollar_inline_math() -> DollarInlineMath =
209            begin:math_shift()
210            content:(!math_shift() t:math_token() { t })+
211            end:math_shift() { DollarInlineMath { begin, content, end } }
212
213        rule macro_() -> Macro =
214            escape:escape() name:macro_name() { Macro { escape, name } }
215
216        rule macro_name() -> MacroName =
217            pos:pos() content:$(letter()+ / [_]) { MacroName { pos, content: content.into() } }
218
219        rule group() -> Group =
220            begin:begin_group()
221            tokens:(!end_group() t:token() { t })*
222            end:end_group() { Group { begin, tokens, end } }
223
224        rule environment() -> Environment =
225            begin:begin_environment()
226            begin_group()
227            name:char_tokens()
228            end_group()
229            body:(
230                !(
231                    end_environment()
232                    begin_group()
233                    end_name:char_tokens()
234                    end_group()
235                    {?
236                        if name.content == end_name.content {
237                            Ok(())
238                        } else {
239                            Err("")
240                        }
241                    }
242                )
243                t:token() { t }
244            )*
245            end:end_environment()
246            begin_group()
247            char_tokens()
248            end_group() {
249                Environment {
250                    begin,
251                    name,
252                    body,
253                    end,
254                }
255            }
256
257        rule math_environment() -> MathEnvironment =
258            begin:begin_environment()
259            begin_group()
260            name:math_environment_name()
261            end_group()
262            environment_comment:same_line_comment()?
263            body:(
264                !(
265                    end_environment()
266                    begin_group()
267                    end_name:math_environment_name()
268                    end_group()
269                    {?
270                        if name.kind == end_name.kind {
271                            Ok(())
272                        } else {
273                            Err("")
274                        }
275                    }
276                )
277                t:math_token() { t }
278            )*
279            end:end_environment()
280            begin_group()
281            math_environment_name()
282            end_group() {
283                MathEnvironment {
284                    begin,
285                    name,
286                    environment_comment,
287                    body,
288                    end,
289                }
290            }
291
292        // group that assumes you're in math mode.  If you use "\text{}" this isn't a good idea....
293        rule math_group() -> MathGroup =
294            begin:begin_group()
295            tokens:(!end_group() t:math_token() { t })*
296            end:end_group() { MathGroup { begin, tokens, end } }
297
298        rule begin_display_math() -> BeginDisplayMath =
299            escape:escape() "[" { BeginDisplayMath { escape } }
300
301        rule end_display_math() -> EndDisplayMath =
302            escape:escape() "]" { EndDisplayMath { escape } }
303
304        rule begin_inline_math() -> BeginInlineMath =
305            escape:escape() "(" { BeginInlineMath { escape } }
306
307        rule end_inline_math() -> EndInlineMath =
308            escape:escape() ")" { EndInlineMath { escape } }
309
310        rule begin_environment() -> BeginEnvironment =
311            escape:escape() "begin" { BeginEnvironment { escape } }
312
313        rule end_environment() -> EndEnvironment =
314            escape:escape() "end" { EndEnvironment { escape } }
315
316        rule math_environment_name() -> MathEnvironmentName =
317            pos:pos() kind:(
318                "equation*" { MathEnvironmentNameKind::EquationStar }
319                / "equation" { MathEnvironmentNameKind::Equation }
320                / "align*" { MathEnvironmentNameKind::AlignStar }
321                / "align" { MathEnvironmentNameKind::Align }
322                / "alignat*" { MathEnvironmentNameKind::AlignAtStar }
323                / "alignat" { MathEnvironmentNameKind::AlignAt }
324                / "gather*" { MathEnvironmentNameKind::GatherStar }
325                / "gather" { MathEnvironmentNameKind::Gather }
326                / "multline*" { MathEnvironmentNameKind::MultiLineStar }
327                / "multline" { MathEnvironmentNameKind::MultiLine }
328                / "flalign*" { MathEnvironmentNameKind::FlAlignStar }
329                / "flalign" { MathEnvironmentNameKind::FlAlign }
330                / "split" { MathEnvironmentNameKind::Split }
331                / "math" { MathEnvironmentNameKind::Math }
332                / "displaymath" { MathEnvironmentNameKind::DisplayMath }
333            ) { MathEnvironmentName { pos, kind } }
334
335        // catcode 0
336        rule escape() -> Escape =
337            pos:pos() "\\" { Escape { pos } }
338
339        // catcode 1
340        rule begin_group() -> BeginGroup =
341            pos:pos() "{" { BeginGroup { pos } }
342
343        // catcode 2
344        rule end_group() -> EndGroup =
345            pos:pos() "}" { EndGroup { pos } }
346
347        // catcode 3
348        rule math_shift() -> MathShift =
349            pos:pos() "$" { MathShift { pos } }
350
351        // catcode 4
352        rule alignment_tab() -> AlignmentTab =
353            pos:pos() "&" { AlignmentTab { pos } }
354
355        // catcode 5 (linux, os x, windows)
356        rule new_line() -> NewLine =
357            pos:pos() ("\r\n" / ['\r' | '\n']) { NewLine { pos } }
358
359        // catcode 6
360        rule macro_parameter() -> MacroParameter =
361            pos:pos() "#" { MacroParameter { pos } }
362
363        // catcode 7
364        rule superscript() -> Superscript =
365            pos:pos() "^" { Superscript { pos } }
366
367        // catcode 8
368        rule subscript() -> Subscript =
369            pos:pos() "_" { Subscript { pos } }
370
371        // catcode 9
372        rule ignore() -> Ignore =
373            pos:pos() "\0" { Ignore { pos } }
374
375        // catcode 10
376        rule space() -> Space =
377            pos:pos() [' ' | '\t']+ { Space { pos } }
378
379        // catcode 11
380        rule letter() -> AsciiAlphabetic =
381            pos:pos() ['a'..='z' | 'A'..='Z'] { AsciiAlphabetic { pos } }
382
383        // catcode 12 (other)
384        rule digit() -> AsciiDigit =
385            pos:pos() ['0'..='9'] { AsciiDigit { pos } }
386
387        // catcode 12
388        rule punctuation() -> Punctuation =
389            pos:pos() ch:$([
390                '.' | ',' | ';' | ':' | '-' | '*' | '/' | '(' | ')' | '!'
391                | '?' | '=' | '+' | '<' | '>' | '[' | ']'
392            ]) { Punctuation { pos, ch: ch.chars().next().unwrap() } }
393
394        // catcode 14, including the newline
395        rule comment_start() -> CommentStart =
396            pos:pos() "%" { CommentStart { pos } }
397
398        // A comment consumes any whitespace that comes before it.
399        // It can be the only thing on a line, or can come at the end of a line.
400        // A comment will consume the newline that follows it, unless that newline
401        // is part of a par_break.
402        rule full_comment() -> FullComment =
403            c:own_line_comment() { FullComment::OwnLineComment(c) }
404            / c:same_line_comment() { FullComment::SameLineComment(c) }
405
406        // A comment that appears on a line of its own
407        rule own_line_comment() -> OwnLineComment =
408            // `leading_space()` is whitespace that starts at the beginning fo a line.
409            // A comment is `sameline` if it is on the same line as other content.
410            // The existance of leading whitespace for a `sameline == false` comment
411            // isn't important, but we record it anyways.
412            //
413            // We look for `(space() new_line())?` at the start so that we eat excess whitespace that occurs before
414            // a comment on a new line. Otherwise, the newline itself is counted as whitespace. For example:
415            // ```x
416            //    %comment```
417            // would be parsed as "x, <whitespace (from the newline)>, comment". We don't want this. We want
418            // to parse it as "x, comment".
419            pos:pos() (space()* new_line())? leading_space:leading_space() comment:comment() {
420                OwnLineComment {
421                    pos,
422                    leading_space,
423                    comment,
424                }
425            }
426
427        // A comment that appears at the end of a line
428        rule same_line_comment() -> SameLineComment =
429            pos:pos() leading_spaces:space()* comment:comment() {
430                SameLineComment {
431                    pos,
432                    leading_spaces: !leading_spaces.is_empty(),
433                    comment,
434                }
435            }
436
437        rule comment() -> Comment =
438            // A comment normally consumes the next newline and all leading whitespace.
439            // The exception is if the next line consists solely of a comment. In that case,
440            // consume the newline but leave the whitespace (`full_comment` will eat the
441            // leading whitspace)
442            comment_start:comment_start()
443            content:$((!new_line() [_])*)
444            (
445                &par_break() // par_breaks following a comment are preserved
446                // if a comment is not followed by a par_break, the newline is consumed
447                / new_line() space()* !comment_start()
448                / new_line()
449                / ![_]
450            )
451            { Comment { comment_start, content: content.into() } }
452
453        // Whitespace at the start of a line only
454        rule leading_space() -> LeadingSpace =
455            pos:pos()
456            start_of_line()
457            content:$(space()*)
458            { LeadingSpace { pos, empty: content.is_empty() } }
459
460        rule start_of_line() =
461            ##is_start_of_line()
462
463        rule any_char() -> AnyChar =
464            pos:pos() ch:$([_]) { AnyChar { pos, ch: ch.chars().next().unwrap() } }
465
466        rule pos() -> Pos =
467            p:position!() { Pos::new(p) }
468    }
469}
470
471pub fn parse(input: &str) -> Result<ast::Document, ParseError<LineCol>> {
472    parser::document(input)
473}
474
#[cfg(test)]
mod test {
    use crate::parse;
    use serde::Serialize;
    use serde_json::{
        json,
        ser::{PrettyFormatter, Serializer},
    };
    use std::str;

    /// Parses `input` and asserts that the serialized document equals
    /// `expected`.
    ///
    /// The parsed document is also pretty-printed to stdout so that a failing
    /// test shows JSON that can be compared against (or pasted into) the
    /// expectation.
    fn test_parse(input: &str, expected: serde_json::Value) {
        let document = parse(input)
            .unwrap_or_else(|error| panic!("failed to parse {:?}: {}", input, error));
        let actual = serde_json::to_value(&document).unwrap();
        let mut text = Vec::new();
        document
            .serialize(&mut Serializer::with_formatter(
                &mut text,
                PrettyFormatter::with_indent(b"    "),
            ))
            .unwrap();
        println!("{}", str::from_utf8(&text).unwrap());
        // `assert_eq!` prints both values on failure, unlike a bare `assert!`.
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_parse_macro() {
        test_parse(
            r#"\abc"#,
            json! {
                {
                    "content": [
                        {
                            "token_type": "Macro",
                            "escape": {
                                "pos": "@0"
                            },
                            "name": {
                                "pos": "@1",
                                "content": "abc"
                            }
                        }
                    ]
                }
            },
        );
    }

    #[test]
    fn test_parse_environment() {
        // A generic environment nested inside another one; the inner
        // `\end{env2}` must not terminate the outer `env` environment.
        test_parse(
            r#"\begin{env}contents\begin{env2}contents2\end{env2}a\end{env}"#,
            json! {
                {
                    "content": [
                        {
                            "token_type": "SpecialMacro",
                            "special_macro_type": "Environment",
                            "begin": {
                                "escape": {
                                    "pos": "@0"
                                }
                            },
                            "name": {
                                "pos": "@7",
                                "content": "env"
                            },
                            "body": [
                                {
                                    "token_type": "CharTokens",
                                    "pos": "@11",
                                    "content": "contents"
                                },
                                {
                                    "token_type": "SpecialMacro",
                                    "special_macro_type": "Environment",
                                    "begin": {
                                        "escape": {
                                            "pos": "@19"
                                        }
                                    },
                                    "name": {
                                        "pos": "@26",
                                        "content": "env2"
                                    },
                                    "body": [
                                        {
                                            "token_type": "CharTokens",
                                            "pos": "@31",
                                            "content": "contents2"
                                        }
                                    ],
                                    "end": {
                                        "escape": {
                                            "pos": "@40"
                                        }
                                    }
                                },
                                {
                                    "token_type": "CharTokens",
                                    "pos": "@50",
                                    "content": "a"
                                }
                            ],
                            "end": {
                                "escape": {
                                    "pos": "@51"
                                }
                            }
                        }
                    ]
                }
            },
        );
    }
}
587        );
588    }
589}