metamath_rs/
comment_parser.rs

1//! Implements markup parsing for metamath theorem comments.
2//!
3//! The [set.mm](https://github.com/metamath/set.mm/) library contains many
4//! theorems with comments, and those comments contain markup in a special syntax
5//! implemented by [metamath.exe](https://github.com/metamath/metamath-exe),
6//! and specified in Section 4.4.1 of the
7//! [metamath book](http://us.metamath.org/downloads/metamath.pdf).
8//! This module implements a SAX-style parser which yields events about the
9//! beginning and end of each syntax event.
10//!
11//! The [`CommentItem`] type does not contain any byte buffers directly;
12//! instead everything uses spans relative to the segment in which this comment
13//! appeared. Note that for many of the [`Span`]-containing fields the
14//! corresponding byte string in the file has to be unescaped before
15//! interpretation, using the [`CommentParser::unescape_text`] and
16//! [`CommentParser::unescape_math`] functions.
17use std::{fmt::Display, sync::OnceLock};
18
19use regex::bytes::{CaptureMatches, Match, Regex, RegexSet};
20
21use crate::{statement::unescape, Span};
22
23/// A comment markup item, which represents either a piece of text from the input
24/// or some kind of metadata item like the start or end of an italicized group.
25#[derive(Debug, Clone, Copy, PartialEq, Eq)]
26pub enum CommentItem {
27    /// A piece of regular text. The characters in the buffer at the given
28    /// span should be interpreted literally, except for the escapes.
29    /// Use `unescape_text` to strip the text escapes `[`, `~`, `` ` ``, and `_`.
30    /// Note that `[` and `_` can also appear unescaped.
31    Text(Span),
32    /// A paragraph break, caused by two or more consecutive newlines in the input.
33    /// This is a zero-length item (all characters will be present in `Text` nodes
34    /// before and after the element), but corresponds roughly to a `<p>` tag in HTML.
35    LineBreak(usize),
36    /// Start math mode, indicated by a backtick character. The usize points to the character.
37    /// Between [`CommentItem::StartMathMode`] and [`CommentItem::EndMathMode`],
38    /// there will be no comment items other than [`CommentItem::MathToken`].
39    StartMathMode(usize),
40    /// End math mode, indicated by a backtick character. The usize points to the character.
41    /// Between [`CommentItem::StartMathMode`] and [`CommentItem::EndMathMode`],
42    /// there will be no comment items other than [`CommentItem::MathToken`].
43    EndMathMode(usize),
44    /// A single math token. After unescaping this should correspond to a `$c` or `$v` statement
45    /// in the database.
46    /// Use `unescape_math` to strip the escape character `` ` ``.
47    MathToken(Span),
48    /// A label of an existing theorem. The `usize` points to the `~` character.
49    /// Use `unescape_text` to strip the text escapes `[`, `~`, `` ` ``, and `_`.
50    /// Note that `[`, `~`, `_` can also appear unescaped.
51    Label(usize, Span),
52    /// A link to a web site URL. The `usize` points to the `~` character.
53    /// Use `unescape_text` to strip the text escapes `[`, `~`, `` ` ``, and `_`.
54    /// Note that `[`, `~`, `_` can also appear unescaped.
55    Url(usize, Span),
56    /// The `<HTML>` keyword, which starts HTML mode
57    /// (it doesn't actually put `<HTML>` in the output).
58    /// In HTML mode subscripts and italics are disabled, and HTML markup is interpreted.
59    StartHtml(usize),
60    /// The `</HTML>` keyword, which ends HTML mode
61    /// (it doesn't actually put `</HTML>` in the output).
62    EndHtml(usize),
63    /// The start of a subscript `x_0`. The `usize` points at the `_` character.
64    StartSubscript(usize),
65    /// The end of a subscript like `x_0`. The `usize` points just after the end of the word.
66    EndSubscript(usize),
67    /// The start of an italic section `_italic text_`. The `usize` points at the `_` character.
68    StartItalic(usize),
69    /// The end of an italic section `_italic text_`. The `usize` points at the `_` character.
70    EndItalic(usize),
71    /// A bibliographic label `[foo]`. No escapes are needed inside the tag body.
72    BibTag(Span),
73}
74
75/// Returns true if this is a character that is escaped in [`CommentItem::Text`] fields,
76/// meaning that doubled occurrences are turned into single occurrences.
77#[inline]
78#[must_use]
79pub const fn is_text_escape(html_mode: bool, c: u8) -> bool {
80    matches!(c, b'`' | b'[' | b'~') || !html_mode && c == b'_'
81}
82
83/// Returns true if this is a character that is escaped in
84/// [`CommentItem::Label`] and [`CommentItem::Url`] fields,
85/// meaning that doubled occurrences are turned into single occurrences.
86#[inline]
87#[must_use]
88pub const fn is_label_escape(c: u8) -> bool {
89    matches!(c, b'`' | b'[' | b'~')
90}
91
92/// Returns true if this is a character that is escaped in [`CommentItem::MathToken`] fields,
93/// meaning that doubled occurrences are turned into single occurrences.
94#[inline]
95#[must_use]
96pub const fn is_math_escape(c: u8) -> bool {
97    c == b'`'
98}
99
100impl CommentItem {
101    /// Remove text escapes from a markup segment `buf`, generally coming from the
102    /// [`CommentItem::Text`] field.
103    pub fn unescape_text(html_mode: bool, buf: &[u8], out: &mut Vec<u8>) {
104        unescape(buf, out, |c| is_text_escape(html_mode, c))
105    }
106
107    /// Remove text escapes from a markup segment `buf`, generally coming from the
108    /// [`CommentItem::Label`] or [`CommentItem::Url`] fields.
109    pub fn unescape_label(buf: &[u8], out: &mut Vec<u8>) {
110        unescape(buf, out, is_label_escape)
111    }
112
113    /// Remove math escapes from a markup segment `buf`, generally coming from the
114    /// [`CommentItem::MathToken`] field.
115    pub fn unescape_math(buf: &[u8], out: &mut Vec<u8>) {
116        unescape(buf, out, is_math_escape)
117    }
118
119    const fn token(math: bool, span: Span) -> Self {
120        if math {
121            Self::MathToken(span)
122        } else {
123            Self::Text(span)
124        }
125    }
126}
127
128/// An iterator over a metamath text comment, yielding markup items.
129#[derive(Debug, Clone)]
130pub struct CommentParser<'a> {
131    buf: &'a [u8],
132    pos: usize,
133    math_mode: bool,
134    html_mode: bool,
135    item: Option<CommentItem>,
136    end_italic: usize,
137    end_subscript: usize,
138}
139
140const CLOSING_PUNCTUATION: &[u8] = b".,;)?!:]'\"_-";
141
142impl<'a> CommentParser<'a> {
143    /// Construct a new `CommentParser` from a sub-span of a buffer.
144    /// The returned comment items will have spans based on the input span,
145    /// and the portion of the buffer outside `buf[span.start..span.end]` will be ignored.
146    #[must_use]
147    pub fn new(buf: &'a [u8], span: Span) -> Self {
148        Self {
149            buf: &buf[..span.end as usize],
150            pos: span.start as _,
151            math_mode: false,
152            html_mode: false,
153            item: None,
154            end_italic: usize::MAX,
155            end_subscript: 0,
156        }
157    }
158
159    /// Remove text escapes from a markup segment `span`, generally coming from the
160    /// [`CommentItem::Text`] field.
161    pub fn unescape_text(&self, span: Span, out: &mut Vec<u8>) {
162        CommentItem::unescape_text(self.html_mode, span.as_ref(self.buf), out)
163    }
164
165    /// Remove text escapes from a markup segment `span`, generally coming from the
166    /// [`CommentItem::Label`] or [`CommentItem::Url`] fields.
167    pub fn unescape_label(&self, span: Span, out: &mut Vec<u8>) {
168        CommentItem::unescape_label(span.as_ref(self.buf), out)
169    }
170
171    /// Remove math escapes from a markup segment `span`, generally coming from the
172    /// [`CommentItem::MathToken`] field.
173    pub fn unescape_math(&self, span: Span, out: &mut Vec<u8>) {
174        CommentItem::unescape_math(span.as_ref(self.buf), out)
175    }
176
177    fn parse_bib(&self) -> Option<Span> {
178        let start = self.pos + 1;
179        let mut end = start;
180        while let Some(&c) = self.buf.get(end) {
181            match c {
182                b']' => return Some(Span::new(start, end)),
183                b'`' if self.buf.get(end + 1) == Some(&c) => end += 2,
184                b'`' => break,
185                _ if c.is_ascii_whitespace() => break,
186                _ => end += 1,
187            }
188        }
189        None
190    }
191
192    fn parse_subscript(&self) -> Option<usize> {
193        let c = self.buf.get(self.pos + 1)?;
194        if c.is_ascii_whitespace() || CLOSING_PUNCTUATION.contains(c) {
195            return None;
196        }
197        let start = self.pos + 1;
198        let mut end = start;
199        while let Some(c) = self.buf.get(end) {
200            if c.is_ascii_whitespace() || CLOSING_PUNCTUATION.contains(c) {
201                break;
202            }
203            end += 1;
204        }
205        Some(end)
206    }
207
208    fn parse_italic(&self) -> Option<usize> {
209        if !self.buf.get(self.pos + 1)?.is_ascii_alphanumeric() {
210            return None;
211        }
212        let mut end = self.pos + 2;
213        loop {
214            if *self.buf.get(end)? == b'_' {
215                if self.buf.get(end + 1) == Some(&b'_') {
216                    end += 2
217                } else {
218                    break;
219                }
220            } else {
221                end += 1
222            }
223        }
224        if !self.buf[end - 1].is_ascii_alphanumeric()
225            || matches!(self.buf.get(end + 1), Some(c) if c.is_ascii_alphanumeric())
226        {
227            return None;
228        }
229        Some(end)
230    }
231
232    fn parse_underscore(&mut self) -> Option<(usize, CommentItem)> {
233        const OPENING_PUNCTUATION: &[u8] = b"(['\"";
234        let start = self.pos;
235        let is_italic = self.pos == self.end_subscript
236            || (self.pos.checked_sub(1).and_then(|pos| self.buf.get(pos))).map_or(true, |c| {
237                c.is_ascii_whitespace() || OPENING_PUNCTUATION.contains(c)
238            });
239        let item = if is_italic {
240            let it_end = self.parse_italic()?;
241            self.pos += 1;
242            self.end_italic = it_end;
243            CommentItem::StartItalic(start)
244        } else {
245            let sub_end = self.parse_subscript()?;
246            self.pos += 1;
247            self.end_subscript = sub_end;
248            CommentItem::StartSubscript(start)
249        };
250        Some((start, item))
251    }
252
253    fn parse_html(&mut self) -> Option<(usize, CommentItem)> {
254        if self.html_mode {
255            if self.buf[self.pos..].starts_with(b"</HTML>") {
256                self.html_mode = false;
257                let start = self.pos;
258                self.pos += 7;
259                Some((start, CommentItem::EndHtml(start)))
260            } else {
261                None
262            }
263        } else if self.buf[self.pos..].starts_with(b"<HTML>") {
264            self.html_mode = true;
265            let start = self.pos;
266            self.pos += 6;
267            Some((start, CommentItem::StartHtml(start)))
268        } else {
269            None
270        }
271    }
272
273    fn parse_math_delim(&mut self, at: usize) -> CommentItem {
274        if self.math_mode {
275            self.math_mode = false;
276            CommentItem::EndMathMode(at)
277        } else {
278            self.math_mode = true;
279            CommentItem::StartMathMode(at)
280        }
281    }
282
283    fn skip_whitespace(&mut self) {
284        while matches!(self.buf.get(self.pos), Some(c) if c.is_ascii_whitespace()) {
285            self.pos += 1;
286        }
287    }
288
289    fn parse_newline(&self, last_nl: Option<usize>) -> Option<()> {
290        self.buf[last_nl? + 1..self.pos]
291            .iter()
292            .all(u8::is_ascii_whitespace)
293            .then_some(())
294    }
295
296    fn parse_label(&mut self) -> CommentItem {
297        let tilde = self.pos;
298        self.pos += 1;
299        while matches!(self.buf.get(self.pos), Some(c) if c.is_ascii_whitespace()) {
300            self.pos += 1;
301        }
302        let label_start = self.pos;
303        while let Some(&c) = self.buf.get(self.pos) {
304            match c {
305                b'[' | b'`' if self.buf.get(self.pos + 1) == Some(&c) => self.pos += 2,
306                b' ' | b'\r' | b'\n' | b'`' => break,
307                b'[' if self.parse_bib().is_some() => break,
308                b'<' if self.buf[self.pos..].starts_with(b"<HTML>")
309                    || self.buf[self.pos..].starts_with(b"</HTML>") =>
310                {
311                    break
312                }
313                _ => self.pos += 1,
314            }
315        }
316        let label = &self.buf[label_start..self.pos];
317        if label.starts_with(b"http://")
318            || label.starts_with(b"https://")
319            || label.starts_with(b"mm")
320        {
321            CommentItem::Url(tilde, Span::new(label_start, self.pos))
322        } else {
323            CommentItem::Label(tilde, Span::new(label_start, self.pos))
324        }
325    }
326
327    fn trim_space_before_open(&self, lo: usize, pos: usize) -> bool {
328        lo + 2 <= pos
329            && self.buf[pos - 1] == b' '
330            && (b"([".contains(&self.buf[pos - 2])
331                || self.buf[pos - 2] == b'\"'
332                    && pos
333                        .checked_sub(3)
334                        .and_then(|i| self.buf.get(i))
335                        .map_or(true, u8::is_ascii_whitespace))
336    }
337
338    fn trim_space_after_close(&self) -> bool {
339        self.buf.get(self.pos) == Some(&b' ')
340            && self
341                .buf
342                .get(self.pos + 1)
343                .map_or(false, |c| CLOSING_PUNCTUATION.contains(c))
344    }
345}
346
347impl<'a> Iterator for CommentParser<'a> {
348    type Item = CommentItem;
349
350    #[allow(clippy::cognitive_complexity)]
351    fn next(&mut self) -> Option<Self::Item> {
352        if let Some(item) = self.item.take() {
353            return Some(item);
354        }
355        let math_token = self.math_mode;
356        if math_token {
357            self.skip_whitespace()
358        }
359        let start = self.pos;
360        let mut end = self.buf.len();
361        let mut last_nl = None;
362        while let Some(&c) = self.buf.get(self.pos) {
363            if c == b'`' {
364                if self.buf.get(self.pos + 1) == Some(&b'`') {
365                    self.pos += 2;
366                } else {
367                    end = self.pos;
368                    self.pos += 1;
369                    self.item = Some(self.parse_math_delim(end));
370                    if self.math_mode && self.trim_space_before_open(start, end) {
371                        // trim a single space if we are opening math mode after opening punct
372                        end -= 1;
373                    } else if !self.math_mode && self.trim_space_after_close() {
374                        // trim a single space if we are exiting math mode before closing punct
375                        self.pos += 1;
376                    }
377                    break;
378                }
379            } else if math_token {
380                if matches!(self.buf.get(self.pos), Some(c) if !c.is_ascii_whitespace()) {
381                    self.pos += 1;
382                } else if self.pos > start {
383                    return Some(CommentItem::MathToken(Span::new(start, self.pos)));
384                } else {
385                    return None;
386                }
387            } else if c == b'<' {
388                if let Some((start, item)) = self.parse_html() {
389                    end = start;
390                    self.item = Some(item);
391                    break;
392                }
393                self.pos += 1;
394            } else if c == b'~' {
395                if self.buf.get(self.pos + 1) == Some(&b'~') {
396                    self.pos += 2;
397                } else {
398                    end = self.pos;
399                    if self.trim_space_before_open(start, end) {
400                        // trim a single space if we have a label after opening punct
401                        end -= 1;
402                    }
403                    self.item = Some(self.parse_label());
404                    if self.trim_space_after_close() {
405                        // trim a single space if we have a label before closing punct
406                        self.pos += 1;
407                    }
408                    break;
409                }
410            } else if c == b'[' {
411                if self.buf.get(self.pos + 1) == Some(&b'[') {
412                    self.pos += 2;
413                } else if let Some(span) = self.parse_bib() {
414                    end = self.pos;
415                    self.pos = span.end as usize + 1;
416                    self.item = Some(CommentItem::BibTag(span));
417                    break;
418                } else {
419                    self.pos += 1;
420                }
421            } else if self.html_mode {
422                self.pos += 1;
423            } else if c == b'\n' {
424                if self.parse_newline(last_nl.replace(self.pos)).is_some() {
425                    end = self.pos;
426                    self.item = Some(CommentItem::LineBreak(end));
427                    break;
428                }
429                self.pos += 1;
430            } else if c == b'_' {
431                if self.buf.get(self.pos + 1) == Some(&b'_') {
432                    self.pos += 2;
433                } else if self.end_italic == self.pos {
434                    end = self.pos;
435                    self.pos += 1;
436                    self.item = Some(CommentItem::EndItalic(end));
437                    break;
438                } else if let Some((pos, item)) = self.parse_underscore() {
439                    end = pos;
440                    self.item = Some(item);
441                    break;
442                } else {
443                    self.pos += 1;
444                }
445            } else {
446                self.pos += 1;
447            }
448            if self.pos == self.end_subscript {
449                end = self.pos;
450                self.item = Some(CommentItem::EndSubscript(end));
451                break;
452            }
453        }
454        if end > start {
455            return Some(CommentItem::token(math_token, Span::new(start, end)));
456        }
457        self.item.take()
458    }
459}
460
461/// Discouragement information about a theorem.
462#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
463pub struct Discouragements {
464    /// Is proof modification discouraged for this theorem?
465    pub modification_discouraged: bool,
466    /// Is new usage discouraged for this theorem?
467    pub usage_discouraged: bool,
468}
469
470impl Discouragements {
471    /// Parse the text of a statement's comment to determine whether proof usage or modification
472    /// is discouraged.
473    #[must_use]
474    pub fn new(buf: &[u8]) -> Self {
475        static MODIFICATION: OnceLock<RegexSet> = OnceLock::new();
476        let modification = MODIFICATION.get_or_init(|| {
477            RegexSet::new([
478                r"\(Proof[ \r\n]+modification[ \r\n]+is[ \r\n]+discouraged\.\)",
479                r"\(New[ \r\n]+usage[ \r\n]+is[ \r\n]+discouraged\.\)",
480            ])
481            .unwrap()
482        });
483        let m = modification.matches(buf);
484        Self {
485            modification_discouraged: m.matched(0),
486            usage_discouraged: m.matched(1),
487        }
488    }
489}
490
491/// Information about "parentheticals" in the comment.
492#[derive(Debug, Clone, Copy, PartialEq, Eq)]
493pub enum Parenthetical {
494    /// A comment like `(Contributed by Foo Bar, 12-Mar-2020.)`.
495    ContributedBy {
496        /// The span of the author in the parenthetical
497        author: Span,
498        /// The date, in the form `DD-MMM-YYYY`.
499        /// To parse this further into a date, use the [`Date`] type's [`TryFrom`] impl.
500        date: Span,
501    },
502    /// A comment like `(Revised by Foo Bar, 12-Mar-2020.)`.
503    RevisedBy {
504        /// The span of the author in the parenthetical
505        author: Span,
506        /// The date, in the form `DD-MMM-YYYY`.
507        /// To parse this further into a date, use the [`Date`] type's [`TryFrom`] impl.
508        date: Span,
509    },
510    /// A comment like `(Proof shortened by Foo Bar, 12-Mar-2020.)`.
511    ProofShortenedBy {
512        /// The span of the author in the parenthetical
513        author: Span,
514        /// The date, in the form `DD-MMM-YYYY`.
515        /// To parse this further into a date, use the [`Date`] type's [`TryFrom`] impl.
516        date: Span,
517    },
518    /// The `(Proof modification is discouraged.)` comment
519    ProofModificationDiscouraged,
520    /// The `(New usage is discouraged.)` comment
521    NewUsageDiscouraged,
522}
523
524/// An iterator over the parentheticals in a comment.
525#[derive(Debug)]
526pub struct ParentheticalIter<'a> {
527    matches: CaptureMatches<'static, 'a>,
528    off: u32,
529}
530
531impl<'a> ParentheticalIter<'a> {
532    /// Construct a new parenthetical iterator given a segment buffer and a span in it.
533    #[must_use]
534    pub fn new(buf: &'a [u8], span: Span) -> Self {
535        static PARENTHETICALS: OnceLock<Regex> = OnceLock::new();
536        let parentheticals = PARENTHETICALS.get_or_init(|| {
537            Regex::new(concat!(
538                r"\((Contributed|Revised|Proof[ \r\n]+shortened)",
539                r"[ \r\n]+by[ \r\n]+([^,)]+),[ \r\n]+([0-9]{1,2}-[A-Z][a-z]{2}-[0-9]{4})\.\)|",
540                r"\((Proof[ \r\n]+modification|New[ \r\n]+usage)[ \r\n]+is[ \r\n]+discouraged\.\)",
541            ))
542            .unwrap()
543        });
544        Self {
545            matches: parentheticals.captures_iter(span.as_ref(buf)),
546            off: span.start,
547        }
548    }
549
550    fn to_span(&self, m: Match<'_>) -> Span {
551        Span {
552            start: self.off + m.start() as u32,
553            end: self.off + m.end() as u32,
554        }
555    }
556}
557
558impl<'a> Iterator for ParentheticalIter<'a> {
559    type Item = (Span, Parenthetical);
560
561    fn next(&mut self) -> Option<Self::Item> {
562        let groups = self.matches.next()?;
563        let all = groups.get(0).unwrap();
564        let item = if let Some(m) = groups.get(1) {
565            let author = self.to_span(groups.get(2).unwrap());
566            let date = self.to_span(groups.get(3).unwrap());
567            match m.as_bytes()[0] {
568                b'C' => Parenthetical::ContributedBy { author, date },
569                b'R' => Parenthetical::RevisedBy { author, date },
570                b'P' => Parenthetical::ProofShortenedBy { author, date },
571                _ => unreachable!(),
572            }
573        } else {
574            match all.as_bytes()[1] {
575                b'P' => Parenthetical::ProofModificationDiscouraged,
576                b'N' => Parenthetical::NewUsageDiscouraged,
577                _ => unreachable!(),
578            }
579        };
580        Some((self.to_span(all), item))
581    }
582}
583
584/// A date, as understood by metamath tools.
585/// This is just a `dd-mmm-yyyy` field after parsing,
586/// so it has weak calendrical restrictions.
587#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)]
588pub struct Date {
589    /// A year, which must be a 4 digit number (0-9999).
590    pub year: u16,
591    /// A month, parsed from three letter names: `Jan`, `Feb`, etc. (1-12)
592    pub month: u8,
593    /// A day, parsed from a 1 or 2 digit number (0-99).
594    pub day: u8,
595}
596
597impl Display for Date {
598    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
599        const MONTHS: [&str; 12] = [
600            "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
601        ];
602        write!(
603            f,
604            "{day}-{month}-{year:04}",
605            day = self.day,
606            month = MONTHS[(self.month - 1) as usize],
607            year = self.year
608        )
609    }
610}
611
612impl TryFrom<&[u8]> for Date {
613    type Error = ();
614
615    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
616        let (day, month, year) = match value.len() {
617            10 => (&value[..1], &value[2..5], &value[6..]),
618            11 => (&value[..2], &value[3..6], &value[7..]),
619            _ => return Err(()),
620        };
621        Ok(Date {
622            year: std::str::from_utf8(year)
623                .map_err(|_| ())?
624                .parse()
625                .map_err(|_| ())?,
626            month: match month {
627                b"Jan" => 1,
628                b"Feb" => 2,
629                b"Mar" => 3,
630                b"Apr" => 4,
631                b"May" => 5,
632                b"Jun" => 6,
633                b"Jul" => 7,
634                b"Aug" => 8,
635                b"Sep" => 9,
636                b"Oct" => 10,
637                b"Nov" => 11,
638                b"Dec" => 12,
639                _ => return Err(()),
640            },
641            day: std::str::from_utf8(day)
642                .map_err(|_| ())?
643                .parse()
644                .map_err(|_| ())?,
645        })
646    }
647}