metamath_rs/
parser.rs

//! Implementation of the low-level statement parser for Metamath databases.
//!
//! The parser identifies the boundaries between statements, extracts their math
//! strings and proofs, does basic validity checking within statements, and
//! extracts a list of definitions which can be indexed by nameck.  This module
//! also defines the core datatypes used to represent source positions and
//! parsed statements, which are used by other analysis passes.
//!
//! Analysis passes are not a stable interface; use `Database` instead.
//!
//! The input of the parser is a byte string, which will typically consist of a
//! file except when the preparser is used; see the `database` module
//! documentation.  The output is one or more segments; the parser is
//! responsible for detecting includes and splitting statements appropriately,
//! although responsibility for following includes rests on the `segment_set`.
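//!
//! A minimal sketch of driving this stage directly (illustrative only; real
//! callers should go through `Database`, and the exact buffer setup is an
//! assumption here):
//!
//! ```rust,ignore
//! use std::sync::Arc;
//!
//! // The buffer is an Arc'd byte string holding the database source.
//! let buf = Arc::new(b"$c wff $.\n".to_vec());
//! let segments = parse_segments(&buf);
//! // A buffer without includes parses into a single segment.
//! assert_eq!(segments.len(), 1);
//! ```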

use crate::diag::{Diagnostic, MarkupKind};
use crate::segment::{BufferRef, Segment};
use crate::segment_set::SegmentSet;
use crate::statement::{
    Command, CommandToken, FilePos, FloatDef, GlobalDv, GlobalSpan, HeadingDef, LabelDef,
    LocalVarDef, Span, Statement, StatementAddress, StatementIndex, SymbolDef, SymbolType, Token,
    TokenIndex, TokenPtr, NO_STATEMENT,
};
use crate::typesetting::TypesettingData;
use crate::StatementType::{self, *};
use regex::bytes::Regex;
use std::cmp;
use std::collections::hash_map::Entry;
use std::fmt::Debug;
use std::mem;
use std::str;
use std::sync::{Arc, OnceLock};

/// State used by the scanning process
#[derive(Default)]
struct Scanner<'a> {
    /// Text being parsed.
    buffer: &'a [u8],
    /// Arc of text which can be linked into new Segments.
    buffer_ref: BufferRef,
    /// Current parsing position; will generally point immediately after a
    /// token, at whitespace.
    position: FilePos,
    /// Accumulated errors for this segment.  Reset on new segment.
    diagnostics: Vec<(StatementIndex, Diagnostic)>,
    /// Accumulated spans for this segment.
    span_pool: Vec<Span>,
    /// A span which was encountered, but needs to be processed again.
    ///
    /// This is used for error recovery: if you leave off the `$.` which ends a
    /// math string, the parser will encounter the next keyword and needs to
    /// save that token so it can be processed as a keyword.
    unget: Span,
    /// Labels accumulated for the current statement.
    labels: Vec<Span>,
    /// End of the previous statement
    statement_start: FilePos,
    /// Current write index into statement list
    statement_index: StatementIndex,
    /// Write indexes into span pools
    statement_math_start: usize,
    statement_proof_start: usize,
    /// Flag which can be set at any time during statement parse to cause the
    /// type to be overwritten with `Invalid`, if the statement would otherwise
    /// be too broken to continue with.
    invalidated: bool,
}

/// Bitmask of allowed whitespace characters.
///
/// A Metamath database is required to consist of graphic characters, SP, HT,
/// NL, FF, and CR.
const MM_VALID_SPACES: u64 =
    (1u64 << 9) | (1u64 << 10) | (1u64 << 12) | (1u64 << 13) | (1u64 << 32);

/// Check if a character which is known to be <= 32 is a valid Metamath
/// whitespace.  May panic if out of range.
const fn is_mm_space_c0(byte: u8) -> bool {
    (MM_VALID_SPACES & (1u64 << byte)) != 0
}

/// Check if a character is valid Metamath whitespace.
///
/// We generally accept any C0 control as whitespace, with a diagnostic; this
/// function only tests for fully legal whitespace though.
const fn is_mm_space(byte: u8) -> bool {
    byte <= 32 && is_mm_space_c0(byte)
}
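
// Compile-time sanity checks (an illustrative sketch) of the whitespace
// predicates above.
const _: () = {
    assert!(is_mm_space(b' '));
    assert!(is_mm_space(b'\t'));
    assert!(is_mm_space(b'\n'));
    assert!(is_mm_space(0x0C)); // FF
    assert!(is_mm_space(b'\r'));
    assert!(!is_mm_space(0x00)); // other C0 controls are only tolerated with a diagnostic
    assert!(!is_mm_space(b'$')); // printable characters are never whitespace
};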

#[derive(Default, Debug, Eq, PartialEq, Ord, PartialOrd, Copy, Clone)]
/// The different types of heading markers, as defined in the Metamath book, section 4.4.1
pub enum HeadingLevel {
    /// Virtual top-level heading, used as a root node
    #[default]
    Database,
    /// Major part
    MajorPart,
    /// Section
    Section,
    /// Subsection
    SubSection,
    /// Subsubsection
    SubSubSection,
    /// Statement
    Statement,
}

#[derive(Eq, PartialEq, Copy, Clone)]
enum CommentType {
    Normal,
    Typesetting,
    Extra,
    Heading(HeadingLevel),
}

impl<'a> Scanner<'a> {
    /// Record a diagnostic against the nascent statement
    fn diag(&mut self, diag: Diagnostic) {
        self.diagnostics.push((self.statement_index, diag));
    }

    /// Get a single whitespace-delimited token from the source text without
    /// checking for comments or the unget area.
    ///
    /// This function accepts any C0 character as whitespace, with a diagnostic.
    /// DEL and non-7bit characters invalidate the current token and cause it to
    /// be omitted from the returned string.  `Span::NULL` is returned when
    /// the end of the buffer is reached.
    ///
    /// This is _very_ hot, mostly due to the unpredictable branches in the
    /// whitespace testing.  It might be possible to write a version which is
    /// branch-free in typical cases, but it would be extremely fiddly and has
    /// not been attempted.
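    ///
    /// A sketch of the behaviour (illustrative only, not a doctest; `Scanner`
    /// is private):
    ///
    /// ```text
    /// buffer: "$c wff $."
    /// get_raw() -> Span(0, 2)   // "$c"
    /// get_raw() -> Span(3, 6)   // "wff"
    /// get_raw() -> Span(7, 9)   // "$."
    /// get_raw() -> Span::NULL   // end of buffer
    /// ```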
    fn get_raw(&mut self) -> Span {
        #[inline(never)]
        #[cold]
        fn badchar(slf: &mut Scanner<'_>, ix: usize) -> Span {
            let ch = slf.buffer[ix];
            slf.diag(Diagnostic::BadCharacter(ix, ch));
            // Restart the function from the beginning to reload self.buffer;
            // doing it this way lets it be kept in a register in the common
            // case
            slf.get_raw()
        }

        let len = self.buffer.len();
        let mut ix = self.position as usize;

        while ix < len && self.buffer[ix] <= 32 {
            // For the purpose of error recovery, we consider C0 control
            // characters to be whitespace (following SMM2)
            if !is_mm_space_c0(self.buffer[ix]) {
                self.position = (ix + 1) as FilePos;
                return badchar(self, ix);
            }
            ix += 1;
        }

        let start = ix;
        while ix < len {
            if self.buffer[ix] <= 32 || self.buffer[ix] > 126 {
                if self.buffer[ix] <= 32 {
                    break;
                }
                // DEL or C1 control or non-ASCII bytes (presumably UTF-8)
                let badix = ix;
                // skip this "token"
                ix += 1;
                while ix < len && !is_mm_space(self.buffer[ix]) {
                    ix += 1;
                }
                self.position = ix as FilePos;
                return badchar(self, badix);
            }

            ix += 1;
        }

        self.position = ix as FilePos;
        if start == ix {
            Span::NULL
        } else {
            Span::new(start, ix)
        }
    }

    /// Assuming that a `$(` token has just been read, read and skip a comment.
    ///
    /// If the comment appears to be special, notice that.  This currently
    /// detects `$j`, `$t` comments, and outline comments.
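    ///
    /// Heading (outline) comments are recognized by the first token after
    /// `$(`, e.g. (a sketch of the markers checked below):
    ///
    /// ```text
    /// $( #### ...  -> HeadingLevel::MajorPart
    /// $( #*#* ...  -> HeadingLevel::Section
    /// $( =-=- ...  -> HeadingLevel::SubSection
    /// $( -.-. ...  -> HeadingLevel::SubSubSection
    /// ```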
    fn get_comment(&mut self, opener: Span, mid_statement: bool) -> CommentType {
        let mut ctype = CommentType::Normal;
        let mut first = true;
        loop {
            let tok = self.get_raw();
            if tok.is_null() {
                break;
            }
            let tok_ref = tok.as_ref(self.buffer);
            if tok_ref == b"$)" {
                return ctype;
            } else if tok_ref == b"$j" || tok_ref == b"$t" {
                if !first {
                    self.diag(Diagnostic::CommentMarkerNotStart(tok))
                } else if mid_statement {
                    self.diag(Diagnostic::MidStatementCommentMarker(tok))
                } else {
                    ctype = if tok_ref == b"$j" {
                        CommentType::Extra
                    } else {
                        CommentType::Typesetting
                    }
                }
            } else if tok_ref.contains(&b'$') {
                let tok_str = str::from_utf8(tok_ref).unwrap();
                if tok_str.contains("$)") {
                    self.diag(Diagnostic::BadCommentEnd(tok, opener));
                }
                if tok_str.contains("$(") {
                    self.diag(Diagnostic::NestedComment(tok, opener));
                }
            } else if first && tok_ref.len() >= 4 {
                if tok_ref[..4] == *b"####" {
                    ctype = CommentType::Heading(HeadingLevel::MajorPart);
                } else if tok_ref[..4] == *b"#*#*" {
                    ctype = CommentType::Heading(HeadingLevel::Section);
                } else if tok_ref[..4] == *b"=-=-" {
                    ctype = CommentType::Heading(HeadingLevel::SubSection);
                } else if tok_ref[..4] == *b"-.-." {
                    ctype = CommentType::Heading(HeadingLevel::SubSubSection);
                }
            }
            first = false;
        }

        let cspan = Span::new2(opener.start, self.buffer.len() as FilePos);
        self.diag(Diagnostic::UnclosedComment(cspan));
        ctype
    }

    /// Fetches a single normal token from the buffer, skipping over comments
    /// and handling unget.
    fn get(&mut self) -> Span {
        if !self.unget.is_null() {
            return mem::replace(&mut self.unget, Span::NULL);
        }

        loop {
            let tok = self.get_raw();
            if tok.is_null() {
                return Span::NULL;
            }
            let tok_ref = tok.as_ref(self.buffer);
            if tok_ref == b"$(" {
                self.get_comment(tok, true);
            } else {
                return tok;
            }
        }
    }

    /// This is where statements are constructed, factored out between comment
    /// statements and non-comment.
    fn out_statement(&mut self, stype: StatementType, label: Span) -> Statement {
        Statement {
            stype,
            label,
            math_start: self.statement_math_start,
            proof_start: self.statement_proof_start,
            proof_end: self.span_pool.len(),
            group: NO_STATEMENT,
            group_end: NO_STATEMENT,
            span: Span::new2(
                mem::replace(&mut self.statement_start, self.position),
                self.position,
            ),
        }
    }

    /// Check for and parse a comment statement at the current position.
    fn get_comment_statement(&mut self) -> Option<Statement> {
        let ftok = if self.unget.is_null() {
            self.get_raw()
        } else {
            mem::replace(&mut self.unget, Span::NULL)
        };
        if ftok != Span::NULL {
            let ftok_ref = ftok.as_ref(self.buffer);
            if ftok_ref == b"$(" {
                let ctype = self.get_comment(ftok, false);
                let s_type = match ctype {
                    CommentType::Typesetting => TypesettingComment,
                    CommentType::Heading(level) => HeadingComment(level),
                    CommentType::Extra => AdditionalInfoComment,
                    CommentType::Normal => Comment,
                };
                return Some(self.out_statement(s_type, Span::new2(ftok.start, ftok.start)));
            }
            self.unget = ftok;
        }
        None
    }

    /// Reads zero or more labels from the input stream.
    ///
    /// We haven't seen the statement-starting keyword yet, so we don't know if
    /// labels are actually expected; just read, validate and store them.
    fn read_labels(&mut self) {
        self.labels.clear();
        loop {
            let ltok = self.get();
            if ltok.is_null() {
                break;
            }
            let lref = ltok.as_ref(self.buffer);
            if lref.contains(&b'$') {
                self.unget = ltok;
                break;
            } else if is_valid_label(lref) {
                self.labels.push(ltok);
            } else {
                self.diag(Diagnostic::BadLabel(ltok));
            }
        }
    }

    /// We now know we shouldn't have labels.  Issue errors if we do anyway.
    fn get_no_label(&mut self, kwtok: Span) -> Span {
        // none of these are invalidations...
        for &lspan in &self.labels {
            self.diagnostics
                .push((self.statement_index, Diagnostic::SpuriousLabel(lspan)));
        }

        // If this is a valid no-label statement, kwtok will have the keyword.
        // Otherwise it may be Span(0,0) = null, in which case we also return Span(0,0).
        Span::new2(kwtok.start, kwtok.start)
    }

    /// We now know we need exactly one label.  Error and invalidate if we don't
    /// have it.
    fn get_label(&mut self) -> Span {
        match self.labels.len() {
            1 => self.labels[0],
            0 => {
                self.diag(Diagnostic::MissingLabel);
                self.invalidated = true;
                Span::NULL
            }
            _ => {
                for &addl in self.labels.iter().skip(1) {
                    self.diagnostics.push((
                        self.statement_index,
                        Diagnostic::RepeatedLabel(addl, self.labels[0]),
                    ));
                }
                // have to invalidate because we don't know which to use
                self.invalidated = true;
                Span::NULL
            }
        }
    }

    /// Handles parsing for math and proof strings.
    ///
    /// Set `is_proof` to parse a proof, else we are parsing a math string.
    /// Returns true if there is a proof following and generates diagnostics as
    /// appropriate depending on `expect_proof`.
    ///
    /// Directly populates the span pool; record the span pool offset before and
    /// after calling to get at the string.
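    ///
    /// For example (a sketch), when parsing the fragment
    /// `wi $a wff ( ph -> ps ) $.` the tokens `wff ( ph -> ps )` are pushed
    /// onto the span pool and the function returns `false` (no proof follows).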
    fn get_string(&mut self, expect_proof: bool, is_proof: bool) -> bool {
        loop {
            let tokn = self.get();
            if tokn.is_null() {
                break;
            }
            let toknref = tokn.as_ref(self.buffer);
            if toknref.contains(&b'$') {
                if toknref == b"$." {
                    // string is closed with no proof
                    if expect_proof {
                        self.diag(Diagnostic::MissingProof(tokn));
                    }
                    return false;
                } else if toknref == b"$=" && !is_proof {
                    // string is closed with a proof
                    if !expect_proof {
                        self.diag(Diagnostic::SpuriousProof(tokn));
                    }
                    return true;
                }
                // string is closed with no proof and with an error, whoops
                self.unget = tokn;
                break;
            }
            self.span_pool.push(tokn);
        }

        self.diag(if is_proof {
            Diagnostic::UnclosedProof
        } else {
            Diagnostic::UnclosedMath
        });
        false
    }

    /// Parses math and proof strings for the current statement and records the
    /// offsets appropriately.
    fn get_strings(&mut self, want_proof: bool) {
        let has_proof = self.get_string(want_proof, false);
        self.statement_proof_start = self.span_pool.len();

        if self.statement_proof_start == self.statement_math_start {
            self.invalidated = true;
            // this is invalid for all types of math string
            self.diag(Diagnostic::EmptyMathString);
        }

        if has_proof {
            // diagnostic already generated if unwanted, but we still need to
            // eat the proof
            self.get_string(false, true);
        } else {
            // diagnostic already generated if unwanted.  this is NOT an
            // invalidation as $p statements don't need proofs (I mean you
            // should have a ? but we know what you mean)
        }
    }

    /// When we see an invalid statement keyword, eat tokens until we see
    /// another keyword or a statement end marker.
    fn eat_invalid(&mut self) {
        loop {
            let tok = self.get();
            if tok.is_null() {
                break;
            }
            let tref = tok.as_ref(self.buffer);
            if tref == b"$." {
                // we're probably synchronized
                break;
            } else if tref == b"$=" {
                // this is definitely not it
            } else if tref.contains(&b'$') {
                // might be the start of the next statement
                self.unget = tok;
                break;
            }
        }
    }

    /// Handles parsing the filename after a `$[` keyword has been seen.
    ///
    /// Per the spec, filenames are restricted to the syntax of math tokens.
    fn get_file_include(&mut self) -> Span {
        let mut res = Span::NULL;
        let mut count = 0;
        loop {
            let tok = self.get();
            if tok.is_null() {
                break;
            }
            let tref = tok.as_ref(self.buffer);
            if tref == b"$]" {
                if count == 0 {
                    self.diag(Diagnostic::EmptyFilename);
                    self.invalidated = true;
                } else if count > 1 {
                    self.diag(Diagnostic::FilenameSpaces);
                    self.invalidated = true;
                } else if res.as_ref(self.buffer).contains(&b'$') {
                    self.diag(Diagnostic::FilenameDollar);
                }
                return res;
            } else if !tref.is_empty() && tref[0] == b'$' {
                break;
            }
            count += 1;
            res = tok;
        }
        self.diag(Diagnostic::UnclosedInclude);
        self.invalidated = true;
        res
    }

    /// Main function called to read a complete statement from the input buffer.
    fn get_statement(&mut self) -> Statement {
        self.statement_start = self.position;
        self.statement_math_start = self.span_pool.len();
        self.statement_proof_start = self.span_pool.len();

        // is there a freestanding comment?
        if let Some(stmt) = self.get_comment_statement() {
            return stmt;
        }

        // look for labels before the keyword
        self.read_labels();

        let kwtok = self.get();
        let stype = if kwtok.is_null() {
            Eof
        } else {
            let kwtok_ref = kwtok.as_ref(self.buffer);
            if kwtok_ref.len() == 2 && kwtok_ref[0] == b'$' {
                match kwtok_ref[1] {
                    b'[' => FileInclude,
                    b'a' => Axiom,
                    b'c' => Constant,
                    b'd' => Disjoint,
                    b'e' => Essential,
                    b'f' => Floating,
                    b'p' => Provable,
                    b'v' => Variable,
                    b'{' => OpenGroup,
                    b'}' => CloseGroup,
                    _ => Invalid,
                }
            } else {
                Invalid
            }
        };
        if stype == Invalid {
            self.diag(Diagnostic::UnknownKeyword(kwtok));
        }
        self.invalidated = false;

        // handle any labels recorded earlier appropriately
        let mut label = if stype.takes_label() {
            self.get_label()
        } else {
            self.get_no_label(kwtok)
        };

        // keyword is followed by strings; this also errors if the string is
        // empty
        if stype.takes_math() {
            self.get_strings(stype == Provable);
        }

        // $d and $f statements require at least two tokens, check that now
        let math_len = self.statement_proof_start - self.statement_math_start;
        match stype {
            FileInclude => label = self.get_file_include(),
            Disjoint => {
                // math_len == 0 was already caught above (EmptyMathString)
                if math_len == 1 {
                    self.diag(Diagnostic::DisjointSingle);
                    self.invalidated = true;
                }
            }
            Floating => {
                // math_len = 0 was already marked EmptyMathString
                if math_len != 0 && math_len != 2 {
                    self.diag(Diagnostic::BadFloating);
                    self.invalidated = true;
                }
            }
            // eat tokens to the next keyword rather than treating them as
            // labels
            Invalid => self.eat_invalid(),
            _ => {}
        }

        let stype = if self.invalidated { Invalid } else { stype };

        self.out_statement(stype, label)
    }

    /// Reads statements until EOF or an include statement, which breaks logical
    /// order and requires a new segment.
    ///
    /// Also does extractions for the benefit of nameck.  Second return value is
    /// true if we are at EOF.
    fn get_segment(&mut self) -> (Segment, bool) {
        let mut seg = Segment {
            statements: Vec::new(),
            next_file: Span::NULL,
            symbols: Vec::new(),
            local_vars: Vec::new(),
            global_dvs: Vec::new(),
            labels: Vec::new(),
            floats: Vec::new(),
            buffer: self.buffer_ref.clone(),
            diagnostics: Vec::new(),
            span_pool: Vec::new(),
            outline: Vec::new(),
            t_commands: Vec::new(),
            j_commands: Vec::new(),
        };
        let mut top_group = NO_STATEMENT;
        let is_end;
        let end_diag;

        loop {
            let index = seg.statements.len() as StatementIndex;
            self.statement_index = index;
            let mut stmt = self.get_statement();
            stmt.group = top_group;
            seg.statements.push(stmt);

            // This manages the group stack, and sets `group` on all statements
            // to the `OpenGroup` of the innermost enclosing group (or the
            // matching opener, for `CloseGroup`).
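            //
            // For example (a sketch), for the input
            //   0:${  1:$e  2:${  3:$e  4:$}  5:$}
            // `group` ends up as
            //   0:-   1:0   2:0   3:2   4:2   5:0
            // and the fixup loops below give the outer `${` a `group_end` of 5
            // and the inner one a `group_end` of 4.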
            match seg.statements[index as usize].stype {
                OpenGroup => top_group = index,
                CloseGroup if top_group != NO_STATEMENT => {
                    seg.statements[top_group as usize].group_end = index;
                    top_group = seg.statements[top_group as usize].group;
                }
                CloseGroup => self.diag(Diagnostic::UnmatchedCloseGroup),
                Constant if top_group != NO_STATEMENT => {
                    self.diag(Diagnostic::ConstantNotTopLevel);
                }
                Essential if top_group == NO_STATEMENT => {
                    self.diag(Diagnostic::EssentialAtTopLevel)
                }
                FileInclude => {
                    // snag this _now_
                    seg.next_file = seg.statements[index as usize].label;
                    end_diag = Diagnostic::UnclosedBeforeInclude(index);
                    is_end = false;
                    break;
                }
                Eof => {
                    end_diag = Diagnostic::UnclosedBeforeEof;
                    is_end = true;
                    break;
                }
                _ => {}
            }
        }

        // make sure they're not trying to continue an open group past EOF or a
        // file include
        while top_group != NO_STATEMENT {
            seg.statements[top_group as usize].group_end = seg.statements.len() as StatementIndex;
            self.diagnostics.push((top_group, end_diag.clone()));
            top_group = seg.statements[top_group as usize].group;
        }

        // populate `group_end` for statements in groups; was set for
        // `OpenGroup` in the loop above, and we don't want to overwrite it
        for index in 0..seg.statements.len() {
            if seg.statements[index].group != NO_STATEMENT
                && seg.statements[index].stype != OpenGroup
            {
                seg.statements[index].group_end =
                    seg.statements[seg.statements[index].group as usize].group_end;
            }
        }

        seg.diagnostics = mem::take(&mut self.diagnostics);
        seg.span_pool = mem::take(&mut self.span_pool);
        seg.span_pool.shrink_to_fit();
        seg.statements.shrink_to_fit();
        collect_definitions(&mut seg);
        (seg, is_end)
    }
}

/// Extracts certain types of statement from the segment so that nameck doesn't
/// need statement-specific intelligence.
fn collect_definitions(seg: &mut Segment) {
    let buf: &[u8] = &seg.buffer;
    for (index, stmt) in seg.statements.iter().enumerate() {
        let index = index as StatementIndex;
        if stmt.stype.takes_label() {
            seg.labels.push(LabelDef { index });
        }

        if stmt.group_end != NO_STATEMENT {
            if stmt.stype == Variable {
                let math = &seg.span_pool[stmt.math_start..stmt.proof_start];
                for sindex in 0..math.len() {
                    seg.local_vars.push(LocalVarDef {
                        index,
                        ordinal: sindex as TokenIndex,
                    });
                }
            }
            // Skip further treatment if the statement is not at top-level, except for $j commands.
            if stmt.stype != AdditionalInfoComment {
                continue;
            }
        }

        let math = &seg.span_pool[stmt.math_start..stmt.proof_start];

        match stmt.stype {
            Constant => {
                for (sindex, &span) in math.iter().enumerate() {
                    seg.symbols.push(SymbolDef {
                        stype: SymbolType::Constant,
                        start: index,
                        name: span.as_ref(buf).into(),
                        ordinal: sindex as TokenIndex,
                    });
                }
            }
            Variable => {
                for (sindex, &span) in math.iter().enumerate() {
                    seg.symbols.push(SymbolDef {
                        stype: SymbolType::Variable,
                        start: index,
                        name: span.as_ref(buf).into(),
                        ordinal: sindex as TokenIndex,
                    });
                }
            }
            Disjoint => {
                seg.global_dvs.push(GlobalDv {
                    start: index,
                    vars: math.iter().map(|sp| sp.as_ref(buf).into()).collect(),
                });
            }
            Floating => {
                seg.floats.push(FloatDef {
                    start: index,
                    typecode: math[0].as_ref(buf).into(),
                    label: stmt.label.as_ref(buf).into(),
                    name: math[1].as_ref(buf).into(),
                });
            }
            HeadingComment(level) => {
                seg.outline.push(HeadingDef {
                    name: get_heading_name(buf, stmt.span.start).into(),
                    index,
                    level,
                });
            }
            TypesettingComment => match commands(buf, b't', stmt.span.start) {
                Ok(commands) => {
                    for result in commands {
                        match result {
                            Ok(command) => seg.t_commands.push((index, command)),
                            Err(diag) => seg.diagnostics.push((index, diag)),
                        }
                    }
                }
                Err(diag) => seg.diagnostics.push((index, diag)),
            },
            AdditionalInfoComment => match commands(buf, b'j', stmt.span.start) {
                Ok(commands) => {
                    for result in commands {
                        match result {
                            Ok(command) => seg.j_commands.push((index, command)),
                            Err(diag) => seg.diagnostics.push((index, diag)),
                        }
                    }
                }
                Err(diag) => seg.diagnostics.push((index, diag)),
            },
            _ => {}
        }
    }
}

/// Metamath spec valid label characters are `[-._a-zA-Z0-9]`
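///
/// Examples (a sketch; not run as a doctest since the exact import path is
/// not assumed here):
///
/// ```rust,ignore
/// assert!(is_valid_label(b"pm2.18d"));
/// assert!(is_valid_label(b"ax-mp"));
/// assert!(!is_valid_label(b"bad$label"));
/// assert!(!is_valid_label(b"no spaces"));
/// ```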
#[must_use]
pub fn is_valid_label(label: &[u8]) -> bool {
    label
        .iter()
        .all(|&c| c == b'.' || c == b'-' || c == b'_' || c.is_ascii_alphanumeric())
}

/// Extract a section name from a comment
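///
/// For example (illustrative), given a heading comment laid out as
///
/// ```text
/// $( #*#*#*#*
///       Predicate calculus
///    #*#*#*#* $)
/// ```
///
/// this returns the trimmed text of the first indented line,
/// `Predicate calculus`.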
fn get_heading_name(buffer: &[u8], pos: FilePos) -> TokenPtr<'_> {
    let mut index = pos as usize;
    while index < buffer.len() {
        // is this line indented?
        if buffer[index] == b' ' {
            break;
        }
        // skip this line
        while index < buffer.len() && buffer[index] != b'\n' {
            index += 1;
        }
        // skip the newline, too
        if index < buffer.len() {
            index += 1;
        }
    }
    // index points at the beginning of an indented line, or EOF

    // trim horizontal whitespace
    while index < buffer.len() && buffer[index] == b' ' {
        index += 1;
    }

    let mut eol = index;
    while eol < buffer.len() && buffer[eol] != b'\n' {
        eol += 1;
    }
    while eol > index && (buffer[eol - 1] == b'\r' || buffer[eol - 1] == b' ') {
        eol -= 1;
    }
    &buffer[index..eol]
}

/// Extract the parser commands out of a $t or $j special comment
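///
/// For example (a sketch), the comment
///
/// ```text
/// $( $t latexdef "wff" as "\rm wff"; $)
/// ```
///
/// yields one command whose tokens are the keyword `latexdef`, the string
/// `wff`, the keyword `as`, and the string `\rm wff`.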
fn commands(buffer: &[u8], ch: u8, pos: FilePos) -> Result<CommandIter<'_>, Diagnostic> {
    let mut iter = CommandIter {
        buffer,
        index: pos as usize,
    };
    let _ = iter.skip_white_spaces()?;
    iter.expect(b'$')?;
    iter.expect(b'(')?;
    let _ = iter.skip_white_spaces()?;
    iter.expect(b'$')?;
    iter.expect(ch)?;
    Ok(iter)
}

struct CommandIter<'a> {
    buffer: &'a [u8],
    index: usize,
}

impl CommandIter<'_> {
    const fn has_more(&self) -> bool {
        self.index < self.buffer.len()
    }

    const fn next_char(&self) -> u8 {
        self.buffer[self.index]
    }

    fn skip_white_spaces(&mut self) -> Result<Option<()>, Diagnostic> {
        'outer_loop: while self.has_more() {
            match self.next_char() {
                b' ' | b'\t' | b'\n' | b'\r' => {} // Skip white spaces and line feeds
                // End upon comment closing, $)
                b'$' => return Ok(None),
                // Found a comment start
                b'/' if self.buffer.get(self.index + 1) == Some(&b'*') => {
                    for i in self.index + 2..self.buffer.len() - 1 {
                        // found the end of the comment
                        if self.buffer[i] == b'*' && self.buffer[i + 1] == b'/' {
                            self.index = i + 2;
                            continue 'outer_loop;
                        }
                    }
                    // unclosed comment, advance self.index to end
                    let cspan = Span::new(self.index, self.buffer.len());
                    self.index = self.buffer.len();
                    return Err(Diagnostic::UnclosedCommandComment(cspan));
                }
                // Else stop
                _ => break,
            }
            self.index += 1;
        }
        Ok(Some(()))
    }

    #[allow(clippy::if_not_else)]
    fn expect(&mut self, c: u8) -> Result<(), Diagnostic> {
        if !self.has_more() {
            let cspan = Span::new(self.index, self.buffer.len());
            Err(Diagnostic::UnclosedComment(cspan))
        } else if self.next_char() != c {
            let cspan = Span::new(self.index, self.buffer.len());
            Err(Diagnostic::MalformedAdditionalInfo(cspan))
        } else {
            self.index += 1;
            Ok(())
        }
    }
}

impl Iterator for CommandIter<'_> {
    type Item = Result<Command, Diagnostic>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.skip_white_spaces() {
            Ok(Some(())) => {}
            Ok(None) => return None,
            Err(diag) => return Some(Err(diag)),
        }

        let command_start = self.index;
        let mut command = Vec::new();
        while self.has_more() {
            let token = if let quote @ (b'\'' | b'\"') = self.next_char() {
                self.index += 1;
                let token_start = self.index;
                // Stop if end quote
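                // (A doubled quote character escapes itself, e.g. the source
                // token 'it''s' spans `it''s` here; the doubling is presumably
                // undone later by `CommandToken::unescape_string`.)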
                loop {
                    if !self.has_more() {
                        let cspan = Span::new(token_start, self.buffer.len());
                        return Some(Err(Diagnostic::UnclosedCommandString(cspan)));
                    }
                    let ch = self.next_char();
                    self.index += 1;
                    if ch == quote {
                        if self.has_more() && self.next_char() == quote {
                            self.index += 1;
                        } else {
                            break;
                        }
                    }
                }
                CommandToken::String(Span::new(token_start, self.index - 1))
            } else {
                let token_start = self.index;
                // Stop if unquoted white spaces, line feeds or semicolon
                while self.has_more()
                    && !matches!(self.next_char(), b' ' | b'\t' | b'\n' | b'\r' | b';')
                {
                    self.index += 1;
                }
                CommandToken::Keyword(Span::new(token_start, self.index))
            };
            command.push(token);

            if let Err(diag) = self.skip_white_spaces() {
                return Some(Err(diag));
            }
            if self.has_more() && self.next_char() == b';' {
                // Stop if unquoted semicolon $)
                self.index += 1;
                return Some(Ok((Span::new(command_start, self.index), command)));
            }
        }
        None
    }
}

/// Slightly set.mm specific hack to extract a section name from a byte buffer.
///
/// This is run before parsing so it can't take advantage of comment extraction;
/// instead we look for the first indented line, within a heuristic limit of 500
/// bytes.
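///
/// For example (illustrative), a buffer beginning with
///
/// ```text
/// $( #####
///       My Database Title
///    ##### $)
/// ```
///
/// yields `"My Database Title"`.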
#[must_use]
pub fn guess_buffer_name(buffer: &[u8]) -> &str {
    let buffer = &buffer[0..cmp::min(500, buffer.len())];
    let ptr = get_heading_name(buffer, 0);

    // fail gracefully, this is a debugging aid only
    if ptr.is_empty() {
        "<no section name found>"
    } else {
        str::from_utf8(ptr).unwrap_or("<invalid UTF-8 in section name>")
    }
}

/// This function implements parsing stage 1, which breaks down the metalanguage
/// grammar, finding all identifier definitions and inclusion statements.
///
/// There is an argument to be made that we shouldn't tokenize or store the math
/// strings and proofs at this stage, since they're bulky and can easily be
/// generated on demand.
///
/// The current Metamath spec defines comments and file inclusions at the token
/// level.  It is useful for our purposes to parse comments that are strictly
/// between statements as if they were statements (SMM2 did this too; may
/// revisit) and we require file inclusions to be between statements at the top
/// nesting level (this has been approved by Norman Megill).
#[must_use]
pub fn parse_segments(input: &BufferRef) -> Vec<Arc<Segment>> {
    let mut closed_spans = Vec::new();
    let mut scanner = Scanner {
        buffer_ref: input.clone(),
        buffer: input,
        ..Scanner::default()
    };
    assert!(input.len() < FilePos::MAX as usize);

    loop {
        let (seg, last) = scanner.get_segment();
        // we could almost use seg.next_file == Span::NULL here, were it not
        // for the error case
        closed_spans.push(Arc::new(seg));
        if last {
            return closed_spans;
        }
    }
}

/// Creates a new empty segment as a container for an I/O error.
///
/// Every error must be associated with a statement in our design, so associate
/// it with the EOF statement of a zero-length segment.
#[must_use]
pub fn dummy_segment(diag: Diagnostic) -> Arc<Segment> {
    let mut seg = parse_segments(&Arc::new(Vec::new())).pop().unwrap();
    Arc::get_mut(&mut seg).unwrap().diagnostics.push((0, diag));
    seg
}

impl SegmentSet {
    pub(crate) fn build_typesetting_data(&self) -> TypesettingData {
        fn parse_one(
            data: &mut TypesettingData,
            buf: &[u8],
            addr: StatementAddress,
            span: Span,
            command: &[CommandToken],
        ) -> Result<(), Diagnostic> {
            const fn as_string(span: Span, tok: Option<&CommandToken>) -> Result<Span, Diagnostic> {
                match tok {
                    Some(&CommandToken::String(s)) => Ok(s),
                    None => Err(Diagnostic::CommandIncomplete(span)),
                    Some(&CommandToken::Keyword(s)) => Err(Diagnostic::CommandExpectedString(s)),
                }
            }

            let (cmd, mut rest) = match command.split_first() {
                Some((&CommandToken::Keyword(k), rest)) => (k, rest.iter()),
                _ => return Err(Diagnostic::BadCommand(span)),
            };

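            // Concatenate a `+`-separated sequence of quoted strings into a
            // single token value, e.g. (a sketch of the $t syntax handled
            // below):
            //   htmldef "x" as "<i>" + "x" + "</i>" ;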
            let sum = |mut rest: std::slice::Iter<'_, CommandToken>| {
                let mut span_out = as_string(span, rest.next())?;
                let mut accum: Vec<u8> = CommandToken::unescape_string(buf, span_out).into();
                span_out.start -= 1;
                loop {
                    match rest.next() {
                        None => {
                            span_out.end += 1;
                            return Ok(((addr.segment_id, span_out), accum.into()));
                        }
                        Some(CommandToken::Keyword(plus)) if plus.as_ref(buf) == b"+" => {}
                        _ => return Err(Diagnostic::CommandIncomplete(span)),
                    }
                    let span2 = as_string(span, rest.next())?;
                    CommandToken::append_unescaped_string(buf, span2, &mut accum);
                    span_out.end = span2.end
                }
            };

            let as_ = |rest: &mut std::slice::Iter<'_, CommandToken>| {
                let s = as_string(span, rest.next())?;
                match rest.next() {
                    Some(CommandToken::Keyword(as_)) if as_.as_ref(buf) == b"as" => Ok(s),
                    None => Err(Diagnostic::CommandIncomplete(span)),
                    Some(tk) => Err(Diagnostic::CommandExpectedAs(tk.full_span())),
                }
            };
            let mut insert = |kind: MarkupKind, sp: Span, (sp2, val): (GlobalSpan, Token)| {
                let map = match kind {
                    MarkupKind::Html => &mut data.html_defs,
                    MarkupKind::AltHtml => &mut data.alt_html_defs,
                    MarkupKind::Latex => &mut data.latex_defs,
                };
                match map.entry(CommandToken::unescape_string(buf, sp).into()) {
                    Entry::Occupied(e) => {
                        let (sp2, (id2, _), _) = *e.get();
                        data.diagnostics
                            .push((addr, Diagnostic::DuplicateMarkupDef(kind, (id2, sp2), sp)))
                    }
                    Entry::Vacant(e) => {
                        e.insert((sp, sp2, val));
                    }
                }
            };
            match cmd.as_ref(buf) {
                b"latexdef" => insert(MarkupKind::Latex, as_(&mut rest)?, sum(rest)?),
                b"htmldef" => insert(MarkupKind::Html, as_(&mut rest)?, sum(rest)?),
                b"althtmldef" => insert(MarkupKind::AltHtml, as_(&mut rest)?, sum(rest)?),
                b"htmlvarcolor" => data.html_var_color.push(sum(rest)?),
                b"htmltitle" => data.html_title = Some(sum(rest)?),
                b"htmlhome" => data.html_home = Some(sum(rest)?),
                b"exthtmltitle" => data.ext_html_title = Some(sum(rest)?),
                b"exthtmlhome" => data.ext_html_home = Some(sum(rest)?),
                b"exthtmllabel" => data.ext_html_label = Some(sum(rest)?),
                b"htmldir" => data.html_dir = Some(sum(rest)?),
                b"althtmldir" => data.alt_html_dir = Some(sum(rest)?),
                b"htmlbibliography" => data.html_bibliography = Some(sum(rest)?),
                b"exthtmlbibliography" => data.ext_html_bibliography = Some(sum(rest)?),
                b"htmlcss" => data.html_css = Some(sum(rest)?),
                b"htmlfont" => data.html_font = Some(sum(rest)?),
                b"htmlexturl" => data.html_ext_url = Some(sum(rest)?),
                _ => return Err(Diagnostic::UnknownTypesettingCommand(cmd)),
            }
            Ok(())
        }

        let mut data = TypesettingData::default();
        for seg in self.segments(..) {
            for &(ix, (span, ref command)) in &seg.t_commands {
                let address = StatementAddress::new(seg.id, ix);
                if let Err(diag) = parse_one(&mut data, &seg.buffer, address, span, command) {
                    data.diagnostics.push((address, diag))
                }
            }
        }
        data
    }
}

/// A parsed heading comment.
#[derive(Debug, Clone, Copy)]
pub struct HeadingComment {
    /// The header part of the heading comment (the text of the header)
    pub header: Span,
    /// The content part of the heading comment (descriptive text regarding the header)
    pub content: Span,
}

impl HeadingComment {
    /// Parses a heading comment at the given span in a segment buffer,
    /// with the specified heading level and span. Returns `None` if this is not a heading comment
    /// or it is malformed.
    #[must_use]
    pub fn parse(buf: &[u8], lvl: HeadingLevel, sp: Span) -> Option<Self> {
        static MAJOR_PART: OnceLock<Regex> = OnceLock::new();
        static SECTION: OnceLock<Regex> = OnceLock::new();
        static SUBSECTION: OnceLock<Regex> = OnceLock::new();
        static SUBSUBSECTION: OnceLock<Regex> = OnceLock::new();
        let regex = match lvl {
            HeadingLevel::MajorPart => MAJOR_PART.get_or_init(|| {
                Regex::new(r"^[ \r\n]+#{4,}\r?\n *([^\n]*)\r?\n#{4,}\r?\n").unwrap()
            }),
            HeadingLevel::Section => SECTION.get_or_init(|| {
                Regex::new(r"^[ \r\n]+(?:#\*){2,}#?\r?\n *([^\n]*)\r?\n(?:#\*){2,}#?\r?\n").unwrap()
            }),
            HeadingLevel::SubSection => SUBSECTION.get_or_init(|| {
                Regex::new(r"^[ \r\n]+(?:=-){2,}=?\r?\n *([^\n]*)\r?\n(?:=-){2,}=?\r?\n").unwrap()
            }),
            HeadingLevel::SubSubSection => SUBSUBSECTION.get_or_init(|| {
                Regex::new(r"^[ \r\n]+(?:-\.){2,}-?\r?\n *([^\n]*)\r?\n(?:-\.){2,}-?\r?\n").unwrap()
            }),
            _ => unreachable!(),
        };
        let groups = regex.captures(sp.as_ref(buf))?;
        let m = groups.get(1)?;
        Some(Self {
            header: Span::new2(sp.start + m.start() as u32, sp.start + m.end() as u32),
            content: Span::new2(sp.start + groups.get(0)?.end() as u32, sp.end),
        })
    }

    /// Parses a mathbox heading comment, returning the span of the author name.
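    ///
    /// For example (illustrative), a header reading `Mathbox for Jane Doe`
    /// yields the span covering `Jane Doe`.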
    #[must_use]
    pub fn parse_mathbox_header(&self, buf: &[u8]) -> Option<Span> {
        static MATHBOX_FOR: OnceLock<Regex> = OnceLock::new();
        let mathbox_for = MATHBOX_FOR.get_or_init(|| Regex::new(r"^Mathbox for (.*)$").unwrap());
        let m = mathbox_for.captures(self.header.as_ref(buf))?.get(1)?;
        Some(Span::new2(
            self.header.start + m.start() as u32,
            self.header.start + m.end() as u32,
        ))
    }
}