Skip to main content

lex_core/lex/parsing/
parser.rs

1//! Declarative Grammar Engine - Regex & Imperative Parser for lex
2//!
3//! This module implements a unified parser using declarative regex grammar rules
4//! with imperative fallbacks for patterns that need look-ahead:
//! 1. Tries the imperative matchers that must win over the regex grammar (verbatim blocks, tables)
//! 2. Converts the remaining token sequence to a grammar notation string
//! 3. Matches that string against regex patterns in declaration order
//! 4. Extracts consumed token indices from the regex match
//! 5. Falls back to the imperative paragraph matcher when no regex pattern matches
//! 6. Recursively descends into containers when building the AST
10//!
11//! The grammar patterns and AST building logic have been extracted to separate modules:
12//! - `grammar.rs` - Pattern definitions and matching order
13//! - `builder.rs` - AST node construction from matched patterns
14
15use crate::lex::parsing::ir::{NodeType, ParseNode};
16use crate::lex::token::{LineContainer, LineType};
17use regex::Regex;
18use std::ops::Range;
19
20mod builder;
21mod grammar;
22
23use builder::{
24    blank_line_node_from_range, container_starts_with_pipe_row, convert_pattern_to_node,
25    PatternMatch,
26};
27use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
28
/// Pattern matcher for the declarative grammar.
///
/// This is a stateless unit struct used purely as a namespace: all matching
/// logic lives in associated functions (none of them take `self`), combining
/// regex-based matching with a few imperative matchers.
pub struct GrammarMatcher;
31
32impl GrammarMatcher {
33    /// Try to match a pattern at the current level using regex patterns.
34    ///
35    /// Converts the current token sequence to a grammar string, matches against
36    /// regex patterns in declaration order, and returns the matched pattern with
37    /// consumed token indices.
38    ///
39    /// Returns (matched_pattern, consumed_indices)
40    fn try_match(
41        tokens: &[LineContainer],
42        start_idx: usize,
43        allow_sessions: bool,
44        is_first_item: bool,
45        has_preceding_blank: bool,
46        has_preceding_boundary: bool,
47        prev_was_session: bool,
48    ) -> Option<(PatternMatch, Range<usize>)> {
49        if start_idx >= tokens.len() {
50            return None;
51        }
52
53        // Try verbatim block first (requires special imperative matching logic)
54        if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
55            return Some(result);
56        }
57
58        // Try table: subject + container whose first non-blank line is a pipe row.
59        // Must run before the definition pattern (which matches the same subject + container).
60        if let Some(result) = Self::match_table(tokens, start_idx) {
61            return Some(result);
62        }
63
64        // Convert remaining tokens to grammar string
65        let remaining_tokens = &tokens[start_idx..];
66        let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
67
68        // Try each pattern in order
69        for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
70            // Skip patterns handled imperatively above
71            if *pattern_name == "verbatim_block" {
72                continue;
73            }
74            if let Ok(regex) = Regex::new(pattern_regex_str) {
75                if let Some(caps) = regex.captures(&token_string) {
76                    let full_match = caps.get(0)?;
77                    let consumed_count = Self::count_consumed_tokens(full_match.as_str());
78
79                    // Use captures to extract indices and build the pattern
80                    let pattern = match *pattern_name {
81                        "annotation_block" => PatternMatch::AnnotationBlock {
82                            start_idx: 0,
83                            content_idx: 1,
84                        },
85                        "annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
86                        "list_no_blank" => {
87                            // List without preceding blank line
88                            let items_str = caps.name("items")?.as_str();
89                            let mut items = Vec::new();
90                            let mut token_idx = 0; // No blank line, so start at 0
91                            for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
92                                let has_container = item_cap.as_str().contains("<container>");
93                                items.push((
94                                    token_idx,
95                                    if has_container {
96                                        Some(token_idx + 1)
97                                    } else {
98                                        None
99                                    },
100                                ));
101                                token_idx += if has_container { 2 } else { 1 };
102                            }
103
104                            let trailing_blank_count = caps
105                                .name("trailing_blank")
106                                .map(|m| Self::count_consumed_tokens(m.as_str()))
107                                .unwrap_or(0);
108                            let trailing_blank_range = if trailing_blank_count > 0 {
109                                Some(
110                                    start_idx + consumed_count - trailing_blank_count
111                                        ..start_idx + consumed_count,
112                                )
113                            } else {
114                                None
115                            };
116
117                            PatternMatch::List {
118                                items,
119                                preceding_blank_range: None,
120                                trailing_blank_range,
121                            }
122                        }
123                        "list" => {
124                            let blank_count = caps
125                                .name("blank")
126                                .map(|m| Self::count_consumed_tokens(m.as_str()))
127                                .unwrap_or(0);
128                            let items_str = caps.name("items")?.as_str();
129                            let mut items = Vec::new();
130                            let mut token_idx = blank_count;
131                            for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
132                                let has_container = item_cap.as_str().contains("<container>");
133                                items.push((
134                                    token_idx,
135                                    if has_container {
136                                        Some(token_idx + 1)
137                                    } else {
138                                        None
139                                    },
140                                ));
141                                token_idx += if has_container { 2 } else { 1 };
142                            }
143                            let trailing_blank_count = caps
144                                .name("trailing_blank")
145                                .map(|m| Self::count_consumed_tokens(m.as_str()))
146                                .unwrap_or(0);
147                            let preceding_blank_range = if blank_count > 0 {
148                                Some(start_idx..start_idx + blank_count)
149                            } else {
150                                None
151                            };
152                            let trailing_blank_range = if trailing_blank_count > 0 {
153                                Some(
154                                    start_idx + consumed_count - trailing_blank_count
155                                        ..start_idx + consumed_count,
156                                )
157                            } else {
158                                None
159                            };
160
161                            PatternMatch::List {
162                                items,
163                                preceding_blank_range,
164                                trailing_blank_range,
165                            }
166                        }
167                        "session" => {
168                            // Allow session_no_blank in these cases:
169                            // 1. At document start (is_first_item=true), OR
170                            // 2. At container start when sessions are allowed (start_idx=0 && allow_sessions=true), OR
171                            // 3. After a BlankLineGroup when sessions are allowed (has_preceding_blank && allow_sessions)
172                            // 4. Immediately after another session (prev_was_session && allow_sessions)
173                            // 5. Immediately after a container that just closed (has_preceding_boundary && allow_sessions)
174                            // This prevents Sessions inside Definitions while allowing legitimate session sequences.
175                            if !allow_sessions {
176                                continue; // Definitions and other containers don't allow sessions
177                            }
178                            if !(is_first_item
179                                || start_idx == 0
180                                || has_preceding_blank
181                                || has_preceding_boundary
182                                || prev_was_session)
183                            {
184                                continue; // Sessions need a separator or another session before them
185                            }
186                            let blank_str = caps.name("blank")?.as_str();
187                            let blank_count = Self::count_consumed_tokens(blank_str);
188                            PatternMatch::Session {
189                                subject_idx: 0,
190                                content_idx: 1 + blank_count,
191                                preceding_blank_range: None,
192                            }
193                        }
194                        "definition" => PatternMatch::Definition {
195                            subject_idx: 0,
196                            content_idx: 1,
197                        },
198                        "blank_line_group" => PatternMatch::BlankLineGroup,
199                        "document_title_with_subtitle" => {
200                            // No container lookahead needed: the subtitle variant
201                            // consumed two lines (title + subtitle) before blank lines.
202                            // A session only has one line before blank + container, so
203                            // the presence of a container after the blank is NOT ambiguous
204                            // here — it's the document body, not a session body.
205                            // Match: DocumentStart(0) + title(1) + subtitle(2) + blank lines
206                            PatternMatch::DocumentTitle {
207                                title_idx: 1,
208                                subtitle_idx: Some(2),
209                            }
210                        }
211                        "document_title" => {
212                            // Imperative negative lookahead: not followed by container
213                            let next_idx = start_idx + consumed_count;
214                            if next_idx < tokens.len()
215                                && matches!(&tokens[next_idx], LineContainer::Container { .. })
216                            {
217                                // Followed by container — this is a session, not a title
218                                continue;
219                            }
220                            // Match is: DocumentStart(0) + title line(1) + blank lines
221                            PatternMatch::DocumentTitle {
222                                title_idx: 1,
223                                subtitle_idx: None,
224                            }
225                        }
226                        "document_start" => PatternMatch::DocumentStart,
227                        _ => continue,
228                    };
229
230                    return Some((pattern, start_idx..start_idx + consumed_count));
231                }
232            }
233        }
234
235        // Paragraph: matched imperatively after all regex patterns fail.
236        // Stops before element boundaries (list starts, definition starts).
237        Self::match_paragraph(tokens, start_idx)
238    }
239
240    /// Convert remaining tokens to grammar notation string
241    fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
242        let mut result = String::new();
243        for token in tokens {
244            match token {
245                LineContainer::Token(t) => {
246                    result.push_str(&t.line_type.to_grammar_string());
247                }
248                LineContainer::Container { .. } => {
249                    result.push_str("<container>");
250                }
251            }
252        }
253        if result.is_empty() {
254            None
255        } else {
256            Some(result)
257        }
258    }
259
260    /// Count how many tokens are represented in a grammar string.
261    /// Each token type in angle brackets represents one token.
262    fn count_consumed_tokens(grammar_str: &str) -> usize {
263        grammar_str.matches('<').count()
264    }
265
266    /// Match paragraphs using imperative logic.
267    ///
268    /// Consumes content lines (paragraph, dialog, subject, list) one at a time,
269    /// stopping before sequences that form other block elements:
270    /// - Before 2+ consecutive list-like lines (list start)
271    /// - Before a subject line followed by a container (definition start)
272    fn match_paragraph(
273        tokens: &[LineContainer],
274        start_idx: usize,
275    ) -> Option<(PatternMatch, Range<usize>)> {
276        use LineType::*;
277
278        let len = tokens.len();
279        let mut idx = start_idx;
280
281        while idx < len {
282            match &tokens[idx] {
283                LineContainer::Token(t) => match t.line_type {
284                    ParagraphLine | DialogLine => {
285                        idx += 1;
286                    }
287                    SubjectLine => {
288                        // Stop if followed by container (definition start)
289                        if Self::next_is_container(tokens, idx) {
290                            break;
291                        }
292                        idx += 1;
293                    }
294                    SubjectOrListItemLine => {
295                        // Stop if followed by container (definition start)
296                        if Self::next_is_container(tokens, idx) {
297                            break;
298                        }
299                        // Stop if followed by another list-like line (list start)
300                        if Self::next_is_list_like(tokens, idx) {
301                            break;
302                        }
303                        idx += 1;
304                    }
305                    ListLine => {
306                        // Stop if followed by another list-like line, possibly
307                        // with a container in between (list start)
308                        if Self::next_is_list_continuation(tokens, idx) {
309                            break;
310                        }
311                        idx += 1;
312                    }
313                    _ => break, // Blank line, annotation, document-start, etc.
314                },
315                LineContainer::Container { .. } => break,
316            }
317        }
318
319        if idx > start_idx {
320            Some((
321                PatternMatch::Paragraph {
322                    start_idx: 0,
323                    end_idx: idx - start_idx - 1,
324                },
325                start_idx..idx,
326            ))
327        } else {
328            None
329        }
330    }
331
332    /// Check if the token after `idx` is a Container.
333    fn next_is_container(tokens: &[LineContainer], idx: usize) -> bool {
334        let next = idx + 1;
335        next < tokens.len() && matches!(&tokens[next], LineContainer::Container { .. })
336    }
337
338    /// Check if the token after `idx` is a list-like line (ListLine or SubjectOrListItemLine).
339    fn next_is_list_like(tokens: &[LineContainer], idx: usize) -> bool {
340        let next = idx + 1;
341        if next >= tokens.len() {
342            return false;
343        }
344        matches!(
345            &tokens[next],
346            LineContainer::Token(t) if matches!(t.line_type, LineType::ListLine | LineType::SubjectOrListItemLine)
347        )
348    }
349
350    /// Check if the token after `idx` starts a list continuation:
351    /// either directly another list-like line, or a container followed by a list-like line.
352    fn next_is_list_continuation(tokens: &[LineContainer], idx: usize) -> bool {
353        let next = idx + 1;
354        if next >= tokens.len() {
355            return false;
356        }
357        match &tokens[next] {
358            LineContainer::Token(t) => {
359                matches!(
360                    t.line_type,
361                    LineType::ListLine | LineType::SubjectOrListItemLine
362                )
363            }
364            LineContainer::Container { .. } => {
365                // Container after list item — check if another list item follows
366                let after = next + 1;
367                after < tokens.len()
368                    && matches!(
369                        &tokens[after],
370                        LineContainer::Token(t) if matches!(t.line_type, LineType::ListLine | LineType::SubjectOrListItemLine)
371                    )
372            }
373        }
374    }
375
376    /// Match tables using imperative logic.
377    ///
378    /// A table is a subject line followed immediately by a container whose first
379    /// non-blank line starts with a pipe character. This runs before the definition
380    /// pattern (which matches the same `subject + container` shape) to ensure
381    /// tables are detected by their content.
382    fn match_table(
383        tokens: &[LineContainer],
384        start_idx: usize,
385    ) -> Option<(PatternMatch, Range<usize>)> {
386        use LineType::{SubjectLine, SubjectOrListItemLine};
387
388        if start_idx >= tokens.len() {
389            return None;
390        }
391
392        // Must start with a subject line
393        let is_subject = matches!(
394            &tokens[start_idx],
395            LineContainer::Token(line) if matches!(line.line_type, SubjectLine | SubjectOrListItemLine)
396        );
397        if !is_subject {
398            return None;
399        }
400
401        // Must be immediately followed by a container
402        let content_idx = start_idx + 1;
403        if content_idx >= tokens.len() {
404            return None;
405        }
406        let container = &tokens[content_idx];
407        if !matches!(container, LineContainer::Container { .. }) {
408            return None;
409        }
410
411        // Container's first non-blank line must start with a pipe
412        if !container_starts_with_pipe_row(container) {
413            return None;
414        }
415
416        Some((
417            PatternMatch::Table {
418                subject_idx: 0,
419                content_idx: 1,
420            },
421            start_idx..content_idx + 1,
422        ))
423    }
424
425    /// Match verbatim blocks using imperative logic.
426    ///
427    /// Verbatim blocks consist of:
428    /// 1. A subject line
429    /// 2. Content that is either:
430    ///    a) In a Container (inflow mode - content indented relative to subject)
431    ///    b) Flat lines (fullwidth mode - content at fixed column, or groups)
432    /// 3. A closing annotation marker (:: ... ::)
433    ///
434    /// This matcher handles both the original inflow case (subject + container + annotation)
435    /// and the fullwidth case (subject + flat lines + annotation). To distinguish verbatim
436    /// blocks from sessions followed by annotations, we require that either:
437    /// - There's a Container immediately after the subject, OR
438    /// - The closing annotation is at the SAME indentation as the subject
439    ///
440    /// Sessions have their title at the root level and content is indented. If we see
441    /// a root-level annotation after a root-level subject with indented content between,
442    /// that's NOT a verbatim block - it's a session followed by an annotation.
443    fn match_verbatim_block(
444        tokens: &[LineContainer],
445        start_idx: usize,
446    ) -> Option<(PatternMatch, Range<usize>)> {
447        use LineType::{
448            BlankLine, DataMarkerLine, DocumentStart, SubjectLine, SubjectOrListItemLine,
449        };
450
451        let len = tokens.len();
452        if start_idx >= len {
453            return None;
454        }
455
456        // Allow blank lines and DocumentStart before the subject to be consumed as part of this match
457        let mut idx = start_idx;
458        while idx < len {
459            if let LineContainer::Token(line) = &tokens[idx] {
460                if line.line_type == BlankLine || line.line_type == DocumentStart {
461                    idx += 1;
462                    continue;
463                }
464            }
465            break;
466        }
467
468        if idx >= len {
469            return None;
470        }
471
472        // Must start with a subject line
473        let first_subject_idx = match &tokens[idx] {
474            LineContainer::Token(line)
475                if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
476            {
477                idx
478            }
479            _ => return None,
480        };
481
482        let mut cursor = first_subject_idx + 1;
483
484        // Try to match one or more subject+content pairs followed by closing annotation
485        // This loop handles verbatim groups: multiple subjects sharing one closing annotation
486        loop {
487            // Skip blank lines
488            while cursor < len {
489                if let LineContainer::Token(line) = &tokens[cursor] {
490                    if line.line_type == BlankLine {
491                        cursor += 1;
492                        continue;
493                    }
494                }
495                break;
496            }
497
498            if cursor >= len {
499                return None;
500            }
501
502            // Check what we have at cursor
503            match &tokens[cursor] {
504                LineContainer::Container { .. } => {
505                    // Found a container - this is potentially inflow mode verbatim content
506                    // But we need to verify the pattern:
507                    // - Verbatim: subject + container + (annotation OR another subject+container)
508                    // - Session: subject + container + (other content)
509                    cursor += 1;
510
511                    // Skip blank lines after container
512                    while cursor < len {
513                        if let LineContainer::Token(line) = &tokens[cursor] {
514                            if line.line_type == BlankLine {
515                                cursor += 1;
516                                continue;
517                            }
518                        }
519                        break;
520                    }
521
522                    // After container, check what follows
523                    if cursor >= len {
524                        return None; // Container at end - not a verbatim block
525                    }
526
527                    match &tokens[cursor] {
528                        LineContainer::Token(line) => {
529                            if matches!(line.line_type, DataMarkerLine) {
530                                // Container followed by closing annotation (:: label ::) - this IS verbatim!
531                                // Continue loop to match it
532                                continue;
533                            }
534                            if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
535                                // Container followed by another subject - this is a verbatim group!
536                                // Continue loop to match more groups
537                                continue;
538                            }
539                            // Container followed by something else - NOT a verbatim block
540                            return None;
541                        }
542                        LineContainer::Container { .. } => {
543                            // Container followed by another container - NOT verbatim pattern
544                            return None;
545                        }
546                    }
547                }
548                LineContainer::Token(line) => {
549                    if matches!(line.line_type, DataMarkerLine) {
550                        // Found closing annotation (:: label ::) - success!
551                        // But only if we haven't mixed containers with flat content in a problematic way
552                        return Some((
553                            PatternMatch::VerbatimBlock {
554                                subject_idx: first_subject_idx,
555                                content_range: (first_subject_idx + 1)..cursor,
556                                closing_idx: cursor,
557                            },
558                            start_idx..(cursor + 1),
559                        ));
560                    }
561
562                    if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
563                        // Another subject - this is another group
564                        cursor += 1;
565                        continue;
566                    }
567
568                    // Any other flat token (paragraph line, etc.)
569                    // This is fullwidth mode or group content
570                    cursor += 1;
571                }
572            }
573        }
574    }
575}
576
577/// Main recursive descent parser using the declarative grammar.
578///
579/// This is the entry point for parsing a sequence of tokens at any level.
580/// It iteratively tries to match patterns and recursively descends into containers.
581pub fn parse_with_declarative_grammar(
582    tokens: Vec<LineContainer>,
583    source: &str,
584) -> Result<Vec<ParseNode>, String> {
585    parse_with_declarative_grammar_internal(tokens, source, true, true)
586}
587
588/// Internal parsing function with nesting level tracking
589fn parse_with_declarative_grammar_internal(
590    tokens: Vec<LineContainer>,
591    source: &str,
592    allow_sessions: bool,
593    is_doc_start: bool,
594) -> Result<Vec<ParseNode>, String> {
595    let mut items: Vec<ParseNode> = Vec::new();
596    let mut idx = 0;
597
598    while idx < tokens.len() {
599        let (has_preceding_blank, has_preceding_boundary, prev_was_session) =
600            if let Some(last_node) = items.last() {
601                (
602                    matches!(last_node.node_type, NodeType::BlankLineGroup),
603                    // A node with children indicates we just closed a container; this counts as a boundary.
604                    // DocumentStart and DocumentTitle also count as boundaries.
605                    !last_node.children.is_empty()
606                        || matches!(
607                            last_node.node_type,
608                            NodeType::DocumentStart | NodeType::DocumentTitle
609                        ),
610                    matches!(last_node.node_type, NodeType::Session),
611                )
612            } else {
613                (false, false, false)
614            };
615
616        let is_first_item = idx == 0 && is_doc_start;
617        if let Some((pattern, range)) = GrammarMatcher::try_match(
618            &tokens,
619            idx,
620            allow_sessions,
621            is_first_item,
622            has_preceding_blank,
623            has_preceding_boundary,
624            prev_was_session,
625        ) {
626            let mut pending_nodes = Vec::new();
627
628            if let PatternMatch::List {
629                preceding_blank_range: Some(blank_range),
630                ..
631            } = &pattern
632            {
633                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
634            }
635
636            if let PatternMatch::Session {
637                preceding_blank_range: Some(blank_range),
638                ..
639            } = &pattern
640            {
641                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
642            }
643
644            // Convert pattern to ParseNode
645            // Sessions parse their children with allow_sessions=true to allow nested sessions
646            // Other elements parse with allow_sessions=false to prevent sessions inside them
647            let is_session = matches!(&pattern, PatternMatch::Session { .. });
648            let item = convert_pattern_to_node(
649                &tokens,
650                &pattern,
651                range.clone(),
652                source,
653                &move |children, src| {
654                    parse_with_declarative_grammar_internal(children, src, is_session, false)
655                },
656            )?;
657            pending_nodes.push(item);
658
659            if let PatternMatch::List {
660                trailing_blank_range: Some(blank_range),
661                ..
662            } = &pattern
663            {
664                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
665            }
666
667            items.extend(pending_nodes);
668            idx = range.end;
669        } else {
670            // When no pattern matches, check if this is a Container (orphaned indented content).
671            // Rather than silently dropping it, parse its children and promote them to this level.
672            if let LineContainer::Container {
673                children: inner, ..
674            } = &tokens[idx]
675            {
676                if !inner.is_empty() {
677                    let orphaned = parse_with_declarative_grammar_internal(
678                        inner.clone(),
679                        source,
680                        allow_sessions,
681                        false,
682                    )?;
683                    items.extend(orphaned);
684                }
685            }
686            idx += 1;
687        }
688    }
689
690    Ok(items)
691}