// Source file: lex_core/lex/parsing/parser.rs

1//! Declarative Grammar Engine - Regex-Based Parser for lex
2//!
3//! This module implements a unified parser using declarative regex grammar rules:
4//! 1. Converts token sequences to grammar notation strings
5//! 2. Matches against regex patterns in declaration order
6//! 3. Extracts consumed token indices from regex match
7//! 4. Recursively descends into containers when building AST
8//! 5. No imperative pattern matching - grammar is data, not code
9//!
10//! The grammar patterns and AST building logic have been extracted to separate modules:
11//! - `grammar.rs` - Pattern definitions and matching order
12//! - `builder.rs` - AST node construction from matched patterns
13
14use crate::lex::parsing::ir::{NodeType, ParseNode};
15use crate::lex::token::{LineContainer, LineType};
16use regex::Regex;
17use std::ops::Range;
18
19mod builder;
20mod grammar;
21
22use builder::{blank_line_node_from_range, convert_pattern_to_node, PatternMatch};
23use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
24
/// Pattern matcher for declarative grammar using regex-based matching.
///
/// Zero-sized namespace type: all matching logic lives in associated
/// functions that operate on token slices, so no instance is ever created.
pub struct GrammarMatcher;
27
28impl GrammarMatcher {
29    /// Try to match a pattern at the current level using regex patterns.
30    ///
31    /// Converts the current token sequence to a grammar string, matches against
32    /// regex patterns in declaration order, and returns the matched pattern with
33    /// consumed token indices.
34    ///
35    /// Returns (matched_pattern, consumed_indices)
36    fn try_match(
37        tokens: &[LineContainer],
38        start_idx: usize,
39        allow_sessions: bool,
40        is_first_item: bool,
41        has_preceding_blank: bool,
42        has_preceding_boundary: bool,
43        prev_was_session: bool,
44    ) -> Option<(PatternMatch, Range<usize>)> {
45        if start_idx >= tokens.len() {
46            return None;
47        }
48
49        // Try verbatim block first (requires special imperative matching logic)
50        if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
51            return Some(result);
52        }
53
54        // Convert remaining tokens to grammar string
55        let remaining_tokens = &tokens[start_idx..];
56        let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
57
58        // Try each pattern in order
59        for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
60            // Skip patterns handled imperatively above
61            if *pattern_name == "verbatim_block" {
62                continue;
63            }
64            if let Ok(regex) = Regex::new(pattern_regex_str) {
65                if let Some(caps) = regex.captures(&token_string) {
66                    let full_match = caps.get(0)?;
67                    let consumed_count = Self::count_consumed_tokens(full_match.as_str());
68
69                    // Use captures to extract indices and build the pattern
70                    let pattern = match *pattern_name {
71                        "annotation_block_with_end" => PatternMatch::AnnotationBlock {
72                            start_idx: 0,
73                            content_idx: 1,
74                        },
75                        "annotation_block" => PatternMatch::AnnotationBlock {
76                            start_idx: 0,
77                            content_idx: 1,
78                        },
79                        "annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
80                        "list_no_blank" => {
81                            // List without preceding blank line
82                            let items_str = caps.name("items")?.as_str();
83                            let mut items = Vec::new();
84                            let mut token_idx = 0; // No blank line, so start at 0
85                            for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
86                                let has_container = item_cap.as_str().contains("<container>");
87                                items.push((
88                                    token_idx,
89                                    if has_container {
90                                        Some(token_idx + 1)
91                                    } else {
92                                        None
93                                    },
94                                ));
95                                token_idx += if has_container { 2 } else { 1 };
96                            }
97
98                            let trailing_blank_count = caps
99                                .name("trailing_blank")
100                                .map(|m| Self::count_consumed_tokens(m.as_str()))
101                                .unwrap_or(0);
102                            let trailing_blank_range = if trailing_blank_count > 0 {
103                                Some(
104                                    start_idx + consumed_count - trailing_blank_count
105                                        ..start_idx + consumed_count,
106                                )
107                            } else {
108                                None
109                            };
110
111                            PatternMatch::List {
112                                items,
113                                preceding_blank_range: None,
114                                trailing_blank_range,
115                            }
116                        }
117                        "list" => {
118                            let blank_count = caps
119                                .name("blank")
120                                .map(|m| Self::count_consumed_tokens(m.as_str()))
121                                .unwrap_or(0);
122                            let items_str = caps.name("items")?.as_str();
123                            let mut items = Vec::new();
124                            let mut token_idx = blank_count;
125                            for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
126                                let has_container = item_cap.as_str().contains("<container>");
127                                items.push((
128                                    token_idx,
129                                    if has_container {
130                                        Some(token_idx + 1)
131                                    } else {
132                                        None
133                                    },
134                                ));
135                                token_idx += if has_container { 2 } else { 1 };
136                            }
137                            let trailing_blank_count = caps
138                                .name("trailing_blank")
139                                .map(|m| Self::count_consumed_tokens(m.as_str()))
140                                .unwrap_or(0);
141                            let preceding_blank_range = if blank_count > 0 {
142                                Some(start_idx..start_idx + blank_count)
143                            } else {
144                                None
145                            };
146                            let trailing_blank_range = if trailing_blank_count > 0 {
147                                Some(
148                                    start_idx + consumed_count - trailing_blank_count
149                                        ..start_idx + consumed_count,
150                                )
151                            } else {
152                                None
153                            };
154
155                            PatternMatch::List {
156                                items,
157                                preceding_blank_range,
158                                trailing_blank_range,
159                            }
160                        }
161                        "session" => {
162                            // Allow session_no_blank in these cases:
163                            // 1. At document start (is_first_item=true), OR
164                            // 2. At container start when sessions are allowed (start_idx=0 && allow_sessions=true), OR
165                            // 3. After a BlankLineGroup when sessions are allowed (has_preceding_blank && allow_sessions)
166                            // 4. Immediately after another session (prev_was_session && allow_sessions)
167                            // 5. Immediately after a container that just closed (has_preceding_boundary && allow_sessions)
168                            // This prevents Sessions inside Definitions while allowing legitimate session sequences.
169                            if !allow_sessions {
170                                continue; // Definitions and other containers don't allow sessions
171                            }
172                            if !(is_first_item
173                                || start_idx == 0
174                                || has_preceding_blank
175                                || has_preceding_boundary
176                                || prev_was_session)
177                            {
178                                continue; // Sessions need a separator or another session before them
179                            }
180                            let blank_str = caps.name("blank")?.as_str();
181                            let blank_count = Self::count_consumed_tokens(blank_str);
182                            PatternMatch::Session {
183                                subject_idx: 0,
184                                content_idx: 1 + blank_count,
185                                preceding_blank_range: None,
186                            }
187                        }
188                        "definition" => PatternMatch::Definition {
189                            subject_idx: 0,
190                            content_idx: 1,
191                        },
192                        "paragraph" => PatternMatch::Paragraph {
193                            start_idx: 0,
194                            end_idx: consumed_count - 1,
195                        },
196                        "blank_line_group" => PatternMatch::BlankLineGroup,
197                        "document_start" => PatternMatch::DocumentStart,
198                        _ => continue,
199                    };
200
201                    return Some((pattern, start_idx..start_idx + consumed_count));
202                }
203            }
204        }
205
206        None
207    }
208
209    /// Convert remaining tokens to grammar notation string
210    fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
211        let mut result = String::new();
212        for token in tokens {
213            match token {
214                LineContainer::Token(t) => {
215                    result.push_str(&t.line_type.to_grammar_string());
216                }
217                LineContainer::Container { .. } => {
218                    result.push_str("<container>");
219                }
220            }
221        }
222        if result.is_empty() {
223            None
224        } else {
225            Some(result)
226        }
227    }
228
229    /// Count how many tokens are represented in a grammar string.
230    /// Each token type in angle brackets represents one token.
231    fn count_consumed_tokens(grammar_str: &str) -> usize {
232        grammar_str.matches('<').count()
233    }
234
    /// Match verbatim blocks using imperative logic.
    ///
    /// Verbatim blocks consist of:
    /// 1. A subject line
    /// 2. Content that is either:
    ///    a) In a Container (inflow mode - content indented relative to subject)
    ///    b) Flat lines (fullwidth mode - content at fixed column, or groups)
    /// 3. A closing annotation marker (:: ... ::)
    ///
    /// This matcher handles both the original inflow case (subject + container + annotation)
    /// and the fullwidth case (subject + flat lines + annotation). To distinguish verbatim
    /// blocks from sessions followed by annotations, we require that either:
    /// - There's a Container immediately after the subject, OR
    /// - The closing annotation is at the SAME indentation as the subject
    ///
    /// Sessions have their title at the root level and content is indented. If we see
    /// a root-level annotation after a root-level subject with indented content between,
    /// that's NOT a verbatim block - it's a session followed by an annotation.
    ///
    /// Returns the matched `PatternMatch::VerbatimBlock` together with the
    /// consumed token range `start_idx..closing_idx + 1`, or `None` when the
    /// token stream at `start_idx` is not a verbatim block.
    ///
    /// NOTE(review): indentation comparison described above is not performed
    /// here explicitly — the structural token shapes (Container vs flat token)
    /// appear to encode it; confirm against the tokenizer.
    fn match_verbatim_block(
        tokens: &[LineContainer],
        start_idx: usize,
    ) -> Option<(PatternMatch, Range<usize>)> {
        use LineType::{
            AnnotationStartLine, BlankLine, DocumentStart, SubjectLine, SubjectOrListItemLine,
        };

        let len = tokens.len();
        if start_idx >= len {
            return None;
        }

        // Allow blank lines and DocumentStart before the subject to be consumed as part of this match
        let mut idx = start_idx;
        while idx < len {
            if let LineContainer::Token(line) = &tokens[idx] {
                if line.line_type == BlankLine || line.line_type == DocumentStart {
                    idx += 1;
                    continue;
                }
            }
            // First non-blank, non-DocumentStart token (or a Container) ends the prefix scan.
            break;
        }

        if idx >= len {
            return None;
        }

        // Must start with a subject line
        let first_subject_idx = match &tokens[idx] {
            LineContainer::Token(line)
                if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
            {
                idx
            }
            _ => return None,
        };

        let mut cursor = first_subject_idx + 1;

        // Try to match one or more subject+content pairs followed by closing annotation
        // This loop handles verbatim groups: multiple subjects sharing one closing annotation
        loop {
            // Skip blank lines
            while cursor < len {
                if let LineContainer::Token(line) = &tokens[cursor] {
                    if line.line_type == BlankLine {
                        cursor += 1;
                        continue;
                    }
                }
                break;
            }

            // Ran off the end without seeing a closing annotation: not verbatim.
            if cursor >= len {
                return None;
            }

            // Check what we have at cursor
            match &tokens[cursor] {
                LineContainer::Container { .. } => {
                    // Found a container - this is potentially inflow mode verbatim content
                    // But we need to verify the pattern:
                    // - Verbatim: subject + container + (annotation OR another subject+container)
                    // - Session: subject + container + (other content)
                    cursor += 1;

                    // Skip blank lines after container
                    while cursor < len {
                        if let LineContainer::Token(line) = &tokens[cursor] {
                            if line.line_type == BlankLine {
                                cursor += 1;
                                continue;
                            }
                        }
                        break;
                    }

                    // After container, check what follows
                    if cursor >= len {
                        return None; // Container at end - not a verbatim block
                    }

                    match &tokens[cursor] {
                        LineContainer::Token(line) => {
                            if matches!(line.line_type, AnnotationStartLine) {
                                // Container followed by closing annotation (:: label ::) - this IS verbatim!
                                // Continue loop to match it
                                continue;
                            }
                            if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
                                // Container followed by another subject - this is a verbatim group!
                                // Continue loop to match more groups
                                continue;
                            }
                            // Container followed by something else - NOT a verbatim block
                            return None;
                        }
                        LineContainer::Container { .. } => {
                            // Container followed by another container - NOT verbatim pattern
                            return None;
                        }
                    }
                }
                LineContainer::Token(line) => {
                    if matches!(line.line_type, AnnotationStartLine) {
                        // Found closing annotation (:: label ::) - success!
                        // But only if we haven't mixed containers with flat content in a problematic way
                        return Some((
                            PatternMatch::VerbatimBlock {
                                subject_idx: first_subject_idx,
                                content_range: (first_subject_idx + 1)..cursor,
                                closing_idx: cursor,
                            },
                            // Consumed range includes the leading blanks and the
                            // closing annotation line itself.
                            start_idx..(cursor + 1),
                        ));
                    }

                    if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
                        // Another subject - this is another group
                        cursor += 1;
                        continue;
                    }

                    // Any other flat token (paragraph line, etc.)
                    // This is fullwidth mode or group content
                    cursor += 1;
                }
            }
        }
    }
385}
386
387/// Main recursive descent parser using the declarative grammar.
388///
389/// This is the entry point for parsing a sequence of tokens at any level.
390/// It iteratively tries to match patterns and recursively descends into containers.
391pub fn parse_with_declarative_grammar(
392    tokens: Vec<LineContainer>,
393    source: &str,
394) -> Result<Vec<ParseNode>, String> {
395    parse_with_declarative_grammar_internal(tokens, source, true, true)
396}
397
/// Internal parsing function with nesting level tracking.
///
/// Walks the token stream, repeatedly asking `GrammarMatcher::try_match` for
/// the next pattern. Matched patterns are converted to `ParseNode`s; blank-line
/// ranges captured inside List/Session patterns are emitted as sibling nodes
/// around the main node so no tokens are lost.
///
/// * `allow_sessions` - whether Session patterns may match at this level
///   (sessions propagate into session children, but are disabled inside
///   definitions and other containers).
/// * `is_doc_start` - true only for the outermost call at index 0, used to
///   permit a session at the very start of the document.
fn parse_with_declarative_grammar_internal(
    tokens: Vec<LineContainer>,
    source: &str,
    allow_sessions: bool,
    is_doc_start: bool,
) -> Result<Vec<ParseNode>, String> {
    let mut items: Vec<ParseNode> = Vec::new();
    let mut idx = 0;

    while idx < tokens.len() {
        // Derive matching context from the most recently emitted node: these
        // flags gate where Session patterns are permitted to match.
        let (has_preceding_blank, has_preceding_boundary, prev_was_session) =
            if let Some(last_node) = items.last() {
                (
                    matches!(last_node.node_type, NodeType::BlankLineGroup),
                    // A node with children indicates we just closed a container; this counts as a boundary.
                    // DocumentStart also counts as a boundary - it marks the start of document content.
                    !last_node.children.is_empty()
                        || matches!(last_node.node_type, NodeType::DocumentStart),
                    matches!(last_node.node_type, NodeType::Session),
                )
            } else {
                // No nodes emitted yet: no context flags apply.
                (false, false, false)
            };

        let is_first_item = idx == 0 && is_doc_start;
        if let Some((pattern, range)) = GrammarMatcher::try_match(
            &tokens,
            idx,
            allow_sessions,
            is_first_item,
            has_preceding_blank,
            has_preceding_boundary,
            prev_was_session,
        ) {
            // Nodes are staged here so that blank-line groups surrounding the
            // matched pattern are emitted in source order around the main node.
            let mut pending_nodes = Vec::new();

            if let PatternMatch::List {
                preceding_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            // NOTE(review): `try_match` in this file always constructs Session
            // with `preceding_blank_range: None`, so this branch looks
            // unreachable from here - confirm no other constructor sets it.
            if let PatternMatch::Session {
                preceding_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            // Convert pattern to ParseNode
            // Sessions parse their children with allow_sessions=true to allow nested sessions
            // Other elements parse with allow_sessions=false to prevent sessions inside them
            let is_session = matches!(&pattern, PatternMatch::Session { .. });
            let item = convert_pattern_to_node(
                &tokens,
                &pattern,
                range.clone(),
                source,
                // Recursion callback: child containers are parsed at the next
                // level down, never as document start.
                &move |children, src| {
                    parse_with_declarative_grammar_internal(children, src, is_session, false)
                },
            )?;
            pending_nodes.push(item);

            if let PatternMatch::List {
                trailing_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            items.extend(pending_nodes);
            // Resume scanning after everything the pattern consumed.
            idx = range.end;
        } else {
            // When no pattern matches, check if this is a Container (orphaned indented content).
            // Rather than silently dropping it, parse its children and promote them to this level.
            if let LineContainer::Container {
                children: inner, ..
            } = &tokens[idx]
            {
                if !inner.is_empty() {
                    let orphaned = parse_with_declarative_grammar_internal(
                        inner.clone(),
                        source,
                        allow_sessions,
                        false,
                    )?;
                    items.extend(orphaned);
                }
            }
            // Unmatched flat tokens are skipped one at a time.
            idx += 1;
        }
    }

    Ok(items)
}