// lex_core/lex/parsing/parser.rs
//! Declarative Grammar Engine - Regex-Based Parser for lex
//!
//! This module implements a unified parser using declarative regex grammar rules:
//! 1. Converts token sequences to grammar notation strings
//! 2. Matches against regex patterns in declaration order
//! 3. Extracts consumed token indices from the regex match
//! 4. Recursively descends into containers when building the AST
//! 5. No imperative pattern matching - grammar is data, not code
//!
//! The grammar patterns and AST building logic have been extracted to separate modules:
//! - `grammar.rs` - Pattern definitions and matching order
//! - `builder.rs` - AST node construction from matched patterns

14use crate::lex::parsing::ir::{NodeType, ParseNode};
15use crate::lex::token::{LineContainer, LineType};
16use regex::Regex;
17use std::ops::Range;
18
19mod builder;
20mod grammar;
21
22use builder::{blank_line_node_from_range, convert_pattern_to_node, PatternMatch};
23use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
24
/// Pattern matcher for the declarative grammar using regex-based matching.
///
/// A stateless namespace type: all matching entry points are associated
/// functions, so no instance is ever constructed.
pub struct GrammarMatcher;
27
28impl GrammarMatcher {
29 /// Try to match a pattern at the current level using regex patterns.
30 ///
31 /// Converts the current token sequence to a grammar string, matches against
32 /// regex patterns in declaration order, and returns the matched pattern with
33 /// consumed token indices.
34 ///
35 /// Returns (matched_pattern, consumed_indices)
36 fn try_match(
37 tokens: &[LineContainer],
38 start_idx: usize,
39 allow_sessions: bool,
40 is_first_item: bool,
41 has_preceding_blank: bool,
42 has_preceding_boundary: bool,
43 prev_was_session: bool,
44 ) -> Option<(PatternMatch, Range<usize>)> {
45 if start_idx >= tokens.len() {
46 return None;
47 }
48
49 // Try verbatim block first (requires special imperative matching logic)
50 if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
51 return Some(result);
52 }
53
54 // Convert remaining tokens to grammar string
55 let remaining_tokens = &tokens[start_idx..];
56 let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
57
58 // Try each pattern in order
59 for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
60 // Skip patterns handled imperatively above
61 if *pattern_name == "verbatim_block" {
62 continue;
63 }
64 if let Ok(regex) = Regex::new(pattern_regex_str) {
65 if let Some(caps) = regex.captures(&token_string) {
66 let full_match = caps.get(0)?;
67 let consumed_count = Self::count_consumed_tokens(full_match.as_str());
68
69 // Use captures to extract indices and build the pattern
70 let pattern = match *pattern_name {
71 "annotation_block_with_end" => PatternMatch::AnnotationBlock {
72 start_idx: 0,
73 content_idx: 1,
74 },
75 "annotation_block" => PatternMatch::AnnotationBlock {
76 start_idx: 0,
77 content_idx: 1,
78 },
79 "annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
80 "list_no_blank" => {
81 // List without preceding blank line
82 let items_str = caps.name("items")?.as_str();
83 let mut items = Vec::new();
84 let mut token_idx = 0; // No blank line, so start at 0
85 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
86 let has_container = item_cap.as_str().contains("<container>");
87 items.push((
88 token_idx,
89 if has_container {
90 Some(token_idx + 1)
91 } else {
92 None
93 },
94 ));
95 token_idx += if has_container { 2 } else { 1 };
96 }
97
98 let trailing_blank_count = caps
99 .name("trailing_blank")
100 .map(|m| Self::count_consumed_tokens(m.as_str()))
101 .unwrap_or(0);
102 let trailing_blank_range = if trailing_blank_count > 0 {
103 Some(
104 start_idx + consumed_count - trailing_blank_count
105 ..start_idx + consumed_count,
106 )
107 } else {
108 None
109 };
110
111 PatternMatch::List {
112 items,
113 preceding_blank_range: None,
114 trailing_blank_range,
115 }
116 }
117 "list" => {
118 let blank_count = caps
119 .name("blank")
120 .map(|m| Self::count_consumed_tokens(m.as_str()))
121 .unwrap_or(0);
122 let items_str = caps.name("items")?.as_str();
123 let mut items = Vec::new();
124 let mut token_idx = blank_count;
125 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
126 let has_container = item_cap.as_str().contains("<container>");
127 items.push((
128 token_idx,
129 if has_container {
130 Some(token_idx + 1)
131 } else {
132 None
133 },
134 ));
135 token_idx += if has_container { 2 } else { 1 };
136 }
137 let trailing_blank_count = caps
138 .name("trailing_blank")
139 .map(|m| Self::count_consumed_tokens(m.as_str()))
140 .unwrap_or(0);
141 let preceding_blank_range = if blank_count > 0 {
142 Some(start_idx..start_idx + blank_count)
143 } else {
144 None
145 };
146 let trailing_blank_range = if trailing_blank_count > 0 {
147 Some(
148 start_idx + consumed_count - trailing_blank_count
149 ..start_idx + consumed_count,
150 )
151 } else {
152 None
153 };
154
155 PatternMatch::List {
156 items,
157 preceding_blank_range,
158 trailing_blank_range,
159 }
160 }
161 "session" => {
162 // Allow session_no_blank in these cases:
163 // 1. At document start (is_first_item=true), OR
164 // 2. At container start when sessions are allowed (start_idx=0 && allow_sessions=true), OR
165 // 3. After a BlankLineGroup when sessions are allowed (has_preceding_blank && allow_sessions)
166 // 4. Immediately after another session (prev_was_session && allow_sessions)
167 // 5. Immediately after a container that just closed (has_preceding_boundary && allow_sessions)
168 // This prevents Sessions inside Definitions while allowing legitimate session sequences.
169 if !allow_sessions {
170 continue; // Definitions and other containers don't allow sessions
171 }
172 if !(is_first_item
173 || start_idx == 0
174 || has_preceding_blank
175 || has_preceding_boundary
176 || prev_was_session)
177 {
178 continue; // Sessions need a separator or another session before them
179 }
180 let blank_str = caps.name("blank")?.as_str();
181 let blank_count = Self::count_consumed_tokens(blank_str);
182 PatternMatch::Session {
183 subject_idx: 0,
184 content_idx: 1 + blank_count,
185 preceding_blank_range: None,
186 }
187 }
188 "definition" => PatternMatch::Definition {
189 subject_idx: 0,
190 content_idx: 1,
191 },
192 "paragraph" => PatternMatch::Paragraph {
193 start_idx: 0,
194 end_idx: consumed_count - 1,
195 },
196 "blank_line_group" => PatternMatch::BlankLineGroup,
197 "document_start" => PatternMatch::DocumentStart,
198 _ => continue,
199 };
200
201 return Some((pattern, start_idx..start_idx + consumed_count));
202 }
203 }
204 }
205
206 None
207 }
208
209 /// Convert remaining tokens to grammar notation string
210 fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
211 let mut result = String::new();
212 for token in tokens {
213 match token {
214 LineContainer::Token(t) => {
215 result.push_str(&t.line_type.to_grammar_string());
216 }
217 LineContainer::Container { .. } => {
218 result.push_str("<container>");
219 }
220 }
221 }
222 if result.is_empty() {
223 None
224 } else {
225 Some(result)
226 }
227 }
228
229 /// Count how many tokens are represented in a grammar string.
230 /// Each token type in angle brackets represents one token.
231 fn count_consumed_tokens(grammar_str: &str) -> usize {
232 grammar_str.matches('<').count()
233 }
234
235 /// Match verbatim blocks using imperative logic.
236 ///
237 /// Verbatim blocks consist of:
238 /// 1. A subject line
239 /// 2. Content that is either:
240 /// a) In a Container (inflow mode - content indented relative to subject)
241 /// b) Flat lines (fullwidth mode - content at fixed column, or groups)
242 /// 3. A closing annotation marker (:: ... ::)
243 ///
244 /// This matcher handles both the original inflow case (subject + container + annotation)
245 /// and the fullwidth case (subject + flat lines + annotation). To distinguish verbatim
246 /// blocks from sessions followed by annotations, we require that either:
247 /// - There's a Container immediately after the subject, OR
248 /// - The closing annotation is at the SAME indentation as the subject
249 ///
250 /// Sessions have their title at the root level and content is indented. If we see
251 /// a root-level annotation after a root-level subject with indented content between,
252 /// that's NOT a verbatim block - it's a session followed by an annotation.
253 fn match_verbatim_block(
254 tokens: &[LineContainer],
255 start_idx: usize,
256 ) -> Option<(PatternMatch, Range<usize>)> {
257 use LineType::{
258 AnnotationStartLine, BlankLine, DataLine, DocumentStart, SubjectLine,
259 SubjectOrListItemLine,
260 };
261
262 let len = tokens.len();
263 if start_idx >= len {
264 return None;
265 }
266
267 // Allow blank lines and DocumentStart before the subject to be consumed as part of this match
268 let mut idx = start_idx;
269 while idx < len {
270 if let LineContainer::Token(line) = &tokens[idx] {
271 if line.line_type == BlankLine || line.line_type == DocumentStart {
272 idx += 1;
273 continue;
274 }
275 }
276 break;
277 }
278
279 if idx >= len {
280 return None;
281 }
282
283 // Must start with a subject line
284 let first_subject_idx = match &tokens[idx] {
285 LineContainer::Token(line)
286 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
287 {
288 idx
289 }
290 _ => return None,
291 };
292
293 let mut cursor = first_subject_idx + 1;
294
295 // Try to match one or more subject+content pairs followed by closing annotation
296 // This loop handles verbatim groups: multiple subjects sharing one closing annotation
297 loop {
298 // Skip blank lines
299 while cursor < len {
300 if let LineContainer::Token(line) = &tokens[cursor] {
301 if line.line_type == BlankLine {
302 cursor += 1;
303 continue;
304 }
305 }
306 break;
307 }
308
309 if cursor >= len {
310 return None;
311 }
312
313 // Check what we have at cursor
314 match &tokens[cursor] {
315 LineContainer::Container { .. } => {
316 // Found a container - this is potentially inflow mode verbatim content
317 // But we need to verify the pattern:
318 // - Verbatim: subject + container + (annotation OR another subject+container)
319 // - Session: subject + container + (other content)
320 cursor += 1;
321
322 // Skip blank lines after container
323 while cursor < len {
324 if let LineContainer::Token(line) = &tokens[cursor] {
325 if line.line_type == BlankLine {
326 cursor += 1;
327 continue;
328 }
329 }
330 break;
331 }
332
333 // After container, check what follows
334 if cursor >= len {
335 return None; // Container at end - not a verbatim block
336 }
337
338 match &tokens[cursor] {
339 LineContainer::Token(line) => {
340 if matches!(line.line_type, DataLine | AnnotationStartLine) {
341 // Container followed by annotation - this IS verbatim!
342 // Continue loop to match it
343 continue;
344 }
345 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
346 // Container followed by another subject - this is a verbatim group!
347 // Continue loop to match more groups
348 continue;
349 }
350 // Container followed by something else - NOT a verbatim block
351 return None;
352 }
353 LineContainer::Container { .. } => {
354 // Container followed by another container - NOT verbatim pattern
355 return None;
356 }
357 }
358 }
359 LineContainer::Token(line) => {
360 if matches!(line.line_type, DataLine | AnnotationStartLine) {
361 // Found closing annotation - success!
362 // But only if we haven't mixed containers with flat content in a problematic way
363 return Some((
364 PatternMatch::VerbatimBlock {
365 subject_idx: first_subject_idx,
366 content_range: (first_subject_idx + 1)..cursor,
367 closing_idx: cursor,
368 },
369 start_idx..(cursor + 1),
370 ));
371 }
372
373 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
374 // Another subject - this is another group
375 cursor += 1;
376 continue;
377 }
378
379 // Any other flat token (paragraph line, etc.)
380 // This is fullwidth mode or group content
381 cursor += 1;
382 }
383 }
384 }
385 }
386}
387
/// Main recursive descent parser using the declarative grammar.
///
/// This is the entry point for parsing a sequence of tokens at any level.
/// It iteratively tries to match patterns and recursively descends into containers.
///
/// `tokens` is the token sequence to parse and `source` the original document
/// text used during node construction. The top level allows sessions and is
/// treated as the start of the document (both flags `true`).
pub fn parse_with_declarative_grammar(
    tokens: Vec<LineContainer>,
    source: &str,
) -> Result<Vec<ParseNode>, String> {
    parse_with_declarative_grammar_internal(tokens, source, true, true)
}
398
399/// Internal parsing function with nesting level tracking
400fn parse_with_declarative_grammar_internal(
401 tokens: Vec<LineContainer>,
402 source: &str,
403 allow_sessions: bool,
404 is_doc_start: bool,
405) -> Result<Vec<ParseNode>, String> {
406 let mut items: Vec<ParseNode> = Vec::new();
407 let mut idx = 0;
408
409 while idx < tokens.len() {
410 let (has_preceding_blank, has_preceding_boundary, prev_was_session) =
411 if let Some(last_node) = items.last() {
412 (
413 matches!(last_node.node_type, NodeType::BlankLineGroup),
414 // A node with children indicates we just closed a container; this counts as a boundary.
415 // DocumentStart also counts as a boundary - it marks the start of document content.
416 !last_node.children.is_empty()
417 || matches!(last_node.node_type, NodeType::DocumentStart),
418 matches!(last_node.node_type, NodeType::Session),
419 )
420 } else {
421 (false, false, false)
422 };
423
424 let is_first_item = idx == 0 && is_doc_start;
425 if let Some((pattern, range)) = GrammarMatcher::try_match(
426 &tokens,
427 idx,
428 allow_sessions,
429 is_first_item,
430 has_preceding_blank,
431 has_preceding_boundary,
432 prev_was_session,
433 ) {
434 let mut pending_nodes = Vec::new();
435
436 if let PatternMatch::List {
437 preceding_blank_range: Some(blank_range),
438 ..
439 } = &pattern
440 {
441 pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
442 }
443
444 if let PatternMatch::Session {
445 preceding_blank_range: Some(blank_range),
446 ..
447 } = &pattern
448 {
449 pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
450 }
451
452 // Convert pattern to ParseNode
453 // Sessions parse their children with allow_sessions=true to allow nested sessions
454 // Other elements parse with allow_sessions=false to prevent sessions inside them
455 let is_session = matches!(&pattern, PatternMatch::Session { .. });
456 let item = convert_pattern_to_node(
457 &tokens,
458 &pattern,
459 range.clone(),
460 source,
461 &move |children, src| {
462 parse_with_declarative_grammar_internal(children, src, is_session, false)
463 },
464 )?;
465 pending_nodes.push(item);
466
467 if let PatternMatch::List {
468 trailing_blank_range: Some(blank_range),
469 ..
470 } = &pattern
471 {
472 pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
473 }
474
475 items.extend(pending_nodes);
476 idx = range.end;
477 } else {
478 idx += 1;
479 }
480 }
481
482 Ok(items)
483}