// lex_core/lex/parsing/parser.rs
//! Declarative Grammar Engine - Regex & Imperative Parser for lex
//!
//! This module implements a unified parser using declarative regex grammar rules
//! with imperative fallbacks for patterns that need look-ahead:
//! 1. Converts token sequences to grammar notation strings
//! 2. Matches against regex patterns in declaration order
//! 3. Falls back to imperative matchers (verbatim blocks, paragraphs)
//! 4. Extracts consumed token indices from regex match
//! 5. Recursively descends into containers when building AST
//!
//! The grammar patterns and AST building logic have been extracted to separate modules:
//! - `grammar.rs` - Pattern definitions and matching order
//! - `builder.rs` - AST node construction from matched patterns

use crate::lex::parsing::ir::{NodeType, ParseNode};
use crate::lex::token::{LineContainer, LineType};
use regex::Regex;
use std::ops::Range;

mod builder;
mod grammar;

use builder::{
    blank_line_node_from_range, container_starts_with_pipe_row, convert_pattern_to_node,
    PatternMatch,
};
use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
/// Pattern matcher for declarative grammar using regex-based matching.
///
/// A stateless namespace type: all matching logic lives in associated
/// functions, so no instance is ever constructed.
pub struct GrammarMatcher;

impl GrammarMatcher {
33 /// Try to match a pattern at the current level using regex patterns.
34 ///
35 /// Converts the current token sequence to a grammar string, matches against
36 /// regex patterns in declaration order, and returns the matched pattern with
37 /// consumed token indices.
38 ///
39 /// Returns (matched_pattern, consumed_indices)
40 fn try_match(
41 tokens: &[LineContainer],
42 start_idx: usize,
43 allow_sessions: bool,
44 is_first_item: bool,
45 has_preceding_blank: bool,
46 has_preceding_boundary: bool,
47 prev_was_session: bool,
48 ) -> Option<(PatternMatch, Range<usize>)> {
49 if start_idx >= tokens.len() {
50 return None;
51 }
52
53 // Try verbatim block first (requires special imperative matching logic)
54 if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
55 return Some(result);
56 }
57
58 // Try table: subject + container whose first non-blank line is a pipe row.
59 // Must run before the definition pattern (which matches the same subject + container).
60 if let Some(result) = Self::match_table(tokens, start_idx) {
61 return Some(result);
62 }
63
64 // Convert remaining tokens to grammar string
65 let remaining_tokens = &tokens[start_idx..];
66 let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
67
68 // Try each pattern in order
69 for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
70 // Skip patterns handled imperatively above
71 if *pattern_name == "verbatim_block" {
72 continue;
73 }
74 if let Ok(regex) = Regex::new(pattern_regex_str) {
75 if let Some(caps) = regex.captures(&token_string) {
76 let full_match = caps.get(0)?;
77 let consumed_count = Self::count_consumed_tokens(full_match.as_str());
78
79 // Use captures to extract indices and build the pattern
80 let pattern = match *pattern_name {
81 "annotation_block" => PatternMatch::AnnotationBlock {
82 start_idx: 0,
83 content_idx: 1,
84 },
85 "annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
86 "list_no_blank" => {
87 // List without preceding blank line
88 let items_str = caps.name("items")?.as_str();
89 let mut items = Vec::new();
90 let mut token_idx = 0; // No blank line, so start at 0
91 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
92 let has_container = item_cap.as_str().contains("<container>");
93 items.push((
94 token_idx,
95 if has_container {
96 Some(token_idx + 1)
97 } else {
98 None
99 },
100 ));
101 token_idx += if has_container { 2 } else { 1 };
102 }
103
104 let trailing_blank_count = caps
105 .name("trailing_blank")
106 .map(|m| Self::count_consumed_tokens(m.as_str()))
107 .unwrap_or(0);
108 let trailing_blank_range = if trailing_blank_count > 0 {
109 Some(
110 start_idx + consumed_count - trailing_blank_count
111 ..start_idx + consumed_count,
112 )
113 } else {
114 None
115 };
116
117 PatternMatch::List {
118 items,
119 preceding_blank_range: None,
120 trailing_blank_range,
121 }
122 }
123 "list" => {
124 let blank_count = caps
125 .name("blank")
126 .map(|m| Self::count_consumed_tokens(m.as_str()))
127 .unwrap_or(0);
128 let items_str = caps.name("items")?.as_str();
129 let mut items = Vec::new();
130 let mut token_idx = blank_count;
131 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
132 let has_container = item_cap.as_str().contains("<container>");
133 items.push((
134 token_idx,
135 if has_container {
136 Some(token_idx + 1)
137 } else {
138 None
139 },
140 ));
141 token_idx += if has_container { 2 } else { 1 };
142 }
143 let trailing_blank_count = caps
144 .name("trailing_blank")
145 .map(|m| Self::count_consumed_tokens(m.as_str()))
146 .unwrap_or(0);
147 let preceding_blank_range = if blank_count > 0 {
148 Some(start_idx..start_idx + blank_count)
149 } else {
150 None
151 };
152 let trailing_blank_range = if trailing_blank_count > 0 {
153 Some(
154 start_idx + consumed_count - trailing_blank_count
155 ..start_idx + consumed_count,
156 )
157 } else {
158 None
159 };
160
161 PatternMatch::List {
162 items,
163 preceding_blank_range,
164 trailing_blank_range,
165 }
166 }
167 "session" => {
168 // Allow session_no_blank in these cases:
169 // 1. At document start (is_first_item=true), OR
170 // 2. At container start when sessions are allowed (start_idx=0 && allow_sessions=true), OR
171 // 3. After a BlankLineGroup when sessions are allowed (has_preceding_blank && allow_sessions)
172 // 4. Immediately after another session (prev_was_session && allow_sessions)
173 // 5. Immediately after a container that just closed (has_preceding_boundary && allow_sessions)
174 // This prevents Sessions inside Definitions while allowing legitimate session sequences.
175 if !allow_sessions {
176 continue; // Definitions and other containers don't allow sessions
177 }
178 if !(is_first_item
179 || start_idx == 0
180 || has_preceding_blank
181 || has_preceding_boundary
182 || prev_was_session)
183 {
184 continue; // Sessions need a separator or another session before them
185 }
186 let blank_str = caps.name("blank")?.as_str();
187 let blank_count = Self::count_consumed_tokens(blank_str);
188 PatternMatch::Session {
189 subject_idx: 0,
190 content_idx: 1 + blank_count,
191 preceding_blank_range: None,
192 }
193 }
194 "definition" => PatternMatch::Definition {
195 subject_idx: 0,
196 content_idx: 1,
197 },
198 "blank_line_group" => PatternMatch::BlankLineGroup,
199 "document_title_with_subtitle" => {
200 // No container lookahead needed: the subtitle variant
201 // consumed two lines (title + subtitle) before blank lines.
202 // A session only has one line before blank + container, so
203 // the presence of a container after the blank is NOT ambiguous
204 // here — it's the document body, not a session body.
205 // Match: DocumentStart(0) + title(1) + subtitle(2) + blank lines
206 PatternMatch::DocumentTitle {
207 title_idx: 1,
208 subtitle_idx: Some(2),
209 }
210 }
211 "document_title" => {
212 // Imperative negative lookahead: not followed by container
213 let next_idx = start_idx + consumed_count;
214 if next_idx < tokens.len()
215 && matches!(&tokens[next_idx], LineContainer::Container { .. })
216 {
217 // Followed by container — this is a session, not a title
218 continue;
219 }
220 // Match is: DocumentStart(0) + title line(1) + blank lines
221 PatternMatch::DocumentTitle {
222 title_idx: 1,
223 subtitle_idx: None,
224 }
225 }
226 "document_start" => PatternMatch::DocumentStart,
227 _ => continue,
228 };
229
230 return Some((pattern, start_idx..start_idx + consumed_count));
231 }
232 }
233 }
234
235 // Paragraph: matched imperatively after all regex patterns fail.
236 // Stops before element boundaries (list starts, definition starts).
237 Self::match_paragraph(tokens, start_idx)
238 }
239
240 /// Convert remaining tokens to grammar notation string
241 fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
242 let mut result = String::new();
243 for token in tokens {
244 match token {
245 LineContainer::Token(t) => {
246 result.push_str(&t.line_type.to_grammar_string());
247 }
248 LineContainer::Container { .. } => {
249 result.push_str("<container>");
250 }
251 }
252 }
253 if result.is_empty() {
254 None
255 } else {
256 Some(result)
257 }
258 }
259
260 /// Count how many tokens are represented in a grammar string.
261 /// Each token type in angle brackets represents one token.
262 fn count_consumed_tokens(grammar_str: &str) -> usize {
263 grammar_str.matches('<').count()
264 }
265
266 /// Match paragraphs using imperative logic.
267 ///
268 /// Consumes content lines (paragraph, dialog, subject, list) one at a time,
269 /// stopping before sequences that form other block elements:
270 /// - Before 2+ consecutive list-like lines (list start)
271 /// - Before a subject line followed by a container (definition start)
272 fn match_paragraph(
273 tokens: &[LineContainer],
274 start_idx: usize,
275 ) -> Option<(PatternMatch, Range<usize>)> {
276 use LineType::*;
277
278 let len = tokens.len();
279 let mut idx = start_idx;
280
281 while idx < len {
282 match &tokens[idx] {
283 LineContainer::Token(t) => match t.line_type {
284 ParagraphLine | DialogLine => {
285 idx += 1;
286 }
287 SubjectLine => {
288 // Stop if followed by container (definition start)
289 if Self::next_is_container(tokens, idx) {
290 break;
291 }
292 idx += 1;
293 }
294 SubjectOrListItemLine => {
295 // Stop if followed by container (definition start)
296 if Self::next_is_container(tokens, idx) {
297 break;
298 }
299 // Stop if followed by another list-like line (list start)
300 if Self::next_is_list_like(tokens, idx) {
301 break;
302 }
303 idx += 1;
304 }
305 ListLine => {
306 // Stop if followed by another list-like line, possibly
307 // with a container in between (list start)
308 if Self::next_is_list_continuation(tokens, idx) {
309 break;
310 }
311 idx += 1;
312 }
313 _ => break, // Blank line, annotation, document-start, etc.
314 },
315 LineContainer::Container { .. } => break,
316 }
317 }
318
319 if idx > start_idx {
320 Some((
321 PatternMatch::Paragraph {
322 start_idx: 0,
323 end_idx: idx - start_idx - 1,
324 },
325 start_idx..idx,
326 ))
327 } else {
328 None
329 }
330 }
331
332 /// Check if the token after `idx` is a Container.
333 fn next_is_container(tokens: &[LineContainer], idx: usize) -> bool {
334 let next = idx + 1;
335 next < tokens.len() && matches!(&tokens[next], LineContainer::Container { .. })
336 }
337
338 /// Check if the token after `idx` is a list-like line (ListLine or SubjectOrListItemLine).
339 fn next_is_list_like(tokens: &[LineContainer], idx: usize) -> bool {
340 let next = idx + 1;
341 if next >= tokens.len() {
342 return false;
343 }
344 matches!(
345 &tokens[next],
346 LineContainer::Token(t) if matches!(t.line_type, LineType::ListLine | LineType::SubjectOrListItemLine)
347 )
348 }
349
350 /// Check if the token after `idx` starts a list continuation:
351 /// either directly another list-like line, or a container followed by a list-like line.
352 fn next_is_list_continuation(tokens: &[LineContainer], idx: usize) -> bool {
353 let next = idx + 1;
354 if next >= tokens.len() {
355 return false;
356 }
357 match &tokens[next] {
358 LineContainer::Token(t) => {
359 matches!(
360 t.line_type,
361 LineType::ListLine | LineType::SubjectOrListItemLine
362 )
363 }
364 LineContainer::Container { .. } => {
365 // Container after list item — check if another list item follows
366 let after = next + 1;
367 after < tokens.len()
368 && matches!(
369 &tokens[after],
370 LineContainer::Token(t) if matches!(t.line_type, LineType::ListLine | LineType::SubjectOrListItemLine)
371 )
372 }
373 }
374 }
375
376 /// Match tables using imperative logic.
377 ///
378 /// A table is a subject line followed immediately by a container whose first
379 /// non-blank line starts with a pipe character. This runs before the definition
380 /// pattern (which matches the same `subject + container` shape) to ensure
381 /// tables are detected by their content.
382 fn match_table(
383 tokens: &[LineContainer],
384 start_idx: usize,
385 ) -> Option<(PatternMatch, Range<usize>)> {
386 use LineType::{SubjectLine, SubjectOrListItemLine};
387
388 if start_idx >= tokens.len() {
389 return None;
390 }
391
392 // Must start with a subject line
393 let is_subject = matches!(
394 &tokens[start_idx],
395 LineContainer::Token(line) if matches!(line.line_type, SubjectLine | SubjectOrListItemLine)
396 );
397 if !is_subject {
398 return None;
399 }
400
401 // Must be immediately followed by a container
402 let content_idx = start_idx + 1;
403 if content_idx >= tokens.len() {
404 return None;
405 }
406 let container = &tokens[content_idx];
407 if !matches!(container, LineContainer::Container { .. }) {
408 return None;
409 }
410
411 // Container's first non-blank line must start with a pipe
412 if !container_starts_with_pipe_row(container) {
413 return None;
414 }
415
416 Some((
417 PatternMatch::Table {
418 subject_idx: 0,
419 content_idx: 1,
420 },
421 start_idx..content_idx + 1,
422 ))
423 }
424
    /// Match verbatim blocks using imperative logic.
    ///
    /// Verbatim blocks consist of:
    /// 1. A subject line
    /// 2. Content that is either:
    ///    a) In a Container (inflow mode - content indented relative to subject)
    ///    b) Flat lines (fullwidth mode - content at fixed column, or groups)
    /// 3. A closing annotation marker (:: ... ::)
    ///
    /// This matcher handles both the original inflow case (subject + container + annotation)
    /// and the fullwidth case (subject + flat lines + annotation). To distinguish verbatim
    /// blocks from sessions followed by annotations, we require that either:
    /// - There's a Container immediately after the subject, OR
    /// - The closing annotation is at the SAME indentation as the subject
    ///
    /// Sessions have their title at the root level and content is indented. If we see
    /// a root-level annotation after a root-level subject with indented content between,
    /// that's NOT a verbatim block - it's a session followed by an annotation.
    ///
    /// Returns `Some((VerbatimBlock, consumed_range))` covering everything from
    /// `start_idx` through the closing `DataMarkerLine` inclusive, or `None`.
    fn match_verbatim_block(
        tokens: &[LineContainer],
        start_idx: usize,
    ) -> Option<(PatternMatch, Range<usize>)> {
        use LineType::{
            BlankLine, DataMarkerLine, DocumentStart, SubjectLine, SubjectOrListItemLine,
        };

        let len = tokens.len();
        if start_idx >= len {
            return None;
        }

        // Allow blank lines and DocumentStart before the subject to be consumed as part of this match
        let mut idx = start_idx;
        while idx < len {
            if let LineContainer::Token(line) = &tokens[idx] {
                if line.line_type == BlankLine || line.line_type == DocumentStart {
                    idx += 1;
                    continue;
                }
            }
            break;
        }

        if idx >= len {
            return None;
        }

        // Must start with a subject line
        let first_subject_idx = match &tokens[idx] {
            LineContainer::Token(line)
                if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
            {
                idx
            }
            _ => return None,
        };

        // Cursor scans forward from the line after the subject.
        let mut cursor = first_subject_idx + 1;

        // Try to match one or more subject+content pairs followed by closing annotation
        // This loop handles verbatim groups: multiple subjects sharing one closing annotation
        loop {
            // Skip blank lines
            while cursor < len {
                if let LineContainer::Token(line) = &tokens[cursor] {
                    if line.line_type == BlankLine {
                        cursor += 1;
                        continue;
                    }
                }
                break;
            }

            // Ran out of tokens without a closing marker: no verbatim block.
            if cursor >= len {
                return None;
            }

            // Check what we have at cursor
            match &tokens[cursor] {
                LineContainer::Container { .. } => {
                    // Found a container - this is potentially inflow mode verbatim content
                    // But we need to verify the pattern:
                    // - Verbatim: subject + container + (annotation OR another subject+container)
                    // - Session: subject + container + (other content)
                    cursor += 1;

                    // Skip blank lines after container
                    while cursor < len {
                        if let LineContainer::Token(line) = &tokens[cursor] {
                            if line.line_type == BlankLine {
                                cursor += 1;
                                continue;
                            }
                        }
                        break;
                    }

                    // After container, check what follows
                    if cursor >= len {
                        return None; // Container at end - not a verbatim block
                    }

                    match &tokens[cursor] {
                        LineContainer::Token(line) => {
                            if matches!(line.line_type, DataMarkerLine) {
                                // Container followed by closing annotation (:: label ::) - this IS verbatim!
                                // Continue loop to match it
                                continue;
                            }
                            if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
                                // Container followed by another subject - this is a verbatim group!
                                // Continue loop to match more groups
                                continue;
                            }
                            // Container followed by something else - NOT a verbatim block
                            return None;
                        }
                        LineContainer::Container { .. } => {
                            // Container followed by another container - NOT verbatim pattern
                            return None;
                        }
                    }
                }
                LineContainer::Token(line) => {
                    if matches!(line.line_type, DataMarkerLine) {
                        // Found closing annotation (:: label ::) - success!
                        // But only if we haven't mixed containers with flat content in a problematic way
                        return Some((
                            PatternMatch::VerbatimBlock {
                                subject_idx: first_subject_idx,
                                content_range: (first_subject_idx + 1)..cursor,
                                closing_idx: cursor,
                            },
                            start_idx..(cursor + 1),
                        ));
                    }

                    if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
                        // Another subject - this is another group
                        cursor += 1;
                        continue;
                    }

                    // Any other flat token (paragraph line, etc.)
                    // This is fullwidth mode or group content
                    cursor += 1;
                }
            }
        }
    }
}

/// Main recursive descent parser using the declarative grammar.
///
/// This is the entry point for parsing a sequence of tokens at any level.
/// It iteratively tries to match patterns and recursively descends into containers.
///
/// The top-level call allows sessions (`allow_sessions = true`) and treats
/// the sequence as the start of the document (`is_doc_start = true`).
///
/// Returns the parsed nodes, or an error string produced while building nodes.
pub fn parse_with_declarative_grammar(
    tokens: Vec<LineContainer>,
    source: &str,
) -> Result<Vec<ParseNode>, String> {
    parse_with_declarative_grammar_internal(tokens, source, true, true)
}

/// Internal parsing function with nesting level tracking.
///
/// * `allow_sessions` - whether `Session` patterns may match at this level;
///   sessions re-enable it for their children, other containers disable it.
/// * `is_doc_start` - true only for the outermost call, so the first match at
///   index 0 may be treated as the document start.
fn parse_with_declarative_grammar_internal(
    tokens: Vec<LineContainer>,
    source: &str,
    allow_sessions: bool,
    is_doc_start: bool,
) -> Result<Vec<ParseNode>, String> {
    let mut items: Vec<ParseNode> = Vec::new();
    let mut idx = 0;

    while idx < tokens.len() {
        // Derive context flags from the most recently emitted node; these feed
        // the session-placement rules inside `GrammarMatcher::try_match`.
        let (has_preceding_blank, has_preceding_boundary, prev_was_session) =
            if let Some(last_node) = items.last() {
                (
                    matches!(last_node.node_type, NodeType::BlankLineGroup),
                    // A node with children indicates we just closed a container; this counts as a boundary.
                    // DocumentStart and DocumentTitle also count as boundaries.
                    !last_node.children.is_empty()
                        || matches!(
                            last_node.node_type,
                            NodeType::DocumentStart | NodeType::DocumentTitle
                        ),
                    matches!(last_node.node_type, NodeType::Session),
                )
            } else {
                (false, false, false)
            };

        let is_first_item = idx == 0 && is_doc_start;
        if let Some((pattern, range)) = GrammarMatcher::try_match(
            &tokens,
            idx,
            allow_sessions,
            is_first_item,
            has_preceding_blank,
            has_preceding_boundary,
            prev_was_session,
        ) {
            let mut pending_nodes = Vec::new();

            // Blank lines a list consumed before its items become their own
            // node, emitted ahead of the list so no source lines are dropped.
            if let PatternMatch::List {
                preceding_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            // Same for blank lines preceding a session.
            if let PatternMatch::Session {
                preceding_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            // Convert pattern to ParseNode
            // Sessions parse their children with allow_sessions=true to allow nested sessions
            // Other elements parse with allow_sessions=false to prevent sessions inside them
            let is_session = matches!(&pattern, PatternMatch::Session { .. });
            let item = convert_pattern_to_node(
                &tokens,
                &pattern,
                range.clone(),
                source,
                &move |children, src| {
                    parse_with_declarative_grammar_internal(children, src, is_session, false)
                },
            )?;
            pending_nodes.push(item);

            // Trailing blank lines a list consumed are re-emitted after it.
            if let PatternMatch::List {
                trailing_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            items.extend(pending_nodes);
            idx = range.end;
        } else {
            // When no pattern matches, check if this is a Container (orphaned indented content).
            // Rather than silently dropping it, parse its children and promote them to this level.
            if let LineContainer::Container {
                children: inner, ..
            } = &tokens[idx]
            {
                if !inner.is_empty() {
                    let orphaned = parse_with_declarative_grammar_internal(
                        inner.clone(),
                        source,
                        allow_sessions,
                        false,
                    )?;
                    items.extend(orphaned);
                }
            }
            idx += 1;
        }
    }

    Ok(items)
}