lex_core/lex/parsing/parser.rs
1//! Declarative Grammar Engine - Regex-Based Parser for lex
2//!
3//! This module implements a unified parser using declarative regex grammar rules:
4//! 1. Converts token sequences to grammar notation strings
5//! 2. Matches against regex patterns in declaration order
6//! 3. Extracts consumed token indices from regex match
7//! 4. Recursively descends into containers when building AST
8//! 5. No imperative pattern matching - grammar is data, not code
9//!
10//! The grammar patterns and AST building logic have been extracted to separate modules:
11//! - `grammar.rs` - Pattern definitions and matching order
12//! - `builder.rs` - AST node construction from matched patterns
13
use crate::lex::parsing::ir::{NodeType, ParseNode};
use crate::lex::token::{LineContainer, LineType};
use regex::Regex;
use std::ops::Range;
use std::sync::LazyLock;
18
19mod builder;
20mod grammar;
21
22use builder::{blank_line_node_from_range, convert_pattern_to_node, PatternMatch};
23use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
24
/// Pattern matcher for declarative grammar using regex-based matching.
///
/// Stateless namespace type: all behavior lives in associated functions on
/// the `impl` block; no instances are ever constructed.
pub struct GrammarMatcher;
27
28impl GrammarMatcher {
29 /// Try to match a pattern at the current level using regex patterns.
30 ///
31 /// Converts the current token sequence to a grammar string, matches against
32 /// regex patterns in declaration order, and returns the matched pattern with
33 /// consumed token indices.
34 ///
35 /// Returns (matched_pattern, consumed_indices)
36 fn try_match(
37 tokens: &[LineContainer],
38 start_idx: usize,
39 allow_sessions: bool,
40 is_first_item: bool,
41 has_preceding_blank: bool,
42 has_preceding_boundary: bool,
43 prev_was_session: bool,
44 ) -> Option<(PatternMatch, Range<usize>)> {
45 if start_idx >= tokens.len() {
46 return None;
47 }
48
49 // Try verbatim block first (requires special imperative matching logic)
50 if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
51 return Some(result);
52 }
53
54 // Convert remaining tokens to grammar string
55 let remaining_tokens = &tokens[start_idx..];
56 let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
57
58 // Try each pattern in order
59 for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
60 // Skip patterns handled imperatively above
61 if *pattern_name == "verbatim_block" {
62 continue;
63 }
64 if let Ok(regex) = Regex::new(pattern_regex_str) {
65 if let Some(caps) = regex.captures(&token_string) {
66 let full_match = caps.get(0)?;
67 let consumed_count = Self::count_consumed_tokens(full_match.as_str());
68
69 // Use captures to extract indices and build the pattern
70 let pattern = match *pattern_name {
71 "annotation_block_with_end" => PatternMatch::AnnotationBlock {
72 start_idx: 0,
73 content_idx: 1,
74 },
75 "annotation_block" => PatternMatch::AnnotationBlock {
76 start_idx: 0,
77 content_idx: 1,
78 },
79 "annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
80 "list_no_blank" => {
81 // List without preceding blank line
82 let items_str = caps.name("items")?.as_str();
83 let mut items = Vec::new();
84 let mut token_idx = 0; // No blank line, so start at 0
85 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
86 let has_container = item_cap.as_str().contains("<container>");
87 items.push((
88 token_idx,
89 if has_container {
90 Some(token_idx + 1)
91 } else {
92 None
93 },
94 ));
95 token_idx += if has_container { 2 } else { 1 };
96 }
97
98 let trailing_blank_count = caps
99 .name("trailing_blank")
100 .map(|m| Self::count_consumed_tokens(m.as_str()))
101 .unwrap_or(0);
102 let trailing_blank_range = if trailing_blank_count > 0 {
103 Some(
104 start_idx + consumed_count - trailing_blank_count
105 ..start_idx + consumed_count,
106 )
107 } else {
108 None
109 };
110
111 PatternMatch::List {
112 items,
113 preceding_blank_range: None,
114 trailing_blank_range,
115 }
116 }
117 "list" => {
118 let blank_count = caps
119 .name("blank")
120 .map(|m| Self::count_consumed_tokens(m.as_str()))
121 .unwrap_or(0);
122 let items_str = caps.name("items")?.as_str();
123 let mut items = Vec::new();
124 let mut token_idx = blank_count;
125 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
126 let has_container = item_cap.as_str().contains("<container>");
127 items.push((
128 token_idx,
129 if has_container {
130 Some(token_idx + 1)
131 } else {
132 None
133 },
134 ));
135 token_idx += if has_container { 2 } else { 1 };
136 }
137 let trailing_blank_count = caps
138 .name("trailing_blank")
139 .map(|m| Self::count_consumed_tokens(m.as_str()))
140 .unwrap_or(0);
141 let preceding_blank_range = if blank_count > 0 {
142 Some(start_idx..start_idx + blank_count)
143 } else {
144 None
145 };
146 let trailing_blank_range = if trailing_blank_count > 0 {
147 Some(
148 start_idx + consumed_count - trailing_blank_count
149 ..start_idx + consumed_count,
150 )
151 } else {
152 None
153 };
154
155 PatternMatch::List {
156 items,
157 preceding_blank_range,
158 trailing_blank_range,
159 }
160 }
161 "session" => {
162 // Allow session_no_blank in these cases:
163 // 1. At document start (is_first_item=true), OR
164 // 2. At container start when sessions are allowed (start_idx=0 && allow_sessions=true), OR
165 // 3. After a BlankLineGroup when sessions are allowed (has_preceding_blank && allow_sessions)
166 // 4. Immediately after another session (prev_was_session && allow_sessions)
167 // 5. Immediately after a container that just closed (has_preceding_boundary && allow_sessions)
168 // This prevents Sessions inside Definitions while allowing legitimate session sequences.
169 if !allow_sessions {
170 continue; // Definitions and other containers don't allow sessions
171 }
172 if !(is_first_item
173 || start_idx == 0
174 || has_preceding_blank
175 || has_preceding_boundary
176 || prev_was_session)
177 {
178 continue; // Sessions need a separator or another session before them
179 }
180 let blank_str = caps.name("blank")?.as_str();
181 let blank_count = Self::count_consumed_tokens(blank_str);
182 PatternMatch::Session {
183 subject_idx: 0,
184 content_idx: 1 + blank_count,
185 preceding_blank_range: None,
186 }
187 }
188 "definition" => PatternMatch::Definition {
189 subject_idx: 0,
190 content_idx: 1,
191 },
192 "paragraph" => PatternMatch::Paragraph {
193 start_idx: 0,
194 end_idx: consumed_count - 1,
195 },
196 "blank_line_group" => PatternMatch::BlankLineGroup,
197 "document_start" => PatternMatch::DocumentStart,
198 _ => continue,
199 };
200
201 return Some((pattern, start_idx..start_idx + consumed_count));
202 }
203 }
204 }
205
206 None
207 }
208
209 /// Convert remaining tokens to grammar notation string
210 fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
211 let mut result = String::new();
212 for token in tokens {
213 match token {
214 LineContainer::Token(t) => {
215 result.push_str(&t.line_type.to_grammar_string());
216 }
217 LineContainer::Container { .. } => {
218 result.push_str("<container>");
219 }
220 }
221 }
222 if result.is_empty() {
223 None
224 } else {
225 Some(result)
226 }
227 }
228
229 /// Count how many tokens are represented in a grammar string.
230 /// Each token type in angle brackets represents one token.
231 fn count_consumed_tokens(grammar_str: &str) -> usize {
232 grammar_str.matches('<').count()
233 }
234
235 /// Match verbatim blocks using imperative logic.
236 ///
237 /// Verbatim blocks consist of:
238 /// 1. A subject line
239 /// 2. Content that is either:
240 /// a) In a Container (inflow mode - content indented relative to subject)
241 /// b) Flat lines (fullwidth mode - content at fixed column, or groups)
242 /// 3. A closing annotation marker (:: ... ::)
243 ///
244 /// This matcher handles both the original inflow case (subject + container + annotation)
245 /// and the fullwidth case (subject + flat lines + annotation). To distinguish verbatim
246 /// blocks from sessions followed by annotations, we require that either:
247 /// - There's a Container immediately after the subject, OR
248 /// - The closing annotation is at the SAME indentation as the subject
249 ///
250 /// Sessions have their title at the root level and content is indented. If we see
251 /// a root-level annotation after a root-level subject with indented content between,
252 /// that's NOT a verbatim block - it's a session followed by an annotation.
253 fn match_verbatim_block(
254 tokens: &[LineContainer],
255 start_idx: usize,
256 ) -> Option<(PatternMatch, Range<usize>)> {
257 use LineType::{
258 AnnotationStartLine, BlankLine, DocumentStart, SubjectLine, SubjectOrListItemLine,
259 };
260
261 let len = tokens.len();
262 if start_idx >= len {
263 return None;
264 }
265
266 // Allow blank lines and DocumentStart before the subject to be consumed as part of this match
267 let mut idx = start_idx;
268 while idx < len {
269 if let LineContainer::Token(line) = &tokens[idx] {
270 if line.line_type == BlankLine || line.line_type == DocumentStart {
271 idx += 1;
272 continue;
273 }
274 }
275 break;
276 }
277
278 if idx >= len {
279 return None;
280 }
281
282 // Must start with a subject line
283 let first_subject_idx = match &tokens[idx] {
284 LineContainer::Token(line)
285 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
286 {
287 idx
288 }
289 _ => return None,
290 };
291
292 let mut cursor = first_subject_idx + 1;
293
294 // Try to match one or more subject+content pairs followed by closing annotation
295 // This loop handles verbatim groups: multiple subjects sharing one closing annotation
296 loop {
297 // Skip blank lines
298 while cursor < len {
299 if let LineContainer::Token(line) = &tokens[cursor] {
300 if line.line_type == BlankLine {
301 cursor += 1;
302 continue;
303 }
304 }
305 break;
306 }
307
308 if cursor >= len {
309 return None;
310 }
311
312 // Check what we have at cursor
313 match &tokens[cursor] {
314 LineContainer::Container { .. } => {
315 // Found a container - this is potentially inflow mode verbatim content
316 // But we need to verify the pattern:
317 // - Verbatim: subject + container + (annotation OR another subject+container)
318 // - Session: subject + container + (other content)
319 cursor += 1;
320
321 // Skip blank lines after container
322 while cursor < len {
323 if let LineContainer::Token(line) = &tokens[cursor] {
324 if line.line_type == BlankLine {
325 cursor += 1;
326 continue;
327 }
328 }
329 break;
330 }
331
332 // After container, check what follows
333 if cursor >= len {
334 return None; // Container at end - not a verbatim block
335 }
336
337 match &tokens[cursor] {
338 LineContainer::Token(line) => {
339 if matches!(line.line_type, AnnotationStartLine) {
340 // Container followed by closing annotation (:: label ::) - this IS verbatim!
341 // Continue loop to match it
342 continue;
343 }
344 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
345 // Container followed by another subject - this is a verbatim group!
346 // Continue loop to match more groups
347 continue;
348 }
349 // Container followed by something else - NOT a verbatim block
350 return None;
351 }
352 LineContainer::Container { .. } => {
353 // Container followed by another container - NOT verbatim pattern
354 return None;
355 }
356 }
357 }
358 LineContainer::Token(line) => {
359 if matches!(line.line_type, AnnotationStartLine) {
360 // Found closing annotation (:: label ::) - success!
361 // But only if we haven't mixed containers with flat content in a problematic way
362 return Some((
363 PatternMatch::VerbatimBlock {
364 subject_idx: first_subject_idx,
365 content_range: (first_subject_idx + 1)..cursor,
366 closing_idx: cursor,
367 },
368 start_idx..(cursor + 1),
369 ));
370 }
371
372 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
373 // Another subject - this is another group
374 cursor += 1;
375 continue;
376 }
377
378 // Any other flat token (paragraph line, etc.)
379 // This is fullwidth mode or group content
380 cursor += 1;
381 }
382 }
383 }
384 }
385}
386
/// Main recursive descent parser using the declarative grammar.
///
/// This is the entry point for parsing a sequence of tokens at any level.
/// It iteratively tries to match patterns and recursively descends into containers.
///
/// Sessions are permitted at this outermost level (`allow_sessions = true`)
/// and the first matched item is treated as the document start
/// (`is_doc_start = true`).
///
/// # Errors
///
/// Propagates any `String` error produced while constructing AST nodes for
/// matched patterns.
pub fn parse_with_declarative_grammar(
    tokens: Vec<LineContainer>,
    source: &str,
) -> Result<Vec<ParseNode>, String> {
    parse_with_declarative_grammar_internal(tokens, source, true, true)
}
397
/// Internal parsing function with nesting level tracking.
///
/// * `allow_sessions` - whether the `session` pattern may match at this
///   level (true at top level and inside sessions; false inside definitions
///   and other containers).
/// * `is_doc_start` - true only for the outermost invocation, so that only
///   the document's very first item gets first-item privileges in `try_match`.
///
/// # Errors
///
/// Propagates any `String` error from AST node construction.
fn parse_with_declarative_grammar_internal(
    tokens: Vec<LineContainer>,
    source: &str,
    allow_sessions: bool,
    is_doc_start: bool,
) -> Result<Vec<ParseNode>, String> {
    let mut items: Vec<ParseNode> = Vec::new();
    let mut idx = 0;

    while idx < tokens.len() {
        // Derive the matching context from the most recently produced node;
        // `try_match` uses these flags solely to gate the `session` pattern.
        let (has_preceding_blank, has_preceding_boundary, prev_was_session) =
            if let Some(last_node) = items.last() {
                (
                    matches!(last_node.node_type, NodeType::BlankLineGroup),
                    // A node with children indicates we just closed a container; this counts as a boundary.
                    // DocumentStart also counts as a boundary - it marks the start of document content.
                    !last_node.children.is_empty()
                        || matches!(last_node.node_type, NodeType::DocumentStart),
                    matches!(last_node.node_type, NodeType::Session),
                )
            } else {
                (false, false, false)
            };

        let is_first_item = idx == 0 && is_doc_start;
        if let Some((pattern, range)) = GrammarMatcher::try_match(
            &tokens,
            idx,
            allow_sessions,
            is_first_item,
            has_preceding_blank,
            has_preceding_boundary,
            prev_was_session,
        ) {
            // Nodes emitted for this match, in document order: optional
            // leading blank group, the matched node, optional trailing blanks.
            let mut pending_nodes = Vec::new();

            if let PatternMatch::List {
                preceding_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            // NOTE(review): `try_match` currently always builds `Session` with
            // `preceding_blank_range: None`, so this branch appears unreachable
            // from the code visible in this file - confirm before removing.
            if let PatternMatch::Session {
                preceding_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            // Convert pattern to ParseNode
            // Sessions parse their children with allow_sessions=true to allow nested sessions
            // Other elements parse with allow_sessions=false to prevent sessions inside them
            let is_session = matches!(&pattern, PatternMatch::Session { .. });
            let item = convert_pattern_to_node(
                &tokens,
                &pattern,
                range.clone(),
                source,
                // Recursion happens lazily through this callback when the
                // builder descends into a container's children.
                &move |children, src| {
                    parse_with_declarative_grammar_internal(children, src, is_session, false)
                },
            )?;
            pending_nodes.push(item);

            if let PatternMatch::List {
                trailing_blank_range: Some(blank_range),
                ..
            } = &pattern
            {
                pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
            }

            items.extend(pending_nodes);
            // Advance past everything the pattern consumed (absolute range).
            idx = range.end;
        } else {
            // When no pattern matches, check if this is a Container (orphaned indented content).
            // Rather than silently dropping it, parse its children and promote them to this level.
            if let LineContainer::Container {
                children: inner, ..
            } = &tokens[idx]
            {
                if !inner.is_empty() {
                    let orphaned = parse_with_declarative_grammar_internal(
                        inner.clone(),
                        source,
                        allow_sessions,
                        false,
                    )?;
                    items.extend(orphaned);
                }
            }
            idx += 1;
        }
    }

    Ok(items)
}