// lex_core/lex/parsing/parser.rs
//! Declarative Grammar Engine - Regex-Based Parser for lex
//!
//! This module implements a unified parser using declarative regex grammar rules:
//! 1. Converts token sequences to grammar notation strings
//! 2. Matches against regex patterns in declaration order
//! 3. Extracts consumed token indices from the regex match
//! 4. Recursively descends into containers when building the AST
//! 5. No imperative pattern matching - grammar is data, not code
//!
//! The grammar patterns and AST building logic have been extracted to separate modules:
//! - `grammar.rs` - Pattern definitions and matching order
//! - `builder.rs` - AST node construction from matched patterns

14use crate::lex::parsing::ir::{NodeType, ParseNode};
15use crate::lex::token::{LineContainer, LineType};
16use regex::Regex;
17use std::ops::Range;
18
19mod builder;
20mod grammar;
21
22use builder::{blank_line_node_from_range, convert_pattern_to_node, PatternMatch};
23use grammar::{GRAMMAR_PATTERNS, LIST_ITEM_REGEX};
24
/// Pattern matcher for the declarative grammar using regex-based matching.
///
/// A stateless namespace type: all matching entry points are associated
/// functions, so no instance is ever constructed.
pub struct GrammarMatcher;
27
28impl GrammarMatcher {
29 /// Try to match a pattern at the current level using regex patterns.
30 ///
31 /// Converts the current token sequence to a grammar string, matches against
32 /// regex patterns in declaration order, and returns the matched pattern with
33 /// consumed token indices.
34 ///
35 /// Returns (matched_pattern, consumed_indices)
36 fn try_match(
37 tokens: &[LineContainer],
38 start_idx: usize,
39 allow_sessions: bool,
40 is_first_item: bool,
41 has_preceding_blank: bool,
42 has_preceding_boundary: bool,
43 prev_was_session: bool,
44 ) -> Option<(PatternMatch, Range<usize>)> {
45 if start_idx >= tokens.len() {
46 return None;
47 }
48
49 // Try verbatim block first (requires special imperative matching logic)
50 if let Some(result) = Self::match_verbatim_block(tokens, start_idx) {
51 return Some(result);
52 }
53
54 // Convert remaining tokens to grammar string
55 let remaining_tokens = &tokens[start_idx..];
56 let token_string = Self::tokens_to_grammar_string(remaining_tokens)?;
57
58 // Try each pattern in order
59 for (pattern_name, pattern_regex_str) in GRAMMAR_PATTERNS {
60 // Skip patterns handled imperatively above
61 if *pattern_name == "verbatim_block" {
62 continue;
63 }
64 if let Ok(regex) = Regex::new(pattern_regex_str) {
65 if let Some(caps) = regex.captures(&token_string) {
66 let full_match = caps.get(0)?;
67 let consumed_count = Self::count_consumed_tokens(full_match.as_str());
68
69 // Use captures to extract indices and build the pattern
70 let pattern = match *pattern_name {
71 "annotation_block_with_end" => PatternMatch::AnnotationBlock {
72 start_idx: 0,
73 content_idx: 1,
74 },
75 "annotation_block" => PatternMatch::AnnotationBlock {
76 start_idx: 0,
77 content_idx: 1,
78 },
79 "annotation_single" => PatternMatch::AnnotationSingle { start_idx: 0 },
80 "list_no_blank" => {
81 // List without preceding blank line
82 let items_str = caps.name("items")?.as_str();
83 let mut items = Vec::new();
84 let mut token_idx = 0; // No blank line, so start at 0
85 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
86 let has_container = item_cap.as_str().contains("<container>");
87 items.push((
88 token_idx,
89 if has_container {
90 Some(token_idx + 1)
91 } else {
92 None
93 },
94 ));
95 token_idx += if has_container { 2 } else { 1 };
96 }
97
98 let trailing_blank_count = caps
99 .name("trailing_blank")
100 .map(|m| Self::count_consumed_tokens(m.as_str()))
101 .unwrap_or(0);
102 let trailing_blank_range = if trailing_blank_count > 0 {
103 Some(
104 start_idx + consumed_count - trailing_blank_count
105 ..start_idx + consumed_count,
106 )
107 } else {
108 None
109 };
110
111 PatternMatch::List {
112 items,
113 preceding_blank_range: None,
114 trailing_blank_range,
115 }
116 }
117 "list" => {
118 let blank_count = caps
119 .name("blank")
120 .map(|m| Self::count_consumed_tokens(m.as_str()))
121 .unwrap_or(0);
122 let items_str = caps.name("items")?.as_str();
123 let mut items = Vec::new();
124 let mut token_idx = blank_count;
125 for item_cap in LIST_ITEM_REGEX.find_iter(items_str) {
126 let has_container = item_cap.as_str().contains("<container>");
127 items.push((
128 token_idx,
129 if has_container {
130 Some(token_idx + 1)
131 } else {
132 None
133 },
134 ));
135 token_idx += if has_container { 2 } else { 1 };
136 }
137 let trailing_blank_count = caps
138 .name("trailing_blank")
139 .map(|m| Self::count_consumed_tokens(m.as_str()))
140 .unwrap_or(0);
141 let preceding_blank_range = if blank_count > 0 {
142 Some(start_idx..start_idx + blank_count)
143 } else {
144 None
145 };
146 let trailing_blank_range = if trailing_blank_count > 0 {
147 Some(
148 start_idx + consumed_count - trailing_blank_count
149 ..start_idx + consumed_count,
150 )
151 } else {
152 None
153 };
154
155 PatternMatch::List {
156 items,
157 preceding_blank_range,
158 trailing_blank_range,
159 }
160 }
161 "session" => {
162 // Allow session_no_blank in these cases:
163 // 1. At document start (is_first_item=true), OR
164 // 2. At container start when sessions are allowed (start_idx=0 && allow_sessions=true), OR
165 // 3. After a BlankLineGroup when sessions are allowed (has_preceding_blank && allow_sessions)
166 // 4. Immediately after another session (prev_was_session && allow_sessions)
167 // 5. Immediately after a container that just closed (has_preceding_boundary && allow_sessions)
168 // This prevents Sessions inside Definitions while allowing legitimate session sequences.
169 if !allow_sessions {
170 continue; // Definitions and other containers don't allow sessions
171 }
172 if !(is_first_item
173 || start_idx == 0
174 || has_preceding_blank
175 || has_preceding_boundary
176 || prev_was_session)
177 {
178 continue; // Sessions need a separator or another session before them
179 }
180 let blank_str = caps.name("blank")?.as_str();
181 let blank_count = Self::count_consumed_tokens(blank_str);
182 PatternMatch::Session {
183 subject_idx: 0,
184 content_idx: 1 + blank_count,
185 preceding_blank_range: None,
186 }
187 }
188 "definition" => PatternMatch::Definition {
189 subject_idx: 0,
190 content_idx: 1,
191 },
192 "paragraph" => PatternMatch::Paragraph {
193 start_idx: 0,
194 end_idx: consumed_count - 1,
195 },
196 "blank_line_group" => PatternMatch::BlankLineGroup,
197 "document_start" => PatternMatch::DocumentStart,
198 _ => continue,
199 };
200
201 return Some((pattern, start_idx..start_idx + consumed_count));
202 }
203 }
204 }
205
206 None
207 }
208
209 /// Convert remaining tokens to grammar notation string
210 fn tokens_to_grammar_string(tokens: &[LineContainer]) -> Option<String> {
211 let mut result = String::new();
212 for token in tokens {
213 match token {
214 LineContainer::Token(t) => {
215 result.push_str(&t.line_type.to_grammar_string());
216 }
217 LineContainer::Container { .. } => {
218 result.push_str("<container>");
219 }
220 }
221 }
222 if result.is_empty() {
223 None
224 } else {
225 Some(result)
226 }
227 }
228
229 /// Count how many tokens are represented in a grammar string.
230 /// Each token type in angle brackets represents one token.
231 fn count_consumed_tokens(grammar_str: &str) -> usize {
232 grammar_str.matches('<').count()
233 }
234
235 /// Match verbatim blocks using imperative logic.
236 ///
237 /// Verbatim blocks consist of:
238 /// 1. A subject line
239 /// 2. Content that is either:
240 /// a) In a Container (inflow mode - content indented relative to subject)
241 /// b) Flat lines (fullwidth mode - content at fixed column, or groups)
242 /// 3. A closing annotation marker (:: ... ::)
243 ///
244 /// This matcher handles both the original inflow case (subject + container + annotation)
245 /// and the fullwidth case (subject + flat lines + annotation). To distinguish verbatim
246 /// blocks from sessions followed by annotations, we require that either:
247 /// - There's a Container immediately after the subject, OR
248 /// - The closing annotation is at the SAME indentation as the subject
249 ///
250 /// Sessions have their title at the root level and content is indented. If we see
251 /// a root-level annotation after a root-level subject with indented content between,
252 /// that's NOT a verbatim block - it's a session followed by an annotation.
253 fn match_verbatim_block(
254 tokens: &[LineContainer],
255 start_idx: usize,
256 ) -> Option<(PatternMatch, Range<usize>)> {
257 use LineType::{
258 AnnotationStartLine, BlankLine, DataLine, DocumentStart, SubjectLine,
259 SubjectOrListItemLine,
260 };
261
262 let len = tokens.len();
263 if start_idx >= len {
264 return None;
265 }
266
267 // Allow blank lines and DocumentStart before the subject to be consumed as part of this match
268 let mut idx = start_idx;
269 while idx < len {
270 if let LineContainer::Token(line) = &tokens[idx] {
271 if line.line_type == BlankLine || line.line_type == DocumentStart {
272 idx += 1;
273 continue;
274 }
275 }
276 break;
277 }
278
279 if idx >= len {
280 return None;
281 }
282
283 // Must start with a subject line
284 let first_subject_idx = match &tokens[idx] {
285 LineContainer::Token(line)
286 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) =>
287 {
288 idx
289 }
290 _ => return None,
291 };
292
293 let mut cursor = first_subject_idx + 1;
294
295 // Try to match one or more subject+content pairs followed by closing annotation
296 // This loop handles verbatim groups: multiple subjects sharing one closing annotation
297 loop {
298 // Skip blank lines
299 while cursor < len {
300 if let LineContainer::Token(line) = &tokens[cursor] {
301 if line.line_type == BlankLine {
302 cursor += 1;
303 continue;
304 }
305 }
306 break;
307 }
308
309 if cursor >= len {
310 return None;
311 }
312
313 // Check what we have at cursor
314 match &tokens[cursor] {
315 LineContainer::Container { .. } => {
316 // Found a container - this is potentially inflow mode verbatim content
317 // But we need to verify the pattern:
318 // - Verbatim: subject + container + (annotation OR another subject+container)
319 // - Session: subject + container + (other content)
320 cursor += 1;
321
322 // Skip blank lines after container
323 while cursor < len {
324 if let LineContainer::Token(line) = &tokens[cursor] {
325 if line.line_type == BlankLine {
326 cursor += 1;
327 continue;
328 }
329 }
330 break;
331 }
332
333 // After container, check what follows
334 if cursor >= len {
335 return None; // Container at end - not a verbatim block
336 }
337
338 match &tokens[cursor] {
339 LineContainer::Token(line) => {
340 if matches!(line.line_type, DataLine | AnnotationStartLine) {
341 // Container followed by annotation - this IS verbatim!
342 // Continue loop to match it
343 continue;
344 }
345 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
346 // Container followed by another subject - this is a verbatim group!
347 // Continue loop to match more groups
348 continue;
349 }
350 // Container followed by something else - NOT a verbatim block
351 return None;
352 }
353 LineContainer::Container { .. } => {
354 // Container followed by another container - NOT verbatim pattern
355 return None;
356 }
357 }
358 }
359 LineContainer::Token(line) => {
360 if matches!(line.line_type, DataLine | AnnotationStartLine) {
361 // Found closing annotation - success!
362 // But only if we haven't mixed containers with flat content in a problematic way
363 return Some((
364 PatternMatch::VerbatimBlock {
365 subject_idx: first_subject_idx,
366 content_range: (first_subject_idx + 1)..cursor,
367 closing_idx: cursor,
368 },
369 start_idx..(cursor + 1),
370 ));
371 }
372
373 if matches!(line.line_type, SubjectLine | SubjectOrListItemLine) {
374 // Another subject - this is another group
375 cursor += 1;
376 continue;
377 }
378
379 // Any other flat token (paragraph line, etc.)
380 // This is fullwidth mode or group content
381 cursor += 1;
382 }
383 }
384 }
385 }
386}
387
/// Main recursive descent parser using the declarative grammar.
///
/// This is the entry point for parsing a sequence of tokens at any level.
/// It iteratively tries to match patterns and recursively descends into containers.
///
/// `tokens` is the token sequence to parse and `source` the original document
/// text used during node construction. The top level allows sessions and is
/// treated as the start of the document (both flags `true`).
pub fn parse_with_declarative_grammar(
    tokens: Vec<LineContainer>,
    source: &str,
) -> Result<Vec<ParseNode>, String> {
    parse_with_declarative_grammar_internal(tokens, source, true, true)
}
398
399/// Internal parsing function with nesting level tracking
400fn parse_with_declarative_grammar_internal(
401 tokens: Vec<LineContainer>,
402 source: &str,
403 allow_sessions: bool,
404 is_doc_start: bool,
405) -> Result<Vec<ParseNode>, String> {
406 let mut items: Vec<ParseNode> = Vec::new();
407 let mut idx = 0;
408
409 while idx < tokens.len() {
410 let (has_preceding_blank, has_preceding_boundary, prev_was_session) =
411 if let Some(last_node) = items.last() {
412 (
413 matches!(last_node.node_type, NodeType::BlankLineGroup),
414 // A node with children indicates we just closed a container; this counts as a boundary.
415 // DocumentStart also counts as a boundary - it marks the start of document content.
416 !last_node.children.is_empty()
417 || matches!(last_node.node_type, NodeType::DocumentStart),
418 matches!(last_node.node_type, NodeType::Session),
419 )
420 } else {
421 (false, false, false)
422 };
423
424 let is_first_item = idx == 0 && is_doc_start;
425 if let Some((pattern, range)) = GrammarMatcher::try_match(
426 &tokens,
427 idx,
428 allow_sessions,
429 is_first_item,
430 has_preceding_blank,
431 has_preceding_boundary,
432 prev_was_session,
433 ) {
434 let mut pending_nodes = Vec::new();
435
436 if let PatternMatch::List {
437 preceding_blank_range: Some(blank_range),
438 ..
439 } = &pattern
440 {
441 pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
442 }
443
444 if let PatternMatch::Session {
445 preceding_blank_range: Some(blank_range),
446 ..
447 } = &pattern
448 {
449 pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
450 }
451
452 // Convert pattern to ParseNode
453 // Sessions parse their children with allow_sessions=true to allow nested sessions
454 // Other elements parse with allow_sessions=false to prevent sessions inside them
455 let is_session = matches!(&pattern, PatternMatch::Session { .. });
456 let item = convert_pattern_to_node(
457 &tokens,
458 &pattern,
459 range.clone(),
460 source,
461 &move |children, src| {
462 parse_with_declarative_grammar_internal(children, src, is_session, false)
463 },
464 )?;
465 pending_nodes.push(item);
466
467 if let PatternMatch::List {
468 trailing_blank_range: Some(blank_range),
469 ..
470 } = &pattern
471 {
472 pending_nodes.push(blank_line_node_from_range(&tokens, blank_range.clone())?);
473 }
474
475 items.extend(pending_nodes);
476 idx = range.end;
477 } else {
478 idx += 1;
479 }
480 }
481
482 Ok(items)
483}