Skip to main content

marco_core/parser/inlines/
mod.rs

1//! Inline parser modules - convert grammar output to AST nodes
2//!
3//! This module contains specialized parsers that convert inline grammar elements
4//! (from grammar/inlines) into AST nodes with proper position tracking.
5//!
6//! Phase 5: Inline parser module extraction
7
8// Shared utilities for all inline parsers
9pub mod shared;
10
11// Individual inline parser modules
12pub mod cm_autolink_parser;
13pub mod cm_backslash_escape_parser;
14pub mod cm_code_span_parser;
15pub mod cm_emphasis_parser;
16pub mod cm_entity_reference_parser;
17pub mod cm_image_parser;
18pub mod cm_inline_html_parser;
19pub mod cm_line_breaks_parser;
20pub mod cm_link_parser;
21pub mod cm_reference_link_parser;
22pub mod cm_strong_emphasis_parser;
23pub mod cm_strong_parser;
24pub mod gfm_autolink_literal_parser;
25pub mod gfm_footnote_reference_parser;
26pub mod gfm_strikethrough_parser;
27pub mod marco_dash_strikethrough_parser;
28pub mod marco_emoji_shortcode_parser;
29pub mod marco_inline_footnote_parser;
30pub mod marco_mark_parser;
31pub mod marco_platform_mentions_parser;
32pub mod marco_subscript_arrow_parser;
33pub mod marco_subscript_parser;
34pub mod marco_superscript_parser;
35pub mod marco_task_checkbox_inline_parser;
36pub mod math_display_parser;
37pub mod math_inline_parser;
38pub mod text_parser;
39
40// Re-export parser functions for convenience
41pub use cm_autolink_parser::parse_autolink;
42pub use cm_backslash_escape_parser::parse_backslash_escape;
43pub use cm_code_span_parser::parse_code_span;
44pub use cm_emphasis_parser::parse_emphasis;
45pub use cm_entity_reference_parser::parse_entity_reference;
46pub use cm_image_parser::parse_image;
47pub use cm_inline_html_parser::parse_inline_html;
48pub use cm_line_breaks_parser::{parse_hard_line_break, parse_soft_line_break};
49pub use cm_link_parser::parse_link;
50pub use cm_reference_link_parser::parse_reference_link;
51pub use cm_strong_emphasis_parser::parse_strong_emphasis;
52pub use cm_strong_parser::parse_strong;
53pub use gfm_autolink_literal_parser::parse_gfm_autolink_literal;
54pub use gfm_footnote_reference_parser::parse_footnote_reference;
55pub use gfm_strikethrough_parser::parse_strikethrough;
56pub use marco_dash_strikethrough_parser::parse_dash_strikethrough;
57pub use marco_emoji_shortcode_parser::parse_emoji_shortcode;
58pub use marco_inline_footnote_parser::parse_inline_footnote;
59pub use marco_mark_parser::parse_mark;
60pub use marco_platform_mentions_parser::parse_platform_mention;
61pub use marco_subscript_arrow_parser::parse_subscript_arrow;
62pub use marco_subscript_parser::parse_subscript;
63pub use marco_superscript_parser::parse_superscript;
64pub use marco_task_checkbox_inline_parser::parse_task_checkbox_inline;
65pub use math_display_parser::parse_display_math;
66pub use math_inline_parser::parse_inline_math;
67pub use text_parser::{parse_special_as_text, parse_text};
68
69use super::ast::{Node, NodeKind};
70use nom::bytes::complete::take;
71use shared::{opt_span, GrammarSpan};
72
73/// Parse inline elements within text content
74/// Takes a GrammarSpan to preserve position information
75/// Returns a vector of inline nodes (Text, Emphasis, Strong, Link, CodeSpan)
76pub fn parse_inlines_from_span(span: GrammarSpan) -> Result<Vec<Node>, Box<dyn std::error::Error>> {
77    log::debug!(
78        "Parsing inline elements in span at line {}: {:?}",
79        span.location_line(),
80        span.fragment()
81    );
82
83    let mut nodes = Vec::with_capacity(8);
84    let mut remaining = span;
85
86    // Safety: prevent infinite loops
87    const MAX_ITERATIONS: usize = 1000;
88    let mut iteration_count = 0;
89    let mut last_offset = 0;
90
91    while !remaining.fragment().is_empty() {
92        iteration_count += 1;
93        if iteration_count > MAX_ITERATIONS {
94            log::error!("Inline parser exceeded MAX_ITERATIONS ({})", MAX_ITERATIONS);
95            break;
96        }
97
98        let start_pos = remaining.location_offset();
99
100        // Safety: ensure we're making progress
101        if start_pos == last_offset && iteration_count > 1 {
102            log::error!(
103                "Inline parser not making progress at offset {}, forcing skip",
104                start_pos
105            );
106            // Force skip one character
107            let skip = remaining
108                .fragment()
109                .chars()
110                .next()
111                .map(|c| c.len_utf8())
112                .unwrap_or(1);
113            if let Ok((rest, _)) = take::<_, _, nom::error::Error<_>>(skip)(remaining) {
114                remaining = rest;
115                last_offset = remaining.location_offset();
116                continue;
117            } else {
118                break;
119            }
120        }
121        last_offset = start_pos;
122
123        // ---------------------------------------------------------------
124        // Fast path: skip all ~25 parser attempts for bytes that cannot
125        // possibly start any special inline sequence.
126        //
127        // Special ASCII bytes (can open a parser):
128        //   * _ ` [ < ! & \n \\ $ ^ ~ = -
129        // Space (0x20) is usually plain text but "  \n" (2+ spaces + newline)
130        // forms a hard line break — let full dispatch handle that case.
131        // Any byte >= 0x80 may start a multi-byte character (e.g. ˅ U+02C5)
132        // so we leave those for the full dispatch.
133        // ---------------------------------------------------------------
134        // SAFETY: the loop condition guarantees remaining is non-empty.
135        let first_byte = remaining.fragment().as_bytes()[0];
136        let is_non_special_ascii = first_byte < 0x80
137            && !matches!(
138                first_byte,
139                b'*' | b'_'
140                    | b'`'
141                    | b'['
142                    | b'<'
143                    | b'!'
144                    | b'&'
145                    | b'\n'
146                    | b'\\'
147                    | b'$'
148                    | b'^'
149                    | b'~'
150                    | b'='
151                    | b'-'
152            );
153        // Guard spaces: "  \n" is a hard line break — let the full loop handle it.
154        let safe_to_fast_path = is_non_special_ascii
155            && if first_byte == b' ' {
156                let frag = remaining.fragment().as_bytes();
157                let sp = frag.iter().take_while(|&&b| b == b' ').count();
158                !(sp >= 2 && frag.get(sp) == Some(&b'\n'))
159            } else {
160                true
161            };
162        if safe_to_fast_path {
163            if let Ok((rest, node)) = parse_text(remaining) {
164                nodes.push(node);
165                remaining = rest;
166                continue;
167            }
168            // parse_text failed: the position may start a GFM autolink literal,
169            // an emoji shortcode, a platform mention, or trailing hard-break spaces.
170            // Fall through to the full dispatch so those parsers get a chance.
171        }
172
173        // Try parsing code span first (highest priority to avoid conflicts)
174        if let Ok((rest, node)) = parse_code_span(remaining) {
175            nodes.push(node);
176            remaining = rest;
177            continue;
178        }
179
180        // Try parsing display math before inline math (avoid $$ being parsed as two $)
181        if crate::parser::shared::parse_math_enabled() {
182            if let Ok((rest, node)) = parse_display_math(remaining) {
183                nodes.push(node);
184                remaining = rest;
185                continue;
186            }
187
188            // Try parsing inline math
189            if let Ok((rest, node)) = parse_inline_math(remaining) {
190                nodes.push(node);
191                remaining = rest;
192                continue;
193            }
194        }
195
196        // Try parsing backslash escape (before other inline elements)
197        if let Ok((rest, node)) = parse_backslash_escape(remaining) {
198            nodes.push(node);
199            remaining = rest;
200            continue;
201        }
202
203        // Extension inlines (non-CommonMark): try these early so their delimiter
204        // sequences aren't consumed as plain text.
205        if let Ok((rest, node)) = parse_strikethrough(remaining) {
206            nodes.push(node);
207            remaining = rest;
208            continue;
209        }
210
211        if let Ok((rest, node)) = parse_dash_strikethrough(remaining) {
212            nodes.push(node);
213            remaining = rest;
214            continue;
215        }
216
217        if let Ok((rest, node)) = parse_mark(remaining) {
218            nodes.push(node);
219            remaining = rest;
220            continue;
221        }
222
223        // CommonMark underscore emphasis has special delimiter rules. In
224        // particular, intraword underscores (alnum _ alnum) should not open
225        // or close emphasis. Because our parser advances left-to-right, the
226        // underscore parsers may not be able to see the previous character.
227        //
228        // Workaround: when we're at an underscore run and the previous emitted
229        // character is alphanumeric and the next character after the run is
230        // alphanumeric, consume the underscore run as literal text.
231        if let Some(run_len) = intraword_underscore_run_len(&nodes, remaining.fragment()) {
232            if let Ok((rest, consumed)) = take::<_, _, nom::error::Error<_>>(run_len)(remaining) {
233                nodes.push(Node {
234                    kind: NodeKind::Text("_".repeat(run_len)),
235                    span: opt_span(consumed),
236                    children: Vec::new(),
237                });
238                remaining = rest;
239                continue;
240            }
241        }
242
243        // Try parsing strong+emphasis (***text*** / ___text___) before strong
244        // so we don't consume the first two delimiters as strong and leave a
245        // dangling delimiter behind.
246        if let Ok((rest, node)) = parse_strong_emphasis(remaining) {
247            nodes.push(node);
248            remaining = rest;
249            continue;
250        }
251
252        // Try parsing strong (must come before emphasis to match ** before *)
253        if let Ok((rest, node)) = parse_strong(remaining) {
254            nodes.push(node);
255            remaining = rest;
256            continue;
257        }
258
259        // Try parsing emphasis
260        if let Ok((rest, node)) = parse_emphasis(remaining) {
261            nodes.push(node);
262            remaining = rest;
263            continue;
264        }
265
266        // Extended syntax: inline footnotes `^[...]`.
267        // Try before superscript since both start with '^'.
268        if let Ok((rest, (ref_node, def_node))) = parse_inline_footnote(remaining) {
269            nodes.push(ref_node);
270            nodes.push(def_node);
271            remaining = rest;
272            continue;
273        }
274
275        if let Ok((rest, node)) = parse_superscript(remaining) {
276            nodes.push(node);
277            remaining = rest;
278            continue;
279        }
280
281        if let Ok((rest, node)) = parse_subscript_arrow(remaining) {
282            nodes.push(node);
283            remaining = rest;
284            continue;
285        }
286
287        if let Ok((rest, node)) = parse_subscript(remaining) {
288            nodes.push(node);
289            remaining = rest;
290            continue;
291        }
292
293        // Try parsing GFM autolink literals (www/http(s)/email/protocol forms)
294        if let Ok((rest, node)) = parse_gfm_autolink_literal(remaining) {
295            nodes.push(node);
296            remaining = rest;
297            continue;
298        }
299
300        // Try parsing autolink (must come before link and inline HTML since syntax starts with <)
301        if let Ok((rest, node)) = parse_autolink(remaining) {
302            nodes.push(node);
303            remaining = rest;
304            continue;
305        }
306
307        // Try parsing GFM-style footnote references `[^label]`.
308        // Must come before link parsing since it also starts with '['.
309        if let Ok((rest, node)) = parse_footnote_reference(remaining) {
310            nodes.push(node);
311            remaining = rest;
312            continue;
313        }
314
315        // Extended syntax: inline task checkbox markers mid-paragraph.
316        // This must come before link parsing since it starts with '['.
317        if is_task_checkbox_inline_start_boundary_ok(&nodes, remaining.fragment()) {
318            if let Ok((rest, node)) = parse_task_checkbox_inline(remaining) {
319                nodes.push(node);
320                remaining = rest;
321                continue;
322            }
323        }
324
325        // Try parsing image (must come before link since syntax is similar but starts with !)
326        if let Ok((rest, node)) = parse_image(remaining) {
327            nodes.push(node);
328            remaining = rest;
329            continue;
330        }
331
332        // Try parsing link
333        if let Ok((rest, node)) = parse_link(remaining) {
334            nodes.push(node);
335            remaining = rest;
336            continue;
337        }
338
339        // Try parsing reference-style links (CommonMark)
340        if let Ok((rest, node)) = parse_reference_link(remaining) {
341            nodes.push(node);
342            remaining = rest;
343            continue;
344        }
345
346        // Try parsing inline HTML
347        if let Ok((rest, node)) = parse_inline_html(remaining) {
348            nodes.push(node);
349            remaining = rest;
350            continue;
351        }
352
353        // Try parsing hard line break (two spaces + newline, or backslash + newline)
354        if let Ok((rest, node)) = parse_hard_line_break(remaining) {
355            log::debug!(
356                "Parsed hard line break at offset {}",
357                remaining.location_offset()
358            );
359            nodes.push(node);
360            remaining = rest;
361            continue;
362        }
363
364        // Try parsing soft line break (regular newline)
365        if let Ok((rest, node)) = parse_soft_line_break(remaining) {
366            nodes.push(node);
367            remaining = rest;
368            continue;
369        }
370
371        // Try parsing entity references (e.g. &copy;, &#169;)
372        if let Ok((rest, node)) = parse_entity_reference(remaining) {
373            nodes.push(node);
374            remaining = rest;
375            continue;
376        }
377
378        // Try parsing emoji shortcodes (extended syntax), e.g. :joy:
379        if let Ok((rest, node)) = parse_emoji_shortcode(remaining) {
380            nodes.push(node);
381            remaining = rest;
382            continue;
383        }
384
385        // Try parsing platform mentions (extended syntax), e.g. @user[github](Name)
386        if let Ok((rest, node)) = parse_platform_mention(remaining) {
387            nodes.push(node);
388            remaining = rest;
389            continue;
390        }
391
392        // No inline element matched - try parsing plain text
393        if let Ok((rest, node)) = parse_text(remaining) {
394            nodes.push(node);
395            remaining = rest;
396            continue;
397        }
398
399        // Special character that didn't parse as any inline element - consume as text
400        if let Ok((rest, node)) = parse_special_as_text(remaining) {
401            nodes.push(node);
402            remaining = rest;
403            continue;
404        }
405
406        // Safety check: if we reach here, we failed to parse anything
407        // This should not happen if all parsers are working correctly
408        log::error!(
409            "Inline parser unable to make progress at offset {}",
410            start_pos
411        );
412        break;
413    }
414
415    log::debug!("Parsed {} inline nodes", nodes.len());
416    Ok(nodes)
417}
418
419fn intraword_underscore_run_len(nodes: &[Node], fragment: &str) -> Option<usize> {
420    if !fragment.starts_with('_') {
421        return None;
422    }
423
424    let prev = last_emitted_char(nodes)?;
425    if !prev.is_alphanumeric() {
426        return None;
427    }
428
429    let run_len = fragment.chars().take_while(|&c| c == '_').count();
430    let after = fragment.chars().nth(run_len)?;
431    if !after.is_alphanumeric() {
432        return None;
433    }
434
435    Some(run_len)
436}
437
438fn is_task_checkbox_inline_start_boundary_ok(nodes: &[Node], fragment: &str) -> bool {
439    if !fragment.starts_with('[') {
440        return false;
441    }
442
443    // If the previous emitted character is alphanumeric/underscore, we do not
444    // treat `[x]` / `[ ]` as a task marker (avoid matching `word[x]`).
445    match last_emitted_char(nodes) {
446        None => true,
447        Some(prev) => !(prev.is_alphanumeric() || prev == '_'),
448    }
449}
450
451fn last_emitted_char(nodes: &[Node]) -> Option<char> {
452    nodes.iter().rev().find_map(last_char_in_node)
453}
454
455fn last_char_in_node(node: &Node) -> Option<char> {
456    match &node.kind {
457        NodeKind::Text(t) => t.chars().last(),
458        // Formatting/container nodes: use their last child.
459        _ => node.children.iter().rev().find_map(last_char_in_node),
460    }
461}
462
463/// Parse inline elements within text content (backward compatibility wrapper)
464/// Creates a new span at position 0:0 - USE parse_inlines_from_span() for position-aware parsing
465/// Returns a vector of inline nodes (Text, Emphasis, Strong, Link, CodeSpan)
466pub fn parse_inlines(text: &str) -> Result<Vec<Node>, Box<dyn std::error::Error>> {
467    parse_inlines_from_span(GrammarSpan::new(text))
468}
469
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::ast::NodeKind;

    /// `***text***` must come out as one combined strong+emphasis node.
    #[test]
    fn smoke_test_triple_delimiter_parses_as_single_node() {
        let parsed = parse_inlines("***hi***").expect("inline parse failed");

        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert!(matches!(only.kind, NodeKind::StrongEmphasis));
    }

    /// Extension inlines must be recognised in the middle of ordinary text.
    #[test]
    fn smoke_test_extension_inlines_parse_mid_line() {
        let input = "This is ^sup^ and ~sub~ and ˅sub2˅ and ==mark== and ~~del~~ and --del2--.";
        let parsed = parse_inlines(input).expect("inline parse failed");

        // True when at least one top-level node satisfies the predicate.
        let contains = |wanted: fn(&NodeKind) -> bool| parsed.iter().any(|n| wanted(&n.kind));

        assert!(contains(|k| matches!(k, NodeKind::Superscript)));
        assert!(contains(|k| matches!(k, NodeKind::Subscript)));
        assert!(contains(|k| matches!(k, NodeKind::Mark)));
        assert!(contains(|k| matches!(k, NodeKind::Strikethrough)));
    }
}