Skip to main content

docspec_markdown_reader/
lib.rs

1#![allow(dead_code)]
2
3//! Markdown to `DocSpec` event stream reader.
4//!
5//! This crate provides a [`MarkdownReader`] that implements [`EventSource`] to convert
6//! Markdown documents into the `DocSpec` event stream format. It uses `pulldown-cmark`
7//! to parse CommonMark-compliant Markdown and emits typed events representing document
8//! structure.
9//!
10//! # Quick Start
11//!
12//! ```
13//! use docspec_markdown_reader::{MarkdownReader, EventSource};
14//!
15//! let markdown = "# Hello\n\nWorld";
16//! let mut reader = MarkdownReader::from_str(markdown);
17//!
18//! while let Some(event) = reader.next_event()? {
19//!     println!("{event:?}");
20//! }
21//! # Ok::<(), docspec_core::Error>(())
22//! ```
23//!
24//! # Supported Elements
25//!
26//! - Headings (h1–h6) → `StartHeading` / `EndHeading`
27//! - Paragraphs → `StartParagraph` / `EndParagraph`
28//! - Block quotes → `StartBlockQuote` / `EndBlockQuote`
29//! - Code blocks → `StartPreformatted` / `EndPreformatted`
30//! - Bold text → `StartTextStyle { kind: Bold }` / `EndTextStyle`
31//! - Italic text → `StartTextStyle { kind: Italic }` / `EndTextStyle`
32//! - Inline code → `StartTextStyle { kind: Code }` / `EndTextStyle`
33//! - Strikethrough → `StartTextStyle { kind: Strikethrough }` / `EndTextStyle`
34//! - Images → `Image { source: Uri, alt, title, decorative }`
35//! - Hard line breaks → `LineBreak`
36//! - Soft line breaks → `SoftBreak`
37//! - Thematic breaks → `ThematicBreak`
38//! - Tables → `StartTable` / `EndTable`, `StartTableRow` / `EndTableRow`,
39//!   `StartTableHeader` / `EndTableHeader`, `StartTableCell` / `EndTableCell`
40//!   (GFM column alignment syntax is parsed, but alignment data is discarded)
41//! - Bullet lists → `StartUnorderedListItem` / `EndUnorderedListItem`
42//! - Numbered lists → `StartOrderedListItem` / `EndOrderedListItem`
43//!   (`start: Option<u64>` is `Some(n)` on the first item of each list, `None` on subsequent items;
44//!   child items may nest inside their parent's `Start*`/`End*` pair with `level` indicating
45//!   indent depth; task list markers (`- [ ]`/`- [x]`) are parsed as literal text)
46//! - Links → `StartLink { href, title }` / `EndLink` (inline, reference, collapsed,
47//!   shortcut, autolink, and email autolink variants — all resolved to inline form
48//!   by pulldown-cmark; image-inside-link closes the link before emitting the image
49//!   as a sibling block: content preceding the image stays inside the link, content
50//!   following the image is outside the link, and the link is empty only when the
51//!   image is the sole link label, e.g. `[![alt](img)](url)`)
52//!
53//! # Supported Raw HTML Tags
54//!
55//! The following raw HTML tags embedded in markdown source are translated into
56//! `DocSpec` events. All attributes on these tags are silently ignored. All other
57//! HTML tags continue to be silently dropped.
58//!
59//! ## Inline formatting (translated to `StartTextStyle` / `EndTextStyle`)
60//! - `<b>`, `<strong>` → `TextStyleKind::Bold`
61//! - `<i>`, `<em>` → `TextStyleKind::Italic`
62//! - `<u>` → `TextStyleKind::Underline`
63//! - `<s>`, `<strike>`, `<del>` → `TextStyleKind::Strikethrough`
64//! - `<code>` → `TextStyleKind::Code`
65//! - `<sub>` → `TextStyleKind::Subscript`
66//! - `<sup>` → `TextStyleKind::Superscript`
67//! - `<mark>` → `TextStyleKind::Mark` with constant yellow `#FFFF00`
68//!
69//! ## Self-closing / void
70//! - `<br>`, `<br/>`, `<br />` → `Event::LineBreak`
71//! - `<hr>` → `Event::ThematicBreak` (block context only; ignored in paragraph context)
72//!
73//! ## Block (only inside an `HtmlBlock`)
74//! - `<h1>`...`<h6>` → `Event::StartHeading { level: N }` + content + `Event::EndHeading`
75//!
76//! ## Known limitations
77//! - Raw HTML `<pre><code>...</code></pre>` is NOT treated as a code block; the `<pre>` is dropped
78//!   (out of scope) and the `<code>` becomes an inline style. Use markdown fenced code blocks instead.
79//! - HTML attributes (id, class, style, href, src, etc.) are NOT extracted.
80//! - Unclosed tags are auto-closed at the end of the containing block.
81//!
82//! # Unsupported Elements
83//!
84//! The following elements are not emitted as structured events. Text content is
85//! recursively extracted where applicable; structure is silently dropped:
86//! - Definition lists and footnotes
87//! - Math blocks and inline math
88//! - Subscript and superscript formatting (use `<sub>` / `<sup>` raw HTML instead)
89//!
90//! # Memory Model
91//!
92//! `MarkdownReader` owns its source text for the parser's lifetime. While events
93//! are emitted one at a time via [`EventSource::next_event`] (the stream-event
94//! guarantee is preserved), the source `String` is held in memory until the reader
95//! is dropped. This is a constraint of `pulldown-cmark`, which is permanently
96//! borrow-based by design (see [pulldown-cmark issue #463]).
97//!
98//! For contrast, `HtmlReader` (from `docspec-html-reader`) streams its source via a
99//! 16 KB sliding-window buffer and does not hold the full document in memory.
100//!
101//! [pulldown-cmark issue #463]: https://github.com/raphlinus/pulldown-cmark/issues/463
102
103extern crate alloc;
104
105#[cfg_attr(all(), allow(clippy::mem_forget))]
106mod parser_cell {
107    use self_cell::self_cell;
108
109    use super::MarkdownParser;
110
111    self_cell!(
112        pub(super) struct ParserCell {
113            owner: String,
114            #[covariant]
115            dependent: MarkdownParser,
116        }
117    );
118}
119
120mod html;
121
122use alloc::collections::VecDeque;
123use std::io::{Read, Seek};
124
125pub use docspec_core::EventSource;
126use docspec_core::{Event, ImageSource, ListStyleType, Result, TableHeaderScope, TextStyleKind};
127use parser_cell::ParserCell;
128use pulldown_cmark::{CodeBlockKind, CowStr, HeadingLevel, Options, Parser, Tag, TagEnd};
129
130struct MarkdownParser<'a>(Parser<'a>);
131
/// Tracks which kind of block-level container, if any, inline content currently
/// belongs to.
#[derive(Clone, Copy, PartialEq, Eq)]
enum BlockState {
    /// A paragraph the reader opened on its own because text or inline code arrived
    /// while the state was `None`.
    AutoParagraph,
    /// An explicitly opened block: a paragraph whose `StartParagraph` has already been
    /// emitted, or a list item that has just been started.
    Explicit,
    /// No block is open.
    None,
    /// A paragraph tag was seen, but `StartParagraph` is held back until the first
    /// event that commits content to the paragraph.
    PendingExplicit,
}
144
/// Lifecycle of the emitted document stream.
#[derive(Clone, Copy, PartialEq, Eq)]
enum Phase {
    /// The parser is exhausted and `EndDocument` has been queued.
    Finished,
    /// Initial phase of a freshly constructed reader; `StartDocument` has not been
    /// emitted yet.
    NotStarted,
    /// Between `StartDocument` and `EndDocument`.
    Running,
}
155
/// Per-level bookkeeping for one list on [`MarkdownReader`]'s list stack.
struct ListContext {
    /// `true` between the item's start event and its matching end event.
    item_open: bool,
    /// `true` for an ordered (numbered) list, `false` for a bullet list.
    ordered: bool,
    /// Start number handed to the first item of the list. It is taken when that item
    /// is emitted, so every later item of the same list sees `None`.
    pending_start: Option<u64>,
}
166
/// State of an image whose alt text is still being collected.
struct ImageBuffer {
    /// Alt text gathered so far from the text and inline-code events inside the image.
    alt_buf: String,
    /// Optional image title.
    title: Option<String>,
    /// Image destination URL.
    url: String,
}
176
177enum MarkdownPulldownEvent {
178    BlockHtml(String),
179    Code(String),
180    End(TagEnd),
181    HardBreak,
182    InlineHtml(String),
183    Ignored,
184    Rule,
185    SoftBreak,
186    Start(MarkdownStartTag),
187    Text(String),
188}
189
190enum MarkdownStartTag {
191    BlockQuote,
192    CodeBlock {
193        syntax: Option<String>,
194    },
195    Emphasis,
196    Heading {
197        level: HeadingLevel,
198    },
199    Image {
200        dest_url: String,
201        title: Option<String>,
202    },
203    Item,
204    Link {
205        dest_url: String,
206        title: Option<String>,
207    },
208    HtmlBlock,
209    List(Option<u64>),
210    Paragraph,
211    Strikethrough,
212    Strong,
213    Table,
214    TableCell,
215    TableHead,
216    TableRow,
217}
218
/// State of a link whose `StartLink` emission is deferred.
struct LinkBuffer {
    /// Link destination.
    href: String,
    /// `true` once `StartLink` has been queued; it is queued lazily, when the first
    /// inline event inside the link arrives.
    started: bool,
    /// Optional link title (the `"title"` part of `[text](url "title")`).
    title: Option<String>,
}
228
229/// A streaming Markdown reader that implements [`EventSource`].
230///
231/// `MarkdownReader` parses Markdown using `pulldown-cmark` and emits `DocSpec` events
232/// one at a time. It handles the mapping from `pulldown-cmark`'s event model to `DocSpec`'s
233/// event model, including tracking inline formatting state.
234///
235/// # Example
236///
237/// ```
238/// use docspec_markdown_reader::{MarkdownReader, EventSource};
239///
240/// let mut reader = MarkdownReader::from_str("**bold** and *italic*");
241/// while let Some(event) = reader.next_event()? {
242///     // Process events...
243/// }
244/// # Ok::<(), docspec_core::Error>(())
245/// ```
246pub struct MarkdownReader {
247    /// Current block-level context.
248    block_state: BlockState,
249    /// Owned source text and parser borrowing from it.
250    cell: ParserCell,
251    /// Buffered code block text (accumulated until `EndCodeBlock` to strip trailing newline).
252    code_block_buffer: Option<String>,
253    /// Buffered image being processed (alt text accumulation).
254    image: Option<ImageBuffer>,
255    /// Heading accumulator for block HTML fragments.
256    html_block_heading_acc: crate::html::translator::BlockHeadingAccumulator,
257    /// Inline style stack scoped to block HTML headings.
258    html_block_inline_stack: crate::html::stack::StyleStack,
259    /// Whether the parser is currently inside a pulldown HTML block wrapper.
260    in_html_block: bool,
261    /// Whether the parser is currently inside a preformatted code block.
262    in_preformatted: bool,
263    /// Whether the parser is currently inside a table header row.
264    in_table_head: bool,
265    /// Buffered link being processed (deferred Start emission for image-in-link extraction).
266    link: Option<LinkBuffer>,
267    /// LIFO stack of list contexts. `len()` gives the current nesting depth;
268    /// `level = list_stack.len().saturating_sub(1)` at item-emit time.
269    list_stack: alloc::vec::Vec<ListContext>,
270    /// Unified inline style stack shared by markdown emphasis and inline HTML.
271    inline_style_stack: crate::html::stack::StyleStack,
272    /// Document processing phase.
273    phase: Phase,
274    /// Queue of `DocSpec` events to emit.
275    queue: VecDeque<Event>,
276}
277
278impl MarkdownReader {
279    fn close_current_item_if_open(&mut self) {
280        let Some(ctx) = self.list_stack.last() else {
281            return;
282        };
283        if !ctx.item_open {
284            return;
285        }
286
287        let ordered = ctx.ordered;
288        self.flush_html_styles();
289        if ordered {
290            self.queue.push_back(Event::EndOrderedListItem);
291        } else {
292            self.queue.push_back(Event::EndUnorderedListItem);
293        }
294        if let Some(current_ctx) = self.list_stack.last_mut() {
295            current_ctx.item_open = false;
296        }
297        self.block_state = BlockState::None;
298    }
299
300    fn close_style(&mut self, kind: &TextStyleKind) {
301        if self.in_preformatted {
302            return;
303        }
304
305        for event in self
306            .inline_style_stack
307            .close(intent_from_text_style_kind(kind))
308        {
309            self.queue.push_back(event);
310        }
311    }
312
313    fn open_style(&mut self, kind: &TextStyleKind) {
314        if !self.in_preformatted {
315            for event in self
316                .inline_style_stack
317                .open(intent_from_text_style_kind(kind))
318            {
319                self.queue.push_back(event);
320            }
321        }
322    }
323
324    fn enqueue_text(&mut self, content: String) {
325        for event in self.inline_style_stack.note_text() {
326            self.queue.push_back(event);
327        }
328        let text_event = Event::Text { content };
329        self.queue.push_back(text_event);
330    }
331
332    fn flush_html_styles(&mut self) {
333        for event in self.inline_style_stack.close_all() {
334            self.queue.push_back(event);
335        }
336    }
337
338    /// Emits `StartLink` for the buffered link if it hasn't been emitted yet.
339    /// Called before any inline event that would belong inside a link.
340    fn emit_pending_link_start(&mut self) {
341        self.flush_pending_paragraph_start();
342        if let Some(link) = self.link.as_mut() {
343            if !link.started {
344                self.queue.push_back(Event::StartLink {
345                    href: link.href.clone(),
346                    id: None,
347                    title: link.title.clone(),
348                });
349                link.started = true;
350            }
351        }
352    }
353
354    /// Emits `StartParagraph` for the deferred paragraph if it hasn't been emitted yet.
355    /// Called before any committing event that would belong inside a paragraph.
356    fn flush_pending_paragraph_start(&mut self) {
357        if self.block_state == BlockState::PendingExplicit {
358            self.queue.push_back(Event::StartParagraph {
359                alignment: None,
360                id: None,
361            });
362            self.block_state = BlockState::Explicit;
363        }
364    }
365
366    fn from_owned_string(source: String) -> Self {
367        let options = Options::ENABLE_TABLES | Options::ENABLE_STRIKETHROUGH;
368        let cell = ParserCell::new(source, |s| MarkdownParser(Parser::new_ext(s, options)));
369        Self {
370            block_state: BlockState::None,
371            cell,
372            code_block_buffer: None,
373            image: None,
374            html_block_heading_acc: crate::html::translator::BlockHeadingAccumulator::default(),
375            html_block_inline_stack: crate::html::stack::StyleStack::default(),
376            in_html_block: false,
377            in_preformatted: false,
378            in_table_head: false,
379            link: None,
380            list_stack: Vec::new(),
381            inline_style_stack: crate::html::stack::StyleStack::default(),
382            phase: Phase::NotStarted,
383            queue: VecDeque::new(),
384        }
385    }
386
387    /// Creates a `MarkdownReader` from any `Read + Seek` source.
388    ///
389    /// Reads the entire source into memory (required by `pulldown_cmark`'s
390    /// borrow-based parser).
391    ///
392    /// # Errors
393    ///
394    /// Returns [`Error::Io`](docspec_core::Error::Io) if reading fails.
395    #[inline]
396    pub fn from_reader<R: Read + Seek + Send + 'static>(mut reader: R) -> Result<Self> {
397        let mut source = String::new();
398        reader.read_to_string(&mut source)?;
399        Ok(Self::from_owned_string(source))
400    }
401
402    /// Creates a `MarkdownReader` from a string slice.
403    ///
404    /// The input is copied into an owned `String` for the parser's lifetime.
405    ///
406    /// # Example
407    ///
408    /// ```
409    /// use docspec_markdown_reader::MarkdownReader;
410    ///
411    /// let reader = MarkdownReader::from_str("# Hello World");
412    /// ```
413    #[inline]
414    #[must_use]
415    #[expect(
416        clippy::should_implement_trait,
417        reason = "constructor name is required for reader API consistency"
418    )]
419    pub fn from_str(input: &str) -> Self {
420        Self::from_owned_string(input.to_owned())
421    }
422
423    fn handle_code(&mut self, content: String) {
424        if let Some(img) = &mut self.image {
425            img.alt_buf.push_str(&content);
426        } else {
427            self.emit_pending_link_start();
428            if self.block_state == BlockState::None {
429                self.queue.push_back(Event::StartParagraph {
430                    alignment: None,
431                    id: None,
432                });
433                self.block_state = BlockState::AutoParagraph;
434            }
435            self.open_style(&TextStyleKind::Code);
436            self.enqueue_text(content);
437            self.close_style(&TextStyleKind::Code);
438        }
439    }
440
441    /// Emits the buffered code block content (stripping the parser-added trailing newline)
442    /// followed by `EndPreformatted`. Skips the text event if the buffer is empty.
443    fn handle_end_code_block(&mut self) {
444        if let Some(buf) = self.code_block_buffer.take() {
445            let content = buf.strip_suffix('\n').unwrap_or(&buf).to_owned();
446            if !content.is_empty() {
447                self.enqueue_text(content);
448            }
449        }
450        self.in_preformatted = false;
451        self.push_event_end(Event::EndPreformatted);
452    }
453
454    /// Emits an `Image` event from the accumulated image buffer, deriving
455    /// `decorative = true` when the trimmed alt text is empty. Consumes the
456    /// in-progress image state; does nothing if no image is in progress.
457    fn handle_end_image(&mut self) {
458        let Some(img) = self.image.take() else { return };
459        self.flush_pending_paragraph_start();
460        let trimmed = img.alt_buf.trim();
461        let alt = if trimmed.is_empty() {
462            None
463        } else {
464            Some(trimmed.to_owned())
465        };
466        let decorative = alt.is_none();
467        self.queue.push_back(Event::Image {
468            source: ImageSource::Uri { uri: img.url },
469            alt,
470            title: img.title,
471            decorative,
472            id: None,
473        });
474    }
475
476    /// Closes an auto-opened paragraph if one is open, then closes the current
477    /// list item and resets block state.
478    fn handle_end_item(&mut self) {
479        if self.block_state == BlockState::AutoParagraph {
480            self.flush_html_styles();
481            self.queue.push_back(Event::EndParagraph);
482        }
483        self.close_current_item_if_open();
484        self.block_state = BlockState::None;
485    }
486
487    /// Emits `EndLink` (and `StartLink` if not yet emitted) for the buffered link.
488    fn handle_end_link(&mut self) {
489        let Some(link) = self.link.take() else { return };
490        if link.started {
491            self.queue.push_back(Event::EndLink);
492        } else {
493            self.flush_pending_paragraph_start();
494            self.queue.push_back(Event::StartLink {
495                href: link.href,
496                id: None,
497                title: link.title,
498            });
499            self.queue.push_back(Event::EndLink);
500        }
501    }
502
503    /// Closes the current list item if open, pops the list context, and resets block state.
504    fn handle_end_list(&mut self) {
505        self.close_current_item_if_open();
506        self.list_stack.pop();
507        self.block_state = BlockState::None;
508    }
509
510    /// Emits `EndTableCell` or `EndTableHeader` depending on whether the parser
511    /// is currently inside a table header row.
512    fn handle_end_table_cell(&mut self) {
513        if self.in_table_head {
514            self.push_event_end(Event::EndTableHeader);
515        } else {
516            self.push_event_end(Event::EndTableCell);
517        }
518    }
519
520    /// Emits `EndTableRow` and clears the table-head flag for a table head closing tag.
521    fn handle_end_table_head(&mut self) {
522        self.push_event_end(Event::EndTableRow);
523        self.in_table_head = false;
524    }
525
526    /// Dispatches a `pulldown-cmark` end tag to the appropriate per-tag handler.
527    ///
528    /// Tags in the explicit ignore list below are known-unsupported elements whose
529    /// structure is intentionally dropped (text content may still be extracted by
530    /// other event handlers).
531    fn handle_end_tag(&mut self, tag_end: TagEnd) {
532        match tag_end {
533            TagEnd::BlockQuote(_) => self.push_event_end(Event::EndBlockQuote),
534            TagEnd::CodeBlock => self.handle_end_code_block(),
535            TagEnd::Emphasis => self.close_style(&TextStyleKind::Italic),
536            TagEnd::Heading(_) => self.push_event_end(Event::EndHeading),
537            TagEnd::HtmlBlock => {
538                self.in_html_block = false;
539                for event in self.html_block_inline_stack.close_all() {
540                    self.queue.push_back(event);
541                }
542                if let Some(event) = self.html_block_heading_acc.finish_block() {
543                    self.queue.push_back(event);
544                }
545            }
546            TagEnd::Image => self.handle_end_image(),
547            TagEnd::Item => self.handle_end_item(),
548            TagEnd::Link => self.handle_end_link(),
549            TagEnd::List(_) => self.handle_end_list(),
550            TagEnd::Paragraph => {
551                if self.block_state == BlockState::PendingExplicit {
552                    self.flush_html_styles();
553                    self.block_state = BlockState::None;
554                } else {
555                    self.push_event_end(Event::EndParagraph);
556                }
557            }
558            TagEnd::Strikethrough => self.close_style(&TextStyleKind::Strikethrough),
559            TagEnd::Strong => self.close_style(&TextStyleKind::Bold),
560            TagEnd::Table => self.push_event_end(Event::EndTable),
561            TagEnd::TableCell => self.handle_end_table_cell(),
562            TagEnd::TableHead => self.handle_end_table_head(),
563            TagEnd::TableRow => self.push_event_end(Event::EndTableRow),
564            // Tags intentionally ignored (structure dropped, text extracted elsewhere):
565            TagEnd::DefinitionList
566            | TagEnd::DefinitionListDefinition
567            | TagEnd::DefinitionListTitle
568            | TagEnd::FootnoteDefinition
569            | TagEnd::MetadataBlock(_)
570            | TagEnd::Subscript
571            | TagEnd::Superscript => {}
572        }
573    }
574
575    fn handle_item_start(&mut self) {
576        let depth = self.list_stack.len().saturating_sub(1);
577        let level = u32::try_from(depth).map_or(u32::MAX, |v| v);
578        if let Some(ctx) = self.list_stack.last_mut() {
579            if ctx.ordered {
580                self.queue.push_back(Event::StartOrderedListItem {
581                    start: ctx.pending_start.take(),
582                    style_type: ListStyleType::Decimal,
583                    level,
584                    id: None,
585                });
586            } else {
587                self.queue.push_back(Event::StartUnorderedListItem {
588                    style_type: ListStyleType::Disc,
589                    level,
590                    id: None,
591                });
592            }
593            ctx.item_open = true;
594            self.block_state = BlockState::Explicit;
595        }
596    }
597
598    fn handle_list_start(&mut self, start_opt: Option<u64>) {
599        self.list_stack.push(ListContext {
600            item_open: false,
601            ordered: start_opt.is_some(),
602            pending_start: start_opt,
603        });
604    }
605
606    /// Emits `StartPreformatted` for a code block opening tag, initialising
607    /// the internal code-block buffer for content accumulation.
608    fn handle_start_code_block(&mut self, syntax: Option<String>) {
609        self.code_block_buffer = Some(String::new());
610        self.in_preformatted = true;
611        self.push_event_start(Event::StartPreformatted { id: None, syntax });
612    }
613
614    /// Emits `StartHeading` after mapping a `pulldown-cmark` `HeadingLevel` to a `u8` level.
615    fn handle_start_heading(&mut self, level: HeadingLevel) {
616        let level_u8 = match level {
617            HeadingLevel::H1 => 1,
618            HeadingLevel::H2 => 2,
619            HeadingLevel::H3 => 3,
620            HeadingLevel::H4 => 4,
621            HeadingLevel::H5 => 5,
622            HeadingLevel::H6 => 6,
623        };
624        self.push_event_start(Event::StartHeading {
625            level: level_u8,
626            id: None,
627        });
628    }
629
630    /// Initialises image state for alt-text accumulation when an image opening tag is
631    /// encountered. The title is stored as `None` when the pulldown-cmark title string
632    /// is empty.
633    fn handle_start_image(&mut self, dest_url: String, title: Option<String>) {
634        // Image-in-link extraction: close the link before processing the image so the
635        // image can be emitted as a sibling block (BlockNote and similar schemas do not
636        // allow block-level images inside inline links). When `link.started` is true, the
637        // link already contains preceding inline content — emit only `EndLink`. When it
638        // is false (image is the sole link label, e.g. `[![alt](img)](url)`), emit an
639        // empty `StartLink`/`EndLink` pair so the URL is preserved. `TagEnd::Image` fires
640        // `Event::Image` before `TagEnd::Paragraph`, so downstream writers close the
641        // surrounding paragraph before serialising the image as a sibling block.
642        self.flush_pending_paragraph_start();
643        if let Some(link) = self.link.take() {
644            if link.started {
645                self.queue.push_back(Event::EndLink);
646            } else {
647                self.queue.push_back(Event::StartLink {
648                    href: link.href,
649                    id: None,
650                    title: link.title,
651                });
652                self.queue.push_back(Event::EndLink);
653            }
654        }
655
656        self.image = Some(ImageBuffer {
657            alt_buf: String::new(),
658            title,
659            url: dest_url,
660        });
661    }
662
663    /// Stores link state for deferred `StartLink` emission.
664    ///
665    /// Emission is deferred until the first inline event arrives (lazy emission).
666    /// This allows image-in-link to be detected before any `StartLink` is emitted.
667    fn handle_start_link(&mut self, dest_url: String, title: Option<String>) {
668        self.link = Some(LinkBuffer {
669            href: dest_url,
670            started: false,
671            title,
672        });
673    }
674
675    /// Emits `StartTableHeader` or `StartTableCell` depending on whether the parser
676    /// is currently inside a table header row.
677    fn handle_start_table_cell(&mut self) {
678        if self.in_table_head {
679            self.push_event_start(Event::StartTableHeader {
680                scope: Some(TableHeaderScope::Column),
681                abbr: None,
682                colspan: None,
683                rowspan: None,
684                id: None,
685            });
686        } else {
687            self.push_event_start(Event::StartTableCell {
688                colspan: None,
689                rowspan: None,
690                id: None,
691            });
692        }
693    }
694
695    /// Sets the table-head flag and emits `StartTableRow` for a table head opening tag.
696    fn handle_start_table_head(&mut self) {
697        self.in_table_head = true;
698        self.push_event_start(Event::StartTableRow { id: None });
699    }
700
701    /// Dispatches a `pulldown-cmark` start tag to the appropriate per-tag handler.
702    ///
703    /// Tags in the explicit ignore list below are known-unsupported elements whose
704    /// structure is intentionally dropped (text content may still be extracted by
705    /// other event handlers).
706    fn handle_start_tag(&mut self, tag: MarkdownStartTag) {
707        match tag {
708            MarkdownStartTag::BlockQuote => {
709                self.push_event_start(Event::StartBlockQuote { id: None });
710            }
711            MarkdownStartTag::CodeBlock { syntax } => self.handle_start_code_block(syntax),
712            MarkdownStartTag::Emphasis => self.open_style(&TextStyleKind::Italic),
713            MarkdownStartTag::Heading { level } => self.handle_start_heading(level),
714            MarkdownStartTag::HtmlBlock => self.in_html_block = true,
715            MarkdownStartTag::Image { dest_url, title } => self.handle_start_image(dest_url, title),
716            MarkdownStartTag::Item => self.handle_item_start(),
717            MarkdownStartTag::Link { dest_url, title } => self.handle_start_link(dest_url, title),
718            MarkdownStartTag::List(start_opt) => self.handle_list_start(start_opt),
719            MarkdownStartTag::Paragraph => self.block_state = BlockState::PendingExplicit,
720            MarkdownStartTag::Strikethrough => self.open_style(&TextStyleKind::Strikethrough),
721            MarkdownStartTag::Strong => self.open_style(&TextStyleKind::Bold),
722            MarkdownStartTag::Table => self.push_event_start(Event::StartTable { id: None }),
723            MarkdownStartTag::TableCell => self.handle_start_table_cell(),
724            MarkdownStartTag::TableHead => self.handle_start_table_head(),
725            MarkdownStartTag::TableRow => self.push_event_start(Event::StartTableRow { id: None }),
726        }
727    }
728
729    fn handle_text(&mut self, content: String) {
730        if let Some(img) = &mut self.image {
731            img.alt_buf.push_str(&content);
732        } else if let Some(buf) = &mut self.code_block_buffer {
733            buf.push_str(&content);
734        } else {
735            self.emit_pending_link_start();
736            if self.block_state == BlockState::None {
737                self.queue.push_back(Event::StartParagraph {
738                    alignment: None,
739                    id: None,
740                });
741                self.block_state = BlockState::AutoParagraph;
742            }
743            self.enqueue_text(content);
744        }
745    }
746
747    fn next_pulldown_event(&mut self) -> Option<MarkdownPulldownEvent> {
748        self.cell.with_dependent_mut(|_, dep| {
749            dep.0.next().map(|event| match event {
750                pulldown_cmark::Event::Start(tag) => markdown_start_tag(tag)
751                    .map_or(MarkdownPulldownEvent::Ignored, MarkdownPulldownEvent::Start),
752                pulldown_cmark::Event::End(tag_end) => MarkdownPulldownEvent::End(tag_end),
753                pulldown_cmark::Event::Text(text) => {
754                    MarkdownPulldownEvent::Text(text.into_string())
755                }
756                pulldown_cmark::Event::Code(code) => {
757                    MarkdownPulldownEvent::Code(code.into_string())
758                }
759                pulldown_cmark::Event::HardBreak => MarkdownPulldownEvent::HardBreak,
760                pulldown_cmark::Event::SoftBreak => MarkdownPulldownEvent::SoftBreak,
761                pulldown_cmark::Event::Rule => MarkdownPulldownEvent::Rule,
762                pulldown_cmark::Event::InlineHtml(tag_str) => {
763                    MarkdownPulldownEvent::InlineHtml(tag_str.into_string())
764                }
765                pulldown_cmark::Event::Html(fragment) => {
766                    MarkdownPulldownEvent::BlockHtml(fragment.into_string())
767                }
768                pulldown_cmark::Event::DisplayMath(_)
769                | pulldown_cmark::Event::FootnoteReference(_)
770                | pulldown_cmark::Event::InlineMath(_)
771                | pulldown_cmark::Event::TaskListMarker(_) => MarkdownPulldownEvent::Ignored,
772            })
773        })
774    }
775
776    fn process_next_pulldown_event(&mut self) {
777        let Some(pm_event) = self.next_pulldown_event() else {
778            if self.phase != Phase::Finished {
779                self.phase = Phase::Finished;
780                self.flush_html_styles();
781                self.queue.push_back(Event::EndDocument);
782            }
783            return;
784        };
785
786        match pm_event {
787            MarkdownPulldownEvent::BlockHtml(fragment) => {
788                let events = crate::html::translator::translate_block(
789                    &fragment,
790                    &mut self.html_block_heading_acc,
791                    &mut self.html_block_inline_stack,
792                    self.in_preformatted,
793                );
794                for event in events {
795                    match event {
796                        Event::Text { content } => self.enqueue_text(content),
797                        other => self.queue.push_back(other),
798                    }
799                }
800            }
801            MarkdownPulldownEvent::Start(tag) => self.handle_start_tag(tag),
802            MarkdownPulldownEvent::End(tag_end) => self.handle_end_tag(tag_end),
803            MarkdownPulldownEvent::Text(text) => self.handle_text(text),
804            MarkdownPulldownEvent::Code(code) => self.handle_code(code),
805            MarkdownPulldownEvent::InlineHtml(fragment) => {
806                let events = crate::html::translator::translate_inline(
807                    &fragment,
808                    &mut self.inline_style_stack,
809                    self.in_preformatted,
810                );
811                for event in events {
812                    match event {
813                        Event::Text { content } => self.enqueue_text(content),
814                        other => self.queue.push_back(other),
815                    }
816                }
817            }
818            MarkdownPulldownEvent::HardBreak => {
819                if let Some(img) = &mut self.image {
820                    img.alt_buf.push(' ');
821                } else if self.block_state == BlockState::PendingExplicit {
822                    // emitting a break before StartParagraph would be malformed — discard
823                } else {
824                    self.emit_pending_link_start();
825                    self.queue.push_back(Event::LineBreak);
826                }
827            }
828            MarkdownPulldownEvent::SoftBreak => {
829                if let Some(img) = &mut self.image {
830                    img.alt_buf.push(' ');
831                } else if self.block_state == BlockState::PendingExplicit {
832                    // emitting a break before StartParagraph would be malformed — discard
833                } else {
834                    self.emit_pending_link_start();
835                    self.queue.push_back(Event::SoftBreak);
836                }
837            }
838            MarkdownPulldownEvent::Rule => {
839                self.queue.push_back(Event::ThematicBreak { id: None });
840            }
841            MarkdownPulldownEvent::Ignored => {}
842        }
843    }
844
845    fn push_event(&mut self, event: Event, state: BlockState) {
846        self.queue.push_back(event);
847        self.block_state = state;
848    }
849
850    fn push_event_end(&mut self, event: Event) {
851        self.flush_html_styles();
852        self.push_event(event, BlockState::None);
853    }
854
855    fn push_event_start(&mut self, event: Event) {
856        self.push_event(event, BlockState::Explicit);
857    }
858}
859
860impl EventSource for MarkdownReader {
861    #[inline]
862    fn next_event(&mut self) -> Result<Option<Event>> {
863        if self.phase == Phase::NotStarted {
864            self.phase = Phase::Running;
865            return Ok(Some(Event::StartDocument {
866                id: None,
867                language: None,
868                metadata: None,
869            }));
870        }
871
872        if self.phase == Phase::Finished && self.queue.is_empty() {
873            return Ok(None);
874        }
875
876        while self.queue.is_empty() && self.phase != Phase::Finished {
877            self.process_next_pulldown_event();
878        }
879
880        Ok(self.queue.pop_front())
881    }
882}
883
884fn intent_from_text_style_kind(k: &TextStyleKind) -> crate::html::tags::TagIntent {
885    use crate::html::tags::TagIntent;
886    match k {
887        TextStyleKind::Bold => TagIntent::Bold,
888        TextStyleKind::Italic => TagIntent::Italic,
889        TextStyleKind::Underline => TagIntent::Underline,
890        TextStyleKind::Strikethrough => TagIntent::Strikethrough,
891        TextStyleKind::Code => TagIntent::Code,
892        TextStyleKind::Subscript => TagIntent::Subscript,
893        TextStyleKind::Superscript => TagIntent::Superscript,
894        TextStyleKind::Mark(_) => TagIntent::Mark,
895        _ => TagIntent::Ignored,
896    }
897}
898
899fn markdown_start_tag(tag: Tag<'_>) -> Option<MarkdownStartTag> {
900    match tag {
901        Tag::BlockQuote(_) => Some(MarkdownStartTag::BlockQuote),
902        Tag::CodeBlock(kind) => Some(MarkdownStartTag::CodeBlock {
903            syntax: code_block_syntax(kind),
904        }),
905        Tag::Emphasis => Some(MarkdownStartTag::Emphasis),
906        Tag::Heading { level, .. } => Some(MarkdownStartTag::Heading { level }),
907        Tag::Image {
908            dest_url, title, ..
909        } => Some(MarkdownStartTag::Image {
910            dest_url: dest_url.into_string(),
911            title: cow_to_optional_string(title),
912        }),
913        Tag::HtmlBlock => Some(MarkdownStartTag::HtmlBlock),
914        Tag::Item => Some(MarkdownStartTag::Item),
915        Tag::Link {
916            dest_url, title, ..
917        } => Some(MarkdownStartTag::Link {
918            dest_url: dest_url.into_string(),
919            title: cow_to_optional_string(title),
920        }),
921        Tag::List(start_opt) => Some(MarkdownStartTag::List(start_opt)),
922        Tag::Paragraph => Some(MarkdownStartTag::Paragraph),
923        Tag::Strikethrough => Some(MarkdownStartTag::Strikethrough),
924        Tag::Strong => Some(MarkdownStartTag::Strong),
925        Tag::Table(_) => Some(MarkdownStartTag::Table),
926        Tag::TableCell => Some(MarkdownStartTag::TableCell),
927        Tag::TableHead => Some(MarkdownStartTag::TableHead),
928        Tag::TableRow => Some(MarkdownStartTag::TableRow),
929        Tag::DefinitionList
930        | Tag::DefinitionListDefinition
931        | Tag::DefinitionListTitle
932        | Tag::FootnoteDefinition(_)
933        | Tag::MetadataBlock(_)
934        | Tag::Subscript
935        | Tag::Superscript => None,
936    }
937}
938
939fn code_block_syntax(kind: CodeBlockKind<'_>) -> Option<String> {
940    match kind {
941        CodeBlockKind::Fenced(lang) if !lang.is_empty() => Some(lang.into_string()),
942        CodeBlockKind::Fenced(_) | CodeBlockKind::Indented => None,
943    }
944}
945
946fn cow_to_optional_string(value: CowStr<'_>) -> Option<String> {
947    if value.is_empty() {
948        None
949    } else {
950        Some(value.into_string())
951    }
952}
953
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrows every queued event, in order, so whole sequences can be compared.
    fn queued(reader: &MarkdownReader) -> Vec<&Event> {
        reader.queue.iter().collect()
    }

    /// The paragraph-opening event expected when a paragraph is opened implicitly.
    fn implicit_paragraph_start() -> Event {
        Event::StartParagraph {
            alignment: None,
            id: None,
        }
    }

    #[test]
    fn handle_code_without_open_block_auto_opens_paragraph() {
        let mut reader = MarkdownReader::from_str("");
        reader.handle_code(String::from("code"));

        let expected = [
            implicit_paragraph_start(),
            Event::StartTextStyle {
                kind: TextStyleKind::Code,
                id: None,
            },
            Event::Text {
                content: String::from("code"),
            },
            Event::EndTextStyle,
        ];

        assert_eq!(queued(&reader), expected.iter().collect::<Vec<_>>());
    }

    #[test]
    fn handle_text_without_open_block_auto_opens_paragraph() {
        let mut reader = MarkdownReader::from_str("");
        reader.handle_text(String::from("hello"));

        let expected = [
            implicit_paragraph_start(),
            Event::Text {
                content: String::from("hello"),
            },
        ];

        assert_eq!(queued(&reader), expected.iter().collect::<Vec<_>>());
    }
}
1008
#[cfg(test)]
mod send_static_assertions {
    use crate::MarkdownReader;

    /// Compiles only when `T` is `Send + 'static`; does nothing at runtime.
    fn require_send_static<T>()
    where
        T: Send + 'static,
    {
    }

    #[test]
    fn markdown_reader_is_send_static() {
        require_send_static::<MarkdownReader>();
    }
}