Skip to main content

zenith_core/markdown/
block.rs

1//! Block-level markdown parser → [`MdBlock`] sequence.
2//!
3//! Parses a multi-line string into a flat list of block-level elements.
4//! For blocks that carry inline text (headings, paragraphs, blockquotes,
5//! list items) the text is forwarded to [`super::inline::parse_inline_markdown`]
6//! to produce the final [`crate::ast::node::TextSpan`] spans.
7//!
8//! # Supported block syntax
9//!
10//! | Syntax                               | Produces                           |
11//! |--------------------------------------|------------------------------------|
12//! | `# …` … `###### …`                  | `Heading { level: 1..=6, … }`      |
13//! | One or more consecutive non-blank    | `Paragraph { … }`                  |
14//! | `> …` (consecutive)                  | `Blockquote { … }`                 |
15//! | `- `/ `* `/ `+ ` prefix             | `ListItem { kind: Unordered, … }`  |
16//! | `1. ` / `2. ` … prefix              | `ListItem { kind: Ordered, … }`    |
17//! | ` ``` ` … ` ``` ` (fenced)          | `CodeBlock { lang, content }`      |
18//! | `---` / `***` / `___` / `- - -` …   | `HorizontalRule`                   |
19//!
20//! # Non-goals (V1)
21//!
22//! Setext headings, nested blockquote trees, true list-tree nesting, GFM
23//! tables, reference links, HTML blocks, indented (4-space) code blocks, and
24//! task checkboxes are not recognised; they degrade to `Paragraph` or the
25//! nearest applicable block kind.
26
27use crate::ast::node::TextSpan;
28
29use super::inline::parse_inline_markdown;
30
/// Distinguishes bulleted list items from numbered ones.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ListKind {
    /// Item introduced by a `-`, `*` or `+` bullet marker.
    Unordered,
    /// Item introduced by a decimal number followed by `.` (e.g. `1.`).
    Ordered,
}
37
38/// A parsed block-level markdown element.
39#[derive(Debug, Clone, PartialEq)]
40pub enum MdBlock {
41    /// ATX heading (`#` … `######`). `level` is 1–6.
42    Heading { level: u8, spans: Vec<TextSpan> },
43    /// One or more consecutive non-blank lines (not matching any other rule).
44    Paragraph { spans: Vec<TextSpan> },
45    /// A `>` blockquote (consecutive lines merged with a space).
46    Blockquote { spans: Vec<TextSpan> },
47    /// A single list item (flat; `depth` encodes indentation level).
48    ListItem {
49        kind: ListKind,
50        depth: u32,
51        ordinal: Option<u32>,
52        spans: Vec<TextSpan>,
53    },
54    /// A fenced code block (``` … ```). `content` is RAW — no inline parsing.
55    CodeBlock {
56        lang: Option<String>,
57        content: String,
58    },
59    /// A horizontal rule (`---`, `***`, `___`, `- - -`, …).
60    HorizontalRule,
61}
62
63// ---------------------------------------------------------------------------
64// Internal open-block state machine
65// ---------------------------------------------------------------------------
66
/// Accumulator for the multi-line block the parser is currently inside.
#[derive(Debug)]
enum Open {
    /// No multi-line block is in progress.
    None,
    /// Lines collected so far for a paragraph.
    Paragraph(Vec<String>),
    /// Lines collected so far for a blockquote, `>` prefix already removed.
    Blockquote(Vec<String>),
    /// A fenced code block whose closing fence has not been seen yet.
    Code {
        /// Text that followed the opening fence, if any.
        lang: Option<String>,
        /// Raw body lines collected so far.
        lines: Vec<String>,
    },
}
78
79/// Parse a markdown document into a flat list of [`MdBlock`]s.
80///
81/// Infallible: no panics, no errors; malformed markdown degrades gracefully.
82/// Fully deterministic: same input → same output.
83pub fn parse_block_markdown(input: &str) -> Vec<MdBlock> {
84    let mut out: Vec<MdBlock> = Vec::new();
85    let mut open: Open = Open::None;
86
87    for raw_line in input.split('\n') {
88        // Strip trailing CR (handle \r\n line endings).
89        let line = raw_line.trim_end_matches('\r');
90
91        // ── Inside a fenced code block ──────────────────────────────────────
92        if let Open::Code { lang, lines } = &mut open {
93            // A line that trims to ``` closes the fence.
94            if line.trim() == "```" {
95                let content = lines.join("\n");
96                let lang_out = lang.take();
97                out.push(MdBlock::CodeBlock {
98                    lang: lang_out,
99                    content,
100                });
101                open = Open::None;
102            } else {
103                lines.push(line.to_owned());
104            }
105            continue;
106        }
107
108        // ── Blank line: flush whatever is open ──────────────────────────────
109        if line.trim().is_empty() {
110            flush(&mut open, &mut out);
111            continue;
112        }
113
114        // ── Fenced code opener: trim_start begins with ``` ──────────────────
115        let line_trimmed_start = line.trim_start();
116        if line_trimmed_start.starts_with("```") {
117            flush(&mut open, &mut out);
118            let after_backticks = line_trimmed_start.get(3..).unwrap_or("").trim();
119            let lang = if after_backticks.is_empty() {
120                None
121            } else {
122                Some(after_backticks.to_owned())
123            };
124            open = Open::Code {
125                lang,
126                lines: Vec::new(),
127            };
128            continue;
129        }
130
131        // ── Horizontal rule ─────────────────────────────────────────────────
132        if is_horizontal_rule(line) {
133            flush(&mut open, &mut out);
134            out.push(MdBlock::HorizontalRule);
135            continue;
136        }
137
138        // ── ATX heading ─────────────────────────────────────────────────────
139        if let Some((level, text)) = parse_atx_heading(line) {
140            flush(&mut open, &mut out);
141            let spans = parse_inline_markdown(text);
142            out.push(MdBlock::Heading { level, spans });
143            continue;
144        }
145
146        // ── Blockquote ──────────────────────────────────────────────────────
147        if let Some(inner) = strip_blockquote_prefix(line) {
148            match &mut open {
149                Open::Blockquote(lines) => {
150                    lines.push(inner.to_owned());
151                }
152                Open::None | Open::Paragraph(_) | Open::Code { .. } => {
153                    flush(&mut open, &mut out);
154                    open = Open::Blockquote(vec![inner.to_owned()]);
155                }
156            }
157            continue;
158        }
159
160        // If we were accumulating a blockquote and this line is NOT a `>`
161        // line, flush the blockquote first (it's terminated).
162        if matches!(&open, Open::Blockquote(_)) {
163            flush(&mut open, &mut out);
164            // Fall through: the current line starts a new block below.
165        }
166
167        // ── List item ───────────────────────────────────────────────────────
168        if let Some(item) = parse_list_item(line) {
169            flush(&mut open, &mut out);
170            let spans = parse_inline_markdown(item.text);
171            out.push(MdBlock::ListItem {
172                kind: item.kind,
173                depth: item.depth,
174                ordinal: item.ordinal,
175                spans,
176            });
177            continue;
178        }
179
180        // ── Paragraph (default) ─────────────────────────────────────────────
181        match &mut open {
182            Open::Paragraph(lines) => {
183                lines.push(line.to_owned());
184            }
185            Open::None | Open::Blockquote(_) | Open::Code { .. } => {
186                open = Open::Paragraph(vec![line.to_owned()]);
187            }
188        }
189    }
190
191    // Flush any open block at EOF.
192    flush(&mut open, &mut out);
193    out
194}
195
196// ---------------------------------------------------------------------------
197// Flush helpers
198// ---------------------------------------------------------------------------
199
200/// Flush the currently open block into `out`, resetting `open` to `None`.
201fn flush(open: &mut Open, out: &mut Vec<MdBlock>) {
202    let done = std::mem::replace(open, Open::None);
203    match done {
204        Open::None => {}
205        Open::Paragraph(lines) => {
206            if lines.is_empty() {
207                return;
208            }
209            let text = lines.join(" ");
210            let spans = parse_inline_markdown(&text);
211            out.push(MdBlock::Paragraph { spans });
212        }
213        Open::Blockquote(lines) => {
214            if lines.is_empty() {
215                return;
216            }
217            let text = lines.join(" ");
218            let spans = parse_inline_markdown(&text);
219            out.push(MdBlock::Blockquote { spans });
220        }
221        Open::Code { lang, lines } => {
222            // Unclosed fence at EOF: flush with content so far.
223            let content = lines.join("\n");
224            out.push(MdBlock::CodeBlock { lang, content });
225        }
226    }
227}
228
229// ---------------------------------------------------------------------------
230// Line classifiers
231// ---------------------------------------------------------------------------
232
/// Decide whether `line` is a thematic break.
///
/// After trimming surrounding whitespace, the line must consist only of one
/// repeated marker — `-`, `*` or `_` — optionally separated by spaces, and
/// the marker must occur at least three times. `---`, `* * *` and `___` all
/// qualify; `--`, `-*-` and `--- x` do not.
fn is_horizontal_rule(line: &str) -> bool {
    let body = line.trim();

    // The first character selects the marker the whole line must repeat.
    let marker = match body.chars().next() {
        Some(c) if "-*_".contains(c) => c,
        _ => return false,
    };

    // Single pass: tally markers, skip spaces, abort on anything else.
    let tally = body.chars().try_fold(0u32, |seen, c| {
        if c == marker {
            Some(seen + 1)
        } else if c == ' ' {
            Some(seen)
        } else {
            None
        }
    });

    matches!(tally, Some(n) if n >= 3)
}
257
/// Parse an ATX heading. Returns `(level, inner_text)` if the line matches.
///
/// A valid ATX heading is, after optional leading whitespace, a run of 1–6
/// `#` characters followed by either a space or end-of-line. Seven or more
/// `#`, or a `#` run followed directly by other text (`#tag`), is not a
/// heading and yields `None`.
///
/// `inner_text` is the text after the opening run and its single separating
/// space, with trailing whitespace removed. An optional closing `#` run is
/// removed as well, but only when it is separated from the text by
/// whitespace (`## title ##` → `title`) or when the text consists of nothing
/// but `#` (`## ##` → empty). A `#` run glued to the text is content and is
/// kept, so `# C#` yields `C#`.
fn parse_atx_heading(line: &str) -> Option<(u8, &str)> {
    let s = line.trim_start();

    // Length of the opening `#` run; ASCII, so it is also a byte offset.
    let hash_count = s.bytes().take_while(|&b| b == b'#').count();
    if hash_count == 0 || hash_count > 6 {
        return None;
    }

    let rest = s.get(hash_count..)?;
    // The run must be followed by a space or end the line; otherwise this is
    // ordinary text that merely starts with `#`.
    let inner = if rest.is_empty() {
        ""
    } else {
        rest.strip_prefix(' ')?
    };
    let inner = inner.trim_end();

    // A trailing `#` run is a closing sequence only if whitespace (or
    // nothing at all) precedes it; otherwise it belongs to the heading text.
    let before_closer = inner.trim_end_matches('#');
    let text = if before_closer.is_empty() || before_closer.ends_with(char::is_whitespace) {
        before_closer.trim_end()
    } else {
        inner
    };

    // `hash_count` is in 1..=6 here, so the narrowing cast cannot truncate.
    Some((hash_count as u8, text))
}
292
/// Remove the blockquote marker from `line`.
///
/// Leading whitespace is skipped; if the next character is `>`, the text
/// after it is returned with at most one following space removed. Any
/// further spaces are kept. Returns `None` when the line is not a `>` line.
fn strip_blockquote_prefix(line: &str) -> Option<&str> {
    let quoted = line.trim_start().strip_prefix('>')?;
    // Only a single separating space belongs to the marker.
    Some(quoted.strip_prefix(' ').unwrap_or(quoted))
}
309
310/// Data extracted from a parsed list-item line.
311struct ListItemData<'a> {
312    kind: ListKind,
313    depth: u32,
314    ordinal: Option<u32>,
315    text: &'a str,
316}
317
318/// Try to parse `line` as a list item. The leading-space count (before the
319/// marker) determines `depth` (spaces / 2, clamped to `u32`).
320///
321/// Unordered markers: `-`, `*`, `+` followed by a space.
322/// Ordered markers: one or more ASCII digits followed by `.` then a space.
323fn parse_list_item(line: &str) -> Option<ListItemData<'_>> {
324    // Count leading spaces to determine depth.
325    let leading_spaces = line.count_ascii_lead_spaces();
326    let depth = (leading_spaces / 2) as u32;
327
328    let s = line.trim_start();
329
330    // Unordered: `-`/`*`/`+` + space.
331    if let Some(first) = s.chars().next() {
332        if matches!(first, '-' | '*' | '+') {
333            let rest = s.get(1..).unwrap_or("");
334            if rest.starts_with(' ') {
335                let text = rest.get(1..).unwrap_or("").trim_end();
336                return Some(ListItemData {
337                    kind: ListKind::Unordered,
338                    depth,
339                    ordinal: None,
340                    text,
341                });
342            }
343        }
344    }
345
346    // Ordered: digits + `.` + space.
347    let digit_end = s.bytes().take_while(|b| b.is_ascii_digit()).count();
348    if digit_end > 0 {
349        let after_digits = s.get(digit_end..)?;
350        if after_digits.starts_with(". ") {
351            let ordinal_str = s.get(..digit_end)?;
352            let ordinal: u32 = ordinal_str.parse().ok()?;
353            let text = after_digits.get(2..).unwrap_or("").trim_end();
354            return Some(ListItemData {
355                kind: ListKind::Ordered,
356                depth,
357                ordinal: Some(ordinal),
358                text,
359            });
360        }
361    }
362
363    None
364}
365
366// ---------------------------------------------------------------------------
367// Small utility trait to count leading spaces without allocation
368// ---------------------------------------------------------------------------
369
/// Extension trait for measuring a line's indentation.
trait CountAsciiLeadSpaces {
    /// Number of consecutive ASCII space characters (`' '`) at the start.
    /// Tabs and other whitespace end the run.
    fn count_ascii_lead_spaces(&self) -> usize;
}

impl CountAsciiLeadSpaces for str {
    fn count_ascii_lead_spaces(&self) -> usize {
        // Each stripped space is exactly one byte, so the difference in
        // byte length equals the number of leading spaces.
        self.len() - self.trim_start_matches(' ').len()
    }
}