zenith_core/markdown/block.rs
1//! Block-level markdown parser → [`MdBlock`] sequence.
2//!
3//! Parses a multi-line string into a flat list of block-level elements.
4//! For blocks that carry inline text (headings, paragraphs, blockquotes,
5//! list items) the text is forwarded to [`super::inline::parse_inline_markdown`]
6//! to produce the final [`crate::ast::node::TextSpan`] spans.
7//!
8//! # Supported block syntax
9//!
10//! | Syntax | Produces |
11//! |--------------------------------------|------------------------------------|
12//! | `# …` … `###### …` | `Heading { level: 1..=6, … }` |
//! | One or more consecutive non-blank lines | `Paragraph { … }` |
14//! | `> …` (consecutive) | `Blockquote { … }` |
15//! | `- `/ `* `/ `+ ` prefix | `ListItem { kind: Unordered, … }` |
16//! | `1. ` / `2. ` … prefix | `ListItem { kind: Ordered, … }` |
17//! | ` ``` ` … ` ``` ` (fenced) | `CodeBlock { lang, content }` |
18//! | `---` / `***` / `___` / `- - -` … | `HorizontalRule` |
19//!
20//! # Non-goals (V1)
21//!
22//! Setext headings, nested blockquote trees, true list-tree nesting, GFM
23//! tables, reference links, HTML blocks, indented (4-space) code blocks, and
24//! task checkboxes are not recognised; they degrade to `Paragraph` or the
25//! nearest applicable block kind.
26
27use crate::ast::node::TextSpan;
28
29use super::inline::parse_inline_markdown;
30
/// Distinguishes bulleted list items from numbered ones.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ListKind {
    /// Item introduced by a `-`, `*` or `+` bullet marker.
    Unordered,
    /// Item introduced by a decimal number followed by `.`.
    Ordered,
}
37
38/// A parsed block-level markdown element.
39#[derive(Debug, Clone, PartialEq)]
40pub enum MdBlock {
41 /// ATX heading (`#` … `######`). `level` is 1–6.
42 Heading { level: u8, spans: Vec<TextSpan> },
43 /// One or more consecutive non-blank lines (not matching any other rule).
44 Paragraph { spans: Vec<TextSpan> },
45 /// A `>` blockquote (consecutive lines merged with a space).
46 Blockquote { spans: Vec<TextSpan> },
47 /// A single list item (flat; `depth` encodes indentation level).
48 ListItem {
49 kind: ListKind,
50 depth: u32,
51 ordinal: Option<u32>,
52 spans: Vec<TextSpan>,
53 },
54 /// A fenced code block (``` … ```). `content` is RAW — no inline parsing.
55 CodeBlock {
56 lang: Option<String>,
57 content: String,
58 },
59 /// A horizontal rule (`---`, `***`, `___`, `- - -`, …).
60 HorizontalRule,
61}
62
63// ---------------------------------------------------------------------------
64// Internal open-block state machine
65// ---------------------------------------------------------------------------
66
/// Parser state: the multi-line block that is still collecting lines, if any.
#[derive(Debug)]
enum Open {
    /// Nothing is being accumulated.
    None,
    /// Raw lines of a paragraph collected so far.
    Paragraph(Vec<String>),
    /// Blockquote lines collected so far, with the `>` prefix already removed.
    Blockquote(Vec<String>),
    /// A fenced code block whose closing fence has not been seen yet.
    Code {
        /// Text that followed the opening fence, if any.
        lang: Option<String>,
        /// Verbatim lines collected since the opening fence.
        lines: Vec<String>,
    },
}
78
79/// Parse a markdown document into a flat list of [`MdBlock`]s.
80///
81/// Infallible: no panics, no errors; malformed markdown degrades gracefully.
82/// Fully deterministic: same input → same output.
83pub fn parse_block_markdown(input: &str) -> Vec<MdBlock> {
84 let mut out: Vec<MdBlock> = Vec::new();
85 let mut open: Open = Open::None;
86
87 for raw_line in input.split('\n') {
88 // Strip trailing CR (handle \r\n line endings).
89 let line = raw_line.trim_end_matches('\r');
90
91 // ── Inside a fenced code block ──────────────────────────────────────
92 if let Open::Code { lang, lines } = &mut open {
93 // A line that trims to ``` closes the fence.
94 if line.trim() == "```" {
95 let content = lines.join("\n");
96 let lang_out = lang.take();
97 out.push(MdBlock::CodeBlock {
98 lang: lang_out,
99 content,
100 });
101 open = Open::None;
102 } else {
103 lines.push(line.to_owned());
104 }
105 continue;
106 }
107
108 // ── Blank line: flush whatever is open ──────────────────────────────
109 if line.trim().is_empty() {
110 flush(&mut open, &mut out);
111 continue;
112 }
113
114 // ── Fenced code opener: trim_start begins with ``` ──────────────────
115 let line_trimmed_start = line.trim_start();
116 if line_trimmed_start.starts_with("```") {
117 flush(&mut open, &mut out);
118 let after_backticks = line_trimmed_start.get(3..).unwrap_or("").trim();
119 let lang = if after_backticks.is_empty() {
120 None
121 } else {
122 Some(after_backticks.to_owned())
123 };
124 open = Open::Code {
125 lang,
126 lines: Vec::new(),
127 };
128 continue;
129 }
130
131 // ── Horizontal rule ─────────────────────────────────────────────────
132 if is_horizontal_rule(line) {
133 flush(&mut open, &mut out);
134 out.push(MdBlock::HorizontalRule);
135 continue;
136 }
137
138 // ── ATX heading ─────────────────────────────────────────────────────
139 if let Some((level, text)) = parse_atx_heading(line) {
140 flush(&mut open, &mut out);
141 let spans = parse_inline_markdown(text);
142 out.push(MdBlock::Heading { level, spans });
143 continue;
144 }
145
146 // ── Blockquote ──────────────────────────────────────────────────────
147 if let Some(inner) = strip_blockquote_prefix(line) {
148 match &mut open {
149 Open::Blockquote(lines) => {
150 lines.push(inner.to_owned());
151 }
152 Open::None | Open::Paragraph(_) | Open::Code { .. } => {
153 flush(&mut open, &mut out);
154 open = Open::Blockquote(vec![inner.to_owned()]);
155 }
156 }
157 continue;
158 }
159
160 // If we were accumulating a blockquote and this line is NOT a `>`
161 // line, flush the blockquote first (it's terminated).
162 if matches!(&open, Open::Blockquote(_)) {
163 flush(&mut open, &mut out);
164 // Fall through: the current line starts a new block below.
165 }
166
167 // ── List item ───────────────────────────────────────────────────────
168 if let Some(item) = parse_list_item(line) {
169 flush(&mut open, &mut out);
170 let spans = parse_inline_markdown(item.text);
171 out.push(MdBlock::ListItem {
172 kind: item.kind,
173 depth: item.depth,
174 ordinal: item.ordinal,
175 spans,
176 });
177 continue;
178 }
179
180 // ── Paragraph (default) ─────────────────────────────────────────────
181 match &mut open {
182 Open::Paragraph(lines) => {
183 lines.push(line.to_owned());
184 }
185 Open::None | Open::Blockquote(_) | Open::Code { .. } => {
186 open = Open::Paragraph(vec![line.to_owned()]);
187 }
188 }
189 }
190
191 // Flush any open block at EOF.
192 flush(&mut open, &mut out);
193 out
194}
195
196// ---------------------------------------------------------------------------
197// Flush helpers
198// ---------------------------------------------------------------------------
199
200/// Flush the currently open block into `out`, resetting `open` to `None`.
201fn flush(open: &mut Open, out: &mut Vec<MdBlock>) {
202 let done = std::mem::replace(open, Open::None);
203 match done {
204 Open::None => {}
205 Open::Paragraph(lines) => {
206 if lines.is_empty() {
207 return;
208 }
209 let text = lines.join(" ");
210 let spans = parse_inline_markdown(&text);
211 out.push(MdBlock::Paragraph { spans });
212 }
213 Open::Blockquote(lines) => {
214 if lines.is_empty() {
215 return;
216 }
217 let text = lines.join(" ");
218 let spans = parse_inline_markdown(&text);
219 out.push(MdBlock::Blockquote { spans });
220 }
221 Open::Code { lang, lines } => {
222 // Unclosed fence at EOF: flush with content so far.
223 let content = lines.join("\n");
224 out.push(MdBlock::CodeBlock { lang, content });
225 }
226 }
227}
228
229// ---------------------------------------------------------------------------
230// Line classifiers
231// ---------------------------------------------------------------------------
232
/// Decide whether `line` is a thematic break.
///
/// After trimming surrounding whitespace, the line must be made up only of
/// one marker character (`-`, `*` or `_`) and ASCII spaces, and the marker
/// must occur at least three times. Examples: `---`, `* * *`, `___`.
fn is_horizontal_rule(line: &str) -> bool {
    let body = line.trim();

    // The first character picks the marker; anything else rules the line out.
    let marker = match body.chars().next() {
        Some(c) if c == '-' || c == '*' || c == '_' => c,
        _ => return false,
    };

    // Only the marker itself and plain spaces may appear on the line.
    let only_marker_and_spaces = body.chars().all(|c| c == marker || c == ' ');
    only_marker_and_spaces && body.chars().filter(|&c| c == marker).count() >= 3
}
257
/// Parse an ATX heading. Returns `(level, inner_text)` if the line matches.
///
/// A line is a heading when, after leading whitespace, it starts with one to
/// six `#` characters that are followed by a space or by the end of the line.
/// Seven or more `#`, or a `#` run followed directly by other text, is not a
/// heading and yields `None`.
///
/// `inner_text` is the text after the opening run and its single separating
/// space, with trailing whitespace removed. An optional closing `#` run is
/// removed as well, but only when it is separated from the text by a space
/// or tab (`## heading ##` → `heading`) or when it is the entire content
/// (`## ##` → empty). A `#` glued to the text is content and is kept, so
/// `# C#` yields `C#`.
fn parse_atx_heading(line: &str) -> Option<(u8, &str)> {
    let s = line.trim_start();

    // The opening run decides the level; it must be 1..=6 characters long.
    let hash_count = s.bytes().take_while(|&b| b == b'#').count();
    if hash_count == 0 || hash_count > 6 {
        return None;
    }

    // `#` is ASCII, so `hash_count` always falls on a char boundary.
    let rest = s.get(hash_count..)?;
    let inner = if rest.is_empty() {
        ""
    } else {
        // Anything other than a space right after the run means "not a heading".
        rest.strip_prefix(' ')?
    };
    let inner = inner.trim_end();

    // Handle the optional closing sequence.
    let without_closer = inner.trim_end_matches('#');
    let text = if without_closer.is_empty() {
        // The content was nothing but `#` characters: an empty heading.
        ""
    } else if without_closer.ends_with(|c: char| c == ' ' || c == '\t') {
        // A properly separated closing run: drop it and the separator.
        without_closer.trim_end()
    } else {
        // Trailing `#` attached to text (or no trailing `#` at all): keep as is.
        inner
    };

    // `hash_count` is at most 6 here, so the narrowing cast cannot truncate.
    Some((hash_count as u8, text))
}
292
/// Remove the blockquote marker from `line`.
///
/// Leading whitespace is skipped first. If the remainder starts with `>`,
/// the text after the marker is returned, with one optional space directly
/// after the `>` also removed. Returns `None` when the line is not a
/// blockquote line.
fn strip_blockquote_prefix(line: &str) -> Option<&str> {
    let quoted = line.trim_start().strip_prefix('>')?;
    // Only a single separating space belongs to the marker; the rest is content.
    Some(quoted.strip_prefix(' ').unwrap_or(quoted))
}
309
310/// Data extracted from a parsed list-item line.
311struct ListItemData<'a> {
312 kind: ListKind,
313 depth: u32,
314 ordinal: Option<u32>,
315 text: &'a str,
316}
317
318/// Try to parse `line` as a list item. The leading-space count (before the
319/// marker) determines `depth` (spaces / 2, clamped to `u32`).
320///
321/// Unordered markers: `-`, `*`, `+` followed by a space.
322/// Ordered markers: one or more ASCII digits followed by `.` then a space.
323fn parse_list_item(line: &str) -> Option<ListItemData<'_>> {
324 // Count leading spaces to determine depth.
325 let leading_spaces = line.count_ascii_lead_spaces();
326 let depth = (leading_spaces / 2) as u32;
327
328 let s = line.trim_start();
329
330 // Unordered: `-`/`*`/`+` + space.
331 if let Some(first) = s.chars().next() {
332 if matches!(first, '-' | '*' | '+') {
333 let rest = s.get(1..).unwrap_or("");
334 if rest.starts_with(' ') {
335 let text = rest.get(1..).unwrap_or("").trim_end();
336 return Some(ListItemData {
337 kind: ListKind::Unordered,
338 depth,
339 ordinal: None,
340 text,
341 });
342 }
343 }
344 }
345
346 // Ordered: digits + `.` + space.
347 let digit_end = s.bytes().take_while(|b| b.is_ascii_digit()).count();
348 if digit_end > 0 {
349 let after_digits = s.get(digit_end..)?;
350 if after_digits.starts_with(". ") {
351 let ordinal_str = s.get(..digit_end)?;
352 let ordinal: u32 = ordinal_str.parse().ok()?;
353 let text = after_digits.get(2..).unwrap_or("").trim_end();
354 return Some(ListItemData {
355 kind: ListKind::Ordered,
356 depth,
357 ordinal: Some(ordinal),
358 text,
359 });
360 }
361 }
362
363 None
364}
365
366// ---------------------------------------------------------------------------
367// Small utility trait to count leading spaces without allocation
368// ---------------------------------------------------------------------------
369
/// Extension trait that reports how many ASCII space characters a string
/// starts with.
trait CountAsciiLeadSpaces {
    /// Number of consecutive `' '` bytes at the very start of `self`.
    fn count_ascii_lead_spaces(&self) -> usize;
}

impl CountAsciiLeadSpaces for str {
    fn count_ascii_lead_spaces(&self) -> usize {
        // The index of the first non-space byte equals the number of spaces
        // before it; a string made only of spaces counts in full.
        self.bytes()
            .position(|b| b != b' ')
            .unwrap_or(self.len())
    }
}