Skip to main content

carta_readers/
dokuwiki.rs

1//! Reader for the `DokuWiki` markup language.
2//!
3//! The grammar is line-oriented at the block level and recursive-descent at the inline level. A
4//! block is recognised by its first line: a heading (`=` runs), a code or raw passthrough region
5//! (`<code>`, `<file>`, `<HTML>`, `<PHP>`), a table (rows opening with `|` or `^`), a list (`*` for
6//! bullets, `-` for ordered, indented at least two columns), an indented code block, a thematic
7//! break, a blockquote (`>` runs), or, failing all of those, a paragraph. Inline content is scanned
8//! left to right with a small pending-text buffer: emphasis (`//`), strong (`**`), underline
9//! (`__`), monospace (`''`), the `<sub>`/`<sup>`/`<del>` spans, links (`[[…]]`), media (`{{…}}`),
10//! footnotes (`((…))`), bare URLs, and angle-bracket email addresses each form their own node.
11//!
12//! When the `Smart` extension is enabled, straight quotes fold into curly [`Inline::Quoted`] runs,
13//! `--`/`---` fold into en/em dashes, and `...` folds into an ellipsis.
14
15use carta_ast::{
16    Alignment, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Format, Inline,
17    ListAttributes, ListNumberDelim, ListNumberStyle, MathType, QuoteType, Row, Table, TableBody,
18    TableFoot, TableHead, Target, to_plain_text,
19};
20use carta_core::{Extension, Reader, ReaderOptions, Result};
21use unicode_normalization::UnicodeNormalization;
22
23use crate::entities;
24use crate::heading_ids::{IdRegistry, IdScheme};
25use crate::inline_text::trim_inline_ends;
26
/// The inline-syntax toggles that the scanner threads through every level of parsing.
///
/// The struct is `Copy`, so it is handed by value to each block and inline routine.
#[derive(Debug, Clone, Copy)]
struct Ctx {
    /// Straight quotes, dashes, and ellipses fold into their typographic forms.
    /// Set from `Extension::Smart`.
    smart: bool,
    /// `$…$` and `$$…$$` spans are read as inline and display math.
    /// Set from `Extension::TexMathDollars`.
    math: bool,
}
35
/// What ends an inline scan started by an enclosing construct.
///
/// Each variant names the closing token that the scan stops at.
#[derive(Debug, Clone, Copy)]
enum Closer {
    /// A curly-quote run, closed by the matching straight quote character.
    Quote(char),
    /// A two-character emphasis run (`**`, `//`, `__`), closed by a repeat of the given character.
    Delim(char),
    /// A `''…''` monospace run, closed by `''`.
    Mono,
}
46
/// Which quote kinds already enclose the current scan. A quote of a kind already open does not open
/// again; the straight quote folds to its apostrophe or curly glyph instead.
///
/// The `Default` value has both flags clear, i.e. no quote is open.
#[derive(Debug, Clone, Copy, Default)]
struct QuoteCtx {
    /// A single-quote run already encloses the scan.
    in_single: bool,
    /// A double-quote run already encloses the scan.
    in_double: bool,
}
54
/// Parses `DokuWiki` markup into the document model.
///
/// A field-less unit struct: every setting that affects a parse comes from the `ReaderOptions`
/// handed to `Reader::read`.
#[derive(Debug, Default, Clone, Copy)]
pub struct DokuwikiReader;
58
59impl Reader for DokuwikiReader {
60    fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
61        let ctx = Ctx {
62            smart: options.extensions.contains(Extension::Smart),
63            math: options.extensions.contains(Extension::TexMathDollars),
64        };
65        let text = normalize_newlines(input);
66        let lines: Vec<&str> = text.split('\n').collect();
67        let mut index = 0;
68        let mut blocks = parse_blocks(&lines, &mut index, ctx, 0);
69        if options.extensions.contains(Extension::EastAsianLineBreaks) {
70            strip_wide_line_breaks(&mut blocks);
71        }
72        // Identifiers are derived only when `auto_identifiers` is on; the gfm variant and the
73        // ASCII fold only select the algorithm, they do not enable derivation on their own.
74        if options.extensions.contains(Extension::AutoIdentifiers)
75            && let Some(scheme) = IdScheme::select(options.extensions, false)
76        {
77            let ascii = options.extensions.contains(Extension::AsciiIdentifiers);
78            let mut registry = IdRegistry::default();
79            assign_heading_ids(&mut blocks, scheme, ascii, &mut registry);
80        }
81        Ok(Document {
82            blocks,
83            ..Default::default()
84        })
85    }
86}
87
/// The deepest level of inline or block nesting that recursive parsing will follow. Beyond it,
/// would-be delimiters are taken literally, bounding stack use on adversarial input.
///
/// Recursive routines compare their `depth` argument against this limit before descending.
const MAX_DEPTH: usize = 32;
91
/// Rewrite every `\r\n` pair and every lone `\r` as a single `\n`, so the line-oriented scanner
/// only ever meets one newline convention.
fn normalize_newlines(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\r' {
            out.push(c);
            continue;
        }
        // A carriage return swallows the line feed that directly follows it, if any.
        if chars.peek() == Some(&'\n') {
            chars.next();
        }
        out.push('\n');
    }
    out
}
97
/// Whether the characters of `needle` appear in `chars` starting exactly at index `i`. An empty
/// `needle` matches at any index; running off the end of `chars` is a mismatch.
fn matches_at(chars: &[char], i: usize, needle: &str) -> bool {
    let mut at = i;
    for expected in needle.chars() {
        if chars.get(at) != Some(&expected) {
            return false;
        }
        at += 1;
    }
    true
}
105
/// How many space characters (U+0020 only; tabs do not count) open the line.
fn leading_spaces(line: &str) -> usize {
    // A space is one byte in UTF-8, so the byte difference equals the character count.
    line.len() - line.trim_start_matches(' ').len()
}
110
/// The width of one tab stop, in columns. A tab advances to the next multiple of this width, so a
/// tab at column 0 reaches column 4 and a tab at column 5 reaches column 8.
const TAB_STOP: usize = 4;
113
114/// Expand every tab in `line` to spaces, advancing to the next tab stop. Each non-tab character
115/// counts as one column.
116fn expand_tabs(line: &str) -> String {
117    let mut out = String::new();
118    let mut col = 0;
119    for c in line.chars() {
120        if c == '\t' {
121            let next = (col / TAB_STOP + 1) * TAB_STOP;
122            for _ in col..next {
123                out.push(' ');
124            }
125            col = next;
126        } else {
127            out.push(c);
128            col += 1;
129        }
130    }
131    out
132}
133
134/// The column at which a line's first non-whitespace character sits, counting a tab as the width to
135/// the next tab stop.
136fn leading_columns(line: &str) -> usize {
137    let mut col = 0;
138    for c in line.chars() {
139        match c {
140            '\t' => col = (col / TAB_STOP + 1) * TAB_STOP,
141            ' ' => col += 1,
142            _ => break,
143        }
144    }
145    col
146}
147
148// ===================================================================================================
149// Block level
150// ===================================================================================================
151
152/// Parse a run of lines into blocks, advancing `index` past the consumed lines.
153fn parse_blocks(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Vec<Block> {
154    let mut blocks = Vec::new();
155    while *index < lines.len() {
156        let line = lines.get(*index).copied().unwrap_or("");
157        if line.trim().is_empty() {
158            *index += 1;
159            continue;
160        }
161        if let Some((level, title, trailing)) = header_split(line) {
162            blocks.push(Block::Header(
163                level,
164                Box::default(),
165                inline_content(&title, ctx, depth),
166            ));
167            *index += 1;
168            // Content after the closing run is re-parsed as a fresh block of its own.
169            if !trailing.trim().is_empty() && depth < MAX_DEPTH {
170                let tail = [trailing.as_str()];
171                let mut tail_index = 0;
172                blocks.append(&mut parse_blocks(&tail, &mut tail_index, ctx, depth + 1));
173            }
174            continue;
175        }
176        if let Some(block) = parse_code_or_raw(lines, index) {
177            blocks.push(block);
178            continue;
179        }
180        if is_table_line(line) {
181            blocks.push(parse_table(lines, index, ctx, depth));
182            continue;
183        }
184        if opens_list(line) {
185            blocks.push(parse_list(lines, index, ctx, depth));
186            continue;
187        }
188        if is_indented_code(line) {
189            blocks.push(parse_indented_code(lines, index));
190            continue;
191        }
192        if is_thematic_break(line) {
193            blocks.push(Block::HorizontalRule);
194            *index += 1;
195            continue;
196        }
197        if quote_depth(line).is_some() {
198            blocks.push(parse_quote(lines, index, ctx, depth));
199            continue;
200        }
201        blocks.append(&mut parse_paragraph(lines, index, ctx, depth));
202    }
203    blocks
204}
205
206/// Whether the line, as the next line of an open paragraph, would instead begin a new block and so
207/// interrupt the paragraph.
208fn interrupts_paragraph(line: &str) -> bool {
209    line.trim().is_empty()
210        || header_split(line).is_some()
211        || is_block_tag(line)
212        || is_table_line(line)
213        || opens_list(line)
214        || is_indented_code(line)
215        || is_thematic_break(line)
216        || quote_depth(line).is_some()
217}
218
219/// Gather consecutive non-interrupting lines into a paragraph. An embedded `<code>` or `<file>`
220/// region, even mid-line, breaks the run into the text before it, the region as its own block, and
221/// the text after it.
222fn parse_paragraph(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Vec<Block> {
223    let mut buffer = String::new();
224    let mut first = true;
225    while *index < lines.len() {
226        let line = lines.get(*index).copied().unwrap_or("");
227        if !first && interrupts_paragraph(line) {
228            break;
229        }
230        if !first {
231            buffer.push('\n');
232        }
233        buffer.push_str(line);
234        first = false;
235        *index += 1;
236    }
237    split_on_embedded_code(&buffer, ctx, depth)
238}
239
240/// Split a paragraph's text on the first embedded `<code>`/`<file>` region that has a closing tag:
241/// the text before becomes a paragraph, the region its own code block, and the remainder is split
242/// again. Text with no such region is a single paragraph (or nothing, when blank).
243fn split_on_embedded_code(text: &str, ctx: Ctx, depth: usize) -> Vec<Block> {
244    let chars: Vec<char> = text.chars().collect();
245    if depth < MAX_DEPTH
246        && let Some((start, block, end)) = find_embedded_code(&chars)
247    {
248        let mut out = Vec::new();
249        let before: String = chars.get(..start).unwrap_or(&[]).iter().collect();
250        if !before.trim().is_empty() {
251            out.push(Block::Para(inline_content(before.trim(), ctx, depth)));
252        }
253        out.push(block);
254        let after: String = chars.get(end..).unwrap_or(&[]).iter().collect();
255        if !after.trim().is_empty() {
256            out.append(&mut split_on_embedded_code(&after, ctx, depth + 1));
257        }
258        out
259    } else if text.trim().is_empty() {
260        Vec::new()
261    } else {
262        vec![Block::Para(inline_content(text.trim(), ctx, depth))]
263    }
264}
265
266/// The first `<code>`/`<file>` region in `chars` that carries a closing tag, as its start index, the
267/// parsed code block, and the index just past the closing tag.
268fn find_embedded_code(chars: &[char]) -> Option<(usize, Block, usize)> {
269    let mut i = 0;
270    while i < chars.len() {
271        if chars.get(i) == Some(&'<')
272            && (named_tag_at(chars, i, "code") || named_tag_at(chars, i, "file"))
273            && let Some((block, end)) = parse_raw_region(chars, i)
274        {
275            return Some((i, block, end));
276        }
277        i += 1;
278    }
279    None
280}
281
282/// Whether `chars` at `start` opens with `<name` followed by `>` or whitespace.
283fn named_tag_at(chars: &[char], start: usize, name: &str) -> bool {
284    if chars.get(start) != Some(&'<') {
285        return false;
286    }
287    let after = start + 1 + name.chars().count();
288    if !matches_at(chars, start + 1, name) {
289        return false;
290    }
291    matches!(chars.get(after), Some('>')) || chars.get(after).is_some_and(|c| c.is_whitespace())
292}
293
294/// A heading line split into its level, title text, and any trailing content after the closing run.
295/// A heading opens with two to six `=` and carries no leading whitespace; it closes at the first run
296/// of at least two `=` that follows the opening run, and the level is six minus one for each opening
297/// `=` beyond the first. Whatever follows the closing run is returned verbatim as trailing content.
298/// `None` when the line does not open or never closes a heading.
299fn header_split(line: &str) -> Option<(i32, String, String)> {
300    if line.starts_with(' ') || line.starts_with('\t') {
301        return None;
302    }
303    let chars: Vec<char> = line.chars().collect();
304    let open = chars.iter().take_while(|&&c| c == '=').count();
305    if !(2..=6).contains(&open) {
306        return None;
307    }
308    let mut at = open;
309    while at < chars.len() {
310        if chars.get(at) == Some(&'=') {
311            let run = run_length(&chars, at, '=');
312            if run >= 2 {
313                let title: String = chars.get(open..at).unwrap_or(&[]).iter().collect();
314                let trailing: String = chars.get(at + run..).unwrap_or(&[]).iter().collect();
315                let level = i32::try_from(7 - open).unwrap_or(1);
316                return Some((level, title.trim().to_string(), trailing));
317            }
318            at += run;
319        } else {
320            at += 1;
321        }
322    }
323    None
324}
325
326/// Whether the line, at column zero, opens a code, file, or raw passthrough region.
327fn is_block_tag(line: &str) -> bool {
328    starts_named_tag(line, "code")
329        || starts_named_tag(line, "file")
330        || line.starts_with("<HTML>")
331        || line.starts_with("<PHP>")
332}
333
/// Whether `line` starts with `<name` and the very next character is `>` or whitespace (the start
/// of an attribute list). A line that ends right after the name does not qualify.
fn starts_named_tag(line: &str, name: &str) -> bool {
    line.strip_prefix('<')
        .and_then(|after_bracket| after_bracket.strip_prefix(name))
        .and_then(|after_name| after_name.chars().next())
        .is_some_and(|c| c == '>' || c.is_whitespace())
}
341
342/// Whether the line opens a table row: it begins with a cell delimiter and yields at least one cell.
343/// A lone delimiter with nothing to delimit is ordinary text, not a degenerate one-row table.
344fn is_table_line(line: &str) -> bool {
345    (line.starts_with('|') || line.starts_with('^')) && !split_row(line).is_empty()
346}
347
348/// Whether the line is an indented code line: indented at least two columns and carrying content.
349fn is_indented_code(line: &str) -> bool {
350    leading_columns(line) >= 2 && !line.trim().is_empty()
351}
352
/// Whether the line consists of at least four `-` characters and nothing else. Even a trailing
/// space disqualifies it.
fn is_thematic_break(line: &str) -> bool {
    // `-` is a single ASCII byte, so a byte-level check is exact for any UTF-8 input.
    line.bytes().all(|b| b == b'-') && line.len() >= 4
}
358
359/// The list marker on a line: its indentation and whether it is ordered (`-`) rather than a bullet
360/// (`*`). A marker needs at least two leading spaces and a space after the marker character.
361fn list_marker(line: &str) -> Option<(usize, bool)> {
362    let indent = leading_spaces(line);
363    if indent < 2 {
364        return None;
365    }
366    let chars: Vec<char> = line.chars().collect();
367    let marker = chars.get(indent)?;
368    let ordered = match marker {
369        '*' => false,
370        '-' => true,
371        _ => return None,
372    };
373    if chars.get(indent + 1) == Some(&' ') {
374        Some((indent, ordered))
375    } else {
376        None
377    }
378}
379
/// Map a list line's indentation to its nesting level. Each level spans two columns: indents 2-3
/// are level one, 4-5 level two, and so forth.
fn list_level(indent: usize) -> usize {
    const COLUMNS_PER_LEVEL: usize = 2;
    indent / COLUMNS_PER_LEVEL
}
385
386/// Whether a line opens a list: it carries a list marker that sits at the top level (level one).
387/// A marker indented deeper than that does not begin a list and is left to become indented code.
388fn opens_list(line: &str) -> bool {
389    list_marker(line).is_some_and(|(indent, _)| list_level(indent) == 1)
390}
391
/// The blockquote depth of a line, i.e. the length of its leading run of `>`. Returns `None` when
/// the line does not start with `>`, or when nothing follows the run apart from at most one
/// separating space; such a line is treated as ordinary text.
fn quote_depth(line: &str) -> Option<usize> {
    let body = line.trim_start_matches('>');
    // `>` is one byte, so the byte difference is the number of markers removed.
    let depth = line.len() - body.len();
    if depth == 0 {
        return None;
    }
    let content = body.strip_prefix(' ').unwrap_or(body);
    (!content.is_empty()).then_some(depth)
}
403
/// The kind of region a block-level passthrough tag opens. The kind decides which block the
/// region's content is wrapped in.
enum RawKind {
    /// `<code …>` or `<file …>`: a code block whose first attribute word, when not `-`, is a class.
    Code,
    /// `<HTML>`: an HTML raw block.
    Html,
    /// `<PHP>`: a PHP snippet, wrapped in `<?php … ?>` and emitted as an HTML raw block.
    Php,
}
413
414/// Parse a code, file, or raw passthrough region beginning at the current line. The opening tag may
415/// carry content on its own line after `>`, and the region runs to its closing tag, possibly several
416/// lines below. A region that is never closed is not a block here; it stays as ordinary text.
417fn parse_code_or_raw(lines: &[&str], index: &mut usize) -> Option<Block> {
418    let line = lines.get(*index).copied().unwrap_or("");
419    if !is_block_tag(line) {
420        return None;
421    }
422    let joined: String = lines.get(*index..).unwrap_or(&[]).join("\n");
423    let chars: Vec<char> = joined.chars().collect();
424    let (block, end) = parse_raw_region(&chars, 0)?;
425    let consumed = chars
426        .get(..end)
427        .unwrap_or(&[])
428        .iter()
429        .filter(|&&c| c == '\n')
430        .count();
431    *index += consumed + 1;
432    Some(block)
433}
434
435/// Parse a `<code>`/`<file>`/`<HTML>`/`<PHP>` passthrough region beginning at `start`, returning the
436/// block and the index just past its closing tag. `None` when no such opener sits at `start` or the
437/// region has no closing tag.
438fn parse_raw_region(chars: &[char], start: usize) -> Option<(Block, usize)> {
439    let (kind, close) = if named_tag_at(chars, start, "code") {
440        (RawKind::Code, "</code>")
441    } else if named_tag_at(chars, start, "file") {
442        (RawKind::Code, "</file>")
443    } else if matches_at(chars, start, "<HTML>") {
444        (RawKind::Html, "</HTML>")
445    } else if matches_at(chars, start, "<PHP>") {
446        (RawKind::Php, "</PHP>")
447    } else {
448        return None;
449    };
450    let open_end = (start..chars.len()).find(|&i| chars.get(i) == Some(&'>'))?;
451    let attr_text: String = chars
452        .get(start + 1..open_end)
453        .unwrap_or(&[])
454        .iter()
455        .collect();
456    let content_start = open_end + 1;
457    let close_at = find_subsequence(chars, content_start, close)?;
458    let mut content: String = chars
459        .get(content_start..close_at)
460        .unwrap_or(&[])
461        .iter()
462        .collect();
463    if let Some(stripped) = content.strip_prefix('\n') {
464        content = stripped.to_string();
465    }
466    let end = close_at + close.chars().count();
467    let block = match kind {
468        RawKind::Code => {
469            let class = code_language(&attr_text);
470            let attr = Attr {
471                classes: class.into_iter().map(Into::into).collect(),
472                ..Default::default()
473            };
474            Block::CodeBlock(Box::new(attr), content.into())
475        }
476        RawKind::Html => Block::RawBlock(Format("html".into()), content.into()),
477        RawKind::Php => {
478            Block::RawBlock(Format("html".into()), format!("<?php {content} ?>").into())
479        }
480    };
481    Some((block, end))
482}
483
/// Pick the language class out of a code/file opener's text (the part between `<` and `>`).
///
/// A leading `code` or `file` word is the tag name and is skipped. The next word is the language,
/// unless it is `-` (an explicit "no language") or missing, in which case the result is `None`.
fn code_language(attr_text: &str) -> Option<String> {
    let mut words = attr_text.split_whitespace();
    let candidate = match words.next()? {
        // The attribute slice still carries the tag name; the language is the word after it.
        "code" | "file" => words.next()?,
        other => other,
    };
    (candidate != "-").then(|| candidate.to_string())
}
500
/// The index at which the first occurrence of `needle` at or after `from` STARTS in `chars`, or
/// `None` when there is no such occurrence (including when `from` lies beyond the end).
///
/// The result points at the first character of the match, not past it; a caller that needs the
/// position after the match adds the needle's character count itself. An empty `needle` matches
/// immediately at `from`.
fn find_subsequence(chars: &[char], from: usize, needle: &str) -> Option<usize> {
    // Decode the needle once instead of re-walking its UTF-8 at every candidate position.
    let pattern: Vec<char> = needle.chars().collect();
    let tail = chars.get(from..)?;
    if pattern.is_empty() {
        // `windows(0)` would panic; an empty needle trivially matches where the search begins.
        return Some(from);
    }
    tail.windows(pattern.len())
        .position(|window| window == pattern.as_slice())
        .map(|offset| from + offset)
}
506
507/// Parse a run of indented code lines. Tabs are expanded to spaces, then the common two-column indent
508/// is stripped from each line.
509fn parse_indented_code(lines: &[&str], index: &mut usize) -> Block {
510    let mut out = String::new();
511    while *index < lines.len() {
512        let line = lines.get(*index).copied().unwrap_or("");
513        if !is_indented_code(line) {
514            break;
515        }
516        let expanded = expand_tabs(line);
517        let body = expanded.get(2..).unwrap_or("");
518        out.push_str(body);
519        out.push('\n');
520        *index += 1;
521    }
522    Block::CodeBlock(Box::default(), out.into())
523}
524
525/// Parse a thematically grouped run of list lines into one bullet or ordered list.
526fn parse_list(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Block {
527    let start = *index;
528    let mut items = Vec::new();
529    while *index < lines.len() {
530        let line = lines.get(*index).copied().unwrap_or("");
531        let Some((indent, ordered)) = list_marker(line) else {
532            break;
533        };
534        let text: String = line.chars().skip(indent + 2).collect();
535        items.push((list_level(indent), ordered, text));
536        *index += 1;
537    }
538    // A line whose level jumps more than one above the line before it does not belong to the list;
539    // it (and everything after) is left to be parsed afresh — an over-indented marker becomes
540    // indented code.
541    let cutoff = list_cutoff(&items);
542    let consumed = items.get(..cutoff).unwrap_or(&[]);
543    let mut pos = 0;
544    let list = build_list(consumed, &mut pos, ctx, depth);
545    // A marker-type switch or a dedent below the opening level also ends this list; rewind so the
546    // remaining lines are parsed fresh on the next pass.
547    *index = start + pos;
548    list
549}
550
/// How many items from the front of `items` belong to one list. The run stops just before the
/// first item whose level (the tuple's first field) is more than one above the level of the item
/// before it; with no such jump, every item belongs.
fn list_cutoff(items: &[(usize, bool, String)]) -> usize {
    items
        .windows(2)
        .position(|pair| matches!(pair, [prev, next] if next.0 > prev.0 + 1))
        .map_or(items.len(), |gap| gap + 1)
}
565
566/// Build one list (and its nested sublists) from the collected items, advancing `pos`. A deeper
567/// level opens a child list; the same level with the other marker ends this list.
568fn build_list(items: &[(usize, bool, String)], pos: &mut usize, ctx: Ctx, depth: usize) -> Block {
569    let (base_level, ordered) = items
570        .get(*pos)
571        .map_or((0, false), |(level, ordered, _)| (*level, *ordered));
572    let mut entries: Vec<Vec<Block>> = Vec::new();
573    while let Some((level, item_ordered, text)) = items.get(*pos) {
574        if *level < base_level {
575            break;
576        }
577        if *level == base_level {
578            if *item_ordered != ordered {
579                break;
580            }
581            let mut blocks = vec![Block::Plain(inline_content(text, ctx, depth))];
582            *pos += 1;
583            if depth < MAX_DEPTH && items.get(*pos).is_some_and(|(l, _, _)| *l > base_level) {
584                blocks.push(build_list(items, pos, ctx, depth + 1));
585            }
586            entries.push(blocks);
587        } else if depth < MAX_DEPTH {
588            let child = build_list(items, pos, ctx, depth + 1);
589            match entries.last_mut() {
590                Some(last) => last.push(child),
591                None => entries.push(vec![child]),
592            }
593        } else {
594            *pos += 1;
595        }
596    }
597    if ordered {
598        Block::OrderedList(
599            ListAttributes {
600                start: 1,
601                style: ListNumberStyle::DefaultStyle,
602                delim: ListNumberDelim::DefaultDelim,
603            },
604            entries,
605        )
606    } else {
607        Block::BulletList(entries)
608    }
609}
610
611/// Parse a run of blockquote lines, nesting by `>` depth.
612fn parse_quote(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Block {
613    let mut items = Vec::new();
614    while *index < lines.len() {
615        let line = lines.get(*index).copied().unwrap_or("");
616        let Some(level) = quote_depth(line) else {
617            break;
618        };
619        let rest = line.get(level..).unwrap_or("");
620        let rest = rest.strip_prefix(' ').unwrap_or(rest);
621        items.push((level, rest.to_string()));
622        *index += 1;
623    }
624    let mut pos = 0;
625    Block::BlockQuote(build_quote(&items, &mut pos, 1, ctx, depth))
626}
627
628/// Build the blocks of a blockquote at nesting `level`, recursing into deeper runs.
629fn build_quote(
630    items: &[(usize, String)],
631    pos: &mut usize,
632    level: usize,
633    ctx: Ctx,
634    depth: usize,
635) -> Vec<Block> {
636    let mut blocks = Vec::new();
637    while let Some((line_level, _)) = items.get(*pos) {
638        if *line_level < level {
639            break;
640        }
641        if *line_level == level {
642            let mut inlines = Vec::new();
643            while let Some((line_level, text)) = items.get(*pos) {
644                if *line_level != level {
645                    break;
646                }
647                if !inlines.is_empty() {
648                    inlines.push(Inline::LineBreak);
649                }
650                inlines.extend(inline_content(text, ctx, depth));
651                *pos += 1;
652            }
653            blocks.push(Block::Plain(inlines));
654        } else if depth < MAX_DEPTH {
655            blocks.push(Block::BlockQuote(build_quote(
656                items,
657                pos,
658                level + 1,
659                ctx,
660                depth + 1,
661            )));
662        } else {
663            *pos += 1;
664        }
665    }
666    blocks
667}
668
669// ===================================================================================================
670// Heading identifiers
671// ===================================================================================================
672
673/// Assign a derived identifier to every heading in document order, descending through block
674/// containers. The slug is formed from the heading's plain text — folded to ASCII first when `ascii`
675/// is set — and made unique within the document by the registry.
676fn assign_heading_ids(
677    blocks: &mut [Block],
678    scheme: IdScheme,
679    ascii: bool,
680    registry: &mut IdRegistry,
681) {
682    for block in blocks {
683        match block {
684            Block::Header(_, attr, inlines) => {
685                let text = to_plain_text(inlines);
686                let text = if ascii { asciify(&text) } else { text };
687                attr.id = registry.assign(scheme, &text).into();
688            }
689            Block::BlockQuote(children)
690            | Block::Div(_, children)
691            | Block::Figure(_, _, children) => {
692                assign_heading_ids(children, scheme, ascii, registry);
693            }
694            Block::BulletList(items) | Block::OrderedList(_, items) => {
695                for item in items {
696                    assign_heading_ids(item, scheme, ascii, registry);
697                }
698            }
699            _ => {}
700        }
701    }
702}
703
704/// Fold text to ASCII by canonical decomposition, dropping every character that is not ASCII so a
705/// letter carrying a diacritic keeps its base letter.
706fn asciify(text: &str) -> String {
707    text.nfd().filter(char::is_ascii).collect()
708}
709
710// ===================================================================================================
711// East Asian line breaks
712// ===================================================================================================
713
714/// Drop soft line breaks that fall between two wide East Asian characters, where the break carries no
715/// visual width. The surrounding text runs are left separate rather than merged.
716fn strip_wide_line_breaks(blocks: &mut [Block]) {
717    for block in blocks {
718        match block {
719            Block::Para(inlines) | Block::Plain(inlines) | Block::Header(_, _, inlines) => {
720                strip_wide_in_inlines(inlines);
721            }
722            Block::BlockQuote(children)
723            | Block::Div(_, children)
724            | Block::Figure(_, _, children) => {
725                strip_wide_line_breaks(children);
726            }
727            Block::BulletList(items) | Block::OrderedList(_, items) => {
728                for item in items {
729                    strip_wide_line_breaks(item);
730                }
731            }
732            _ => {}
733        }
734    }
735}
736
737/// Drop width-free soft breaks within one inline sequence, recursing into nested inline containers.
738fn strip_wide_in_inlines(inlines: &mut Vec<Inline>) {
739    for inline in inlines.iter_mut() {
740        match inline {
741            Inline::Emph(children)
742            | Inline::Underline(children)
743            | Inline::Strong(children)
744            | Inline::Strikeout(children)
745            | Inline::Superscript(children)
746            | Inline::Subscript(children)
747            | Inline::SmallCaps(children)
748            | Inline::Quoted(_, children)
749            | Inline::Cite(_, children)
750            | Inline::Link(_, children, _)
751            | Inline::Image(_, children, _)
752            | Inline::Span(_, children) => strip_wide_in_inlines(children),
753            Inline::Note(blocks) => strip_wide_line_breaks(blocks),
754            _ => {}
755        }
756    }
757    let mut i = 0;
758    while i < inlines.len() {
759        if matches!(inlines.get(i), Some(Inline::SoftBreak)) {
760            let prev_wide = i
761                .checked_sub(1)
762                .and_then(|p| inlines.get(p))
763                .and_then(last_char)
764                .is_some_and(is_east_asian_wide);
765            let next_wide = inlines
766                .get(i + 1)
767                .and_then(first_char)
768                .is_some_and(is_east_asian_wide);
769            if prev_wide && next_wide {
770                inlines.remove(i);
771                continue;
772            }
773        }
774        i += 1;
775    }
776}
777
778/// The last character of an inline's textual content, descending into nested containers.
779fn last_char(inline: &Inline) -> Option<char> {
780    match inline {
781        Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
782            s.chars().last()
783        }
784        Inline::Emph(children)
785        | Inline::Underline(children)
786        | Inline::Strong(children)
787        | Inline::Strikeout(children)
788        | Inline::Superscript(children)
789        | Inline::Subscript(children)
790        | Inline::SmallCaps(children)
791        | Inline::Quoted(_, children)
792        | Inline::Cite(_, children)
793        | Inline::Link(_, children, _)
794        | Inline::Image(_, children, _)
795        | Inline::Span(_, children) => children.iter().rev().find_map(last_char),
796        _ => None,
797    }
798}
799
800/// The first character of an inline's textual content, descending into nested containers.
801fn first_char(inline: &Inline) -> Option<char> {
802    match inline {
803        Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
804            s.chars().next()
805        }
806        Inline::Emph(children)
807        | Inline::Underline(children)
808        | Inline::Strong(children)
809        | Inline::Strikeout(children)
810        | Inline::Superscript(children)
811        | Inline::Subscript(children)
812        | Inline::SmallCaps(children)
813        | Inline::Quoted(_, children)
814        | Inline::Cite(_, children)
815        | Inline::Link(_, children, _)
816        | Inline::Image(_, children, _)
817        | Inline::Span(_, children) => children.iter().find_map(first_char),
818        _ => None,
819    }
820}
821
/// Whether a character occupies a wide cell in East Asian text (Unicode East Asian Width Wide or
/// Fullwidth). Halfwidth and Ambiguous-width characters are excluded.
///
/// The test is a lookup against a fixed table of inclusive code-point ranges.
fn is_east_asian_wide(c: char) -> bool {
    /// Inclusive `(first, last)` code-point ranges treated as wide.
    const WIDE_RANGES: &[(u32, u32)] = &[
        (0x1100, 0x115F),
        (0x2E80, 0x2EFF),
        (0x2F00, 0x2FDF),
        (0x2FF0, 0x2FFF),
        (0x3000, 0x303E),
        (0x3041, 0x33FF),
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xA000, 0xA4CF),
        (0xA960, 0xA97F),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF00, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1B000, 0x1B16F),
        (0x1F200, 0x1F2FF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ];
    let code = u32::from(c);
    WIDE_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code))
}
847
848// ===================================================================================================
849// Inline level
850// ===================================================================================================
851
/// The number of speculative delimiter openings an inline scan may attempt before it treats the
/// rest of its input as literal text. The allowance grows with the input (eight units per
/// character plus a flat 64) and is capped at 200 000, so an adversarial delimiter-dense run has
/// bounded backtracking work while genuine documents stay far below the limit.
fn inline_budget(len: usize) -> usize {
    const PER_CHAR: usize = 8;
    const BASE: usize = 64;
    const CEILING: usize = 200_000;
    let scaled = len.saturating_mul(PER_CHAR).saturating_add(BASE);
    if scaled > CEILING { CEILING } else { scaled }
}
859
860/// Parse a block's inline content: scan it, then drop leading and trailing whitespace.
861fn inline_content(text: &str, ctx: Ctx, depth: usize) -> Vec<Inline> {
862    let chars: Vec<char> = text.chars().collect();
863    let mut pos = 0;
864    let mut budget = inline_budget(chars.len());
865    let (mut inlines, _) = scan(
866        &chars,
867        &mut pos,
868        None,
869        ctx,
870        QuoteCtx::default(),
871        depth,
872        &mut budget,
873    );
874    trim_inline_ends(&mut inlines);
875    inlines
876}
877
878/// Scan a slice of characters as inline content with no surrounding-quote context.
879fn scan_slice(chars: &[char], ctx: Ctx, depth: usize) -> Vec<Inline> {
880    let mut pos = 0;
881    let mut budget = inline_budget(chars.len());
882    let (inlines, _) = scan(
883        chars,
884        &mut pos,
885        None,
886        ctx,
887        QuoteCtx::default(),
888        depth,
889        &mut budget,
890    );
891    inlines
892}
893
894/// Push the buffered text as a `Str` and clear the buffer.
895fn flush(pending: &mut String, out: &mut Vec<Inline>) {
896    if !pending.is_empty() {
897        out.push(Inline::Str(std::mem::take(pending).into()));
898    }
899}
900
901/// Scan characters into inlines from `*pos`. When `end` is set, the scan stops and reports `true` on
902/// the matching closing delimiter; otherwise it runs to the end and reports `false`.
903#[allow(clippy::too_many_lines)]
904fn scan(
905    chars: &[char],
906    pos: &mut usize,
907    end: Option<Closer>,
908    ctx: Ctx,
909    qctx: QuoteCtx,
910    depth: usize,
911    budget: &mut usize,
912) -> (Vec<Inline>, bool) {
913    let start = *pos;
914    let mut out: Vec<Inline> = Vec::new();
915    let mut pending = String::new();
916    while let Some(&c) = chars.get(*pos) {
917        if let Some(closer) = end
918            && at_closer(chars, *pos, start, closer)
919        {
920            flush(&mut pending, &mut out);
921            *pos += closer_width(closer);
922            return (coalesce(out), true);
923        }
924        if c.is_ascii_alphabetic()
925            && boundary_before(chars, *pos)
926            && let Some((link, end)) = try_autolink(chars, *pos)
927        {
928            flush(&mut pending, &mut out);
929            out.push(link);
930            *pos = end;
931            continue;
932        }
933        match c {
934            ' ' | '\t' | '\n' => scan_whitespace_run(chars, pos, &mut pending, &mut out),
935            '&' => {
936                if let Some((decoded, next)) =
937                    entities::read_reference(chars, *pos, chars.len(), true)
938                {
939                    pending.push_str(&decoded);
940                    *pos = next;
941                } else {
942                    pending.push('&');
943                    *pos += 1;
944                }
945            }
946            '\\' if chars.get(*pos + 1) == Some(&'\\') => {
947                scan_hard_break(chars, pos, &mut pending, &mut out);
948            }
949            '\\' if ctx.math && chars.get(*pos + 1) == Some(&'$') => {
950                // A backslash-escaped dollar is literal text, not a math delimiter.
951                pending.push('\\');
952                pending.push('$');
953                *pos += 2;
954            }
955            '*' if chars.get(*pos + 1) == Some(&'*') && depth < MAX_DEPTH => {
956                handle_delim(
957                    chars,
958                    pos,
959                    '*',
960                    ctx,
961                    qctx,
962                    depth,
963                    budget,
964                    &mut pending,
965                    &mut out,
966                    Inline::Strong,
967                );
968            }
969            '/' if chars.get(*pos + 1) == Some(&'/') && depth < MAX_DEPTH => {
970                handle_delim(
971                    chars,
972                    pos,
973                    '/',
974                    ctx,
975                    qctx,
976                    depth,
977                    budget,
978                    &mut pending,
979                    &mut out,
980                    Inline::Emph,
981                );
982            }
983            '_' if chars.get(*pos + 1) == Some(&'_') && depth < MAX_DEPTH => {
984                handle_delim(
985                    chars,
986                    pos,
987                    '_',
988                    ctx,
989                    qctx,
990                    depth,
991                    budget,
992                    &mut pending,
993                    &mut out,
994                    Inline::Underline,
995                );
996            }
997            '\'' if chars.get(*pos + 1) == Some(&'\'') => {
998                handle_mono_or_quote(chars, pos, ctx, qctx, depth, budget, &mut pending, &mut out);
999            }
1000            '\'' | '"' if ctx.smart => {
1001                handle_quote(
1002                    chars,
1003                    pos,
1004                    c,
1005                    ctx,
1006                    qctx,
1007                    depth,
1008                    budget,
1009                    &mut pending,
1010                    &mut out,
1011                );
1012            }
1013            '$' if ctx.math => {
1014                handle_math(chars, pos, &mut pending, &mut out);
1015            }
1016            '-' if ctx.smart => {
1017                let run = run_length(chars, *pos, '-');
1018                pending.push_str(&fold_dashes(run));
1019                *pos += run;
1020            }
1021            '.' if ctx.smart => {
1022                let run = run_length(chars, *pos, '.');
1023                pending.push_str(&fold_ellipsis(run));
1024                *pos += run;
1025            }
1026            '[' if chars.get(*pos + 1) == Some(&'[') && depth < MAX_DEPTH => {
1027                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
1028            }
1029            '{' if chars.get(*pos + 1) == Some(&'{') && depth < MAX_DEPTH => {
1030                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
1031            }
1032            '(' if chars.get(*pos + 1) == Some(&'(') && depth < MAX_DEPTH => {
1033                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
1034            }
1035            '%' if chars.get(*pos + 1) == Some(&'%') => {
1036                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
1037            }
1038            '<' if depth < MAX_DEPTH => {
1039                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
1040            }
1041            '~' if chars.get(*pos + 1) == Some(&'~') => {
1042                handle_construct(chars, pos, c, ctx, depth, budget, &mut pending, &mut out);
1043            }
1044            other => {
1045                pending.push(other);
1046                *pos += 1;
1047            }
1048        }
1049    }
1050    flush(&mut pending, &mut out);
1051    (coalesce(out), end.is_none())
1052}
1053
1054/// Whether the scan's closing delimiter sits at `pos`. The two-character closers must follow at least
1055/// one character of content and lean against non-whitespace on their left.
1056fn at_closer(chars: &[char], pos: usize, start: usize, closer: Closer) -> bool {
1057    match closer {
1058        Closer::Quote(quote) => {
1059            chars.get(pos) == Some(&quote) && can_close_quote(chars, pos, quote)
1060        }
1061        Closer::Delim(delim) => {
1062            chars.get(pos) == Some(&delim)
1063                && chars.get(pos + 1) == Some(&delim)
1064                && pos > start
1065                && chars.get(pos - 1).is_some_and(|c| !c.is_whitespace())
1066        }
1067        Closer::Mono => {
1068            chars.get(pos) == Some(&'\'')
1069                && chars.get(pos + 1) == Some(&'\'')
1070                && pos > start
1071                && chars.get(pos - 1).is_some_and(|c| !c.is_whitespace())
1072        }
1073    }
1074}
1075
1076/// The number of characters a closing delimiter occupies.
1077fn closer_width(closer: Closer) -> usize {
1078    match closer {
1079        Closer::Quote(_) => 1,
1080        Closer::Delim(_) | Closer::Mono => 2,
1081    }
1082}
1083
1084/// Handle a `''` opener: a monospace run when both delimiters flank non-whitespace content,
1085/// otherwise — under smart typography — the two quotes fold individually, and otherwise the opener
1086/// stays literal.
1087#[allow(clippy::too_many_arguments)]
1088fn handle_mono_or_quote(
1089    chars: &[char],
1090    pos: &mut usize,
1091    ctx: Ctx,
1092    qctx: QuoteCtx,
1093    depth: usize,
1094    budget: &mut usize,
1095    pending: &mut String,
1096    out: &mut Vec<Inline>,
1097) {
1098    if depth < MAX_DEPTH
1099        && let Some((node, end)) = parse_mono(chars, *pos, ctx, depth, budget)
1100    {
1101        flush(pending, out);
1102        out.push(node);
1103        *pos = end;
1104    } else if ctx.smart {
1105        handle_quote(chars, pos, '\'', ctx, qctx, depth, budget, pending, out);
1106    } else {
1107        pending.push('\'');
1108        *pos += 1;
1109    }
1110}
1111
1112/// Consume a run of spaces, tabs, and newlines at `*pos`, emitting a single break: a soft break
1113/// when the run contains a newline, an ordinary space otherwise.
1114fn scan_whitespace_run(
1115    chars: &[char],
1116    pos: &mut usize,
1117    pending: &mut String,
1118    out: &mut Vec<Inline>,
1119) {
1120    flush(pending, out);
1121    let mut has_newline = false;
1122    while let Some(&w) = chars.get(*pos) {
1123        match w {
1124            '\n' => {
1125                has_newline = true;
1126                *pos += 1;
1127            }
1128            ' ' | '\t' => *pos += 1,
1129            _ => break,
1130        }
1131    }
1132    out.push(if has_newline {
1133        Inline::SoftBreak
1134    } else {
1135        Inline::Space
1136    });
1137}
1138
1139/// Handle a `\\` sequence at `*pos`: a hard line break when followed by whitespace or the line end,
1140/// and two literal backslashes otherwise.
1141fn scan_hard_break(chars: &[char], pos: &mut usize, pending: &mut String, out: &mut Vec<Inline>) {
1142    let after = chars.get(*pos + 2);
1143    if after.is_none_or(|c| c.is_whitespace()) {
1144        flush(pending, out);
1145        out.push(Inline::LineBreak);
1146        *pos += 2;
1147        if after.is_some() {
1148            *pos += 1;
1149        }
1150    } else {
1151        pending.push('\\');
1152        pending.push('\\');
1153        *pos += 2;
1154    }
1155}
1156
1157/// Try to parse the inline construct introduced by `c` at `pos`: a link (`[[`), media (`{{`), a
1158/// footnote (`((`), a verbatim span (`%%`), an angle-bracket construct (`<`), or a dropped macro
1159/// (`~~`). Returns the produced nodes and the index past the construct.
1160fn scan_construct(
1161    chars: &[char],
1162    pos: usize,
1163    c: char,
1164    ctx: Ctx,
1165    depth: usize,
1166) -> Option<(Vec<Inline>, usize)> {
1167    match c {
1168        '[' => parse_link(chars, pos).map(|(node, end)| (vec![node], end)),
1169        '{' => parse_media(chars, pos).map(|(node, end)| (vec![node], end)),
1170        '(' => parse_footnote(chars, pos, ctx, depth).map(|(node, end)| (vec![node], end)),
1171        '%' => parse_nowiki_pct(chars, pos),
1172        '<' => parse_angle(chars, pos, ctx, depth),
1173        '~' => parse_macro(chars, pos).map(|end| (Vec::new(), end)),
1174        _ => None,
1175    }
1176}
1177
1178/// Dispatch an inline construct opener at `*pos`: on a successful parse the produced nodes are
1179/// appended and `*pos` advances past the construct; otherwise the opener is buffered literally and
1180/// `*pos` advances one character.
1181#[allow(clippy::too_many_arguments)]
1182fn handle_construct(
1183    chars: &[char],
1184    pos: &mut usize,
1185    c: char,
1186    ctx: Ctx,
1187    depth: usize,
1188    budget: &mut usize,
1189    pending: &mut String,
1190    out: &mut Vec<Inline>,
1191) {
1192    // Parsing a construct recurses (footnotes even re-parse their interior as blocks), and an
1193    // enclosing emphasis run that fails to close discards its scan and re-scans the same span — so
1194    // the same construct can be parsed many times over. Charge the shared backtracking budget by the
1195    // span consumed, so that repeated re-parsing of a region cannot exceed the input-proportional
1196    // budget and the total work stays linear.
1197    if *budget > 0
1198        && let Some((mut nodes, end)) = scan_construct(chars, *pos, c, ctx, depth)
1199    {
1200        *budget = budget.saturating_sub((end - *pos).max(1));
1201        flush(pending, out);
1202        out.append(&mut nodes);
1203        *pos = end;
1204    } else {
1205        pending.push(c);
1206        *pos += 1;
1207    }
1208}
1209
1210/// Wrap a generic two-character emphasis run, or, when no valid closer exists, emit the opener
1211/// literally and resume scanning right after it. The run's content is scanned recursively, so a
1212/// would-be inner marker that cannot close is taken as text and an outer marker that the inner run
1213/// consumed past never pairs.
1214#[allow(clippy::too_many_arguments)]
1215fn handle_delim(
1216    chars: &[char],
1217    pos: &mut usize,
1218    delim: char,
1219    ctx: Ctx,
1220    qctx: QuoteCtx,
1221    depth: usize,
1222    budget: &mut usize,
1223    pending: &mut String,
1224    out: &mut Vec<Inline>,
1225    wrap: fn(Vec<Inline>) -> Inline,
1226) {
1227    let begin = *pos;
1228    // The opener must lean against following non-whitespace content, and searching for its closer
1229    // must stay within the backtracking budget.
1230    if !is_ws_opt(chars.get(begin + 2).copied()) && *budget > 0 {
1231        *budget -= 1;
1232        let mut scan_pos = begin + 2;
1233        let (inner, closed) = scan(
1234            chars,
1235            &mut scan_pos,
1236            Some(Closer::Delim(delim)),
1237            ctx,
1238            qctx,
1239            depth + 1,
1240            budget,
1241        );
1242        if closed {
1243            flush(pending, out);
1244            out.push(wrap(inner));
1245            *pos = scan_pos;
1246            return;
1247        }
1248        // No closer: the opener is literal text and the speculative scan is thrown away, but the
1249        // outer scan resumes just past the opener and, in a delimiter-dense run, would re-scan the
1250        // same span from each following opener in turn. Charge the shared budget by the span scanned
1251        // so repeated failed opens stay linear in the input rather than quadratic (an OOM vector).
1252        *budget = budget.saturating_sub(scan_pos - begin);
1253    }
1254    pending.push(delim);
1255    pending.push(delim);
1256    *pos = begin + 2;
1257}
1258
1259/// Try to open a curly-quote run at `*pos`; on a missing closer, leave the opener as the apt quote
1260/// glyph and let the scan reprocess what follows. An empty run is kept for double quotes but folds to
1261/// apostrophes for single quotes.
1262#[allow(clippy::too_many_arguments)]
1263fn handle_quote(
1264    chars: &[char],
1265    pos: &mut usize,
1266    quote: char,
1267    ctx: Ctx,
1268    qctx: QuoteCtx,
1269    depth: usize,
1270    budget: &mut usize,
1271    pending: &mut String,
1272    out: &mut Vec<Inline>,
1273) {
1274    let begin = *pos;
1275    if can_open_quote(chars, begin, quote, qctx) && depth < MAX_DEPTH && *budget > 0 {
1276        *budget -= 1;
1277        *pos = begin + 1;
1278        let mut inner_qctx = qctx;
1279        if quote == '\'' {
1280            inner_qctx.in_single = true;
1281        } else {
1282            inner_qctx.in_double = true;
1283        }
1284        let (inner, closed) = scan(
1285            chars,
1286            pos,
1287            Some(Closer::Quote(quote)),
1288            ctx,
1289            inner_qctx,
1290            depth + 1,
1291            budget,
1292        );
1293        if closed && (quote == '"' || !inner.is_empty()) {
1294            flush(pending, out);
1295            out.push(Inline::Quoted(quote_type(quote), inner));
1296            return;
1297        }
1298        // As in `handle_delim`: an unpaired opener rewinds to just past itself, so charge the span
1299        // the failed scan covered to keep a quote-dense run from being re-scanned quadratically.
1300        *budget = budget.saturating_sub(pos.saturating_sub(begin));
1301        *pos = begin + 1;
1302    } else {
1303        *pos = begin + 1;
1304    }
1305    pending.push(quote_glyph(chars, begin, quote));
1306}
1307
1308/// The quote-node kind for a straight quote character.
1309fn quote_type(quote: char) -> QuoteType {
1310    if quote == '\'' {
1311        QuoteType::SingleQuote
1312    } else {
1313        QuoteType::DoubleQuote
1314    }
1315}
1316
1317/// The curly glyph a non-paired straight quote folds into: an apostrophe for `'`, and an opening or
1318/// closing double quote depending on which side it leans.
1319fn quote_glyph(chars: &[char], pos: usize, quote: char) -> char {
1320    if quote == '\'' {
1321        '\u{2019}'
1322    } else if left_flanking(chars, pos) {
1323        '\u{201c}'
1324    } else {
1325        '\u{201d}'
1326    }
1327}
1328
1329/// Monospace run `''…''`: its interior is parsed and then flattened to text. The run forms only when
1330/// the opener is followed by a non-space, the closer preceded by a non-space, and the interior is
1331/// non-empty; otherwise the opener is not a monospace marker.
1332///
1333/// Under smart typography the interior is scanned with quote folding active: any typographic quotes
1334/// that pair within the run are rendered as their glyphs, but if a straight quote inside disrupts the
1335/// closing `''` so the run never closes, the opener is not a monospace marker after all.
1336fn parse_mono(
1337    chars: &[char],
1338    begin: usize,
1339    ctx: Ctx,
1340    depth: usize,
1341    budget: &mut usize,
1342) -> Option<(Inline, usize)> {
1343    if is_ws_opt(chars.get(begin + 2).copied()) {
1344        return None;
1345    }
1346    if ctx.smart {
1347        if *budget == 0 {
1348            return None;
1349        }
1350        *budget -= 1;
1351        let mut pos = begin + 2;
1352        let (inner, closed) = scan(
1353            chars,
1354            &mut pos,
1355            Some(Closer::Mono),
1356            ctx,
1357            QuoteCtx::default(),
1358            depth + 1,
1359            budget,
1360        );
1361        if !closed {
1362            return None;
1363        }
1364        return Some((
1365            Inline::Code(Box::default(), flatten_mono(&inner).into()),
1366            pos,
1367        ));
1368    }
1369    let close = find_subsequence(chars, begin + 2, "''")?;
1370    if close <= begin + 2 || is_ws_opt(chars.get(close - 1).copied()) {
1371        return None;
1372    }
1373    let content = chars.get(begin + 2..close).unwrap_or(&[]);
1374    let inner = scan_slice(content, ctx, depth + 1);
1375    Some((
1376        Inline::Code(Box::default(), to_plain_text(&inner).into()),
1377        close + 2,
1378    ))
1379}
1380
1381/// Flatten monospace interior inlines to text, rendering a quoted run as its curly quote glyphs so
1382/// folded quotation survives inside the code span.
1383fn flatten_mono(inlines: &[Inline]) -> String {
1384    let mut out = String::new();
1385    push_mono_text(inlines, &mut out);
1386    out
1387}
1388
1389fn push_mono_text(inlines: &[Inline], out: &mut String) {
1390    for inline in inlines {
1391        match inline {
1392            Inline::Str(text) | Inline::Code(_, text) | Inline::Math(_, text) => out.push_str(text),
1393            Inline::Space | Inline::SoftBreak | Inline::LineBreak => out.push(' '),
1394            Inline::Quoted(QuoteType::SingleQuote, xs) => {
1395                out.push('\u{2018}');
1396                push_mono_text(xs, out);
1397                out.push('\u{2019}');
1398            }
1399            Inline::Quoted(QuoteType::DoubleQuote, xs) => {
1400                out.push('\u{201c}');
1401                push_mono_text(xs, out);
1402                out.push('\u{201d}');
1403            }
1404            Inline::Emph(xs)
1405            | Inline::Underline(xs)
1406            | Inline::Strong(xs)
1407            | Inline::Strikeout(xs)
1408            | Inline::Superscript(xs)
1409            | Inline::Subscript(xs)
1410            | Inline::SmallCaps(xs)
1411            | Inline::Cite(_, xs)
1412            | Inline::Link(_, xs, _)
1413            | Inline::Image(_, xs, _)
1414            | Inline::Span(_, xs) => push_mono_text(xs, out),
1415            Inline::RawInline(..) | Inline::Note(_) => {}
1416        }
1417    }
1418}
1419
1420/// Handle a `$` opener under dollar-math: a `$$…$$` display span when the next character is also `$`,
1421/// otherwise a `$…$` inline span. A failed attempt emits a single literal `$` and resumes scanning at
1422/// the following character, so an unmatched dollar is taken as text.
1423fn handle_math(chars: &[char], pos: &mut usize, pending: &mut String, out: &mut Vec<Inline>) {
1424    let begin = *pos;
1425    let parsed = if chars.get(begin + 1) == Some(&'$') {
1426        parse_display_math(chars, begin)
1427    } else {
1428        parse_inline_math(chars, begin)
1429    };
1430    if let Some((node, end)) = parsed {
1431        flush(pending, out);
1432        out.push(node);
1433        *pos = end;
1434    } else {
1435        pending.push('$');
1436        *pos = begin + 1;
1437    }
1438}
1439
1440/// A `$$…$$` display-math span: its interior is taken verbatim. `None` when the span has no closer or
1441/// encloses nothing.
1442fn parse_display_math(chars: &[char], begin: usize) -> Option<(Inline, usize)> {
1443    let close = find_subsequence(chars, begin + 2, "$$")?;
1444    if close <= begin + 2 {
1445        return None;
1446    }
1447    let content: String = chars.get(begin + 2..close).unwrap_or(&[]).iter().collect();
1448    Some((
1449        Inline::Math(MathType::DisplayMath, content.into()),
1450        close + 2,
1451    ))
1452}
1453
1454/// A `$…$` inline-math span: the opener must be followed by a non-space, the closer preceded by a
1455/// non-space and not followed by a digit. Its interior is taken verbatim.
1456fn parse_inline_math(chars: &[char], begin: usize) -> Option<(Inline, usize)> {
1457    if is_ws_opt(chars.get(begin + 1).copied()) {
1458        return None;
1459    }
1460    let mut j = begin + 1;
1461    while j < chars.len() {
1462        if chars.get(j) == Some(&'$')
1463            && j > begin + 1
1464            && chars.get(j - 1).is_some_and(|c| !c.is_whitespace())
1465            && !chars.get(j + 1).is_some_and(char::is_ascii_digit)
1466        {
1467            let content: String = chars.get(begin + 1..j).unwrap_or(&[]).iter().collect();
1468            return Some((Inline::Math(MathType::InlineMath, content.into()), j + 1));
1469        }
1470        j += 1;
1471    }
1472    None
1473}
1474
/// How many consecutive copies of `ch` start at `pos`. Zero when `pos` is past the end of the
/// input or the character there is not `ch`.
fn run_length(chars: &[char], pos: usize, ch: char) -> usize {
    chars
        .get(pos..)
        .map_or(0, |tail| tail.iter().take_while(|&&c| c == ch).count())
}
1483
/// Fold a run of `n` hyphens into dashes: each full group of three becomes an em dash, then a
/// leftover pair becomes one en dash and a leftover single stays a hyphen.
fn fold_dashes(n: usize) -> String {
    let mut folded: String = std::iter::repeat('\u{2014}').take(n / 3).collect();
    let tail = match n % 3 {
        2 => "\u{2013}",
        1 => "-",
        _ => "",
    };
    folded.push_str(tail);
    folded
}
1495
/// Fold a run of `n` dots: each full group of three becomes an ellipsis character, and the one or
/// two dots left over stay as plain dots after them.
fn fold_ellipsis(n: usize) -> String {
    let ellipses = std::iter::repeat('\u{2026}').take(n / 3);
    let dots = std::iter::repeat('.').take(n % 3);
    ellipses.chain(dots).collect()
}
1502
1503// --- flanking ---
1504
/// The character immediately before `pos`, or `None` at the start of the input or when `pos - 1`
/// is out of range.
fn before_char(chars: &[char], pos: usize) -> Option<char> {
    let prev = pos.checked_sub(1)?;
    chars.get(prev).copied()
}
1509
/// Whether an optional character is whitespace. A missing character marks a boundary and counts
/// as whitespace.
fn is_ws_opt(opt: Option<char>) -> bool {
    match opt {
        Some(c) => c.is_whitespace(),
        None => true,
    }
}
1515
/// Whether a character slice holds nothing but whitespace. An empty slice is blank.
fn is_blank(chars: &[char]) -> bool {
    !chars.iter().any(|c| !c.is_whitespace())
}
1520
1521/// Whether an optional character is punctuation, treating a missing character as not punctuation.
1522fn is_punct_opt(opt: Option<char>) -> bool {
1523    opt.is_some_and(is_punct)
1524}
1525
/// Whether a character counts as punctuation for flanking: any character that is neither
/// alphanumeric nor whitespace. This covers all of ASCII punctuation, which needs no separate
/// check because none of it is alphanumeric or whitespace.
fn is_punct(c: char) -> bool {
    !(c.is_alphanumeric() || c.is_whitespace())
}
1531
1532/// Whether the single character at `pos` is left-flanking (it leans against following content).
1533fn left_flanking(chars: &[char], pos: usize) -> bool {
1534    let before = before_char(chars, pos);
1535    let after = chars.get(pos + 1).copied();
1536    !is_ws_opt(after) && (!is_punct_opt(after) || is_ws_opt(before) || is_punct_opt(before))
1537}
1538
1539/// Whether the single character at `pos` is right-flanking (it leans against preceding content).
1540fn right_flanking(chars: &[char], pos: usize) -> bool {
1541    let before = before_char(chars, pos);
1542    let after = chars.get(pos + 1).copied();
1543    !is_ws_opt(before) && (!is_punct_opt(before) || is_ws_opt(after) || is_punct_opt(after))
1544}
1545
1546/// Whether a straight quote at `pos` may open a quoted run. A quote whose kind already encloses the
1547/// position may not open again, so nested same-kind quotation never forms.
1548fn can_open_quote(chars: &[char], pos: usize, quote: char, qctx: QuoteCtx) -> bool {
1549    if (quote == '\'' && qctx.in_single) || (quote == '"' && qctx.in_double) {
1550        return false;
1551    }
1552    left_flanking(chars, pos)
1553}
1554
1555/// Whether a straight quote at `pos` may close a quoted run. A single quote may not close against a
1556/// following alphanumeric, so a word-internal apostrophe never ends a quotation.
1557fn can_close_quote(chars: &[char], pos: usize, quote: char) -> bool {
1558    if !right_flanking(chars, pos) {
1559        return false;
1560    }
1561    if quote == '\'' {
1562        !chars.get(pos + 1).is_some_and(|c| c.is_alphanumeric())
1563    } else {
1564        true
1565    }
1566}
1567
/// Whether `pos` sits at a word start for autolink purposes: the character before it is missing
/// or is not alphanumeric.
fn boundary_before(chars: &[char], pos: usize) -> bool {
    match pos.checked_sub(1).and_then(|prev| chars.get(prev)) {
        Some(c) => !c.is_alphanumeric(),
        None => true,
    }
}
1572
1573// --- bare URL autolinking ---
1574
1575/// Match a bare URL beginning at `pos` (`scheme://…`), returning the link and the end index.
1576fn try_autolink(chars: &[char], pos: usize) -> Option<(Inline, usize)> {
1577    let mut k = pos;
1578    while chars
1579        .get(k)
1580        .is_some_and(|&c| c.is_ascii_alphanumeric() || matches!(c, '.' | '+' | '-'))
1581    {
1582        k += 1;
1583    }
1584    if !matches_at(chars, k, "://") {
1585        return None;
1586    }
1587    let scheme: String = chars.get(pos..k)?.iter().collect::<String>().to_lowercase();
1588    if !crate::url_schemes::is_scheme(&scheme) {
1589        return None;
1590    }
1591    let content_start = k + 3;
1592    let scan_end = forward_scan(chars, pos);
1593    let end = trim_trailing(chars, content_start, scan_end);
1594    if end <= content_start {
1595        return None;
1596    }
1597    let url: String = chars.get(pos..end)?.iter().collect();
1598    Some((
1599        Inline::Link(
1600            Box::default(),
1601            vec![Inline::Str(url.clone().into())],
1602            Box::new(Target {
1603                url: url.into(),
1604                title: carta_ast::Text::default(),
1605            }),
1606        ),
1607        end,
1608    ))
1609}
1610
/// Walk a URL run forward from `from` and return the index where it stops. The run ends at
/// whitespace, at `<`, at a `)` with no matching `(` inside the run, or at a `]` outside any
/// parentheses; otherwise it extends to the end of the input.
fn forward_scan(chars: &[char], from: usize) -> usize {
    let mut open_parens: i32 = 0;
    for (idx, &c) in chars.iter().enumerate().skip(from) {
        let stop = match c {
            '<' => true,
            _ if c.is_whitespace() => true,
            '(' => {
                open_parens += 1;
                false
            }
            ')' | ']' if open_parens == 0 => true,
            ')' => {
                open_parens -= 1;
                false
            }
            _ => false,
        };
        if stop {
            return idx;
        }
    }
    // Nothing stopped the run: it ends at the end of the input, or at `from` itself when `from`
    // already lies past the end.
    chars.len().max(from)
}
1630
/// Pull `end` back over punctuation that trails a URL run, stopping at `min`.
///
/// The characters `! " ' * , . : ? _ ~` are dropped one at a time. A trailing `;` is dropped too,
/// and when it terminates an `&name;` / `&#123;` style entity the whole entity goes with it.
fn trim_trailing(chars: &[char], min: usize, mut end: usize) -> usize {
    while end > min {
        let Some(&last) = chars.get(end - 1) else {
            break;
        };
        if matches!(
            last,
            '!' | '"' | '\'' | '*' | ',' | '.' | ':' | '?' | '_' | '~'
        ) {
            end -= 1;
            continue;
        }
        if last != ';' {
            break;
        }
        // Measure the entity-name characters sitting directly before the `;`, never reaching
        // below `min`.
        let name_len = chars
            .get(min..end - 1)
            .unwrap_or_default()
            .iter()
            .rev()
            .take_while(|&&c| c.is_ascii_alphanumeric() || c == '#')
            .count();
        let name_start = end - 1 - name_len;
        let is_entity = name_start > min && chars.get(name_start - 1) == Some(&'&');
        end = if is_entity { name_start - 1 } else { end - 1 };
    }
    end
}
1657
1658// --- post-processing ---
1659
1660/// Merge adjacent text runs and collapse adjacent whitespace into a single token (preferring a hard
1661/// space), so dropped macros and split apostrophes leave no doubled spacing or fragmented words.
1662fn coalesce(inlines: Vec<Inline>) -> Vec<Inline> {
1663    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
1664    for inline in inlines {
1665        match inline {
1666            Inline::Str(s) => {
1667                if let Some(Inline::Str(prev)) = out.last_mut() {
1668                    prev.push_str(&s);
1669                } else if !s.is_empty() {
1670                    out.push(Inline::Str(s));
1671                }
1672            }
1673            Inline::Space | Inline::SoftBreak => match out.last() {
1674                Some(Inline::Space) => {}
1675                Some(Inline::SoftBreak) => {
1676                    if matches!(inline, Inline::Space)
1677                        && let Some(slot) = out.last_mut()
1678                    {
1679                        *slot = Inline::Space;
1680                    }
1681                }
1682                _ => out.push(inline),
1683            },
1684            other => out.push(other),
1685        }
1686    }
1687    out
1688}
1689
1690/// Split text into `Str` words separated by single whitespace tokens, with no markup interpretation.
1691fn tokenize_text(text: &str) -> Vec<Inline> {
1692    let mut out = Vec::new();
1693    let mut word = String::new();
1694    for c in text.chars() {
1695        if c.is_whitespace() {
1696            if !word.is_empty() {
1697                out.push(Inline::Str(std::mem::take(&mut word).into()));
1698            }
1699            let token = if c == '\n' {
1700                Inline::SoftBreak
1701            } else {
1702                Inline::Space
1703            };
1704            if !matches!(out.last(), Some(Inline::Space | Inline::SoftBreak)) {
1705                out.push(token);
1706            }
1707        } else {
1708            word.push(c);
1709        }
1710    }
1711    if !word.is_empty() {
1712        out.push(Inline::Str(word.into()));
1713    }
1714    out
1715}
1716
1717// --- links and media ---
1718
1719/// Parse a `[[target|label]]` link, returning the link node and its end index. A bracket pair whose
1720/// target side (the text before the first `|`) is entirely empty is not a link; the opener stays
1721/// literal.
1722fn parse_link(chars: &[char], start: usize) -> Option<(Inline, usize)> {
1723    let close = find_subsequence(chars, start + 2, "]]")?;
1724    let inner: String = chars.get(start + 2..close).unwrap_or(&[]).iter().collect();
1725    let (raw_target, label) = match inner.split_once('|') {
1726        Some((t, l)) => (t, Some(l.to_string())),
1727        None => (inner.as_str(), None),
1728    };
1729    if raw_target.is_empty() {
1730        return None;
1731    }
1732    let target = raw_target.trim().to_string();
1733    let (url, display) = classify_link_target(&target);
1734    // An explicit but empty label falls back to the target's auto-display text.
1735    let label_inlines = match label {
1736        Some(text) if !text.trim().is_empty() => tokenize_text(text.trim()),
1737        _ => vec![Inline::Str(display.into())],
1738    };
1739    Some((
1740        Inline::Link(
1741            Box::default(),
1742            label_inlines,
1743            Box::new(Target {
1744                url: url.into(),
1745                title: carta_ast::Text::default(),
1746            }),
1747        ),
1748        close + 2,
1749    ))
1750}
1751
1752/// Resolve a link target to its destination URL and auto-display text.
1753fn classify_link_target(target: &str) -> (String, String) {
1754    if target.starts_with("\\\\") || is_external(target) {
1755        (target.to_string(), target.to_string())
1756    } else if let Some((prefix, rest)) = target.split_once('>') {
1757        (interwiki_url(prefix, rest), rest.to_string())
1758    } else {
1759        (resolve_id(target), display_id(target))
1760    }
1761}
1762
1763/// Parse a `{{image?query|caption}}` media reference into an image, or, when the query opts out of
1764/// embedding, a link.
1765fn parse_media(chars: &[char], start: usize) -> Option<(Inline, usize)> {
1766    let close = find_subsequence(chars, start + 2, "}}")?;
1767    let inner: String = chars.get(start + 2..close).unwrap_or(&[]).iter().collect();
1768    let end = close + 2;
1769
1770    let leading_space = inner.starts_with(char::is_whitespace);
1771    let (spec, caption) = match inner.split_once('|') {
1772        Some((s, c)) => (s, Some(c)),
1773        None => (inner.as_str(), None),
1774    };
1775    // A brace pair whose source side (before the first `|`) is empty is not a media reference.
1776    if spec.is_empty() {
1777        return None;
1778    }
1779    let trailing_space = spec.ends_with(char::is_whitespace);
1780    let mut classes = Vec::new();
1781    if let Some(class) = media_align(leading_space, trailing_space) {
1782        classes.push(class.into());
1783    }
1784
1785    let spec = spec.trim();
1786    let (id, query) = match spec.split_once('?') {
1787        Some((i, q)) => (i, Some(q)),
1788        None => (spec, None),
1789    };
1790    let url = if is_external(id) {
1791        id.to_string()
1792    } else {
1793        resolve_id(id)
1794    };
1795    // An explicit but empty caption falls back to the source's auto-display text.
1796    let alt = match caption {
1797        Some(text) if !text.trim().is_empty() => tokenize_text(text.trim()),
1798        _ if is_external(id) => vec![Inline::Str(id.into())],
1799        _ => vec![Inline::Str(display_id(id).into())],
1800    };
1801    let target = Target {
1802        url: url.into(),
1803        title: carta_ast::Text::default(),
1804    };
1805
1806    let node = match query {
1807        Some(q) if q.contains("linkonly") => Inline::Link(
1808            Box::new(Attr {
1809                classes,
1810                ..Default::default()
1811            }),
1812            alt,
1813            Box::new(target),
1814        ),
1815        Some(q) => {
1816            let (width, height) = parse_size(q);
1817            let mut attributes = Vec::new();
1818            if let Some(w) = width {
1819                attributes.push(("width".to_string(), w));
1820            }
1821            if let Some(h) = height {
1822                attributes.push(("height".to_string(), h));
1823            }
1824            attributes.push(("query".to_string(), format!("?{q}")));
1825            Inline::Image(
1826                Box::new(Attr {
1827                    classes,
1828                    attributes: attributes
1829                        .into_iter()
1830                        .map(|(k, v)| (k.into(), v.into()))
1831                        .collect(),
1832                    ..Default::default()
1833                }),
1834                alt,
1835                Box::new(target),
1836            )
1837        }
1838        None => Inline::Image(
1839            Box::new(Attr {
1840                classes,
1841                ..Default::default()
1842            }),
1843            alt,
1844            Box::new(target),
1845        ),
1846    };
1847    Some((node, end))
1848}
1849
/// Choose the alignment class for a media reference from the padding inside its braces: padding
/// on both sides centres, trailing padding alone aligns left, leading padding alone aligns right,
/// and no padding yields no class.
fn media_align(leading: bool, trailing: bool) -> Option<&'static str> {
    if leading && trailing {
        Some("align-center")
    } else if trailing {
        Some("align-left")
    } else if leading {
        Some("align-right")
    } else {
        None
    }
}
1859
/// Extract the size prefix of a media query: leading ASCII digits are the width, and an optional
/// `x`/`X` followed by more digits is the height. Both come back as digit strings.
///
/// A query that does not start with a digit yields `(None, None)`; an `x` with no digits after it
/// yields a width but no height.
fn parse_size(query: &str) -> (Option<String>, Option<String>) {
    // ASCII digits are one byte each, so this count is always a valid split point.
    let digit_prefix_len = |s: &str| s.bytes().take_while(u8::is_ascii_digit).count();

    let (width, rest) = query.split_at(digit_prefix_len(query));
    if width.is_empty() {
        return (None, None);
    }
    let height = rest
        .strip_prefix(|c: char| matches!(c, 'x' | 'X'))
        .map(|tail| tail.split_at(digit_prefix_len(tail)).0)
        .filter(|digits| !digits.is_empty())
        .map(str::to_owned);
    (Some(width.to_owned()), height)
}
1895
1896/// Whether a target names an external destination: a known scheme followed by `://`.
1897fn is_external(s: &str) -> bool {
1898    match s.find("://") {
1899        Some(idx) => {
1900            let scheme = s.get(..idx).unwrap_or("");
1901            !scheme.is_empty()
1902                && scheme
1903                    .chars()
1904                    .all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '+' | '-'))
1905                && crate::url_schemes::is_scheme(&scheme.to_lowercase())
1906        }
1907        None => false,
1908    }
1909}
1910
/// Turn a page identifier into a site-relative URL.
///
/// An id without a `:` is returned untouched. Otherwise every `:` becomes `/`. An id that starts
/// with `.` has its leading dots stripped and gets no extra leading `/`; any other id gets a
/// leading `/` unless the converted path already has one.
fn resolve_id(id: &str) -> String {
    if !id.contains(':') {
        return id.to_owned();
    }
    let relative = id.starts_with('.');
    // Stripping dots is a no-op for ids that do not start with one.
    let path = id.trim_start_matches('.').replace(':', "/");
    if relative || path.starts_with('/') {
        path
    } else {
        format!("/{path}")
    }
}
1927
/// The text shown for a bare page identifier: whatever follows its last `:`, or the whole id when
/// it contains no `:`.
fn display_id(id: &str) -> String {
    // `rsplit` always yields at least one piece, so the fallback is never actually taken.
    id.rsplit(':').next().unwrap_or(id).to_owned()
}
1935
/// Expand an interwiki shortcut into a destination URL by prepending the shortcut's base address
/// to `rest`. An unrecognised shortcut is returned in its original `prefix>rest` form.
fn interwiki_url(prefix: &str, rest: &str) -> String {
    /// Known shortcuts paired with the base address each one expands to.
    const BASES: [(&str, &str); 9] = [
        ("wp", "https://en.wikipedia.org/wiki/"),
        ("wpfr", "https://fr.wikipedia.org/wiki/"),
        ("wpde", "https://de.wikipedia.org/wiki/"),
        ("wpes", "https://es.wikipedia.org/wiki/"),
        ("wpjp", "https://jp.wikipedia.org/wiki/"),
        ("wppl", "https://pl.wikipedia.org/wiki/"),
        ("doku", "https://www.dokuwiki.org/"),
        ("phpfn", "https://secure.php.net/"),
        ("callto", "callto://"),
    ];
    match BASES.iter().find(|(shortcut, _)| *shortcut == prefix) {
        Some((_, base)) => format!("{base}{rest}"),
        None => format!("{prefix}>{rest}"),
    }
}
1951
1952// --- footnotes, nowiki, angle tags, macros ---
1953
1954/// Parse a `((…))` footnote into a note holding the block content of its body. A body that is empty
1955/// or only whitespace is not a footnote, so the opener stays literal.
1956fn parse_footnote(chars: &[char], begin: usize, ctx: Ctx, depth: usize) -> Option<(Inline, usize)> {
1957    let close = find_subsequence(chars, begin + 2, "))")?;
1958    let inner: String = chars.get(begin + 2..close).unwrap_or(&[]).iter().collect();
1959    if inner.trim().is_empty() {
1960        return None;
1961    }
1962    Some((
1963        Inline::Note(parse_blocks_str(&inner, ctx, depth + 1)),
1964        close + 2,
1965    ))
1966}
1967
1968/// Parse a `%%…%%` no-formatting span: its content is taken verbatim as text. Like the emphasis
1969/// markers, the opener needs a non-whitespace character after it and the closer one before it, so a
1970/// `%%` adjacent to a space stays literal.
1971fn parse_nowiki_pct(chars: &[char], begin: usize) -> Option<(Vec<Inline>, usize)> {
1972    if chars.get(begin + 2).is_none_or(|c| c.is_whitespace()) {
1973        return None;
1974    }
1975    let mut j = begin + 2;
1976    while j < chars.len() {
1977        if chars.get(j) == Some(&'%')
1978            && chars.get(j + 1) == Some(&'%')
1979            && j > begin + 2
1980            && chars.get(j - 1).is_some_and(|c| !c.is_whitespace())
1981        {
1982            let inner: String = chars.get(begin + 2..j).unwrap_or(&[]).iter().collect();
1983            return Some((tokenize_text(&inner), j + 2));
1984        }
1985        j += 1;
1986    }
1987    None
1988}
1989
1990/// Parse an angle-bracket inline construct: the markup spans, a verbatim span, raw HTML/PHP, or an
1991/// email address.
1992fn parse_angle(
1993    chars: &[char],
1994    begin: usize,
1995    ctx: Ctx,
1996    depth: usize,
1997) -> Option<(Vec<Inline>, usize)> {
1998    // A span tag with a blank interior is not markup; the opener stays literal text.
1999    if let Some((inner, end)) = tag_region(chars, begin, "<sub>", "</sub>")
2000        && !is_blank(&inner)
2001    {
2002        return Some((
2003            vec![Inline::Subscript(scan_slice(&inner, ctx, depth + 1))],
2004            end,
2005        ));
2006    }
2007    if let Some((inner, end)) = tag_region(chars, begin, "<sup>", "</sup>")
2008        && !is_blank(&inner)
2009    {
2010        return Some((
2011            vec![Inline::Superscript(scan_slice(&inner, ctx, depth + 1))],
2012            end,
2013        ));
2014    }
2015    if let Some((inner, end)) = tag_region(chars, begin, "<del>", "</del>")
2016        && !is_blank(&inner)
2017    {
2018        return Some((
2019            vec![Inline::Strikeout(scan_slice(&inner, ctx, depth + 1))],
2020            end,
2021        ));
2022    }
2023    if let Some((inner, end)) = tag_region(chars, begin, "<nowiki>", "</nowiki>") {
2024        let text: String = inner.iter().collect();
2025        return Some((tokenize_text(&text), end));
2026    }
2027    if let Some((inner, end)) = tag_region(chars, begin, "<html>", "</html>") {
2028        let text: String = inner.iter().collect();
2029        return Some((
2030            vec![Inline::RawInline(Format("html".into()), text.into())],
2031            end,
2032        ));
2033    }
2034    if let Some((inner, end)) = tag_region(chars, begin, "<php>", "</php>") {
2035        let text: String = inner.iter().collect();
2036        return Some((
2037            vec![Inline::RawInline(
2038                Format("html".into()),
2039                format!("<?php {text} ?>").into(),
2040            )],
2041            end,
2042        ));
2043    }
2044    angle_email(chars, begin).map(|(node, end)| (vec![node], end))
2045}
2046
2047/// The interior characters and end index of an `open…close` tag region starting at `start`.
2048fn tag_region(chars: &[char], start: usize, open: &str, close: &str) -> Option<(Vec<char>, usize)> {
2049    if !matches_at(chars, start, open) {
2050        return None;
2051    }
2052    let content_start = start + open.chars().count();
2053    let close_at = find_subsequence(chars, content_start, close)?;
2054    let inner = chars.get(content_start..close_at).unwrap_or(&[]).to_vec();
2055    Some((inner, close_at + close.chars().count()))
2056}
2057
2058/// Parse `<local@domain>` into a `mailto:` link.
2059fn angle_email(chars: &[char], start: usize) -> Option<(Inline, usize)> {
2060    if chars.get(start) != Some(&'<') {
2061        return None;
2062    }
2063    let mut j = start + 1;
2064    while let Some(&c) = chars.get(j) {
2065        if c == '>' {
2066            break;
2067        }
2068        if c.is_whitespace() || c == '<' {
2069            return None;
2070        }
2071        j += 1;
2072    }
2073    if chars.get(j) != Some(&'>') {
2074        return None;
2075    }
2076    let inner: String = chars.get(start + 1..j).unwrap_or(&[]).iter().collect();
2077    let (local, domain) = inner.split_once('@')?;
2078    if local.is_empty() || !domain.contains('.') || domain.starts_with('.') || domain.ends_with('.')
2079    {
2080        return None;
2081    }
2082    let url = format!("mailto:{inner}");
2083    Some((
2084        Inline::Link(
2085            Box::default(),
2086            vec![Inline::Str(inner.into())],
2087            Box::new(Target {
2088                url: url.into(),
2089                title: carta_ast::Text::default(),
2090            }),
2091        ),
2092        j + 1,
2093    ))
2094}
2095
2096/// Recognise a dropped page macro (`~~NOTOC~~`, `~~NOCACHE~~`), returning its end index.
2097fn parse_macro(chars: &[char], start: usize) -> Option<usize> {
2098    for token in ["~~NOTOC~~", "~~NOCACHE~~"] {
2099        if matches_at(chars, start, token) {
2100            return Some(start + token.chars().count());
2101        }
2102    }
2103    None
2104}
2105
2106/// Split text into lines and parse them as blocks.
2107fn parse_blocks_str(text: &str, ctx: Ctx, depth: usize) -> Vec<Block> {
2108    let lines: Vec<&str> = text.split('\n').collect();
2109    let mut index = 0;
2110    parse_blocks(&lines, &mut index, ctx, depth)
2111}
2112
2113// ===================================================================================================
2114// Tables
2115// ===================================================================================================
2116
2117/// Parse a run of table rows. The first row sets the column count and per-column alignment, and is
2118/// the header row when it opens with `^`; all remaining rows form the single body.
2119fn parse_table(lines: &[&str], index: &mut usize, ctx: Ctx, depth: usize) -> Block {
2120    let mut rows: Vec<(bool, Vec<String>)> = Vec::new();
2121    while *index < lines.len() {
2122        let line = lines.get(*index).copied().unwrap_or("");
2123        if !is_table_line(line) {
2124            break;
2125        }
2126        rows.push((line.starts_with('^'), split_row(line)));
2127        *index += 1;
2128    }
2129
2130    let first = rows.first();
2131    let col_count = first.map_or(0, |(_, cells)| cells.len());
2132    let col_specs: Vec<ColSpec> = first
2133        .map(|(_, cells)| {
2134            cells
2135                .iter()
2136                .map(|cell| ColSpec {
2137                    align: cell_align(cell),
2138                    width: ColWidth::ColWidthDefault,
2139                })
2140                .collect()
2141        })
2142        .unwrap_or_default();
2143
2144    let mut head_rows = Vec::new();
2145    let mut body_rows = Vec::new();
2146    for (i, (header, cells)) in rows.iter().enumerate() {
2147        let row = build_row(cells, col_count, ctx, depth);
2148        if i == 0 && *header {
2149            head_rows.push(row);
2150        } else {
2151            body_rows.push(row);
2152        }
2153    }
2154
2155    Block::Table(Box::new(Table {
2156        attr: Attr::default(),
2157        caption: Caption::default(),
2158        col_specs,
2159        head: TableHead {
2160            attr: Attr::default(),
2161            rows: head_rows,
2162        },
2163        bodies: vec![TableBody {
2164            attr: Attr::default(),
2165            row_head_columns: 0,
2166            head: Vec::new(),
2167            body: body_rows,
2168        }],
2169        foot: TableFoot::default(),
2170    }))
2171}
2172
2173/// Build a table row, fitting it to `col_count` by truncating extra cells and padding short rows.
2174fn build_row(cells: &[String], col_count: usize, ctx: Ctx, depth: usize) -> Row {
2175    let mut out = Vec::with_capacity(col_count);
2176    for i in 0..col_count {
2177        let trimmed = cells.get(i).map_or("", |c| c.trim());
2178        let content = if trimmed.is_empty() {
2179            Vec::new()
2180        } else {
2181            vec![Block::Plain(inline_content(trimmed, ctx, depth))]
2182        };
2183        out.push(Cell {
2184            attr: Attr::default(),
2185            align: Alignment::AlignDefault,
2186            row_span: 1,
2187            col_span: 1,
2188            content,
2189        });
2190    }
2191    Row {
2192        attr: Attr::default(),
2193        cells: out,
2194    }
2195}
2196
2197/// The column alignment implied by a raw cell's padding: at least two spaces on a side anchors that
2198/// side, both anchors centre, neither leaves the default.
2199fn cell_align(raw: &str) -> Alignment {
2200    let leading = raw.chars().take_while(|&c| c == ' ').count();
2201    let trailing = raw.chars().rev().take_while(|&c| c == ' ').count();
2202    match (leading >= 2, trailing >= 2) {
2203        (true, true) => Alignment::AlignCenter,
2204        (_, true) => Alignment::AlignLeft,
2205        (true, _) => Alignment::AlignRight,
2206        _ => Alignment::AlignDefault,
2207    }
2208}
2209
2210/// Split a table row into its raw cell texts, treating `|` and `^` as delimiters but ignoring those
2211/// inside links, media, monospace, no-format spans, and verbatim regions.
2212fn split_row(line: &str) -> Vec<String> {
2213    let chars: Vec<char> = line.chars().collect();
2214    let mut segments: Vec<String> = Vec::new();
2215    let mut seg = String::new();
2216    let mut i = 0;
2217    while i < chars.len() {
2218        if let Some(skip) = protected_end(&chars, i) {
2219            seg.extend(chars.get(i..skip).unwrap_or(&[]));
2220            i = skip;
2221            continue;
2222        }
2223        match chars.get(i) {
2224            Some('|' | '^') => {
2225                segments.push(std::mem::take(&mut seg));
2226                i += 1;
2227            }
2228            Some(&c) => {
2229                seg.push(c);
2230                i += 1;
2231            }
2232            None => break,
2233        }
2234    }
2235    segments.push(seg);
2236    if !segments.is_empty() {
2237        segments.remove(0);
2238    }
2239    if segments.last().is_some_and(String::is_empty) {
2240        segments.pop();
2241    }
2242    segments
2243}
2244
2245/// If a protected span opens at `i`, the index just past its closing delimiter (or the end of the
2246/// line when it is unterminated).
2247fn protected_end(chars: &[char], i: usize) -> Option<usize> {
2248    for (open, close) in [("[[", "]]"), ("{{", "}}"), ("''", "''"), ("%%", "%%")] {
2249        if matches_at(chars, i, open) {
2250            let from = i + open.chars().count();
2251            let end = find_subsequence(chars, from, close)
2252                .map_or(chars.len(), |p| p + close.chars().count());
2253            return Some(end);
2254        }
2255    }
2256    if matches_at(chars, i, "<nowiki>") {
2257        let from = i + "<nowiki>".chars().count();
2258        let end = find_subsequence(chars, from, "</nowiki>")
2259            .map_or(chars.len(), |p| p + "</nowiki>".chars().count());
2260        return Some(end);
2261    }
2262    None
2263}
2264
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the reader with default options and reports only whether it returned `Ok`. The tests
    /// below use it to confirm that hostile inputs are handled gracefully and in bounded time.
    fn reads_ok(input: &str) -> bool {
        let options = ReaderOptions::default();
        DokuwikiReader.read(input, &options).is_ok()
    }

    #[test]
    fn adversarial_footnotes_under_open_emphasis_do_not_stall() {
        // Regression for a nightly-fuzz timeout. Every `((…))` footnote re-parses its interior,
        // and an emphasis run that never closes throws its scan away and scans the same span
        // again, so overlapping footnotes under unclosed `//` openers used to be re-parsed a
        // super-linear number of times. Charging the inline backtracking budget per construct
        // bounds the re-parsing; before the fix, an input a fraction of this size blew up
        // exponentially.
        let body = "//((x)) ".repeat(400);
        let input = ["((", body.as_str(), "))"].concat();
        assert!(reads_ok(&input));
    }

    #[test]
    fn adversarially_nested_footnotes_do_not_stall() {
        // Two thousand footnote openers, one character, two thousand closers.
        let openers = "((".repeat(2_000);
        let closers = "))".repeat(2_000);
        let input = format!("{openers}x{closers}");
        assert!(reads_ok(&input));
    }

    #[test]
    fn a_delimiter_dense_run_does_not_blow_up() {
        // Regression for a nightly-fuzz out-of-memory on a sub-kilobyte input. An emphasis opener
        // with no closer discards its speculative scan and rewinds to just past itself. Here every
        // would-be closer follows whitespace and is never valid, so the run of unclosed `//`
        // openers was once re-scanned from every position: quadratic work that allocated a
        // throwaway inline tree each time. Charging the backtracking budget for the scanned span
        // keeps the work linear.
        let input = "//a ".repeat(4_000);
        assert!(reads_ok(&input));
    }
}