Skip to main content

carta_readers/
mediawiki.rs

1//! Reader for `MediaWiki`'s wikitext markup.
2//!
3//! The source is first cleared of comments (`<!-- … -->`), then parsed line by line into blocks.
4//! A line opening with `=` runs is a heading, `*`/`#`/`:`/`;` runs start lists, four or more `-`
5//! alone are a horizontal rule, a leading space marks preformatted text, `{{…}}` and `{|…|}` are
6//! template and table markup, and `<pre>`/`<blockquote>`/`<syntaxhighlight>` are recognized block
7//! tags; everything else is a paragraph. Inline markup — apostrophe emphasis, `[[internal]]` and
8//! `[external]` links, bare URLs, entity references, and a fixed set of HTML tags — is scanned
9//! within each block's text.
10//!
11//! Heading identifiers follow the enabled identifier scheme: with `gfm_auto_identifiers` the GitHub
12//! algorithm (hyphen separators), otherwise `auto_identifiers` lowercases the text, keeps
13//! alphanumerics together with `_` and `.`, turns spaces and `-` into single `_`, and drops a
14//! leading run of non-letters; duplicates gain a numeric suffix and an empty result becomes
15//! `section`. With neither enabled, headings carry no identifier.
16//!
17//! The scanner is panic-free on malformed input: unbalanced or unterminated constructs degrade to
18//! literal text rather than being rejected.
19
20use std::collections::BTreeMap;
21
22use carta_ast::{
23    Alignment, ApiVersion, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Format, Inline,
24    ListAttributes, ListNumberDelim, ListNumberStyle, MathType, MetaValue, QuoteType, Row, Table,
25    TableBody, TableFoot, TableHead, Target, ToCompactString, slug_gfm, to_plain_text,
26};
27use carta_core::{Extension, Extensions, Reader, ReaderOptions, Result};
28
29use crate::emoji;
30use crate::entities;
31use crate::heading_ids;
32
/// Stateless reader that turns `MediaWiki` wikitext into the document model.
///
/// The type carries no data: each `read` call builds its own parser, so one value can be copied
/// and reused for any number of documents.
#[derive(Clone, Copy, Debug, Default)]
pub struct MediawikiReader;
36
37impl Reader for MediawikiReader {
38    fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
39        let stripped = strip_comments(&expand_tabs(input));
40        let (source, behavior_switches) = extract_behavior_switches(&stripped);
41        let chars: Vec<char> = source.chars().collect();
42        let mut parser = Parser::new(options);
43        let mut blocks = parser.parse_blocks(&chars);
44        // Category memberships are pulled out of the inline flow as they are encountered and gathered
45        // into a single trailing paragraph, one link per category in document order.
46        if !parser.categories.is_empty() {
47            let mut inlines: Vec<Inline> = Vec::new();
48            for (index, category) in parser.categories.drain(..).enumerate() {
49                if index > 0 {
50                    inlines.push(Inline::Space);
51                }
52                inlines.push(category);
53            }
54            blocks.push(Block::Para(inlines));
55        }
56        let mut meta: BTreeMap<String, MetaValue> = BTreeMap::new();
57        for switch in behavior_switches {
58            meta.insert(switch, MetaValue::MetaBool(true));
59        }
60        Ok(Document {
61            api_version: ApiVersion::default(),
62            meta: meta.into_iter().map(|(k, v)| (k.into(), v)).collect(),
63            blocks,
64        })
65    }
66}
67
68/// Carries the state that spans a whole document: the enabled extensions, the running counter for
69/// unlabeled external links, and the heading identifiers already issued (for de-duplication).
70struct Parser {
71    extensions: Extensions,
72    link_counter: usize,
73    ids: heading_ids::IdRegistry,
74    /// Category links pulled out of the inline flow, to be emitted as one trailing paragraph.
75    categories: Vec<Inline>,
76    /// Current block-nesting depth, capped to keep adversarially deep input from exhausting the stack.
77    depth: usize,
78}
79
/// Upper bound on block nesting. Once it is reached the parser stops descending, so deeply
/// stacked blockquotes, list levels, notes, and table cells come out as flat content instead of
/// recursing without limit. The value is deliberately far below the depth at which parsing or
/// serialization would overflow the stack.
const MAX_BLOCK_DEPTH: usize = 64;
84
/// A single source line of list markup, split into its marker prefix and its text.
struct ListItem {
    /// The leading run of marker characters, outermost level first.
    markers: Vec<char>,
    /// The text after the markers, with surrounding whitespace trimmed.
    content: String,
}
90
/// Which family of list a marker character belongs to.
#[derive(Clone, Copy, PartialEq, Eq)]
enum ListKind {
    /// An unordered (bulleted) list.
    Bullet,
    /// A numbered list.
    Ordered,
    /// A definition list of terms and their definitions.
    Definition,
}
98
99/// A table cell collected during the line scan, before its text is parsed into blocks. A `!`-marked
100/// cell is a header cell; the spans and attributes come from the cell's leading attribute list.
101struct RawCell {
102    is_header: bool,
103    align: Alignment,
104    col_span: i32,
105    row_span: i32,
106    attr: Attr,
107    content: String,
108}
109
110/// The alignment, spans, and attributes parsed from a cell's leading attribute list.
111struct CellAttrs {
112    align: Alignment,
113    col_span: i32,
114    row_span: i32,
115    attr: Attr,
116}
117
/// The construct, if any, that a table continuation line is appended to.
#[derive(Copy, Clone)]
enum OpenTarget {
    /// Nothing is open.
    None,
    /// The table caption is open.
    Caption,
    /// A cell is open.
    Cell,
}
125
126/// A lexical unit of inline text: a finished inline node, a run of apostrophes whose emphasis role is
127/// resolved once the surrounding run structure is known, a block-level HTML tag that interrupts the
128/// paragraph, or a paragraph break carried by a block-level tag that leaves no output.
129enum Tok {
130    Inline(Inline),
131    Apostrophes(usize),
132    BlockRaw(String),
133    BlockBreak,
134    /// A verbatim block element (`<pre>`, `<blockquote>`, `<syntaxhighlight>`) found mid-paragraph:
135    /// it interrupts the paragraph and emerges as its own block.
136    Block(Block),
137}
138
/// How a recognized HTML tag behaves when it shows up in the inline stream.
enum HtmlTagRole {
    /// Inline element: opening and closing tags are passed through as raw inline HTML.
    Inline,
    /// Block element: the tags interrupt the paragraph and are passed through as raw block HTML.
    Block,
    /// Paragraph-only element (`p`, `gallery`): the tags interrupt the paragraph and emit nothing.
    Break,
}
148
149impl Parser {
150    fn new(options: &ReaderOptions) -> Self {
151        Self {
152            extensions: options.extensions,
153            link_counter: 0,
154            ids: heading_ids::IdRegistry::default(),
155            categories: Vec::new(),
156            depth: 0,
157        }
158    }
159
160    /// Whether straight double quotes should fold into typographic quote runs.
161    fn smart(&self) -> bool {
162        self.extensions.contains(Extension::Smart)
163    }
164
165    fn parse_blocks(&mut self, chars: &[char]) -> Vec<Block> {
166        self.depth += 1;
167        if self.depth > MAX_BLOCK_DEPTH {
168            self.depth -= 1;
169            return degraded_blocks(chars);
170        }
171        let blocks = self.parse_blocks_inner(chars);
172        self.depth -= 1;
173        blocks
174    }
175
176    fn parse_blocks_inner(&mut self, chars: &[char]) -> Vec<Block> {
177        let mut blocks: Vec<Block> = Vec::new();
178        let mut pos = 0;
179        let mut line_start = true;
180        let n = chars.len();
181        // Heading-region lookahead memo, shared across every line-classification query over this
182        // slice so each line's region is resolved at most once. `chars` is fixed for the whole
183        // call, so positions stay valid throughout; nested slices (cells, blockquotes) get their
184        // own memo via their own `parse_blocks_inner`.
185        let mut scan = HeaderScan::default();
186        while pos < n {
187            if line_start {
188                let le = line_end(chars, pos);
189                if is_blank(chars, pos, le) {
190                    pos = if le < n { le + 1 } else { le };
191                    continue;
192                }
193                let c = at(chars, pos).unwrap_or(' ');
194                if c == '{'
195                    && at(chars, pos + 1) == Some('{')
196                    && template_opens(chars, pos)
197                    && let Some(after) = balanced_braces(chars, pos)
198                {
199                    let raw = collect_range(chars, pos, after);
200                    blocks.push(Block::RawBlock(format_mediawiki(), raw.into()));
201                    let (np, ls) = finish_inline_block(chars, after);
202                    pos = np;
203                    line_start = ls;
204                    continue;
205                }
206                if c == '{' && at(chars, pos + 1) == Some('|') {
207                    let (block, after) = self.parse_table(chars, pos);
208                    blocks.push(block);
209                    let (np, ls) = finish_inline_block(chars, after);
210                    pos = np;
211                    line_start = ls;
212                    continue;
213                }
214                if c == '='
215                    && let Some((level, inlines, closer_end)) =
216                        self.try_header(chars, pos, &mut scan)
217                {
218                    let id = self.make_id(&inlines);
219                    let attr = Attr {
220                        id: id.into(),
221                        classes: Vec::new(),
222                        attributes: Vec::new(),
223                    };
224                    blocks.push(Block::Header(level, Box::new(attr), inlines));
225                    let (np, ls) = finish_inline_block(chars, closer_end);
226                    pos = np;
227                    line_start = ls;
228                    continue;
229                }
230                if c == '-' && is_hr_line(chars, pos) {
231                    blocks.push(Block::HorizontalRule);
232                    let le2 = line_end(chars, pos);
233                    pos = if le2 < n { le2 + 1 } else { le2 };
234                    line_start = true;
235                    continue;
236                }
237                if matches!(c, '*' | '#' | ':' | ';') && list_run_uniform(chars, pos) {
238                    let (list_blocks, after) = self.parse_list(chars, pos);
239                    blocks.extend(list_blocks);
240                    pos = after;
241                    line_start = true;
242                    continue;
243                }
244                if c == ' ' {
245                    let (block, after) = self.parse_preformatted(chars, pos);
246                    blocks.push(block);
247                    pos = after;
248                    line_start = true;
249                    continue;
250                }
251                if c == '<'
252                    && let Some((block, after)) = self.parse_block_tag(chars, pos)
253                {
254                    blocks.push(block);
255                    let (np, ls) = finish_inline_block(chars, after);
256                    pos = np;
257                    line_start = ls;
258                    continue;
259                }
260            }
261            let (mut para_blocks, after) = self.parse_paragraph(chars, pos, &mut scan);
262            blocks.append(&mut para_blocks);
263            pos = after;
264            line_start = true;
265        }
266        blocks
267    }
268
269    fn try_header(
270        &mut self,
271        chars: &[char],
272        pos: usize,
273        scan: &mut HeaderScan,
274    ) -> Option<(i32, Vec<Inline>, usize)> {
275        let le = line_end(chars, pos);
276        let mut m = 0;
277        while pos + m < le && at(chars, pos + m) == Some('=') {
278            m += 1;
279        }
280        if m == 0 || m > 6 {
281            return None;
282        }
283        let content_start = pos + m;
284        // The closing run may sit several lines below: the heading text continues like a paragraph
285        // until a blank line or a line that opens its own block, and the trailing `=` run anywhere in
286        // that span closes it.
287        let region_end = header_region_end_scan(chars, pos, scan);
288        let closer = header_closer(chars, content_start, region_end, m)?;
289        let content = collect_range(chars, content_start, closer);
290        let inlines = self.parse_inlines(content.trim());
291        Some((i32::try_from(m).unwrap_or(1), inlines, closer + m))
292    }
293
294    fn parse_list(&mut self, chars: &[char], pos: usize) -> (Vec<Block>, usize) {
295        let mut items: Vec<ListItem> = Vec::new();
296        let mut cursor = pos;
297        let n = chars.len();
298        while at(chars, cursor).is_some_and(is_list_marker) {
299            let le = line_end(chars, cursor);
300            let mut scan = cursor;
301            let mut markers: Vec<char> = Vec::new();
302            while scan < le && at(chars, scan).is_some_and(is_list_marker) {
303                if let Some(marker) = at(chars, scan) {
304                    markers.push(marker);
305                }
306                scan += 1;
307            }
308            let content = collect_range(chars, scan, le).trim().to_string();
309            items.push(ListItem { markers, content });
310            if le >= n {
311                cursor = le;
312                break;
313            }
314            cursor = le + 1;
315        }
316        (self.build_lists(&items, 0), cursor)
317    }
318
319    fn build_lists(&mut self, items: &[ListItem], level: usize) -> Vec<Block> {
320        if level >= MAX_BLOCK_DEPTH {
321            // Past the nesting cap, each item's text becomes a flat plain block with no deeper list
322            // structure, so adversarially deep marker runs cannot exhaust the stack.
323            let mut out: Vec<Block> = Vec::new();
324            for item in items {
325                let inlines = self.parse_inlines(&item.content);
326                if !inlines.is_empty() {
327                    out.push(Block::Plain(inlines));
328                }
329            }
330            return out;
331        }
332        let mut out: Vec<Block> = Vec::new();
333        let mut i = 0;
334        while i < items.len() {
335            let kind = if let Some(&m) = items.get(i).and_then(|it| it.markers.get(level)) {
336                list_kind(m)
337            } else {
338                i += 1;
339                continue;
340            };
341            let mut j = i;
342            while j < items.len() {
343                match items.get(j).and_then(|it| it.markers.get(level)) {
344                    Some(&m) if list_kind(m) == kind => j += 1,
345                    _ => break,
346                }
347            }
348            let group = items.get(i..j).unwrap_or(&[]);
349            match kind {
350                ListKind::Bullet => out.push(Block::BulletList(self.build_simple(group, level))),
351                ListKind::Ordered => {
352                    out.push(Block::OrderedList(
353                        default_list_attrs(),
354                        self.build_simple(group, level),
355                    ));
356                }
357                ListKind::Definition => out.push(self.build_definition(group, level)),
358            }
359            i = j;
360        }
361        out
362    }
363
364    fn build_simple(&mut self, group: &[ListItem], level: usize) -> Vec<Vec<Block>> {
365        let mut entries: Vec<Vec<Block>> = Vec::new();
366        let mut i = 0;
367        while i < group.len() {
368            let depth = group.get(i).map_or(0, |it| it.markers.len());
369            if depth == level + 1 {
370                let content = group.get(i).map_or("", |it| it.content.as_str());
371                let mut blocks = vec![plain_or_figure(self.parse_inlines(content))];
372                i += 1;
373                let start = i;
374                while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
375                    i += 1;
376                }
377                if let Some(sub) = group.get(start..i)
378                    && !sub.is_empty()
379                {
380                    blocks.extend(self.build_lists(sub, level + 1));
381                }
382                entries.push(blocks);
383            } else {
384                let start = i;
385                while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
386                    i += 1;
387                }
388                if i == start {
389                    i += 1;
390                }
391                let blocks = group
392                    .get(start..i)
393                    .map(|sub| self.build_lists(sub, level + 1))
394                    .unwrap_or_default();
395                entries.push(blocks);
396            }
397        }
398        entries
399    }
400
401    fn build_definition(&mut self, group: &[ListItem], level: usize) -> Block {
402        let mut pairs: Vec<(Vec<Inline>, Vec<Vec<Block>>)> = Vec::new();
403        let mut i = 0;
404        while i < group.len() {
405            let Some(item) = group.get(i) else { break };
406            if item.markers.len() == level + 1 {
407                let marker = item.markers.get(level).copied().unwrap_or(':');
408                let content = item.content.clone();
409                i += 1;
410                let start = i;
411                while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
412                    i += 1;
413                }
414                let nested = group
415                    .get(start..i)
416                    .map(|sub| self.build_lists(sub, level + 1))
417                    .unwrap_or_default();
418                if marker == ';' {
419                    let (term_str, def_str) = split_term(&content);
420                    let term = self.parse_inlines(&term_str);
421                    let mut defs: Vec<Vec<Block>> = Vec::new();
422                    if let Some(d) = def_str {
423                        defs.push(vec![plain_or_figure(self.parse_inlines(&d))]);
424                    }
425                    if !nested.is_empty() {
426                        match defs.last_mut() {
427                            Some(last) => last.extend(nested),
428                            None => defs.push(nested),
429                        }
430                    }
431                    // Terms stacked with no definition between them share one entry, separated by a
432                    // line break, until a definition arrives.
433                    match pairs.last_mut() {
434                        Some((last_term, last_defs)) if last_defs.is_empty() => {
435                            last_term.push(Inline::LineBreak);
436                            last_term.extend(term);
437                            *last_defs = defs;
438                        }
439                        _ => pairs.push((term, defs)),
440                    }
441                } else {
442                    let mut blocks = vec![plain_or_figure(self.parse_inlines(&content))];
443                    blocks.extend(nested);
444                    match pairs.last_mut() {
445                        Some(last) => last.1.push(blocks),
446                        None => pairs.push((Vec::new(), vec![blocks])),
447                    }
448                }
449            } else {
450                let start = i;
451                while i < group.len() && group.get(i).map_or(0, |it| it.markers.len()) > level + 1 {
452                    i += 1;
453                }
454                if i == start {
455                    i += 1;
456                }
457                let nested = group
458                    .get(start..i)
459                    .map(|sub| self.build_lists(sub, level + 1))
460                    .unwrap_or_default();
461                match pairs.last_mut() {
462                    Some(last) => match last.1.last_mut() {
463                        Some(d) => d.extend(nested),
464                        None => last.1.push(nested),
465                    },
466                    None => pairs.push((Vec::new(), vec![nested])),
467                }
468            }
469        }
470        Block::DefinitionList(pairs)
471    }
472
473    fn parse_preformatted(&mut self, chars: &[char], pos: usize) -> (Block, usize) {
474        let n = chars.len();
475        let mut p = pos;
476        let mut lines: Vec<Vec<Inline>> = Vec::new();
477        while at(chars, p) == Some(' ') {
478            let le = line_end(chars, p);
479            let content = collect_range(chars, p + 1, le);
480            lines.push(self.preformatted_line(&content));
481            if le >= n {
482                p = le;
483                break;
484            }
485            p = le + 1;
486        }
487        let mut out: Vec<Inline> = Vec::new();
488        for (idx, mut inlines) in lines.into_iter().enumerate() {
489            if idx > 0 {
490                out.push(Inline::LineBreak);
491            }
492            out.append(&mut inlines);
493        }
494        (Block::Para(out), p)
495    }
496
497    fn parse_block_tag(&mut self, chars: &[char], pos: usize) -> Option<(Block, usize)> {
498        let (name, raw_open, self_closing, after_open) = open_tag(chars, pos)?;
499        match name.as_str() {
500            "blockquote" => {
501                if self_closing {
502                    return Some((Block::BlockQuote(Vec::new()), after_open));
503                }
504                let (inner, after) = enclosed(chars, after_open, "blockquote");
505                let inner_chars: Vec<char> = inner.chars().collect();
506                Some((Block::BlockQuote(self.parse_blocks(&inner_chars)), after))
507            }
508            "pre" => {
509                let (inner, after) = enclosed(chars, after_open, "pre");
510                Some((
511                    Block::CodeBlock(Box::default(), trim_code(&inner).into()),
512                    after,
513                ))
514            }
515            "source" | "syntaxhighlight" => {
516                let (inner, after) = enclosed(chars, after_open, &name);
517                let mut classes = Vec::new();
518                if let Some(lang) = tag_attribute(&raw_open, "lang")
519                    && !lang.is_empty()
520                {
521                    classes.push(lang.into());
522                }
523                let attr = Attr {
524                    id: carta_ast::Text::default(),
525                    classes,
526                    attributes: Vec::new(),
527                };
528                Some((
529                    Block::CodeBlock(Box::new(attr), trim_code(&inner).into()),
530                    after,
531                ))
532            }
533            "ul" => Some(self.parse_html_list(chars, after_open, false, &raw_open, self_closing)),
534            "ol" => Some(self.parse_html_list(chars, after_open, true, &raw_open, self_closing)),
535            _ => None,
536        }
537    }
538
539    /// Parses an HTML `<ul>`/`<ol>` list into a native list block. Each `<li>` becomes one item whose
540    /// content is parsed as blocks, with a leading paragraph rendered as plain text; nested `<ul>`/
541    /// `<ol>` lists nest. For an ordered list, a `start` attribute sets the first number while `type`
542    /// and any per-item `value` are ignored. Whitespace between items is skipped; the first stray
543    /// (non-`<li>`) content ends the list, leaving the remainder to be parsed as ordinary blocks.
544    fn parse_html_list(
545        &mut self,
546        chars: &[char],
547        start: usize,
548        ordered: bool,
549        raw_open: &str,
550        self_closing: bool,
551    ) -> (Block, usize) {
552        let mut items: Vec<Vec<Block>> = Vec::new();
553        let mut i = start;
554        let close_name = if ordered { "ol" } else { "ul" };
555        if !self_closing {
556            loop {
557                while at(chars, i).is_some_and(char::is_whitespace) {
558                    i += 1;
559                }
560                if at(chars, i) == Some('<')
561                    && at(chars, i + 1) == Some('/')
562                    && tag_name_matches(chars, i + 2, close_name)
563                    && let Some((_, _, after)) = close_tag_parse(chars, i)
564                {
565                    i = after;
566                    break;
567                }
568                if at(chars, i) == Some('<')
569                    && at(chars, i + 1) != Some('/')
570                    && tag_name_matches(chars, i + 1, "li")
571                    && let Some((_, _, _self_closing, after_li)) = open_tag(chars, i)
572                {
573                    let (content_end, next) = html_li_content_bounds(chars, after_li);
574                    let content: Vec<char> = collect_range(chars, after_li, content_end)
575                        .chars()
576                        .collect();
577                    let mut blocks = self.parse_blocks(&content);
578                    if let Some(Block::Para(inlines)) = blocks.first() {
579                        let inlines = inlines.clone();
580                        if let Some(first) = blocks.first_mut() {
581                            *first = Block::Plain(inlines);
582                        }
583                    }
584                    items.push(blocks);
585                    i = next;
586                    continue;
587                }
588                break;
589            }
590        }
591        let block = if ordered {
592            let start_num = tag_attribute(raw_open, "start")
593                .and_then(|value| value.trim().parse::<i32>().ok())
594                .unwrap_or(1);
595            Block::OrderedList(
596                ListAttributes {
597                    start: start_num,
598                    style: ListNumberStyle::DefaultStyle,
599                    delim: ListNumberDelim::DefaultDelim,
600                },
601                items,
602            )
603        } else {
604            Block::BulletList(items)
605        };
606        (block, i)
607    }
608
609    fn parse_paragraph(
610        &mut self,
611        chars: &[char],
612        pos: usize,
613        scan: &mut HeaderScan,
614    ) -> (Vec<Block>, usize) {
615        let n = chars.len();
616        let mut pieces: Vec<String> = Vec::new();
617        let mut cur = pos;
618        loop {
619            let le = line_end(chars, cur);
620            pieces.push(collect_range(chars, cur, le));
621            if le >= n {
622                cur = le;
623                break;
624            }
625            let next = le + 1;
626            if next >= n {
627                cur = next;
628                break;
629            }
630            // A `<ref>` whose `</ref>` has not yet been seen keeps the paragraph open across a blank
631            // line so the note's body — including any internal paragraph breaks — is captured whole.
632            // A line that would otherwise begin a block only stays attached when the open note reads
633            // as block content (its body began on a fresh line); a note opened with text on the same
634            // line reads inline and ends at such a line instead.
635            let ref_open = open_ref_depth(chars, pos, next) > 0;
636            let next_end = line_end(chars, next);
637            if is_blank(chars, next, next_end) {
638                if ref_open {
639                    cur = next;
640                    continue;
641                }
642                cur = if next_end < n { next_end + 1 } else { next_end };
643                break;
644            }
645            if line_starts_block_scan(chars, next, scan) {
646                if ref_open && open_ref_block_bodied(chars, pos, next) {
647                    cur = next;
648                    continue;
649                }
650                cur = next;
651                break;
652            }
653            cur = next;
654        }
655        let raw = pieces.join("\n");
656        let trimmed = raw.trim();
657        if trimmed.is_empty() {
658            return (Vec::new(), cur);
659        }
660        (self.parse_block_content(trimmed), cur)
661    }
662
663    /// Parses a paragraph's text into blocks. Recognized block-level HTML tags split the run: the
664    /// text on either side becomes its own paragraph and each tag becomes a raw block, so a `<div>`
665    /// embedded in prose interrupts the paragraph exactly where it appears.
666    fn parse_block_content(&mut self, text: &str) -> Vec<Block> {
667        let chars: Vec<char> = text.chars().collect();
668        let toks = self.lex(&chars, false, true);
669        let smart = self.smart();
670        let east_asian = self.extensions.contains(Extension::EastAsianLineBreaks);
671        let mut blocks: Vec<Block> = Vec::new();
672        let mut segment: Vec<Tok> = Vec::new();
673        for tok in toks {
674            match tok {
675                Tok::BlockRaw(raw) => {
676                    flush_para_segment(&mut segment, &mut blocks, smart, east_asian);
677                    blocks.push(Block::RawBlock(format_html(), raw.into()));
678                }
679                Tok::Block(block) => {
680                    flush_para_segment(&mut segment, &mut blocks, smart, east_asian);
681                    blocks.push(block);
682                }
683                Tok::BlockBreak => flush_para_segment(&mut segment, &mut blocks, smart, east_asian),
684                other => segment.push(other),
685            }
686        }
687        flush_para_segment(&mut segment, &mut blocks, smart, east_asian);
688        blocks
689    }
690
691    /// Parses a `{|`-delimited table into a [`Block::Table`], returning the index past the closing
692    /// `|}`. Table and row attribute lists are dropped; a cell's attribute list supplies its
693    /// alignment, spans, identifier, and classes. The first row becomes the header when its first
694    /// cell is a `!` header cell.
695    fn parse_table(&mut self, chars: &[char], pos: usize) -> (Block, usize) {
696        let after = table_block_end(chars, pos);
697        let region = collect_range(chars, pos, after);
698        (self.build_table(&region), after)
699    }
700
701    fn build_table(&mut self, region: &str) -> Block {
702        let (mut rows, caption_text) = scan_table_region(region);
703        // The first row may omit its leading `|-` separator, so a `|-` seen before any cell merely
704        // opens the first row rather than closing an empty one: an empty leading segment is dropped.
705        // Every later `|-` closes a row, so empty rows elsewhere are kept.
706        if rows.first().is_some_and(Vec::is_empty) {
707            rows.remove(0);
708        }
709        if rows.is_empty() {
710            // A table with no cells still yields one empty row.
711            rows.push(Vec::new());
712        }
713
714        let n_rows = rows.len();
715        // The first row fixes the column count; cells that overflow it in later rows are dropped.
716        let ncols = rows.first().map_or(0, |r| {
717            r.iter().map(|c| col_count(c.col_span)).sum::<usize>()
718        });
719        let col_specs = column_specs(&rows, ncols);
720
721        let is_header_first = rows
722            .first()
723            .and_then(|r| r.first())
724            .is_some_and(|c| c.is_header);
725
726        let ast_rows = self.lay_grid(&rows, ncols, n_rows);
727
728        let (head_rows, body_rows) = if is_header_first {
729            let mut iter = ast_rows.into_iter();
730            let head: Vec<Row> = iter.next().into_iter().collect();
731            (head, iter.collect::<Vec<Row>>())
732        } else {
733            (Vec::new(), ast_rows)
734        };
735
736        let caption = match caption_text {
737            Some(text) => {
738                let inlines = self.parse_inlines(text.trim());
739                if inlines.is_empty() {
740                    Caption::default()
741                } else {
742                    Caption {
743                        short: None,
744                        long: vec![Block::Plain(inlines)],
745                    }
746                }
747            }
748            None => Caption::default(),
749        };
750
751        Block::Table(Box::new(Table {
752            attr: Attr::default(),
753            caption,
754            col_specs,
755            head: TableHead {
756                attr: Attr::default(),
757                rows: head_rows,
758            },
759            bodies: vec![TableBody {
760                attr: Attr::default(),
761                row_head_columns: 0,
762                head: Vec::new(),
763                body: body_rows,
764            }],
765            foot: TableFoot::default(),
766        }))
767    }
768
769    /// Lays the parsed cells onto a fixed `ncols`-wide grid so spans stay in bounds: a `rowspan`
770    /// cannot reach past the last row, a `colspan` cannot reach past the last column (an overflowing
771    /// cell is dropped), a cell skips columns still covered by a `rowspan` from an earlier row, and
772    /// any column a row leaves uncovered is filled with an empty cell.
773    fn lay_grid(&mut self, rows: &[Vec<RawCell>], ncols: usize, n_rows: usize) -> Vec<Row> {
774        let mut ast_rows: Vec<Row> = Vec::new();
775        let mut occupied: Vec<i32> = vec![0; ncols];
776        for (r, raw) in rows.iter().enumerate() {
777            let available = i32::try_from(n_rows.saturating_sub(r)).unwrap_or(i32::MAX);
778            let mut cells: Vec<Cell> = Vec::new();
779            let mut col = 0usize;
780            for c in raw {
781                while col < ncols && occupied.get(col).copied().unwrap_or(0) > 0 {
782                    col += 1;
783                }
784                if col >= ncols {
785                    break;
786                }
787                let col_span = col_count(c.col_span).min(ncols - col);
788                let row_span = c.row_span.max(1).min(available);
789                let content_chars: Vec<char> = c.content.trim().chars().collect();
790                let content = self.parse_cell_blocks(&content_chars);
791                cells.push(Cell {
792                    attr: c.attr.clone(),
793                    align: c.align.clone(),
794                    row_span,
795                    col_span: i32::try_from(col_span).unwrap_or(1),
796                    content,
797                });
798                for k in col..col + col_span {
799                    if let Some(slot) = occupied.get_mut(k) {
800                        *slot = row_span;
801                    }
802                }
803                col += col_span;
804            }
805            while col < ncols {
806                if occupied.get(col).copied().unwrap_or(0) == 0 {
807                    cells.push(empty_cell());
808                }
809                col += 1;
810            }
811            for slot in &mut occupied {
812                *slot = (*slot - 1).max(0);
813            }
814            ast_rows.push(Row {
815                attr: Attr::default(),
816                cells,
817            });
818        }
819        ast_rows
820    }
821
822    /// Parses a table cell's content. On the cell's first line the list and heading markers
823    /// `* # ; =` are inert and read as plain paragraph text; from the second line on every marker is
824    /// recognized again. Definition (`:`), horizontal rules, templates, and nested tables stay
825    /// active even on the first line.
826    fn parse_cell_blocks(&mut self, chars: &[char]) -> Vec<Block> {
827        let first = at(chars, 0);
828        let suppressed = matches!(first, Some('*' | '#' | ';'))
829            || (first == Some('=') && is_header_line_within(chars, 0));
830        if !suppressed {
831            return self.parse_blocks(chars);
832        }
833        let (mut blocks, after) = self.parse_paragraph(chars, 0, &mut HeaderScan::default());
834        if let Some(rest) = chars.get(after..) {
835            blocks.extend(self.parse_blocks(rest));
836        }
837        blocks
838    }
839
840    /// Parses the content of a `<ref>` note as blocks; a lone paragraph becomes a [`Block::Plain`].
841    fn note_blocks(&mut self, chars: &[char]) -> Vec<Block> {
842        let blocks = self.parse_blocks(chars);
843        match blocks.as_slice() {
844            [Block::Para(inlines)] => vec![Block::Plain(inlines.clone())],
845            _ => blocks,
846        }
847    }
848
849    fn parse_inlines(&mut self, text: &str) -> Vec<Inline> {
850        let chars: Vec<char> = text.chars().collect();
851        let toks = self.lex(&chars, false, false);
852        let mut inlines = coalesce(resolve_emphasis(toks));
853        if self.extensions.contains(Extension::EastAsianLineBreaks) {
854            inlines = drop_east_asian_breaks(inlines);
855        }
856        if self.smart() {
857            inlines = apply_smart_quotes(inlines);
858        }
859        inlines
860    }
861
862    /// Parses one preformatted line: markup is honored, but literal text and its exact spacing are
863    /// preserved as code spans rather than collapsed.
864    fn preformatted_line(&mut self, text: &str) -> Vec<Inline> {
865        let chars: Vec<char> = text.chars().collect();
866        let toks = self.lex(&chars, true, false);
867        preformat_transform(resolve_emphasis(toks))
868    }
869
    /// Scans `chars` into a flat token stream for the inline passes that follow.
    ///
    /// Literal characters accumulate in a pending `word`, which is flushed into `toks` before a
    /// markup token is pushed and once more at the end of input. Each iteration tries the markup
    /// forms in a fixed order — apostrophe runs, whitespace, `&` references, `<` tags, `{{`
    /// templates, `[` links, bare URLs — and a character none of them claims is appended to `word`.
    ///
    /// With `preformatted` set, whitespace is kept verbatim inside `word` instead of becoming a
    /// whitespace token. With `block_context` set, a `<` accepted by `starts_block_tag` may be
    /// parsed by `parse_block_tag` and emitted as a [`Tok::Block`].
    #[allow(clippy::too_many_lines)]
    fn lex(&mut self, chars: &[char], preformatted: bool, block_context: bool) -> Vec<Tok> {
        let mut toks: Vec<Tok> = Vec::new();
        let mut word = String::new();
        let mut i = 0;
        let n = chars.len();
        while i < n {
            let Some(c) = at(chars, i) else { break };
            // Apostrophes: a run of two or more is an emphasis delimiter whose length is resolved
            // later; a single apostrophe is ordinary text.
            if c == '\'' {
                let mut end = i;
                while at(chars, end) == Some('\'') {
                    end += 1;
                }
                let run = end - i;
                if run >= 2 {
                    flush_word(&mut word, &mut toks);
                    toks.push(Tok::Apostrophes(run));
                } else {
                    word.push('\'');
                }
                i = end;
                continue;
            }
            // Whitespace ends the pending word and becomes its own token, except in preformatted
            // mode where it stays part of the word so spacing is preserved.
            if c.is_whitespace() {
                if preformatted {
                    word.push(c);
                    i += 1;
                    continue;
                }
                flush_word(&mut word, &mut toks);
                let (token, next) = whitespace_token(chars, i);
                toks.push(Tok::Inline(token));
                i = next;
                continue;
            }
            // `&`: a readable entity reference is decoded straight into the word; otherwise the
            // ampersand is literal.
            if c == '&' {
                if let Some((decoded, next)) = entities::read_reference(chars, i, chars.len(), true)
                {
                    word.push_str(&decoded);
                    i = next;
                } else {
                    word.push('&');
                    i += 1;
                }
                continue;
            }
            // `<`: try an inline tag first, then (only in block context) a block tag parsed in
            // place, then a block-tag token; if none applies the `<` is literal text.
            if c == '<' {
                if let Some((inlines, next)) = self.handle_tag(chars, i) {
                    flush_word(&mut word, &mut toks);
                    for inline in inlines {
                        toks.push(Tok::Inline(inline));
                    }
                    i = next;
                    continue;
                }
                if block_context
                    && starts_block_tag(chars, i)
                    && let Some((block, next)) = self.parse_block_tag(chars, i)
                {
                    flush_word(&mut word, &mut toks);
                    toks.push(Tok::Block(block));
                    i = next;
                    continue;
                }
                if let Some((tok, next)) = block_tag_token(chars, i) {
                    flush_word(&mut word, &mut toks);
                    toks.push(tok);
                    i = next;
                    continue;
                }
                word.push('<');
                i += 1;
                continue;
            }
            // `{{`: a balanced template is passed through verbatim as raw inline markup.
            if c == '{' && at(chars, i + 1) == Some('{') {
                if template_opens(chars, i)
                    && let Some(after) = balanced_braces(chars, i)
                {
                    flush_word(&mut word, &mut toks);
                    let raw = collect_range(chars, i, after);
                    toks.push(Tok::Inline(Inline::RawInline(
                        format_mediawiki(),
                        raw.into(),
                    )));
                    i = after;
                    continue;
                }
                // Not a template: keep one literal `{` and rescan from the next character.
                word.push('{');
                i += 1;
                continue;
            }
            // `[`: `[[` is tried as an internal link, a single `[` as an external link.
            if c == '[' {
                let handled = if at(chars, i + 1) == Some('[') {
                    self.internal_link(chars, i)
                } else {
                    self.external_link(chars, i)
                };
                if let Some((inlines, next)) = handled {
                    flush_word(&mut word, &mut toks);
                    for inline in inlines {
                        toks.push(Tok::Inline(inline));
                    }
                    i = next;
                    continue;
                }
                // A single `[` glued to a bare URL is a literal bracket followed by that URL.
                if at(chars, i + 1) != Some('[')
                    && let Some((inline, next)) = bare_url(chars, i + 1)
                {
                    word.push('[');
                    flush_word(&mut word, &mut toks);
                    toks.push(Tok::Inline(inline));
                    i = next;
                    continue;
                }
                word.push('[');
                i += 1;
                continue;
            }
            // A bare URL is only looked for when no word is pending, i.e. at the start of a word.
            if word.is_empty()
                && let Some((inline, next)) = bare_url(chars, i)
            {
                toks.push(Tok::Inline(inline));
                i = next;
                continue;
            }
            // Anything else is ordinary text.
            word.push(c);
            i += 1;
        }
        flush_word(&mut word, &mut toks);
        toks
    }
1002
1003    #[allow(clippy::too_many_lines)]
1004    fn handle_tag(&mut self, chars: &[char], i: usize) -> Option<(Vec<Inline>, usize)> {
1005        if at(chars, i) != Some('<') {
1006            return None;
1007        }
1008        match at(chars, i + 1) {
1009            Some('/') => {
1010                let (name, raw, after) = close_tag_parse(chars, i)?;
1011                return match html_tag_role(&name) {
1012                    Some(HtmlTagRole::Inline) => Some((vec![raw_html(raw)], after)),
1013                    _ => None,
1014                };
1015            }
1016            Some(c) if c.is_ascii_alphabetic() => {}
1017            _ => return None,
1018        }
1019        let (name, raw_open, self_closing, after_open) = open_tag(chars, i)?;
1020        match name.as_str() {
1021            "br" => Some((vec![Inline::LineBreak], after_open)),
1022            "ref" => {
1023                if self_closing {
1024                    return Some((vec![Inline::Note(Vec::new())], after_open));
1025                }
1026                match close_tag(chars, after_open, "ref") {
1027                    Some((inner_end, after)) => {
1028                        let inner = collect_range(chars, after_open, inner_end);
1029                        let inner_chars: Vec<char> = inner.chars().collect();
1030                        Some((vec![Inline::Note(self.note_blocks(&inner_chars))], after))
1031                    }
1032                    None => Some((vec![raw_html(raw_open)], after_open)),
1033                }
1034            }
1035            "nowiki" => {
1036                if self_closing {
1037                    return Some((Vec::new(), after_open));
1038                }
1039                let (inner, after) = enclosed(chars, after_open, "nowiki");
1040                Some((plain_inlines(&inner), after))
1041            }
1042            "math" => {
1043                if self_closing {
1044                    return Some((Vec::new(), after_open));
1045                }
1046                match close_tag(chars, after_open, "math") {
1047                    Some((inner_end, after)) => {
1048                        let inner = collect_range(chars, after_open, inner_end);
1049                        Some((
1050                            vec![Inline::Math(MathType::InlineMath, inner.trim().into())],
1051                            after,
1052                        ))
1053                    }
1054                    None => Some((vec![raw_html(raw_open)], after_open)),
1055                }
1056            }
1057            "code" | "tt" => Some(verbatim_code(
1058                chars,
1059                &name,
1060                after_open,
1061                &raw_open,
1062                self_closing,
1063                &[],
1064            )),
1065            "var" => Some(verbatim_code(
1066                chars,
1067                "var",
1068                after_open,
1069                &raw_open,
1070                self_closing,
1071                &["variable"],
1072            )),
1073            "samp" => Some(verbatim_code(
1074                chars,
1075                "samp",
1076                after_open,
1077                &raw_open,
1078                self_closing,
1079                &["sample"],
1080            )),
1081            "sub" => Some(self.wrap(
1082                chars,
1083                "sub",
1084                after_open,
1085                &raw_open,
1086                self_closing,
1087                Inline::Subscript,
1088            )),
1089            "sup" => Some(self.wrap(
1090                chars,
1091                "sup",
1092                after_open,
1093                &raw_open,
1094                self_closing,
1095                Inline::Superscript,
1096            )),
1097            "del" | "strike" => Some(self.wrap(
1098                chars,
1099                &name,
1100                after_open,
1101                &raw_open,
1102                self_closing,
1103                Inline::Strikeout,
1104            )),
1105            "kbd" => Some(self.span(chars, "kbd", after_open, &raw_open, self_closing, "kbd")),
1106            "mark" => Some(self.span(chars, "mark", after_open, &raw_open, self_closing, "mark")),
1107            _ => match html_tag_role(&name) {
1108                Some(HtmlTagRole::Inline) => {
1109                    if self_closing {
1110                        return Some((vec![raw_html(raw_open)], after_open));
1111                    }
1112                    match close_tag(chars, after_open, &name) {
1113                        Some((inner_end, after)) => {
1114                            let inner = collect_range(chars, after_open, inner_end);
1115                            let close_raw = collect_range(chars, inner_end, after);
1116                            let mut out = vec![raw_html(raw_open)];
1117                            out.extend(self.parse_inlines(&inner));
1118                            out.push(raw_html(close_raw));
1119                            Some((out, after))
1120                        }
1121                        None => Some((vec![raw_html(raw_open)], after_open)),
1122                    }
1123                }
1124                // Block-level and unrecognized tags are not inline output: a recognized block tag
1125                // becomes a raw block at the paragraph level, an unrecognized tag stays literal.
1126                _ => None,
1127            },
1128        }
1129    }
1130
1131    fn wrap(
1132        &mut self,
1133        chars: &[char],
1134        name: &str,
1135        after_open: usize,
1136        raw_open: &str,
1137        self_closing: bool,
1138        ctor: fn(Vec<Inline>) -> Inline,
1139    ) -> (Vec<Inline>, usize) {
1140        if self_closing {
1141            return (vec![raw_html(raw_open.to_string())], after_open);
1142        }
1143        match close_tag(chars, after_open, name) {
1144            Some((inner_end, after)) => {
1145                let inner = collect_range(chars, after_open, inner_end);
1146                (vec![ctor(self.parse_inlines(&inner))], after)
1147            }
1148            None => (vec![raw_html(raw_open.to_string())], after_open),
1149        }
1150    }
1151
1152    fn span(
1153        &mut self,
1154        chars: &[char],
1155        name: &str,
1156        after_open: usize,
1157        raw_open: &str,
1158        self_closing: bool,
1159        class: &str,
1160    ) -> (Vec<Inline>, usize) {
1161        if self_closing {
1162            return (vec![raw_html(raw_open.to_string())], after_open);
1163        }
1164        match close_tag(chars, after_open, name) {
1165            Some((inner_end, after)) => {
1166                let inner = collect_range(chars, after_open, inner_end);
1167                let attr = Attr {
1168                    id: carta_ast::Text::default(),
1169                    classes: vec![class.into()],
1170                    attributes: Vec::new(),
1171                };
1172                (
1173                    vec![Inline::Span(Box::new(attr), self.parse_inlines(&inner))],
1174                    after,
1175                )
1176            }
1177            None => (vec![raw_html(raw_open.to_string())], after_open),
1178        }
1179    }
1180
1181    fn external_link(&mut self, chars: &[char], i: usize) -> Option<(Vec<Inline>, usize)> {
1182        let close = find_char(chars, i + 1, ']')?;
1183        let inner = collect_range(chars, i + 1, close);
1184        let (url, label) = match inner.split_once(|c: char| c.is_whitespace()) {
1185            Some((u, rest)) => (u.to_string(), rest.trim_start().to_string()),
1186            None => (inner.clone(), String::new()),
1187        };
1188        if !is_url(&url) {
1189            return None;
1190        }
1191        // A bracketed URL with no label that runs straight into a letter or digit is not a link: the
1192        // bracket stays literal and the URL continues past the `]` as a bare URL.
1193        if label.is_empty() && at(chars, close + 1).is_some_and(char::is_alphanumeric) {
1194            return None;
1195        }
1196        let text = if label.is_empty() {
1197            self.link_counter += 1;
1198            vec![Inline::Str(self.link_counter.to_compact_string())]
1199        } else {
1200            self.parse_inlines(&label)
1201        };
1202        Some((
1203            vec![Inline::Link(
1204                Box::default(),
1205                text,
1206                Box::new(Target {
1207                    url: encode_url_target(&url).into(),
1208                    title: carta_ast::Text::default(),
1209                }),
1210            )],
1211            close + 1,
1212        ))
1213    }
1214
1215    fn internal_link(&mut self, chars: &[char], i: usize) -> Option<(Vec<Inline>, usize)> {
1216        // The target ends at the first `|` or the first `]]`, whichever comes first; nesting is not
1217        // tracked, so a `]]` from an inner link can close an unpiped target.
1218        let start = i + 2;
1219        let (target_end, has_pipe) = scan_link_target(chars, start)?;
1220        let target = collect_range(chars, start, target_end).trim().to_string();
1221
1222        // With a pipe present, the label runs to the `]]` that closes this link, stepping over any
1223        // nested `[[ … ]]` so an inner link does not close the outer one.
1224        let (label_content, close) = if has_pipe {
1225            let label_start = target_end + 1;
1226            let close = find_link_close(chars, label_start)?;
1227            (Some(collect_range(chars, label_start, close)), close)
1228        } else {
1229            (None, target_end)
1230        };
1231
1232        if let Some(ns) = namespace_of(&target) {
1233            if ns == "category" {
1234                let text = match &label_content {
1235                    Some(label) if !label.trim().is_empty() => self.parse_inlines(label),
1236                    _ => self.parse_inlines(&target),
1237                };
1238                let title = title_text(&text);
1239                let attr = Attr {
1240                    id: carta_ast::Text::default(),
1241                    classes: vec!["wikilink".into()],
1242                    attributes: Vec::new(),
1243                };
1244                self.categories.push(Inline::Link(
1245                    Box::new(attr),
1246                    text,
1247                    Box::new(Target {
1248                        url: wikilink_url(&target).into(),
1249                        title: title.into(),
1250                    }),
1251                ));
1252                return Some((Vec::new(), close + 2));
1253            }
1254            // A file or image embed may decline (a parameter it cannot represent as an image); when it
1255            // does, the markup falls through to the ordinary wikilink path below.
1256            if matches!(ns.as_str(), "file" | "image")
1257                && !strip_namespace(&target).is_empty()
1258                && let Some(image) = self.image_embed(&target, label_content.as_deref())
1259            {
1260                return Some((vec![image], close + 2));
1261            }
1262        }
1263        let mut after = close + 2;
1264        let mut trail = String::new();
1265        while let Some(c) = at(chars, after) {
1266            if c.is_ascii_alphabetic() {
1267                trail.push(c);
1268                after += 1;
1269            } else {
1270                break;
1271            }
1272        }
1273        let mut label = match &label_content {
1274            // An empty label invokes the pipe trick: the display text is derived from the target.
1275            Some(l) if l.trim().is_empty() => self.pipe_trick_label(&target),
1276            Some(l) => self.parse_inlines(l),
1277            None => self.parse_inlines(&target),
1278        };
1279        let title = title_text(&label);
1280        if !trail.is_empty() {
1281            label.push(Inline::Str(trail.into()));
1282            label = coalesce(label);
1283        }
1284        let attr = Attr {
1285            id: carta_ast::Text::default(),
1286            classes: vec!["wikilink".into()],
1287            attributes: Vec::new(),
1288        };
1289        let url = wikilink_url(&target);
1290        Some((
1291            vec![Inline::Link(
1292                Box::new(attr),
1293                label,
1294                Box::new(Target {
1295                    url: url.into(),
1296                    title: title.into(),
1297                }),
1298            )],
1299            after,
1300        ))
1301    }
1302
1303    /// The display text the pipe trick derives from an empty-label link's target: the part after the
1304    /// first colon when the target is namespaced (so `Help:Contents` shows as `Contents`), otherwise
1305    /// no text at all.
1306    fn pipe_trick_label(&mut self, target: &str) -> Vec<Inline> {
1307        match target.split_once(':') {
1308            Some((_, rest)) => self.parse_inlines(rest),
1309            None => Vec::new(),
1310        }
1311    }
1312
1313    /// Builds the image for a `[[File:…|…]]` / `[[Image:…|…]]` embed. The page name (with the
1314    /// namespace stripped) is the source; the `WxHpx` parameters set width/height; recognized
1315    /// placement and option keywords are dropped; the last remaining parameter is the caption,
1316    /// defaulting to the file name. A lone embed in its own paragraph later becomes a figure
1317    /// (see [`lone_image_figure`]).
1318    fn image_embed(&mut self, target: &str, params: Option<&str>) -> Option<Inline> {
1319        let url = wikilink_url(strip_namespace(target));
1320        let mut attributes: Vec<(String, String)> = Vec::new();
1321        let mut caption: Option<String> = None;
1322        if let Some(params) = params {
1323            for part in params.split('|') {
1324                let option = part.trim();
1325                if image_param_declines(option) {
1326                    return None;
1327                }
1328                if let Some((width, height)) = image_size(option) {
1329                    attributes.retain(|(key, _)| key != "width" && key != "height");
1330                    attributes.push(("width".to_string(), width));
1331                    if let Some(height) = height {
1332                        attributes.push(("height".to_string(), height));
1333                    }
1334                } else if is_image_keyword(option) || is_recognized_image_attr(option) {
1335                    // A placement or framing keyword, or a recognized `key=value` attribute, carries
1336                    // no caption text. An unrecognized `key=value` is treated as caption text.
1337                } else {
1338                    caption = Some(part.to_string());
1339                }
1340            }
1341        }
1342        let caption = caption.unwrap_or_else(|| url.clone());
1343        let alt = self.parse_inlines(&caption);
1344        let title = title_text(&alt);
1345        let attr = Attr {
1346            id: carta_ast::Text::default(),
1347            classes: Vec::new(),
1348            attributes: attributes
1349                .into_iter()
1350                .map(|(k, v)| (k.into(), v.into()))
1351                .collect(),
1352        };
1353        Some(Inline::Image(
1354            Box::new(attr),
1355            alt,
1356            Box::new(Target {
1357                url: url.into(),
1358                title: title.into(),
1359            }),
1360        ))
1361    }
1362
1363    fn make_id(&mut self, inlines: &[Inline]) -> String {
1364        let plain = to_plain_text(inlines);
1365        if self.extensions.contains(Extension::GfmAutoIdentifiers) {
1366            let base = self.finish_id(slug_gfm, &emoji_to_aliases(&plain));
1367            self.ids.assign_with_separator(base, '-')
1368        } else if self.extensions.contains(Extension::AutoIdentifiers) {
1369            let base = self.finish_id(mediawiki_slug, &plain);
1370            self.ids.assign_with_separator(base, '_')
1371        } else {
1372            String::new()
1373        }
1374    }
1375
1376    /// Builds an identifier with `slug`, then — when `ascii_identifiers` is on — folds the finished
1377    /// slug to pure ASCII (accents stripped, non-Latin letters dropped) and re-slugs it, so a dropped
1378    /// letter leaves its separators intact while a now-leading separator is trimmed. An empty result
1379    /// is mapped to a placeholder during disambiguation.
1380    fn finish_id(&self, slug: fn(&str) -> String, source: &str) -> String {
1381        let mut base = slug(source);
1382        if self.extensions.contains(Extension::AsciiIdentifiers) {
1383            base = slug(&transliterate_ascii(&base));
1384        }
1385        base
1386    }
1387}
1388
1389// --- preprocessing ------------------------------------------------------------------------------
1390
/// Replaces every tab with the spaces needed to reach the next multiple-of-four column, counting
/// columns from zero at the start of each line. Wikitext markup is column-sensitive — a leading
/// space marks preformatted text — so tabs are normalized before any block scanning runs.
fn expand_tabs(input: &str) -> String {
    if !input.contains('\t') {
        return input.to_string();
    }
    let mut expanded = String::with_capacity(input.len());
    // Handling one line at a time makes the column restart at zero after every line break.
    for (index, line) in input.split('\n').enumerate() {
        if index > 0 {
            expanded.push('\n');
        }
        let mut column = 0usize;
        for ch in line.chars() {
            if ch == '\t' {
                let pad = 4 - column % 4;
                for _ in 0..pad {
                    expanded.push(' ');
                }
                column += pad;
            } else {
                expanded.push(ch);
                column += 1;
            }
        }
    }
    expanded
}
1421
1422// --- comment stripping --------------------------------------------------------------------------
1423
1424/// Removes wikitext comments. A comment that is the whole line (preceded by a line start and
1425/// followed by a line end) is dropped together with its trailing newline; one embedded in other
1426/// text collapses to a single space. Verbatim regions (`pre`, `nowiki`, `math`, `source`,
1427/// `syntaxhighlight`) are copied unchanged so comment-like text inside them survives. An
1428/// unterminated `<!--` is left as literal text.
1429fn strip_comments(input: &str) -> String {
1430    let chars: Vec<char> = input.chars().collect();
1431    let n = chars.len();
1432    let mut out = String::new();
1433    let mut i = 0;
1434    while i < n {
1435        let Some(c) = at(&chars, i) else { break };
1436        if c == '<' {
1437            if let Some(after) = verbatim_region_end(&chars, i) {
1438                out.push_str(&collect_range(&chars, i, after));
1439                i = after;
1440                continue;
1441            }
1442            if matches_prefix_ci(&chars, i, "<!--") {
1443                if let Some(dash) = find_seq(&chars, i + 4, &['-', '-', '>']) {
1444                    let comment_end = dash + 3;
1445                    let preceded = i == 0 || at(&chars, i - 1) == Some('\n');
1446                    let followed = comment_end >= n || at(&chars, comment_end) == Some('\n');
1447                    if preceded && followed {
1448                        i = if comment_end < n {
1449                            comment_end + 1
1450                        } else {
1451                            comment_end
1452                        };
1453                    } else if preceded || followed {
1454                        // Adjacent to a line boundary, the comment leaves nothing behind, so the
1455                        // line neither gains a leading space (which would make it preformatted) nor
1456                        // a trailing one.
1457                        i = comment_end;
1458                    } else {
1459                        // Between text, the comment collapses to a single space.
1460                        out.push(' ');
1461                        i = comment_end;
1462                    }
1463                    continue;
1464                }
1465                out.push('<');
1466                i += 1;
1467                continue;
1468            }
1469        }
1470        out.push(c);
1471        i += 1;
1472    }
1473    out
1474}
1475
1476/// If a verbatim tag opens at `i`, the index just past its closing tag (or end of input).
1477fn verbatim_region_end(chars: &[char], i: usize) -> Option<usize> {
1478    let (name, _raw, self_closing, after_open) = open_tag(chars, i)?;
1479    if !matches!(
1480        name.as_str(),
1481        "pre" | "nowiki" | "math" | "source" | "syntaxhighlight"
1482    ) {
1483        return None;
1484    }
1485    if self_closing {
1486        return Some(after_open);
1487    }
1488    match close_tag(chars, after_open, &name) {
1489        Some((_, after)) => Some(after),
1490        None => Some(chars.len()),
1491    }
1492}
1493
/// The behavior-switch words accepted between double underscores (`__WORD__`), spelled in
/// uppercase. Lookups against this table are exact string comparisons, so a lowercase or
/// mixed-case spelling is never treated as a switch.
const BEHAVIOR_SWITCHES: &[&str] = &[
    "ARCHIVEDTALK",
    "DISAMBIG",
    "EXPECTUNUSEDCATEGORY",
    "EXPECTUNUSEDTEMPLATE",
    "FORCETOC",
    "HIDDENCAT",
    "INDEX",
    "NEWSECTIONLINK",
    "NOCC",
    "NOCONTENTCONVERT",
    "NOEDITSECTION",
    "NOGALLERY",
    "NOGLOBAL",
    "NOINDEX",
    "NONEWSECTIONLINK",
    "NOTC",
    "NOTITLECONVERT",
    "NOTOC",
    "STATICREDIRECT",
    "TOC",
];
1519
1520/// Removes every recognized `__WORD__` behavior switch from the text, returning the cleaned text and
1521/// the lowercased names of the switches found in document order. Switches inside verbatim regions
1522/// (`<nowiki>`, `<pre>`, …) are left untouched as literal text.
1523fn extract_behavior_switches(input: &str) -> (String, Vec<String>) {
1524    let chars: Vec<char> = input.chars().collect();
1525    let n = chars.len();
1526    let mut out = String::new();
1527    let mut found: Vec<String> = Vec::new();
1528    let mut i = 0;
1529    while i < n {
1530        if at(&chars, i) == Some('<')
1531            && let Some(after) = verbatim_region_end(&chars, i)
1532        {
1533            out.push_str(&collect_range(&chars, i, after));
1534            i = after;
1535            continue;
1536        }
1537        if at(&chars, i) == Some('_')
1538            && at(&chars, i + 1) == Some('_')
1539            && let Some((word, after)) = behavior_switch_at(&chars, i)
1540        {
1541            let key = word.to_ascii_lowercase();
1542            if !found.contains(&key) {
1543                found.push(key);
1544            }
1545            i = after;
1546            // A switch that begins a line is removed together with the spaces and tabs that follow
1547            // it on that line, so the line does not gain a leading space that would mark it as
1548            // preformatted text; the line break itself is left in place.
1549            if out.is_empty() || out.ends_with('\n') {
1550                while matches!(at(&chars, i), Some(' ' | '\t')) {
1551                    i += 1;
1552                }
1553            }
1554            continue;
1555        }
1556        if let Some(c) = at(&chars, i) {
1557            out.push(c);
1558        }
1559        i += 1;
1560    }
1561    (out, found)
1562}
1563
1564/// Reads a `__WORD__` behavior switch at `i`, returning the uppercase word and the index past it.
1565fn behavior_switch_at(chars: &[char], i: usize) -> Option<(String, usize)> {
1566    let start = i + 2;
1567    let mut j = start;
1568    while at(chars, j).is_some_and(|c| c.is_ascii_uppercase()) {
1569        j += 1;
1570    }
1571    let word = collect_range(chars, start, j);
1572    if word.is_empty()
1573        || at(chars, j) != Some('_')
1574        || at(chars, j + 1) != Some('_')
1575        || !BEHAVIOR_SWITCHES.contains(&word.as_str())
1576    {
1577        return None;
1578    }
1579    Some((word, j + 2))
1580}
1581
1582// --- emphasis resolution ------------------------------------------------------------------------
1583
1584/// A unit of the stream emphasis resolution works over: one apostrophe of a run, or a finished node.
1585enum Unit {
1586    Apostrophe,
1587    Node(Inline),
1588}
1589
1590/// Resolves apostrophe emphasis. Runs of two apostrophes open and close `Emph`, three open and close
1591/// `Strong`. The structure is found by recursive descent with backtracking: at each run the parser
1592/// tries to open the span whose width fits, parses its content up to a matching closing run, and
1593/// falls back to a literal apostrophe when no span can be formed. A span is never reopened by its
1594/// immediate parent of the same kind, and a span's content has its outer whitespace removed.
1595fn resolve_emphasis(toks: Vec<Tok>) -> Vec<Inline> {
1596    let mut units: Vec<Unit> = Vec::new();
1597    for tok in toks {
1598        match tok {
1599            Tok::Inline(inline) => units.push(Unit::Node(inline)),
1600            Tok::Apostrophes(n) => units.extend((0..n).map(|_| Unit::Apostrophe)),
1601            Tok::BlockRaw(raw) => units.push(Unit::Node(raw_html(raw))),
1602            Tok::BlockBreak | Tok::Block(_) => {}
1603        }
1604    }
1605    let runs = apostrophe_runs(&units);
1606    // Bound the backtracking work so adversarial apostrophe-dense input cannot blow up.
1607    let mut budget = units
1608        .len()
1609        .saturating_mul(8)
1610        .saturating_add(64)
1611        .min(200_000);
1612    let (nodes, _, _) = parse_runs(&units, &runs, 0, None, &mut budget);
1613    nodes
1614}
1615
1616/// For each position, the length of the apostrophe run starting there (zero at a non-apostrophe).
1617fn apostrophe_runs(units: &[Unit]) -> Vec<usize> {
1618    let mut runs = vec![0usize; units.len()];
1619    for i in (0..units.len()).rev() {
1620        if matches!(units.get(i), Some(Unit::Apostrophe)) {
1621            let next = runs.get(i + 1).copied().unwrap_or(0);
1622            if let Some(slot) = runs.get_mut(i) {
1623                *slot = 1 + next;
1624            }
1625        }
1626    }
1627    runs
1628}
1629
/// Number of apostrophes that delimit an emphasis kind: two for `Emph`, three for `Strong`.
fn emphasis_width(strong: bool) -> usize {
    2 + usize::from(strong)
}
1634
1635/// Tries to open an emphasis span of the given kind at the apostrophe run starting at `i`. Returns
1636/// the span node and the index just past its closing run, or `None` if no matching closer is found
1637/// or the span would be empty.
1638fn try_open(
1639    units: &[Unit],
1640    runs: &[usize],
1641    i: usize,
1642    strong: bool,
1643    budget: &mut usize,
1644) -> Option<(Inline, usize)> {
1645    if *budget == 0 {
1646        return None;
1647    }
1648    *budget -= 1;
1649    let width = emphasis_width(strong);
1650    let (body, next, closed) = parse_runs(units, runs, i + width, Some(strong), budget);
1651    if !closed || body.is_empty() {
1652        return None;
1653    }
1654    let body = strip_outer_whitespace(body);
1655    Some((
1656        if strong {
1657            Inline::Strong(body)
1658        } else {
1659            Inline::Emph(body)
1660        },
1661        next,
1662    ))
1663}
1664
1665/// Parses content until the run that closes `closer` (or end of input when `closer` is `None`).
1666/// Returns the collected nodes, the index reached, and whether a closer was found.
1667///
1668/// At each apostrophe run, a wider `'''…'''` strong span is preferred over a `''…''` emphasis span,
1669/// and closing the enclosing span takes precedence over opening a same-kind span. A run that opens
1670/// nothing and closes nothing is emitted as literal apostrophes.
1671fn parse_runs(
1672    units: &[Unit],
1673    runs: &[usize],
1674    start: usize,
1675    closer: Option<bool>,
1676    budget: &mut usize,
1677) -> (Vec<Inline>, usize, bool) {
1678    let mut nodes: Vec<Inline> = Vec::new();
1679    let mut pos = start;
1680    while let Some(unit) = units.get(pos) {
1681        match unit {
1682            Unit::Node(inline) => {
1683                nodes.push(inline.clone());
1684                pos += 1;
1685            }
1686            Unit::Apostrophe => {
1687                let run = runs.get(pos).copied().unwrap_or(0);
1688                if run >= emphasis_width(true)
1689                    && closer != Some(true)
1690                    && let Some((span, next)) = try_open(units, runs, pos, true, budget)
1691                {
1692                    nodes.push(span);
1693                    pos = next;
1694                    continue;
1695                }
1696                if let Some(strong) = closer
1697                    && run >= emphasis_width(strong)
1698                {
1699                    return (nodes, pos + emphasis_width(strong), true);
1700                }
1701                if run >= emphasis_width(false)
1702                    && closer != Some(false)
1703                    && let Some((span, next)) = try_open(units, runs, pos, false, budget)
1704                {
1705                    nodes.push(span);
1706                    pos = next;
1707                    continue;
1708                }
1709                nodes.push(Inline::Str("'".into()));
1710                pos += 1;
1711            }
1712        }
1713    }
1714    (nodes, pos, closer.is_none())
1715}
1716
1717/// Removes leading and trailing spaces and soft breaks from a span's content.
1718fn strip_outer_whitespace(mut inlines: Vec<Inline>) -> Vec<Inline> {
1719    let lead = inlines
1720        .iter()
1721        .take_while(|x| matches!(x, Inline::Space | Inline::SoftBreak))
1722        .count();
1723    inlines.drain(0..lead);
1724    while matches!(inlines.last(), Some(Inline::Space | Inline::SoftBreak)) {
1725        inlines.pop();
1726    }
1727    inlines
1728}
1729
1730/// A flattened unit used while pairing smart double quotes: a `"` awaiting a partner, an ordinary
1731/// character, a whitespace inline (which cannot follow an opening quote), or an opaque inline node
1732/// carried through unchanged.
1733enum SmartUnit {
1734    Quote,
1735    Ch(char),
1736    Space(Inline),
1737    Node(Inline),
1738}
1739
1740/// Folds straight double quotes into [`Inline::Quoted`] runs. A double quote followed by
1741/// non-whitespace content opens a run that the next double quote closes; an unpaired quote stays a
1742/// literal `"`. Single quotes, which mark emphasis, are left untouched. The fold also descends into
1743/// the children of container inlines.
1744fn apply_smart_quotes(inlines: Vec<Inline>) -> Vec<Inline> {
1745    let recursed: Vec<Inline> = inlines.into_iter().map(smart_descend).collect();
1746    let units = flatten_smart(recursed);
1747    resolve_double_quotes(&units, 0, units.len())
1748}
1749
1750/// Applies the double-quote fold to the inline children of a container, leaving leaf and opaque
1751/// inlines (text, code, math, raw passthrough, notes) untouched.
1752fn smart_descend(inline: Inline) -> Inline {
1753    match inline {
1754        Inline::Emph(v) => Inline::Emph(apply_smart_quotes(v)),
1755        Inline::Underline(v) => Inline::Underline(apply_smart_quotes(v)),
1756        Inline::Strong(v) => Inline::Strong(apply_smart_quotes(v)),
1757        Inline::Strikeout(v) => Inline::Strikeout(apply_smart_quotes(v)),
1758        Inline::Superscript(v) => Inline::Superscript(apply_smart_quotes(v)),
1759        Inline::Subscript(v) => Inline::Subscript(apply_smart_quotes(v)),
1760        Inline::SmallCaps(v) => Inline::SmallCaps(apply_smart_quotes(v)),
1761        Inline::Quoted(quote_type, v) => Inline::Quoted(quote_type, apply_smart_quotes(v)),
1762        Inline::Span(attr, v) => Inline::Span(attr, apply_smart_quotes(v)),
1763        Inline::Link(attr, v, target) => Inline::Link(attr, apply_smart_quotes(v), target),
1764        Inline::Image(attr, v, target) => Inline::Image(attr, apply_smart_quotes(v), target),
1765        other => other,
1766    }
1767}
1768
1769fn flatten_smart(inlines: Vec<Inline>) -> Vec<SmartUnit> {
1770    let mut units: Vec<SmartUnit> = Vec::new();
1771    for inline in inlines {
1772        match inline {
1773            Inline::Str(text) => {
1774                for c in text.chars() {
1775                    if c == '"' {
1776                        units.push(SmartUnit::Quote);
1777                    } else {
1778                        units.push(SmartUnit::Ch(c));
1779                    }
1780                }
1781            }
1782            space @ (Inline::Space | Inline::SoftBreak | Inline::LineBreak) => {
1783                units.push(SmartUnit::Space(space));
1784            }
1785            other => units.push(SmartUnit::Node(other)),
1786        }
1787    }
1788    units
1789}
1790
1791fn resolve_double_quotes(units: &[SmartUnit], lo: usize, hi: usize) -> Vec<Inline> {
1792    let mut out: Vec<Inline> = Vec::new();
1793    let mut buf = String::new();
1794    let mut i = lo;
1795    while i < hi {
1796        match units.get(i) {
1797            Some(SmartUnit::Quote) => {
1798                if smart_quote_opens(units, i, hi)
1799                    && let Some(j) = next_smart_quote(units, i + 1, hi)
1800                {
1801                    flush_smart_buf(&mut buf, &mut out);
1802                    out.push(Inline::Quoted(
1803                        QuoteType::DoubleQuote,
1804                        strip_outer_whitespace(resolve_double_quotes(units, i + 1, j)),
1805                    ));
1806                    i = j + 1;
1807                } else {
1808                    buf.push('"');
1809                    i += 1;
1810                }
1811            }
1812            Some(SmartUnit::Ch(c)) => {
1813                buf.push(*c);
1814                i += 1;
1815            }
1816            Some(SmartUnit::Space(inline) | SmartUnit::Node(inline)) => {
1817                flush_smart_buf(&mut buf, &mut out);
1818                out.push(inline.clone());
1819                i += 1;
1820            }
1821            None => break,
1822        }
1823    }
1824    flush_smart_buf(&mut buf, &mut out);
1825    out
1826}
1827
1828fn flush_smart_buf(buf: &mut String, out: &mut Vec<Inline>) {
1829    if !buf.is_empty() {
1830        out.push(Inline::Str(std::mem::take(buf).into()));
1831    }
1832}
1833
1834/// A double quote opens a run when the unit immediately after it, within the same span, is
1835/// non-whitespace content.
1836fn smart_quote_opens(units: &[SmartUnit], i: usize, hi: usize) -> bool {
1837    if i + 1 >= hi {
1838        return false;
1839    }
1840    match units.get(i + 1) {
1841        Some(SmartUnit::Ch(c)) => !c.is_whitespace(),
1842        Some(SmartUnit::Quote | SmartUnit::Node(_)) => true,
1843        Some(SmartUnit::Space(_)) | None => false,
1844    }
1845}
1846
1847fn next_smart_quote(units: &[SmartUnit], from: usize, hi: usize) -> Option<usize> {
1848    (from..hi).find(|&j| matches!(units.get(j), Some(SmartUnit::Quote)))
1849}
1850
1851/// Merges adjacent string runs so a span never holds two consecutive [`Inline::Str`] nodes,
1852/// descending into the markup wrappers a reader produces.
1853fn coalesce(inlines: Vec<Inline>) -> Vec<Inline> {
1854    let mut out: Vec<Inline> = Vec::new();
1855    for inline in inlines {
1856        let inline = match inline {
1857            Inline::Emph(xs) => Inline::Emph(coalesce(xs)),
1858            Inline::Strong(xs) => Inline::Strong(coalesce(xs)),
1859            Inline::Strikeout(xs) => Inline::Strikeout(coalesce(xs)),
1860            Inline::Superscript(xs) => Inline::Superscript(coalesce(xs)),
1861            Inline::Subscript(xs) => Inline::Subscript(coalesce(xs)),
1862            Inline::Underline(xs) => Inline::Underline(coalesce(xs)),
1863            Inline::SmallCaps(xs) => Inline::SmallCaps(coalesce(xs)),
1864            Inline::Span(attr, xs) => Inline::Span(attr, coalesce(xs)),
1865            other => other,
1866        };
1867        match (out.last_mut(), &inline) {
1868            (Some(Inline::Str(prev)), Inline::Str(next)) => prev.push_str(next),
1869            // Two whitespace tokens land next to each other only where a zero-width construct (a
1870            // category, an empty element) was removed between them; collapse them to one, keeping a
1871            // soft break if either side carried one.
1872            (
1873                Some(slot @ (Inline::Space | Inline::SoftBreak)),
1874                Inline::Space | Inline::SoftBreak,
1875            ) => {
1876                if matches!(inline, Inline::SoftBreak) {
1877                    *slot = Inline::SoftBreak;
1878                }
1879            }
1880            _ => out.push(inline),
1881        }
1882    }
1883    out
1884}
1885
1886/// Removes a soft line break that falls between two East Asian wide characters, so wrapped CJK text
1887/// rejoins with no intervening space. A break next to a non-wide character, or an explicit space, is
1888/// left as is.
1889fn drop_east_asian_breaks(inlines: Vec<Inline>) -> Vec<Inline> {
1890    let mut out: Vec<Inline> = Vec::with_capacity(inlines.len());
1891    let mut iter = inlines.into_iter().peekable();
1892    while let Some(inline) = iter.next() {
1893        if matches!(inline, Inline::SoftBreak) {
1894            let prev_wide = out.last().and_then(trailing_char).is_some_and(is_wide_char);
1895            let next_wide = iter.peek().and_then(leading_char).is_some_and(is_wide_char);
1896            if prev_wide && next_wide {
1897                continue;
1898            }
1899        }
1900        out.push(inline);
1901    }
1902    out
1903}
1904
1905/// The last rendered character of an inline, descending into wrapper inlines, or `None` for one that
1906/// renders no character at the boundary (a break, image, or note).
1907fn trailing_char(inline: &Inline) -> Option<char> {
1908    match inline {
1909        Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
1910            s.chars().next_back()
1911        }
1912        Inline::Emph(xs)
1913        | Inline::Underline(xs)
1914        | Inline::Strong(xs)
1915        | Inline::Strikeout(xs)
1916        | Inline::Superscript(xs)
1917        | Inline::Subscript(xs)
1918        | Inline::SmallCaps(xs)
1919        | Inline::Quoted(_, xs)
1920        | Inline::Span(_, xs)
1921        | Inline::Link(_, xs, _)
1922        | Inline::Cite(_, xs) => xs.iter().rev().find_map(trailing_char),
1923        _ => None,
1924    }
1925}
1926
1927/// The first rendered character of an inline, descending into wrapper inlines, or `None` for one
1928/// that renders no character at the boundary.
1929fn leading_char(inline: &Inline) -> Option<char> {
1930    match inline {
1931        Inline::Str(s) | Inline::Code(_, s) | Inline::Math(_, s) | Inline::RawInline(_, s) => {
1932            s.chars().next()
1933        }
1934        Inline::Emph(xs)
1935        | Inline::Underline(xs)
1936        | Inline::Strong(xs)
1937        | Inline::Strikeout(xs)
1938        | Inline::Superscript(xs)
1939        | Inline::Subscript(xs)
1940        | Inline::SmallCaps(xs)
1941        | Inline::Quoted(_, xs)
1942        | Inline::Span(_, xs)
1943        | Inline::Link(_, xs, _)
1944        | Inline::Cite(_, xs) => xs.iter().find_map(leading_char),
1945        _ => None,
1946    }
1947}
1948
/// Reports whether `c` falls in one of the code-point ranges this reader treats as East Asian
/// wide or fullwidth — the characters between which a line wrap implies no space.
fn is_wide_char(c: char) -> bool {
    matches!(
        c,
        '\u{1100}'..='\u{115F}'
            | '\u{2329}'
            | '\u{232A}'
            | '\u{2E80}'..='\u{303E}'
            | '\u{3041}'..='\u{33FF}'
            | '\u{3400}'..='\u{4DBF}'
            | '\u{4E00}'..='\u{9FFF}'
            | '\u{A000}'..='\u{A4CF}'
            | '\u{A960}'..='\u{A97F}'
            | '\u{AC00}'..='\u{D7A3}'
            | '\u{F900}'..='\u{FAFF}'
            | '\u{FE10}'..='\u{FE19}'
            | '\u{FE30}'..='\u{FE6F}'
            | '\u{FF00}'..='\u{FF60}'
            | '\u{FFE0}'..='\u{FFE6}'
            | '\u{1B000}'..='\u{1B2FF}'
            | '\u{1F200}'..='\u{1F2FF}'
            | '\u{1F300}'..='\u{1F64F}'
            | '\u{1F900}'..='\u{1F9FF}'
            | '\u{20000}'..='\u{3FFFD}'
    )
}
1975
1976// --- preformatted text --------------------------------------------------------------------------
1977
1978/// Turns a parsed preformatted line into code spans: runs of literal text and spaces become
1979/// [`Inline::Code`] while markup wrappers keep their structure with code interiors. A space inside a
1980/// code run is held as a non-breaking space so the rendered width is preserved.
1981fn preformat_transform(inlines: Vec<Inline>) -> Vec<Inline> {
1982    let mut out: Vec<Inline> = Vec::new();
1983    let mut run = String::new();
1984    for inline in inlines {
1985        match inline {
1986            Inline::Str(s) => run.push_str(&s.replace(' ', "\u{a0}")),
1987            Inline::Space | Inline::SoftBreak => run.push('\u{a0}'),
1988            other => {
1989                if !run.is_empty() {
1990                    out.push(Inline::Code(
1991                        Box::default(),
1992                        std::mem::take(&mut run).into(),
1993                    ));
1994                }
1995                out.push(preformat_descend(other));
1996            }
1997        }
1998    }
1999    if !run.is_empty() {
2000        out.push(Inline::Code(Box::default(), run.into()));
2001    }
2002    out
2003}
2004
2005/// Recurses preformatting into a wrapper inline, leaving leaf inlines (code, math, breaks, raw)
2006/// untouched.
2007fn preformat_descend(inline: Inline) -> Inline {
2008    match inline {
2009        Inline::Emph(xs) => Inline::Emph(preformat_transform(xs)),
2010        Inline::Strong(xs) => Inline::Strong(preformat_transform(xs)),
2011        Inline::Strikeout(xs) => Inline::Strikeout(preformat_transform(xs)),
2012        Inline::Superscript(xs) => Inline::Superscript(preformat_transform(xs)),
2013        Inline::Subscript(xs) => Inline::Subscript(preformat_transform(xs)),
2014        Inline::Underline(xs) => Inline::Underline(preformat_transform(xs)),
2015        Inline::SmallCaps(xs) => Inline::SmallCaps(preformat_transform(xs)),
2016        Inline::Span(attr, xs) => Inline::Span(attr, preformat_transform(xs)),
2017        Inline::Link(attr, xs, target) => Inline::Link(attr, preformat_transform(xs), target),
2018        other => other,
2019    }
2020}
2021
2022// --- plain text & entities ----------------------------------------------------------------------
2023
2024/// Tokenizes literal text (used for `nowiki`): entity references are decoded, whitespace runs become
2025/// a single [`Inline::Space`] or [`Inline::SoftBreak`], and no other markup is recognized.
2026fn plain_inlines(text: &str) -> Vec<Inline> {
2027    let chars: Vec<char> = text.chars().collect();
2028    let n = chars.len();
2029    let mut out: Vec<Inline> = Vec::new();
2030    let mut word = String::new();
2031    let mut i = 0;
2032    while i < n {
2033        let Some(c) = at(&chars, i) else { break };
2034        if c.is_whitespace() {
2035            if !word.is_empty() {
2036                out.push(Inline::Str(std::mem::take(&mut word).into()));
2037            }
2038            let (token, next) = whitespace_token(&chars, i);
2039            out.push(token);
2040            i = next;
2041        } else if c == '&' {
2042            if let Some((decoded, next)) = entities::read_reference(&chars, i, chars.len(), true) {
2043                word.push_str(&decoded);
2044                i = next;
2045            } else {
2046                word.push('&');
2047                i += 1;
2048            }
2049        } else {
2050            word.push(c);
2051            i += 1;
2052        }
2053    }
2054    if !word.is_empty() {
2055        out.push(Inline::Str(word.into()));
2056    }
2057    out
2058}
2059
2060/// Decodes every entity reference in a string, leaving other characters untouched.
2061fn decode_entities(text: &str) -> String {
2062    let chars: Vec<char> = text.chars().collect();
2063    let n = chars.len();
2064    let mut out = String::new();
2065    let mut i = 0;
2066    while i < n {
2067        if at(&chars, i) == Some('&')
2068            && let Some((decoded, next)) = entities::read_reference(&chars, i, chars.len(), true)
2069        {
2070            out.push_str(&decoded);
2071            i = next;
2072            continue;
2073        }
2074        if let Some(c) = at(&chars, i) {
2075            out.push(c);
2076        }
2077        i += 1;
2078    }
2079    out
2080}
2081
2082// --- bare URLs & namespaces ---------------------------------------------------------------------
2083
2084/// Whether `name` (compared case-insensitively) is a recognized URL scheme. Beyond the shared
2085/// registry, this format additionally autolinks the `doi` and `javascript` schemes.
2086fn is_scheme(name: &str) -> bool {
2087    let lower = name.to_ascii_lowercase();
2088    crate::url_schemes::is_scheme(&lower) || lower == "doi" || lower == "javascript"
2089}
2090
2091/// Whether `text` begins with a recognized scheme followed by a colon — the test a bracketed
2092/// `[url label]` target must pass to be a link.
2093fn is_url(text: &str) -> bool {
2094    match text.split_once(':') {
2095        Some((scheme, _)) => {
2096            !scheme.is_empty()
2097                && scheme
2098                    .chars()
2099                    .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
2100                && is_scheme(scheme)
2101        }
2102        None => false,
2103    }
2104}
2105
2106/// The length of a `scheme:` prefix at `i` (the scheme name plus its colon) when the name is a
2107/// recognized scheme, else `None`. The scheme name is the run of letters, digits, `+`, `-`, and `.`
2108/// before the colon.
2109fn url_scheme_len(chars: &[char], i: usize) -> Option<usize> {
2110    let mut j = i;
2111    let mut name = String::new();
2112    while let Some(c) = at(chars, j) {
2113        if c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.') {
2114            name.push(c);
2115            j += 1;
2116        } else {
2117            break;
2118        }
2119    }
2120    if name.is_empty() || at(chars, j) != Some(':') || !is_scheme(&name) {
2121        return None;
2122    }
2123    Some(j - i + 1)
2124}
2125
2126/// Reads a bare URL beginning at a word boundary. The URL runs to the next space or angle bracket,
2127/// after which trailing punctuation and unbalanced brackets are trimmed back. The displayed text
2128/// keeps the characters literally while the link target percent-encodes the unsafe ones. Returns the
2129/// autolink and the index just past the consumed URL.
2130fn bare_url(chars: &[char], i: usize) -> Option<(Inline, usize)> {
2131    let scheme_len = url_scheme_len(chars, i)?;
2132    let mut j = i + scheme_len;
2133    while let Some(c) = at(chars, j) {
2134        if c.is_whitespace() || matches!(c, '<' | '>') {
2135            break;
2136        }
2137        // A run of two or more apostrophes opens emphasis, so it also ends the URL.
2138        if c == '\'' && at(chars, j + 1) == Some('\'') {
2139            break;
2140        }
2141        j += 1;
2142    }
2143    if j <= i + scheme_len {
2144        return None;
2145    }
2146    let mut display = collect_range(chars, i, j);
2147    trim_url_trailing(&mut display);
2148    if display.is_empty() {
2149        return None;
2150    }
2151    let consumed = display.chars().count();
2152    let target = encode_url_target(&display);
2153    Some((
2154        Inline::Link(
2155            Box::default(),
2156            vec![Inline::Str(display.into())],
2157            Box::new(Target {
2158                url: target.into(),
2159                title: carta_ast::Text::default(),
2160            }),
2161        ),
2162        i + consumed,
2163    ))
2164}
2165
/// Removes characters from the end of `url` that belong to the surrounding sentence rather than
/// to the URL.
///
/// Trailing `.`, `,`, `;`, `:`, `!`, `?`, `"`, `*`, `~`, `'`, and `|` are always removed. A
/// trailing `)`, `]`, or `}` is removed only while the URL holds more of that closer than of its
/// opener, so balanced pairs such as `(a)` survive. Trimming repeats until the last character is
/// one that stays.
fn trim_url_trailing(url: &mut String) {
    /// Whether `closer` occurs more often in `text` than `opener` does.
    fn outnumbers(text: &str, closer: char, opener: char) -> bool {
        text.matches(closer).count() > text.matches(opener).count()
    }

    loop {
        let droppable = match url.chars().next_back() {
            Some('.' | ',' | ';' | ':' | '!' | '?' | '"' | '*' | '~' | '\'' | '|') => true,
            Some(')') => outnumbers(url, ')', '('),
            Some(']') => outnumbers(url, ']', '['),
            Some('}') => outnumbers(url, '}', '{'),
            Some(_) | None => false,
        };
        if !droppable {
            break;
        }
        url.pop();
    }
}
2188
/// Produces a link target from `url` by percent-encoding a fixed set of characters — space, `"`,
/// `` ` ``, `^`, `[`, `]`, `{`, `}`, and `|` — and copying every other character unchanged.
fn encode_url_target(url: &str) -> String {
    let mut encoded = String::with_capacity(url.len());
    for ch in url.chars() {
        let escape = match ch {
            ' ' => "%20",
            '"' => "%22",
            '`' => "%60",
            '^' => "%5E",
            '[' => "%5B",
            ']' => "%5D",
            '{' => "%7B",
            '}' => "%7D",
            '|' => "%7C",
            plain => {
                encoded.push(plain);
                continue;
            }
        };
        encoded.push_str(escape);
    }
    encoded
}
2208
/// Derives a wikilink target URL from a page name.
///
/// Each run of whitespace that is followed by a further character becomes a single underscore
/// (so a leading run yields a leading `_`); a run at the very end is dropped. All other
/// characters are kept as written.
fn wikilink_url(target: &str) -> String {
    let mut url = String::new();
    let mut gap_pending = false;
    for ch in target.chars() {
        if ch.is_whitespace() {
            gap_pending = true;
            continue;
        }
        // The underscore is emitted lazily, only once a character follows the gap.
        if std::mem::take(&mut gap_pending) {
            url.push('_');
        }
        url.push(ch);
    }
    url
}
2227
2228/// Flatten inline content into the plain string stored as a link or image title. Markup wrappers
2229/// unwrap to their contents and breaks collapse to a space, as for any plain-text flattening, but a
2230/// [`Inline::Quoted`] node renders the matching curly quote glyphs around its contents so a curled
2231/// quotation survives into the title text.
2232fn title_text(inlines: &[Inline]) -> String {
2233    let mut out = String::new();
2234    push_title_text(inlines, &mut out);
2235    out
2236}
2237
2238fn push_title_text(inlines: &[Inline], out: &mut String) {
2239    for inline in inlines {
2240        match inline {
2241            Inline::Str(text) | Inline::Code(_, text) | Inline::Math(_, text) => out.push_str(text),
2242            Inline::Space | Inline::SoftBreak | Inline::LineBreak => out.push(' '),
2243            Inline::Quoted(QuoteType::SingleQuote, xs) => {
2244                out.push('\u{2018}');
2245                push_title_text(xs, out);
2246                out.push('\u{2019}');
2247            }
2248            Inline::Quoted(QuoteType::DoubleQuote, xs) => {
2249                out.push('\u{201c}');
2250                push_title_text(xs, out);
2251                out.push('\u{201d}');
2252            }
2253            Inline::Emph(xs)
2254            | Inline::Underline(xs)
2255            | Inline::Strong(xs)
2256            | Inline::Strikeout(xs)
2257            | Inline::Superscript(xs)
2258            | Inline::Subscript(xs)
2259            | Inline::SmallCaps(xs)
2260            | Inline::Cite(_, xs)
2261            | Inline::Link(_, xs, _)
2262            | Inline::Image(_, xs, _)
2263            | Inline::Span(_, xs) => push_title_text(xs, out),
2264            Inline::RawInline(..) | Inline::Note(_) => {}
2265        }
2266    }
2267}
2268
/// Extracts the namespace prefix of a link target: the text before the first `:`, trimmed and
/// lowercased.
///
/// Returns `None` when the target contains no colon or starts with one (as in `:Category:Foo`).
fn namespace_of(target: &str) -> Option<String> {
    target
        .split_once(':')
        // An empty prefix means the target itself began with the colon.
        .filter(|(prefix, _)| !prefix.is_empty())
        .map(|(prefix, _)| prefix.trim().to_lowercase())
}
2276
2277// --- image embeds -------------------------------------------------------------------------------
2278
/// The page name that follows the first `:` of `target`, trimmed of surrounding whitespace. A
/// target without any `:` is returned as is, untrimmed.
fn strip_namespace(target: &str) -> &str {
    target
        .split_once(':')
        .map_or(target, |(_, page)| page.trim())
}
2286
/// Reads a pixel-size image parameter of the form `<w>px`, `x<h>px`, or `<w>x<h>px`.
///
/// Returns the width digits (an empty string for the `x<h>px` form) and, when an `x` is present,
/// the height digits. Anything else — a missing `px` suffix, non-digit characters, an empty height
/// after `x`, or an empty width without `x` — yields `None`.
fn image_size(param: &str) -> Option<(String, Option<String>)> {
    fn all_digits(text: &str) -> bool {
        text.chars().all(|c| c.is_ascii_digit())
    }

    let spec = param.strip_suffix("px")?;
    let (width, height) = match spec.split_once('x') {
        Some((w, h)) => (w, Some(h)),
        None => (spec, None),
    };
    let well_formed = match height {
        // With an `x`, the width may be empty but the height may not.
        Some(h) => all_digits(width) && !h.is_empty() && all_digits(h),
        // Without an `x`, the lone number is the width and must be present.
        None => !width.is_empty() && all_digits(width),
    };
    well_formed.then(|| (String::from(width), height.map(String::from)))
}
2303
/// Whether an image parameter makes the embed decline, so the markup is read as an ordinary
/// wikilink rather than an image.
///
/// That is the case for `thumbtime` (bare or as `thumbtime=…`) and for `upright=…` with an explicit
/// value; a bare `upright` does not decline. Keys are compared case-insensitively after trimming.
fn image_param_declines(param: &str) -> bool {
    if let Some((key, _value)) = param.split_once('=') {
        let key = key.trim();
        key.eq_ignore_ascii_case("thumbtime") || key.eq_ignore_ascii_case("upright")
    } else {
        param.trim().eq_ignore_ascii_case("thumbtime")
    }
}
2317
/// Whether an image parameter is a `key=value` pair whose key is one of the recognized attributes
/// `alt`, `link`, `class`, or `page` (trimmed, case-insensitive). Such a parameter is consumed
/// without contributing caption text; any other `key=value` becomes caption text.
fn is_recognized_image_attr(param: &str) -> bool {
    const RECOGNIZED: [&str; 4] = ["alt", "link", "class", "page"];
    param.split_once('=').is_some_and(|(key, _value)| {
        let key = key.trim();
        RECOGNIZED
            .iter()
            .any(|known| key.eq_ignore_ascii_case(known))
    })
}
2330
/// Whether an image parameter is one of the recognized placement, framing, or alignment keywords,
/// none of which carries caption text. The comparison ignores ASCII case but is otherwise exact:
/// the parameter is not trimmed here.
fn is_image_keyword(param: &str) -> bool {
    const KEYWORDS: [&str; 20] = [
        "thumb",
        "thumbnail",
        "frame",
        "framed",
        "frameless",
        "border",
        "left",
        "right",
        "center",
        "centre",
        "none",
        "upright",
        "baseline",
        "sub",
        "super",
        "top",
        "text-top",
        "middle",
        "bottom",
        "text-bottom",
    ];
    KEYWORDS
        .iter()
        .any(|keyword| param.eq_ignore_ascii_case(keyword))
}
2358
2359/// Wraps a paragraph whose only content is an image in a figure, moving the image's description to
2360/// the figure caption; any other paragraph is returned unchanged.
2361fn para_or_figure(inlines: Vec<Inline>) -> Block {
2362    match lone_image_figure(&inlines) {
2363        Some(figure) => figure,
2364        None => Block::Para(inlines),
2365    }
2366}
2367
2368/// As [`para_or_figure`], for a context (a list item) whose tight content is a [`Block::Plain`].
2369fn plain_or_figure(inlines: Vec<Inline>) -> Block {
2370    match lone_image_figure(&inlines) {
2371        Some(figure) => figure,
2372        None => Block::Plain(inlines),
2373    }
2374}
2375
2376/// Builds a figure from a paragraph that holds a single image (ignoring surrounding whitespace),
2377/// or `None` when the paragraph is anything else.
2378fn lone_image_figure(inlines: &[Inline]) -> Option<Block> {
2379    let mut significant = inlines.iter().filter(|inline| {
2380        !matches!(
2381            inline,
2382            Inline::Space | Inline::SoftBreak | Inline::LineBreak
2383        )
2384    });
2385    let Inline::Image(attr, alt, target) = significant.next()? else {
2386        return None;
2387    };
2388    if significant.next().is_some() {
2389        return None;
2390    }
2391    let caption = Caption {
2392        short: None,
2393        long: vec![Block::Plain(alt.clone())],
2394    };
2395    let image = Inline::Image(attr.clone(), Vec::new(), target.clone());
2396    Some(Block::Figure(
2397        Box::default(),
2398        Box::new(caption),
2399        vec![Block::Plain(vec![image])],
2400    ))
2401}
2402
2403// --- identifiers --------------------------------------------------------------------------------
2404
2405/// Under the `gfm_auto_identifiers` scheme each emoji that has a known shortname contributes that
2406/// name to the identifier in place of the raw character. Spans of text with no emoji pass through
2407/// unchanged; the shortname is spliced in directly, without inserting word boundaries.
2408fn emoji_to_aliases(text: &str) -> String {
2409    let mut out = String::new();
2410    let mut rest = text;
2411    while !rest.is_empty() {
2412        if let Some((alias, len)) = emoji::alias_at(rest) {
2413            out.push_str(alias);
2414            rest = rest.get(len..).unwrap_or("");
2415        } else if let Some(ch) = rest.chars().next() {
2416            out.push(ch);
2417            rest = rest.get(ch.len_utf8()..).unwrap_or("");
2418        } else {
2419            break;
2420        }
2421    }
2422    out
2423}
2424
/// Derives a heading identifier under the `auto_identifiers` scheme.
///
/// Alphanumerics, `_`, and `.` are kept (lowercased); a run of whitespace becomes one `_`; every
/// hyphen becomes its own `_`; all other characters are dropped. Finally, everything before the
/// first alphabetic character is removed.
fn mediawiki_slug(text: &str) -> String {
    let mut slug = String::new();
    // True while the most recent emitted separator came from whitespace that is still "running".
    let mut collapsing = false;
    for ch in text.chars() {
        match ch {
            c if c.is_whitespace() => {
                if !collapsing {
                    slug.push('_');
                }
                collapsing = true;
            }
            '-' => {
                slug.push('_');
                collapsing = false;
            }
            c if c.is_alphanumeric() || matches!(c, '_' | '.') => {
                slug.extend(c.to_lowercase());
                collapsing = false;
            }
            // Dropped punctuation deliberately leaves `collapsing` alone, so whitespace on both
            // sides of it still counts as one run: `Foo : Bar` yields a single `_`.
            _ => {}
        }
    }
    slug.trim_start_matches(|c: char| !c.is_alphabetic())
        .to_owned()
}
2450
2451/// Transliterates text to ASCII for `ascii_identifiers`: an ASCII character is kept as is, a
2452/// character whose canonical decomposition begins with an ASCII letter or digit folds to that
2453/// character (so `é` becomes `e`), and any other non-ASCII character is dropped (so `Œ`, `ß`, and
2454/// `½` vanish). The result is then slugged like any other identifier.
2455fn transliterate_ascii(text: &str) -> String {
2456    let mut out = String::with_capacity(text.len());
2457    for ch in text.chars() {
2458        if ch.is_ascii() {
2459            out.push(ch);
2460        } else if let Ok(index) = ASCII_FOLD.binary_search_by(|&(cp, _)| cp.cmp(&(ch as u32)))
2461            && let Some(&(_, byte)) = ASCII_FOLD.get(index)
2462        {
2463            out.push(byte as char);
2464        }
2465    }
2466    out
2467}
2468
/// The ASCII fold for `ascii_identifiers`, keyed by Unicode code point and kept sorted for binary
/// search. Each entry maps a precomposed character to the ASCII base letter its canonical
/// decomposition begins with; characters with no ASCII base are absent and are dropped instead.
///
/// Every value in the table is stored lowercase, including for uppercase sources (`À`, U+00C0,
/// folds to `a`). Entries must stay in strictly ascending code-point order: `transliterate_ascii`
/// looks characters up with a binary search.
const ASCII_FOLD: &[(u32, u8)] = &[
    // Latin-1 Supplement (U+00C0..=U+00FF).
    (0x00C0, b'a'),
    (0x00C1, b'a'),
    (0x00C2, b'a'),
    (0x00C3, b'a'),
    (0x00C4, b'a'),
    (0x00C5, b'a'),
    (0x00C7, b'c'),
    (0x00C8, b'e'),
    (0x00C9, b'e'),
    (0x00CA, b'e'),
    (0x00CB, b'e'),
    (0x00CC, b'i'),
    (0x00CD, b'i'),
    (0x00CE, b'i'),
    (0x00CF, b'i'),
    (0x00D1, b'n'),
    (0x00D2, b'o'),
    (0x00D3, b'o'),
    (0x00D4, b'o'),
    (0x00D5, b'o'),
    (0x00D6, b'o'),
    (0x00D9, b'u'),
    (0x00DA, b'u'),
    (0x00DB, b'u'),
    (0x00DC, b'u'),
    (0x00DD, b'y'),
    (0x00E0, b'a'),
    (0x00E1, b'a'),
    (0x00E2, b'a'),
    (0x00E3, b'a'),
    (0x00E4, b'a'),
    (0x00E5, b'a'),
    (0x00E7, b'c'),
    (0x00E8, b'e'),
    (0x00E9, b'e'),
    (0x00EA, b'e'),
    (0x00EB, b'e'),
    (0x00EC, b'i'),
    (0x00ED, b'i'),
    (0x00EE, b'i'),
    (0x00EF, b'i'),
    (0x00F1, b'n'),
    (0x00F2, b'o'),
    (0x00F3, b'o'),
    (0x00F4, b'o'),
    (0x00F5, b'o'),
    (0x00F6, b'o'),
    (0x00F9, b'u'),
    (0x00FA, b'u'),
    (0x00FB, b'u'),
    (0x00FC, b'u'),
    (0x00FD, b'y'),
    (0x00FF, b'y'),
    // Latin Extended-A (U+0100..=U+017F).
    (0x0100, b'a'),
    (0x0101, b'a'),
    (0x0102, b'a'),
    (0x0103, b'a'),
    (0x0104, b'a'),
    (0x0105, b'a'),
    (0x0106, b'c'),
    (0x0107, b'c'),
    (0x0108, b'c'),
    (0x0109, b'c'),
    (0x010A, b'c'),
    (0x010B, b'c'),
    (0x010C, b'c'),
    (0x010D, b'c'),
    (0x010E, b'd'),
    (0x010F, b'd'),
    (0x0112, b'e'),
    (0x0113, b'e'),
    (0x0114, b'e'),
    (0x0115, b'e'),
    (0x0116, b'e'),
    (0x0117, b'e'),
    (0x0118, b'e'),
    (0x0119, b'e'),
    (0x011A, b'e'),
    (0x011B, b'e'),
    (0x011C, b'g'),
    (0x011D, b'g'),
    (0x011E, b'g'),
    (0x011F, b'g'),
    (0x0120, b'g'),
    (0x0121, b'g'),
    (0x0122, b'g'),
    (0x0123, b'g'),
    (0x0124, b'h'),
    (0x0125, b'h'),
    (0x0128, b'i'),
    (0x0129, b'i'),
    (0x012A, b'i'),
    (0x012B, b'i'),
    (0x012C, b'i'),
    (0x012D, b'i'),
    (0x012E, b'i'),
    (0x012F, b'i'),
    (0x0130, b'i'),
    (0x0134, b'j'),
    (0x0135, b'j'),
    (0x0136, b'k'),
    (0x0137, b'k'),
    (0x0139, b'l'),
    (0x013A, b'l'),
    (0x013B, b'l'),
    (0x013C, b'l'),
    (0x013D, b'l'),
    (0x013E, b'l'),
    (0x0143, b'n'),
    (0x0144, b'n'),
    (0x0145, b'n'),
    (0x0146, b'n'),
    (0x0147, b'n'),
    (0x0148, b'n'),
    (0x014C, b'o'),
    (0x014D, b'o'),
    (0x014E, b'o'),
    (0x014F, b'o'),
    (0x0150, b'o'),
    (0x0151, b'o'),
    (0x0154, b'r'),
    (0x0155, b'r'),
    (0x0156, b'r'),
    (0x0157, b'r'),
    (0x0158, b'r'),
    (0x0159, b'r'),
    (0x015A, b's'),
    (0x015B, b's'),
    (0x015C, b's'),
    (0x015D, b's'),
    (0x015E, b's'),
    (0x015F, b's'),
    (0x0160, b's'),
    (0x0161, b's'),
    (0x0162, b't'),
    (0x0163, b't'),
    (0x0164, b't'),
    (0x0165, b't'),
    (0x0168, b'u'),
    (0x0169, b'u'),
    (0x016A, b'u'),
    (0x016B, b'u'),
    (0x016C, b'u'),
    (0x016D, b'u'),
    (0x016E, b'u'),
    (0x016F, b'u'),
    (0x0170, b'u'),
    (0x0171, b'u'),
    (0x0172, b'u'),
    (0x0173, b'u'),
    (0x0174, b'w'),
    (0x0175, b'w'),
    (0x0176, b'y'),
    (0x0177, b'y'),
    (0x0178, b'y'),
    (0x0179, b'z'),
    (0x017A, b'z'),
    (0x017B, b'z'),
    (0x017C, b'z'),
    (0x017D, b'z'),
    (0x017E, b'z'),
    // Latin Extended-B (U+0180..=U+024F).
    (0x01A0, b'o'),
    (0x01A1, b'o'),
    (0x01AF, b'u'),
    (0x01B0, b'u'),
    (0x01CD, b'a'),
    (0x01CE, b'a'),
    (0x01CF, b'i'),
    (0x01D0, b'i'),
    (0x01D1, b'o'),
    (0x01D2, b'o'),
    (0x01D3, b'u'),
    (0x01D4, b'u'),
    (0x01D5, b'u'),
    (0x01D6, b'u'),
    (0x01D7, b'u'),
    (0x01D8, b'u'),
    (0x01D9, b'u'),
    (0x01DA, b'u'),
    (0x01DB, b'u'),
    (0x01DC, b'u'),
    (0x01DE, b'a'),
    (0x01DF, b'a'),
    (0x01E0, b'a'),
    (0x01E1, b'a'),
    (0x01E6, b'g'),
    (0x01E7, b'g'),
    (0x01E8, b'k'),
    (0x01E9, b'k'),
    (0x01EA, b'o'),
    (0x01EB, b'o'),
    (0x01EC, b'o'),
    (0x01ED, b'o'),
    (0x01F0, b'j'),
    (0x01F4, b'g'),
    (0x01F5, b'g'),
    (0x01F8, b'n'),
    (0x01F9, b'n'),
    (0x01FA, b'a'),
    (0x01FB, b'a'),
    (0x0200, b'a'),
    (0x0201, b'a'),
    (0x0202, b'a'),
    (0x0203, b'a'),
    (0x0204, b'e'),
    (0x0205, b'e'),
    (0x0206, b'e'),
    (0x0207, b'e'),
    (0x0208, b'i'),
    (0x0209, b'i'),
    (0x020A, b'i'),
    (0x020B, b'i'),
    (0x020C, b'o'),
    (0x020D, b'o'),
    (0x020E, b'o'),
    (0x020F, b'o'),
    (0x0210, b'r'),
    (0x0211, b'r'),
    (0x0212, b'r'),
    (0x0213, b'r'),
    (0x0214, b'u'),
    (0x0215, b'u'),
    (0x0216, b'u'),
    (0x0217, b'u'),
    (0x0218, b's'),
    (0x0219, b's'),
    (0x021A, b't'),
    (0x021B, b't'),
    (0x021E, b'h'),
    (0x021F, b'h'),
    (0x0226, b'a'),
    (0x0227, b'a'),
    (0x0228, b'e'),
    (0x0229, b'e'),
    (0x022A, b'o'),
    (0x022B, b'o'),
    (0x022C, b'o'),
    (0x022D, b'o'),
    (0x022E, b'o'),
    (0x022F, b'o'),
    (0x0230, b'o'),
    (0x0231, b'o'),
    (0x0232, b'y'),
    (0x0233, b'y'),
    // Latin Extended Additional (U+1E00..=U+1EFF).
    (0x1E00, b'a'),
    (0x1E01, b'a'),
    (0x1E02, b'b'),
    (0x1E03, b'b'),
    (0x1E04, b'b'),
    (0x1E05, b'b'),
    (0x1E06, b'b'),
    (0x1E07, b'b'),
    (0x1E08, b'c'),
    (0x1E09, b'c'),
    (0x1E0A, b'd'),
    (0x1E0B, b'd'),
    (0x1E0C, b'd'),
    (0x1E0D, b'd'),
    (0x1E0E, b'd'),
    (0x1E0F, b'd'),
    (0x1E10, b'd'),
    (0x1E11, b'd'),
    (0x1E12, b'd'),
    (0x1E13, b'd'),
    (0x1E14, b'e'),
    (0x1E15, b'e'),
    (0x1E16, b'e'),
    (0x1E17, b'e'),
    (0x1E18, b'e'),
    (0x1E19, b'e'),
    (0x1E1A, b'e'),
    (0x1E1B, b'e'),
    (0x1E1C, b'e'),
    (0x1E1D, b'e'),
    (0x1E1E, b'f'),
    (0x1E1F, b'f'),
    (0x1E20, b'g'),
    (0x1E21, b'g'),
    (0x1E22, b'h'),
    (0x1E23, b'h'),
    (0x1E24, b'h'),
    (0x1E25, b'h'),
    (0x1E26, b'h'),
    (0x1E27, b'h'),
    (0x1E28, b'h'),
    (0x1E29, b'h'),
    (0x1E2A, b'h'),
    (0x1E2B, b'h'),
    (0x1E2C, b'i'),
    (0x1E2D, b'i'),
    (0x1E2E, b'i'),
    (0x1E2F, b'i'),
    (0x1E30, b'k'),
    (0x1E31, b'k'),
    (0x1E32, b'k'),
    (0x1E33, b'k'),
    (0x1E34, b'k'),
    (0x1E35, b'k'),
    (0x1E36, b'l'),
    (0x1E37, b'l'),
    (0x1E38, b'l'),
    (0x1E39, b'l'),
    (0x1E3A, b'l'),
    (0x1E3B, b'l'),
    (0x1E3C, b'l'),
    (0x1E3D, b'l'),
    (0x1E3E, b'm'),
    (0x1E3F, b'm'),
    (0x1E40, b'm'),
    (0x1E41, b'm'),
    (0x1E42, b'm'),
    (0x1E43, b'm'),
    (0x1E44, b'n'),
    (0x1E45, b'n'),
    (0x1E46, b'n'),
    (0x1E47, b'n'),
    (0x1E48, b'n'),
    (0x1E49, b'n'),
    (0x1E4A, b'n'),
    (0x1E4B, b'n'),
    (0x1E4C, b'o'),
    (0x1E4D, b'o'),
    (0x1E4E, b'o'),
    (0x1E4F, b'o'),
    (0x1E50, b'o'),
    (0x1E51, b'o'),
    (0x1E52, b'o'),
    (0x1E53, b'o'),
    (0x1E54, b'p'),
    (0x1E55, b'p'),
    (0x1E56, b'p'),
    (0x1E57, b'p'),
    (0x1E58, b'r'),
    (0x1E59, b'r'),
    (0x1E5A, b'r'),
    (0x1E5B, b'r'),
    (0x1E5C, b'r'),
    (0x1E5D, b'r'),
    (0x1E5E, b'r'),
    (0x1E5F, b'r'),
    (0x1E60, b's'),
    (0x1E61, b's'),
    (0x1E62, b's'),
    (0x1E63, b's'),
    (0x1E64, b's'),
    (0x1E65, b's'),
    (0x1E66, b's'),
    (0x1E67, b's'),
    (0x1E68, b's'),
    (0x1E69, b's'),
    (0x1E6A, b't'),
    (0x1E6B, b't'),
    (0x1E6C, b't'),
    (0x1E6D, b't'),
    (0x1E6E, b't'),
    (0x1E6F, b't'),
    (0x1E70, b't'),
    (0x1E71, b't'),
    (0x1E72, b'u'),
    (0x1E73, b'u'),
    (0x1E74, b'u'),
    (0x1E75, b'u'),
    (0x1E76, b'u'),
    (0x1E77, b'u'),
    (0x1E78, b'u'),
    (0x1E79, b'u'),
    (0x1E7A, b'u'),
    (0x1E7B, b'u'),
    (0x1E7C, b'v'),
    (0x1E7D, b'v'),
    (0x1E7E, b'v'),
    (0x1E7F, b'v'),
    (0x1E80, b'w'),
    (0x1E81, b'w'),
    (0x1E82, b'w'),
    (0x1E83, b'w'),
    (0x1E84, b'w'),
    (0x1E85, b'w'),
    (0x1E86, b'w'),
    (0x1E87, b'w'),
    (0x1E88, b'w'),
    (0x1E89, b'w'),
    (0x1E8A, b'x'),
    (0x1E8B, b'x'),
    (0x1E8C, b'x'),
    (0x1E8D, b'x'),
    (0x1E8E, b'y'),
    (0x1E8F, b'y'),
    (0x1E90, b'z'),
    (0x1E91, b'z'),
    (0x1E92, b'z'),
    (0x1E93, b'z'),
    (0x1E94, b'z'),
    (0x1E95, b'z'),
    (0x1E96, b'h'),
    (0x1E97, b't'),
    (0x1E98, b'w'),
    (0x1E99, b'y'),
    (0x1EA0, b'a'),
    (0x1EA1, b'a'),
    (0x1EA2, b'a'),
    (0x1EA3, b'a'),
    (0x1EA4, b'a'),
    (0x1EA5, b'a'),
    (0x1EA6, b'a'),
    (0x1EA7, b'a'),
    (0x1EA8, b'a'),
    (0x1EA9, b'a'),
    (0x1EAA, b'a'),
    (0x1EAB, b'a'),
    (0x1EAC, b'a'),
    (0x1EAD, b'a'),
    (0x1EAE, b'a'),
    (0x1EAF, b'a'),
    (0x1EB0, b'a'),
    (0x1EB1, b'a'),
    (0x1EB2, b'a'),
    (0x1EB3, b'a'),
    (0x1EB4, b'a'),
    (0x1EB5, b'a'),
    (0x1EB6, b'a'),
    (0x1EB7, b'a'),
    (0x1EB8, b'e'),
    (0x1EB9, b'e'),
    (0x1EBA, b'e'),
    (0x1EBB, b'e'),
    (0x1EBC, b'e'),
    (0x1EBD, b'e'),
    (0x1EBE, b'e'),
    (0x1EBF, b'e'),
    (0x1EC0, b'e'),
    (0x1EC1, b'e'),
    (0x1EC2, b'e'),
    (0x1EC3, b'e'),
    (0x1EC4, b'e'),
    (0x1EC5, b'e'),
    (0x1EC6, b'e'),
    (0x1EC7, b'e'),
    (0x1EC8, b'i'),
    (0x1EC9, b'i'),
    (0x1ECA, b'i'),
    (0x1ECB, b'i'),
    (0x1ECC, b'o'),
    (0x1ECD, b'o'),
    (0x1ECE, b'o'),
    (0x1ECF, b'o'),
    (0x1ED0, b'o'),
    (0x1ED1, b'o'),
    (0x1ED2, b'o'),
    (0x1ED3, b'o'),
    (0x1ED4, b'o'),
    (0x1ED5, b'o'),
    (0x1ED6, b'o'),
    (0x1ED7, b'o'),
    (0x1ED8, b'o'),
    (0x1ED9, b'o'),
    (0x1EDA, b'o'),
    (0x1EDB, b'o'),
    (0x1EDC, b'o'),
    (0x1EDD, b'o'),
    (0x1EDE, b'o'),
    (0x1EDF, b'o'),
    (0x1EE0, b'o'),
    (0x1EE1, b'o'),
    (0x1EE2, b'o'),
    (0x1EE3, b'o'),
    (0x1EE4, b'u'),
    (0x1EE5, b'u'),
    (0x1EE6, b'u'),
    (0x1EE7, b'u'),
    (0x1EE8, b'u'),
    (0x1EE9, b'u'),
    (0x1EEA, b'u'),
    (0x1EEB, b'u'),
    (0x1EEC, b'u'),
    (0x1EED, b'u'),
    (0x1EEE, b'u'),
    (0x1EEF, b'u'),
    (0x1EF0, b'u'),
    (0x1EF1, b'u'),
    (0x1EF2, b'y'),
    (0x1EF3, b'y'),
    (0x1EF4, b'y'),
    (0x1EF5, b'y'),
    (0x1EF6, b'y'),
    (0x1EF7, b'y'),
    (0x1EF8, b'y'),
    (0x1EF9, b'y'),
    // Letterlike Symbols: KELVIN SIGN and ANGSTROM SIGN.
    (0x212A, b'k'),
    (0x212B, b'a'),
];
2964
2965// --- tag scanning -------------------------------------------------------------------------------
2966
2967/// Reads an opening tag at `chars[i]`, returning its lowercased name, the raw `<…>` text, whether it
2968/// is self-closing, and the index just past the `>`. Attribute values in quotes may contain `>`.
2969fn open_tag(chars: &[char], start: usize) -> Option<(String, String, bool, usize)> {
2970    let mut cursor = start + 1;
2971    let mut name = String::new();
2972    while let Some(ch) = at(chars, cursor) {
2973        if ch.is_ascii_alphanumeric() {
2974            name.push(ch.to_ascii_lowercase());
2975            cursor += 1;
2976        } else {
2977            break;
2978        }
2979    }
2980    if name.is_empty() {
2981        return None;
2982    }
2983    let mut quote: Option<char> = None;
2984    let len = chars.len();
2985    while cursor < len {
2986        let Some(ch) = at(chars, cursor) else { break };
2987        match quote {
2988            Some(open_quote) => {
2989                if ch == open_quote {
2990                    quote = None;
2991                }
2992                cursor += 1;
2993            }
2994            None => {
2995                if ch == '"' || ch == '\'' {
2996                    quote = Some(ch);
2997                    cursor += 1;
2998                } else if ch == '>' {
2999                    break;
3000                } else {
3001                    cursor += 1;
3002                }
3003            }
3004        }
3005    }
3006    if at(chars, cursor) != Some('>') {
3007        return None;
3008    }
3009    let self_closing = cursor > 0 && at(chars, cursor - 1) == Some('/');
3010    let raw = collect_range(chars, start, cursor + 1);
3011    Some((name, raw, self_closing, cursor + 1))
3012}
3013
3014/// Finds the matching `</name>` for an element whose content begins at `start`, counting nested
3015/// same-named tags. Returns the index where the closing tag begins and the index just past its `>`.
3016fn close_tag(chars: &[char], start: usize, name: &str) -> Option<(usize, usize)> {
3017    let mut depth = 0i32;
3018    let mut j = start;
3019    let n = chars.len();
3020    while j < n {
3021        if at(chars, j) == Some('<') {
3022            if at(chars, j + 1) == Some('/') {
3023                if tag_name_matches(chars, j + 2, name) {
3024                    if depth == 0 {
3025                        let gt = find_char(chars, j, '>')?;
3026                        return Some((j, gt + 1));
3027                    }
3028                    depth -= 1;
3029                }
3030            } else if tag_name_matches(chars, j + 1, name) {
3031                depth += 1;
3032            }
3033        }
3034        j += 1;
3035    }
3036    None
3037}
3038
3039/// The content of an element starting at `start` together with the index just past its closing tag;
3040/// an unterminated element runs to the end of input.
3041fn enclosed(chars: &[char], start: usize, name: &str) -> (String, usize) {
3042    match close_tag(chars, start, name) {
3043        Some((inner_end, after)) => (collect_range(chars, start, inner_end), after),
3044        None => (collect_range(chars, start, chars.len()), chars.len()),
3045    }
3046}
3047
3048fn tag_name_matches(chars: &[char], pos: usize, name: &str) -> bool {
3049    let mut count = 0;
3050    for (k, nc) in name.chars().enumerate() {
3051        match at(chars, pos + k) {
3052            Some(c) if c.eq_ignore_ascii_case(&nc) => count += 1,
3053            _ => return false,
3054        }
3055    }
3056    match at(chars, pos + count) {
3057        Some(c) => c.is_whitespace() || c == '>' || c == '/',
3058        None => false,
3059    }
3060}
3061
3062fn starts_block_tag(chars: &[char], pos: usize) -> bool {
3063    if at(chars, pos) != Some('<') {
3064        return false;
3065    }
3066    ["pre", "source", "syntaxhighlight", "blockquote", "ul", "ol"]
3067        .iter()
3068        .any(|name| tag_name_matches(chars, pos + 1, name))
3069}
3070
3071/// The count of `<ref>` tags opened but not yet closed within `chars[start..end]`. A self-closing
3072/// `<ref … />` opens nothing; verbatim regions are stepped over so a `<ref>` inside `<nowiki>` does
3073/// not count. Used to keep a paragraph open until a `<ref>` note's body is complete.
3074fn open_ref_depth(chars: &[char], start: usize, end: usize) -> i32 {
3075    let mut depth = 0i32;
3076    let mut i = start;
3077    while i < end {
3078        if at(chars, i) == Some('<') {
3079            if let Some(after) = verbatim_region_end(chars, i) {
3080                i = after;
3081                continue;
3082            }
3083            if at(chars, i + 1) == Some('/') {
3084                if tag_name_matches(chars, i + 2, "ref") {
3085                    depth = (depth - 1).max(0);
3086                }
3087            } else if tag_name_matches(chars, i + 1, "ref")
3088                && let Some((_, _, self_closing, after)) = open_tag(chars, i)
3089            {
3090                if !self_closing {
3091                    depth += 1;
3092                }
3093                i = after;
3094                continue;
3095            }
3096        }
3097        i += 1;
3098    }
3099    depth
3100}
3101
3102/// Whether the innermost `<ref>` still open at `end` has a body that begins on a fresh line — its
3103/// open tag is the last non-blank thing on its line. Such a note is read as block content, so its
3104/// body may hold lists and other block constructs; a note opened with text on the same line reads as
3105/// inline content and a following block-level line ends it instead of joining it.
3106fn open_ref_block_bodied(chars: &[char], start: usize, end: usize) -> bool {
3107    let mut stack: Vec<bool> = Vec::new();
3108    let mut i = start;
3109    while i < end {
3110        if at(chars, i) == Some('<') {
3111            if let Some(after) = verbatim_region_end(chars, i) {
3112                i = after;
3113                continue;
3114            }
3115            if at(chars, i + 1) == Some('/') {
3116                if tag_name_matches(chars, i + 2, "ref") {
3117                    stack.pop();
3118                }
3119            } else if tag_name_matches(chars, i + 1, "ref")
3120                && let Some((_, _, self_closing, after)) = open_tag(chars, i)
3121            {
3122                if !self_closing {
3123                    let mut j = after;
3124                    while matches!(at(chars, j), Some(' ' | '\t')) {
3125                        j += 1;
3126                    }
3127                    stack.push(matches!(at(chars, j), None | Some('\n')));
3128                }
3129                i = after;
3130                continue;
3131            }
3132        }
3133        i += 1;
3134    }
3135    stack.last().copied().unwrap_or(false)
3136}
3137
3138/// The role of a recognized HTML element, or `None` when the name is not a recognized HTML tag (in
3139/// which case the surrounding `<…>` stays literal text).
3140fn html_tag_role(name: &str) -> Option<HtmlTagRole> {
3141    const INLINE: &[&str] = &[
3142        "abbr", "b", "bdi", "bdo", "big", "cite", "data", "dfn", "em", "font", "i", "ins", "q",
3143        "rb", "rt", "rtc", "ruby", "s", "small", "span", "strong", "u", "wbr",
3144    ];
3145    const BLOCK: &[&str] = &[
3146        "caption",
3147        "center",
3148        "col",
3149        "colgroup",
3150        "dd",
3151        "div",
3152        "dl",
3153        "dt",
3154        "h1",
3155        "h2",
3156        "h3",
3157        "h4",
3158        "h5",
3159        "h6",
3160        "hr",
3161        "li",
3162        "ol",
3163        "references",
3164        "rp",
3165        "table",
3166        "td",
3167        "th",
3168        "time",
3169        "tr",
3170        "ul",
3171    ];
3172    const PARAGRAPH: &[&str] = &["gallery", "p"];
3173    if INLINE.contains(&name) {
3174        Some(HtmlTagRole::Inline)
3175    } else if BLOCK.contains(&name) {
3176        Some(HtmlTagRole::Block)
3177    } else if PARAGRAPH.contains(&name) {
3178        Some(HtmlTagRole::Break)
3179    } else {
3180        None
3181    }
3182}
3183
3184/// Reads a closing tag `</name…>` at `i`, returning its lowercased name, raw text, and the index
3185/// just past `>`.
3186fn close_tag_parse(chars: &[char], i: usize) -> Option<(String, String, usize)> {
3187    if at(chars, i) != Some('<') || at(chars, i + 1) != Some('/') {
3188        return None;
3189    }
3190    let mut cursor = i + 2;
3191    let mut name = String::new();
3192    while let Some(ch) = at(chars, cursor) {
3193        if ch.is_ascii_alphanumeric() {
3194            name.push(ch.to_ascii_lowercase());
3195            cursor += 1;
3196        } else {
3197            break;
3198        }
3199    }
3200    if name.is_empty() {
3201        return None;
3202    }
3203    let gt = find_char(chars, cursor, '>')?;
3204    Some((name, collect_range(chars, i, gt + 1), gt + 1))
3205}
3206
3207/// Finds where one `<li>` item's content ends, given the index just past its `<li>` open tag.
3208/// Returns the index where the content ends and the index to resume the enclosing list scan from.
3209/// The item ends at its own `</li>` (consumed), at a sibling `<li>` (left in place), or at the
3210/// enclosing list's `</ul>`/`</ol>` (left in place); nested `<ul>`/`<ol>` lists are stepped over so
3211/// their markers do not end the item.
3212fn html_li_content_bounds(chars: &[char], start: usize) -> (usize, usize) {
3213    let n = chars.len();
3214    let mut list_depth = 0i32;
3215    let mut j = start;
3216    while j < n {
3217        if at(chars, j) == Some('<') {
3218            if at(chars, j + 1) == Some('/') {
3219                if tag_name_matches(chars, j + 2, "ul") || tag_name_matches(chars, j + 2, "ol") {
3220                    if list_depth == 0 {
3221                        return (j, j);
3222                    }
3223                    list_depth -= 1;
3224                    if let Some((_, _, after)) = close_tag_parse(chars, j) {
3225                        j = after;
3226                        continue;
3227                    }
3228                } else if list_depth == 0
3229                    && tag_name_matches(chars, j + 2, "li")
3230                    && let Some((_, _, after)) = close_tag_parse(chars, j)
3231                {
3232                    return (j, after);
3233                }
3234            } else if tag_name_matches(chars, j + 1, "ul") || tag_name_matches(chars, j + 1, "ol") {
3235                if let Some((_, _, self_closing, after)) = open_tag(chars, j) {
3236                    if !self_closing {
3237                        list_depth += 1;
3238                    }
3239                    j = after;
3240                    continue;
3241                }
3242            } else if list_depth == 0 && tag_name_matches(chars, j + 1, "li") {
3243                return (j, j);
3244            }
3245        }
3246        j += 1;
3247    }
3248    (n, n)
3249}
3250
3251/// Reads a recognized block-level HTML tag (opening, closing, or self-closing) at `i`, returning the
3252/// token it contributes to the paragraph stream and the index just past it. Inline and unrecognized
3253/// tags yield `None`.
3254fn block_tag_token(chars: &[char], i: usize) -> Option<(Tok, usize)> {
3255    let (name, raw, after) = if at(chars, i + 1) == Some('/') {
3256        close_tag_parse(chars, i)?
3257    } else {
3258        let (name, raw, _self_closing, after) = open_tag(chars, i)?;
3259        (name, raw, after)
3260    };
3261    match html_tag_role(&name)? {
3262        HtmlTagRole::Block => Some((Tok::BlockRaw(raw), after)),
3263        HtmlTagRole::Break => Some((Tok::BlockBreak, after)),
3264        HtmlTagRole::Inline => None,
3265    }
3266}
3267
3268/// Whether the list line at `pos` may begin a list: its marker run must be a single repeated marker
3269/// character. A run that mixes marker characters (`*#`, `:;`, …) has no parent item to anchor its
3270/// deeper level, so it is not a list.
3271fn list_run_uniform(chars: &[char], pos: usize) -> bool {
3272    let first = at(chars, pos);
3273    let le = line_end(chars, pos);
3274    let mut p = pos;
3275    while p < le && at(chars, p).is_some_and(is_list_marker) {
3276        if at(chars, p) != first {
3277            return false;
3278        }
3279        p += 1;
3280    }
3281    true
3282}
3283
3284/// Resolves a buffered run of inline tokens into a paragraph (or figure) block, dropping it when it
3285/// holds only whitespace. Used between block-level tags while splitting a paragraph.
3286fn flush_para_segment(
3287    segment: &mut Vec<Tok>,
3288    blocks: &mut Vec<Block>,
3289    smart: bool,
3290    east_asian: bool,
3291) {
3292    if segment.is_empty() {
3293        return;
3294    }
3295    let toks = std::mem::take(segment);
3296    let mut inlines = coalesce(strip_outer_whitespace(resolve_emphasis(toks)));
3297    if east_asian {
3298        inlines = drop_east_asian_breaks(inlines);
3299    }
3300    if smart {
3301        inlines = apply_smart_quotes(inlines);
3302    }
3303    if !inlines.is_empty() {
3304        blocks.push(para_or_figure(inlines));
3305    }
3306}
3307
3308/// Reads the value of `key` from a raw tag string, accepting quoted or bare values.
3309fn tag_attribute(raw: &str, key: &str) -> Option<String> {
3310    let chars: Vec<char> = raw.chars().collect();
3311    let n = chars.len();
3312    let mut i = 0;
3313    while i < n {
3314        match at(&chars, i) {
3315            Some(c) if c.is_ascii_alphabetic() => {
3316                let start = i;
3317                while let Some(c) = at(&chars, i) {
3318                    if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
3319                        i += 1;
3320                    } else {
3321                        break;
3322                    }
3323                }
3324                let name = collect_range(&chars, start, i).to_lowercase();
3325                while at(&chars, i).is_some_and(char::is_whitespace) {
3326                    i += 1;
3327                }
3328                if at(&chars, i) == Some('=') {
3329                    i += 1;
3330                    while at(&chars, i).is_some_and(char::is_whitespace) {
3331                        i += 1;
3332                    }
3333                    let value = if let Some(q @ ('"' | '\'')) = at(&chars, i) {
3334                        i += 1;
3335                        let vs = i;
3336                        while at(&chars, i).is_some_and(|c| c != q) {
3337                            i += 1;
3338                        }
3339                        let v = collect_range(&chars, vs, i);
3340                        i += 1;
3341                        v
3342                    } else {
3343                        let vs = i;
3344                        while at(&chars, i)
3345                            .is_some_and(|c| !c.is_whitespace() && c != '>' && c != '/')
3346                        {
3347                            i += 1;
3348                        }
3349                        collect_range(&chars, vs, i)
3350                    };
3351                    if name == key {
3352                        return Some(value);
3353                    }
3354                }
3355            }
3356            _ => i += 1,
3357        }
3358    }
3359    None
3360}
3361
3362// --- line classification ------------------------------------------------------------------------
3363
/// Per-`chars`-slice memo for the heading lookahead, keyed by the char index a line starts at.
///
/// A heading's text runs until the next line that opens a block, and deciding whether a
/// `=`-prefixed line opens its own heading needs that same lookahead — so "where does this
/// heading's region end" and "is this line a heading" depend on each other. Each step only ever
/// looks at strictly later lines, so the computation terminates, but without a cache every line's
/// region would be recomputed once per enclosing region, which blows up exponentially on stacks
/// of consecutive `=`-prefixed lines. Remembering each answer by position means every line is
/// resolved only once.
#[derive(Default)]
struct HeaderScan {
    /// Line start → whether that line opens a heading.
    is_header: BTreeMap<usize, bool>,
    /// Line start → end index of the region a heading starting there may cover.
    region_end: BTreeMap<usize, usize>,
}
3376
3377fn line_starts_block_scan(chars: &[char], ls: usize, scan: &mut HeaderScan) -> bool {
3378    match at(chars, ls) {
3379        Some('*' | '#' | ':' | ';' | ' ') => true,
3380        Some('=') => is_header_line(chars, ls, scan),
3381        Some('-') => is_hr_line(chars, ls),
3382        Some('{') => matches!(at(chars, ls + 1), Some('{' | '|')),
3383        Some('<') => starts_block_tag(chars, ls),
3384        _ => false,
3385    }
3386}
3387
3388fn is_header_line_within(chars: &[char], pos: usize) -> bool {
3389    is_header_line(chars, pos, &mut HeaderScan::default())
3390}
3391
3392fn is_header_line(chars: &[char], pos: usize, scan: &mut HeaderScan) -> bool {
3393    if let Some(&cached) = scan.is_header.get(&pos) {
3394        return cached;
3395    }
3396    let le = line_end(chars, pos);
3397    let mut m = 0;
3398    while pos + m < le && at(chars, pos + m) == Some('=') {
3399        m += 1;
3400    }
3401    let result = if m == 0 || m > 6 {
3402        false
3403    } else {
3404        let region_end = header_region_end_scan(chars, pos, scan);
3405        header_closer(chars, pos + m, region_end, m).is_some()
3406    };
3407    scan.is_header.insert(pos, result);
3408    result
3409}
3410
3411/// The end index of the span a heading's text may cover: the heading continues across lines like a
3412/// paragraph until a blank line or a line that opens its own block, and the result is the line end
3413/// of the last line still part of that span.
3414fn header_region_end_scan(chars: &[char], pos: usize, scan: &mut HeaderScan) -> usize {
3415    if let Some(&cached) = scan.region_end.get(&pos) {
3416        return cached;
3417    }
3418    let n = chars.len();
3419    // The region end of a line depends only on lines that come after it, so resolve them
3420    // back-to-front. First gather the forward run of consecutive non-blank line starts beginning at
3421    // `pos` (the run stops at the first blank line or the end of input); then fill the memo from the
3422    // last line to the first. Resolving bottom-up keeps the mutual recursion between region-end and
3423    // header-ness at constant stack depth no matter how many `=`-prefixed lines are stacked — a
3424    // naive recursive walk would instead recurse once per line and overflow the stack on adversarial
3425    // input.
3426    let mut starts = Vec::new();
3427    let mut cur = pos;
3428    loop {
3429        starts.push(cur);
3430        let le = line_end(chars, cur);
3431        if le >= n {
3432            break;
3433        }
3434        let next = le + 1;
3435        if next >= n {
3436            break;
3437        }
3438        let next_end = line_end(chars, next);
3439        if is_blank(chars, next, next_end) {
3440            break;
3441        }
3442        cur = next;
3443    }
3444    for &s in starts.iter().rev() {
3445        if scan.region_end.contains_key(&s) {
3446            continue;
3447        }
3448        let le = line_end(chars, s);
3449        let region = if le >= n {
3450            le
3451        } else {
3452            let next = le + 1;
3453            if next >= n {
3454                le
3455            } else {
3456                let next_end = line_end(chars, next);
3457                // `next` is the following run element (already resolved) or a blank/EOF line, so
3458                // `line_starts_block_scan` and the recursive lookup below both hit the memo.
3459                if is_blank(chars, next, next_end) || line_starts_block_scan(chars, next, scan) {
3460                    le
3461                } else {
3462                    header_region_end_scan(chars, next, scan)
3463                }
3464            }
3465        };
3466        scan.region_end.insert(s, region);
3467    }
3468    scan.region_end
3469        .get(&pos)
3470        .copied()
3471        .unwrap_or_else(|| line_end(chars, pos))
3472}
3473
3474/// The index of the first bare `=` run after the heading text, when that run is at least `m` long;
3475/// otherwise no valid closer. Constructs (templates, links, tags) are skipped so an `=` inside them
3476/// is not mistaken for the closer.
3477fn header_closer(chars: &[char], content_start: usize, line_end: usize, m: usize) -> Option<usize> {
3478    let mut i = content_start;
3479    while i < line_end {
3480        if let Some(next) = skip_construct(chars, i)
3481            && next > i
3482        {
3483            i = next.min(line_end);
3484            continue;
3485        }
3486        if at(chars, i) == Some('=') {
3487            let mut j = i;
3488            while j < line_end && at(chars, j) == Some('=') {
3489                j += 1;
3490            }
3491            return if j - i >= m { Some(i) } else { None };
3492        }
3493        i += 1;
3494    }
3495    None
3496}
3497
3498fn is_hr_line(chars: &[char], pos: usize) -> bool {
3499    let le = line_end(chars, pos);
3500    let mut k = pos;
3501    while k < le && at(chars, k) == Some('-') {
3502        k += 1;
3503    }
3504    k - pos >= 4 && is_blank(chars, k, le)
3505}
3506
3507/// Splits a definition term at the first top-level `:`, skipping constructs so a `:` inside a link
3508/// or template is not treated as the separator.
3509fn split_term(content: &str) -> (String, Option<String>) {
3510    let chars: Vec<char> = content.chars().collect();
3511    let n = chars.len();
3512    let mut i = 0;
3513    while i < n {
3514        if let Some(next) = skip_construct(&chars, i)
3515            && next > i
3516        {
3517            i = next;
3518            continue;
3519        }
3520        // A bare URL is stepped over whole so the `:` in its scheme is not read as the separator.
3521        if let Some((_, next)) = bare_url(&chars, i)
3522            && next > i
3523        {
3524            i = next;
3525            continue;
3526        }
3527        if at(&chars, i) == Some(':') {
3528            let before = collect_range(&chars, 0, i).trim().to_string();
3529            let after = collect_range(&chars, i + 1, n).trim().to_string();
3530            return (before, Some(after));
3531        }
3532        i += 1;
3533    }
3534    (content.trim().to_string(), None)
3535}
3536
3537/// If an inline construct opens at `i`, the index just past it: `{{…}}`, `[[…]]`, `[…]`, or `<…>`.
3538fn skip_construct(chars: &[char], i: usize) -> Option<usize> {
3539    match at(chars, i) {
3540        Some('{') if at(chars, i + 1) == Some('{') => balanced_braces(chars, i),
3541        Some('[') if at(chars, i + 1) == Some('[') => {
3542            find_seq(chars, i + 2, &[']', ']']).map(|c| c + 2)
3543        }
3544        Some('[') => find_char(chars, i + 1, ']').map(|c| c + 1),
3545        Some('<') => find_char(chars, i, '>').map(|c| c + 1),
3546        _ => None,
3547    }
3548}
3549
3550/// The index just past the `}}` that balances the `{{` at `i`, accounting for nesting.
3551/// Whether the `{{` at `i` opens a template transclusion. A template name begins with a letter, a
3552/// digit, or a `:` (a leading-colon main-namespace reference); a `{{` followed by anything else —
3553/// whitespace, a parser-function `#`, a pipe, or `}}` — is literal braces, not a template.
3554fn template_opens(chars: &[char], i: usize) -> bool {
3555    matches!(at(chars, i + 2), Some(c) if c.is_alphanumeric() || c == ':')
3556}
3557
3558fn balanced_braces(chars: &[char], i: usize) -> Option<usize> {
3559    let mut depth = 0i32;
3560    let mut j = i;
3561    let n = chars.len();
3562    while j < n {
3563        if at(chars, j) == Some('{') && at(chars, j + 1) == Some('{') {
3564            depth += 1;
3565            j += 2;
3566        } else if at(chars, j) == Some('}') && at(chars, j + 1) == Some('}') {
3567            depth -= 1;
3568            j += 2;
3569            if depth == 0 {
3570                return Some(j);
3571            }
3572        } else {
3573            j += 1;
3574        }
3575    }
3576    None
3577}
3578
3579// --- small helpers ------------------------------------------------------------------------------
3580
/// Whether `c` is one of the list-marker characters `*`, `#`, `:` or `;`.
fn is_list_marker(c: char) -> bool {
    ['*', '#', ':', ';'].contains(&c)
}
3584
3585/// Flat fallback used when block nesting reaches [`MAX_BLOCK_DEPTH`]: the remaining text becomes a
3586/// single paragraph of its literal content, with no further block structure parsed, so deeply
3587/// stacked constructs cannot exhaust the stack during parsing or serialization.
3588fn degraded_blocks(chars: &[char]) -> Vec<Block> {
3589    let text = collect_range(chars, 0, chars.len());
3590    let trimmed = text.trim();
3591    if trimmed.is_empty() {
3592        Vec::new()
3593    } else {
3594        vec![Block::Para(vec![Inline::Str(trimmed.into())])]
3595    }
3596}
3597
3598/// Consumes the whitespace run beginning at `from`, returning a single break token (soft when the
3599/// run spans a newline, otherwise a space) and the index just past the run.
3600fn whitespace_token(chars: &[char], from: usize) -> (Inline, usize) {
3601    let mut i = from;
3602    let mut has_newline = false;
3603    while let Some(w) = at(chars, i) {
3604        if w.is_whitespace() {
3605            if w == '\n' {
3606                has_newline = true;
3607            }
3608            i += 1;
3609        } else {
3610            break;
3611        }
3612    }
3613    let token = if has_newline {
3614        Inline::SoftBreak
3615    } else {
3616        Inline::Space
3617    };
3618    (token, i)
3619}
3620
3621fn list_kind(marker: char) -> ListKind {
3622    match marker {
3623        '#' => ListKind::Ordered,
3624        ';' | ':' => ListKind::Definition,
3625        _ => ListKind::Bullet,
3626    }
3627}
3628
3629/// Parses `<code>`-family verbatim content into a [`Inline::Code`] node carrying `classes`, with
3630/// entity references decoded. An unterminated tag degrades to its literal opening as raw HTML.
3631fn verbatim_code(
3632    chars: &[char],
3633    name: &str,
3634    after_open: usize,
3635    raw_open: &str,
3636    self_closing: bool,
3637    classes: &[&str],
3638) -> (Vec<Inline>, usize) {
3639    if self_closing {
3640        return (vec![raw_html(raw_open.to_string())], after_open);
3641    }
3642    match close_tag(chars, after_open, name) {
3643        Some((inner_end, after)) => {
3644            let inner = collect_range(chars, after_open, inner_end);
3645            let attr = Attr {
3646                id: carta_ast::Text::default(),
3647                classes: classes.iter().map(|s| (*s).into()).collect(),
3648                attributes: Vec::new(),
3649            };
3650            (
3651                vec![Inline::Code(Box::new(attr), decode_entities(&inner).into())],
3652                after,
3653            )
3654        }
3655        None => (vec![raw_html(raw_open.to_string())], after_open),
3656    }
3657}
3658
3659fn default_list_attrs() -> ListAttributes {
3660    ListAttributes {
3661        start: 1,
3662        style: ListNumberStyle::DefaultStyle,
3663        delim: ListNumberDelim::DefaultDelim,
3664    }
3665}
3666
3667fn finish_inline_block(chars: &[char], pos: usize) -> (usize, bool) {
3668    let le = line_end(chars, pos);
3669    if is_blank(chars, pos, le) {
3670        let next = if le < chars.len() { le + 1 } else { le };
3671        (next, true)
3672    } else {
3673        (pos, false)
3674    }
3675}
3676
/// Strips at most one line break (`\r\n` or `\n`) from the start and at most one from the end of
/// `inner`, leaving all other content — including further blank lines — untouched.
fn trim_code(inner: &str) -> String {
    // `\r\n` is tried first so a Windows line break is removed as a unit.
    const LINE_BREAKS: [&str; 2] = ["\r\n", "\n"];
    let without_head = LINE_BREAKS
        .iter()
        .find_map(|&nl| inner.strip_prefix(nl))
        .unwrap_or(inner);
    let without_tail = LINE_BREAKS
        .iter()
        .find_map(|&nl| without_head.strip_suffix(nl))
        .unwrap_or(without_head);
    without_tail.to_owned()
}
3688
3689fn flush_word(word: &mut String, toks: &mut Vec<Tok>) {
3690    if !word.is_empty() {
3691        toks.push(Tok::Inline(Inline::Str(std::mem::take(word).into())));
3692    }
3693}
3694
3695fn raw_html(text: String) -> Inline {
3696    Inline::RawInline(Format("html".into()), text.into())
3697}
3698
3699fn format_mediawiki() -> Format {
3700    Format("mediawiki".into())
3701}
3702
3703fn format_html() -> Format {
3704    Format("html".into())
3705}
3706
/// The character at index `i`, or `None` when `i` is out of bounds. Never panics.
fn at(chars: &[char], i: usize) -> Option<char> {
    let found = chars.get(i)?;
    Some(*found)
}
3710
/// The characters in `start..end` gathered into a string.
///
/// The range is clamped to the slice: an `end` past the slice stops at its last character, and
/// an empty, inverted or fully out-of-bounds range yields the empty string. Never panics.
fn collect_range(chars: &[char], start: usize, end: usize) -> String {
    let upper = end.min(chars.len());
    chars
        .get(start..upper)
        .map(|span| span.iter().collect::<String>())
        .unwrap_or_default()
}
3717
3718/// Finds the index one past the end of a table block opening with `{|` at `pos`. Opening (`{|`) and
3719/// closing (`|}`) markers are matched by depth, scanning whole lines, so a nested table does not
3720/// close the outer one early; an unterminated table runs to the end of input.
3721fn table_block_end(chars: &[char], pos: usize) -> usize {
3722    let n = chars.len();
3723    let mut depth = 0usize;
3724    let mut line = pos;
3725    loop {
3726        let mut content = line;
3727        while matches!(at(chars, content), Some(' ' | '\t')) {
3728            content += 1;
3729        }
3730        if at(chars, content) == Some('{') && at(chars, content + 1) == Some('|') {
3731            depth += 1;
3732        } else if at(chars, content) == Some('|') && at(chars, content + 1) == Some('}') {
3733            depth = depth.saturating_sub(1);
3734            if depth == 0 {
3735                return content + 2;
3736            }
3737        }
3738        let le = line_end(chars, line);
3739        if le >= n {
3740            return n;
3741        }
3742        line = le + 1;
3743    }
3744}
3745
3746/// The number of grid columns a cell spans, never less than one.
3747/// Scans the body of a `{|…|}` region into its rows of raw cells and an optional caption.
3748/// Each `|-` closes the current row; nested tables are passed through verbatim as cell content.
3749fn scan_table_region(region: &str) -> (Vec<Vec<RawCell>>, Option<String>) {
3750    let mut caption_text: Option<String> = None;
3751    let mut rows: Vec<Vec<RawCell>> = Vec::new();
3752    let mut cur: Vec<RawCell> = Vec::new();
3753    let mut open = OpenTarget::None;
3754    let mut nest = 0i32;
3755
3756    let mut lines = region.lines();
3757    lines.next(); // The opening `{|` line; any table attribute list it carries is dropped.
3758    for line in lines {
3759        let trimmed = line.trim_start();
3760        if nest > 0 {
3761            if trimmed.starts_with("{|") {
3762                nest += 1;
3763            } else if trimmed.starts_with("|}") {
3764                nest -= 1;
3765            }
3766            append_continuation(open, &mut cur, &mut caption_text, line);
3767            continue;
3768        }
3769        if trimmed.starts_with("|}") {
3770            break;
3771        }
3772        if trimmed.starts_with("{|") {
3773            nest += 1;
3774            append_continuation(open, &mut cur, &mut caption_text, line);
3775            continue;
3776        }
3777        if let Some(rest) = trimmed.strip_prefix("|+") {
3778            caption_text = Some(rest.to_string());
3779            open = OpenTarget::Caption;
3780            continue;
3781        }
3782        if trimmed.starts_with("|-") {
3783            rows.push(std::mem::take(&mut cur));
3784            open = OpenTarget::None;
3785            continue;
3786        }
3787        if let Some(rest) = trimmed.strip_prefix('|') {
3788            cur.extend(parse_cell_line(false, rest));
3789            open = OpenTarget::Cell;
3790            continue;
3791        }
3792        if let Some(rest) = trimmed.strip_prefix('!') {
3793            cur.extend(parse_cell_line(true, rest));
3794            open = OpenTarget::Cell;
3795            continue;
3796        }
3797        append_continuation(open, &mut cur, &mut caption_text, line);
3798    }
3799    rows.push(cur);
3800    (rows, caption_text)
3801}
3802
3803/// Builds the column specifications from the first row, taking each column's alignment from the
3804/// cell that opens it and defaulting every column's width.
3805fn column_specs(rows: &[Vec<RawCell>], ncols: usize) -> Vec<ColSpec> {
3806    let mut aligns: Vec<Alignment> = Vec::new();
3807    if let Some(first) = rows.first() {
3808        for cell in first {
3809            for _ in 0..col_count(cell.col_span) {
3810                aligns.push(cell.align.clone());
3811            }
3812        }
3813    }
3814    aligns.resize(ncols, Alignment::AlignDefault);
3815    aligns
3816        .into_iter()
3817        .map(|align| ColSpec {
3818            align,
3819            width: ColWidth::ColWidthDefault,
3820        })
3821        .collect()
3822}
3823
/// The number of grid columns a cell with the given `col_span` covers, never less than one:
/// zero and negative spans count as a single column.
fn col_count(col_span: i32) -> usize {
    usize::try_from(col_span).map_or(1, |span| span.max(1))
}
3827
3828/// A blank single-column cell used to fill a row that covers fewer columns than the table is wide.
3829fn empty_cell() -> Cell {
3830    Cell {
3831        attr: Attr::default(),
3832        align: Alignment::AlignDefault,
3833        row_span: 1,
3834        col_span: 1,
3835        content: Vec::new(),
3836    }
3837}
3838
3839/// Appends a table continuation line to whichever construct is currently open.
3840fn append_continuation(
3841    open: OpenTarget,
3842    cur: &mut [RawCell],
3843    caption: &mut Option<String>,
3844    line: &str,
3845) {
3846    match open {
3847        OpenTarget::Cell => {
3848            if let Some(cell) = cur.last_mut() {
3849                cell.content.push('\n');
3850                cell.content.push_str(line);
3851            }
3852        }
3853        OpenTarget::Caption => {
3854            if let Some(text) = caption {
3855                text.push('\n');
3856                text.push_str(line);
3857            }
3858        }
3859        OpenTarget::None => {}
3860    }
3861}
3862
3863/// Splits one cell-marker line into its cells. A `|` data line separates cells with `||`; a `!`
3864/// header line additionally separates them with `!!`.
3865fn parse_cell_line(is_header: bool, rest: &str) -> Vec<RawCell> {
3866    split_cells(rest, is_header)
3867        .iter()
3868        .map(|chunk| parse_cell_chunk(is_header, chunk))
3869        .collect()
3870}
3871
3872/// Splits a marker line's text into per-cell chunks at top-level `||` (and, for a header line, `!!`)
3873/// separators, leaving separators inside `[…]` or `{…}` groups untouched.
3874fn split_cells(s: &str, header: bool) -> Vec<String> {
3875    let chars: Vec<char> = s.chars().collect();
3876    let n = chars.len();
3877    let mut out: Vec<String> = Vec::new();
3878    let mut start = 0usize;
3879    let mut square = 0i32;
3880    let mut curly = 0i32;
3881    let mut i = 0usize;
3882    while i < n {
3883        match at(&chars, i) {
3884            Some('[') => square += 1,
3885            Some(']') => square = (square - 1).max(0),
3886            Some('{') => curly += 1,
3887            Some('}') => curly = (curly - 1).max(0),
3888            _ => {}
3889        }
3890        if square == 0 && curly == 0 {
3891            let pipe = at(&chars, i) == Some('|') && at(&chars, i + 1) == Some('|');
3892            let bang = header && at(&chars, i) == Some('!') && at(&chars, i + 1) == Some('!');
3893            if pipe || bang {
3894                out.push(collect_range(&chars, start, i));
3895                i += 2;
3896                start = i;
3897                continue;
3898            }
3899        }
3900        i += 1;
3901    }
3902    out.push(collect_range(&chars, start, n));
3903    out
3904}
3905
3906/// Parses one cell chunk into a [`RawCell`], splitting a leading attribute list from the content at
3907/// the first top-level `|` when the text before it is a valid attribute list.
3908fn parse_cell_chunk(is_header: bool, chunk: &str) -> RawCell {
3909    if let Some(idx) = find_attr_pipe(chunk)
3910        && let Some(attrs) = parse_cell_attrs(chunk.get(..idx).unwrap_or(""))
3911    {
3912        return RawCell {
3913            is_header,
3914            align: attrs.align,
3915            col_span: attrs.col_span,
3916            row_span: attrs.row_span,
3917            attr: attrs.attr,
3918            content: chunk.get(idx + 1..).unwrap_or("").to_string(),
3919        };
3920    }
3921    RawCell {
3922        is_header,
3923        align: Alignment::AlignDefault,
3924        col_span: 1,
3925        row_span: 1,
3926        attr: Attr::default(),
3927        content: chunk.to_string(),
3928    }
3929}
3930
/// Finds the byte offset of the first top-level `|` in a cell chunk — the boundary between a
/// leading attribute list and the cell content.
///
/// A `|` is skipped while inside a `"…"` quoted value or while any `[` or `{` is still unclosed.
/// Stray `]`/`}` never push the nesting below zero, and an unterminated quote hides everything
/// after it. `None` means the chunk has no such boundary.
fn find_attr_pipe(s: &str) -> Option<usize> {
    let mut quoted = false;
    let mut square_depth = 0i32;
    let mut curly_depth = 0i32;
    s.char_indices().find_map(|(offset, ch)| {
        if quoted {
            // Only the closing quote ends the quoted stretch; nothing inside it is examined.
            quoted = ch != '"';
            return None;
        }
        match ch {
            '|' if square_depth == 0 && curly_depth == 0 => return Some(offset),
            '"' => quoted = true,
            '[' => square_depth += 1,
            ']' => square_depth = (square_depth - 1).max(0),
            '{' => curly_depth += 1,
            '}' => curly_depth = (curly_depth - 1).max(0),
            _ => {}
        }
        None
    })
}
3956
3957/// Parses a cell's leading attribute list. `align` maps to a column alignment, `colspan`/`rowspan`
3958/// to spans, `id`/`class` to the cell's identifier and classes, and everything else to a key/value
3959/// attribute. A bare token without a value is not a valid attribute list, so the whole text is
3960/// content instead — signalled by [`None`].
3961fn parse_cell_attrs(s: &str) -> Option<CellAttrs> {
3962    let chars: Vec<char> = s.chars().collect();
3963    let n = chars.len();
3964    let mut i = 0usize;
3965    let mut id = String::new();
3966    let mut classes: Vec<String> = Vec::new();
3967    let mut attributes: Vec<(String, String)> = Vec::new();
3968    let mut align = Alignment::AlignDefault;
3969    let mut col_span = 1i32;
3970    let mut row_span = 1i32;
3971    let mut any = false;
3972    while i < n {
3973        while at(&chars, i).is_some_and(char::is_whitespace) {
3974            i += 1;
3975        }
3976        if i >= n {
3977            break;
3978        }
3979        let name_start = i;
3980        while at(&chars, i).is_some_and(|c| !c.is_whitespace() && c != '=') {
3981            i += 1;
3982        }
3983        let name = collect_range(&chars, name_start, i);
3984        if name.is_empty() || at(&chars, i) != Some('=') {
3985            return None;
3986        }
3987        i += 1;
3988        let value = if at(&chars, i) == Some('"') {
3989            i += 1;
3990            let value_start = i;
3991            while at(&chars, i).is_some_and(|c| c != '"') {
3992                i += 1;
3993            }
3994            let value = collect_range(&chars, value_start, i);
3995            if at(&chars, i) == Some('"') {
3996                i += 1;
3997            }
3998            value
3999        } else {
4000            let value_start = i;
4001            while at(&chars, i).is_some_and(|c| !c.is_whitespace()) {
4002                i += 1;
4003            }
4004            collect_range(&chars, value_start, i)
4005        };
4006        any = true;
4007        match name.to_ascii_lowercase().as_str() {
4008            "id" => id = value,
4009            "class" => classes.extend(value.split_whitespace().map(str::to_string)),
4010            "align" => match value.to_ascii_lowercase().as_str() {
4011                "left" => align = Alignment::AlignLeft,
4012                "right" => align = Alignment::AlignRight,
4013                "center" => align = Alignment::AlignCenter,
4014                _ => attributes.push(("align".to_string(), value)),
4015            },
4016            "colspan" => match value.trim().parse::<i32>() {
4017                Ok(v) if v >= 1 => col_span = v,
4018                _ => attributes.push(("colspan".to_string(), value)),
4019            },
4020            "rowspan" => match value.trim().parse::<i32>() {
4021                Ok(v) if v >= 1 => row_span = v,
4022                _ => attributes.push(("rowspan".to_string(), value)),
4023            },
4024            _ => attributes.push((name, value)),
4025        }
4026    }
4027    if !any {
4028        return None;
4029    }
4030    Some(CellAttrs {
4031        align,
4032        col_span,
4033        row_span,
4034        attr: Attr {
4035            id: id.into(),
4036            classes: classes.into_iter().map(Into::into).collect(),
4037            attributes: attributes
4038                .into_iter()
4039                .map(|(k, v)| (k.into(), v.into()))
4040                .collect(),
4041        },
4042    })
4043}
4044
4045fn line_end(chars: &[char], pos: usize) -> usize {
4046    find_char(chars, pos, '\n').unwrap_or(chars.len())
4047}
4048
4049fn is_blank(chars: &[char], start: usize, end: usize) -> bool {
4050    (start..end).all(|j| at(chars, j).is_none_or(char::is_whitespace))
4051}
4052
/// The index of the first occurrence of `target` at or after `from`, or `None` when there is
/// none (including when `from` lies past the end of `chars`).
fn find_char(chars: &[char], from: usize, target: char) -> Option<usize> {
    let tail = chars.get(from..)?;
    tail.iter()
        .position(|&c| c == target)
        .map(|offset| from + offset)
}
4056
/// The index at which `seq` first occurs in `chars` at or after `from`.
///
/// `None` when `seq` is empty, when it does not occur, or when `from` lies past the end of
/// `chars`.
fn find_seq(chars: &[char], from: usize, seq: &[char]) -> Option<usize> {
    if seq.is_empty() {
        return None;
    }
    let tail = chars.get(from..)?;
    tail.windows(seq.len())
        .position(|window| window == seq)
        .map(|offset| from + offset)
}
4065
4066/// Scans an internal link's target from `start`: it ends at the first `|` or the first `]]`,
4067/// whichever comes first, with no nesting tracked. Returns the end index and whether a `|` (rather
4068/// than `]]`) was the delimiter, or `None` if neither appears.
4069fn scan_link_target(chars: &[char], start: usize) -> Option<(usize, bool)> {
4070    let mut i = start;
4071    while let Some(c) = at(chars, i) {
4072        if c == '|' {
4073            return Some((i, true));
4074        }
4075        if c == ']' && at(chars, i + 1) == Some(']') {
4076            return Some((i, false));
4077        }
4078        i += 1;
4079    }
4080    None
4081}
4082
4083/// Finds the `]]` that closes an internal link whose label may hold nested `[[ … ]]` links, stepping
4084/// over each balanced inner pair so only the outer close is returned.
4085fn find_link_close(chars: &[char], start: usize) -> Option<usize> {
4086    let mut depth = 0usize;
4087    let mut i = start;
4088    while let Some(c) = at(chars, i) {
4089        if c == '[' && at(chars, i + 1) == Some('[') {
4090            depth += 1;
4091            i += 2;
4092        } else if c == ']' && at(chars, i + 1) == Some(']') {
4093            if depth == 0 {
4094                return Some(i);
4095            }
4096            depth -= 1;
4097            i += 2;
4098        } else {
4099            i += 1;
4100        }
4101    }
4102    None
4103}
4104
/// Whether the characters starting at index `i` spell `prefix`, ignoring ASCII case.
///
/// Running out of input before the prefix is complete is a mismatch; an empty prefix always
/// matches.
fn matches_prefix_ci(chars: &[char], i: usize, prefix: &str) -> bool {
    let mut rest = chars.iter().skip(i);
    prefix.chars().all(|expected| {
        rest.next()
            .is_some_and(|actual| actual.eq_ignore_ascii_case(&expected))
    })
}
4114
// Unit tests for the wikitext reader. Most cases feed a short snippet through `parse` (or
// `parse_gfm`) and compare the resulting blocks against a hand-built expected tree; the
// `adversarially_*` cases at the end only check that a read completes without error.
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads `input` with only `Extension::AutoIdentifiers` enabled and returns the document's
    /// blocks, panicking if the read reports an error.
    fn parse(input: &str) -> Vec<Block> {
        let mut options = ReaderOptions::default();
        options.extensions = Extensions::from_list(&[Extension::AutoIdentifiers]);
        MediawikiReader
            .read(input, &options)
            .expect("read should not fail")
            .blocks
    }

    /// Same as `parse`, but with only `Extension::GfmAutoIdentifiers` enabled.
    fn parse_gfm(input: &str) -> Vec<Block> {
        let mut options = ReaderOptions::default();
        options.extensions = Extensions::from_list(&[Extension::GfmAutoIdentifiers]);
        MediawikiReader.read(input, &options).expect("read").blocks
    }

    // `is_scheme` is expected to accept `doi` and `javascript`, and the upper-case `DOI` too.
    #[test]
    fn doi_and_javascript_are_recognized_schemes() {
        assert!(is_scheme("doi"));
        assert!(is_scheme("javascript"));
        assert!(is_scheme("DOI"));
        assert!(is_scheme("http"));
        assert!(!is_scheme("notascheme"));
    }

    /// Builds an expected table cell around `content`: default attributes, default alignment, and
    /// a span of one row by one column.
    fn cell_with(content: Vec<Block>) -> Cell {
        Cell {
            attr: Attr::default(),
            align: Alignment::AlignDefault,
            row_span: 1,
            col_span: 1,
            content,
        }
    }

    /// Builds an expected cell holding one paragraph made of a single `Str` with `text`.
    fn data_cell(text: &str) -> Cell {
        cell_with(vec![Block::Para(vec![Inline::Str(text.into())])])
    }

    /// Builds an expected row with default attributes around `cells`.
    fn table_row(cells: Vec<Cell>) -> Row {
        Row {
            attr: Attr::default(),
            cells,
        }
    }

    /// Builds an expected column spec with default alignment and default width.
    fn default_col() -> ColSpec {
        ColSpec {
            align: Alignment::AlignDefault,
            width: ColWidth::ColWidthDefault,
        }
    }

    // The `!` line is expected in the table head and the `|` line after `|-` in the body; the text
    // following `|}` comes back as a separate paragraph.
    #[test]
    fn table_markup_becomes_a_table() {
        assert_eq!(
            parse("{|\n! Header\n|-\n| Cell\n|}\nafter"),
            vec![
                Block::Table(Box::new(Table {
                    col_specs: vec![default_col()],
                    head: TableHead {
                        rows: vec![table_row(vec![data_cell("Header")])],
                        ..Default::default()
                    },
                    bodies: vec![TableBody {
                        body: vec![table_row(vec![data_cell("Cell")])],
                        ..Default::default()
                    }],
                    ..Default::default()
                })),
                Block::Para(vec![Inline::Str("after".into())]),
            ]
        );
    }

    // A bare `{|` with nothing after it is expected to produce a table whose single body holds one
    // row with no cells.
    #[test]
    fn unterminated_table_markup_does_not_panic() {
        assert_eq!(
            parse("{|"),
            vec![Block::Table(Box::new(Table {
                bodies: vec![TableBody {
                    body: vec![table_row(Vec::new())],
                    ..Default::default()
                }],
                ..Default::default()
            }))]
        );
    }

    // The inner table is expected as the only block inside the outer table's single cell, so the
    // first `|}` closes the inner table and the second closes the outer one.
    #[test]
    fn nested_table_markup_closes_at_the_outer_marker() {
        let inner = Block::Table(Box::new(Table {
            col_specs: vec![default_col()],
            bodies: vec![TableBody {
                body: vec![table_row(vec![data_cell("inner")])],
                ..Default::default()
            }],
            ..Default::default()
        }));
        assert_eq!(
            parse("{|\n|\n{|\n| inner\n|}\n|}"),
            vec![Block::Table(Box::new(Table {
                col_specs: vec![default_col()],
                bodies: vec![TableBody {
                    body: vec![table_row(vec![cell_with(vec![inner])])],
                    ..Default::default()
                }],
                ..Default::default()
            }))]
        );
    }

    // Adjacent lines stay in one paragraph: the space becomes `Space`, the newline `SoftBreak`.
    #[test]
    fn paragraph_joins_lines_with_soft_breaks() {
        assert_eq!(
            parse("one two\nthree"),
            vec![Block::Para(vec![
                Inline::Str("one".into()),
                Inline::Space,
                Inline::Str("two".into()),
                Inline::SoftBreak,
                Inline::Str("three".into()),
            ])]
        );
    }

    // Two apostrophes give `Emph`, three give `Strong`, and five give `Strong` wrapping `Emph`.
    #[test]
    fn emphasis_runs_decompose() {
        assert_eq!(
            parse("''i'' '''b''' '''''both'''''"),
            vec![Block::Para(vec![
                Inline::Emph(vec![Inline::Str("i".into())]),
                Inline::Space,
                Inline::Strong(vec![Inline::Str("b".into())]),
                Inline::Space,
                Inline::Strong(vec![Inline::Emph(vec![Inline::Str("both".into())])]),
            ])]
        );
    }

    // With `AutoIdentifiers`, the heading text is lowercased and its space becomes `_` in the id.
    #[test]
    fn header_carries_mediawiki_identifier() {
        assert_eq!(
            parse("== Hello World =="),
            vec![Block::Header(
                2,
                Box::new(Attr {
                    id: "hello_world".into(),
                    classes: vec![],
                    attributes: vec![],
                }),
                vec![
                    Inline::Str("Hello".into()),
                    Inline::Space,
                    Inline::Str("World".into()),
                ],
            )]
        );
    }

    // The second heading with the same text is expected to get the `_1` suffix.
    #[test]
    fn duplicate_identifiers_are_suffixed() {
        let blocks = parse("== Dup ==\n== Dup ==");
        // Collect only the header identifiers, in document order.
        let ids: Vec<String> = blocks
            .iter()
            .filter_map(|b| match b {
                Block::Header(_, attr, _) => Some(attr.id.to_string()),
                _ => None,
            })
            .collect();
        assert_eq!(ids, vec!["dup".to_string(), "dup_1".to_string()]);
    }

    // With `GfmAutoIdentifiers`, the same heading text is joined with a hyphen instead.
    #[test]
    fn gfm_identifier_scheme_uses_hyphens() {
        let blocks = parse_gfm("== Hello World ==");
        match blocks.first() {
            Some(Block::Header(_, attr, _)) => assert_eq!(attr.id, "hello-world"),
            other => panic!("expected header, got {other:?}"),
        }
    }

    // Headings made only of punctuation are expected to fall back to `section`, and the fallback
    // itself is deduplicated with the usual numeric suffix.
    #[test]
    fn empty_identifier_falls_back_to_section() {
        let blocks = parse("== !!! ==\n== ??? ==");
        // Collect only the header identifiers, in document order.
        let ids: Vec<String> = blocks
            .iter()
            .filter_map(|b| match b {
                Block::Header(_, attr, _) => Some(attr.id.to_string()),
                _ => None,
            })
            .collect();
        assert_eq!(ids, vec!["section".to_string(), "section_1".to_string()]);
    }

    // `== a=b ==` is expected to come back as ordinary paragraph text, `=` runs included.
    #[test]
    fn malformed_header_is_a_paragraph() {
        assert_eq!(
            parse("== a=b =="),
            vec![Block::Para(vec![
                Inline::Str("==".into()),
                Inline::Space,
                Inline::Str("a=b".into()),
                Inline::Space,
                Inline::Str("==".into()),
            ])]
        );
    }

    // A closing run one `=` longer than the opening one still yields a level-2 header; the extra
    // `=` is expected as its own paragraph right after it.
    #[test]
    fn header_leftover_becomes_paragraph() {
        assert_eq!(
            parse("== H ==="),
            vec![
                Block::Header(
                    2,
                    Box::new(Attr {
                        id: "h".into(),
                        classes: vec![],
                        attributes: vec![],
                    }),
                    vec![Inline::Str("H".into())],
                ),
                Block::Para(vec![Inline::Str("=".into())]),
            ]
        );
    }

    // The `**` and `*#` lines are expected as a bullet list and an ordered list nested inside the
    // single outer item opened by `* a`.
    #[test]
    fn nested_bullets_and_ordered() {
        assert_eq!(
            parse("* a\n** b\n*# c"),
            vec![Block::BulletList(vec![vec![
                Block::Plain(vec![Inline::Str("a".into())]),
                Block::BulletList(vec![vec![Block::Plain(vec![Inline::Str("b".into())])]]),
                Block::OrderedList(
                    default_list_attrs(),
                    vec![vec![Block::Plain(vec![Inline::Str("c".into())])]]
                ),
            ]])]
        );
    }

    // A `;` line containing ` : ` is expected to give both the term and its definition.
    #[test]
    fn definition_list_splits_inline_definition() {
        assert_eq!(
            parse("; term : def"),
            vec![Block::DefinitionList(vec![(
                vec![Inline::Str("term".into())],
                vec![vec![Block::Plain(vec![Inline::Str("def".into())])]],
            )])]
        );
    }

    // The `s` right after `]]` is expected inside the link label (`Pages`), while the target url
    // and title stay `Page`.
    #[test]
    fn internal_link_with_trail() {
        assert_eq!(
            parse("[[Page]]s"),
            vec![Block::Para(vec![Inline::Link(
                Box::new(Attr {
                    id: carta_ast::Text::default(),
                    classes: vec!["wikilink".into()],
                    attributes: vec![],
                }),
                vec![Inline::Str("Pages".into())],
                Box::new(Target {
                    url: "Page".into(),
                    title: "Page".into(),
                }),
            )])]
        );
    }

    // A `File:` embed alone on its line is expected as a `Figure`: the last parameter is both the
    // caption and the image title, and `thumb` leaves no trace in the expected output.
    #[test]
    fn lone_file_embed_becomes_a_figure() {
        assert_eq!(
            parse("[[File:Foo.jpg|thumb|A caption]]"),
            vec![Block::Figure(
                Box::default(),
                Box::new(Caption {
                    short: None,
                    long: vec![Block::Plain(vec![
                        Inline::Str("A".into()),
                        Inline::Space,
                        Inline::Str("caption".into()),
                    ])],
                }),
                vec![Block::Plain(vec![Inline::Image(
                    Box::default(),
                    vec![],
                    Box::new(Target {
                        url: "Foo.jpg".into(),
                        title: "A caption".into(),
                    }),
                )])],
            )]
        );
    }

    // With no caption parameter, the file name (its space shown as `_`) is expected as the url,
    // the title, and the caption text. The `Image:` prefix is handled like `File:` here.
    #[test]
    fn embed_without_caption_defaults_to_the_file_name() {
        assert_eq!(
            parse("[[Image:My Photo.jpg]]"),
            vec![Block::Figure(
                Box::default(),
                Box::new(Caption {
                    short: None,
                    long: vec![Block::Plain(vec![Inline::Str("My_Photo.jpg".into())])],
                }),
                vec![Block::Plain(vec![Inline::Image(
                    Box::default(),
                    vec![],
                    Box::new(Target {
                        url: "My_Photo.jpg".into(),
                        title: "My_Photo.jpg".into(),
                    }),
                )])],
            )]
        );
    }

    // `100x200px` is expected as separate `width` and `height` attributes without the `px` unit.
    #[test]
    fn embed_size_parameters_set_width_and_height() {
        assert_eq!(
            parse("[[File:Foo.jpg|100x200px|cap]]"),
            vec![Block::Figure(
                Box::default(),
                Box::new(Caption {
                    short: None,
                    long: vec![Block::Plain(vec![Inline::Str("cap".into())])],
                }),
                vec![Block::Plain(vec![Inline::Image(
                    Box::new(Attr {
                        id: carta_ast::Text::default(),
                        classes: vec![],
                        attributes: vec![
                            ("width".into(), "100".into()),
                            ("height".into(), "200".into()),
                        ],
                    }),
                    vec![],
                    Box::new(Target {
                        url: "Foo.jpg".into(),
                        title: "cap".into(),
                    }),
                )])],
            )]
        );
    }

    // Preceded by other text, the embed is expected as an inline `Image` inside the paragraph,
    // with the caption as its alt text.
    #[test]
    fn inline_embed_stays_an_image_not_a_figure() {
        assert_eq!(
            parse("x [[File:Foo.jpg|cap]]"),
            vec![Block::Para(vec![
                Inline::Str("x".into()),
                Inline::Space,
                Inline::Image(
                    Box::default(),
                    vec![Inline::Str("cap".into())],
                    Box::new(Target {
                        url: "Foo.jpg".into(),
                        title: "cap".into(),
                    }),
                ),
            ])]
        );
    }

    // `File:` with no name after the colon is expected as a plain `wikilink` to `File:`.
    #[test]
    fn empty_file_embed_is_an_ordinary_wikilink() {
        assert_eq!(
            parse("[[File:]]"),
            vec![Block::Para(vec![Inline::Link(
                Box::new(Attr {
                    id: carta_ast::Text::default(),
                    classes: vec!["wikilink".into()],
                    attributes: vec![],
                }),
                vec![Inline::Str("File:".into())],
                Box::new(Target {
                    url: "File:".into(),
                    title: "File:".into(),
                }),
            )])]
        );
    }

    // A labelled external link keeps its label; the unlabelled one is expected to show `1`.
    #[test]
    fn external_links_number_and_label() {
        assert_eq!(
            parse("[http://x.com lbl] [http://y.com]"),
            vec![Block::Para(vec![
                Inline::Link(
                    Box::default(),
                    vec![Inline::Str("lbl".into())],
                    Box::new(Target {
                        url: "http://x.com".into(),
                        title: carta_ast::Text::default(),
                    }),
                ),
                Inline::Space,
                Inline::Link(
                    Box::default(),
                    vec![Inline::Str("1".into())],
                    Box::new(Target {
                        url: "http://y.com".into(),
                        title: carta_ast::Text::default(),
                    }),
                ),
            ])]
        );
    }

    // The sentence-ending `.` is expected as text after the link, not as part of the url.
    #[test]
    fn bare_url_trims_trailing_punctuation() {
        assert_eq!(
            parse("see http://x.com."),
            vec![Block::Para(vec![
                Inline::Str("see".into()),
                Inline::Space,
                Inline::Link(
                    Box::default(),
                    vec![Inline::Str("http://x.com".into())],
                    Box::new(Target {
                        url: "http://x.com".into(),
                        title: carta_ast::Text::default(),
                    }),
                ),
                Inline::Str(".".into()),
            ])]
        );
    }

    // `&amp;` decodes inside a word without splitting it; `&copy;` decodes to U+00A9.
    #[test]
    fn entities_are_decoded_in_text() {
        assert_eq!(
            parse("AT&amp;T &copy;"),
            vec![Block::Para(vec![
                Inline::Str("AT&T".into()),
                Inline::Space,
                Inline::Str("\u{a9}".into()),
            ])]
        );
    }

    // Apostrophe runs inside `<nowiki>` are expected verbatim rather than as emphasis.
    #[test]
    fn nowiki_is_literal_text() {
        assert_eq!(
            parse("<nowiki>'''raw'''</nowiki>"),
            vec![Block::Para(vec![Inline::Str("'''raw'''".into())])]
        );
    }

    // `<ref>` content is expected as a `Note` holding a `Plain` block.
    #[test]
    fn reference_becomes_a_note() {
        assert_eq!(
            parse("x<ref>note</ref>"),
            vec![Block::Para(vec![
                Inline::Str("x".into()),
                Inline::Note(vec![Block::Plain(vec![Inline::Str("note".into())])]),
            ])]
        );
    }

    // Inside `<code>`, `&amp;` is expected decoded to `&` in the code text.
    #[test]
    fn code_tag_decodes_entities() {
        assert_eq!(
            parse("<code>a &amp; b</code>"),
            vec![Block::Para(vec![Inline::Code(
                Box::default(),
                "a & b".into()
            )])]
        );
    }

    // `<b>` and `</b>` are expected as raw HTML inlines around the text, built with `raw_html`
    // from the parent module.
    #[test]
    fn unknown_tag_passes_through_as_raw_html() {
        assert_eq!(
            parse("<b>x</b>"),
            vec![Block::Para(vec![
                raw_html("<b>".into()),
                Inline::Str("x".into()),
                raw_html("</b>".into()),
            ])]
        );
    }

    // Only one `SoftBreak` is expected between `x` and `y`: the comment line vanishes entirely.
    #[test]
    fn whole_line_comment_is_removed_with_its_newline() {
        assert_eq!(
            parse("x\n<!--c-->\ny"),
            vec![Block::Para(vec![
                Inline::Str("x".into()),
                Inline::SoftBreak,
                Inline::Str("y".into()),
            ])]
        );
    }

    // A comment in the middle of a line is expected to leave a `Space` between its neighbours.
    #[test]
    fn inline_comment_becomes_a_space() {
        assert_eq!(
            parse("a<!--c-->b"),
            vec![Block::Para(vec![
                Inline::Str("a".into()),
                Inline::Space,
                Inline::Str("b".into()),
            ])]
        );
    }

    // The `lang` value is expected as the code block's class, and the newlines next to the tags
    // are not part of the expected code text.
    #[test]
    fn syntax_highlight_block_keeps_language_and_content() {
        assert_eq!(
            parse("<syntaxhighlight lang=\"rust\">\nfn main(){}\n</syntaxhighlight>"),
            vec![Block::CodeBlock(
                Box::new(Attr {
                    id: carta_ast::Text::default(),
                    classes: vec!["rust".into()],
                    attributes: vec![],
                }),
                "fn main(){}".into(),
            )]
        );
    }

    // Four dashes alone are a rule; the same dashes followed by text stay paragraph text.
    #[test]
    fn horizontal_rule_requires_a_dashes_only_line() {
        assert_eq!(parse("----"), vec![Block::HorizontalRule]);
        assert_eq!(
            parse("----foo"),
            vec![Block::Para(vec![Inline::Str("----foo".into())])]
        );
    }

    // A line with a leading space is expected as inline `Code` in a paragraph: the leading space
    // is gone and the interior spaces appear as U+00A0.
    #[test]
    fn preformatted_lines_become_code() {
        assert_eq!(
            parse(" indented  line"),
            vec![Block::Para(vec![Inline::Code(
                Box::default(),
                "indented\u{a0}\u{a0}line".into()
            )])]
        );
    }

    // Bold markup on a preformatted line is still expected to apply, wrapping its own `Code`
    // piece; the spaces around it stay attached to the neighbouring pieces as U+00A0.
    #[test]
    fn preformatted_preserves_markup_and_spacing() {
        assert_eq!(
            parse(" a '''b''' c"),
            vec![Block::Para(vec![
                Inline::Code(Box::default(), "a\u{a0}".into()),
                Inline::Strong(vec![Inline::Code(Box::default(), "b".into())]),
                Inline::Code(Box::default(), "\u{a0}c".into()),
            ])]
        );
    }

    // A line-leading `{{tpl}}` is expected as a raw block; the rest of the line becomes a
    // paragraph of its own.
    #[test]
    fn block_template_is_raw_then_trailing_paragraph() {
        assert_eq!(
            parse("{{tpl}} trailing"),
            vec![
                Block::RawBlock(format_mediawiki(), "{{tpl}}".into()),
                Block::Para(vec![Inline::Str("trailing".into())]),
            ]
        );
    }

    /// Reads with the default option set and reports only whether the read completed without error,
    /// so a deeply nested input can be checked for graceful (non-panicking) handling.
    fn reads_ok(input: &str) -> bool {
        MediawikiReader
            .read(input, &ReaderOptions::default())
            .is_ok()
    }

    // Two shapes of deep list nesting: 3999 lines whose bullet depth grows by one each line, then
    // a single line opening with 20,000 `*`.
    #[test]
    fn adversarially_nested_wiki_list_does_not_panic() {
        let mut input = String::new();
        for n in 1..4000 {
            input.push_str(&"*".repeat(n));
            input.push_str(" item\n");
        }
        assert!(reads_ok(&input));
        let single = format!("{} item", "*".repeat(20_000));
        assert!(reads_ok(&single));
    }

    // 4000 table openers, one cell line, then 4000 table closers.
    #[test]
    fn adversarially_nested_tables_do_not_panic() {
        let input = format!("{}| x\n{}", "{|\n".repeat(4000), "|}\n".repeat(4000));
        assert!(reads_ok(&input));
    }

    // 4000 nested `<ul><li>` openers around a single `x`, followed by the matching closers.
    #[test]
    fn adversarially_nested_html_list_does_not_panic() {
        let input = format!("{}x{}", "<ul><li>".repeat(4000), "</li></ul>".repeat(4000));
        assert!(reads_ok(&input));
    }

    // 4000 nested `<ref>` openers, each preceded by text, followed by the matching closers.
    #[test]
    fn adversarially_nested_refs_do_not_panic() {
        let input = format!("{}x{}", "a<ref>".repeat(4000), "</ref>".repeat(4000));
        assert!(reads_ok(&input));
    }

    #[test]
    fn stacked_header_lines_do_not_blow_up() {
        // A run of consecutive `=`-prefixed lines with no blank separators and no same-line closer
        // once forced the heading-region lookahead to recompute each line's region for every
        // enclosing region — exponential in the number of stacked lines, which a nightly fuzz run
        // hit as a timeout. Memoizing the region scan makes it linear; a run this long would never
        // finish under the old code.
        let input = "== ~iT\n= w e\n= J".repeat(4000);
        assert!(reads_ok(&input));
    }
}