Skip to main content

carta_readers/
org.rs

1//! Org reader: parses Org markup into the document model.
2//!
3//! Parsing is two-phase. A line-oriented block pass consumes the input into [`Block`]s, dispatching
4//! on each line's opening: headlines (`* `), greater blocks (`#+begin_…`/`#+end_…`), keyword lines
5//! (`#+key: value`), tables (`|`), lists, drawers, fixed-width (`: `) and comment (`# `) lines, and
6//! everything else as a paragraph. A second, per-fragment pass then scans each paragraph, headline,
7//! cell, and item into [`Inline`]s: emphasis, verbatim, sub/superscripts, links, footnotes, math,
8//! entities, and citations.
9//!
10//! Footnote definitions are gathered up front and their references resolved inline, so a `[fn:label]`
//! reference expands to an [`Inline::Note`] carrying the definition's blocks.
12
13use std::borrow::Cow;
14use std::collections::BTreeMap;
15use std::mem;
16
17use carta_ast::{
18    Alignment, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Format, Inline,
19    ListAttributes, ListNumberDelim, ListNumberStyle, MathType, MetaValue, QuoteType, Row, Table,
20    TableBody, TableFoot, TableHead, Text, slug, slug_gfm,
21};
22use carta_core::{Extension, Extensions, Reader, ReaderOptions, Result};
23
24use crate::heading_ids::{IdRegistry, IdScheme, fold_to_ascii};
25
/// Parses Org markup into the document model.
///
/// The reader is a unit type with no fields; each call to `read` builds its own footnote table,
/// identifier registry, and metadata map.
///
/// The default extension set enables auto identifiers, citations, task-list checkboxes, and the
/// typographic replacements of `special_strings`; `smart` adds curly quotes, `fancy_lists` numbered
/// list markers, and `gfm_auto_identifiers`/`ascii_identifiers` alternate identifier shapes.
#[derive(Debug, Default, Clone, Copy)]
pub struct OrgReader;
33
34impl Reader for OrgReader {
35    fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
36        let ext = options.extensions;
37        let normalized = normalize(input);
38        let lines: Vec<&str> = normalized.split('\n').collect();
39
40        let (body_lines, defs) = collect_footnotes(&lines);
41
42        // Footnote bodies are parsed first so a reference in the body can carry the definition's
43        // blocks. Nested footnote references inside a definition resolve against an empty table.
44        let empty_notes: BTreeMap<String, Vec<Block>> = BTreeMap::new();
45        let mut notes: BTreeMap<String, Vec<Block>> = BTreeMap::new();
46        for (label, text) in &defs {
47            let def_lines: Vec<&str> = text.split('\n').collect();
48            let mut throwaway_ids = new_id_registry();
49            let mut throwaway_meta = BTreeMap::new();
50            let blocks = parse_blocks(
51                &def_lines,
52                ext,
53                &empty_notes,
54                &mut throwaway_ids,
55                &mut throwaway_meta,
56            );
57            notes.insert(label.clone(), blocks);
58        }
59
60        let mut ids = new_id_registry();
61        let mut meta: BTreeMap<Text, MetaValue> = BTreeMap::new();
62        let blocks = parse_blocks(&body_lines, ext, &notes, &mut ids, &mut meta);
63
64        Ok(Document {
65            meta,
66            blocks,
67            ..Document::default()
68        })
69    }
70}
71
/// Rewrites every `\r\n` pair and every lone `\r` as `\n`, so the line-oriented pass only ever sees
/// one terminator. Input that contains no carriage return is returned borrowed, without copying.
fn normalize(input: &str) -> Cow<'_, str> {
    if !input.contains('\r') {
        return Cow::Borrowed(input);
    }
    let mut unified = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        if c == '\r' {
            // A `\n` directly after the `\r` belongs to the same terminator; swallow it.
            chars.next_if_eq(&'\n');
            unified.push('\n');
        } else {
            unified.push(c);
        }
    }
    Cow::Owned(unified)
}
81
82// -- Footnote gathering ------------------------------------------------------------------------
83
84/// Splits block-level footnote definitions (`[fn:label] …`) out of the line stream, returning the
85/// remaining body lines and the ordered `(label, joined-body)` definitions. A definition's body
86/// continues across single blank lines, so it can hold several blocks; it ends at the next footnote
87/// definition, a headline, two consecutive blank lines, or the end of input.
88fn collect_footnotes<'a>(lines: &[&'a str]) -> (Vec<&'a str>, Vec<(String, String)>) {
89    let mut body = Vec::new();
90    let mut defs = Vec::new();
91    let mut i = 0;
92    while let Some(line) = lines.get(i) {
93        if let Some((label, first)) = footnote_definition(line) {
94            let mut collected = vec![first];
95            i += 1;
96            while let Some(next) = lines.get(i) {
97                if footnote_definition(next).is_some() || headline_level(next).is_some() {
98                    break;
99                }
100                if next.trim().is_empty()
101                    && lines
102                        .get(i + 1)
103                        .is_none_or(|following| following.trim().is_empty())
104                {
105                    break;
106                }
107                collected.push((*next).to_owned());
108                i += 1;
109            }
110            defs.push((label, collected.join("\n")));
111        } else {
112            body.push(*line);
113            i += 1;
114        }
115    }
116    (body, defs)
117}
118
119/// Recognizes a block-level footnote definition `[fn:label] rest`, returning the label and the text
120/// after the closing bracket.
121fn footnote_definition(line: &str) -> Option<(String, String)> {
122    let rest = line.strip_prefix("[fn:")?;
123    let close = rest.find(']')?;
124    let label = &rest[..close];
125    if label.is_empty() || !label.chars().all(is_footnote_label_char) {
126        return None;
127    }
128    let after = rest.get(close + 1..).unwrap_or("");
129    Some((label.to_owned(), after.trim_start().to_owned()))
130}
131
/// Whether `c` may appear in a footnote label: an ASCII letter or digit, `_`, or `-`.
fn is_footnote_label_char(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-')
}
135
136// -- Identifier derivation ---------------------------------------------------------------------
137
138/// A fresh heading-identifier registry with `section` reserved from the start, so the first heading
139/// that reduces to it is already `section-1`.
140fn new_id_registry() -> IdRegistry {
141    let mut ids = IdRegistry::default();
142    ids.reserve_native("section");
143    ids
144}
145
146/// Derives an identifier for `text` under the active extensions, or an empty string when no
147/// auto-identifier extension is on. The slug shape follows the extension, but headings always
148/// disambiguate natively: an empty slug becomes `section` and repeats increment until unused.
149fn assign_id(ids: &mut IdRegistry, text: &str, ext: Extensions) -> String {
150    let Some(scheme) = IdScheme::select(ext, true) else {
151        return String::new();
152    };
153    let folded;
154    let source = if ext.contains(Extension::AsciiIdentifiers) {
155        folded = fold_to_ascii(text);
156        folded.as_str()
157    } else {
158        text
159    };
160    let base = match scheme {
161        IdScheme::Plain => slug(source),
162        IdScheme::Gfm => slug_gfm(source),
163    };
164    ids.assign_native(base)
165}
166
167// -- Block parsing -----------------------------------------------------------------------------
168
169/// Affiliated keywords (`#+caption:`, `#+name:`) that attach to the block that follows them.
170#[derive(Default)]
171struct Affiliated {
172    caption: Option<Vec<Inline>>,
173    name: Option<String>,
174}
175
176impl Affiliated {
177    fn is_empty(&self) -> bool {
178        self.caption.is_none() && self.name.is_none()
179    }
180}
181
/// Runs the line-oriented block pass over `lines` and returns the blocks found, in order.
///
/// Blank lines are skipped. Every other line is tested against the block openers in a fixed
/// order (headline, greater block, keyword line, comment line, horizontal rule, fixed-width block,
/// drawer, table, list); the first opener that matches consumes its lines. A line that matches
/// none of them starts a paragraph, which runs until a blank line or a line for which
/// [`opens_block`] is true.
///
/// `notes` is passed through to inline parsing, `ids` is used when building headlines, and `meta`
/// receives document metadata from keyword lines. Nested containers (drawers, and the greater-block
/// and list helpers that take the same arguments) share `ids` and `meta` with the caller.
///
/// Affiliated keywords (`pending`) are set by keyword lines, attached to the next greater block,
/// paragraph, or table, and discarded when a headline, horizontal rule, fixed-width block, drawer,
/// or list intervenes. Comment lines leave them untouched.
#[allow(clippy::too_many_lines)]
fn parse_blocks(
    lines: &[&str],
    ext: Extensions,
    notes: &BTreeMap<String, Vec<Block>>,
    ids: &mut IdRegistry,
    meta: &mut BTreeMap<Text, MetaValue>,
) -> Vec<Block> {
    let mut out = Vec::new();
    let mut pending = Affiliated::default();
    let mut i = 0;
    while let Some(&line) = lines.get(i) {
        if line.trim().is_empty() {
            i += 1;
            continue;
        }
        // Headline.
        if let Some(level) = headline_level(line) {
            i += 1;
            // A property drawer directly under the headline is consumed here; its custom id, when
            // present, replaces the derived identifier.
            let mut id_override = None;
            if let Some((custom_id, skip)) = read_property_drawer(lines, i) {
                id_override = custom_id;
                i += skip;
            }
            out.push(build_headline(line, level, id_override, ext, notes, ids));
            pending = Affiliated::default();
            continue;
        }
        // Greater block: #+begin_… / #+end_….
        if let Some(name) = greater_block_open(line) {
            let (block, consumed) = parse_greater_block(lines, i, &name, ext, notes, ids, meta);
            i += consumed;
            // The helper may yield no block at all; the lines are consumed either way.
            if let Some(block) = block {
                out.push(apply_affiliated(block, &mut pending));
            }
            continue;
        }
        // Keyword line: #+key: value.
        if let Some((key, value)) = keyword_line(line) {
            handle_keyword(&key, &value, line, ext, notes, meta, &mut pending, &mut out);
            i += 1;
            continue;
        }
        // Comment line.
        if line.trim_start() == "#" || line.trim_start().starts_with("# ") {
            i += 1;
            continue;
        }
        // Horizontal rule.
        if is_horizontal_rule(line) {
            out.push(Block::HorizontalRule);
            i += 1;
            pending = Affiliated::default();
            continue;
        }
        // Fixed-width (colon) block.
        if is_fixed_width(line) {
            let (text, consumed) = collect_fixed_width(lines, i);
            out.push(Block::CodeBlock(Box::default(), text.into()));
            i += consumed;
            pending = Affiliated::default();
            continue;
        }
        // Drawer.
        // NOTE(review): `collect_drawer` runs to the end of input when no `:END:` follows, so an
        // unterminated metadata drawer elides the remainder of this slice — confirm intended.
        if let Some(name) = drawer_open(line) {
            let (inner, consumed) = collect_drawer(lines, i);
            i += consumed;
            // A metadata drawer holds bookkeeping, not document content, and is elided; every other
            // named drawer becomes a div wrapping its parsed contents.
            if name.eq_ignore_ascii_case("PROPERTIES") || name.eq_ignore_ascii_case("LOGBOOK") {
                pending = Affiliated::default();
                continue;
            }
            let body = parse_blocks(&inner, ext, notes, ids, meta);
            let attr = Attr {
                classes: vec![name.into(), "drawer".into()],
                ..Attr::default()
            };
            out.push(Block::Div(Box::new(attr), body));
            pending = Affiliated::default();
            continue;
        }
        // Table.
        if is_table_line(line) {
            let (rows, consumed) = collect_table(lines, i);
            // `build_table` takes the pending caption/name itself, leaving `pending` empty.
            let table = build_table(&rows, ext, notes, &mut pending);
            out.push(table);
            i += consumed;
            continue;
        }
        // List.
        if list_marker(line).is_some() {
            let (block, consumed) = parse_list(lines, i, ext, notes, ids, meta);
            i += consumed;
            if let Some(block) = block {
                out.push(block);
            }
            pending = Affiliated::default();
            continue;
        }
        // Paragraph: gather until a structural line or blank. The dispatch above already proved this
        // first line is neither blank nor a block opener, so continuation begins at the next line.
        let start = i;
        i += 1;
        while let Some(&l) = lines.get(i) {
            if l.trim().is_empty() || opens_block(l) {
                break;
            }
            i += 1;
        }
        // Each line is trimmed on both sides before the lines are rejoined with `\n`.
        let text = lines
            .get(start..i)
            .unwrap_or(&[])
            .iter()
            .map(|l| l.trim())
            .collect::<Vec<_>>()
            .join("\n");
        let para = Block::Para(parse_inlines(&text, ext, notes));
        out.push(apply_affiliated(para, &mut pending));
    }
    out
}
304
305/// Whether a line begins a block that interrupts an open paragraph.
306fn opens_block(line: &str) -> bool {
307    headline_level(line).is_some()
308        || greater_block_open(line).is_some()
309        || keyword_line(line).is_some()
310        || line.trim_start() == "#"
311        || line.trim_start().starts_with("# ")
312        || is_horizontal_rule(line)
313        || is_fixed_width(line)
314        || drawer_open(line).is_some()
315        || is_table_line(line)
316        || list_marker(line).is_some()
317}
318
319/// Attaches a pending caption/name to a freshly built block: a caption turns a lone-image paragraph
320/// into a figure, and a name supplies its identifier.
321fn apply_affiliated(block: Block, pending: &mut Affiliated) -> Block {
322    if pending.is_empty() {
323        return block;
324    }
325    let Affiliated { caption, name } = mem::take(pending);
326    match block {
327        Block::Para(inlines) if is_lone_image(&inlines) => {
328            let attr = Attr {
329                id: name.unwrap_or_default().into(),
330                ..Attr::default()
331            };
332            let long = caption.map(|c| vec![Block::Plain(c)]).unwrap_or_default();
333            Block::Figure(
334                Box::new(attr),
335                Box::new(Caption { short: None, long }),
336                vec![Block::Plain(inlines)],
337            )
338        }
339        Block::CodeBlock(mut attr, text) => {
340            if let Some(name) = name {
341                attr.id = name.into();
342            }
343            Block::CodeBlock(attr, text)
344        }
345        other => other,
346    }
347}
348
349fn is_lone_image(inlines: &[Inline]) -> bool {
350    matches!(inlines, [Inline::Image(..)])
351}
352
/// Returns the number of leading `*` when `line` is a headline: one or more `*` starting at column
/// zero, immediately followed by a space. Any other line yields `None`.
fn headline_level(line: &str) -> Option<usize> {
    let stars = line.bytes().take_while(|&b| b == b'*').count();
    let followed_by_space = line.as_bytes().get(stars) == Some(&b' ');
    (stars > 0 && followed_by_space).then_some(stars)
}
365
366/// Builds a `Header`, splitting off a leading todo keyword and trailing tags and deriving an
367/// identifier from the remaining title text (or the property drawer's custom id).
368fn build_headline(
369    line: &str,
370    level: usize,
371    id_override: Option<String>,
372    ext: Extensions,
373    notes: &BTreeMap<String, Vec<Block>>,
374    ids: &mut IdRegistry,
375) -> Block {
376    let rest = line.get(level..).unwrap_or("").trim();
377
378    let (todo, rest) = split_todo_keyword(rest);
379    let (title_text, tags) = split_tags(rest);
380
381    let title_inlines = parse_inlines(title_text, ext, notes);
382
383    let id = if let Some(custom) = id_override {
384        ids.reserve_native(&custom);
385        custom
386    } else {
387        assign_id(ids, &carta_ast::to_plain_text(&title_inlines), ext)
388    };
389
390    let mut inlines = Vec::new();
391    if let Some(keyword) = todo {
392        inlines.push(todo_span(keyword));
393        inlines.push(Inline::Space);
394    }
395    inlines.extend(title_inlines);
396    if !tags.is_empty() {
397        inlines.push(Inline::Space);
398        for (n, tag) in tags.iter().enumerate() {
399            if n > 0 {
400                inlines.push(Inline::Str("\u{a0}".into()));
401            }
402            inlines.push(tag_span(tag));
403        }
404    }
405
406    let attr = Attr {
407        id: id.into(),
408        ..Attr::default()
409    };
410    let level = i32::try_from(level).unwrap_or(6).clamp(1, 6);
411    Block::Header(level, Box::new(attr), inlines)
412}
413
414fn todo_span(keyword: &str) -> Inline {
415    let state = if keyword == "DONE" { "done" } else { "todo" };
416    let attr = Attr {
417        classes: vec![state.into(), keyword.into()],
418        ..Attr::default()
419    };
420    Inline::Span(Box::new(attr), vec![Inline::Str(keyword.into())])
421}
422
423fn tag_span(tag: &str) -> Inline {
424    let attr = Attr {
425        classes: vec!["tag".into()],
426        attributes: vec![("tag-name".into(), tag.into())],
427        ..Attr::default()
428    };
429    Inline::Span(
430        Box::new(attr),
431        vec![Inline::SmallCaps(vec![Inline::Str(tag.into())])],
432    )
433}
434
/// Peels a leading `TODO` or `DONE` keyword off a headline body.
///
/// The keyword only counts when it is followed by a space or ends the text. On a match the
/// keyword and the remaining text (leading whitespace removed) are returned; otherwise the result
/// is `None` paired with the input unchanged.
fn split_todo_keyword(rest: &str) -> (Option<&str>, &str) {
    const KEYWORDS: [&str; 2] = ["TODO", "DONE"];
    KEYWORDS
        .iter()
        .find_map(|&keyword| {
            let after = rest.strip_prefix(keyword)?;
            let delimited = after.is_empty() || after.starts_with(' ');
            delimited.then(|| (Some(keyword), after.trim_start()))
        })
        .unwrap_or((None, rest))
}
447
/// Separates a trailing `:tag:tag:` group from a headline body.
///
/// The group must be the last whitespace-delimited word, be preceded by whitespace, start and end
/// with `:`, and contain only ASCII letters and digits, `_`, `@`, `#`, `%`, and `:`, with at least
/// one non-empty tag. On a match the title (trailing whitespace removed) and the tag names are
/// returned; otherwise the input is returned unchanged with no tags.
fn split_tags(rest: &str) -> (&str, Vec<String>) {
    /// The `(title, tags)` split, or `None` when the text does not end in a tag group.
    fn trailing_tags(rest: &str) -> Option<(&str, Vec<String>)> {
        let trimmed = rest.trim_end();
        let space = trimmed.rfind(char::is_whitespace)?;
        let inner = trimmed
            .get(space + 1..)?
            .strip_prefix(':')?
            .strip_suffix(':')?;
        let is_tag_char =
            |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '@' | '#' | '%' | ':');
        if inner.is_empty() || !inner.chars().all(is_tag_char) {
            return None;
        }
        let tags: Vec<String> = inner
            .split(':')
            .filter(|tag| !tag.is_empty())
            .map(str::to_owned)
            .collect();
        if tags.is_empty() {
            return None;
        }
        let title = trimmed.get(..space).unwrap_or("").trim_end();
        Some((title, tags))
    }

    trailing_tags(rest).unwrap_or((rest, Vec::new()))
}
479
/// Reads a `:PROPERTIES:` … `:END:` drawer that begins exactly at `lines[start]`.
///
/// Returns the value of the last `:CUSTOM_ID:` property seen (if any) together with the number of
/// lines the drawer occupies, opener and closer included. Returns `None` when `lines[start]` is
/// not a `:PROPERTIES:` line or when no `:END:` line follows; in both cases nothing is consumed.
/// Marker and key comparisons ignore ASCII case.
fn read_property_drawer(lines: &[&str], start: usize) -> Option<(Option<String>, usize)> {
    let opener = lines.get(start)?;
    if !opener.trim().eq_ignore_ascii_case(":PROPERTIES:") {
        return None;
    }
    let mut custom = None;
    for (offset, line) in lines.get(start + 1..)?.iter().enumerate() {
        let trimmed = line.trim();
        if trimmed.eq_ignore_ascii_case(":END:") {
            // `offset` counts from the line after the opener; add the opener and the closer.
            return Some((custom, offset + 2));
        }
        let property = trimmed
            .strip_prefix(':')
            .and_then(|entry| entry.split_once(':'))
            .filter(|(key, _)| key.eq_ignore_ascii_case("CUSTOM_ID"));
        if let Some((_, value)) = property {
            custom = Some(value.trim().to_owned());
        }
    }
    // Unterminated drawer: leave the lines to the block parser.
    None
}
505
506// -- Greater blocks ----------------------------------------------------------------------------
507
508/// The block name of a `#+begin_<name>` line, as written (case preserved). Callers compare it
509/// case-insensitively.
510fn greater_block_open(line: &str) -> Option<String> {
511    let trimmed = line.trim_start();
512    let rest = strip_prefix_ci(trimmed, "#+begin_")?;
513    let name: String = rest
514        .chars()
515        .take_while(|c| !c.is_whitespace())
516        .collect::<String>();
517    if name.is_empty() { None } else { Some(name) }
518}
519
520#[allow(clippy::too_many_arguments)]
521fn parse_greater_block(
522    lines: &[&str],
523    start: usize,
524    name: &str,
525    ext: Extensions,
526    notes: &BTreeMap<String, Vec<Block>>,
527    ids: &mut IdRegistry,
528    meta: &mut BTreeMap<Text, MetaValue>,
529) -> (Option<Block>, usize) {
530    // `name` is the block name parsed from this same open line, so the header arguments are whatever
531    // follows it on that line.
532    let open_line = lines.get(start).copied().unwrap_or("");
533    let header_args = strip_prefix_ci(open_line.trim_start(), "#+begin_")
534        .unwrap_or("")
535        .get(name.len()..)
536        .unwrap_or("")
537        .trim();
538
539    let lower = name.to_ascii_lowercase();
540    let end_marker = format!("#+end_{lower}");
541    let mut depth = 1usize;
542    let mut content: Vec<&str> = Vec::new();
543    let mut i = start + 1;
544    while let Some(&line) = lines.get(i) {
545        let t = line.trim_start();
546        if let Some(open) = greater_block_open(line)
547            && open.eq_ignore_ascii_case(name)
548        {
549            depth += 1;
550        }
551        if t.eq_ignore_ascii_case(&end_marker) {
552            depth -= 1;
553            if depth == 0 {
554                i += 1;
555                break;
556            }
557        }
558        content.push(line);
559        i += 1;
560    }
561    let consumed = i - start;
562
563    let block = match lower.as_str() {
564        "src" => {
565            let lang = header_args
566                .split_whitespace()
567                .next()
568                .unwrap_or("")
569                .to_owned();
570            let attr = Attr {
571                classes: if lang.is_empty() {
572                    vec![]
573                } else {
574                    vec![lang.into()]
575                },
576                ..Attr::default()
577            };
578            Some(Block::CodeBlock(
579                Box::new(attr),
580                dedent_verbatim(&content).into(),
581            ))
582        }
583        "example" => Some(Block::CodeBlock(
584            Box::default(),
585            dedent_verbatim(&content).into(),
586        )),
587        "export" => {
588            let fmt = header_args
589                .split_whitespace()
590                .next()
591                .unwrap_or("")
592                .to_owned();
593            Some(Block::RawBlock(
594                Format(fmt.into()),
595                verbatim(&content).into(),
596            ))
597        }
598        "quote" => Some(Block::BlockQuote(parse_blocks(
599            &content, ext, notes, ids, meta,
600        ))),
601        "verse" => Some(Block::LineBlock(
602            content
603                .iter()
604                .map(|l| parse_inlines(l.trim(), ext, notes))
605                .collect(),
606        )),
607        "comment" => None,
608        _ => {
609            let attr = Attr {
610                classes: vec![name.into()],
611                ..Attr::default()
612            };
613            Some(Block::Div(
614                Box::new(attr),
615                parse_blocks(&content, ext, notes, ids, meta),
616            ))
617        }
618    };
619    (block, consumed)
620}
621
/// Concatenates `lines`, terminating every line (the last one included) with `\n`.
fn verbatim(lines: &[&str]) -> String {
    lines.iter().flat_map(|line| [*line, "\n"]).collect()
}
631
/// Joins verbatim content lines, each terminated by `\n`, after removing the leading indentation
/// common to all non-blank lines.
///
/// The common indentation is the smallest number of leading-whitespace bytes on any non-blank
/// line. Blank (whitespace-only) lines are copied through untouched. When the common width falls
/// inside a multi-byte whitespace character on some line, that line is cut at the nearest
/// character boundary at or below the width, so its content is always kept.
fn dedent_verbatim(lines: &[&str]) -> String {
    let indent = lines
        .iter()
        .filter(|l| !l.trim().is_empty())
        .map(|l| l.len() - l.trim_start().len())
        .min()
        .unwrap_or(0);
    let mut out = String::with_capacity(lines.iter().map(|l| l.len() + 1).sum());
    for line in lines {
        if line.trim().is_empty() {
            out.push_str(line);
        } else {
            // `indent` is a byte count; slicing there directly would fail (and previously dropped
            // the whole line) whenever it lands inside a multi-byte whitespace character.
            let cut = (0..=indent)
                .rev()
                .find(|&at| line.is_char_boundary(at))
                .unwrap_or(0);
            out.push_str(line.get(cut..).unwrap_or(line));
        }
        out.push('\n');
    }
    out
}
653
654// -- Keyword lines -----------------------------------------------------------------------------
655
656/// Splits a `#+key: value` keyword line into `(key, value)`. Block delimiters (`#+begin_…`) are not
657/// keyword lines.
658fn keyword_line(line: &str) -> Option<(String, String)> {
659    let trimmed = line.trim_start();
660    let rest = trimmed.strip_prefix("#+")?;
661    let colon = rest.find(':')?;
662    let key = rest.get(..colon)?;
663    if key.is_empty()
664        || !key
665            .chars()
666            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '-'))
667    {
668        return None;
669    }
670    if key.eq_ignore_ascii_case("begin_src")
671        || starts_with_ci(key, "begin_")
672        || starts_with_ci(key, "end_")
673    {
674        return None;
675    }
676    let value = rest.get(colon + 1..).unwrap_or("").trim_start().to_owned();
677    Some((key.to_owned(), value))
678}
679
680#[allow(clippy::too_many_arguments)]
681fn handle_keyword(
682    key: &str,
683    value: &str,
684    line: &str,
685    ext: Extensions,
686    notes: &BTreeMap<String, Vec<Block>>,
687    meta: &mut BTreeMap<Text, MetaValue>,
688    pending: &mut Affiliated,
689    out: &mut Vec<Block>,
690) {
691    let upper = key.to_ascii_uppercase();
692    match upper.as_str() {
693        "TITLE" | "SUBTITLE" | "AUTHOR" | "DATE" | "KEYWORDS" | "DESCRIPTION" => {
694            meta.insert(
695                upper.to_ascii_lowercase().into(),
696                MetaValue::MetaInlines(parse_inlines(value, ext, notes)),
697            );
698        }
699        "LANGUAGE" => {
700            meta.insert("lang".into(), MetaValue::MetaString(value.into()));
701        }
702        "CAPTION" => pending.caption = Some(parse_inlines(value, ext, notes)),
703        "NAME" | "LABEL" => pending.name = Some(value.to_owned()),
704        "OPTIONS" | "TODO" | "SEQ_TODO" | "TYP_TODO" | "PRIORITIES" | "TAGS" | "COLUMNS"
705        | "SETUPFILE" | "CONSTANTS" | "MACRO" | "DRAWERS" | "ARCHIVE" | "RESULTS" | "HEADER"
706        | "PLOT" => {}
707        other if other.starts_with("ATTR_") => {}
708        other if other.starts_with("LATEX_HEADER") => {
709            append_header_include(meta, "latex", value);
710        }
711        other if other.starts_with("HTML_HEAD") => {
712            append_header_include(meta, "html", value);
713        }
714        _ => out.push(Block::RawBlock(
715            Format("org".into()),
716            line.trim_end().into(),
717        )),
718    }
719}
720
721fn append_header_include(meta: &mut BTreeMap<Text, MetaValue>, format: &str, value: &str) {
722    let entry =
723        MetaValue::MetaInlines(vec![Inline::RawInline(Format(format.into()), value.into())]);
724    match meta
725        .entry("header-includes".into())
726        .or_insert_with(|| MetaValue::MetaList(Vec::new()))
727    {
728        MetaValue::MetaList(list) => list.push(entry),
729        slot => *slot = MetaValue::MetaList(vec![entry]),
730    }
731}
732
733// -- Fixed-width, drawers, rules ---------------------------------------------------------------
734
/// True when the line, ignoring surrounding whitespace, is five or more `-` and nothing else.
fn is_horizontal_rule(line: &str) -> bool {
    let dashes = line.trim();
    dashes.len() >= 5 && dashes.bytes().all(|b| b == b'-')
}
739
/// True when the line, after leading whitespace, is a lone `:` or starts with `: `.
fn is_fixed_width(line: &str) -> bool {
    match line.trim_start().strip_prefix(':') {
        Some(rest) => rest.is_empty() || rest.starts_with(' '),
        None => false,
    }
}
744
745fn collect_fixed_width(lines: &[&str], start: usize) -> (String, usize) {
746    let mut text = String::new();
747    let mut i = start;
748    while let Some(&line) = lines.get(i) {
749        if !is_fixed_width(line) {
750            break;
751        }
752        let t = line.trim_start();
753        let content = t
754            .strip_prefix(": ")
755            .or_else(|| t.strip_prefix(':'))
756            .unwrap_or("");
757        text.push_str(content);
758        text.push('\n');
759        i += 1;
760    }
761    (text, i - start)
762}
763
/// Returns the drawer name when the line, ignoring surrounding whitespace, has the form `:NAME:`.
/// The name must be non-empty, must not be `END` (in any ASCII case), and may contain only ASCII
/// letters and digits, `_`, `-`, `@`, `#`, and `%`.
fn drawer_open(line: &str) -> Option<String> {
    let name = line.trim().strip_prefix(':')?.strip_suffix(':')?;
    // `:` is not a name character, so an inner colon is rejected by the same test.
    let is_name_char =
        |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '@' | '#' | '%');
    let valid = !name.is_empty()
        && !name.eq_ignore_ascii_case("END")
        && name.chars().all(is_name_char);
    valid.then(|| name.to_owned())
}
779
/// Collects the lines inside the drawer opened at `lines[start]`.
///
/// Returns the lines between the opener and the first `:END:` line (matched ignoring ASCII case
/// and surrounding whitespace), plus the number of lines consumed, counting the opener and the
/// closer. When no `:END:` follows, every remaining line is taken as content.
fn collect_drawer<'a>(lines: &[&'a str], start: usize) -> (Vec<&'a str>, usize) {
    let rest = lines.get(start + 1..).unwrap_or(&[]);
    let close = rest
        .iter()
        .position(|line| line.trim().eq_ignore_ascii_case(":END:"));
    let inner: Vec<&'a str> = rest
        .iter()
        .take(close.unwrap_or(rest.len()))
        .copied()
        .collect();
    // One line for the opener, the content, and one more when a closer was found.
    let consumed = 1 + inner.len() + usize::from(close.is_some());
    (inner, consumed)
}
793
794// -- Tables ------------------------------------------------------------------------------------
795
/// True when the first non-whitespace character of the line is `|`.
fn is_table_line(line: &str) -> bool {
    line.trim_start().as_bytes().first() == Some(&b'|')
}
799
/// One parsed table row: either a separator (`|---+---|`) or content cells.
enum TableRow {
    /// A rule line; it carries no data and only marks a boundary between groups of rows.
    Separator,
    /// A content row, holding the text of each cell in column order.
    Cells(Vec<String>),
}
805
806fn collect_table(lines: &[&str], start: usize) -> (Vec<TableRow>, usize) {
807    let mut rows = Vec::new();
808    let mut i = start;
809    while let Some(&line) = lines.get(i) {
810        if !is_table_line(line) {
811            break;
812        }
813        rows.push(parse_table_row(line));
814        i += 1;
815    }
816    (rows, i - start)
817}
818
819fn parse_table_row(line: &str) -> TableRow {
820    let t = line.trim();
821    let inner = t.strip_prefix('|').unwrap_or(t);
822    let inner = inner.strip_suffix('|').unwrap_or(inner);
823    if !inner.is_empty()
824        && inner
825            .chars()
826            .all(|c| matches!(c, '-' | '+' | '|' | ' ' | ':'))
827    {
828        return TableRow::Separator;
829    }
830    let cells = inner.split('|').map(|c| c.trim().to_owned()).collect();
831    TableRow::Cells(cells)
832}
833
834fn build_table(
835    rows: &[TableRow],
836    ext: Extensions,
837    notes: &BTreeMap<String, Vec<Block>>,
838    pending: &mut Affiliated,
839) -> Block {
840    let mut head_rows: Vec<Vec<String>> = Vec::new();
841    let mut body_rows: Vec<Vec<String>> = Vec::new();
842    let mut seen_separator = false;
843    let mut header_done = false;
844    for row in rows {
845        match row {
846            TableRow::Separator => {
847                if !body_rows.is_empty() {
848                    header_done = true;
849                } else if !head_rows.is_empty() {
850                    seen_separator = true;
851                }
852            }
853            TableRow::Cells(cells) => {
854                if seen_separator || header_done {
855                    body_rows.push(cells.clone());
856                } else {
857                    head_rows.push(cells.clone());
858                }
859            }
860        }
861    }
862    // With no separator, every row is a body row.
863    if !seen_separator {
864        body_rows.splice(0..0, head_rows.drain(..));
865    }
866
867    let columns = head_rows
868        .iter()
869        .chain(body_rows.iter())
870        .map(Vec::len)
871        .max()
872        .unwrap_or(0);
873
874    let col_specs = (0..columns)
875        .map(|_| ColSpec {
876            align: Alignment::AlignDefault,
877            width: ColWidth::ColWidthDefault,
878        })
879        .collect();
880
881    let to_rows = |cells: &[Vec<String>]| -> Vec<Row> {
882        cells
883            .iter()
884            .map(|row| Row {
885                attr: Attr::default(),
886                cells: (0..columns)
887                    .map(|c| build_cell(row.get(c).map_or("", String::as_str), ext, notes))
888                    .collect(),
889            })
890            .collect()
891    };
892
893    let Affiliated { caption, name } = mem::take(pending);
894    let caption = Caption {
895        short: None,
896        long: caption.map(|c| vec![Block::Plain(c)]).unwrap_or_default(),
897    };
898
899    let table = Table {
900        attr: Attr {
901            id: name.unwrap_or_default().into(),
902            ..Attr::default()
903        },
904        caption,
905        col_specs,
906        head: TableHead {
907            attr: Attr::default(),
908            rows: to_rows(&head_rows),
909        },
910        bodies: vec![TableBody {
911            attr: Attr::default(),
912            row_head_columns: 0,
913            head: Vec::new(),
914            body: to_rows(&body_rows),
915        }],
916        foot: TableFoot::default(),
917    };
918    Block::Table(Box::new(table))
919}
920
921fn build_cell(text: &str, ext: Extensions, notes: &BTreeMap<String, Vec<Block>>) -> Cell {
922    let content = if text.is_empty() {
923        Vec::new()
924    } else {
925        vec![Block::Plain(parse_inlines(text, ext, notes))]
926    };
927    Cell {
928        attr: Attr::default(),
929        align: Alignment::AlignDefault,
930        row_span: 1,
931        col_span: 1,
932        content,
933    }
934}
935
936// -- Lists -------------------------------------------------------------------------------------
937
/// The kind of a list item marker.
#[derive(Clone, Copy, PartialEq)]
enum Marker {
    /// An unordered marker: `-`, `+`, or an indented `*`.
    Bullet,
    /// A numbered or lettered marker, with its numbering style and the delimiter that follows it.
    Ordered(ListNumberStyle, ListNumberDelim),
}
944
/// A recognized list marker: the column it sits at, the column where the item's content begins,
/// and the marker kind. Columns are byte offsets into the line.
struct MarkerInfo {
    /// Length in bytes of the line's leading whitespace, i.e. the marker's column.
    indent: usize,
    /// Offset just past the marker and the single space after it; may lie beyond the end of the
    /// line when the marker ends the line.
    content_col: usize,
    /// Whether the marker is a bullet or an ordered marker.
    kind: Marker,
}
952
953fn list_marker(line: &str) -> Option<MarkerInfo> {
954    let indent = line.len() - line.trim_start().len();
955    let rest = line.get(indent..)?;
956    let bytes = rest.as_bytes();
957    // Bullet: '-' or '+', or '*' only when indented.
958    if let Some(&c) = bytes.first()
959        && (matches!(c, b'-' | b'+') || (c == b'*' && indent > 0))
960        && (bytes.get(1) == Some(&b' ') || bytes.len() == 1)
961    {
962        return Some(MarkerInfo {
963            indent,
964            content_col: indent + 2,
965            kind: Marker::Bullet,
966        });
967    }
968    // Ordered: digits or a single letter, then '.' or ')'.
969    let mut j = 0;
970    while bytes.get(j).is_some_and(u8::is_ascii_digit) {
971        j += 1;
972    }
973    let style = if j > 0 {
974        ListNumberStyle::Decimal
975    } else if let Some(&letter) = bytes
976        .first()
977        .filter(|c| c.is_ascii_alphabetic())
978        .filter(|_| bytes.get(1).is_some_and(|&c| c == b'.' || c == b')'))
979    {
980        j = 1;
981        if letter.is_ascii_uppercase() {
982            ListNumberStyle::UpperAlpha
983        } else {
984            ListNumberStyle::LowerAlpha
985        }
986    } else {
987        return None;
988    };
989    let delim = match bytes.get(j) {
990        Some(b'.') => ListNumberDelim::Period,
991        Some(b')') => ListNumberDelim::OneParen,
992        _ => return None,
993    };
994    if bytes.get(j + 1) == Some(&b' ') || bytes.len() == j + 1 {
995        Some(MarkerInfo {
996            indent,
997            content_col: indent + j + 2,
998            kind: Marker::Ordered(style, delim),
999        })
1000    } else {
1001        None
1002    }
1003}
1004
/// Parses the list that begins at `lines[start]`.
///
/// Returns the list block together with the number of lines consumed, counted from `start`; blank
/// lines skipped after the last item are included in that count. When `lines[start]` does not open
/// a list item the result is `(None, 1)`.
///
/// Items are the lines at the first marker's indentation whose marker is of the same series
/// (bullet or ordered). More deeply indented lines and blank lines are gathered into the preceding
/// item. The list is loose when a blank line is followed by a further item or by a continuation
/// line; otherwise it is tight and its items' paragraphs are turned into plain blocks. A list whose
/// first item is recognized by `try_definition_list` becomes a definition list.
fn parse_list(
    lines: &[&str],
    start: usize,
    ext: Extensions,
    notes: &BTreeMap<String, Vec<Block>>,
    ids: &mut IdRegistry,
    meta: &mut BTreeMap<Text, MetaValue>,
) -> (Option<Block>, usize) {
    let Some(first) = list_marker(lines.get(start).copied().unwrap_or("")) else {
        return (None, 1);
    };
    let base_indent = first.indent;
    let first_kind = first.kind;

    // Each item is its own line buffer, with the marker stripped from the first line.
    let mut items: Vec<Vec<&str>> = Vec::new();
    let mut loose = false;
    let mut i = start;
    // Set when a blank line was seen and nothing has followed it yet.
    let mut pending_blank = false;

    while let Some(&line) = lines.get(i) {
        if line.trim().is_empty() {
            pending_blank = true;
            i += 1;
            continue;
        }
        if let Some(marker) = list_marker(line)
            && marker.indent == base_indent
            && same_series(first_kind, marker.kind)
        {
            // A blank line between two items makes the list loose.
            if pending_blank && !items.is_empty() {
                loose = true;
            }
            pending_blank = false;
            let content_col = marker.content_col;
            let mut item_lines = vec![line.get(content_col..).unwrap_or("")];
            i += 1;
            // Gather continuation lines belonging to this item.
            while let Some(&next) = lines.get(i) {
                if next.trim().is_empty() {
                    pending_blank = true;
                    item_lines.push("");
                    i += 1;
                    continue;
                }
                let next_indent = next.len() - next.trim_start().len();
                let is_sibling = list_marker(next).is_some_and(|m| m.indent == base_indent);
                if next_indent > base_indent && !is_sibling {
                    // A blank line inside an item's body also makes the list loose.
                    if pending_blank {
                        loose = true;
                    }
                    pending_blank = false;
                    item_lines.push(dedent_line(next, content_col));
                    i += 1;
                } else {
                    break;
                }
            }
            // Trim a trailing blank kept inside the item.
            while item_lines.last() == Some(&"") {
                item_lines.pop();
            }
            items.push(item_lines);
            continue;
        }
        // Any other non-blank line ends the list.
        break;
    }

    if items.is_empty() {
        return (None, 1);
    }

    // Definition list when the first item carries a `::` separator.
    if let Some(defs) = try_definition_list(&items, ext, notes, ids, meta, loose) {
        return (Some(defs), i - start);
    }

    let item_blocks: Vec<Vec<Block>> = items
        .iter()
        .map(|item| {
            let blocks = parse_list_item(item, ext, notes, ids, meta);
            if loose { blocks } else { tighten(blocks) }
        })
        .collect();

    let block = match first_kind {
        Marker::Bullet => Block::BulletList(item_blocks),
        Marker::Ordered(style, delim) => {
            // The marker's own style and delimiter are kept only under `fancy_lists`.
            let (style, delim) = if ext.contains(Extension::FancyLists) {
                (style, delim)
            } else {
                (ListNumberStyle::DefaultStyle, ListNumberDelim::DefaultDelim)
            };
            Block::OrderedList(
                ListAttributes {
                    // The numbering always starts at 1, whatever number the first marker shows.
                    start: 1,
                    style,
                    delim,
                },
                item_blocks,
            )
        }
    };
    (Some(block), i - start)
}
1109
1110/// Whether two markers belong to the same list (both bullets, or both ordered).
1111fn same_series(a: Marker, b: Marker) -> bool {
1112    matches!(
1113        (a, b),
1114        (Marker::Bullet, Marker::Bullet) | (Marker::Ordered(..), Marker::Ordered(..))
1115    )
1116}
1117
1118fn parse_list_item(
1119    item: &[&str],
1120    ext: Extensions,
1121    notes: &BTreeMap<String, Vec<Block>>,
1122    ids: &mut IdRegistry,
1123    meta: &mut BTreeMap<Text, MetaValue>,
1124) -> Vec<Block> {
1125    let mut lines = item.to_vec();
1126    let mut checkbox = None;
1127    if ext.contains(Extension::TaskLists)
1128        && let Some(first) = lines.first_mut()
1129        && let Some((glyph, rest)) = strip_checkbox(first)
1130    {
1131        checkbox = Some(glyph);
1132        *first = rest;
1133    }
1134    let mut blocks = parse_blocks(&lines, ext, notes, ids, meta);
1135    if let Some(glyph) = checkbox {
1136        prepend_checkbox(&mut blocks, glyph);
1137    }
1138    blocks
1139}
1140
/// Splits a leading `[ ]`/`[X]`/`[-]` checkbox off a list item's first line.
///
/// Returns the ballot glyph (`☐` for `[ ]` and `[-]`, `☒` for `[X]`) and the text after the
/// checkbox and its separating space. The checkbox must be followed by a space or end the line;
/// otherwise the line is not a task item and `None` is returned.
fn strip_checkbox(line: &str) -> Option<(&'static str, &str)> {
    // Every checkbox token is three ASCII bytes, so a non-boundary split cannot be a checkbox.
    let glyph = match line.get(..3)? {
        "[ ]" | "[-]" => "\u{2610}",
        "[X]" => "\u{2612}",
        _ => return None,
    };
    let rest = line.get(3..)?;
    if rest.is_empty() {
        Some((glyph, rest))
    } else {
        rest.strip_prefix(' ').map(|after| (glyph, after))
    }
}
1160
1161/// Prepends a checkbox glyph to a list item's first inline-bearing block, or introduces a plain block
1162/// when the item has no content.
1163fn prepend_checkbox(blocks: &mut Vec<Block>, glyph: &str) {
1164    match blocks.first_mut() {
1165        Some(Block::Plain(inlines) | Block::Para(inlines)) => {
1166            inlines.splice(0..0, [Inline::Str(glyph.into()), Inline::Space]);
1167        }
1168        _ => blocks.insert(0, Block::Plain(vec![Inline::Str(glyph.into())])),
1169    }
1170}
1171
1172/// Converts leading paragraphs to plain blocks for a tight list.
1173fn tighten(blocks: Vec<Block>) -> Vec<Block> {
1174    blocks
1175        .into_iter()
1176        .map(|b| match b {
1177            Block::Para(inlines) => Block::Plain(inlines),
1178            other => other,
1179        })
1180        .collect()
1181}
1182
1183fn try_definition_list(
1184    items: &[Vec<&str>],
1185    ext: Extensions,
1186    notes: &BTreeMap<String, Vec<Block>>,
1187    ids: &mut IdRegistry,
1188    meta: &mut BTreeMap<Text, MetaValue>,
1189    loose: bool,
1190) -> Option<Block> {
1191    let first = items.first()?;
1192    split_definition(first.first().copied().unwrap_or(""))?;
1193    let mut entries = Vec::new();
1194    for item in items {
1195        let head = item.first().copied().unwrap_or("");
1196        let (term_text, def_first) = match split_definition(head) {
1197            Some(pair) => pair,
1198            None => (head, ""),
1199        };
1200        let term = parse_inlines(term_text.trim(), ext, notes);
1201        let mut def_lines = vec![def_first];
1202        def_lines.extend(item.get(1..).unwrap_or(&[]).iter().copied());
1203        let blocks = parse_blocks(&def_lines, ext, notes, ids, meta);
1204        let blocks = if loose { blocks } else { tighten(blocks) };
1205        entries.push((term, vec![blocks]));
1206    }
1207    Some(Block::DefinitionList(entries))
1208}
1209
/// Splits a definition-list item head into its term and the start of its definition.
///
/// The separator is `::` preceded by a space and followed by a space or the end of the line, so
/// both `term :: definition` and a bare `term ::` (definition on the following lines) are
/// recognized. The split happens at the first ` :: `; a trailing ` ::` is only considered when no
/// such separator exists, and yields an empty definition start. Returns `None` when the head has
/// no separator.
fn split_definition(line: &str) -> Option<(&str, &str)> {
    if let Some(pair) = line.split_once(" :: ") {
        return Some(pair);
    }
    // The marker may end the line, leaving the definition to the continuation lines.
    line.strip_suffix(" ::").map(|term| (term, ""))
}
1216
/// Removes up to `col` bytes of leading whitespace from a continuation line, borrowing the
/// remaining slice.
///
/// Only whole whitespace characters are removed: a multi-byte whitespace character that would
/// straddle the `col` limit is kept, so the cut always falls on a character boundary and the
/// line's content is never lost. For ASCII indentation this removes `min(indent, col)` characters.
fn dedent_line(line: &str, col: usize) -> &str {
    let mut drop = 0;
    for ch in line.chars() {
        let next = drop + ch.len_utf8();
        if !ch.is_whitespace() || next > col {
            break;
        }
        drop = next;
    }
    // `drop` is a sum of whole character widths, so the slice always exists.
    line.get(drop..).unwrap_or(line)
}
1223
1224// -- Inline parsing ----------------------------------------------------------------------------
1225
1226fn parse_inlines(text: &str, ext: Extensions, notes: &BTreeMap<String, Vec<Block>>) -> Vec<Inline> {
1227    let chars: Vec<char> = text.chars().collect();
1228    let mut scanner = Inlines {
1229        chars: &chars,
1230        ext,
1231        notes,
1232        out: Vec::new(),
1233        word: String::new(),
1234    };
1235    scanner.run();
1236    scanner.finish()
1237}
1238
/// Scanner state for turning one text fragment into inlines.
struct Inlines<'a> {
    /// The fragment's characters; every scan position indexes into this slice.
    chars: &'a [char],
    /// The enabled reader extensions.
    ext: Extensions,
    /// Footnote definitions keyed by label, used to resolve `[fn:label]` references.
    notes: &'a BTreeMap<String, Vec<Block>>,
    /// The inlines emitted so far.
    out: Vec<Inline>,
    /// Literal text accumulated since the last emitted inline; flushed as a single `Str`.
    word: String,
}
1246
1247impl Inlines<'_> {
1248    fn finish(mut self) -> Vec<Inline> {
1249        self.flush();
1250        self.out
1251    }
1252
1253    fn flush(&mut self) {
1254        if !self.word.is_empty() {
1255            self.out.push(Inline::Str(mem::take(&mut self.word).into()));
1256        }
1257    }
1258
    /// Appends `inline` to the output, first flushing the pending word so that text order is kept.
    fn push_inline(&mut self, inline: Inline) {
        self.flush();
        self.out.push(inline);
    }
1263
1264    fn at(&self, i: usize) -> Option<char> {
1265        self.chars.get(i).copied()
1266    }
1267
    /// Scans the whole fragment, appending inlines to `self.out` and literal text to `self.word`.
    ///
    /// At each position a bare URL is tried first (only at a URL boundary); otherwise the current
    /// character selects a construct-specific scanner. Whenever a scanner declines, the character
    /// is kept as literal text and the scan advances by one, so the loop always makes progress.
    // The function is one flat dispatch over the current character, hence the line count.
    #[allow(clippy::too_many_lines)]
    fn run(&mut self) {
        let mut i = 0;
        while let Some(c) = self.at(i) {
            // The character before the current one, used by the border rules of several scanners.
            let prev = if i == 0 { None } else { self.at(i - 1) };

            // Bare autolink at a word boundary.
            if is_url_boundary(prev)
                && let Some((url, end)) = self.scan_bare_url(i)
            {
                self.push_inline(link(&url, vec![Inline::Str(url.clone().into())]));
                i = end;
                continue;
            }

            match c {
                ' ' | '\t' => {
                    // A run of spaces and tabs collapses into a single `Space`.
                    self.flush();
                    while matches!(self.at(i), Some(' ' | '\t')) {
                        i += 1;
                    }
                    self.out.push(Inline::Space);
                }
                '\n' => {
                    self.flush();
                    self.out.push(Inline::SoftBreak);
                    i += 1;
                }
                '\\' => i = self.scan_backslash(i),
                '*' | '/' | '+' => {
                    // The emphasis body is re-scanned, so markup may nest.
                    if let Some(end) = self.scan_emphasis(i, c, prev) {
                        let inner = self.chars.get(i + 1..end).unwrap_or(&[]);
                        let content = parse_inlines(&collect_str(inner), self.ext, self.notes);
                        self.push_inline(wrap_markup(c, content));
                        i = end + 1;
                    } else {
                        self.word.push(c);
                        i += 1;
                    }
                }
                '_' => {
                    // Underline is tried before subscript.
                    if let Some(end) = self.scan_emphasis(i, '_', prev) {
                        let inner = self.chars.get(i + 1..end).unwrap_or(&[]);
                        let content = parse_inlines(&collect_str(inner), self.ext, self.notes);
                        self.push_inline(Inline::Underline(content));
                        i = end + 1;
                    } else if let Some((inline, end)) = self.scan_subsup(i, prev, false) {
                        self.push_inline(inline);
                        i = end;
                    } else {
                        self.word.push('_');
                        i += 1;
                    }
                }
                '^' => {
                    if let Some((inline, end)) = self.scan_subsup(i, prev, true) {
                        self.push_inline(inline);
                        i = end;
                    } else {
                        self.word.push('^');
                        i += 1;
                    }
                }
                '=' | '~' => {
                    // Verbatim uses the same border rules as markup emphasis but takes its body
                    // literally.
                    if let Some(end) = self.scan_emphasis(i, c, prev) {
                        let inner = self.chars.get(i + 1..end).unwrap_or(&[]);
                        self.push_inline(verbatim_code(c, inner));
                        i = end + 1;
                    } else {
                        self.word.push(c);
                        i += 1;
                    }
                }
                '[' => {
                    if let Some((inline, end)) = self.scan_bracket(i) {
                        self.push_inline(inline);
                        i = end;
                    } else {
                        self.word.push('[');
                        i += 1;
                    }
                }
                '<' => {
                    if let Some((inline, end)) = self.scan_angle(i) {
                        self.push_inline(inline);
                        i = end;
                    } else {
                        self.word.push('<');
                        i += 1;
                    }
                }
                '$' => {
                    if let Some((inline, end)) = self.scan_math_dollar(i, prev) {
                        self.push_inline(inline);
                        i = end;
                    } else {
                        self.word.push('$');
                        i += 1;
                    }
                }
                '@' => {
                    if let Some((inline, end)) = self.scan_export(i) {
                        self.push_inline(inline);
                        i = end;
                    } else {
                        self.word.push('@');
                        i += 1;
                    }
                }
                '-' | '.' => {
                    // The typographic dash and ellipsis replacements are always active.
                    if let Some((text, end)) = self.scan_special_string(i) {
                        self.word.push_str(text);
                        i = end;
                    } else {
                        self.word.push(c);
                        i += 1;
                    }
                }
                '\'' if self.ext.contains(Extension::Smart)
                    && prev.is_some_and(char::is_alphanumeric) =>
                {
                    // A word-internal or trailing apostrophe becomes a right single quotation mark.
                    self.word.push('\u{2019}');
                    i += 1;
                }
                // Remaining quote characters under `smart`; the apostrophe arm above goes first.
                '"' | '\'' if self.ext.contains(Extension::Smart) => {
                    let (inline, end) = self.scan_quote(i, c);
                    if let Some(q) = inline {
                        self.push_inline(q);
                        i = end;
                    } else {
                        self.word.push(c);
                        i += 1;
                    }
                }
                _ => {
                    self.word.push(c);
                    i += 1;
                }
            }
        }
    }
1413
1414    // -- Emphasis ------------------------------------------------------------------------------
1415
1416    /// Finds the closing marker for markup emphasis opened at `i`, honoring the pre/post border
1417    /// rules and the single-newline body limit.
1418    fn scan_emphasis(&self, i: usize, marker: char, prev: Option<char>) -> Option<usize> {
1419        if !pre_ok(prev) {
1420            return None;
1421        }
1422        let first = self.at(i + 1)?;
1423        if first.is_whitespace() {
1424            return None;
1425        }
1426        let mut newlines = 0;
1427        let mut j = i + 1;
1428        while let Some(c) = self.at(j) {
1429            if c == '\n' {
1430                newlines += 1;
1431                if newlines > 1 {
1432                    return None;
1433                }
1434            }
1435            if c == marker
1436                && j > i + 1
1437                && !self.at(j - 1).is_some_and(char::is_whitespace)
1438                && post_ok(self.at(j + 1))
1439            {
1440                return Some(j);
1441            }
1442            j += 1;
1443        }
1444        None
1445    }
1446
1447    // -- Sub/superscript -----------------------------------------------------------------------
1448
1449    /// Parses a subscript (`_`) or superscript (`^`) at `i`. Requires a preceding non-space base and
1450    /// accepts either a `{…}` group or a bare token ending in an alphanumeric.
1451    fn scan_subsup(&self, i: usize, prev: Option<char>, sup: bool) -> Option<(Inline, usize)> {
1452        // The base must be a non-space character, and never an underscore: a run like `a__b` is a
1453        // literal double underscore, not a subscript.
1454        if prev.is_none_or(|c| c.is_whitespace() || c == '_') {
1455            return None;
1456        }
1457        let content;
1458        let end;
1459        if self.at(i + 1) == Some('{') {
1460            let close = self.match_brace(i + 1)?;
1461            let inner = self.chars.get(i + 2..close).unwrap_or(&[]);
1462            content = parse_inlines(&collect_str(inner), self.ext, self.notes);
1463            end = close + 1;
1464        } else {
1465            let (text, stop) = self.scan_bare_script(i + 1)?;
1466            content = vec![Inline::Str(text.into())];
1467            end = stop;
1468        }
1469        let inline = if sup {
1470            Inline::Superscript(content)
1471        } else {
1472            Inline::Subscript(content)
1473        };
1474        Some((inline, end))
1475    }
1476
1477    /// Scans a bare sub/superscript token: an optional sign then alphanumerics, dots, and commas,
1478    /// which must end in an alphanumeric.
1479    fn scan_bare_script(&self, start: usize) -> Option<(String, usize)> {
1480        let mut j = start;
1481        if matches!(self.at(j), Some('-' | '+')) {
1482            j += 1;
1483        }
1484        let body_start = j;
1485        while matches!(self.at(j), Some(c) if c.is_alphanumeric() || matches!(c, '.' | ',' | '\\'))
1486        {
1487            j += 1;
1488        }
1489        // Trim trailing non-alphanumerics; require at least one alphanumeric in the body.
1490        let mut last = j;
1491        while last > body_start && !self.at(last - 1).is_some_and(char::is_alphanumeric) {
1492            last -= 1;
1493        }
1494        if last <= body_start {
1495            return None;
1496        }
1497        let text: String = self.chars.get(start..last).unwrap_or(&[]).iter().collect();
1498        Some((text, last))
1499    }
1500
1501    fn match_brace(&self, open: usize) -> Option<usize> {
1502        let mut depth = 0usize;
1503        let mut j = open;
1504        while let Some(c) = self.at(j) {
1505            match c {
1506                '{' => depth += 1,
1507                '}' => {
1508                    depth -= 1;
1509                    if depth == 0 {
1510                        return Some(j);
1511                    }
1512                }
1513                '\n' => return None,
1514                _ => {}
1515            }
1516            j += 1;
1517        }
1518        None
1519    }
1520
1521    // -- Backslash: line break, math, entities -------------------------------------------------
1522
1523    fn scan_backslash(&mut self, i: usize) -> usize {
1524        match self.at(i + 1) {
1525            Some('\\') => {
1526                // Line break: consume both backslashes, trailing spaces, and one newline.
1527                self.push_inline(Inline::LineBreak);
1528                let mut j = i + 2;
1529                while matches!(self.at(j), Some(' ' | '\t')) {
1530                    j += 1;
1531                }
1532                if self.at(j) == Some('\n') {
1533                    j += 1;
1534                }
1535                j
1536            }
1537            Some('(') => self.scan_tex_math(i + 2, "\\)", MathType::InlineMath, i),
1538            Some('[') => self.scan_tex_math(i + 2, "\\]", MathType::DisplayMath, i),
1539            Some(c) if c.is_ascii_alphabetic() => self.scan_entity(i),
1540            _ => {
1541                self.word.push('\\');
1542                i + 1
1543            }
1544        }
1545    }
1546
1547    fn scan_tex_math(
1548        &mut self,
1549        start: usize,
1550        close: &str,
1551        kind: MathType,
1552        fallback: usize,
1553    ) -> usize {
1554        let closing: Vec<char> = close.chars().collect();
1555        let mut j = start;
1556        while j < self.chars.len() {
1557            if self.matches_at(j, &closing) {
1558                let inner: String = self.chars.get(start..j).unwrap_or(&[]).iter().collect();
1559                self.push_inline(Inline::Math(kind, inner.into()));
1560                return j + closing.len();
1561            }
1562            j += 1;
1563        }
1564        // Unterminated: emit the opening delimiter literally.
1565        self.word.push('\\');
1566        fallback + 1
1567    }
1568
1569    fn scan_entity(&mut self, i: usize) -> usize {
1570        let mut j = i + 1;
1571        while matches!(self.at(j), Some(c) if c.is_ascii_alphabetic()) {
1572            j += 1;
1573        }
1574        let name: String = self.chars.get(i + 1..j).unwrap_or(&[]).iter().collect();
1575        // An optional `{}` terminates the entity and is consumed.
1576        let mut end = j;
1577        if self.at(j) == Some('{') && self.at(j + 1) == Some('}') {
1578            end = j + 2;
1579        }
1580        if let Some(replacement) = entity(&name) {
1581            self.word.push_str(replacement);
1582        } else {
1583            self.push_inline(Inline::RawInline(
1584                Format("latex".into()),
1585                format!("\\{name}").into(),
1586            ));
1587        }
1588        end
1589    }
1590
1591    // -- Brackets: links, footnotes, citations -------------------------------------------------
1592
1593    fn scan_bracket(&self, i: usize) -> Option<(Inline, usize)> {
1594        if self.at(i + 1) == Some('[') {
1595            return self.scan_link(i);
1596        }
1597        if self.matches_at(i + 1, &['f', 'n', ':']) {
1598            return self.scan_footnote(i);
1599        }
1600        if self.ext.contains(Extension::Citations)
1601            && (self.matches_at(i + 1, &['c', 'i', 't', 'e', ':'])
1602                || self.matches_at(i + 1, &['c', 'i', 't', 'e', '/']))
1603        {
1604            return self.scan_citation(i);
1605        }
1606        None
1607    }
1608
1609    fn scan_link(&self, i: usize) -> Option<(Inline, usize)> {
1610        // `[[` … `]]`, with an optional `][description]`.
1611        let inner_start = i + 2;
1612        let close = self.find_double_close(inner_start)?;
1613        let inner: String = self
1614            .chars
1615            .get(inner_start..close)
1616            .unwrap_or(&[])
1617            .iter()
1618            .collect();
1619        let (target_raw, desc_raw) = match inner.find("][") {
1620            Some(idx) => (
1621                inner.get(..idx).unwrap_or(""),
1622                Some(inner.get(idx + 2..).unwrap_or("")),
1623            ),
1624            None => (inner.as_str(), None),
1625        };
1626        let target = process_target(target_raw);
1627        let end = close + 2;
1628        match desc_raw {
1629            Some(desc) => Some((
1630                link(&target, parse_inlines(desc, self.ext, self.notes)),
1631                end,
1632            )),
1633            None => {
1634                if is_image_target(&target) {
1635                    Some((image(&target, Vec::new()), end))
1636                } else {
1637                    Some((link(&target, vec![Inline::Str(target_raw.into())]), end))
1638                }
1639            }
1640        }
1641    }
1642
1643    /// Finds a `]]` starting at or after `from`.
1644    fn find_double_close(&self, from: usize) -> Option<usize> {
1645        let mut j = from;
1646        while j + 1 < self.chars.len() {
1647            if self.at(j) == Some(']') && self.at(j + 1) == Some(']') {
1648                return Some(j);
1649            }
1650            j += 1;
1651        }
1652        None
1653    }
1654
1655    fn scan_footnote(&self, i: usize) -> Option<(Inline, usize)> {
1656        // `[fn:label]`, `[fn:label:text]`, or `[fn::text]`.
1657        let close = self.match_bracket(i)?;
1658        let inner: String = self.chars.get(i + 1..close).unwrap_or(&[]).iter().collect();
1659        let body = inner.strip_prefix("fn:")?;
1660        let end = close + 1;
1661        if let Some((label, text)) = body.split_once(':') {
1662            // Inline definition (named or anonymous).
1663            let note = vec![Block::Para(parse_inlines(
1664                text.trim(),
1665                self.ext,
1666                self.notes,
1667            ))];
1668            let _ = label;
1669            return Some((Inline::Note(note), end));
1670        }
1671        // Bare reference: resolve against the gathered definitions.
1672        let blocks = self.notes.get(body).cloned().unwrap_or_default();
1673        Some((Inline::Note(blocks), end))
1674    }
1675
1676    fn scan_citation(&self, i: usize) -> Option<(Inline, usize)> {
1677        let close = self.match_bracket(i)?;
1678        let inner: String = self.chars.get(i + 1..close).unwrap_or(&[]).iter().collect();
1679        let raw: String = self.chars.get(i..close + 1).unwrap_or(&[]).iter().collect();
1680        let rest = inner.strip_prefix("cite")?;
1681        let (style, payload) = match rest.strip_prefix('/') {
1682            Some(after) => {
1683                let (sty, pay) = after.split_once(':')?;
1684                (Some(sty), pay)
1685            }
1686            None => (None, rest.strip_prefix(':')?),
1687        };
1688        let citations = parse_citation_items(payload, style, self.ext, self.notes)?;
1689        Some((Inline::Cite(citations, plain_words(&raw)), close + 1))
1690    }
1691
1692    fn match_bracket(&self, open: usize) -> Option<usize> {
1693        let mut depth = 0usize;
1694        let mut j = open;
1695        while let Some(c) = self.at(j) {
1696            match c {
1697                '[' => depth += 1,
1698                ']' => {
1699                    depth -= 1;
1700                    if depth == 0 {
1701                        return Some(j);
1702                    }
1703                }
1704                _ => {}
1705            }
1706            j += 1;
1707        }
1708        None
1709    }
1710
1711    // -- Angle brackets: targets and autolinks -------------------------------------------------
1712
1713    fn scan_angle(&self, i: usize) -> Option<(Inline, usize)> {
1714        if self.at(i + 1) == Some('<') {
1715            // Target `<<name>>`.
1716            let name_start = i + 2;
1717            let mut j = name_start;
1718            while matches!(self.at(j), Some(c) if c != '<' && c != '>' && c != '\n') {
1719                j += 1;
1720            }
1721            if j > name_start && self.at(j) == Some('>') && self.at(j + 1) == Some('>') {
1722                let name: String = self
1723                    .chars
1724                    .get(name_start..j)
1725                    .unwrap_or(&[])
1726                    .iter()
1727                    .collect();
1728                let attr = Attr {
1729                    id: name.into(),
1730                    ..Attr::default()
1731                };
1732                // A target absorbs the whitespace that follows it.
1733                let mut end = j + 2;
1734                while matches!(self.at(end), Some(' ' | '\t')) {
1735                    end += 1;
1736                }
1737                return Some((Inline::Span(Box::new(attr), Vec::new()), end));
1738            }
1739            return None;
1740        }
1741        // Autolink `<uri>`.
1742        let mut j = i + 1;
1743        while matches!(self.at(j), Some(c) if c != '>' && c != '\n') {
1744            j += 1;
1745        }
1746        if self.at(j) != Some('>') {
1747            return None;
1748        }
1749        let content: String = self.chars.get(i + 1..j).unwrap_or(&[]).iter().collect();
1750        if is_uri(&content) {
1751            return Some((
1752                link(&content, vec![Inline::Str(content.clone().into())]),
1753                j + 1,
1754            ));
1755        }
1756        None
1757    }
1758
1759    // -- Dollar math ---------------------------------------------------------------------------
1760
1761    fn scan_math_dollar(&self, i: usize, prev: Option<char>) -> Option<(Inline, usize)> {
1762        if self.at(i + 1) == Some('$') {
1763            // Display `$$…$$`.
1764            let start = i + 2;
1765            let mut j = start;
1766            while j + 1 < self.chars.len() {
1767                if self.at(j) == Some('$') && self.at(j + 1) == Some('$') {
1768                    let inner: String = self.chars.get(start..j).unwrap_or(&[]).iter().collect();
1769                    return Some((Inline::Math(MathType::DisplayMath, inner.into()), j + 2));
1770                }
1771                j += 1;
1772            }
1773            return None;
1774        }
1775        // Inline `$…$` with word-boundary and border constraints.
1776        if prev.is_some_and(|c| c.is_alphanumeric() || c == '$') {
1777            return None;
1778        }
1779        let first = self.at(i + 1)?;
1780        if first.is_whitespace() || first == '$' {
1781            return None;
1782        }
1783        let mut j = i + 1;
1784        while let Some(c) = self.at(j) {
1785            if c == '\n' {
1786                return None;
1787            }
1788            if c == '$'
1789                && !self.at(j - 1).is_some_and(char::is_whitespace)
1790                && !self.at(j + 1).is_some_and(char::is_alphanumeric)
1791            {
1792                let inner: String = self.chars.get(i + 1..j).unwrap_or(&[]).iter().collect();
1793                return Some((Inline::Math(MathType::InlineMath, inner.into()), j + 1));
1794            }
1795            j += 1;
1796        }
1797        None
1798    }
1799
1800    // -- Inline export ---------------------------------------------------------------------------
1801
1802    fn scan_export(&self, i: usize) -> Option<(Inline, usize)> {
1803        // `@@format:content@@`.
1804        if self.at(i + 1) != Some('@') {
1805            return None;
1806        }
1807        let fmt_start = i + 2;
1808        let mut j = fmt_start;
1809        while matches!(self.at(j), Some(c) if c.is_ascii_alphanumeric() || c == '-') {
1810            j += 1;
1811        }
1812        if self.at(j) != Some(':') || j == fmt_start {
1813            return None;
1814        }
1815        let fmt: String = self.chars.get(fmt_start..j).unwrap_or(&[]).iter().collect();
1816        let content_start = j + 1;
1817        let mut k = content_start;
1818        while k + 1 < self.chars.len() {
1819            if self.at(k) == Some('@') && self.at(k + 1) == Some('@') {
1820                let content: String = self
1821                    .chars
1822                    .get(content_start..k)
1823                    .unwrap_or(&[])
1824                    .iter()
1825                    .collect();
1826                return Some((Inline::RawInline(Format(fmt.into()), content.into()), k + 2));
1827            }
1828            k += 1;
1829        }
1830        None
1831    }
1832
1833    // -- Smart quotes --------------------------------------------------------------------------
1834
1835    fn scan_quote(&self, i: usize, quote: char) -> (Option<Inline>, usize) {
1836        let (kind, close) = if quote == '"' {
1837            (QuoteType::DoubleQuote, '"')
1838        } else {
1839            (QuoteType::SingleQuote, '\'')
1840        };
1841        // The opening quote must be followed immediately by a non-space character.
1842        if !matches!(self.at(i + 1), Some(c) if !c.is_whitespace()) {
1843            return (None, i + 1);
1844        }
1845        let mut j = i + 1;
1846        while let Some(c) = self.at(j) {
1847            if c == close
1848                && !self.at(j - 1).is_some_and(char::is_whitespace)
1849                && post_ok(self.at(j + 1))
1850            {
1851                let inner = self.chars.get(i + 1..j).unwrap_or(&[]);
1852                let content = parse_inlines(&collect_str(inner), self.ext, self.notes);
1853                return (Some(Inline::Quoted(kind, content)), j + 1);
1854            }
1855            if c == '\n' {
1856                break;
1857            }
1858            j += 1;
1859        }
1860        (None, i + 1)
1861    }
1862
1863    // -- Special strings -----------------------------------------------------------------------
1864
1865    /// Replaces `---`/`--` with em/en dashes and `...` with an ellipsis.
1866    fn scan_special_string(&self, i: usize) -> Option<(&'static str, usize)> {
1867        if self.at(i) == Some('.') {
1868            if self.at(i + 1) == Some('.') && self.at(i + 2) == Some('.') {
1869                return Some(("\u{2026}", i + 3));
1870            }
1871            return None;
1872        }
1873        // Dash sequences.
1874        if self.at(i + 1) == Some('-') {
1875            if self.at(i + 2) == Some('-') {
1876                return Some(("\u{2014}", i + 3));
1877            }
1878            return Some(("\u{2013}", i + 2));
1879        }
1880        None
1881    }
1882
1883    // -- Bare autolinks ------------------------------------------------------------------------
1884
1885    fn scan_bare_url(&self, i: usize) -> Option<(String, usize)> {
1886        const SCHEMES: [&str; 3] = ["https://", "http://", "ftp://"];
1887        let scheme = SCHEMES.iter().find(|s| self.matches_str(i, s)).copied()?;
1888        let mut j = i + scheme.chars().count();
1889        while matches!(self.at(j), Some(c) if !c.is_whitespace() && !matches!(c, '<' | '>' | '(' | ')' | '[' | ']'))
1890        {
1891            j += 1;
1892        }
1893        // Trim trailing sentence punctuation.
1894        while j > i + scheme.chars().count()
1895            && self
1896                .at(j - 1)
1897                .is_some_and(|c| matches!(c, '.' | ',' | ';' | ':' | '!' | '?' | '\'' | '"'))
1898        {
1899            j -= 1;
1900        }
1901        let url: String = self.chars.get(i..j).unwrap_or(&[]).iter().collect();
1902        Some((url, j))
1903    }
1904
1905    // -- Low-level matching --------------------------------------------------------------------
1906
1907    fn matches_at(&self, i: usize, pat: &[char]) -> bool {
1908        pat.iter()
1909            .enumerate()
1910            .all(|(k, &c)| self.at(i + k) == Some(c))
1911    }
1912
1913    fn matches_str(&self, i: usize, pat: &str) -> bool {
1914        pat.chars()
1915            .enumerate()
1916            .all(|(k, c)| self.at(i + k) == Some(c))
1917    }
1918}
1919
1920// -- Inline helpers ----------------------------------------------------------------------------
1921
/// Builds an owned `String` from a slice of characters.
fn collect_str(chars: &[char]) -> String {
    let mut text = String::with_capacity(chars.len());
    text.extend(chars);
    text
}
1925
1926/// Tokenizes text into `Str` words separated by `Space`, used for the literal fallback rendering of a
1927/// citation.
1928fn plain_words(text: &str) -> Vec<Inline> {
1929    let mut out = Vec::new();
1930    for word in text.split_whitespace() {
1931        if !out.is_empty() {
1932            out.push(Inline::Space);
1933        }
1934        out.push(Inline::Str(word.into()));
1935    }
1936    out
1937}
1938
1939fn wrap_markup(marker: char, content: Vec<Inline>) -> Inline {
1940    match marker {
1941        '*' => Inline::Strong(content),
1942        '+' => Inline::Strikeout(content),
1943        // The only other marker routed here is `/`.
1944        _ => Inline::Emph(content),
1945    }
1946}
1947
1948fn verbatim_code(marker: char, inner: &[char]) -> Inline {
1949    // A newline inside verbatim collapses to a space.
1950    let text: String = inner
1951        .iter()
1952        .map(|&c| if c == '\n' { ' ' } else { c })
1953        .collect();
1954    let attr = if marker == '=' {
1955        Attr {
1956            classes: vec!["verbatim".into()],
1957            ..Attr::default()
1958        }
1959    } else {
1960        Attr::default()
1961    };
1962    Inline::Code(Box::new(attr), text.into())
1963}
1964
1965fn link(target: &str, desc: Vec<Inline>) -> Inline {
1966    Inline::Link(
1967        Box::default(),
1968        desc,
1969        Box::new(carta_ast::Target {
1970            url: target.into(),
1971            title: carta_ast::Text::default(),
1972        }),
1973    )
1974}
1975
1976fn image(target: &str, alt: Vec<Inline>) -> Inline {
1977    Inline::Image(
1978        Box::default(),
1979        alt,
1980        Box::new(carta_ast::Target {
1981            url: target.into(),
1982            title: carta_ast::Text::default(),
1983        }),
1984    )
1985}
1986
/// Normalizes a link target: a leading `file:` is removed, anything else is returned unchanged.
fn process_target(raw: &str) -> String {
    raw.strip_prefix("file:").unwrap_or(raw).to_owned()
}
1994
/// Whether a link target ends in a known image file extension, compared ASCII
/// case-insensitively.
fn is_image_target(target: &str) -> bool {
    const EXTS: [&str; 8] = [
        ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", ".bmp", ".tiff",
    ];
    let lowered = target.to_ascii_lowercase();
    for ext in &EXTS {
        if lowered.ends_with(*ext) {
            return true;
        }
    }
    false
}
2002
/// Whether an angle-bracketed string is a URI.
///
/// A string containing whitespace never qualifies. Otherwise it qualifies when it contains
/// `://`, or when the text before its first `:` is a non-empty run of ASCII alphanumerics, `+`,
/// `.`, or `-` and something follows the colon.
fn is_uri(s: &str) -> bool {
    if s.contains(char::is_whitespace) {
        return false;
    }
    if s.contains("://") {
        return true;
    }
    s.split_once(':').is_some_and(|(scheme, rest)| {
        !scheme.is_empty()
            && !rest.is_empty()
            && scheme
                .bytes()
                .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'.' | b'-'))
    })
}
2022
/// Whether a bare URL may start after `prev`: at the very start, or after any character that
/// is not alphanumeric.
fn is_url_boundary(prev: Option<char>) -> bool {
    !prev.is_some_and(char::is_alphanumeric)
}
2029
/// Whether `prev` is an acceptable character before an opening marker: nothing at all,
/// whitespace, or one of `-`, `(`, `{`, `'`, `"`.
fn pre_ok(prev: Option<char>) -> bool {
    const OPENERS: [char; 5] = ['-', '(', '{', '\'', '"'];
    prev.map_or(true, |c| c.is_whitespace() || OPENERS.contains(&c))
}
2036
/// Whether `next` is an acceptable character after a closing marker: nothing at all,
/// whitespace, or one of the listed punctuation and bracket characters.
fn post_ok(next: Option<char>) -> bool {
    const CLOSERS: [char; 12] = [
        '-', '.', ',', ':', '!', '?', ';', '"', '\'', ')', '}', '[',
    ];
    next.map_or(true, |c| c.is_whitespace() || CLOSERS.contains(&c))
}
2049
2050// -- Citations ---------------------------------------------------------------------------------
2051
2052fn parse_citation_items(
2053    payload: &str,
2054    style: Option<&str>,
2055    ext: Extensions,
2056    notes: &BTreeMap<String, Vec<Block>>,
2057) -> Option<Vec<carta_ast::Citation>> {
2058    let mode = citation_mode(style);
2059    let chunks: Vec<&str> = payload.split(';').collect();
2060
2061    let mut prefix_carry: Option<&str> = None;
2062    let mut items: Vec<(String, Vec<Inline>, Vec<Inline>)> = Vec::new();
2063    let mut trailing_suffix: Option<&str> = None;
2064
2065    for chunk in chunks {
2066        match chunk.find('@') {
2067            Some(at) => {
2068                let prefix = chunk.get(..at).unwrap_or("");
2069                let after = chunk.get(at + 1..).unwrap_or("");
2070                let key_end = after
2071                    .find(|c: char| !is_citation_key_char(c))
2072                    .unwrap_or(after.len());
2073                let key = after.get(..key_end).unwrap_or("").to_owned();
2074                let suffix = after.get(key_end..).unwrap_or("");
2075                let mut prefix_text = prefix.to_owned();
2076                if let Some(carry) = prefix_carry.take() {
2077                    prefix_text = format!("{carry};{prefix}");
2078                }
2079                items.push((
2080                    key,
2081                    parse_inlines(prefix_text.trim(), ext, notes),
2082                    parse_inlines(suffix.trim_end(), ext, notes),
2083                ));
2084            }
2085            None => {
2086                if items.is_empty() {
2087                    prefix_carry = Some(chunk);
2088                } else {
2089                    trailing_suffix = Some(chunk);
2090                }
2091            }
2092        }
2093    }
2094
2095    if items.is_empty() {
2096        return None;
2097    }
2098
2099    if let (Some(suffix), Some(last)) = (trailing_suffix, items.last_mut()) {
2100        let mut combined = last.2.clone();
2101        if !combined.is_empty() {
2102            combined.push(Inline::Str(";".into()));
2103        }
2104        combined.extend(parse_inlines(suffix.trim(), ext, notes));
2105        last.2 = combined;
2106    }
2107
2108    let citations = items
2109        .into_iter()
2110        .enumerate()
2111        .map(|(idx, (id, prefix, suffix))| carta_ast::Citation {
2112            id: id.into(),
2113            prefix,
2114            suffix,
2115            mode: if idx == 0 {
2116                mode.clone()
2117            } else {
2118                carta_ast::CitationMode::NormalCitation
2119            },
2120            note_num: 0,
2121            hash: 0,
2122        })
2123        .collect();
2124    Some(citations)
2125}
2126
2127fn citation_mode(style: Option<&str>) -> carta_ast::CitationMode {
2128    match style {
2129        Some("t" | "text" | "author") => carta_ast::CitationMode::AuthorInText,
2130        Some(s)
2131            if s.starts_with("na") || s.starts_with("noauthor") || s.starts_with("suppress") =>
2132        {
2133            carta_ast::CitationMode::SuppressAuthor
2134        }
2135        _ => carta_ast::CitationMode::NormalCitation,
2136    }
2137}
2138
/// Whether `c` may appear in a citation key: an ASCII letter or digit, or one of `_`, `-`, `:`,
/// `.`, `/`.
fn is_citation_key_char(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '-' | ':' | '.' | '/'
    )
}
2142
2143// -- Prefix helpers ----------------------------------------------------------------------------
2144
/// Strips `prefix` from the start of `s`, comparing ASCII case-insensitively.
///
/// Returns the remainder after the prefix, or `None` when `s` is shorter than `prefix`, when
/// the prefix length does not fall on a character boundary of `s`, or when the text differs.
fn strip_prefix_ci<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
    // `get` already yields `None` for a too-short input or a split inside a character.
    let head = s.get(..prefix.len())?;
    let tail = s.get(prefix.len()..)?;
    head.eq_ignore_ascii_case(prefix).then_some(tail)
}
2152
2153fn starts_with_ci(s: &str, prefix: &str) -> bool {
2154    strip_prefix_ci(s, prefix).is_some()
2155}
2156
2157// -- Entity table ------------------------------------------------------------------------------
2158
/// Org entity names paired with their Unicode replacements, consulted by [`entity`].
const ENTITIES: &[(&str, &str)] = &[
    // Lowercase Greek and variants.
    ("alpha", "α"),
    ("beta", "β"),
    ("gamma", "γ"),
    ("delta", "δ"),
    ("epsilon", "ϵ"),
    ("zeta", "ζ"),
    ("eta", "η"),
    ("theta", "θ"),
    ("iota", "ι"),
    ("kappa", "κ"),
    ("lambda", "λ"),
    ("mu", "μ"),
    ("nu", "ν"),
    ("xi", "ξ"),
    ("omicron", "ο"),
    ("pi", "π"),
    ("rho", "ρ"),
    ("sigma", "σ"),
    ("sigmaf", "ς"),
    ("tau", "τ"),
    ("upsilon", "υ"),
    ("phi", "φ"),
    ("chi", "χ"),
    ("psi", "ψ"),
    ("omega", "ω"),
    ("varphi", "ϕ"),
    ("vartheta", "ϑ"),
    ("varpi", "ϖ"),
    // Uppercase Greek.
    ("Alpha", "Α"),
    ("Beta", "Β"),
    ("Gamma", "Γ"),
    ("Delta", "Δ"),
    ("Epsilon", "Ε"),
    ("Zeta", "Ζ"),
    ("Eta", "Η"),
    ("Theta", "Θ"),
    ("Iota", "Ι"),
    ("Kappa", "Κ"),
    ("Lambda", "Λ"),
    ("Mu", "Μ"),
    ("Nu", "Ν"),
    ("Xi", "Ξ"),
    ("Omicron", "Ο"),
    ("Pi", "Π"),
    ("Rho", "Ρ"),
    ("Sigma", "Σ"),
    ("Tau", "Τ"),
    ("Upsilon", "Υ"),
    ("Phi", "Φ"),
    ("Chi", "Χ"),
    ("Psi", "Ψ"),
    ("Omega", "Ω"),
    // Mathematical operators and relations.
    ("pm", "±"),
    ("mp", "∓"),
    ("times", "×"),
    ("div", "÷"),
    ("cdot", "ċ"),
    ("deg", "°"),
    ("prime", "′"),
    ("Prime", "″"),
    ("infin", "∞"),
    ("nabla", "∇"),
    ("partial", "∂"),
    ("forall", "∀"),
    ("exist", "∃"),
    ("empty", "∅"),
    ("isin", "∈"),
    ("notin", "∉"),
    ("ni", "∋"),
    ("sum", "∑"),
    ("prod", "∏"),
    ("minus", "−"),
    ("lowast", "∗"),
    ("radic", "√"),
    ("prop", "∝"),
    ("ang", "∠"),
    ("or", "∨"),
    ("cap", "∩"),
    ("cup", "∪"),
    ("int", "∫"),
    ("there4", "∴"),
    ("sim", "∼"),
    ("cong", "≅"),
    ("asymp", "≈"),
    ("ne", "≠"),
    ("equiv", "≡"),
    ("le", "≤"),
    ("ge", "≥"),
    ("sub", "⊂"),
    ("sup", "⊃"),
    ("sube", "⊆"),
    ("supe", "⊇"),
    ("oplus", "⊕"),
    ("otimes", "⊗"),
    ("perp", "⊥"),
    ("sdot", "⋅"),
    // Arrows.
    ("larr", "←"),
    ("rarr", "→"),
    ("uarr", "↑"),
    ("darr", "↓"),
    ("harr", "↔"),
    ("lArr", "⇐"),
    ("rArr", "⇒"),
    ("uArr", "⇑"),
    ("dArr", "⇓"),
    ("hArr", "⇔"),
    ("Leftarrow", "⇐"),
    ("Rightarrow", "⇒"),
    ("Leftrightarrow", "⇔"),
    // Symbols, currency, and punctuation.
    ("copy", "©"),
    ("reg", "®"),
    ("trade", "™"),
    ("euro", "€"),
    ("cent", "¢"),
    ("pound", "£"),
    ("yen", "¥"),
    ("sect", "§"),
    ("para", "¶"),
    ("middot", "·"),
    ("hellip", "…"),
    ("dots", "…"),
    ("amp", "&"),
    ("lt", "<"),
    ("gt", ">"),
    ("ndash", "–"),
    ("mdash", "—"),
    ("lsquo", "‘"),
    ("rsquo", "’"),
    ("ldquo", "“"),
    ("rdquo", "”"),
    ("laquo", "«"),
    ("raquo", "»"),
    ("nbsp", "\u{a0}"),
    ("shy", "\u{ad}"),
    // Accented Latin letters.
    ("aacute", "á"),
    ("eacute", "é"),
    ("iacute", "í"),
    ("oacute", "ó"),
    ("uacute", "ú"),
    ("auml", "ä"),
    ("euml", "ë"),
    ("iuml", "ï"),
    ("ouml", "ö"),
    ("uuml", "ü"),
    ("ntilde", "ñ"),
    ("ccedil", "ç"),
    ("szlig", "ß"),
    // Miscellaneous.
    ("dagger", "†"),
    ("Dagger", "‡"),
    ("bull", "•"),
    ("permil", "‰"),
    ("frac12", "½"),
    ("frac14", "¼"),
    ("frac34", "¾"),
    ("sup2", "²"),
    ("sup3", "³"),
    ("plusmn", "±"),
];

/// The Unicode replacement for an Org entity name, or `None` when the name is unknown (the caller
/// then passes it through as raw LaTeX). Names are matched exactly, including case.
fn entity(name: &str) -> Option<&'static str> {
    ENTITIES
        .iter()
        .find(|(key, _)| *key == name)
        .map(|&(_, value)| value)
}
2325
#[cfg(test)]
mod tests {
    // Test code: indexing into a block/inline vector produced from a known fixture is the idiomatic
    // assertion, and a wrong index panics the test rather than corrupting shipped output.
    #![allow(clippy::indexing_slicing)]
    use super::*;

    /// Reads `input` with exactly the given extensions enabled.
    fn doc_with(input: &str, exts: &[Extension]) -> Document {
        let mut options = ReaderOptions::default();
        options.extensions = Extensions::from_list(exts);
        OrgReader.read(input, &options).unwrap()
    }

    /// Reads `input` with auto identifiers, citations, and task lists enabled.
    fn doc(input: &str) -> Document {
        doc_with(
            input,
            &[
                Extension::AutoIdentifiers,
                Extension::Citations,
                Extension::TaskLists,
            ],
        )
    }

    /// The top-level blocks of `input`, read with the extension set of [`doc`].
    fn blocks(input: &str) -> Vec<Block> {
        doc(input).blocks
    }

    /// The inlines of a paragraph block; panics when the block is anything else.
    fn para(block: &Block) -> &Vec<Inline> {
        match block {
            Block::Para(inlines) => inlines,
            other => panic!("expected paragraph, got {other:?}"),
        }
    }

    #[test]
    fn paragraph_with_emphasis() {
        let parsed = blocks("Hello *world* /italic/ =verb= ~code~ +strike+.");
        assert_eq!(parsed.len(), 1);
        let inlines = para(&parsed[0]);
        assert!(inlines.contains(&Inline::Strong(vec![Inline::Str("world".into())])));
        assert!(inlines.contains(&Inline::Emph(vec![Inline::Str("italic".into())])));
    }

    #[test]
    fn headline_levels_and_ids() {
        let parsed = blocks("* First\n** Second");
        match &parsed[0] {
            Block::Header(1, attr, _) => assert_eq!(attr.id, "first"),
            other => panic!("expected header, got {other:?}"),
        }
        match &parsed[1] {
            Block::Header(2, attr, _) => assert_eq!(attr.id, "second"),
            other => panic!("expected header, got {other:?}"),
        }
    }

    #[test]
    fn todo_keyword_and_tags() {
        let parsed = blocks("* TODO Task :work:");
        match &parsed[0] {
            Block::Header(1, attr, inlines) => {
                assert_eq!(attr.id, "task");
                assert!(
                    matches!(inlines.first(), Some(Inline::Span(a, _)) if a.classes == ["todo", "TODO"])
                );
            }
            other => panic!("expected header, got {other:?}"),
        }
    }

    #[test]
    fn src_block_becomes_code_block() {
        let parsed = blocks("#+BEGIN_SRC python\nprint(1)\n#+END_SRC");
        match &parsed[0] {
            Block::CodeBlock(attr, text) => {
                assert_eq!(attr.classes, ["python"]);
                assert_eq!(text, "print(1)\n");
            }
            other => panic!("expected code block, got {other:?}"),
        }
    }

    #[test]
    fn bullet_and_ordered_lists() {
        let bullets = blocks("- a\n- b");
        assert!(matches!(bullets.first(), Some(Block::BulletList(items)) if items.len() == 2));

        let ordered = blocks("1. a\n2. b");
        assert!(matches!(ordered.first(), Some(Block::OrderedList(..))));
    }

    #[test]
    fn definition_list() {
        let parsed = blocks("- term :: definition");
        match parsed.first() {
            Some(Block::DefinitionList(entries)) => assert_eq!(entries.len(), 1),
            other => panic!("expected definition list, got {other:?}"),
        }
    }

    #[test]
    fn link_and_image() {
        let parsed = blocks("[[https://example.com][label]] [[./x.png]]");
        let inlines = para(&parsed[0]);
        assert!(inlines.iter().any(|i| matches!(i, Inline::Link(..))));
        assert!(inlines.iter().any(|i| matches!(i, Inline::Image(..))));
    }

    #[test]
    fn footnote_reference_resolves() {
        let parsed = blocks("Text[fn:1] more.\n\n[fn:1] The note.");
        let inlines = para(&parsed[0]);
        assert!(inlines.iter().any(|i| matches!(i, Inline::Note(_))));
    }

    #[test]
    fn table_with_header() {
        let parsed = blocks("| a | b |\n|---+---|\n| 1 | 2 |");
        match parsed.first() {
            Some(Block::Table(table)) => {
                assert_eq!(table.head.rows.len(), 1);
                assert_eq!(table.bodies.len(), 1);
            }
            other => panic!("expected table, got {other:?}"),
        }
    }

    #[test]
    fn metadata_title() {
        let parsed = doc("#+TITLE: My Doc\n\nbody");
        assert!(parsed.meta.contains_key("title"));
    }

    #[test]
    fn subscript_and_superscript() {
        let parsed = blocks("H_2O and x^2");
        let inlines = para(&parsed[0]);
        assert!(inlines.iter().any(|i| matches!(i, Inline::Subscript(_))));
        assert!(inlines.iter().any(|i| matches!(i, Inline::Superscript(_))));
    }

    #[test]
    fn special_strings_dashes() {
        let parsed = blocks("em --- en -- dots ...");
        let text = carta_ast::to_plain_text(para(&parsed[0]));
        assert!(text.contains('\u{2014}'));
        assert!(text.contains('\u{2013}'));
        assert!(text.contains('\u{2026}'));
    }

    #[test]
    fn smart_quotes_and_apostrophe() {
        let parsed = doc_with("He said \"hi\" and it's 'fine'.", &[Extension::Smart]);
        let Block::Para(inlines) = &parsed.blocks[0] else {
            panic!("expected paragraph");
        };
        let double = Inline::Quoted(QuoteType::DoubleQuote, vec![Inline::Str("hi".into())]);
        let single = Inline::Quoted(QuoteType::SingleQuote, vec![Inline::Str("fine".into())]);
        assert!(inlines.contains(&double));
        assert!(inlines.contains(&single));
        assert!(inlines.contains(&Inline::Str("it\u{2019}s".into())));
    }

    #[test]
    fn quotes_literal_without_smart() {
        let parsed = doc_with("say \"hi\".", &[]);
        let Block::Para(inlines) = &parsed.blocks[0] else {
            panic!("expected paragraph");
        };
        assert!(inlines.iter().all(|i| !matches!(i, Inline::Quoted(..))));
    }

    #[test]
    fn gfm_and_ascii_identifiers() {
        let gfm = doc_with(
            "* Foo Bar 1.2",
            &[Extension::AutoIdentifiers, Extension::GfmAutoIdentifiers],
        );
        assert!(matches!(&gfm.blocks[0], Block::Header(_, a, _) if a.id == "foo-bar-12"));

        let ascii = doc_with(
            "* Café Résumé",
            &[Extension::AutoIdentifiers, Extension::AsciiIdentifiers],
        );
        assert!(matches!(&ascii.blocks[0], Block::Header(_, a, _) if a.id == "cafe-resume"));
    }

    #[test]
    fn checkbox_literal_without_task_lists() {
        let parsed = doc_with("- [X] item", &[]);
        let Block::BulletList(items) = &parsed.blocks[0] else {
            panic!("expected bullet list");
        };
        let Block::Plain(inlines) = &items[0][0] else {
            panic!("expected plain");
        };
        assert!(inlines.contains(&Inline::Str("[X]".into())));
    }

    #[test]
    fn entity_replacement() {
        let parsed = blocks("\\alpha and \\unknownentity");
        let inlines = para(&parsed[0]);
        assert!(carta_ast::to_plain_text(inlines).contains('α'));
        assert!(inlines.iter().any(|i| matches!(i, Inline::RawInline(..))));
    }
}