Skip to main content

carta_readers/
man.rs

1//! Reader for the `man` macro package (the `groff`/`troff` manual-page language).
2//!
3//! A manual page is a sequence of control lines (a request or macro, introduced by `.` or `'` in
4//! the first column) and text lines. Text lines are *filled*: consecutive lines collapse into one
5//! paragraph, their words separated by single spaces. Macros structure the page — section headings
6//! (`.SH`/`.SS`), paragraph breaks (`.PP`), tagged and indented lists (`.TP`/`.IP`), relative insets
7//! (`.RS`/`.RE`), verbatim regions (`.nf`/`.EX`), and hyperlinks (`.UR`/`.MT`). Inline font macros
8//! (`.B`, `.I`, `.BR`, …) and the `\f` escape switch between roman, bold, and italic; the `\(xx`,
9//! `\[…]`, and `\*x` escapes produce special characters and predefined strings.
10//!
11//! The title macro `.TH` populates document metadata (`title`, `section`, `date`, `footer`,
12//! `header`); everything else becomes the block sequence.
13
14use std::collections::{BTreeMap, BTreeSet};
15
16use carta_ast::{
17    Alignment, Attr, Block, Caption, Cell, ColSpec, ColWidth, Document, Inline, ListAttributes,
18    ListNumberDelim, ListNumberStyle, MetaValue, Row, Table, TableBody, TableFoot, TableHead,
19    Target, slug, slug_gfm, to_plain_text,
20};
21use carta_core::{Extensions, Reader, ReaderOptions, Result};
22
23use crate::heading_ids::{IdRegistry, IdScheme, fold_to_ascii};
24use crate::inline_text::trim_inline_ends;
25
26/// A table of named strings: the predefined groff strings plus any defined with `.ds`, looked up by
27/// the `\*` interpolation escape.
28type Strings = BTreeMap<String, String>;
29
30/// The deepest a `\*` interpolation may recurse, bounding self-referential string definitions.
31const MAX_STRING_DEPTH: usize = 8;
32
33/// The most lines a single macro invocation may expand to, bounding self- and mutually-referential
34/// macro definitions so an invocation cannot loop forever.
35const MAX_MACRO_EXPANSION_LINES: usize = 100_000;
36
37/// The named strings groff defines before any input is read, keyed as the `\*` escape spells them:
38/// `\*R`, `\*(Tm`, `\*(lq`, `\*(rq`.
39fn predefined_strings() -> Strings {
40    [
41        ("R", "\u{00ae}"),
42        ("Tm", "\u{2122}"),
43        ("lq", "\u{201c}"),
44        ("rq", "\u{201d}"),
45    ]
46    .into_iter()
47    .map(|(name, value)| (name.to_owned(), value.to_owned()))
48    .collect()
49}
50
51/// Parses a manual page written in the `man` macro language into the document model.
52#[derive(Debug, Default, Clone, Copy)]
53pub struct ManReader;
54
55impl Reader for ManReader {
56    fn read(&self, input: &str, options: &ReaderOptions) -> Result<Document> {
57        let lines = logical_lines(input);
58        let mut parser = Parser::new(lines, options.extensions);
59        let blocks = parser.parse_blocks(Ctx::TOP);
60        Ok(Document {
61            meta: parser
62                .meta
63                .into_iter()
64                .map(|(k, v)| (k.into(), v))
65                .collect(),
66            blocks,
67            ..Document::default()
68        })
69    }
70}
71
/// Breaks the input into logical lines, gluing input-continuation lines together. A physical line
/// that ends in an odd number of backslashes continues: its last backslash is dropped and the next
/// physical line is appended with nothing in between. An even count ends the logical line as is.
/// A trailing `\r` is stripped from every physical line first, and a continuation left open at
/// end of input is still emitted.
fn logical_lines(input: &str) -> Vec<String> {
    let mut lines = Vec::new();
    let mut joined = String::new();
    let mut open = false;
    for piece in input.split('\n') {
        joined.push_str(piece.strip_suffix('\r').unwrap_or(piece));
        // A backslash is a single ASCII byte, so counting trailing bytes equals counting chars.
        let backslashes = joined.bytes().rev().take_while(|&b| b == b'\\').count();
        open = backslashes % 2 == 1;
        if open {
            joined.pop();
        } else {
            // `take` leaves `joined` empty, ready for the next logical line.
            lines.push(std::mem::take(&mut joined));
        }
    }
    if open {
        lines.push(joined);
    }
    lines
}
99
100/// The active typeface for a run of text. `\f(BI` and the `.BI`/`.IB` macros render bold-italic as
101/// emphasis wrapping strong. The constant-width faces (`\f(CW`, `\fC`, `.CW`) render as inline code,
102/// with a bold or italic constant-width face wrapping that code in the corresponding markup.
103#[derive(Debug, Clone, Copy, PartialEq, Eq)]
104enum Font {
105    Regular,
106    Bold,
107    Italic,
108    BoldItalic,
109    Mono,
110    MonoBold,
111    MonoItalic,
112}
113
114impl Font {
115    /// Wraps already-built inline content in the markup for this font; roman content is unwrapped.
116    /// A constant-width face collapses its content to a single inline-code span.
117    fn wrap(self, inlines: Vec<Inline>) -> Vec<Inline> {
118        if inlines.is_empty() {
119            return Vec::new();
120        }
121        self.wrap_forced(inlines)
122    }
123
124    /// Wraps the inlines in this font's markup unconditionally — even when they are empty. A
125    /// single-font macro called with an explicit argument keeps its styled wrapper around empty
126    /// content, whereas a font run that produces nothing collapses (see [`wrap`]).
127    fn wrap_forced(self, inlines: Vec<Inline>) -> Vec<Inline> {
128        match self {
129            Font::Regular => inlines,
130            Font::Bold => vec![Inline::Strong(inlines)],
131            Font::Italic => vec![Inline::Emph(inlines)],
132            Font::BoldItalic => vec![Inline::Emph(vec![Inline::Strong(inlines)])],
133            Font::Mono => vec![code_inline(&inlines)],
134            Font::MonoBold => vec![Inline::Strong(vec![code_inline(&inlines)])],
135            Font::MonoItalic => vec![Inline::Emph(vec![code_inline(&inlines)])],
136        }
137    }
138}
139
140/// Collapses a run of inline content into a single inline-code span, recovering its literal text.
141fn code_inline(inlines: &[Inline]) -> Inline {
142    let mut text = String::new();
143    collect_code_text(inlines, &mut text);
144    Inline::Code(Box::default(), text.into())
145}
146
147fn collect_code_text(inlines: &[Inline], out: &mut String) {
148    for inline in inlines {
149        match inline {
150            Inline::Str(s) => out.push_str(s),
151            Inline::Space => out.push(' '),
152            Inline::Strong(xs) | Inline::Emph(xs) => collect_code_text(xs, out),
153            _ => {}
154        }
155    }
156}
157
/// The conditions under which a block sequence stops early. A section heading always climbs back
/// to the top level; a paragraph or list-item macro ends an open list-item body so the enclosing
/// list can carry on or close.
#[derive(Debug, Clone, Copy)]
struct Ctx {
    /// Parsing the inside of a `.RS` inset: a closing `.RE` hands control back to the opener.
    in_inset: bool,
    /// Parsing a list item's body: a sibling item or a paragraph break ends the body.
    in_item: bool,
}

impl Ctx {
    /// The document's top level: neither inside an inset nor inside a list item.
    const TOP: Ctx = Ctx {
        in_inset: false,
        in_item: false,
    };
    /// The body of a `.RS` inset.
    const INSET: Ctx = Ctx {
        in_inset: true,
        ..Ctx::TOP
    };
    /// The body of a list item.
    const ITEM: Ctx = Ctx {
        in_item: true,
        ..Ctx::TOP
    };
}
182
183/// Hands out heading identifiers in reading order, disambiguating repeats the way the active
184/// auto-identifier extension prescribes.
185struct HeadingIds {
186    scheme: Option<IdScheme>,
187    ascii: bool,
188    registry: IdRegistry,
189}
190
191impl HeadingIds {
192    fn new(extensions: Extensions) -> Self {
193        Self {
194            scheme: IdScheme::select(extensions, false),
195            ascii: extensions.contains(carta_core::Extension::AsciiIdentifiers),
196            registry: IdRegistry::default(),
197        }
198    }
199
200    fn assign(&mut self, inlines: &[Inline]) -> String {
201        let Some(scheme) = self.scheme else {
202            return String::new();
203        };
204        let text = to_plain_text(inlines);
205        // The slug shape follows the active extension, but a manual page always disambiguates
206        // natively: an empty slug becomes `section` and repeats increment until unused.
207        let base = match scheme {
208            IdScheme::Plain => slug(&text),
209            IdScheme::Gfm => slug_gfm(&text),
210        };
211        // ASCII folding transliterates the finished slug, so a separator left by a word whose
212        // letters all lack an ASCII base is preserved. The plain shape then re-drops its leading
213        // run up to the first letter, which folding away a leading word can expose; the gfm shape
214        // never strips a leading run.
215        let base = if self.ascii {
216            let folded = fold_to_ascii(&base);
217            match scheme {
218                IdScheme::Plain => folded
219                    .chars()
220                    .skip_while(|c| !c.is_ascii_alphabetic())
221                    .collect(),
222                IdScheme::Gfm => folded,
223            }
224        } else {
225            base
226        };
227        self.registry.assign_native(base)
228    }
229}
230
231struct Parser {
232    lines: Vec<String>,
233    pos: usize,
234    /// Lines from an in-progress macro expansion, not yet consumed. The logical current line is
235    /// this queue's front when non-empty, else `lines[pos]` — expanding a macro call pushes its
236    /// body here instead of splicing it into `lines`, so expansion cost is independent of how much
237    /// of the document remains unparsed.
238    pending: std::collections::VecDeque<String>,
239    meta: BTreeMap<String, MetaValue>,
240    headings: HeadingIds,
241    /// Named strings interpolated by `\*`: the predefined groff set, extended by `.ds`.
242    strings: Strings,
243    /// User-defined macros (`.de`/`.de1`), keyed by name; the value is the macro body's lines.
244    macros: BTreeMap<String, Vec<String>>,
245    /// Set when the most recent `.ie` condition was false, so the following `.el` takes its branch.
246    else_branch: bool,
247}
248
249impl Parser {
250    fn new(lines: Vec<String>, extensions: Extensions) -> Self {
251        Self {
252            lines,
253            pos: 0,
254            pending: std::collections::VecDeque::new(),
255            meta: BTreeMap::new(),
256            headings: HeadingIds::new(extensions),
257            strings: predefined_strings(),
258            macros: BTreeMap::new(),
259            else_branch: false,
260        }
261    }
262
263    fn peek(&self) -> Option<&str> {
264        self.pending
265            .front()
266            .map(String::as_str)
267            .or_else(|| self.lines.get(self.pos).map(String::as_str))
268    }
269
270    fn advance(&mut self) {
271        if self.pending.pop_front().is_none() {
272            self.pos += 1;
273        }
274    }
275
276    /// The control-line request name of the line at `pos`, if it is a non-comment control line.
277    fn peek_request(&self) -> Option<&str> {
278        let line = self.peek()?;
279        if is_comment(line) {
280            return None;
281        }
282        control_parts(line).map(|(name, _)| name)
283    }
284
285    /// Consumes and returns the next line, if any.
286    fn take_line(&mut self) -> Option<String> {
287        if let Some(line) = self.pending.pop_front() {
288            return Some(line);
289        }
290        let line = self.lines.get(self.pos).cloned();
291        if line.is_some() {
292            self.pos += 1;
293        }
294        line
295    }
296
297    /// Replaces the current line with the taken branch of a conditional so the main loop reprocesses
298    /// it as a fresh logical line (text or control line). An empty branch is skipped outright.
299    fn reprocess_as(&mut self, content: &str) {
300        let content = content.trim_start_matches([' ', '\t']);
301        if content.is_empty() {
302            self.advance();
303        } else if let Some(slot) = self.pending.front_mut() {
304            content.clone_into(slot);
305        } else if let Some(slot) = self.lines.get_mut(self.pos) {
306            content.clone_into(slot);
307        } else {
308            self.advance();
309        }
310    }
311
312    /// Consumes the body of a `.de`/`.de1` macro definition up to (but not including) the line whose
313    /// request name is `end` (the default end is `..`, whose request name is a single `.`), or to end
314    /// of input, and returns the collected body lines. The terminator line is consumed.
315    fn collect_macro_definition(&mut self, end: &str) -> Vec<String> {
316        let mut body = Vec::new();
317        while let Some(line) = self.peek().map(str::to_owned) {
318            self.advance();
319            let is_end =
320                !is_comment(&line) && control_parts(&line).is_some_and(|(name, _)| name == end);
321            if is_end {
322                break;
323            }
324            body.push(reduce_copy_mode(&line));
325        }
326        body
327    }
328
329    /// Expands a macro invocation into a flat list of lines, substituting the call's arguments for
330    /// `\$N` references and inlining any nested macro calls. Re-entrant calls and a per-invocation
331    /// line budget bound the expansion so a self- or mutually-referential macro cannot loop forever.
332    fn expand_macro_call(&self, name: &str, args: &[String]) -> Vec<String> {
333        let mut out = Vec::new();
334        let mut active = BTreeSet::new();
335        self.expand_macro_into(name, args, &mut active, &mut out);
336        out
337    }
338
339    fn expand_macro_into(
340        &self,
341        name: &str,
342        args: &[String],
343        active: &mut BTreeSet<String>,
344        out: &mut Vec<String>,
345    ) {
346        if out.len() >= MAX_MACRO_EXPANSION_LINES || active.contains(name) {
347            return;
348        }
349        let Some(body) = self.macros.get(name) else {
350            return;
351        };
352        active.insert(name.to_owned());
353        for raw in body {
354            if out.len() >= MAX_MACRO_EXPANSION_LINES {
355                break;
356            }
357            match control_parts(raw) {
358                Some((inner, inner_rest))
359                    if !is_comment(raw) && self.macros.contains_key(inner) =>
360                {
361                    // A nested call to a user macro receives the substituted arguments.
362                    let inner_args = split_args(&substitute_macro_args(inner_rest, args));
363                    self.expand_macro_into(inner, &inner_args, active, out);
364                }
365                // A request line is emitted verbatim; argument references in a request's own
366                // arguments are left for ordinary escape processing, which yields nothing.
367                Some(_) => out.push(raw.clone()),
368                // A text line has its argument references substituted.
369                None => out.push(substitute_macro_args(raw, args)),
370            }
371        }
372        active.remove(name);
373    }
374
    /// Parses a sequence of blocks until the context's terminator (or end of input). A terminator
    /// line is left unconsumed for the caller, except a `.RE` that closes the inset it belongs to.
    ///
    /// Text lines and inline macros accumulate in an open paragraph (`fill`); block-level macros
    /// hand that paragraph to `flush_para` before doing their own work. What ends the sequence
    /// early depends on `ctx`: a section heading ends an inset or an item body, a paragraph or
    /// list-item macro ends an item body, and `.RE` ends an inset.
    // The macro dispatch lists names separately for clarity even where their handling coincides.
    #[allow(clippy::too_many_lines, clippy::match_same_arms)]
    fn parse_blocks(&mut self, ctx: Ctx) -> Vec<Block> {
        let mut blocks = Vec::new();
        // Inline content of the paragraph currently being filled.
        let mut fill = Vec::new();
        // Whether a text line has opened the current paragraph: a paragraph made only of
        // whitespace-filled lines is still emitted (as `Para []`), unlike a macro-driven flush.
        let mut started = false;
        // The line is copied out so the arms below are free to mutate the parser.
        while let Some(line) = self.peek().map(str::to_owned) {
            // An empty line ends the open paragraph.
            if line.is_empty() {
                flush_para(&mut fill, &mut blocks, &mut started);
                self.advance();
                continue;
            }
            // Anything that is not a control line is text and joins the open paragraph.
            let Some((name, rest)) = control_parts(&line) else {
                self.advance();
                append_text(&mut fill, tokenize(&line, Font::Regular, &self.strings));
                started = true;
                continue;
            };
            // Comments are dropped without disturbing the open paragraph.
            if is_comment(&line) {
                self.advance();
                continue;
            }
            match name {
                "SH" | "SS" => {
                    // A heading belongs to the top level: inside an inset or an item body it is
                    // left unconsumed so that every enclosing sequence can unwind to it.
                    if ctx.in_inset || ctx.in_item {
                        flush_para(&mut fill, &mut blocks, &mut started);
                        return blocks;
                    }
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                    // `.SH` is a level-1 heading, `.SS` a level-2 heading.
                    let level = if name == "SH" { 1 } else { 2 };
                    let inlines = self.heading_inlines(rest);
                    let id = self.headings.assign(&inlines);
                    blocks.push(Block::Header(
                        level,
                        Box::new(Attr {
                            id: id.into(),
                            ..Attr::default()
                        }),
                        inlines,
                    ));
                }
                "PP" | "LP" | "P" | "HP" => {
                    // A paragraph break; inside an item body it ends the body and is left for
                    // the caller to consume.
                    flush_para(&mut fill, &mut blocks, &mut started);
                    if ctx.in_item {
                        return blocks;
                    }
                    self.advance();
                }
                "TP" | "IP" => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    if ctx.in_item {
                        return blocks;
                    }
                    // The macro line is not consumed here: `parse_list` reads it itself.
                    let list = self.parse_list();
                    blocks.extend(list);
                }
                "TQ" => {
                    // Outside an item body a `.TQ` only breaks the paragraph.
                    flush_para(&mut fill, &mut blocks, &mut started);
                    if ctx.in_item {
                        return blocks;
                    }
                    self.advance();
                }
                "RS" => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                    let inner = self.parse_blocks(Ctx::INSET);
                    // Within an item body the inset's blocks are spliced in directly; elsewhere
                    // they are wrapped in a block quote.
                    if ctx.in_item {
                        blocks.extend(inner);
                    } else {
                        blocks.push(Block::BlockQuote(inner));
                    }
                }
                "RE" => {
                    // Always consumed; it only ends the sequence when an inset is open here.
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                    if ctx.in_inset {
                        return blocks;
                    }
                }
                "nf" | "EX" => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                    blocks.push(self.parse_verbatim());
                }
                // Closers with no matching opener in progress just break the paragraph.
                "fi" | "EE" | "UE" | "ME" => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                }
                "TS" => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                    blocks.extend(self.parse_tbl());
                }
                "ds" => {
                    // A string definition leaves the open paragraph untouched.
                    self.advance();
                    self.define_string(rest);
                }
                "br" => {
                    // A line break inside the open paragraph; `started` is not set.
                    self.advance();
                    fill.push(Inline::LineBreak);
                }
                "sp" => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                }
                "TH" => {
                    // The title line only feeds metadata; it produces no block.
                    self.advance();
                    self.parse_title(rest);
                }
                "B" | "I" => {
                    self.advance();
                    let font = single_font(name);
                    // Without arguments the macro styles the following input line, and an empty
                    // result collapses; with arguments the wrapper is kept even around nothing.
                    let inlines = if rest.is_empty() {
                        let text = self.take_line().unwrap_or_default();
                        font.wrap(tokenize(&text, Font::Regular, &self.strings))
                    } else {
                        let text = split_args(rest).join(" ");
                        font.wrap_forced(tokenize(&text, Font::Regular, &self.strings))
                    };
                    append_text(&mut fill, inlines);
                    started = true;
                }
                "BR" | "RB" | "BI" | "IB" | "RI" | "IR" => {
                    self.advance();
                    // Without arguments the alternating fonts apply to the following input line.
                    let rest = if rest.is_empty() {
                        self.take_line().unwrap_or_default()
                    } else {
                        rest.to_owned()
                    };
                    append_text(
                        &mut fill,
                        alternating(&rest, fonts_for(name), &self.strings),
                    );
                    started = true;
                }
                "SY" => {
                    self.advance();
                    // The synopsis command name is set in bold, taken from the arguments or,
                    // failing that, from the following input line.
                    let text = if rest.is_empty() {
                        self.take_line().unwrap_or_default()
                    } else {
                        split_args(rest).join(" ")
                    };
                    append_text(&mut fill, font_macro(Font::Bold, &text, &self.strings));
                    started = true;
                }
                "OP" => {
                    self.advance();
                    append_text(&mut fill, option_synopsis(rest, &self.strings));
                    started = true;
                }
                "YS" => {
                    // Consumed without flushing the open paragraph.
                    self.advance();
                }
                "UR" | "MT" => {
                    self.advance();
                    // The link target is the first argument; `.MT` targets are mail addresses.
                    let url = split_args(rest).into_iter().next().unwrap_or_default();
                    let url = if name == "MT" {
                        format!("mailto:{url}")
                    } else {
                        url
                    };
                    if self.link_label_is_plain() {
                        self.parse_link(url, &mut fill);
                        started = true;
                    } else {
                        // A font macro (or any request) inside the label aborts the link: the open
                        // paragraph is flushed and the label content is emitted as its own blocks.
                        flush_para(&mut fill, &mut blocks, &mut started);
                        blocks.extend(self.parse_aborted_link());
                    }
                }
                "de" | "de1" => {
                    self.advance();
                    let args = split_args(rest);
                    // The optional second argument names the terminating request; the default
                    // `.` is the request name of a `..` line.
                    let end = args.get(1).map_or(".", String::as_str).to_owned();
                    // The body is consumed even when the definition carries no name, in which
                    // case it is discarded.
                    let body = self.collect_macro_definition(&end);
                    if let Some(name) = args.into_iter().next() {
                        self.macros.insert(name, body);
                    }
                }
                "if" => {
                    let (cond, branch) = split_condition(rest);
                    // A taken branch replaces the current line in place, so the next iteration
                    // reads it as a fresh line; an untaken one is skipped.
                    if condition_true(cond) {
                        self.reprocess_as(branch);
                    } else {
                        self.advance();
                    }
                }
                "ie" => {
                    let (cond, branch) = split_condition(rest);
                    let taken = condition_true(cond);
                    // Remember the outcome for the `.el` that follows.
                    self.else_branch = !taken;
                    if taken {
                        self.reprocess_as(branch);
                    } else {
                        self.advance();
                    }
                }
                "el" => {
                    // Taken only when the preceding `.ie` was false; the flag is then cleared.
                    if self.else_branch {
                        self.else_branch = false;
                        self.reprocess_as(rest);
                    } else {
                        self.advance();
                    }
                }
                // A call to a user-defined macro queues its expanded body ahead of the current
                // position so the queued lines are parsed in place, before the base document
                // resumes.
                // Built-in names are matched above, so a user macro of the same name is never
                // reached here.
                // NOTE(review): the expansion bound applies to a single call. Request lines in a
                // body are queued verbatim, so an `.if` line in a macro body that invokes the same
                // macro would be expanded again on each pass — confirm whether a document-wide
                // bound is needed.
                _ if self.macros.contains_key(name) => {
                    self.advance();
                    let args = split_args(rest);
                    let expansion = self.expand_macro_call(name, &args);
                    // Pushing to the front in reverse keeps the expansion in its original order.
                    for line in expansion.into_iter().rev() {
                        self.pending.push_front(line);
                    }
                }
                // An empty request (a bare control character) or one named only with control
                // characters (`.`, `..`, `'`) is a no-op that leaves the open paragraph filling.
                _ if is_noop_request(name) => {
                    self.advance();
                }
                // Any other request breaks the paragraph and is otherwise ignored.
                _ => {
                    flush_para(&mut fill, &mut blocks, &mut started);
                    self.advance();
                }
            }
        }
        // End of input closes whatever paragraph is still open.
        flush_para(&mut fill, &mut blocks, &mut started);
        blocks
    }
612
613    /// Heading inline content: the macro's arguments joined by spaces, or — when the macro carries
614    /// none — the following input line.
615    fn heading_inlines(&mut self, rest: &str) -> Vec<Inline> {
616        if rest.is_empty() {
617            let next = self.take_line().unwrap_or_default();
618            tokenize(&next, Font::Regular, &self.strings)
619        } else {
620            tokenize(&split_args(rest).join(" "), Font::Regular, &self.strings)
621        }
622    }
623
624    /// Reads `.TH` arguments into metadata: identifier, section, date, footer, header.
625    fn parse_title(&mut self, rest: &str) {
626        let keys = ["title", "section", "date", "footer", "header"];
627        for (key, arg) in keys.iter().zip(split_args(rest)) {
628            if arg.is_empty() {
629                continue;
630            }
631            let inlines = tokenize(&arg, Font::Regular, &self.strings);
632            self.meta
633                .insert((*key).to_owned(), MetaValue::MetaInlines(inlines));
634        }
635    }
636
637    /// Records a `.ds` string definition. The name is the first argument; the value is the remainder
638    /// of the line after the single separating space, truncated at an inline comment (`\"`) and with
639    /// trailing whitespace removed. The value keeps its own escapes, expanded when it is interpolated.
640    fn define_string(&mut self, rest: &str) {
641        let (name, value) = match rest.split_once([' ', '\t']) {
642            Some((name, value)) => (name, value),
643            None => (rest, ""),
644        };
645        if name.is_empty() {
646            return;
647        }
648        let value = match value.find("\\\"") {
649            Some(index) => value.get(..index).unwrap_or(value),
650            None => value,
651        };
652        let value = value.trim_end_matches([' ', '\t']);
653        self.strings.insert(name.to_owned(), value.to_owned());
654    }
655
656    /// Collects a verbatim region (`.nf`/`.EX`) as a code block. Lines keep their literal spacing;
657    /// escapes and font macros are reduced to plain text. The region ends at `.fi`/`.EE`, or at a
658    /// section heading or end of input (both left unconsumed).
659    fn parse_verbatim(&mut self) -> Block {
660        let mut text_lines: Vec<String> = Vec::new();
661        while let Some(line) = self.peek().map(str::to_owned) {
662            if let Some((name, rest)) = control_parts(&line) {
663                if is_comment(&line) {
664                    self.advance();
665                    continue;
666                }
667                match name {
668                    "fi" | "EE" => {
669                        self.advance();
670                        break;
671                    }
672                    "SH" | "SS" => break,
673                    "B" | "I" | "BR" | "RB" | "BI" | "IB" | "RI" | "IR" => {
674                        self.advance();
675                        text_lines.push(flatten(&split_args(rest).join(" "), &self.strings));
676                    }
677                    _ => self.advance(),
678                }
679            } else {
680                self.advance();
681                text_lines.push(flatten(&line, &self.strings));
682            }
683        }
684        Block::CodeBlock(Box::default(), text_lines.join("\n").into())
685    }
686
687    /// Parses a tbl table region (`.TS`/`.TE`) into a [`Block::Table`]. The region's structure is the
688    /// preprocessor's: an optional options line ending in `;` (from which the cell separator is read),
689    /// one or more format lines ending in `.` (the first fixes the column count and alignments), then
690    /// the data rows. A malformed region (no format line) yields no block. The region ends at `.TE`,
691    /// or at a section heading or end of input (both left unconsumed).
692    fn parse_tbl(&mut self) -> Vec<Block> {
693        let mut region: Vec<String> = Vec::new();
694        while let Some(line) = self.peek().map(str::to_owned) {
695            if let Some((name, _)) = control_parts(&line) {
696                if is_comment(&line) {
697                    self.advance();
698                    continue;
699                }
700                match name {
701                    "TE" => {
702                        self.advance();
703                        break;
704                    }
705                    "SH" | "SS" => break,
706                    _ => {
707                        self.advance();
708                        region.push(line);
709                    }
710                }
711            } else {
712                self.advance();
713                region.push(line);
714            }
715        }
716        build_tbl(&region).into_iter().collect()
717    }
718
719    /// Parses a run of consecutive `.TP`/`.IP` items into list blocks. Items of the same kind merge
720    /// into one list; an unmarked `.IP` becomes a standalone inset.
721    fn parse_list(&mut self) -> Vec<Block> {
722        let mut out = Vec::new();
723        let mut pending: Option<Pending> = None;
724        while let Some(line) = self.peek().map(str::to_owned) {
725            let Some((name, rest)) = control_parts(&line) else {
726                break;
727            };
728            if is_comment(&line) {
729                self.advance();
730                continue;
731            }
732            match name {
733                "TP" => {
734                    self.advance();
735                    let mut term = self.read_term();
736                    // A `.TQ` adds a further tagged term to the same item, on its own line.
737                    while self.peek_request() == Some("TQ") {
738                        self.advance();
739                        term.push(Inline::LineBreak);
740                        term.extend(self.read_term());
741                    }
742                    let body = self.parse_blocks(Ctx::ITEM);
743                    if body.is_empty() {
744                        // A tag with no body of its own takes the rest of the list as its body,
745                        // nesting it; with nothing left to take, the tag stands as a paragraph.
746                        let rest = self.parse_list();
747                        if rest.is_empty() {
748                            flush_pending(&mut pending, &mut out);
749                            out.push(Block::Para(term));
750                        } else {
751                            push_definition(&mut pending, &mut out, term, rest);
752                        }
753                    } else {
754                        push_definition(&mut pending, &mut out, term, body);
755                    }
756                }
757                "IP" => {
758                    self.advance();
759                    let args = split_args(rest);
760                    match args.first() {
761                        // No designator at all: an unmarked inset.
762                        None => {
763                            flush_pending(&mut pending, &mut out);
764                            let body = self.parse_blocks(Ctx::ITEM);
765                            // An unmarked inset with no body contributes nothing.
766                            if !body.is_empty() {
767                                out.push(Block::BlockQuote(body));
768                            }
769                        }
770                        Some(mark_raw) => {
771                            let mark = flatten(mark_raw, &self.strings);
772                            match classify_mark(&mark) {
773                                Mark::Bullet => {
774                                    let body = self.item_body();
775                                    push_bullet(&mut pending, &mut out, body);
776                                }
777                                Mark::Ordered(attrs) => {
778                                    let body = self.item_body();
779                                    push_ordered(&mut pending, &mut out, attrs, body);
780                                }
781                                // A present designator that is neither a bullet nor an enumerator —
782                                // including one that reduces to nothing — is a definition term.
783                                Mark::None | Mark::Text => {
784                                    let term = inlines_from_plain(&mark);
785                                    let body = self.item_body();
786                                    push_definition(&mut pending, &mut out, term, body);
787                                }
788                            }
789                        }
790                    }
791                }
792                _ => break,
793            }
794        }
795        flush_pending(&mut pending, &mut out);
796        out
797    }
798
799    /// A marked list item's body, where an empty body is represented as a single empty paragraph so
800    /// the item is still rendered.
801    fn item_body(&mut self) -> Vec<Block> {
802        let body = self.parse_blocks(Ctx::ITEM);
803        if body.is_empty() {
804            vec![Block::Para(Vec::new())]
805        } else {
806            body
807        }
808    }
809
810    /// The term of a `.TP` item: the next line, which is either a font macro or plain text.
811    fn read_term(&mut self) -> Vec<Inline> {
812        let Some(line) = self.take_line() else {
813            return Vec::new();
814        };
815        if let Some((name, rest)) = control_parts(&line) {
816            if is_comment(&line) {
817                return self.read_term();
818            }
819            match name {
820                "B" | "I" => {
821                    let font = single_font(name);
822                    return font_macro(font, &split_args(rest).join(" "), &self.strings);
823                }
824                "BR" | "RB" | "BI" | "IB" | "RI" | "IR" => {
825                    return alternating(rest, fonts_for(name), &self.strings);
826                }
827                _ => return tokenize(rest, Font::Regular, &self.strings),
828            }
829        }
830        tokenize(&line, Font::Regular, &self.strings)
831    }
832
833    /// Whether the label that opens at the current position is plain — only text lines (and comments)
834    /// up to a `.UE`/`.ME` terminator. A request inside the label, or end of input before the
835    /// terminator, makes the label non-plain, so the link is abandoned.
836    fn link_label_is_plain(&self) -> bool {
837        let lookahead = self
838            .pending
839            .iter()
840            .chain(self.lines.get(self.pos..).into_iter().flatten());
841        for line in lookahead {
842            if is_comment(line) {
843                continue;
844            }
845            if let Some((name, _)) = control_parts(line) {
846                return matches!(name, "UE" | "ME");
847            }
848        }
849        false
850    }
851
852    /// Collects a plain hyperlink's label between `.UR`/`.MT` and its `.UE`/`.ME` terminator,
853    /// appending the resulting link to the open paragraph. The label's text lines are concatenated
854    /// without separators; text after the terminator attaches to the link without a space.
855    fn parse_link(&mut self, url: String, fill: &mut Vec<Inline>) {
856        let mut label_text = String::new();
857        let mut trailing = String::new();
858        while let Some(line) = self.peek().map(str::to_owned) {
859            if is_comment(&line) {
860                self.advance();
861                continue;
862            }
863            self.advance();
864            if let Some((name, rest)) = control_parts(&line) {
865                if matches!(name, "UE" | "ME") {
866                    trailing = split_args(rest).join(" ");
867                }
868                break;
869            }
870            label_text.push_str(&line);
871        }
872        let label = tokenize(&label_text, Font::Regular, &self.strings);
873        append_text(
874            fill,
875            vec![Inline::Link(
876                Box::default(),
877                label,
878                Box::new(Target {
879                    url: url.into(),
880                    title: carta_ast::Text::default(),
881                }),
882            )],
883        );
884        if !trailing.is_empty() {
885            fill.extend(tokenize(&trailing, Font::Regular, &self.strings));
886        }
887    }
888
889    /// Parses the body of an abandoned link as a single paragraph: text lines fill normally, font
890    /// macros and `.br` apply, and the `.UE`/`.ME` terminator is consumed (its trailing text dropped).
891    /// Any other request ends the body, left unconsumed.
892    fn parse_aborted_link(&mut self) -> Vec<Block> {
893        let mut fill = Vec::new();
894        while let Some(line) = self.peek().map(str::to_owned) {
895            let Some((name, rest)) = control_parts(&line) else {
896                self.advance();
897                append_text(&mut fill, tokenize(&line, Font::Regular, &self.strings));
898                continue;
899            };
900            if is_comment(&line) {
901                self.advance();
902                continue;
903            }
904            match name {
905                "UE" | "ME" => {
906                    self.advance();
907                    break;
908                }
909                "br" => {
910                    self.advance();
911                    fill.push(Inline::LineBreak);
912                }
913                "B" | "I" => {
914                    self.advance();
915                    let font = single_font(name);
916                    let inlines = if rest.is_empty() {
917                        let text = self.take_line().unwrap_or_default();
918                        font.wrap(tokenize(&text, Font::Regular, &self.strings))
919                    } else {
920                        let text = split_args(rest).join(" ");
921                        font.wrap_forced(tokenize(&text, Font::Regular, &self.strings))
922                    };
923                    append_text(&mut fill, inlines);
924                }
925                "BR" | "RB" | "BI" | "IB" | "RI" | "IR" => {
926                    self.advance();
927                    let rest = if rest.is_empty() {
928                        self.take_line().unwrap_or_default()
929                    } else {
930                        rest.to_owned()
931                    };
932                    append_text(
933                        &mut fill,
934                        alternating(&rest, fonts_for(name), &self.strings),
935                    );
936                }
937                _ => break,
938            }
939        }
940        trim_inline_ends(&mut fill);
941        if fill.is_empty() {
942            Vec::new()
943        } else {
944            vec![Block::Para(fill)]
945        }
946    }
947}
948
949/// The font a single-font macro selects: `.B` is bold, every other (`.I`) is italic.
950fn single_font(name: &str) -> Font {
951    if name == "B" {
952        Font::Bold
953    } else {
954        Font::Italic
955    }
956}
957
958/// The two alternating fonts of an alternating font macro, applied to arguments in turn.
959fn fonts_for(name: &str) -> [Font; 2] {
960    match name {
961        "BR" => [Font::Bold, Font::Regular],
962        "RB" => [Font::Regular, Font::Bold],
963        "BI" => [Font::Bold, Font::Italic],
964        "IB" => [Font::Italic, Font::Bold],
965        "RI" => [Font::Regular, Font::Italic],
966        _ => [Font::Italic, Font::Regular],
967    }
968}
969
970/// Renders a single-font macro (`.B`/`.I`): the whole argument is read as roman text and then
971/// wrapped once in the macro's font, so an inner `\f` font change nests inside that font rather than
972/// replacing it.
973fn font_macro(font: Font, text: &str, strings: &Strings) -> Vec<Inline> {
974    font.wrap(tokenize(text, Font::Regular, strings))
975}
976
977/// Renders an alternating font macro: each argument takes the next font in the cycle, is read as
978/// roman text, and is wrapped in that font; the rendered arguments abut with no separating space.
979fn alternating(rest: &str, fonts: [Font; 2], strings: &Strings) -> Vec<Inline> {
980    let mut out = Vec::new();
981    for (index, arg) in split_args(rest).into_iter().enumerate() {
982        let font = fonts.get(index % 2).copied().unwrap_or(Font::Regular);
983        out.extend(font.wrap(tokenize(&arg, Font::Regular, strings)));
984    }
985    out
986}
987
988/// Renders a `.OP` command-option synopsis: the option name (the first argument) is set bold and an
989/// optional argument (the rest) roman, the whole bracketed as optional — `[ -name argument ]`.
990fn option_synopsis(rest: &str, strings: &Strings) -> Vec<Inline> {
991    let args = split_args(rest);
992    let mut out = vec![Inline::Str("[".into())];
993    if let Some(name) = args.first() {
994        out.push(Inline::Space);
995        out.extend(font_macro(Font::Bold, name, strings));
996    }
997    let argument = args.get(1..).unwrap_or(&[]).join(" ");
998    if !argument.is_empty() {
999        out.push(Inline::Space);
1000        out.extend(tokenize(&argument, Font::Regular, strings));
1001    }
1002    out.push(Inline::Space);
1003    out.push(Inline::Str("]".into()));
1004    out
1005}
1006
1007/// What kind of list a `.IP` marker introduces.
1008enum Mark {
1009    None,
1010    Bullet,
1011    Ordered(ListAttributes),
1012    Text,
1013}
1014
1015/// Builds inline content from plain text, splitting on whitespace into words separated by single
1016/// spaces.
1017fn inlines_from_plain(text: &str) -> Vec<Inline> {
1018    let mut out = Vec::new();
1019    for word in text.split_whitespace() {
1020        if !out.is_empty() {
1021            out.push(Inline::Space);
1022        }
1023        out.push(Inline::Str(word.into()));
1024    }
1025    out
1026}
1027
1028/// Classifies a `.IP` marker, already reduced to plain text: a bullet glyph, an enumerator (decimal,
1029/// alphabetic, or roman), or arbitrary text that becomes a definition term.
1030fn classify_mark(mark: &str) -> Mark {
1031    if mark.is_empty() {
1032        return Mark::None;
1033    }
1034    if matches!(mark, "*" | "\u{2022}" | "\u{00b7}" | "-" | "+") {
1035        return Mark::Bullet;
1036    }
1037    if let Some(attrs) = parse_enumerator(mark) {
1038        return Mark::Ordered(attrs);
1039    }
1040    Mark::Text
1041}
1042
1043/// Parses an ordered-list enumerator (`1.`, `a)`, `(iv)`, a bare letter, …) into its list
1044/// attributes, or returns `None` when the marker is not an enumerator.
1045fn parse_enumerator(mark: &str) -> Option<ListAttributes> {
1046    if let Some(inner) = mark.strip_prefix('(').and_then(|m| m.strip_suffix(')')) {
1047        return enumerator_body(inner, ListNumberDelim::TwoParens);
1048    }
1049    let (body, delim) = match mark.strip_suffix('.') {
1050        Some(body) => (body, ListNumberDelim::Period),
1051        None => match mark.strip_suffix(')') {
1052            Some(body) => (body, ListNumberDelim::OneParen),
1053            None => (mark, ListNumberDelim::DefaultDelim),
1054        },
1055    };
1056    enumerator_body(body, delim)
1057}
1058
1059/// Parses the numeric/alphabetic/roman body of an enumerator, with its delimiter already determined,
1060/// into list attributes, or returns `None` when the body is not an enumerator.
1061fn enumerator_body(body: &str, delim: ListNumberDelim) -> Option<ListAttributes> {
1062    if body.is_empty() {
1063        return None;
1064    }
1065    if body.chars().all(|c| c.is_ascii_digit()) {
1066        let start = body.parse().ok()?;
1067        return Some(ListAttributes {
1068            start,
1069            style: ListNumberStyle::Decimal,
1070            delim,
1071        });
1072    }
1073    if let Some(start) = roman_value(body) {
1074        let style = if body.chars().next().is_some_and(char::is_uppercase) {
1075            ListNumberStyle::UpperRoman
1076        } else {
1077            ListNumberStyle::LowerRoman
1078        };
1079        return Some(ListAttributes {
1080            start,
1081            style,
1082            delim,
1083        });
1084    }
1085    let mut chars = body.chars();
1086    if let (Some(c), None) = (chars.next(), chars.next())
1087        && c.is_ascii_alphabetic()
1088    {
1089        let start = i32::from((c.to_ascii_lowercase() as u8) - b'a') + 1;
1090        let style = if c.is_ascii_uppercase() {
1091            ListNumberStyle::UpperAlpha
1092        } else {
1093            ListNumberStyle::LowerAlpha
1094        };
1095        return Some(ListAttributes {
1096            start,
1097            style,
1098            delim,
1099        });
1100    }
1101    None
1102}
1103
/// The value of a string of roman digits, or `None` if it is not usable as a numeral.
///
/// Every character must be one of `i v x l c d m` in either case. A digit that is smaller than
/// the digit right after it is subtracted, every other digit is added. The canonical form is not
/// checked, so `iiii` evaluates to 4. `None` is returned for an empty string, for any other
/// character, when the total is not positive, and when the total does not fit in an `i32`.
fn roman_value(text: &str) -> Option<i32> {
    fn digit(c: char) -> Option<i32> {
        match c.to_ascii_lowercase() {
            'i' => Some(1),
            'v' => Some(5),
            'x' => Some(10),
            'l' => Some(50),
            'c' => Some(100),
            'd' => Some(500),
            'm' => Some(1000),
            _ => None,
        }
    }
    let mut digits = text.chars().map(digit).peekable();
    let mut total: i32 = 0;
    while let Some(current) = digits.next() {
        let value = current?;
        // An invalid successor is not compared here; it is rejected on the next iteration.
        let subtract = matches!(digits.peek(), Some(Some(next)) if value < *next);
        // Checked arithmetic: the marker comes from the document, so an absurdly long numeral
        // must yield `None` rather than overflow.
        total = if subtract {
            total.checked_sub(value)?
        } else {
            total.checked_add(value)?
        };
    }
    (total > 0).then_some(total)
}
1128
1129/// The accumulating list of the current kind. Consecutive same-kind items append to it; a
1130/// different kind flushes it first.
1131enum Pending {
1132    Definition(Vec<(Vec<Inline>, Vec<Vec<Block>>)>),
1133    Bullet(Vec<Vec<Block>>),
1134    Ordered(ListAttributes, Vec<Vec<Block>>),
1135}
1136
1137fn flush_pending(pending: &mut Option<Pending>, out: &mut Vec<Block>) {
1138    match pending.take() {
1139        Some(Pending::Definition(items)) => out.push(Block::DefinitionList(items)),
1140        Some(Pending::Bullet(items)) => out.push(Block::BulletList(items)),
1141        Some(Pending::Ordered(attrs, items)) => out.push(Block::OrderedList(attrs, items)),
1142        None => {}
1143    }
1144}
1145
1146fn push_definition(
1147    pending: &mut Option<Pending>,
1148    out: &mut Vec<Block>,
1149    term: Vec<Inline>,
1150    body: Vec<Block>,
1151) {
1152    if let Some(Pending::Definition(items)) = pending {
1153        items.push((term, vec![body]));
1154        return;
1155    }
1156    flush_pending(pending, out);
1157    *pending = Some(Pending::Definition(vec![(term, vec![body])]));
1158}
1159
1160fn push_bullet(pending: &mut Option<Pending>, out: &mut Vec<Block>, body: Vec<Block>) {
1161    if let Some(Pending::Bullet(items)) = pending {
1162        items.push(body);
1163        return;
1164    }
1165    flush_pending(pending, out);
1166    *pending = Some(Pending::Bullet(vec![body]));
1167}
1168
1169fn push_ordered(
1170    pending: &mut Option<Pending>,
1171    out: &mut Vec<Block>,
1172    attrs: ListAttributes,
1173    body: Vec<Block>,
1174) {
1175    if let Some(Pending::Ordered(_, items)) = pending {
1176        items.push(body);
1177        return;
1178    }
1179    flush_pending(pending, out);
1180    *pending = Some(Pending::Ordered(attrs, vec![body]));
1181}
1182
1183/// Moves an open paragraph's inlines into the block list. A paragraph with visible content is
1184/// emitted; one that a text line opened but that filled to nothing (only whitespace) is still
1185/// emitted as an empty paragraph; a run that no text line opened is dropped.
1186fn flush_para(fill: &mut Vec<Inline>, blocks: &mut Vec<Block>, started: &mut bool) {
1187    let mut trimmed = std::mem::take(fill);
1188    trim_inline_ends(&mut trimmed);
1189    if !trimmed.is_empty() {
1190        blocks.push(Block::Para(trimmed));
1191    } else if *started {
1192        blocks.push(Block::Para(Vec::new()));
1193    }
1194    *started = false;
1195}
1196
1197/// Appends fillable inline content to the open paragraph, inserting a single separating space
1198/// unless the paragraph is empty or already ends at a line break.
1199fn append_text(fill: &mut Vec<Inline>, inlines: Vec<Inline>) {
1200    if inlines.is_empty() {
1201        return;
1202    }
1203    if !fill.is_empty() && !matches!(fill.last(), Some(Inline::LineBreak)) {
1204        fill.push(Inline::Space);
1205    }
1206    fill.extend(inlines);
1207}
1208
/// Reports whether a line is a control line, i.e. whether its first character is one of the
/// control characters `.` and `'`.
fn is_control(line: &str) -> bool {
    matches!(line.as_bytes().first(), Some(b'.' | b'\''))
}
1213
/// Whether a line is a comment: a control line (introduced by `.` or `'`) whose request is `\"`
/// or `\#`.
///
/// Spaces and tabs between the control character and the comment escape are skipped, matching
/// how `control_parts` locates a request name, so `.  \" note` is a comment just like
/// `.\" note`.
fn is_comment(line: &str) -> bool {
    let Some(after_control) = line.strip_prefix(['.', '\'']) else {
        return false;
    };
    let request = after_control.trim_start_matches([' ', '\t']);
    request.starts_with("\\\"") || request.starts_with("\\#")
}
1222
1223/// Splits a control line into its request name and the remaining argument text, or returns `None`
1224/// for a text line. Whitespace between the control character and the request name is allowed and
1225/// skipped, so `.  SH` names the `SH` request.
1226fn control_parts(line: &str) -> Option<(&str, &str)> {
1227    if !is_control(line) {
1228        return None;
1229    }
1230    let body = line.get(1..).unwrap_or("").trim_start_matches([' ', '\t']);
1231    match body.split_once([' ', '\t']) {
1232        Some((name, rest)) => Some((name, rest.trim_start_matches([' ', '\t']))),
1233        None => Some((body, "")),
1234    }
1235}
1236
/// Reports whether a request name denotes a no-op control line: either empty (a bare control
/// character) or made up solely of control characters (`.`, `..`, `...`, `'`). Such a line is
/// transparent and does not interrupt fill.
fn is_noop_request(name: &str) -> bool {
    // Trimming control characters from both ends leaves nothing exactly when no other character
    // occurs in the name.
    name.trim_matches(['.', '\'']).is_empty()
}
1243
/// Separates the argument of a conditional request into the condition — everything up to the
/// first space or tab — and the branch text after that separator. Without a separator the whole
/// argument is the condition and the branch is empty.
fn split_condition(rest: &str) -> (&str, &str) {
    rest.split_once([' ', '\t']).unwrap_or((rest, ""))
}
1252
/// Decides whether a conditional request's condition holds. Only the nroff target `n` and the
/// constant `1` count as true; anything else — the troff target `t`, `0`, other numbers, register
/// and string tests — counts as false.
fn condition_true(cond: &str) -> bool {
    matches!(cond, "n" | "1")
}
1259
/// Breaks a macro argument string into arguments in the manner of `groff`. Arguments are
/// separated by runs of spaces and tabs. An argument that begins with a double quote extends to
/// the closing quote (or the end of input) and may contain blanks, with `""` inside it standing
/// for one literal quote. In an unquoted argument a backslash carries the next character along
/// with it, so an escaped space does not end the argument.
fn split_args(input: &str) -> Vec<String> {
    let is_blank = |c: &char| matches!(c, ' ' | '\t');
    let mut rest = input.chars().peekable();
    let mut out = Vec::new();
    loop {
        // Skip the separator run in front of the next argument.
        while rest.next_if(is_blank).is_some() {}
        let Some(&lead) = rest.peek() else {
            break;
        };
        let mut current = String::new();
        if lead == '"' {
            rest.next();
            while let Some(c) = rest.next() {
                match c {
                    // A doubled quote is a literal quote; a single one closes the argument.
                    '"' if rest.next_if_eq(&'"').is_some() => current.push('"'),
                    '"' => break,
                    other => current.push(other),
                }
            }
        } else {
            while let Some(c) = rest.next_if(|c| !is_blank(c)) {
                current.push(c);
                if c == '\\' {
                    // Keep whatever follows the backslash, even a blank.
                    current.extend(rest.next());
                }
            }
        }
        out.push(current);
    }
    out
}
1306
/// Applies copy-mode reduction to a line as it is stored in a macro body: an escaped backslash
/// `\\` collapses to a single `\`. This defers the remaining escapes — argument references `\$N`
/// among them — to the moment the macro is invoked, so a body written with `\\$1` and one written
/// with `\$1` resolve identically when the macro runs.
fn reduce_copy_mode(line: &str) -> String {
    if !line.contains('\\') {
        return line.to_owned();
    }
    let mut out = String::with_capacity(line.len());
    let mut chars = line.chars().peekable();
    while let Some(c) = chars.next() {
        // Of a backslash pair only the first is kept; the second is consumed here.
        if c == '\\' && chars.peek() == Some(&'\\') {
            chars.next();
        }
        out.push(c);
    }
    out
}

/// Substitutes a macro call's arguments for `\$N` references in one body line. `\$1`..`\$9` expand
/// to the corresponding argument (an absent one to nothing) and `\$0` to nothing. A `\$` that is
/// not followed by a digit is dropped, and the character after it is copied as ordinary text.
///
/// An escaped backslash `\\` is copied through unchanged and a `$` after it is literal, not a
/// reference. Every other backslash sequence is left untouched.
fn substitute_macro_args(line: &str, args: &[String]) -> String {
    if !line.contains("\\$") {
        return line.to_owned();
    }
    let mut out = String::with_capacity(line.len());
    let mut chars = line.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        match chars.peek() {
            Some('$') => {
                chars.next();
                push_macro_arg(&mut chars, args, &mut out);
            }
            // Preserve an escaped backslash intact; consuming one here would let a following
            // `$` be misread as an argument reference.
            Some('\\') => {
                chars.next();
                out.push_str("\\\\");
            }
            _ => out.push('\\'),
        }
    }
    out
}

/// After a `\$` reference, reads the one-digit argument index and appends the corresponding call
/// argument (nothing for `\$0` or an out-of-range index). When the next character is not a digit,
/// nothing is consumed and nothing is appended.
fn push_macro_arg(
    chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
    args: &[String],
    out: &mut String,
) {
    let Some(index) = chars.peek().and_then(|c| c.to_digit(10)) else {
        return;
    };
    chars.next();
    // Arguments are numbered from 1; `checked_sub` turns `\$0` into "no argument".
    let arg = usize::try_from(index)
        .ok()
        .and_then(|number| number.checked_sub(1))
        .and_then(|slot| args.get(slot));
    if let Some(arg) = arg {
        out.push_str(arg);
    }
}
1377
1378/// A scanned character together with the font in effect, or an inter-word separator carrying the
1379/// literal whitespace character it stands for (so a verbatim region can preserve a tab).
1380enum Atom {
1381    Char(Font, char),
1382    Space(char),
1383}
1384
1385/// Tokenizes a line of `man` text into inlines: words become [`Inline::Str`], runs of whitespace a
1386/// single [`Inline::Space`], and font runs wrap in the appropriate markup. Leading and trailing
1387/// spaces are dropped.
1388fn tokenize(text: &str, start_font: Font, strings: &Strings) -> Vec<Inline> {
1389    let atoms = scan(text, start_font, strings);
1390    let mut result: Vec<Inline> = Vec::new();
1391    let mut run: Vec<Inline> = Vec::new();
1392    let mut run_font = Font::Regular;
1393    let mut word = String::new();
1394    let mut word_font = Font::Regular;
1395    let mut pending_space = false;
1396
1397    let commit_word = |word: &mut String,
1398                       word_font: Font,
1399                       run: &mut Vec<Inline>,
1400                       run_font: &mut Font,
1401                       result: &mut Vec<Inline>,
1402                       pending_space: &mut bool| {
1403        if word.is_empty() {
1404            return;
1405        }
1406        let text = std::mem::take(word);
1407        if !run.is_empty() && word_font == *run_font {
1408            if *pending_space {
1409                run.push(Inline::Space);
1410            }
1411            run.push(Inline::Str(text.into()));
1412        } else {
1413            flush_run(run, *run_font, result);
1414            if *pending_space {
1415                push_space(result);
1416            }
1417            *run_font = word_font;
1418            run.push(Inline::Str(text.into()));
1419        }
1420        *pending_space = false;
1421    };
1422
1423    for atom in atoms {
1424        match atom {
1425            Atom::Char(font, c) => {
1426                if !word.is_empty() && font != word_font {
1427                    commit_word(
1428                        &mut word,
1429                        word_font,
1430                        &mut run,
1431                        &mut run_font,
1432                        &mut result,
1433                        &mut pending_space,
1434                    );
1435                }
1436                if word.is_empty() {
1437                    word_font = font;
1438                }
1439                word.push(c);
1440            }
1441            Atom::Space(_) => {
1442                commit_word(
1443                    &mut word,
1444                    word_font,
1445                    &mut run,
1446                    &mut run_font,
1447                    &mut result,
1448                    &mut pending_space,
1449                );
1450                pending_space = true;
1451            }
1452        }
1453    }
1454    commit_word(
1455        &mut word,
1456        word_font,
1457        &mut run,
1458        &mut run_font,
1459        &mut result,
1460        &mut pending_space,
1461    );
1462    flush_run(&mut run, run_font, &mut result);
1463    trim_inline_ends(&mut result);
1464    result
1465}
1466
1467fn flush_run(run: &mut Vec<Inline>, run_font: Font, result: &mut Vec<Inline>) {
1468    if !run.is_empty() {
1469        result.extend(run_font.wrap(std::mem::take(run)));
1470    }
1471}
1472
1473/// Appends a single top-level space, coalescing with any space already present.
1474fn push_space(result: &mut Vec<Inline>) {
1475    if !result.is_empty() && !matches!(result.last(), Some(Inline::Space)) {
1476        result.push(Inline::Space);
1477    }
1478}
1479
1480/// Reduces a line to plain text for a verbatim region: escapes and special characters resolve, font
1481/// markup is discarded, and literal spacing is preserved.
1482fn flatten(text: &str, strings: &Strings) -> String {
1483    let mut out = String::new();
1484    for atom in scan(text, Font::Regular, strings) {
1485        match atom {
1486            Atom::Char(_, c) | Atom::Space(c) => out.push(c),
1487        }
1488    }
1489    out
1490}
1491
1492/// Scans a line into atoms, resolving escape sequences and interpolating named strings.
1493fn scan(text: &str, start_font: Font, strings: &Strings) -> Vec<Atom> {
1494    let mut atoms = Vec::new();
1495    let mut font = start_font;
1496    let mut previous = start_font;
1497    scan_into(text, &mut font, &mut previous, &mut atoms, strings, 0);
1498    atoms
1499}
1500
1501/// Scans `text` into `atoms`, carrying the running font across the call so an interpolated `\*`
1502/// string can change the font for the remainder of the line. Font escapes (`\f…`) update the font;
1503/// an inline comment (`\"`/`\#`) ends the line; a `\*` string is expanded by re-scanning its value,
1504/// bounded by [`MAX_STRING_DEPTH`] so a self-referential definition cannot loop forever.
1505// Escape arms are listed separately by groff semantics even where two reduce to the same body.
1506#[allow(clippy::too_many_lines, clippy::match_same_arms)]
1507fn scan_into(
1508    text: &str,
1509    font: &mut Font,
1510    previous: &mut Font,
1511    atoms: &mut Vec<Atom>,
1512    strings: &Strings,
1513    depth: usize,
1514) {
1515    let mut chars = text.chars().peekable();
1516    while let Some(c) = chars.next() {
1517        if c == ' ' || c == '\t' {
1518            atoms.push(Atom::Space(c));
1519            continue;
1520        }
1521        if c != '\\' {
1522            atoms.push(Atom::Char(*font, c));
1523            continue;
1524        }
1525        let Some(&escape) = chars.peek() else {
1526            break;
1527        };
1528        match escape {
1529            'f' => {
1530                chars.next();
1531                let name = read_escape_name(&mut chars);
1532                apply_font(&name, font, previous);
1533            }
1534            '"' | '#' => break,
1535            '-' => {
1536                chars.next();
1537                atoms.push(Atom::Char(*font, '-'));
1538            }
1539            'e' | '\\' => {
1540                chars.next();
1541                atoms.push(Atom::Char(*font, '\\'));
1542            }
1543            '.' => {
1544                chars.next();
1545                atoms.push(Atom::Char(*font, '.'));
1546            }
1547            // An unpaddable space and a tab are inter-word separators; the tab keeps its own
1548            // character so a verbatim region preserves it.
1549            ' ' => {
1550                chars.next();
1551                atoms.push(Atom::Space(' '));
1552            }
1553            't' => {
1554                chars.next();
1555                atoms.push(Atom::Space('\t'));
1556            }
1557            '~' => {
1558                chars.next();
1559                atoms.push(Atom::Char(*font, '\u{00a0}'));
1560            }
1561            '0' => {
1562                chars.next();
1563                atoms.push(Atom::Char(*font, '\u{2007}'));
1564            }
1565            '^' => {
1566                chars.next();
1567                atoms.push(Atom::Char(*font, '\u{200a}'));
1568            }
1569            '|' => {
1570                chars.next();
1571                atoms.push(Atom::Char(*font, '\u{2006}'));
1572            }
1573            // Escapes that emit nothing: `\c` (continuation), the zero-width `\&` and friends, and
1574            // the half-line vertical motions `\u`/`\d`, which take no argument.
1575            '&' | ')' | ',' | '/' | ':' | '!' | '%' | '{' | '}' | 'c' | 'u' | 'd' => {
1576                chars.next();
1577            }
1578            '(' => {
1579                chars.next();
1580                let name: String = (&mut chars).take(2).collect();
1581                push_chars(atoms, *font, special_char(&name));
1582            }
1583            '[' => {
1584                chars.next();
1585                let name = read_delimited(&mut chars, ']');
1586                push_chars(atoms, *font, bracket_char(&name));
1587            }
1588            '*' => {
1589                chars.next();
1590                let name = read_escape_name(&mut chars);
1591                if depth < MAX_STRING_DEPTH
1592                    && let Some(value) = strings.get(&name)
1593                {
1594                    scan_into(value, font, previous, atoms, strings, depth + 1);
1595                }
1596            }
1597            's' => {
1598                chars.next();
1599                skip_size(&mut chars);
1600            }
1601            // `\n` reads a number-register name and `\k` a position-register name; both are discarded.
1602            'n' | 'k' => {
1603                chars.next();
1604                let _ = read_escape_name(&mut chars);
1605            }
1606            // `\z` outputs the next glyph with no width; the glyph is dropped here.
1607            'z' => {
1608                chars.next();
1609                chars.next();
1610            }
1611            // Color and named-argument escapes whose name (one char, `(xx`, or `[name]`) carries no
1612            // text: fill/stroke color (`\m`/`\M`), font family (`\F`), register format (`\g`),
1613            // environment value (`\V`), macro-as-string (`\Y`), and macro argument (`\$N`).
1614            'm' | 'M' | 'F' | 'g' | 'V' | 'Y' | '$' => {
1615                chars.next();
1616                let _ = read_escape_name(&mut chars);
1617            }
1618            // `\p` (break the output line) and `\a` (leader) both produce no text.
1619            'p' | 'a' => {
1620                chars.next();
1621            }
1622            // `\C'name'` names a glyph with an explicit delimiter, like `\[name]`.
1623            'C' => {
1624                chars.next();
1625                let name = match chars.next() {
1626                    Some(delim) => read_delimited(&mut chars, delim),
1627                    None => String::new(),
1628                };
1629                push_chars(atoms, *font, bracket_char(&name));
1630            }
1631            'h' | 'v' | 'w' | 'o' | 'b' | 'l' | 'L' | 'D' | 'N' | 'R' | 'A' | 'Z' | 'X' | 'B' => {
1632                chars.next();
1633                skip_delimited_arg(&mut chars);
1634            }
1635            other => {
1636                chars.next();
1637                atoms.push(Atom::Char(*font, other));
1638            }
1639        }
1640    }
1641}
1642
1643fn push_chars(atoms: &mut Vec<Atom>, font: Font, mapped: Option<char>) {
1644    atoms.push(Atom::Char(font, mapped.unwrap_or('\u{fffd}')));
1645}
1646
1647/// Reads an escape name after `\f`, `\*` or `\n`: one character, a two-character `(xx` name, or a
1648/// `[name]` group.
1649fn read_escape_name(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> String {
1650    match chars.peek() {
1651        Some('(') => {
1652            chars.next();
1653            chars.take(2).collect()
1654        }
1655        Some('[') => {
1656            chars.next();
1657            read_delimited(chars, ']')
1658        }
1659        Some(_) => chars.next().map(String::from).unwrap_or_default(),
1660        None => String::new(),
1661    }
1662}
1663
/// Collects the characters in front of the next `close`, consuming `close` itself but leaving it
/// out of the result. If the input ends first, everything that remained is returned.
fn read_delimited(chars: &mut std::iter::Peekable<std::str::Chars<'_>>, close: char) -> String {
    // `take_while` swallows the first character that fails the test, which is the delimiter.
    chars.by_ref().take_while(|&c| c != close).collect()
}
1674
/// Discards an argument wrapped in a repeated delimiter, as in `\h'amount'`: the first character
/// is taken as the delimiter and input is consumed through its next occurrence. If the delimiter
/// never recurs, the rest of the input is consumed.
fn skip_delimited_arg(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) {
    if let Some(delim) = chars.next() {
        // `any` stops right after the first match, so the closing delimiter is consumed as well.
        let _closed = chars.any(|c| c == delim);
    }
}
1686
1687/// Skips a `\s` size argument: an optional sign and one or two digits, or a delimited or grouped
1688/// form.
1689fn skip_size(chars: &mut std::iter::Peekable<std::str::Chars<'_>>) {
1690    match chars.peek() {
1691        Some('(') => {
1692            chars.next();
1693            chars.next();
1694            chars.next();
1695        }
1696        Some('[') => {
1697            chars.next();
1698            read_delimited(chars, ']');
1699        }
1700        Some('\'') => {
1701            chars.next();
1702            read_delimited(chars, '\'');
1703        }
1704        _ => {
1705            if matches!(chars.peek(), Some('+' | '-')) {
1706                chars.next();
1707            }
1708            for _ in 0..2 {
1709                if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) {
1710                    chars.next();
1711                } else {
1712                    break;
1713                }
1714            }
1715        }
1716    }
1717}
1718
1719/// Applies a `\f` font name to the running font, remembering the previous font so `P` (or an empty
1720/// name) can return to it.
1721// The named roman fonts are spelled out; any unrecognized name also falls back to roman.
1722#[allow(clippy::match_same_arms)]
1723fn apply_font(name: &str, font: &mut Font, previous: &mut Font) {
1724    let next = match name {
1725        "B" => Font::Bold,
1726        "I" => Font::Italic,
1727        "BI" | "IB" => Font::BoldItalic,
1728        "C" | "CW" | "CR" => Font::Mono,
1729        "CB" => Font::MonoBold,
1730        "CI" => Font::MonoItalic,
1731        "R" => Font::Regular,
1732        "P" | "" => {
1733            std::mem::swap(font, previous);
1734            return;
1735        }
1736        _ => Font::Regular,
1737    };
1738    *previous = *font;
1739    *font = next;
1740}
1741
1742/// Resolves a `\[name]` escape: a `uXXXX` Unicode escape or a special-character name.
1743fn bracket_char(name: &str) -> Option<char> {
1744    if let Some(hex) = name.strip_prefix('u') {
1745        return u32::from_str_radix(hex, 16).ok().and_then(char::from_u32);
1746    }
1747    special_char(name)
1748}
1749
1750/// Builds a [`Block::Table`] from the lines of a tbl region (those between `.TS` and `.TE`, both
1751/// excluded). The region is the preprocessor's: an optional options line ending in `;` (carrying the
1752/// cell separator in its `tab(X)` option), one or more format lines the last of which ends in `.`
1753/// (the first fixes the column count and alignments), then the data rows. A rule line (`_`/`=`) just
1754/// below the first data row promotes that row to the table head. A `T{`…`T}` text block spanning
1755/// several input lines collapses into one filled cell. A format declaring a horizontal span, which
1756/// the table model cannot express, renders as a placeholder paragraph. Returns `None` for a region
1757/// with no format line, where there is no table to build.
1758fn build_tbl(region: &[String]) -> Option<Block> {
1759    let mut index = 0;
1760    let mut separator = "\t".to_owned();
1761    if let Some(first) = region.first()
1762        && first.trim_end().ends_with(';')
1763    {
1764        if let Some(sep) = tab_option(first) {
1765            separator = sep;
1766        }
1767        index = 1;
1768    }
1769
1770    let aligns = parse_col_aligns(region.get(index)?);
1771    if aligns.is_empty() {
1772        return None;
1773    }
1774    let columns = aligns.len();
1775    let mut data_start = None;
1776    for (offset, line) in region.iter().enumerate().skip(index) {
1777        if line.trim_end().ends_with('.') {
1778            data_start = Some(offset + 1);
1779            break;
1780        }
1781    }
1782    let data_start = data_start?;
1783
1784    // A column that horizontally spans its neighbor has no representation in the table model, so a
1785    // region whose format declares one is rendered as a placeholder paragraph instead.
1786    if region
1787        .get(index..data_start)
1788        .unwrap_or(&[])
1789        .iter()
1790        .any(|line| format_has_span(line))
1791    {
1792        return Some(Block::Para(vec![Inline::Str("TABLE".into())]));
1793    }
1794
1795    let data = collapse_text_blocks(region.get(data_start..).unwrap_or(&[]), &separator);
1796
1797    let (head_lines, body_lines): (&[String], &[String]) =
1798        if data.get(1).is_some_and(|line| is_rule(line)) {
1799            (data.get(..1).unwrap_or(&[]), data.get(2..).unwrap_or(&[]))
1800        } else {
1801            (&[], &data)
1802        };
1803
1804    let col_specs = aligns
1805        .into_iter()
1806        .map(|align| ColSpec {
1807            align,
1808            width: ColWidth::ColWidthDefault,
1809        })
1810        .collect();
1811    let head = TableHead {
1812        attr: Attr::default(),
1813        rows: head_lines
1814            .iter()
1815            .map(|line| tbl_row(line, &separator, columns))
1816            .collect(),
1817    };
1818    let body = TableBody {
1819        attr: Attr::default(),
1820        row_head_columns: 0,
1821        head: Vec::new(),
1822        body: body_lines
1823            .iter()
1824            .filter(|line| !is_rule(line))
1825            .map(|line| tbl_row(line, &separator, columns))
1826            .collect(),
1827    };
1828
1829    Some(Block::Table(Box::new(Table {
1830        attr: Attr::default(),
1831        caption: Caption::default(),
1832        col_specs,
1833        head,
1834        bodies: vec![body],
1835        foot: TableFoot::default(),
1836    })))
1837}
1838
/// Extracts the cell separator from the `tab(X)` option of a tbl options line. Returns `None` when
/// the line has no `tab(`, the parenthesis is never closed, or nothing stands between the
/// parentheses.
fn tab_option(options: &str) -> Option<String> {
    let (_, after_open) = options.split_once("tab(")?;
    let (inside, _) = after_open.split_once(')')?;
    if inside.is_empty() {
        None
    } else {
        Some(inside.to_owned())
    }
}
1844
1845/// Parses the alignment of each column from a tbl format line. Each key letter (`l`/`a` left, `r`/`n`
1846/// right, `c` center) opens a column; `s` continues a horizontal span; a font modifier (`f` and its
1847/// name) and a width modifier (`w`/`p`/`v`/`m` and its parenthesized or numeric argument) are skipped.
1848fn parse_col_aligns(spec: &str) -> Vec<Alignment> {
1849    let mut aligns = Vec::new();
1850    let mut chars = spec.chars().peekable();
1851    while let Some(c) = chars.next() {
1852        match c.to_ascii_lowercase() {
1853            'l' | 'a' => aligns.push(Alignment::AlignLeft),
1854            'r' | 'n' => aligns.push(Alignment::AlignRight),
1855            'c' => aligns.push(Alignment::AlignCenter),
1856            'f' => match chars.peek() {
1857                Some('(') => {
1858                    chars.next();
1859                    chars.next();
1860                    chars.next();
1861                }
1862                Some('[') => {
1863                    chars.next();
1864                    read_delimited(&mut chars, ']');
1865                }
1866                Some(_) => {
1867                    chars.next();
1868                }
1869                None => {}
1870            },
1871            'w' | 'p' | 'v' | 'm' => {
1872                if chars.peek() == Some(&'(') {
1873                    chars.next();
1874                    for d in chars.by_ref() {
1875                        if d == ')' {
1876                            break;
1877                        }
1878                    }
1879                } else {
1880                    while matches!(chars.peek(), Some(d) if d.is_ascii_digit()) {
1881                        chars.next();
1882                    }
1883                }
1884            }
1885            _ => {}
1886        }
1887    }
1888    aligns
1889}
1890
1891/// Whether a tbl format line declares a horizontal span (an `s`/`S` key), skipping the font and width
1892/// modifiers whose own arguments could otherwise contain that letter.
1893fn format_has_span(spec: &str) -> bool {
1894    let mut chars = spec.chars().peekable();
1895    while let Some(c) = chars.next() {
1896        match c.to_ascii_lowercase() {
1897            's' => return true,
1898            'f' => match chars.peek() {
1899                Some('(') => {
1900                    chars.next();
1901                    chars.next();
1902                    chars.next();
1903                }
1904                Some('[') => {
1905                    chars.next();
1906                    read_delimited(&mut chars, ']');
1907                }
1908                Some(_) => {
1909                    chars.next();
1910                }
1911                None => {}
1912            },
1913            'w' | 'p' | 'v' | 'm' => {
1914                if chars.peek() == Some(&'(') {
1915                    chars.next();
1916                    for d in chars.by_ref() {
1917                        if d == ')' {
1918                            break;
1919                        }
1920                    }
1921                } else {
1922                    while matches!(chars.peek(), Some(d) if d.is_ascii_digit()) {
1923                        chars.next();
1924                    }
1925                }
1926            }
1927            _ => {}
1928        }
1929    }
1930    false
1931}
1932
/// Collapses tbl text blocks into single data lines. A field of `T{` begins a block whose content is
/// the following lines up to a line starting with `T}`; those lines join with single spaces into the
/// field. Fields after `T}` on its line continue the same row and are examined in turn, so a row
/// may carry several text blocks (`T}<sep>T{` closes one block and opens the next). A block that is
/// never closed takes every remaining line. Lines without a `T{` field pass through unchanged.
fn collapse_text_blocks(data: &[String], separator: &str) -> Vec<String> {
    let mut out = Vec::with_capacity(data.len());
    let mut lines = data.iter();
    while let Some(line) = lines.next() {
        if !line.split(separator).any(|field| field.trim() == "T{") {
            out.push(line.clone());
            continue;
        }
        // Fields still to be placed in this row. The fields that follow a `T}` are queued at the
        // front so that they are inspected for a further `T{` like any other field.
        let mut pending: std::collections::VecDeque<&str> = line.split(separator).collect();
        let mut fields: Vec<String> = Vec::new();
        while let Some(field) = pending.pop_front() {
            if field.trim() != "T{" {
                fields.push(field.to_owned());
                continue;
            }
            let mut block: Vec<&str> = Vec::new();
            for block_line in lines.by_ref() {
                if block_line.trim_start().starts_with("T}") {
                    // The first piece is the `T}` marker itself; the rest continue the row.
                    let tail: Vec<&str> = block_line.split(separator).skip(1).collect();
                    for continued in tail.into_iter().rev() {
                        pending.push_front(continued);
                    }
                    break;
                }
                block.push(block_line);
            }
            fields.push(block.join(" "));
        }
        out.push(fields.join(separator));
    }
    out
}
1973
/// Tells whether a tbl line draws a horizontal rule: once surrounding whitespace is removed, it
/// must be non-empty and consist solely of `_` and `=` characters.
fn is_rule(line: &str) -> bool {
    let body = line.trim();
    if body.is_empty() {
        return false;
    }
    body.chars().all(|c| matches!(c, '_' | '='))
}
1979
1980/// Builds one table row of exactly `columns` cells from a tbl data line: fields past the column count
1981/// are dropped and missing fields are filled with empty cells.
1982fn tbl_row(line: &str, separator: &str, columns: usize) -> Row {
1983    let mut cells: Vec<Cell> = line.split(separator).take(columns).map(tbl_cell).collect();
1984    while cells.len() < columns {
1985        cells.push(tbl_cell(""));
1986    }
1987    Row {
1988        attr: Attr::default(),
1989        cells,
1990    }
1991}
1992
1993/// Builds a table cell from raw field text: surviving backslash escapes are stripped and the
1994/// remainder is split on whitespace into words. An empty field yields a cell with no content.
1995fn tbl_cell(field: &str) -> Cell {
1996    let cleaned: String = field.chars().filter(|&c| c != '\\').collect();
1997    let mut inlines = Vec::new();
1998    for word in cleaned.split_whitespace() {
1999        if !inlines.is_empty() {
2000            inlines.push(Inline::Space);
2001        }
2002        inlines.push(Inline::Str(word.into()));
2003    }
2004    let content = if inlines.is_empty() {
2005        Vec::new()
2006    } else {
2007        vec![Block::Plain(inlines)]
2008    };
2009    Cell {
2010        attr: Attr::default(),
2011        align: Alignment::AlignDefault,
2012        row_span: 1,
2013        col_span: 1,
2014        content,
2015    }
2016}
2017
/// Looks up the character for a special-character name as written in `\(xx` or `\[name]`. Names
/// are case-sensitive. An unknown name gives `None`, which the caller renders as the replacement
/// character.
// The glyph table keeps one name per entry, which makes the function long by design.
#[allow(clippy::too_many_lines)]
fn special_char(name: &str) -> Option<char> {
    const GLYPHS: &[(&str, char)] = &[
        // Dashes, hyphens, and quotation.
        ("hy", '\u{2010}'),
        ("en", '\u{2013}'),
        ("em", '\u{2014}'),
        ("lq", '\u{201c}'),
        ("rq", '\u{201d}'),
        ("oq", '\u{2018}'),
        ("cq", '\u{2019}'),
        ("aq", '\''),
        ("dq", '"'),
        ("Bq", '\u{201e}'),
        ("bq", '\u{201a}'),
        ("Fo", '\u{00ab}'),
        ("Fc", '\u{00bb}'),
        ("fo", '\u{2039}'),
        ("fc", '\u{203a}'),
        ("ga", '`'),
        ("aa", '\u{00b4}'),
        ("ha", '^'),
        ("ti", '~'),
        ("ul", '_'),
        ("ru", '_'),
        ("rs", '\\'),
        ("sl", '/'),
        // Bullets, marks, and shapes.
        ("bu", '\u{00b7}'),
        ("ci", '\u{25cb}'),
        ("sq", '\u{25a1}'),
        ("lz", '\u{25ca}'),
        ("dg", '\u{2020}'),
        ("dd", '\u{2021}'),
        ("ps", '\u{00b6}'),
        ("sc", '\u{00a7}'),
        ("lh", '\u{261c}'),
        ("rh", '\u{261e}'),
        ("co", '\u{00a9}'),
        ("rg", '\u{00ae}'),
        ("tm", '\u{2122}'),
        ("fm", '\u{2032}'),
        ("sd", '\u{2033}'),
        ("de", '\u{00b0}'),
        ("mc", '\u{00b5}'),
        ("%0", '\u{2030}'),
        // Punctuation and bars.
        ("at", '@'),
        ("sh", '#'),
        ("or", '|'),
        ("ba", '|'),
        ("br", '\u{2502}'),
        ("bb", '\u{00a6}'),
        ("rn", '\u{203e}'),
        ("ct", '\u{00a2}'),
        // Currency.
        ("Do", '$'),
        ("Eu", '\u{20ac}'),
        ("eu", '\u{20ac}'),
        ("Po", '\u{00a3}'),
        ("Ye", '\u{00a5}'),
        ("Cs", '\u{00a4}'),
        // Fractions and ligatures.
        ("12", '\u{00bd}'),
        ("14", '\u{00bc}'),
        ("34", '\u{00be}'),
        ("ff", '\u{fb00}'),
        ("fi", '\u{fb01}'),
        ("fl", '\u{fb02}'),
        ("Fi", '\u{fb03}'),
        ("Fl", '\u{fb04}'),
        // Accented letters and accents.
        ("oA", '\u{00c5}'),
        ("oa", '\u{00e5}'),
        ("/L", '\u{0141}'),
        ("/l", '\u{0142}'),
        ("/O", '\u{00d8}'),
        ("/o", '\u{00f8}'),
        ("a-", '\u{00af}'),
        ("a.", '\u{02d9}'),
        ("ad", '\u{00a8}'),
        ("ah", '\u{02c7}'),
        ("a^", '^'),
        // Diaeresis.
        (":a", '\u{00e4}'),
        (":e", '\u{00eb}'),
        (":i", '\u{00ef}'),
        (":o", '\u{00f6}'),
        (":u", '\u{00fc}'),
        (":y", '\u{00ff}'),
        (":A", '\u{00c4}'),
        (":E", '\u{00cb}'),
        (":I", '\u{00cf}'),
        (":O", '\u{00d6}'),
        (":U", '\u{00dc}'),
        (":Y", '\u{0178}'),
        // Acute accent.
        ("'a", '\u{00e1}'),
        ("'c", '\u{0107}'),
        ("'e", '\u{00e9}'),
        ("'i", '\u{00ed}'),
        ("'o", '\u{00f3}'),
        ("'u", '\u{00fa}'),
        ("'y", '\u{00fd}'),
        ("'A", '\u{00c1}'),
        ("'C", '\u{0106}'),
        ("'E", '\u{00c9}'),
        ("'I", '\u{00cd}'),
        ("'O", '\u{00d3}'),
        ("'U", '\u{00da}'),
        ("'Y", '\u{00dd}'),
        // Grave accent.
        ("`a", '\u{00e0}'),
        ("`e", '\u{00e8}'),
        ("`i", '\u{00ec}'),
        ("`o", '\u{00f2}'),
        ("`u", '\u{00f9}'),
        ("`A", '\u{00c0}'),
        ("`E", '\u{00c8}'),
        ("`I", '\u{00cc}'),
        ("`O", '\u{00d2}'),
        ("`U", '\u{00d9}'),
        // Circumflex.
        ("^a", '\u{00e2}'),
        ("^e", '\u{00ea}'),
        ("^i", '\u{00ee}'),
        ("^o", '\u{00f4}'),
        ("^u", '\u{00fb}'),
        ("^A", '\u{00c2}'),
        ("^E", '\u{00ca}'),
        ("^I", '\u{00ce}'),
        ("^O", '\u{00d4}'),
        ("^U", '\u{00db}'),
        // Tilde.
        ("~a", '\u{00e3}'),
        ("~n", '\u{00f1}'),
        ("~o", '\u{00f5}'),
        ("~A", '\u{00c3}'),
        ("~N", '\u{00d1}'),
        ("~O", '\u{00d5}'),
        // Cedilla.
        (",c", '\u{00e7}'),
        (",C", '\u{00c7}'),
        // Other Latin letters and ligatures.
        ("ss", '\u{00df}'),
        ("ae", '\u{00e6}'),
        ("AE", '\u{00c6}'),
        ("oe", '\u{0153}'),
        ("OE", '\u{0152}'),
        ("-D", '\u{00d0}'),
        ("Sd", '\u{00f0}'),
        ("TP", '\u{00de}'),
        ("Tp", '\u{00fe}'),
        // Mathematical operators and relations.
        ("pl", '+'),
        ("mi", '\u{2212}'),
        ("mu", '\u{00d7}'),
        ("di", '\u{00f7}'),
        ("+-", '\u{00b1}'),
        ("**", '\u{2217}'),
        ("c*", '\u{2297}'),
        ("c+", '\u{2295}'),
        ("<=", '\u{2264}'),
        (">=", '\u{2265}'),
        ("!=", '\u{2260}'),
        ("==", '\u{2261}'),
        ("->", '\u{2192}'),
        ("<-", '\u{2190}'),
        ("eq", '='),
        ("no", '\u{00ac}'),
        ("sr", '\u{221a}'),
        ("is", '\u{222b}'),
        ("pd", '\u{2202}'),
        ("gr", '\u{2207}'),
        ("fa", '\u{2200}'),
        ("te", '\u{2203}'),
        ("if", '\u{221e}'),
        ("pt", '\u{221d}'),
        ("es", '\u{2205}'),
        ("ca", '\u{2229}'),
        ("cu", '\u{222a}'),
        ("sb", '\u{2282}'),
        ("sp", '\u{2283}'),
        ("ib", '\u{2286}'),
        ("ip", '\u{2287}'),
        ("mo", '\u{2208}'),
        ("nm", '\u{2209}'),
        ("pp", '\u{22a5}'),
        ("3d", '\u{2234}'),
        ("Ah", '\u{2135}'),
        ("Im", '\u{2111}'),
        ("Re", '\u{211c}'),
        ("wp", '\u{2118}'),
        // Angle brackets and extensible bars.
        ("la", '\u{27e8}'),
        ("ra", '\u{27e9}'),
        ("va", '\u{2195}'),
        ("an", '\u{23af}'),
        // Greek lowercase.
        ("*a", '\u{03b1}'),
        ("*b", '\u{03b2}'),
        ("*g", '\u{03b3}'),
        ("*d", '\u{03b4}'),
        ("*e", '\u{03b5}'),
        ("*z", '\u{03b6}'),
        ("*y", '\u{03b7}'),
        ("*h", '\u{03b8}'),
        ("*i", '\u{03b9}'),
        ("*k", '\u{03ba}'),
        ("*l", '\u{03bb}'),
        ("*m", '\u{03bc}'),
        ("*n", '\u{03bd}'),
        ("*c", '\u{03be}'),
        ("*o", '\u{03bf}'),
        ("*p", '\u{03c0}'),
        ("*r", '\u{03c1}'),
        ("ts", '\u{03c2}'),
        ("*s", '\u{03c3}'),
        ("*t", '\u{03c4}'),
        ("*u", '\u{03c5}'),
        ("*f", '\u{03c6}'),
        ("*x", '\u{03c7}'),
        ("*q", '\u{03c8}'),
        ("*w", '\u{03c9}'),
        // Greek uppercase.
        ("*A", '\u{0391}'),
        ("*B", '\u{0392}'),
        ("*G", '\u{0393}'),
        ("*D", '\u{0394}'),
        ("*E", '\u{0395}'),
        ("*Z", '\u{0396}'),
        ("*Y", '\u{0397}'),
        ("*H", '\u{0398}'),
        ("*I", '\u{0399}'),
        ("*K", '\u{039a}'),
        ("*L", '\u{039b}'),
        ("*M", '\u{039c}'),
        ("*N", '\u{039d}'),
        ("*C", '\u{039e}'),
        ("*O", '\u{039f}'),
        ("*P", '\u{03a0}'),
        ("*R", '\u{03a1}'),
        ("*S", '\u{03a3}'),
        ("*T", '\u{03a4}'),
        ("*U", '\u{03a5}'),
        ("*F", '\u{03a6}'),
        ("*X", '\u{03a7}'),
        ("*Q", '\u{03a8}'),
        ("*W", '\u{03a9}'),
    ];
    GLYPHS
        .iter()
        .find(|&&(key, _)| key == name)
        .map(|&(_, glyph)| glyph)
}
2273
2274#[cfg(test)]
2275mod tests {
2276    use super::*;
2277    use carta_core::Extension;
2278
2279    fn read(input: &str) -> Document {
2280        read_with(input, Extensions::from_list(&[Extension::AutoIdentifiers]))
2281    }
2282
2283    fn read_with(input: &str, extensions: Extensions) -> Document {
2284        let mut options = ReaderOptions::default();
2285        options.extensions = extensions;
2286        ManReader.read(input, &options).expect("read")
2287    }
2288
2289    #[test]
2290    fn title_populates_metadata() {
2291        let doc = read(".TH FOO 1 \"2024-01-01\" \"version 1.0\" \"Foo Manual\"\n");
2292        assert_eq!(
2293            doc.meta.get("title"),
2294            Some(&MetaValue::MetaInlines(vec![Inline::Str("FOO".into())]))
2295        );
2296        assert_eq!(
2297            doc.meta.get("section"),
2298            Some(&MetaValue::MetaInlines(vec![Inline::Str("1".into())]))
2299        );
2300        assert_eq!(
2301            doc.meta.get("header"),
2302            Some(&MetaValue::MetaInlines(vec![
2303                Inline::Str("Foo".into()),
2304                Inline::Space,
2305                Inline::Str("Manual".into()),
2306            ]))
2307        );
2308    }
2309
2310    #[test]
2311    fn section_headings_get_identifiers() {
2312        let doc = read(".TH T 1\n.SH NAME\nfoo\n.SS Sub Title\nbar\n");
2313        assert_eq!(
2314            doc.blocks.first(),
2315            Some(&Block::Header(
2316                1,
2317                Box::new(Attr {
2318                    id: "name".into(),
2319                    ..Attr::default()
2320                }),
2321                vec![Inline::Str("NAME".into())]
2322            ))
2323        );
2324        assert!(matches!(
2325            doc.blocks.get(2),
2326            Some(Block::Header(2, attr, _)) if attr.id == "sub-title"
2327        ));
2328    }
2329
2330    #[test]
2331    fn duplicate_headings_disambiguate() {
2332        let doc = read(".TH T 1\n.SH Foo\nx\n.SH Foo\ny\n");
2333        let ids: Vec<&str> = doc
2334            .blocks
2335            .iter()
2336            .filter_map(|b| match b {
2337                Block::Header(_, attr, _) => Some(attr.id.as_str()),
2338                _ => None,
2339            })
2340            .collect();
2341        assert_eq!(ids, vec!["foo", "foo-1"]);
2342    }
2343
2344    #[test]
2345    fn auto_identifiers_off_leaves_empty_id() {
2346        let doc = read_with(".TH T 1\n.SH Foo Bar\nx\n", Extensions::empty());
2347        assert!(matches!(
2348            doc.blocks.first(),
2349            Some(Block::Header(1, attr, _)) if attr.id.is_empty()
2350        ));
2351    }
2352
2353    #[test]
2354    fn lines_fill_into_one_paragraph() {
2355        let doc = read(".TH T 1\nfirst line\nsecond line\n");
2356        assert_eq!(
2357            doc.blocks.first(),
2358            Some(&Block::Para(vec![
2359                Inline::Str("first".into()),
2360                Inline::Space,
2361                Inline::Str("line".into()),
2362                Inline::Space,
2363                Inline::Str("second".into()),
2364                Inline::Space,
2365                Inline::Str("line".into()),
2366            ]))
2367        );
2368    }
2369
2370    #[test]
2371    fn blank_line_separates_paragraphs() {
2372        let doc = read(".TH T 1\none\n\ntwo\n");
2373        assert_eq!(doc.blocks.len(), 2);
2374    }
2375
2376    #[test]
2377    fn bold_macro_joins_arguments() {
2378        let doc = read(".TH T 1\n.B \"two words\" tail\n");
2379        assert_eq!(
2380            doc.blocks.first(),
2381            Some(&Block::Para(vec![Inline::Strong(vec![
2382                Inline::Str("two".into()),
2383                Inline::Space,
2384                Inline::Str("words".into()),
2385                Inline::Space,
2386                Inline::Str("tail".into()),
2387            ])]))
2388        );
2389    }
2390
2391    #[test]
2392    fn font_macro_nests_an_inner_font_escape() {
2393        let doc = read(".TH T 1\n.B \\-f \\fIfile\\fR tail\n");
2394        assert_eq!(
2395            doc.blocks.first(),
2396            Some(&Block::Para(vec![Inline::Strong(vec![
2397                Inline::Str("-f".into()),
2398                Inline::Space,
2399                Inline::Emph(vec![Inline::Str("file".into())]),
2400                Inline::Space,
2401                Inline::Str("tail".into()),
2402            ])]))
2403        );
2404    }
2405
2406    #[test]
2407    fn alternating_font_arg_wraps_an_inner_escape() {
2408        let doc = read(".TH T 1\n.BR a\\fIx\\fR b\n");
2409        assert_eq!(
2410            doc.blocks.first(),
2411            Some(&Block::Para(vec![
2412                Inline::Strong(vec![
2413                    Inline::Str("a".into()),
2414                    Inline::Emph(vec![Inline::Str("x".into())]),
2415                ]),
2416                Inline::Str("b".into()),
2417            ]))
2418        );
2419    }
2420
2421    #[test]
2422    fn alternating_fonts_abut_without_space() {
2423        let doc = read(".TH T 1\n.BR bold roman\n");
2424        assert_eq!(
2425            doc.blocks.first(),
2426            Some(&Block::Para(vec![
2427                Inline::Strong(vec![Inline::Str("bold".into())]),
2428                Inline::Str("roman".into()),
2429            ]))
2430        );
2431    }
2432
2433    #[test]
2434    fn inline_font_escape_groups_run() {
2435        let doc = read(".TH T 1\n\\fBtwo words\\fR plain\n");
2436        assert_eq!(
2437            doc.blocks.first(),
2438            Some(&Block::Para(vec![
2439                Inline::Strong(vec![
2440                    Inline::Str("two".into()),
2441                    Inline::Space,
2442                    Inline::Str("words".into()),
2443                ]),
2444                Inline::Space,
2445                Inline::Str("plain".into()),
2446            ]))
2447        );
2448    }
2449
2450    #[test]
2451    fn boundary_space_leaves_the_font_run() {
2452        let doc = read(".TH T 1\n\\fBbold \\fRroman\n");
2453        assert_eq!(
2454            doc.blocks.first(),
2455            Some(&Block::Para(vec![
2456                Inline::Strong(vec![Inline::Str("bold".into())]),
2457                Inline::Space,
2458                Inline::Str("roman".into()),
2459            ]))
2460        );
2461    }
2462
2463    #[test]
2464    fn break_macro_is_a_line_break() {
2465        let doc = read(".TH T 1\nbefore\n.br\nafter\n");
2466        assert_eq!(
2467            doc.blocks.first(),
2468            Some(&Block::Para(vec![
2469                Inline::Str("before".into()),
2470                Inline::LineBreak,
2471                Inline::Str("after".into()),
2472            ]))
2473        );
2474    }
2475
2476    #[test]
2477    fn comment_is_transparent() {
2478        let doc = read(".TH T 1\nvisible\n.\\\" a comment\nstill\n");
2479        assert_eq!(
2480            doc.blocks.first(),
2481            Some(&Block::Para(vec![
2482                Inline::Str("visible".into()),
2483                Inline::Space,
2484                Inline::Str("still".into()),
2485            ]))
2486        );
2487    }
2488
2489    #[test]
2490    fn special_characters_resolve() {
2491        let doc = read(".TH T 1\ndash \\- bullet \\(bu em \\(em\n");
2492        assert_eq!(
2493            doc.blocks.first(),
2494            Some(&Block::Para(vec![
2495                Inline::Str("dash".into()),
2496                Inline::Space,
2497                Inline::Str("-".into()),
2498                Inline::Space,
2499                Inline::Str("bullet".into()),
2500                Inline::Space,
2501                Inline::Str("\u{00b7}".into()),
2502                Inline::Space,
2503                Inline::Str("em".into()),
2504                Inline::Space,
2505                Inline::Str("\u{2014}".into()),
2506            ]))
2507        );
2508    }
2509
2510    #[test]
2511    fn unknown_special_character_is_replacement() {
2512        let doc = read(".TH T 1\nx \\(ZZ y\n");
2513        assert_eq!(
2514            doc.blocks.first(),
2515            Some(&Block::Para(vec![
2516                Inline::Str("x".into()),
2517                Inline::Space,
2518                Inline::Str("\u{fffd}".into()),
2519                Inline::Space,
2520                Inline::Str("y".into()),
2521            ]))
2522        );
2523    }
2524
2525    #[test]
2526    fn unicode_escape_resolves() {
2527        let doc = read(".TH T 1\n\\[u00C9]\n");
2528        assert_eq!(
2529            doc.blocks.first(),
2530            Some(&Block::Para(vec![Inline::Str("\u{00c9}".into())]))
2531        );
2532    }
2533
2534    #[test]
2535    fn tbl_region_becomes_a_table() {
2536        let doc = read(".TH T 1\n.TS\nl r.\nName\tAge\n_\nAda\t36\n.TE\nafter\n");
2537        let Some(Block::Table(table)) = doc.blocks.first() else {
2538            panic!("expected a table");
2539        };
2540        // Alignments come from the format line; widths stay default.
2541        assert_eq!(
2542            table.col_specs,
2543            vec![
2544                ColSpec {
2545                    align: Alignment::AlignLeft,
2546                    width: ColWidth::ColWidthDefault,
2547                },
2548                ColSpec {
2549                    align: Alignment::AlignRight,
2550                    width: ColWidth::ColWidthDefault,
2551                },
2552            ]
2553        );
2554        // The rule line under the first data row promotes it to the head.
2555        assert_eq!(table.head.rows.len(), 1);
2556        assert_eq!(table.head.rows.first().map(|row| row.cells.len()), Some(2));
2557        assert_eq!(table.bodies.first().map(|body| body.body.len()), Some(1));
2558        assert_eq!(
2559            doc.blocks.get(1),
2560            Some(&Block::Para(vec![Inline::Str("after".into())]))
2561        );
2562    }
2563
2564    #[test]
2565    fn tbl_without_header_rule_puts_every_row_in_the_body() {
2566        let doc = read(".TH T 1\n.TS\nc c.\nName\tAge\nAda\t36\n.TE\n");
2567        let Some(Block::Table(table)) = doc.blocks.first() else {
2568            panic!("expected a table");
2569        };
2570        assert!(table.head.rows.is_empty());
2571        assert_eq!(table.bodies.first().map(|body| body.body.len()), Some(2));
2572    }
2573
2574    #[test]
2575    fn malformed_tbl_region_yields_no_block() {
2576        let doc = read(".TS");
2577        assert!(doc.blocks.is_empty());
2578    }
2579
2580    #[test]
2581    fn tagged_paragraphs_become_a_definition_list() {
2582        let doc = read(".TH T 1\n.TP\n.B \\-v\nVerbose mode.\n.TP\n.B \\-f\nUse a file.\n");
2583        let Some(Block::DefinitionList(items)) = doc.blocks.first() else {
2584            panic!("expected a definition list");
2585        };
2586        assert_eq!(items.len(), 2);
2587        assert_eq!(
2588            items.first().map(|(term, _)| term.clone()),
2589            Some(vec![Inline::Strong(vec![Inline::Str("-v".into())])])
2590        );
2591    }
2592
2593    #[test]
2594    fn bullet_indented_paragraphs_become_a_bullet_list() {
2595        let doc = read(".TH T 1\n.IP \\(bu 2\none\n.IP \\(bu 2\ntwo\n");
2596        let Some(Block::BulletList(items)) = doc.blocks.first() else {
2597            panic!("expected a bullet list");
2598        };
2599        assert_eq!(items.len(), 2);
2600    }
2601
2602    #[test]
2603    fn numbered_indented_paragraphs_become_an_ordered_list() {
2604        let doc = read(".TH T 1\n.IP 3. 4\nthree\n.IP 4. 4\nfour\n");
2605        assert_eq!(
2606            doc.blocks.first(),
2607            Some(&Block::OrderedList(
2608                ListAttributes {
2609                    start: 3,
2610                    style: ListNumberStyle::Decimal,
2611                    delim: ListNumberDelim::Period,
2612                },
2613                vec![
2614                    vec![Block::Para(vec![Inline::Str("three".into())])],
2615                    vec![Block::Para(vec![Inline::Str("four".into())])],
2616                ]
2617            ))
2618        );
2619    }
2620
2621    #[test]
2622    fn roman_marker_is_lower_roman() {
2623        assert!(matches!(
2624            parse_enumerator("iv."),
2625            Some(ListAttributes {
2626                start: 4,
2627                style: ListNumberStyle::LowerRoman,
2628                delim: ListNumberDelim::Period,
2629            })
2630        ));
2631    }
2632
2633    #[test]
2634    fn bare_letter_marker_uses_its_position() {
2635        assert!(matches!(
2636            parse_enumerator("o"),
2637            Some(ListAttributes {
2638                start: 15,
2639                style: ListNumberStyle::LowerAlpha,
2640                delim: ListNumberDelim::DefaultDelim,
2641            })
2642        ));
2643    }
2644
2645    #[test]
2646    fn unmarked_indented_paragraph_is_an_inset() {
2647        let doc = read(".TH T 1\n.IP\nplain indented\n");
2648        assert!(matches!(doc.blocks.first(), Some(Block::BlockQuote(_))));
2649    }
2650
2651    #[test]
2652    fn relative_inset_becomes_a_block_quote() {
2653        let doc = read(".TH T 1\n.RS\ninside\n.RE\nafter\n");
2654        assert_eq!(
2655            doc.blocks.first(),
2656            Some(&Block::BlockQuote(vec![Block::Para(vec![Inline::Str(
2657                "inside".into()
2658            )])]))
2659        );
2660        assert_eq!(
2661            doc.blocks.get(1),
2662            Some(&Block::Para(vec![Inline::Str("after".into())]))
2663        );
2664    }
2665
2666    #[test]
2667    fn nested_insets_nest_block_quotes() {
2668        let doc = read(".TH T 1\n.RS\nouter\n.RS\ninner\n.RE\n.RE\n");
2669        assert!(matches!(
2670            doc.blocks.first(),
2671            Some(Block::BlockQuote(inner)) if inner.iter().any(|b| matches!(b, Block::BlockQuote(_)))
2672        ));
2673    }
2674
2675    #[test]
2676    fn no_fill_region_becomes_a_code_block() {
2677        let doc = read(".TH T 1\n.nf\nline one\n  indented\n.fi\n");
2678        assert_eq!(
2679            doc.blocks.first(),
2680            Some(&Block::CodeBlock(
2681                Box::default(),
2682                "line one\n  indented".into()
2683            ))
2684        );
2685    }
2686
2687    #[test]
2688    fn example_region_becomes_a_code_block() {
2689        let doc = read(".TH T 1\n.EX\n\\fBcode\\fR \\- here\n.EE\n");
2690        assert_eq!(
2691            doc.blocks.first(),
2692            Some(&Block::CodeBlock(Box::default(), "code - here".into()))
2693        );
2694    }
2695
2696    #[test]
2697    fn uri_macro_becomes_a_link() {
2698        let doc = read(".TH T 1\n.UR https://example.com\nthe text\n.UE\n");
2699        assert_eq!(
2700            doc.blocks.first(),
2701            Some(&Block::Para(vec![Inline::Link(
2702                Box::default(),
2703                vec![
2704                    Inline::Str("the".into()),
2705                    Inline::Space,
2706                    Inline::Str("text".into()),
2707                ],
2708                Box::new(Target {
2709                    url: "https://example.com".into(),
2710                    title: carta_ast::Text::default(),
2711                }),
2712            )]))
2713        );
2714    }
2715
2716    #[test]
2717    fn mail_macro_uses_mailto() {
2718        let doc = read(".TH T 1\n.MT user@example.com\nwrite me\n.ME\n");
2719        let Some(Block::Para(inlines)) = doc.blocks.first() else {
2720            panic!("expected a paragraph");
2721        };
2722        assert!(matches!(
2723            inlines.first(),
2724            Some(Inline::Link(_, _, target)) if target.url == "mailto:user@example.com"
2725        ));
2726    }
2727
2728    #[test]
2729    fn link_trailing_text_attaches_without_space() {
2730        let doc = read(".TH T 1\nsee\n.UR https://x.org\nhere\n.UE .\nnext\n");
2731        let Some(Block::Para(inlines)) = doc.blocks.first() else {
2732            panic!("expected a paragraph");
2733        };
2734        // … the link, then the trailing "." with no separating space.
2735        let link_index = inlines
2736            .iter()
2737            .position(|i| matches!(i, Inline::Link(..)))
2738            .expect("link present");
2739        assert_eq!(inlines.get(link_index + 1), Some(&Inline::Str(".".into())));
2740    }
2741
2742    #[test]
2743    fn unknown_macro_breaks_the_paragraph() {
2744        let doc = read(".TH T 1\nbefore\n.XYZ args\nafter\n");
2745        assert_eq!(doc.blocks.len(), 2);
2746    }
2747
2748    #[test]
2749    fn defined_string_interpolates_and_rescans_its_escapes() {
2750        let doc = read(".TH T 1\n.ds B \\fBbold\\fP\nx \\*B y\n");
2751        assert_eq!(
2752            doc.blocks.first(),
2753            Some(&Block::Para(vec![
2754                Inline::Str("x".into()),
2755                Inline::Space,
2756                Inline::Strong(vec![Inline::Str("bold".into())]),
2757                Inline::Space,
2758                Inline::Str("y".into()),
2759            ]))
2760        );
2761    }
2762
2763    #[test]
2764    fn predefined_strings_resolve() {
2765        let doc = read(".TH T 1\n\\*(lq x \\*(rq \\*(Tm \\*R\n");
2766        assert_eq!(
2767            doc.blocks.first(),
2768            Some(&Block::Para(vec![
2769                Inline::Str("\u{201c}".into()),
2770                Inline::Space,
2771                Inline::Str("x".into()),
2772                Inline::Space,
2773                Inline::Str("\u{201d}".into()),
2774                Inline::Space,
2775                Inline::Str("\u{2122}".into()),
2776                Inline::Space,
2777                Inline::Str("\u{00ae}".into()),
2778            ]))
2779        );
2780    }
2781
2782    #[test]
2783    fn accented_special_characters_resolve() {
2784        let doc = read(".TH T 1\n\\(:a\\(ss\\('e\\(la\\(,c\n");
2785        assert_eq!(
2786            doc.blocks.first(),
2787            Some(&Block::Para(vec![Inline::Str(
2788                "\u{e4}\u{df}\u{e9}\u{27e8}\u{e7}".into()
2789            )]))
2790        );
2791    }
2792
2793    #[test]
2794    fn tab_escape_becomes_a_space() {
2795        let doc = read(".TH T 1\na\\tb\n");
2796        assert_eq!(
2797            doc.blocks.first(),
2798            Some(&Block::Para(vec![
2799                Inline::Str("a".into()),
2800                Inline::Space,
2801                Inline::Str("b".into()),
2802            ]))
2803        );
2804    }
2805
2806    #[test]
2807    fn continuation_escape_is_dropped() {
2808        // `\c` vanishes; the two text lines still fill with a separating space.
2809        let doc = read(".TH T 1\nabc\\c\ndef\n");
2810        assert_eq!(
2811            doc.blocks.first(),
2812            Some(&Block::Para(vec![
2813                Inline::Str("abc".into()),
2814                Inline::Space,
2815                Inline::Str("def".into()),
2816            ]))
2817        );
2818    }
2819
2820    #[test]
2821    fn zero_width_and_motion_escapes_drop_their_glyphs() {
2822        // `\z` drops the following glyph, `\u`/`\d` take no argument, `\k` reads a register name.
2823        let doc = read(".TH T 1\na\\zbc up\\udown\\d mark\\kx end\n");
2824        assert_eq!(
2825            doc.blocks.first(),
2826            Some(&Block::Para(vec![
2827                Inline::Str("ac".into()),
2828                Inline::Space,
2829                Inline::Str("updown".into()),
2830                Inline::Space,
2831                Inline::Str("mark".into()),
2832                Inline::Space,
2833                Inline::Str("end".into()),
2834            ]))
2835        );
2836    }
2837
2838    #[test]
2839    fn trailing_backslash_joins_the_next_line_without_a_space() {
2840        let doc = read(".TH T 1\nfoo\\\nbar\n");
2841        assert_eq!(
2842            doc.blocks.first(),
2843            Some(&Block::Para(vec![Inline::Str("foobar".into())]))
2844        );
2845    }
2846
2847    #[test]
2848    fn supplementary_tag_joins_terms_with_a_line_break() {
2849        let doc = read(".TH T 1\n.TP\n.B \\-a\n.TQ\n.B \\-b\nbody.\n");
2850        let Some(Block::DefinitionList(items)) = doc.blocks.first() else {
2851            panic!("expected a definition list");
2852        };
2853        assert_eq!(
2854            items.first().map(|(term, _)| term.clone()),
2855            Some(vec![
2856                Inline::Strong(vec![Inline::Str("-a".into())]),
2857                Inline::LineBreak,
2858                Inline::Strong(vec![Inline::Str("-b".into())]),
2859            ])
2860        );
2861    }
2862
2863    #[test]
2864    fn request_in_link_label_aborts_the_link() {
2865        // The label's request makes a link impossible; the label is emitted as its own block and the
2866        // text trailing the terminator is dropped.
2867        let doc = read(".TH T 1\nbefore\n.UR u\n.B bold\n.UE after\nnext\n");
2868        assert_eq!(
2869            doc.blocks,
2870            vec![
2871                Block::Para(vec![Inline::Str("before".into())]),
2872                Block::Para(vec![Inline::Strong(vec![Inline::Str("bold".into())])]),
2873                Block::Para(vec![Inline::Str("next".into())]),
2874            ]
2875        );
2876    }
2877
2878    #[test]
2879    fn link_without_a_terminator_emits_its_label() {
2880        let doc = read(".TH T 1\n.UR u\nlabel\n");
2881        assert_eq!(
2882            doc.blocks.first(),
2883            Some(&Block::Para(vec![Inline::Str("label".into())]))
2884        );
2885    }
2886
2887    #[test]
2888    fn whitespace_only_line_does_not_break_the_paragraph() {
2889        let doc = read(".TH T 1\none\n \ntwo\n");
2890        assert_eq!(
2891            doc.blocks.first(),
2892            Some(&Block::Para(vec![
2893                Inline::Str("one".into()),
2894                Inline::Space,
2895                Inline::Str("two".into()),
2896            ]))
2897        );
2898        assert_eq!(doc.blocks.len(), 1);
2899    }
2900
2901    #[test]
2902    fn lone_whitespace_line_is_an_empty_paragraph() {
2903        let doc = read(".TH T 1\n \n");
2904        assert_eq!(doc.blocks.first(), Some(&Block::Para(Vec::new())));
2905    }
2906
2907    #[test]
2908    fn tagged_paragraph_with_no_body_becomes_a_paragraph() {
2909        let doc = read(".TH T 1\n.TP\n.B \\-x\n");
2910        assert_eq!(
2911            doc.blocks.first(),
2912            Some(&Block::Para(vec![Inline::Strong(vec![Inline::Str(
2913                "-x".into()
2914            )])]))
2915        );
2916    }
2917
2918    #[test]
2919    fn empty_tagged_paragraph_nests_the_following_items() {
2920        let doc = read(".TH T 1\n.TP\n.B \\-a\n.TP\n.B \\-b\nbody.\n");
2921        let Some(Block::DefinitionList(items)) = doc.blocks.first() else {
2922            panic!("expected a definition list");
2923        };
2924        assert_eq!(items.len(), 1);
2925        let nested = items
2926            .first()
2927            .and_then(|(_, bodies)| bodies.first())
2928            .and_then(|blocks| blocks.first());
2929        assert!(matches!(nested, Some(Block::DefinitionList(_))));
2930    }
2931
2932    #[test]
2933    fn marked_item_with_no_body_keeps_an_empty_paragraph() {
2934        let doc = read(".TH T 1\n.IP \\(bu\n.IP \\(bu\nsecond.\n");
2935        let Some(Block::BulletList(items)) = doc.blocks.first() else {
2936            panic!("expected a bullet list");
2937        };
2938        assert_eq!(items.first(), Some(&vec![Block::Para(Vec::new())]));
2939    }
2940
2941    #[test]
2942    fn unmarked_item_with_no_body_contributes_nothing() {
2943        let doc = read(".TH T 1\n.IP\n");
2944        assert!(doc.blocks.is_empty());
2945    }
2946
2947    #[test]
2948    fn ascii_identifiers_fold_an_accented_heading() {
2949        let doc = read_with(
2950            ".TH T 1\n.SH Café\nx\n",
2951            Extensions::from_list(&[Extension::AutoIdentifiers, Extension::AsciiIdentifiers]),
2952        );
2953        assert!(matches!(
2954            doc.blocks.first(),
2955            Some(Block::Header(1, attr, _)) if attr.id == "cafe"
2956        ));
2957    }
2958
2959    #[test]
2960    fn constant_width_font_escape_becomes_code() {
2961        let doc = read(".TH T 1\nplain \\f(CWmono\\fP back\n");
2962        assert_eq!(
2963            doc.blocks.first(),
2964            Some(&Block::Para(vec![
2965                Inline::Str("plain".into()),
2966                Inline::Space,
2967                Inline::Code(Box::default(), "mono".into()),
2968                Inline::Space,
2969                Inline::Str("back".into()),
2970            ]))
2971        );
2972    }
2973
2974    #[test]
2975    fn constant_width_bold_font_wraps_code_in_strong() {
2976        let doc = read(".TH T 1\n\\f(CBmono\\fP\n");
2977        assert_eq!(
2978            doc.blocks.first(),
2979            Some(&Block::Para(vec![Inline::Strong(vec![Inline::Code(
2980                Box::default(),
2981                "mono".into()
2982            )])]))
2983        );
2984    }
2985
2986    #[test]
2987    fn user_macro_substitutes_call_arguments() {
2988        let doc = read(".TH T 1\n.de GREET\nHello \\$1 and \\$2.\n..\n.GREET Alice Bob\n");
2989        assert_eq!(
2990            doc.blocks.first(),
2991            Some(&Block::Para(vec![
2992                Inline::Str("Hello".into()),
2993                Inline::Space,
2994                Inline::Str("Alice".into()),
2995                Inline::Space,
2996                Inline::Str("and".into()),
2997                Inline::Space,
2998                Inline::Str("Bob.".into()),
2999            ]))
3000        );
3001    }
3002
3003    #[test]
3004    fn multi_line_macro_expansion_fills_like_inline_text() {
3005        let inline = read(".TH T 1\nfirst line\nsecond line\n");
3006        let via_macro = read(".TH T 1\n.de M\nfirst line\nsecond line\n..\n.M\n");
3007        assert_eq!(inline.blocks, via_macro.blocks);
3008    }
3009
3010    #[test]
3011    fn nested_macro_call_expands_in_place_preserving_order() {
3012        let doc =
3013            read(".TH T 1\n.de INNER\nmiddle\n..\n.de OUTER\nbefore\n.INNER\nafter\n..\n.OUTER\n");
3014        assert_eq!(
3015            doc.blocks.first(),
3016            Some(&Block::Para(vec![
3017                Inline::Str("before".into()),
3018                Inline::Space,
3019                Inline::Str("middle".into()),
3020                Inline::Space,
3021                Inline::Str("after".into()),
3022            ]))
3023        );
3024    }
3025
3026    #[test]
3027    fn macro_expansion_seam_keeps_base_lines_in_order() {
3028        let doc = read(".TH T 1\n.de M\nexpanded\n..\n.M\nbase line\n");
3029        assert_eq!(
3030            doc.blocks.first(),
3031            Some(&Block::Para(vec![
3032                Inline::Str("expanded".into()),
3033                Inline::Space,
3034                Inline::Str("base".into()),
3035                Inline::Space,
3036                Inline::Str("line".into()),
3037            ]))
3038        );
3039    }
3040
3041    #[test]
3042    fn conditional_inside_macro_expansion_reprocesses_the_queued_line() {
3043        // `.ie`/`.el` reprocess the *queued* expansion line in place; the base document's
3044        // following line must survive untouched.
3045        let doc = read(".TH T 1\n.de M\n.ie n kept\n.el dropped\n..\n.M\nbase line\n");
3046        assert_eq!(
3047            doc.blocks.first(),
3048            Some(&Block::Para(vec![
3049                Inline::Str("kept".into()),
3050                Inline::Space,
3051                Inline::Str("base".into()),
3052                Inline::Space,
3053                Inline::Str("line".into()),
3054            ]))
3055        );
3056    }
3057
3058    #[test]
3059    fn link_label_spanning_macro_expansion_and_base_document_is_recognized() {
3060        // The label opens inside a macro expansion (queued) and its terminator sits in the base
3061        // document (unqueued); the lookahead must chain across that seam to find it.
3062        let doc =
3063            read(".TH T 1\n.de LABEL\n.UR https://example.com\nfirst\n..\n.LABEL\nsecond\n.UE\n");
3064        let Some(Block::Para(inlines)) = doc.blocks.first() else {
3065            panic!("expected a paragraph");
3066        };
3067        assert!(matches!(
3068            inlines.first(),
3069            Some(Inline::Link(_, _, target)) if target.url == "https://example.com"
3070        ));
3071    }
3072
3073    #[test]
3074    fn doubled_backslash_argument_reference_reduces_like_a_single_one() {
3075        let single = read(".TH T 1\n.de M\nvalue \\$1\n..\n.M x\n");
3076        let doubled = read(".TH T 1\n.de M\nvalue \\\\$1\n..\n.M x\n");
3077        assert_eq!(single.blocks, doubled.blocks);
3078        assert_eq!(
3079            single.blocks.first(),
3080            Some(&Block::Para(vec![
3081                Inline::Str("value".into()),
3082                Inline::Space,
3083                Inline::Str("x".into()),
3084            ]))
3085        );
3086    }
3087
3088    #[test]
3089    fn copy_mode_reduces_an_escaped_backslash_before_an_escape() {
3090        assert_eq!(reduce_copy_mode("x\\\\(buy"), "x\\(buy");
3091        assert_eq!(reduce_copy_mode("plain text"), "plain text");
3092    }
3093
3094    #[test]
3095    fn font_macro_with_an_explicit_empty_argument_keeps_its_wrapper() {
3096        let doc = read(".TH T 1\nbefore\n.B \"\"\nafter\n");
3097        assert_eq!(
3098            doc.blocks.first(),
3099            Some(&Block::Para(vec![
3100                Inline::Str("before".into()),
3101                Inline::Space,
3102                Inline::Strong(Vec::new()),
3103                Inline::Space,
3104                Inline::Str("after".into()),
3105            ]))
3106        );
3107    }
3108
3109    #[test]
3110    fn font_macro_with_no_argument_takes_the_next_line() {
3111        let doc = read(".TH T 1\nbefore\n.I\nafter\n");
3112        assert_eq!(
3113            doc.blocks.first(),
3114            Some(&Block::Para(vec![
3115                Inline::Str("before".into()),
3116                Inline::Space,
3117                Inline::Emph(vec![Inline::Str("after".into())]),
3118            ]))
3119        );
3120    }
3121
3122    #[test]
3123    fn option_synopsis_brackets_a_bold_option_name() {
3124        let doc = read(".TH T 1\n.OP \\-o file\n");
3125        assert_eq!(
3126            doc.blocks.first(),
3127            Some(&Block::Para(vec![
3128                Inline::Str("[".into()),
3129                Inline::Space,
3130                Inline::Strong(vec![Inline::Str("-o".into())]),
3131                Inline::Space,
3132                Inline::Str("file".into()),
3133                Inline::Space,
3134                Inline::Str("]".into()),
3135            ]))
3136        );
3137    }
3138
3139    #[test]
3140    fn table_with_a_horizontal_span_degrades_to_a_placeholder() {
3141        let doc = read(".TH T 1\n.TS\nl s l.\nWide\t\tEnd\none\ttwo\tthree\n.TE\n");
3142        assert_eq!(
3143            doc.blocks.first(),
3144            Some(&Block::Para(vec![Inline::Str("TABLE".into())]))
3145        );
3146    }
3147
3148    #[test]
3149    fn table_text_block_joins_its_lines() {
3150        let doc = read(".TH T 1\n.TS\nl l.\nName\tT{\nA long\ndescription\nT}\nLeft\tRight\n.TE\n");
3151        let Some(Block::Table(table)) = doc.blocks.first() else {
3152            panic!("expected a table");
3153        };
3154        // The two source lines of the `T{ … T}` block join into a single cell.
3155        let cell_text = format!("{table:?}");
3156        assert!(cell_text.contains("long"));
3157        assert!(cell_text.contains("description"));
3158    }
3159
3160    #[test]
3161    fn east_asian_line_breaks_is_accepted_and_inert() {
3162        let input = ".TH T 1\n.SH H\nplain filled text\n";
3163        let base = read(input);
3164        let with = read_with(
3165            input,
3166            Extensions::from_list(&[Extension::AutoIdentifiers, Extension::EastAsianLineBreaks]),
3167        );
3168        assert_eq!(base.blocks, with.blocks);
3169    }
3170}