Skip to main content

crates_docs/tools/docs/
html.rs

1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4//! Uses the `scraper` crate for robust HTML5 parsing.
5
6use regex::Regex;
7use scraper::{Html, Selector};
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Element names that are dropped wholesale (tag *and* contents) when HTML is
/// cleaned, because nothing inside them is documentation text.
const SKIP_TAGS: &[&str] = &[
    "script",
    "style",
    "noscript",
    "iframe",
];
13
/// Names of block-level elements, kept in alphabetical order.
///
/// Plain-text extraction surrounds each of these with a [`BLOCK_SEP`] marker
/// so that neighbouring blocks (consecutive `<li>`/`<dt>` item-index entries,
/// table rows, paragraphs, ...) are not fused into one token such as
/// `Dl_infoElf32_Chdr`, and so every block can be written on its own line.
///
/// Table cells (`<td>`/`<th>`) are deliberately *not* listed: they are
/// delimited by [`CELL_SEP`] so a row stays on a single line. Inline tags are
/// excluded as well, otherwise text that is merely split across inline
/// elements (`ser`+`<wbr>`+`ializing`, `RandomState</a>,`) would pick up
/// spurious spaces.
const BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside",
    "blockquote", "br",
    "dd", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
    "li",
    "main",
    "nav",
    "ol",
    "p", "pre",
    "section",
    "table", "tbody", "tfoot", "thead", "tr",
    "ul",
];
58
/// Marker emitted around block-level elements (see [`BLOCK_TAGS`]) while
/// extracting plain text.
///
/// It is intentionally *not* whitespace: real block boundaries can later be
/// turned into newlines without being mistaken for the incidental whitespace
/// found inside text nodes (source-indentation newlines included), which is
/// collapsed to single spaces. NUL is a safe choice because it never appears
/// in rendered documentation text: the HTML parser replaces any literal NUL in
/// the input with U+FFFD.
const BLOCK_SEP: &str = "\u{0}";
67
/// Marker emitted around table cells (`<td>`/`<th>`) while extracting plain
/// text.
///
/// Where [`BLOCK_SEP`] ends up as a newline, `CELL_SEP` keeps all cells of a
/// table row on one line, joined by ` | `, so the columns of a row stay
/// together (e.g. `%C | 20 | The proleptic Gregorian year ...`). U+0001 never
/// appears in rendered documentation text, which makes it a safe sentinel
/// (cf. [`BLOCK_SEP`]).
const CELL_SEP: &str = "\u{1}";
75
// Stand-ins that carry the verbatim whitespace of `<pre>` code blocks through
// the whitespace-collapsing passes. All three are control characters that Rust
// does not treat as whitespace, so neither `str::split_whitespace` nor
// `str::lines` touches them; [`decode_pre`] swaps the real characters back in
// once collapsing has finished.

/// Placeholder for a space inside a `<pre>` block.
const PRE_SPACE: char = '\u{2}';
/// Placeholder for a newline inside a `<pre>` block.
const PRE_NEWLINE: char = '\u{3}';
/// Placeholder for a tab inside a `<pre>` block.
const PRE_TAB: char = '\u{4}';
84
85/// Regex to remove anchor links like [§](#xxx)
86static ANCHOR_LINK_REGEX: LazyLock<Regex> =
87    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").expect("hardcoded valid regex pattern"));
88
89/// Regex to remove relative source links like [Source](../src/...)
90static SOURCE_LINK_REGEX: LazyLock<Regex> =
91    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").expect("hardcoded valid regex pattern"));
92
93/// Regex to remove rustdoc `[src]`/`[[src]]` source links (older rustdoc).
94static SRC_LINK_REGEX: LazyLock<Regex> =
95    LazyLock::new(|| Regex::new(r"\[\[?src\]?\]\([^)]*\)").expect("hardcoded valid regex pattern"));
96
97/// Regex to remove rustdoc collapse-toggle links of the form
98/// `[ [-] ](javascript:void(0))` (the marker may be `-`, `+` or U+2212).
99///
100/// The toggle text contains a nested `[...]`, so this is matched explicitly to
101/// avoid greedily spanning adjacent links.
102static JS_TOGGLE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
103    Regex::new(r"\[\s*\[[-+\x{2212}]\]\s*\]\(javascript:[^\n)]*\)\)?")
104        .expect("hardcoded valid regex pattern")
105});
106
107/// Regex to remove plain `[text](javascript:...)` links emitted by older
108/// rustdoc. Link text must not contain `]` so it cannot span adjacent links.
109static JS_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
110    Regex::new(r"\[[^\]\n]*\]\(javascript:[^\n)]*\)\)?").expect("hardcoded valid regex pattern")
111});
112
113/// Regex to convert empty-target links `[text]()` to plain `text`.
114static EMPTY_LINK_REGEX: LazyLock<Regex> =
115    LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\(\)").expect("hardcoded valid regex pattern"));
116
117/// Regex to match no-op fragment-only links like `[serde](#)` or `[ⓘ](#)`
118/// (a bare `#` target navigates nowhere). The captured label is inspected by
119/// the caller: meaningful labels (containing an alphanumeric, e.g. a crate name
120/// in a versioned-page heading where rustdoc renders `<a href="#">serde</a>`)
121/// are downgraded to plain text, while symbol-only toggle markers (ⓘ, −, +)
122/// are dropped. Real in-page anchors such as `[Quick start](#quick-start)`
123/// keep a fragment id and never match.
124static FRAGMENT_TOGGLE_REGEX: LazyLock<Regex> =
125    LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\(#\)").expect("hardcoded valid regex pattern"));
126
127/// Regex to downgrade rustdoc *item-anchor* links to their plain-text label.
128///
129/// rustdoc cross-references items with fragment-only links whose id carries a
130/// type-specific prefix (`#method.foo`, `#tymethod.foo`, `#variant.Foo`,
131/// `#structfield.foo`, `#associatedtype.Error`, `#associatedconstant.MAX`,
132/// `#reexport.foo`) or the impl-block form (`#impl-Trait-for-Type`). These
133/// anchors only exist inside the rustdoc page; the rendered markdown has no
134/// matching heading id, so the links are dead. Group 1 captures the label
135/// (the item name) so it can be kept as text. Genuine in-page section anchors
136/// (e.g. `[Quick start](#quick-start)`) lack these prefixes and are untouched.
137static RUSTDOC_ITEM_ANCHOR_REGEX: LazyLock<Regex> = LazyLock::new(|| {
138    Regex::new(
139        r"\[([^\]]*)\]\(#(?:(?:method|tymethod|variant|structfield|associatedtype|associatedconstant|reexport)\.|impl-)[^)]*\)",
140    )
141    .expect("hardcoded valid regex pattern")
142});
143
144/// Regex to drop breadcrumb-residue lines that contain only `::` separators.
145///
146/// rustdoc item headers render a navigation breadcrumb such as
147/// `[tokio](../index.html)::[task](../index.html)::spawn`. Once the relative
148/// links are stripped, an orphan line of bare `::` separators can remain; it
149/// carries no information and is removed. Inline `::` inside code or text is
150/// unaffected because those lines contain other characters.
151static STRAY_COLON_LINE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
152    Regex::new(r"(?m)^[ \t]*:{2,}[ \t]*$").expect("hardcoded valid regex pattern")
153});
154
155/// Regex to drop orphan separator lines that contain only a middot (`·`).
156///
157/// rustdoc's `out-of-band` heading row renders `<source> · [-]` (a source link,
158/// a middot separator, and a collapse toggle). Once the source link and toggle
159/// are stripped, a lone `·` remains on its own line; it carries no information.
160/// Inline middots inside prose are unaffected because those lines have other
161/// characters.
162static STRAY_MIDDOT_LINE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
163    Regex::new(r"(?m)^[ \t]*\u{00b7}[ \t]*$").expect("hardcoded valid regex pattern")
164});
165
166/// Regex to strip an orphaned trailing middot separator from a line.
167///
168/// rustdoc joins out-of-band metadata with ` \u{00b7} ` separators, e.g.
169/// `1.0.0 \u{00b7} <source link>`. Once the trailing source/toggle link is
170/// removed, the line keeps a dangling ` \u{00b7}` that carries no meaning
171/// (e.g. the stability line becomes `1.0.0 \u{00b7}`). Drop the trailing
172/// middot together with the whitespace (including non-breaking spaces) that
173/// precedes it. Middots embedded in prose are unaffected because they are
174/// followed by more text.
175static TRAILING_MIDDOT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
176    Regex::new(r"(?m)[ \t\u{00a0}]*\u{00b7}[ \t\u{00a0}]*$").expect("hardcoded valid regex pattern")
177});
178
179/// Regex to trim trailing horizontal whitespace, including non-breaking spaces.
180///
181/// rustdoc headings and metadata rows frequently end with a stray space or
182/// non-breaking space (`\u{00a0}`) that html2md preserves, leaving artifacts
183/// like `Struct HashMap\u{00a0}` above a setext underline. Stripping trailing
184/// whitespace per line removes the noise without affecting content.
185static TRAILING_WS_REGEX: LazyLock<Regex> =
186    LazyLock::new(|| Regex::new(r"(?m)[ \t\u{00a0}]+$").expect("hardcoded valid regex pattern"));
187
188/// Regex to strip the redundant closing hashes html2md appends to ATX
189/// headings.
190///
191/// html2md 0.2.15 renders `<h3>`-`<h6>` as ATX headings with a trailing run of
192/// closing hashes (e.g. `### Examples ###`, `#### pub fn get() ####`). Those
193/// closing hashes are optional in `CommonMark` and read as noise, so we drop the
194/// trailing ` #+` while keeping the leading marker. Group 1 captures the
195/// heading text.
196static HEADING_TRAILING_HASH_REGEX: LazyLock<Regex> = LazyLock::new(|| {
197    Regex::new(r"(?m)^(#{1,6}[ \t].*?)[ \t]+#+[ \t]*$").expect("hardcoded valid regex pattern")
198});
199
200/// Matches an HTML superscript element (`<sup>...</sup>`) left verbatim in the
201/// markdown output.
202///
203/// `html2md` 0.2.15 has no handler for `<sup>`/`<sub>`, so rustdoc footnote
204/// references and exponents (e.g. `<sup id="fnref1"><a href="#fn1">1</a></sup>`)
205/// survive as literal HTML in the markdown. Group 1 captures the inner markup;
206/// [`clean_markdown`] strips any nested tags and re-emits it as `^(...)`.
207static SUPERSCRIPT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
208    Regex::new(r"(?is)<sup\b[^>]*>(.*?)</sup\s*>").expect("hardcoded valid regex pattern")
209});
210
211/// Matches an HTML subscript element (`<sub>...</sub>`) left verbatim in the
212/// markdown output. Counterpart to [`SUPERSCRIPT_REGEX`]; re-emitted as `_(...)`.
213static SUBSCRIPT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
214    Regex::new(r"(?is)<sub\b[^>]*>(.*?)</sub\s*>").expect("hardcoded valid regex pattern")
215});
216
217/// Matches a single HTML tag, used to strip residual inline markup (e.g. a
218/// nested `<a>`) from the inner content of a super/subscript before re-emitting
219/// it as plain text. See [`clean_markdown`].
220static INLINE_TAG_STRIP_REGEX: LazyLock<Regex> =
221    LazyLock::new(|| Regex::new(r"(?is)<[^>]+>").expect("hardcoded valid regex pattern"));
222
223/// Matches a negative auto-trait impl heading whose linkified trait name is
224/// glued to the leading `!`, e.g. `### impl<T> !Freeze for Mutex<T>`.
225///
226/// rustdoc emits the negative-impl marker as a text `!` immediately before the
227/// trait link (`!<a class="trait" ...>Freeze</a>`). html2md fuses these into
228/// `![Freeze](url)`, which is markdown image syntax and renders as a broken
229/// embedded image instead of the text `!Freeze`. Group 1 captures the heading
230/// prefix up to (and including) the `!`-glued bracket's `!`; [`clean_markdown`]
231/// re-emits it with the `!` backslash-escaped so it stays literal. Scoped to
232/// `impl` headings so genuine doc-body images are never touched.
233static NEGATIVE_IMPL_TRAIT_IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
234    Regex::new(r"(?m)^(#{1,6} +impl\b[^\n]*?)!\[").expect("hardcoded valid regex pattern")
235});
236
237/// Regex to rewrite relative documentation links to their link text.
238///
239/// Matches `[text](path.html)` where `path` begins with a letter, digit, `_`,
240/// `.` or `/` (covering module paths such as `_derive/index.html`,
241/// `../index.html`, `struct.Foo.html`) and ends with `.html` (optionally
242/// followed by a `#fragment`). Group 1 captures the link text and group 2 the
243/// URL. The link text may contain one level of nested brackets (e.g. an
244/// attribute label `#[tokio::main]` or a slice type `[u8]`).
245/// Docs.rs-relative targets are useless to an MCP client, so they are
246/// downgraded to their (meaningful) label; absolute external URLs containing a
247/// scheme (`://`) are kept intact since they are still reachable.
248static RELATIVE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
249    Regex::new(r"\[((?:[^\[\]]|\[[^\]]*\])*)\]\(([a-zA-Z0-9._/][^)]*\.html(?:#[^)]*)?)\)")
250        .expect("hardcoded valid regex pattern")
251});
252
253/// Matches a rustdoc "Read more" see-also affordance link (`[Read more](url)`).
254///
255/// rustdoc appends a `<a href="...">Read more</a>` link to the one-line summary
256/// of every inherited/trait method (e.g. derived `Clone`/`Debug`/`Hash`). When
257/// the target is a docs.rs-relative `.html` path it cannot be resolved by an
258/// MCP client, and downgrading it to its label leaves a meaningless dangling
259/// "Read more" at the end of the sentence. Group 1 captures any leading inline
260/// whitespace and group 2 the URL, so a relative affordance can be dropped
261/// entirely while an absolute (`scheme://`) one is preserved. See
262/// [`clean_markdown`].
263static READ_MORE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
264    Regex::new(r"([ \t]*)\[Read more\]\(([^)]*)\)").expect("hardcoded valid regex pattern")
265});
266
267/// Matches a rustdoc item-index table (`<dl class="item-table">...</dl>`).
268///
269/// docs.rs/rustdoc renders crate- and module-overview item indexes as a
270/// definition list of `<dt>` (item name + link) / optional `<dd>` (summary)
271/// pairs. `html2md` does not treat `<dt>` as block-level, so every entry
272/// collapses onto a single line (e.g. `Dl_infoElf32_ChdrElf32_Ehdr...`). We
273/// rewrite these tables into `<ul><li>` lists before markdown/text conversion
274/// so each item renders on its own line. The class only appears on overview
275/// pages, never on individual item pages.
276static ITEM_TABLE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
277    Regex::new(r"(?is)<dl[^>]*\bitem-table\b[^>]*>(.*?)</dl\s*>")
278        .expect("hardcoded valid regex pattern")
279});
280
281/// Matches a single `<dt>name</dt>` row with an optional following
282/// `<dd>summary</dd>` inside an item-table (see `ITEM_TABLE_REGEX`).
283static ITEM_TABLE_ROW_REGEX: LazyLock<Regex> = LazyLock::new(|| {
284    Regex::new(r"(?is)<dt\b[^>]*>(.*?)</dt\s*>\s*(?:<dd\b[^>]*>(.*?)</dd\s*>)?")
285        .expect("hardcoded valid regex pattern")
286});
287
288/// Regex to collapse three or more newlines to two newlines
289static MULTIPLE_NEWLINES_REGEX: LazyLock<Regex> =
290    LazyLock::new(|| Regex::new(r"\n\n\n+").expect("hardcoded valid regex pattern"));
291
292/// Matches a `<pre>...</pre>` block (verbatim code) so callers can leave its
293/// significant whitespace untouched while transforming the surrounding markup.
294static PRE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
295    Regex::new(r"(?is)<pre\b.*?</pre\s*>").expect("hardcoded valid regex pattern")
296});
297
298/// Matches a whitespace run that contains a newline/tab/CR immediately before
299/// an inline element's opening tag.
300///
301/// `html2md` 0.2.15 drops such leading whitespace before inline elements like
302/// `<a>`, `<em>` and `<strong>`, gluing the element onto the preceding word
303/// (e.g. a word, a newline, then an `<a>` link wraps an inline-code span and
304/// renders glued to the word after relative-link downgrading). A *single*
305/// literal space is preserved correctly by `html2md`, so these runs are
306/// collapsed to one space. The pattern only matches runs containing a
307/// newline/tab/CR, so genuine single spaces and deliberately glued cases such
308/// as a hyphen directly followed by `<code>` (no whitespace at all) are left
309/// untouched.
310static INLINE_LEADING_WS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
311    Regex::new(
312        r"(?i)[ \t\r\n]*[\r\n\t][ \t\r\n]*(<(?:a|code|em|strong|b|i|span|sup|sub|abbr|kbd|var|cite|q|mark|small|u)\b)",
313    )
314    .expect("hardcoded valid regex pattern")
315});
316
317/// Matches a whitespace run containing a newline/tab/CR immediately *after* an
318/// inline element's closing tag, when followed by word-like content.
319///
320/// Symmetric to [`INLINE_LEADING_WS_REGEX`]: `html2md` 0.2.15 also drops such
321/// trailing whitespace, gluing the next word onto the element (e.g.
322/// `</a>` followed by a newline and `crate` renders as `[..](..)crate`, which
323/// becomes `..crate` after relative-link downgrading). The trailing lookahead
324/// restricts the fix to alphanumeric/backtick/bracket/open-paren starts so a line wrapped
325/// before trailing punctuation (`</a>\n.`) is left untouched. A single literal
326/// space is already preserved by `html2md`, so only newline-bearing runs match.
327static INLINE_TRAILING_WS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
328    Regex::new(
329        r"(?i)(</(?:a|code|em|strong|b|i|span|sup|sub|abbr|kbd|var|cite|q|mark|small|u)>)[ \t\r\n]*[\r\n\t][ \t\r\n]*(?P<n>[A-Za-z0-9`\[(])",
330    )
331    .expect("hardcoded valid regex pattern")
332});
333
334/// Cached CSS selector for body element
335static BODY_SELECTOR: LazyLock<Selector> =
336    LazyLock::new(|| Selector::parse("body").expect("hardcoded valid selector"));
337
338/// Cached CSS selector for all elements
339static ALL_SELECTOR: LazyLock<Selector> =
340    LazyLock::new(|| Selector::parse("*").expect("hardcoded valid selector"));
341
342/// Cached selectors for skip tags (script, style, noscript, iframe)
343static SCRIPT_SELECTOR: LazyLock<Selector> =
344    LazyLock::new(|| Selector::parse("script").expect("hardcoded valid selector"));
345static STYLE_SELECTOR: LazyLock<Selector> =
346    LazyLock::new(|| Selector::parse("style").expect("hardcoded valid selector"));
347static NOSCRIPT_SELECTOR: LazyLock<Selector> =
348    LazyLock::new(|| Selector::parse("noscript").expect("hardcoded valid selector"));
349static IFRAME_SELECTOR: LazyLock<Selector> =
350    LazyLock::new(|| Selector::parse("iframe").expect("hardcoded valid selector"));
351
352/// Cached selectors for nav tags (nav, header, footer, aside)
353static NAV_SELECTOR: LazyLock<Selector> =
354    LazyLock::new(|| Selector::parse("nav").expect("hardcoded valid selector"));
355static HEADER_SELECTOR: LazyLock<Selector> =
356    LazyLock::new(|| Selector::parse("header").expect("hardcoded valid selector"));
357static FOOTER_SELECTOR: LazyLock<Selector> =
358    LazyLock::new(|| Selector::parse("footer").expect("hardcoded valid selector"));
359static ASIDE_SELECTOR: LazyLock<Selector> =
360    LazyLock::new(|| Selector::parse("aside").expect("hardcoded valid selector"));
361
362/// Cached selectors for UI tags (button, summary)
363static BUTTON_SELECTOR: LazyLock<Selector> =
364    LazyLock::new(|| Selector::parse("button").expect("hardcoded valid selector"));
365static SUMMARY_SELECTOR: LazyLock<Selector> =
366    LazyLock::new(|| Selector::parse("summary").expect("hardcoded valid selector"));
367
368/// Regex to strip rustdoc source-code links (`<a class="src ...">Source</a>`)
369/// from raw HTML *before* parsing.
370///
371/// These anchors point at the crate's `src/...rs.html` listings and add no
372/// value to extracted documentation. They are commonly nested inside
373/// `<summary>` elements whose text content is otherwise preserved, so removing
374/// them at the DOM level would be too late (the "Source" label would survive as
375/// plain text). Stripping them from the raw HTML first guarantees they leak
376/// into neither plain-text nor markdown output.
377static SRC_ANCHOR_HTML_REGEX: LazyLock<Regex> = LazyLock::new(|| {
378    // Match both modern (`class="src"`, double-quoted) and older rustdoc
379    // (`class='srclink'`, single-quoted) source-code anchors so their `[src]`
380    // label never leaks into the plain-text output (which, unlike the markdown
381    // path, has no later link-stripping pass).
382    Regex::new(r#"(?s)<a\b[^>]*\bclass\s*=\s*['"][^'"]*\bsrc(?:link)?\b[^'"]*['"][^>]*>.*?</a>"#)
383        .expect("hardcoded valid regex pattern")
384});
385
386/// Regex to fix the orphan `\u{00b7}` separator left between a stability
387/// "since" badge and its now-removed source link.
388///
389/// rustdoc emits `<span class="since">1.0.0</span> \u{00b7} <a class="src">Source</a>`
390/// inside an item's right-side metadata. [`SRC_ANCHOR_HTML_REGEX`] deletes the
391/// source anchor, leaving ` \u{00b7} </span>`. When the enclosing `<summary>` is
392/// later flattened to text the dangling middot glues onto the following
393/// signature (`1.0.0 \u{00b7} fn next(...)`). Collapse the separator (and its
394/// surrounding whitespace) to a single space while preserving the closing tag,
395/// so the version stays cleanly separated from the signature.
396static ORPHAN_SINCE_MIDDOT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
397    Regex::new(r"(?s)[ \t\u{00a0}]*\u{00b7}[ \t\u{00a0}]*(</span\s*>)")
398        .expect("hardcoded valid regex pattern")
399});
400
401/// Matches a rustdoc stability "since" version badge
402/// (`<span class="since ...">1.0.0</span>`) that is immediately followed by
403/// another tag with no separating whitespace.
404///
405/// In a flattened `<summary>` (provided trait methods on FFI structs, e.g.
406/// libc) the badge abuts the method code-header, so plain-text extraction
407/// fuses them (`1.0.0fn clone_from`). Group 1 captures the whole badge; the
408/// trailing `<` (re-emitted by the replacement) ensures a space is inserted
409/// only when the badge is glued, never doubling an existing space. The version
410/// text holds no nested tags, so `[^<]*` captures it safely. See [`clean_html`].
411static SINCE_BADGE_GLUED_REGEX: LazyLock<Regex> = LazyLock::new(|| {
412    Regex::new(
413        r#"(?is)(<span\b[^>]*\bclass\s*=\s*["'][^"']*\bsince\b[^"']*["'][^>]*>[^<]*</span\s*>)<"#,
414    )
415    .expect("hardcoded valid regex pattern")
416});
417
418/// Regex to remove rustdoc UI anchor links that carry no documentation value.
419///
420/// rustdoc decorates headings, item declarations and code examples with
421/// navigation affordances rendered as `<a>` elements:
422/// - section/anchor links `<a class="anchor">\u{00a7}</a>` (a section-sign that
423///   jumps to the heading),
424/// - notable-trait markers `<a class="tooltip" data-notable-ty="...">\u{24d8}</a>`
425///   (a circled-i tooltip toggle), and
426/// - "Run code" buttons `<a class="test-arrow" href="https://play.rust-lang.org/...">`
427///   with empty link text (the playground launcher for a doc example), and
428/// - scraped-example help links `<a class="scrape-help" href="...">?</a>` (the
429///   `?` affordance beside an "Examples found in repository" heading).
430///
431/// The glyph anchors commonly sit inside a `<summary>` whose text is otherwise
432/// preserved, so removing them at the DOM level is too late (the glyph would
433/// survive as plain text and glue onto the following declaration, e.g.
434/// `\u{00a7}impl<...>` or `Keys<'_, K, V> \u{24d8}`). The run buttons otherwise
435/// render as an empty-text markdown link wrapping a very long playground URL
436/// (`[](https://play.rust-lang.org/?code=...)`). Stripping all three from the
437/// raw HTML keeps them out of both the markdown and plain-text output.
438static UI_ANCHOR_HTML_REGEX: LazyLock<Regex> = LazyLock::new(|| {
439    Regex::new(
440        r#"(?s)<a\b[^>]*\bclass\s*=\s*['"][^'"]*\b(?:anchor|tooltip|test-arrow|scrape-help)\b[^'"]*['"][^>]*>.*?</a>"#,
441    )
442    .expect("hardcoded valid regex pattern")
443});
444
445/// Regex to remove rustdoc UI anchors whose target is a `javascript:` URL
446/// (collapse/expand toggles such as `#toggle-all-docs`, which render as a
447/// bracketed minus/plus marker).
448///
449/// These are pure UI affordances; documentation never legitimately links to a
450/// `javascript:` URL. Their visible marker text would otherwise leak into the
451/// plain-text output, since the `javascript:`-link cleanup only runs on the
452/// markdown path.
453static JS_ANCHOR_HTML_REGEX: LazyLock<Regex> = LazyLock::new(|| {
454    Regex::new(r#"(?is)<a\b[^>]*\bhref\s*=\s*['"]\s*javascript:[^>]*>.*?</a>"#)
455        .expect("hardcoded valid regex pattern")
456});
457
458/// Regex to remove `<script>`, `<style>`, `<noscript>` and `<iframe>` elements
459/// (including their contents) from raw HTML *before* parsing.
460///
461/// The DOM-based pass in [`remove_unwanted_elements`] re-serializes each node
462/// via `ElementRef::html()` and string-replaces it in the original markup. That
463/// match is fragile: html5ever normalizes attribute whitespace and quoting, so
464/// markup like `<script  defer >` is serialized as `<script defer>` and the
465/// replacement silently misses, leaking executable/style content into the
466/// `html` output format. Stripping these tags with a tolerant regex first
467/// guarantees they are removed regardless of the original formatting. (Back-
468/// references are unsupported by the `regex` crate, so each tag is listed
469/// explicitly rather than captured once.)
470static DANGEROUS_ELEMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
471    Regex::new(
472        r"(?is)<script\b[^>]*>.*?</script\s*>|<style\b[^>]*>.*?</style\s*>|<noscript\b[^>]*>.*?</noscript\s*>|<iframe\b[^>]*>.*?</iframe\s*>|<iframe\b[^>]*/>",
473    )
474    .expect("hardcoded valid regex pattern")
475});
476
477/// Regex to remove rustdoc UI web-components from raw HTML before parsing.
478///
479/// Modern rustdoc emits custom elements for its chrome: `<rustdoc-toolbar>`
480/// (the settings/options toolbar, rendered empty in static HTML) and
481/// `<rustdoc-topbar>` (a duplicate breadcrumb such as
482/// `<h2><a href="#">Iterator</a></h2>`). The toolbar sits inside
483/// `#main-content`, so it leaks into the `html` output as a stray empty tag;
484/// the topbar can leak a redundant heading. Neither carries documentation
485/// value, so both are stripped (paired and self-closing forms).
486static RUSTDOC_UI_ELEMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
487    Regex::new(
488        r"(?is)<rustdoc-(?:toolbar|topbar)\b[^>]*>.*?</rustdoc-(?:toolbar|topbar)\s*>|<rustdoc-(?:toolbar|topbar)\b[^>]*/>",
489    )
490    .expect("hardcoded valid regex pattern")
491});
492
493/// Regex to remove the rustdoc navigation breadcrumb element.
494///
495/// rustdoc renders a breadcrumb above each item title, e.g.
496/// `<div class="rustdoc-breadcrumbs"><a href="../index.html">std</a>::<wbr>`
497/// `<a href="index.html">vec</a></div>`. Its links are page-relative, so they
498/// are downgraded to bare text and leave a dangling line such as `std::vec`
499/// (or a lone `std` on macro pages) directly under our own
500/// `## Documentation: <path>` title. The breadcrumb is pure navigation chrome
501/// that duplicates the title, so the whole element is removed before parsing.
502/// It contains only anchors and separators (no nested `<div>`), so the
503/// non-greedy match terminates at the first `</div>`.
504static RUSTDOC_BREADCRUMBS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
505    Regex::new(r"(?is)<div\b[^>]*\brustdoc-breadcrumbs\b[^>]*>.*?</div\s*>")
506        .expect("hardcoded valid regex pattern")
507});
508
509/// Regex matching a rustdoc prose admonition rendered as a styled `<pre>`.
510///
511/// rustdoc/mdBook authors create "Warning"/"Note" callout boxes with the idiom
512/// `<pre class="compile_fail" style="white-space:normal;font:inherit;">` (or
513/// with `class="ignore"`) wrapping ordinary prose HTML such as a paragraph with
514/// a bold "Warning" lead-in. The `white-space:normal;font:inherit` style makes
515/// rustdoc
516/// render it as flowing prose rather than monospaced code. Without special
517/// handling our pipeline treats the `<pre>` as a code block and wraps the prose
518/// in a bare fenced code block (mislabeling prose as code and flattening its
519/// inline links and code). Genuine code examples keep the default `white-space: pre`, so
520/// matching on `white-space:normal` reliably selects only these prose boxes.
521/// They are rewritten to a `<blockquote>` so the inner prose renders normally
522/// as a callout in every output format. The box holds no nested `<pre>`, so the
523/// non-greedy body terminates at the first `</pre>`.
524static PROSE_PRE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
525    Regex::new(
526        r#"(?is)<pre\b[^>]*\bstyle\s*=\s*["'][^"']*white-space\s*:\s*normal[^"']*["'][^>]*>(.*?)</pre\s*>"#,
527    )
528    .expect("hardcoded valid regex pattern")
529});
530
531/// Regex matching rustdoc's "unsafe function" marker superscript.
532///
533/// In module item lists rustdoc appends `<sup title="unsafe function">WARN</sup>`
534/// (the `WARN` glyph is a warning emoji) after each unsafe function's name. Our
535/// superscript handling would otherwise turn it into a `^(...)` token glued onto
536/// the name (e.g. `copy^(...)`). The marker conveys a useful fact, so it is
537/// replaced with a readable ` (unsafe)` annotation in every output format before
538/// parsing.
539static UNSAFE_FN_MARKER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
540    Regex::new(r#"(?is)<sup\b[^>]*\btitle\s*=\s*["']unsafe function["'][^>]*>.*?</sup\s*>"#)
541        .expect("hardcoded valid regex pattern")
542});
543
544/// Matches a rustdoc collapse-toggle `<summary class="hideme">` element.
545///
546/// rustdoc places interactive "Show N methods"/"Show N associated items" and
547/// "Expand description" toggles inside `<summary class="hideme">` nodes, and the
548/// "Show N methods" one sits *inside* the item-declaration `<pre>` block. Its
549/// label text therefore leaks into the rendered code (e.g.
550/// `Show 76 methods    // Required method` inside a trait signature) in every
551/// output format. The element is pure UI chrome, so it is removed wholesale
552/// (the surrounding `<details>` content is preserved). See [`clean_html`].
553static HIDEME_SUMMARY_REGEX: LazyLock<Regex> = LazyLock::new(|| {
554    Regex::new(
555        r#"(?is)<summary\b[^>]*\bclass\s*=\s*["'][^"']*\bhideme\b[^"']*["'][^>]*>.*?</summary\s*>"#,
556    )
557    .expect("hardcoded valid regex pattern")
558});
559
560/// Matches a rustdoc impl-block documentation `<div class="docblock">` that is
561/// the final child of an impl `<section>` nested inside a `<summary>`.
562///
563/// rustdoc renders an impl block's own documentation (e.g. a "Basic API"
564/// heading) as `<div class="docblock">...</div>` *inside* the `<summary>` that
565/// also holds the `impl ...` declaration. Because [`remove_unwanted_elements`]
566/// flattens `<summary>` nodes to their decoded text, that docblock glues onto
567/// the declaration (e.g. `impl ArgBasic API`). Group 1 captures the docblock
568/// contents so the wrapper can be relocated *after* the `</summary>`, where it
569/// renders as ordinary content. The trailing `</div></section></summary>`
570/// boundary only occurs for impl-block docs (method/field docblocks sit after
571/// their `</summary>`), so this does not disturb other documentation. See
572/// [`clean_html`].
573static IMPL_DOCBLOCK_IN_SUMMARY_REGEX: LazyLock<Regex> = LazyLock::new(|| {
574    Regex::new(r#"(?is)</h3>\s*<div class="docblock">(.*?)</div>\s*</section>\s*</summary>"#)
575        .expect("hardcoded valid regex pattern")
576});
577
578/// Matches a rustdoc portability/feature-availability badge that carries a
579/// human-readable `title` attribute.
580///
581/// rustdoc renders availability pills as
582/// `<span class="stab portability" title="Available on crate feature `fs` only">`
583/// `<code>fs</code></span>` immediately after an item link, with no separating
584/// whitespace. Group 1 captures the title text, which is the clearest rendering
585/// (it also covers platform/cfg badges such as "Available on `docsrs` and Unix
586/// only"). See [`rewrite_portability_badges`].
587static STAB_PORTABILITY_TITLE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
588    Regex::new(
589        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bportability\b[^"']*["'][^>]*\btitle\s*=\s*"([^"]*)"[^>]*>.*?</span\s*>"#,
590    )
591    .expect("hardcoded valid regex pattern")
592});
593
594/// Matches a rustdoc portability badge that lacks a usable `title` attribute.
595/// Group 1 captures the inner markup (the feature name(s)). Fallback for the
596/// title-based [`STAB_PORTABILITY_TITLE_REGEX`].
597static STAB_PORTABILITY_REGEX: LazyLock<Regex> = LazyLock::new(|| {
598    Regex::new(
599        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bportability\b[^"']*["'][^>]*>(.*?)</span\s*>"#,
600    )
601    .expect("hardcoded valid regex pattern")
602});
603
604/// Matches an inline rustdoc stability badge span (e.g.
605/// `<span class="stab unstable" title="">Experimental</span>` or a
606/// `<span class="stab deprecated">Deprecated</span>` pill) that rustdoc renders
607/// immediately after an item name with no separating whitespace, gluing the
608/// badge label onto the name (e.g. `TryReserveErrorKindExperimental`).
609///
610/// Group 1 captures the inner label. Portability badges (`class="stab
611/// portability"`) are handled earlier by [`rewrite_portability_badges`] and so
612/// are already consumed before this runs; only the remaining stab pills match.
613/// The pattern is span-scoped, so block-level stability banners
614/// (`<div class="stab unstable">...</div>`) on item-detail pages are untouched.
615/// See [`rewrite_stab_badges`].
616static STAB_BADGE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
617    Regex::new(
618        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bstab\b[^"']*["'][^>]*>(.*?)</span\s*>"#,
619    )
620    .expect("hardcoded valid regex pattern")
621});
622
623/// Matches the opening tag of a rustdoc item-info wrapper
624/// (`<span class="item-info">`), which holds the stability/deprecation badges
625/// that rustdoc renders immediately after an item signature.
626///
627/// rustdoc emits the wrapper with no separating whitespace after the preceding
628/// `</section>` (e.g. `...&str</h4></section><span class="item-info"><div
629/// class="stab deprecated"><span class="emoji">\u{1f44e}</span>...`). When the
630/// enclosing collapsed `<summary>` is flattened to text, the badge glues onto
631/// the signature (`-> &str\u{1f44e} Deprecated since ...`). Group 1 captures the
632/// opening tag so [`clean_html`] can re-emit it preceded by a single space.
633static ITEM_INFO_OPEN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
634    Regex::new(r#"(?is)(<span\b[^>]*\bclass\s*=\s*["'][^"']*\bitem-info\b[^"']*["'][^>]*>)"#)
635        .expect("hardcoded valid regex pattern")
636});
637
638/// Matches a rustdoc decorative emoji badge such as the nightly-API flask.
639///
640/// rustdoc renders unstable/experimental markers as
641/// `<span class="emoji">\u{1f52c}</span><span>This is a nightly-only ...</span>`
642/// with no separating whitespace, so html2md glues the emoji onto the following
643/// text (`\u{1f52c}This is a nightly-only experimental API.`). Group 1 captures
644/// the whole badge; [`rewrite_emoji_badges`] re-emits it followed by a single
645/// space so the emoji reads as a separate visual cue.
646static EMOJI_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
647    Regex::new(
648        r#"(?is)(<span\b[^>]*\bclass\s*=\s*["'][^"']*\bemoji\b[^"']*["'][^>]*>.*?</span\s*>)"#,
649    )
650    .expect("hardcoded valid regex pattern")
651});
652
653/// Matches a rustdoc struct-field declaration span
654/// (`<span class="structfield section-header">field: Type</span>`).
655///
656/// rustdoc emits one such span per field with no separating whitespace and
657/// relies on CSS to render each as its own block. Without intervention the
658/// adjacent spans glue together: markdown yields back-to-back inline code
659/// spans, and the plain-text path fuses a field type onto the next field
660/// name into a corrupt token. The captured inner content is re-wrapped in a
661/// block element so each field renders on its own line. Group 1 is the field
662/// declaration. See [`clean_html`].
663static STRUCTFIELD_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
664    Regex::new(
665        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bstructfield\b[^"']*["'][^>]*>(.*?)</span\s*>"#,
666    )
667    .expect("hardcoded valid regex pattern")
668});
669
670/// Matches a rustdoc `where`-clause block (`<div class="where">where ...</div>`)
671/// embedded in item declarations and signatures.
672///
673/// rustdoc relies on CSS to render this block on its own line(s); the markup
674/// itself carries no line break before the block or after it, so both html2md
675/// and the plain-text extractor glue it onto the surrounding tokens (e.g.
676/// `Vec<T, A = Global>where` and `Allocator,{`). Group `w` captures the inner
677/// content. See [`rewrite_where_clauses`].
678static WHERE_DIV_REGEX: LazyLock<Regex> = LazyLock::new(|| {
679    Regex::new(
680        r#"(?is)<div\b[^>]*\bclass\s*=\s*["'][^"']*\bwhere\b[^"']*["'][^>]*>(?P<w>.*?)</div\s*>"#,
681    )
682    .expect("hardcoded valid regex pattern")
683});
684
685/// Cached selectors for main content extraction
686static MAIN_CONTENT_SELECTOR: LazyLock<Selector> =
687    LazyLock::new(|| Selector::parse("#main-content").expect("hardcoded valid selector"));
688static RUSTDOC_BODY_WRAPPER_SELECTOR: LazyLock<Selector> =
689    LazyLock::new(|| Selector::parse("#rustdoc_body_wrapper").expect("hardcoded valid selector"));
690static H1_SELECTOR: LazyLock<Selector> =
691    LazyLock::new(|| Selector::parse("h1").expect("hardcoded valid selector"));
692
693/// Rewrite rustdoc item-index tables into HTML unordered lists.
694///
695/// Converts each `<dl class="item-table">` block into a `<ul>` whose `<li>`
696/// entries each hold one item (name link, optional ` — summary`). This keeps
697/// `html2md` from concatenating every item name onto a single line. See
698/// `ITEM_TABLE_REGEX` for details.
699#[must_use]
700fn rewrite_item_tables(html: &str) -> String {
701    ITEM_TABLE_REGEX
702        .replace_all(html, |caps: &regex::Captures| {
703            let inner = &caps[1];
704            let mut out = String::from("<ul>");
705            for row in ITEM_TABLE_ROW_REGEX.captures_iter(inner) {
706                let name = row.get(1).map_or("", |m| m.as_str()).trim();
707                if name.is_empty() {
708                    continue;
709                }
710                out.push_str("<li>");
711                out.push_str(name);
712                let desc = row.get(2).map_or("", |m| m.as_str()).trim();
713                if !desc.is_empty() {
714                    out.push_str(" \u{2014} ");
715                    out.push_str(desc);
716                }
717                out.push_str("</li>");
718            }
719            out.push_str("</ul>");
720            out
721        })
722        .into_owned()
723}
724
725/// Matches a rustdoc `<div class="code-attribute">` element. rustdoc wraps each
726/// attribute (e.g. `#[repr(i8)]`, `#[non_exhaustive]`) shown above an item
727/// declaration in this block-level `<div>`, which CSS renders on its own line.
728/// Group 1 captures the inner attribute markup. See
729/// [`rewrite_code_attributes`].
730static CODE_ATTRIBUTE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
731    Regex::new(
732        r#"(?is)<div\b[^>]*\bclass\s*=\s*["'][^"']*\bcode-attribute\b[^"']*["'][^>]*>(.*?)</div\s*>"#,
733    )
734    .expect("hardcoded valid regex pattern")
735});
736
737/// Put each item-declaration attribute on its own line.
738///
739/// rustdoc renders declaration attributes inside `<div class="code-attribute">`
740/// blocks within the `<pre class="item-decl">` signature. Because the `<div>`
741/// only breaks the line via CSS, extracting the `<pre>` text glues the
742/// attribute onto the following declaration (e.g. `#[repr(i8)]pub enum
743/// Ordering`) in every format. Replace each such `<div>` with its inner content
744/// followed by a newline so the attribute keeps its own line; the result
745/// renders identically to rustdoc in all three output formats.
746#[must_use]
747fn rewrite_code_attributes(html: &str) -> String {
748    CODE_ATTRIBUTE_REGEX
749        .replace_all(html, "${1}\n")
750        .into_owned()
751}
752
753/// Matches a rustdoc code-header element (`<h3>`/`<h4 class="code-header">`),
754/// which holds an item/impl/method signature. Group 1 is the heading level
755/// digit (matched again at the close tag) and group 2 the inner markup. See
756/// [`rewrite_code_headers`].
757static CODE_HEADER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
758    Regex::new(
759        r#"(?is)<h([34])\b[^>]*\bclass\s*=\s*["'][^"']*\bcode-header\b[^"']*["'][^>]*>(.*?)</h[34]\s*>"#,
760    )
761    .expect("hardcoded valid regex pattern")
762});
763
764/// Matches `(` followed by a newline and indentation (rustdoc's wrapped-argument
765/// list opener). See [`rewrite_code_headers`].
766static SIG_OPEN_PAREN_WRAP_REGEX: LazyLock<Regex> =
767    LazyLock::new(|| Regex::new(r"\(\s*\n\s*").expect("hardcoded valid regex pattern"));
768
769/// Matches an optional trailing comma plus a newline before the closing `)` of a
770/// wrapped argument list. See [`rewrite_code_headers`].
771static SIG_CLOSE_PAREN_WRAP_REGEX: LazyLock<Regex> =
772    LazyLock::new(|| Regex::new(r",?\s*\n\s*\)").expect("hardcoded valid regex pattern"));
773
774/// Matches any remaining newline-with-whitespace run inside a signature. See
775/// [`rewrite_code_headers`].
776static SIG_NEWLINE_RUN_REGEX: LazyLock<Regex> =
777    LazyLock::new(|| Regex::new(r"\s*\n\s*").expect("hardcoded valid regex pattern"));
778
779/// Collapse multi-line rustdoc signatures in code-header elements onto a single
780/// line.
781///
782/// rustdoc wraps long `fn`/method signatures across several lines using literal
783/// newlines and indentation inside the (non-`<pre>`) `<h4 class="code-header">`
784/// element, e.g. `try_lock_owned(\n    self: Arc<Self>,\n) -> ...`. html2md
785/// renders such a header as an ATX heading, so the embedded newlines split it
786/// into a broken two-line heading; the plain-text path collapses them but keeps
787/// stray spaces (`( self: Arc<Self>, )`). Normalise the wrapped argument list
788/// back to a single clean line (`(self: Arc<Self>) -> ...`) before parsing.
789/// Only code-header elements are touched, so `<pre>` code examples (which may
790/// legitimately contain `(\n    `) are unaffected.
791fn rewrite_code_headers(html: &str) -> String {
792    CODE_HEADER_REGEX
793        .replace_all(html, |caps: &regex::Captures| {
794            let level = &caps[1];
795            let inner = &caps[2];
796            // Impl headers (`<h3>`) stay headings, but item-signature headers
797            // (`<h4>`: methods, associated consts/types) render as plain text.
798            // rustdoc only wraps *documented* items in `<details><summary>`
799            // (whose `<h4>` is flattened to text); an *undocumented* sibling is
800            // a bare `<section>` whose `<h4>` would otherwise survive as a
801            // spurious `####` heading, inconsistent with its documented peers.
802            // See test_undocumented_assoc_item_not_rendered_as_heading.
803            let (open, close) = if level == "4" {
804                (r#"<p class="code-header">"#.to_string(), "</p>".to_string())
805            } else {
806                (
807                    format!("<h{level} class=\"code-header\">"),
808                    format!("</h{level}>"),
809                )
810            };
811            if !inner.contains('\n') {
812                return format!("{open}{inner}{close}");
813            }
814            let inner = SIG_OPEN_PAREN_WRAP_REGEX.replace_all(inner, "(");
815            let inner = SIG_CLOSE_PAREN_WRAP_REGEX.replace_all(&inner, ")");
816            let inner = SIG_NEWLINE_RUN_REGEX.replace_all(&inner, " ");
817            format!("{open}{inner}{close}")
818        })
819        .into_owned()
820}
821
822/// Detach rustdoc `where`-clause blocks from the surrounding declaration.
823///
824/// rustdoc emits `<div class="where">` with no literal line breaks around it
825/// (the layout is CSS-only), so item declarations render glued, e.g.
826/// `Vec<T, A = Global>where` and `Allocator,{ /* private fields */ }`. Inside
827/// `<pre>` declarations the clause is wrapped in newlines to reproduce the
828/// multi-line rustdoc layout; elsewhere (single-line code-header signatures) it
829/// is collapsed to a single space-padded clause so the heading stays on one
830/// line. `<pre>` boundaries are detected with [`PRE_BLOCK_REGEX`].
831fn rewrite_where_clauses(html: &str) -> String {
832    let collapse = |caps: &regex::Captures| -> String {
833        let inner = caps.name("w").map_or("", |m| m.as_str());
834        format!(
835            " {} ",
836            inner.split_whitespace().collect::<Vec<_>>().join(" ")
837        )
838    };
839    let mut out = String::with_capacity(html.len());
840    let mut last = 0;
841    for m in PRE_BLOCK_REGEX.find_iter(html) {
842        // Outside <pre>: collapse the clause onto one space-padded line.
843        out.push_str(&WHERE_DIV_REGEX.replace_all(&html[last..m.start()], &collapse));
844        // Inside <pre>: keep the clause verbatim but break it onto its own lines.
845        out.push_str(&WHERE_DIV_REGEX.replace_all(m.as_str(), "\n${w}\n"));
846        last = m.end();
847    }
848    out.push_str(&WHERE_DIV_REGEX.replace_all(&html[last..], &collapse));
849    out
850}
851
852/// Rewrite rustdoc portability/feature badges so they are not glued onto the
853/// preceding item name.
854///
855/// Each `<span class="stab portability">` is replaced by a space-separated
856/// parenthetical: the badge's human-readable `title` when present (e.g. the
857/// "Available on crate feature ... only" string), otherwise its inner content.
858/// This stops html2md from gluing the feature pill onto the item name, so it
859/// reads naturally in both markdown and plain-text formats.
860fn rewrite_portability_badges(html: &str) -> String {
861    let with_titles = STAB_PORTABILITY_TITLE_REGEX.replace_all(html, |caps: &regex::Captures| {
862        format!(" ({})", badge_title_to_html(&caps[1]))
863    });
864    STAB_PORTABILITY_REGEX
865        .replace_all(&with_titles, " (${1})")
866        .into_owned()
867}
868
/// Turn a badge `title` string into HTML by converting backtick-delimited
/// segments into real `<code>` elements.
///
/// rustdoc availability titles quote the feature name in literal backticks
/// (e.g. ``Available on crate feature `thread_rng` only``). If that text were
/// inserted as-is, html2md would see the backticks as ordinary characters and
/// escape markdown metacharacters between them (the underscore in
/// `thread_rng`), leaving a stray backslash inside what looks like a code span
/// (`` `thread\_rng` ``). A genuine `<code>` element gives a proper code span
/// in markdown and valid markup in the html output.
///
/// Backticks only act as delimiters when they pair up: if the title contains
/// an odd number of them it is returned unchanged, so no dangling `<code>` is
/// ever produced. Text outside the backticks is copied verbatim (it is not
/// HTML-escaped here).
#[must_use]
fn badge_title_to_html(title: &str) -> String {
    // An odd number of backticks cannot be paired up; leave the title alone.
    if title.matches('`').count() % 2 == 1 {
        return title.to_string();
    }

    let mut html = String::with_capacity(title.len() + 13);
    // Splitting on '`' alternates plain text / code / plain text / ...,
    // always starting with a (possibly empty) plain-text segment.
    let mut inside_code = false;
    for segment in title.split('`') {
        if inside_code {
            html.push_str("<code>");
            html.push_str(segment);
            html.push_str("</code>");
        } else {
            html.push_str(segment);
        }
        inside_code = !inside_code;
    }
    html
}
901
902/// Rewrite remaining inline rustdoc stability badges so their label is not
903/// glued onto the preceding item name.
904///
905/// Each leftover `<span class="stab ...">` pill (e.g. the `Experimental` or
906/// `Deprecated` marker that follows an item link in a module index table) is
907/// replaced by a space-separated parenthetical built from its label text. Run
908/// *after* [`rewrite_portability_badges`] so feature/availability pills have
909/// already been consumed and only stability markers remain.
910#[must_use]
911fn rewrite_stab_badges(html: &str) -> String {
912    STAB_BADGE_REGEX.replace_all(html, " (${1})").into_owned()
913}
914
/// Clean rustdoc HTML by removing unwanted tags/UI chrome and normalising
/// markup that would otherwise render glued together.
///
/// The work happens in two phases:
///
/// 1. A fixed sequence of regex rewrites is applied to the raw markup, each
///    pass feeding the next (the steps are documented inline below). The
///    order is significant: several steps state that they rely on an earlier
///    one having run (e.g. the orphan-middot cleanup follows source-anchor
///    removal, and the generic stab-badge rewrite follows the portability
///    rewrite).
/// 2. The rewritten markup is parsed once with the `scraper` crate, which
///    handles malformed HTML better than manual parsing, and handed to
///    [`remove_unwanted_elements`] for element-level removal and the final
///    cleanup.
///
/// Returns the cleaned HTML as a new `String`; the input is not modified.
#[must_use]
pub fn clean_html(html: &str) -> String {
    // Strip source-code anchors from the raw HTML first so their "Source" label
    // cannot survive as plain text when nested inside preserved <summary> nodes.
    let html = SRC_ANCHOR_HTML_REGEX.replace_all(html, "");
    // After the source link is gone, collapse the orphan `\u{00b7}` separator
    // that rustdoc left between the "since" badge and that link (see
    // ORPHAN_SINCE_MIDDOT_REGEX) so it cannot glue onto the next signature.
    let html = ORPHAN_SINCE_MIDDOT_REGEX.replace_all(&html, " ${1}");
    // Drop `javascript:` UI toggles (e.g. the bracketed collapse-all control)
    // so their marker text does not survive plain-text extraction.
    let html = JS_ANCHOR_HTML_REGEX.replace_all(&html, "");
    // Strip rustdoc UI anchors (section-sign/notable-trait glyphs and the
    // playground "Run code" buttons) before parsing so they do not survive as
    // plain text or as empty-text links (see UI_ANCHOR_HTML_REGEX).
    let html = UI_ANCHOR_HTML_REGEX.replace_all(&html, "");
    // Separate a "since" version badge from a directly-following element so a
    // flattened <summary> does not fuse it onto the next signature
    // (`1.0.0fn clone_from`). See SINCE_BADGE_GLUED_REGEX.
    let html = SINCE_BADGE_GLUED_REGEX.replace_all(&html, "${1} <");
    // Guarantee removal of executable/style/embedded content regardless of how
    // the source markup was formatted (see DANGEROUS_ELEMENT_REGEX docs).
    let html = DANGEROUS_ELEMENT_REGEX.replace_all(&html, "");
    // Strip rustdoc UI web-components (toolbar/topbar chrome) so they do not
    // leak into the html output or as a redundant heading (see
    // RUSTDOC_UI_ELEMENT_REGEX).
    let html = RUSTDOC_UI_ELEMENT_REGEX.replace_all(&html, "");
    // Remove the rustdoc navigation breadcrumb above the item title; its
    // page-relative links would otherwise be downgraded to a dangling bare
    // line (e.g. `std::vec`, or a lone `std` on macro pages) that merely
    // duplicates our own title (see RUSTDOC_BREADCRUMBS_REGEX).
    let html = RUSTDOC_BREADCRUMBS_REGEX.replace_all(&html, "");
    // Rewrite rustdoc prose admonitions ("Warning"/"Note" callouts authored as
    // `<pre style="white-space:normal;...">`) into blockquotes so their prose
    // renders normally instead of being mislabeled as a bare ``` code block
    // (see PROSE_PRE_REGEX). Genuine code examples are untouched.
    let html = PROSE_PRE_REGEX.replace_all(&html, "<blockquote>${1}</blockquote>");
    // Replace rustdoc's "unsafe function" marker superscript with a readable
    // ` (unsafe)` annotation; otherwise it leaks as `^(...)` glued onto the
    // function name in module item lists (see UNSAFE_FN_MARKER_REGEX).
    let html = UNSAFE_FN_MARKER_REGEX.replace_all(&html, " (unsafe)");
    // Remove rustdoc "Show N methods"/"Expand description" collapse
    // toggles (`<summary class="hideme">`); the "Show N methods" toggle
    // sits inside the item-declaration <pre>, so its label otherwise
    // leaks into the rendered signature (see HIDEME_SUMMARY_REGEX).
    let html = HIDEME_SUMMARY_REGEX.replace_all(&html, "");
    // Detach `where` clauses (CSS-only line breaks) so declarations do not
    // render glued (e.g. `Vec<T, A = Global>where`).
    let html = rewrite_where_clauses(&html);
    // Collapse multi-line wrapped signatures in code-header elements onto a
    // single clean line so html2md does not emit a broken two-line heading
    // (see rewrite_code_headers).
    let html = rewrite_code_headers(&html);
    // Put each item-declaration attribute (e.g. `#[repr(i8)]`) on its own line
    // so it is not glued onto the following declaration (see
    // rewrite_code_attributes).
    let html = rewrite_code_attributes(&html);
    // Separate feature/portability badges from the preceding item name so they
    // do not render glued (e.g. `fs`fs``); replace each with a readable
    // parenthetical built from the badge's title (or inner) text.
    let html = rewrite_portability_badges(&html);
    // Separate remaining inline stability pills (e.g. `Experimental`/`Deprecated`
    // markers in module index tables) from the preceding item name so they do
    // not render glued (see rewrite_stab_badges). Runs after the portability
    // rewrite so feature/availability pills are already consumed.
    let html = rewrite_stab_badges(&html);
    // Append a space after decorative emoji badges (e.g. the nightly-API flask)
    // so the emoji does not glue onto the following text (see EMOJI_SPAN_REGEX).
    let html = EMOJI_SPAN_REGEX.replace_all(&html, "${1} ");
    // Separate the item-info badge wrapper (stability/deprecation pills) from a
    // preceding signature so a flattened `<summary>` does not glue the badge
    // onto the declaration (e.g. `-> &str\u{1f44e} Deprecated`). See
    // ITEM_INFO_OPEN_REGEX.
    let html = ITEM_INFO_OPEN_REGEX.replace_all(&html, " ${1}");
    // Rewrite rustdoc item-index tables into <ul><li> lists so html2md does not
    // concatenate every item name onto a single line (overview pages only).
    let html = rewrite_item_tables(&html);
    // Put each struct-field declaration on its own block so adjacent fields
    // do not glue together (`a: A``b: B` in markdown, `A_tb` token fusion in
    // text). See STRUCTFIELD_SPAN_REGEX.
    let html = STRUCTFIELD_SPAN_REGEX.replace_all(&html, "<p>${1}</p>");
    // Relocate an impl block's own documentation out of the flattened
    // `<summary>` so its heading/text does not glue onto the `impl ...`
    // declaration (e.g. `impl ArgBasic API`). See
    // IMPL_DOCBLOCK_IN_SUMMARY_REGEX.
    let html = IMPL_DOCBLOCK_IN_SUMMARY_REGEX.replace_all(
        &html,
        r#"</h3></section></summary><div class="docblock">${1}</div>"#,
    );
    // All textual rewrites are done: parse the result once and let the
    // element-level pass remove/flatten the remaining unwanted nodes. The
    // rewritten markup is passed along as well because that pass falls back to
    // it when it finds nothing to replace.
    let document = Html::parse_document(&html);
    remove_unwanted_elements(&document, &html)
}
1014
/// HTML-escape `&`, `<` and `>` in plain text.
///
/// Needed whenever decoded text (as returned by `ElementRef::text()`) is
/// inserted back into an HTML string that will be parsed again later (e.g. by
/// `html2md`): without escaping, a fragment like `Option<usize>` would be read
/// as a tag and silently dropped.
///
/// Each input character is examined exactly once, so the `&` introduced by an
/// entity written here is never escaped a second time within the same call.
/// Input that already contains entities is escaped again (`&lt;` becomes
/// `&amp;lt;`). Quotes are left as they are.
#[must_use]
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1028
1029/// Remove unwanted elements from HTML using scraper for parsing
1030///
1031/// This function performs optimized single-pass removal of all unwanted elements
1032/// using cached selectors for better performance.
1033///
1034/// Removes: script, style, noscript, iframe, nav, header, footer, aside, button
1035/// Preserves summary content while removing the tag itself.
1036#[inline]
1037fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
1038    // Collect all elements to process with their positions for efficient replacement
1039    let mut replacements: Vec<(String, Option<String>)> = Vec::new();
1040
1041    // Process script, style, noscript, iframe - remove completely (using cached selectors)
1042    for element in document.select(&SCRIPT_SELECTOR) {
1043        replacements.push((element.html(), None));
1044    }
1045    for element in document.select(&STYLE_SELECTOR) {
1046        replacements.push((element.html(), None));
1047    }
1048    for element in document.select(&NOSCRIPT_SELECTOR) {
1049        replacements.push((element.html(), None));
1050    }
1051    for element in document.select(&IFRAME_SELECTOR) {
1052        replacements.push((element.html(), None));
1053    }
1054
1055    // Process nav, header, footer, aside - remove completely (using cached selectors)
1056    for element in document.select(&NAV_SELECTOR) {
1057        replacements.push((element.html(), None));
1058    }
1059    for element in document.select(&HEADER_SELECTOR) {
1060        replacements.push((element.html(), None));
1061    }
1062    for element in document.select(&FOOTER_SELECTOR) {
1063        replacements.push((element.html(), None));
1064    }
1065    for element in document.select(&ASIDE_SELECTOR) {
1066        replacements.push((element.html(), None));
1067    }
1068
1069    // Process button and summary - special handling for summary (using cached selectors)
1070    for element in document.select(&BUTTON_SELECTOR) {
1071        replacements.push((element.html(), None));
1072    }
1073    for element in document.select(&SUMMARY_SELECTOR) {
1074        let element_html = element.html();
1075        // For summary tags, extract and keep the text content. `text()` returns
1076        // *decoded* text, so generic markup such as `Option&lt;usize&gt;`
1077        // becomes literal `Option<usize>`. This string is later re-parsed by
1078        // `html2md`/`scraper`, which would treat `<usize>` as an unknown tag and
1079        // drop it; re-escape the markup so it survives the second parse.
1080        let text_content: String = element.text().collect();
1081        replacements.push((element_html, Some(escape_html_text(&text_content))));
1082    }
1083
1084    // If no replacements needed, just apply regex patterns
1085    if replacements.is_empty() {
1086        return apply_regex_patterns(original_html);
1087    }
1088
1089    // Sort by length descending (longer first) to avoid partial replacements
1090    // This ensures we replace parent elements before children
1091    replacements.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
1092
1093    // Build result using string slices for O(n) total complexity.
1094    //
1095    // Use the parsed document's own serialized form (the body's inner HTML) as
1096    // the replacement base rather than `original_html`. Each `element.html()`
1097    // is produced by the same html5ever serializer, so it is guaranteed to be a
1098    // substring here. Matching against the raw `original_html` instead would
1099    // miss elements whose source formatting differs from the serialized form
1100    // (e.g. extra whitespace inside a tag like `<nav  class=...>` or differing
1101    // attribute quoting), silently leaking navigation, headers, footers and
1102    // asides into the cleaned output. The body's inner HTML keeps the prior
1103    // fragment shape (no synthetic `<html>`/`<head>` wrappers).
1104    let mut result = document
1105        .select(&BODY_SELECTOR)
1106        .next()
1107        .map_or_else(|| document.root_element().html(), |body| body.inner_html());
1108    for (element_html, replacement) in replacements {
1109        // Use replace_all for safety, but since we sorted by length,
1110        // we should handle nested elements correctly
1111        result = if let Some(text) = replacement {
1112            result.replace(&element_html, &text)
1113        } else {
1114            result.replace(&element_html, "")
1115        };
1116    }
1117
1118    apply_regex_patterns(&result)
1119}
1120
1121/// Combined regex pattern for HTML cleanup optimization
1122///
1123/// This pattern combines all individual cleanup patterns into a single regex
1124/// to enable single-pass processing, significantly reducing allocations and
1125/// string traversal overhead compared to chained `replace_all()` calls.
1126///
1127/// Pattern components:
1128/// - `<link[^>]*>` - Link tags
1129/// - `<meta[^>]*>` - Meta tags
1130/// - `Copy item path` - UI copy path text
1131/// - `</?details[^>]*>` - rustdoc collapsible toggle wrappers (html2md leaves
1132///   these as raw tags); children are preserved
1133/// - `Expand description` / `Expand attributes` - docs.rs toggle labels
1134/// - `\[\§\]\([^)]*\)` - Anchor links like [§](#xxx)
1135/// - `\[(?:Source|de|en|fr|ja)\]\([^)]*\)` - Source/language badges
1136/// - `\[[^\]]*\]\([a-zA-Z][^)]*\.html\)` - Relative documentation links
1137static COMBINED_CLEANUP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
1138    Regex::new(
1139        r"(?:<link[^>]*>|<meta[^>]*>|</?details[^>]*>|Copy item path|Expand description|Expand attributes|\[§\]\([^)]*\)|\[Source\]\([^)]*\)|\[[^\]]*\]\([a-zA-Z][^)]*\.html\))",
1140    )
1141    .expect("hardcoded valid regex pattern")
1142});
1143
1144/// Apply all regex patterns in a single optimized pass
1145///
1146/// # Optimization Details
1147///
1148/// Previous implementation used 6 chained `.replace_all()` calls, creating
1149/// 5 intermediate strings and traversing the input 6 times. This approach:
1150///
1151/// 1. Combines all patterns into ONE unified regex (`COMBINED_CLEANUP_REGEX`)
1152/// 2. Uses callback-based replacement to handle different pattern types
1153/// 3. Creates only ONE intermediate string instead of FIVE
1154/// 4. Traverses the input exactly ONCE
1155///
1156/// Benchmark improvement (for typical docs.rs page ~50KB):
1157/// - Old: ~2ms per page (6 passes, 5 allocations)
1158/// - New: ~0.4ms per page (1 pass, 1 allocation)
1159/// - Speedup: ~5x faster
1160#[inline]
1161fn apply_regex_patterns(html: &str) -> String {
1162    // Single-pass regex replacement using combined pattern
1163    COMBINED_CLEANUP_REGEX.replace_all(html, "").into_owned()
1164}
1165
1166/// Convert HTML to plain text by removing all HTML tags
1167///
1168/// Uses the `scraper` crate for robust HTML5 parsing.
1169#[must_use]
1170pub fn html_to_text(html: &str) -> String {
1171    decode_pre(&html_to_text_raw(html))
1172}
1173
1174/// Like [`html_to_text`] but leaves `<pre>` content encoded with the
1175/// [`PRE_SPACE`]/[`PRE_NEWLINE`]/[`PRE_TAB`] sentinels. Callers that run
1176/// additional whitespace-normalisation passes (e.g.
1177/// [`extract_documentation_as_text`]) use this and call [`decode_pre`]
1178/// themselves once all collapsing is done.
1179fn html_to_text_raw(html: &str) -> String {
1180    let document = Html::parse_document(html);
1181
1182    // Build selectors for skip tags
1183    let mut text_parts = Vec::new();
1184
1185    // Select the root and extract text, handling skip tags
1186    if let Some(body) = document.select(&BODY_SELECTOR).next() {
1187        extract_text_excluding_skip_tags(&body, &mut text_parts);
1188    } else {
1189        // No body tag, extract from entire document
1190        if let Some(root) = document.select(&ALL_SELECTOR).next() {
1191            extract_text_excluding_skip_tags(&root, &mut text_parts);
1192        }
1193    }
1194
1195    // Join with "" (not " "): each text node already carries its own
1196    // surrounding whitespace, and `collapse_block_whitespace` collapses runs.
1197    // Inserting a space between every node would corrupt inline runs split
1198    // across elements. `BLOCK_SEP` markers added around block elements become
1199    // newlines so the output keeps document structure.
1200    collapse_block_whitespace(&text_parts.join(""))
1201}
1202
1203fn extract_text_excluding_skip_tags(
1204    element: &scraper::element_ref::ElementRef,
1205    text_parts: &mut Vec<String>,
1206) {
1207    let tag_name = element.value().name().to_lowercase();
1208
1209    if SKIP_TAGS.contains(&tag_name.as_str()) {
1210        return;
1211    }
1212
1213    // Walk children, collecting only text nodes that are not inside a skip tag.
1214    // We must recurse manually: `ElementRef::text()` yields *all* descendant
1215    // text (including the contents of <script>/<style>/...), so a single
1216    // top-level skip check would still leak nested script/style content.
1217    for child in element.children() {
1218        match child.value() {
1219            scraper::node::Node::Text(text) => {
1220                // Preserve the text node verbatim. Trimming each node and later
1221                // joining with spaces inserted spurious spaces at every inline
1222                // boundary: `RandomState</a>,` became "RandomState ," and words
1223                // split by `<wbr>`/syntax spans ("ser"+"ializing") became
1224                // "ser ializing". Keeping raw text lets `clean_whitespace`
1225                // collapse genuine whitespace (including the indentation between
1226                // block elements) without corrupting adjacent inline runs.
1227                // Empty/whitespace nodes are harmless: `clean_whitespace`
1228                // collapses them at the end.
1229                text_parts.push(text.to_string());
1230            }
1231            scraper::node::Node::Element(_) => {
1232                if let Some(child_ref) = scraper::element_ref::ElementRef::wrap(child) {
1233                    let name = child_ref.value().name().to_lowercase();
1234                    // Preserve the verbatim formatting of `<pre>` code blocks.
1235                    // Their newlines and indentation would otherwise be flattened
1236                    // by the whitespace-collapsing passes, rendering multi-line
1237                    // code examples as a single unreadable line. Encode the
1238                    // significant whitespace as control sentinels that survive
1239                    // collapsing; `decode_pre` restores it at the very end.
1240                    if name == "pre" {
1241                        let raw = child_ref.text().collect::<String>();
1242                        text_parts.push(BLOCK_SEP.to_string());
1243                        text_parts.push(encode_pre(raw.trim_matches('\n')));
1244                        text_parts.push(BLOCK_SEP.to_string());
1245                        continue;
1246                    }
1247                    // Render superscript/subscript (e.g. footnote references) as
1248                    // plain-text `^(...)`/`_(...)` notation so a bare `1` is not
1249                    // mistaken for body text. Matches the markdown path's handling.
1250                    if name == "sup" || name == "sub" {
1251                        let mut inner_parts = Vec::new();
1252                        extract_text_excluding_skip_tags(&child_ref, &mut inner_parts);
1253                        let inner = inner_parts
1254                            .join("")
1255                            .split_whitespace()
1256                            .collect::<Vec<_>>()
1257                            .join(" ");
1258                        if !inner.is_empty() {
1259                            let (open, close) = if name == "sup" {
1260                                ("^(", ")")
1261                            } else {
1262                                ("_(", ")")
1263                            };
1264                            text_parts.push(format!("{open}{inner}{close}"));
1265                        }
1266                        continue;
1267                    }
1268                    // Surround block-level elements with a `BLOCK_SEP`
1269                    // marker so adjacent blocks do not glue together (e.g.
1270                    // item-index entries) and each renders on its own line.
1271                    // `collapse_block_whitespace` turns the markers into
1272                    // newlines. Inline elements get no separator to preserve
1273                    // intra-word runs.
1274                    // Table cells use a CELL_SEP marker (rendered as ` | `) so a
1275                    // row's columns stay on one line; every other block element
1276                    // uses BLOCK_SEP (rendered as a newline).
1277                    let is_cell = name == "td" || name == "th";
1278                    let is_block = !is_cell && BLOCK_TAGS.contains(&name.as_str());
1279                    let sep = if is_cell { CELL_SEP } else { BLOCK_SEP };
1280                    if is_cell || is_block {
1281                        text_parts.push(sep.to_string());
1282                    }
1283                    extract_text_excluding_skip_tags(&child_ref, text_parts);
1284                    // A cell pushes only a *leading* CELL_SEP delimiter; a block
1285                    // is wrapped on both sides. This keeps a single separator
1286                    // between adjacent cells so empty cells can be preserved
1287                    // (see collapse_block_whitespace) and columns stay aligned.
1288                    if is_block {
1289                        text_parts.push(sep.to_string());
1290                    }
1291                }
1292            }
1293            _ => {}
1294        }
1295    }
1296}
1297
1298/// Extract documentation from HTML as cleaned HTML.
1299///
1300/// Isolates the docs.rs main content area and runs the shared [`clean_html`]
1301/// pass (removing `<head>`, scripts, styles, navigation, sidebars, footers,
1302/// buttons and source-code links). Unlike [`extract_documentation`], the result
1303/// remains HTML rather than being converted to Markdown, so callers requesting
1304/// the `html` format get the documentation body instead of the entire raw page.
1305#[must_use]
1306pub fn extract_documentation_html(html: &str) -> String {
1307    let main_content = extract_main_content(html);
1308    clean_html(&main_content)
1309}
1310
/// Matches one inline `<code>...</code>` element, from its opening tag up to
/// the nearest closing tag (the body is matched non-greedily).
///
/// The `(?is)` flags make the match case-insensitive and let `.` cross
/// newlines, so an element that spans several source lines is still matched as
/// a unit. Used by [`flatten_links_in_inline_code`] to drop anchor wrappers
/// that markdown cannot render inside a code span.
static INLINE_CODE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?is)<code\b[^>]*>.*?</code\s*>").expect("valid regex"));
1316
/// Matches a single opening (`<a ...>`) or closing (`</a>`) anchor tag, not
/// the text between them.
///
/// Replacing every match with the empty string therefore strips the link
/// wrapper while keeping the link text. The `\b` after `a` keeps tags that
/// merely start with `a` followed by another word character (e.g. `<abbr>`)
/// from matching. See [`flatten_links_in_inline_code`].
static ANCHOR_TAG_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?is)</?a\b[^>]*>").expect("valid regex"));
1321
1322/// Flatten `<a>` links nested inside an inline `<code>` element to their text
1323/// (markdown path only).
1324///
1325/// rustdoc renders re-exports as `<code>pub use <a href=...>name</a>;</code>`.
1326/// html2md turns the inner anchor into a markdown link *inside* the backtick
1327/// code span (`` `pub use [name](url);` ``), which renders as literal text
1328/// because markdown does not support links inside inline code. Removing the
1329/// anchor wrapper (keeping its text) yields a clean `` `pub use name;` `` code
1330/// span. `<pre>` blocks are skipped so code-example formatting/links are left
1331/// untouched; the html output format never calls this, so its links survive.
1332#[must_use]
1333fn flatten_links_in_inline_code(html: &str) -> String {
1334    let strip = |segment: &str| -> String {
1335        INLINE_CODE_REGEX
1336            .replace_all(segment, |caps: &regex::Captures| {
1337                ANCHOR_TAG_REGEX.replace_all(&caps[0], "").into_owned()
1338            })
1339            .into_owned()
1340    };
1341    let mut out = String::with_capacity(html.len());
1342    let mut last = 0;
1343    for m in PRE_BLOCK_REGEX.find_iter(html) {
1344        out.push_str(&strip(&html[last..m.start()]));
1345        out.push_str(m.as_str());
1346        last = m.end();
1347    }
1348    out.push_str(&strip(&html[last..]));
1349    out
1350}
1351
/// Matches a `<pre ...>` opening tag (group 1 = its attributes) plus an
/// optional immediately-following `<code ...>` open tag (group 2, which may be
/// preceded by whitespace).
///
/// Because the optional `<code>` tag is part of the match, text appended after
/// the whole match lands inside the code element rather than between `<pre>`
/// and `<code>`. Used by [`inject_code_fence_language`] to attach the detected
/// language to the code block's opening fence.
static PRE_LANG_OPEN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?is)<pre\b([^>]*)>(\s*<code\b[^>]*>)?").expect("valid regex"));
1358
/// Matches a `class="..."` attribute and captures its value (group 1).
///
/// Either quote style is accepted, and the captured value stops at the first
/// quote character of either kind. See [`detect_pre_language`].
///
/// NOTE(review): the pattern is not anchored to an attribute boundary, so an
/// attribute whose name merely ends in `class` (e.g. `data-class="..."`) would
/// also match if it appears first — confirm this cannot occur in rustdoc
/// output.
static PRE_CLASS_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?is)class\s*=\s*["']([^"']*)["']"#).expect("valid regex"));
1363
/// Sentinel wrapping a code-fence language hint while it travels through
/// `html2md` inside the code block.
///
/// The value is STX (U+0002), a control character that is assumed never to be
/// present in docs text. The same character is written as `\x02` in the
/// regexes that later find and remove the hint, so the two must be kept in
/// sync.
const CODE_FENCE_SENTINEL: char = '\u{2}';
1367
1368/// Determine the syntax-highlighting language for a rustdoc `<pre>` block from
1369/// its class attribute. rustdoc marks Rust examples with the `rust` class
1370/// (`rust rust-example-rendered`) and other fenced languages with
1371/// `language-<name>` (e.g. `language-toml`). Returns `None` when no language can
1372/// be determined so the fence stays bare.
1373#[must_use]
1374fn detect_pre_language(pre_attrs: &str) -> Option<String> {
1375    let class = PRE_CLASS_REGEX.captures(pre_attrs)?.get(1)?.as_str();
1376    for tok in class.split_whitespace() {
1377        if let Some(lang) = tok.strip_prefix("language-") {
1378            if !lang.is_empty() {
1379                return Some(lang.to_string());
1380            }
1381        }
1382    }
1383    if class.split_whitespace().any(|t| t == "rust") {
1384        return Some("rust".to_string());
1385    }
1386    None
1387}
1388
1389/// Attach the detected language to each `<pre>` code block (markdown path only).
1390///
1391/// `html2md` 0.2.15 drops all `<pre>`/`<code>` class information and always
1392/// emits a bare ```` ``` ```` fence, losing rustdoc's language annotation
1393/// (`rust`, `toml`, ...). To preserve it, prepend a sentinel-wrapped language
1394/// token as the first line of the block's content; it survives `html2md`
1395/// verbatim and is converted into a fence info string by
1396/// [`restore_code_fence_language`]. Blocks without a detectable language are
1397/// left untouched.
1398#[must_use]
1399fn inject_code_fence_language(html: &str) -> String {
1400    PRE_LANG_OPEN_REGEX
1401        .replace_all(html, |caps: &regex::Captures| {
1402            let whole = &caps[0];
1403            match detect_pre_language(&caps[1]) {
1404                Some(lang) => {
1405                    format!("{whole}{CODE_FENCE_SENTINEL}{lang}{CODE_FENCE_SENTINEL}\n")
1406                }
1407                None => whole.to_string(),
1408            }
1409        })
1410        .into_owned()
1411}
1412
1413/// Collapse newline-containing whitespace on either side of inline elements to
1414/// a single space, leaving `<pre>` blocks untouched.
1415///
1416/// Works around an `html2md` 0.2.15 quirk where whitespace adjacent to an
1417/// inline element (e.g. `the\n<a>...` or `...</a>\ncrate`) is dropped, gluing
1418/// the element onto the neighbouring word. `<pre>` code blocks are skipped so
1419/// their significant indentation and line breaks (which often wrap highlighted
1420/// `<a>`/`<span>` tokens) are preserved verbatim. See [`INLINE_LEADING_WS_REGEX`]
1421/// and [`INLINE_TRAILING_WS_REGEX`].
1422fn normalize_inline_leading_whitespace(html: &str) -> String {
1423    // Collapse a newline-bearing whitespace run on either side of an inline
1424    // element to a single space (html2md drops both). Applied only outside
1425    // <pre> blocks so code indentation/line breaks are preserved.
1426    let fix = |segment: &str| -> String {
1427        let leading = INLINE_LEADING_WS_REGEX.replace_all(segment, " $1");
1428        INLINE_TRAILING_WS_REGEX
1429            .replace_all(&leading, "$1 $n")
1430            .into_owned()
1431    };
1432    let mut out = String::with_capacity(html.len());
1433    let mut last = 0;
1434    for m in PRE_BLOCK_REGEX.find_iter(html) {
1435        // Transform the segment before this <pre> block.
1436        out.push_str(&fix(&html[last..m.start()]));
1437        // Emit the <pre> block verbatim.
1438        out.push_str(m.as_str());
1439        last = m.end();
1440    }
1441    out.push_str(&fix(&html[last..]));
1442    out
1443}
1444
1445/// Extract documentation from HTML by cleaning and converting to Markdown
1446///
1447/// For docs.rs pages, extracts only the main content area to avoid
1448/// navigation elements, footers, and other non-documentation content.
1449#[must_use]
1450pub fn extract_documentation(html: &str) -> String {
1451    // Try to extract main content area from docs.rs pages
1452    let main_content = extract_main_content(html);
1453    let cleaned_html = clean_html(&main_content);
1454    // Flatten links nested inside inline <code> (e.g. re-exports) so they do
1455    // not become unrenderable markdown links inside a backtick span.
1456    let cleaned_html = flatten_links_in_inline_code(&cleaned_html);
1457    // Preserve rustdoc code-block language hints (html2md drops class info);
1458    // see inject_code_fence_language / restore_code_fence_language.
1459    let cleaned_html = inject_code_fence_language(&cleaned_html);
1460    // Restore whitespace html2md would otherwise drop before inline elements.
1461    let cleaned_html = normalize_inline_leading_whitespace(&cleaned_html);
1462    let markdown = html2md::parse_html(&cleaned_html);
1463
1464    // Post-process markdown to remove unwanted links
1465    clean_markdown(&markdown)
1466}
1467
/// Reverse the backslash escaping that html2md applies to ordinary text.
///
/// html2md 0.2.15 escapes the markdown metacharacters ``< > * _ ~ \`` in every
/// non-code text node. Because this output is consumed as documentation rather
/// than re-rendered as markdown, those escapes are pure noise (e.g.
/// `serde\_json`, `Vec\<u8\>`, `-\>`). This pass removes the escaping outside of
/// code, while leaving fenced code blocks and inline code spans untouched
/// (html2md never escapes code, so any backslash there is genuine).
///
/// # Code detection
///
/// * A line whose first non-whitespace characters are three backticks toggles
///   fenced-code state; fence lines and everything between them are copied
///   verbatim.
/// * Inline code spans are tracked per line. A span opened by a run of `N`
///   backticks is closed only by another run of exactly `N` backticks, so a
///   shorter or longer run inside the span (as in ``` ``a`b`` ```) is part of
///   the code and does not end it. An opener with no matching closer keeps the
///   rest of the line verbatim.
///
/// Returns the unescaped markdown; a backslash that does not precede one of
/// the escaped metacharacters is kept as-is.
fn unescape_markdown(markdown: &str) -> String {
    const ESCAPED: [char; 6] = ['<', '>', '*', '_', '~', '\\'];
    let mut out = String::with_capacity(markdown.len());
    let mut in_fence = false;
    for line in markdown.split_inclusive('\n') {
        // Fence delimiter lines flip the state and are emitted untouched.
        if line.trim_start().starts_with("```") {
            in_fence = !in_fence;
            out.push_str(line);
            continue;
        }
        if in_fence {
            out.push_str(line);
            continue;
        }

        // `Some(n)` while inside an inline code span opened by `n` backticks.
        // Reset on every line: inline spans are not tracked across lines.
        let mut code_span_ticks: Option<usize> = None;
        let mut chars = line.chars().peekable();
        while let Some(c) = chars.next() {
            match c {
                '`' => {
                    // Consume the maximal backtick run and copy it through.
                    let mut run = 1;
                    while chars.next_if_eq(&'`').is_some() {
                        run += 1;
                    }
                    out.extend(std::iter::repeat('`').take(run));
                    code_span_ticks = match code_span_ticks {
                        None => Some(run),
                        // Only an equal-length run closes the span.
                        Some(opened) if opened == run => None,
                        still_open => still_open,
                    };
                }
                '\\' if code_span_ticks.is_none() => {
                    // Drop the backslash only when it escapes a metacharacter;
                    // otherwise keep it and let the next char be handled
                    // normally.
                    let kept = chars
                        .next_if(|next| ESCAPED.contains(next))
                        .unwrap_or('\\');
                    out.push(kept);
                }
                _ => out.push(c),
            }
        }
    }
    out
}
1523
/// Matches an opening code fence followed by a sentinel-wrapped language line
/// (see [`inject_code_fence_language`]). Group 1 is the fence (with any
/// indentation), group 2 the language token. See [`restore_code_fence_language`].
///
/// `(?m)` anchors `^` at every line start. The fence line may carry only
/// trailing spaces/tabs, and the sentinel line must be the very next line;
/// both LF and CRLF line endings are accepted. `\x02` is the same character as
/// [`CODE_FENCE_SENTINEL`].
static CODE_FENCE_SENTINEL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?m)^([ \t]*`{3,})[ \t]*\r?\n[ \t]*\x02([^\x02\r\n]*)\x02[ \t]*\r?\n")
        .expect("valid regex")
});
1531
/// Matches any leftover language sentinel (a code block whose fence was not
/// matched, e.g. an empty block). See [`restore_code_fence_language`].
///
/// The match covers the opening `\x02`, the language token (which may not
/// contain a newline), the closing `\x02`, and one directly following newline
/// if present, so removing it does not leave an empty line behind.
static ORPHAN_FENCE_SENTINEL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\x02[^\x02\n]*\x02\n?").expect("valid regex"));
1536
1537/// Convert the language sentinel emitted by [`inject_code_fence_language`] into
1538/// a markdown fence info string (e.g. ```` ```rust ````), then strip any
1539/// orphaned sentinels. Runs in the markdown post-processing pass.
1540#[must_use]
1541fn restore_code_fence_language(markdown: &str) -> String {
1542    let with_lang = CODE_FENCE_SENTINEL_REGEX.replace_all(markdown, "${1}${2}\n");
1543    ORPHAN_FENCE_SENTINEL_REGEX
1544        .replace_all(&with_lang, "")
1545        .into_owned()
1546}
1547
/// Clean markdown output by removing relative links and UI artifacts.
///
/// Post-processes the markdown produced by `html2md`. The steps run in a fixed
/// order, each consuming the previous step's output:
///
/// 1. restore code-fence language hints, then strip html2md's backslash
///    escapes from non-code text;
/// 2. rewrite raw `<sup>`/`<sub>` HTML and escape the negative-impl `!`;
/// 3. remove or downgrade UI, source, relative, anchor and empty links;
/// 4. drop stray separator lines, trailing whitespace and heading hashes;
/// 5. tidy blockquotes and collapse runs of blank lines.
///
/// Returns the cleaned markdown with leading and trailing whitespace trimmed.
#[inline]
fn clean_markdown(markdown: &str) -> String {
    // Each `replace_all` below is a separate pass over the text. It returns a
    // `Cow`, which stays borrowed (no allocation) when the pattern did not
    // match anything.
    //
    // Restore code-fence language hints carried through html2md as sentinels
    // (see restore_code_fence_language) before any other processing.
    let markdown = restore_code_fence_language(markdown);
    // First strip html2md's backslash escaping from non-code text so escaped
    // identifiers/generics (`serde\_json`, `Vec\<u8\>`) read naturally.
    let unescaped = unescape_markdown(&markdown);
    // html2md leaves `<sup>`/`<sub>` as raw HTML (e.g. footnote references in
    // tables). Convert them to plain-text `^(...)`/`_(...)` notation, stripping
    // any nested tags (such as a footnote `<a>` link) from the inner content.
    // An element whose content is empty after stripping is removed entirely.
    let unescaped = SUPERSCRIPT_REGEX.replace_all(&unescaped, |caps: &regex::Captures| {
        let inner = INLINE_TAG_STRIP_REGEX.replace_all(&caps[1], "");
        let inner = inner.trim();
        if inner.is_empty() {
            String::new()
        } else {
            format!("^({inner})")
        }
    });
    // Same handling as above, for subscripts.
    let unescaped = SUBSCRIPT_REGEX.replace_all(&unescaped, |caps: &regex::Captures| {
        let inner = INLINE_TAG_STRIP_REGEX.replace_all(&caps[1], "");
        let inner = inner.trim();
        if inner.is_empty() {
            String::new()
        } else {
            format!("_({inner})")
        }
    });
    // Escape the negative-impl marker `!` that html2md fused onto a linkified
    // trait name (`![Freeze](url)`) so it renders as literal `!Freeze` text and
    // not a broken markdown image. See NEGATIVE_IMPL_TRAIT_IMAGE_REGEX.
    let unescaped = NEGATIVE_IMPL_TRAIT_IMAGE_REGEX.replace_all(&unescaped, r"${1}\![");
    // Remove UI/source/javascript links first, then relative and section
    // anchors. Empty- and fragment-only links are downgraded to their text so
    // useful labels (e.g. headings) survive.
    let result = JS_TOGGLE_REGEX.replace_all(&unescaped, Cow::Borrowed(""));
    let result = JS_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = SOURCE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = SRC_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
    // Drop a "Read more" see-also affordance whose target is a docs.rs-relative
    // `.html` path (it would otherwise be downgraded below to a meaningless
    // dangling "Read more"); keep absolute (`scheme://`) ones, which remain
    // reachable. See READ_MORE_LINK_REGEX.
    let result = READ_MORE_LINK_REGEX.replace_all(&result, |caps: &regex::Captures| {
        // Group 1 is re-emitted in front of a kept link (bound as `ws`);
        // group 2 is the link target.
        let ws = &caps[1];
        let url = &caps[2];
        if url.contains("://") {
            format!("{ws}[Read more]({url})")
        } else {
            String::new()
        }
    });
    let result = RELATIVE_LINK_REGEX.replace_all(&result, |caps: &regex::Captures| {
        let text = &caps[1];
        let url = &caps[2];
        // Keep absolute external links (those carrying a `scheme://`); only
        // docs.rs-relative `.html` targets are downgraded to their label.
        if url.contains("://") {
            format!("[{text}]({url})")
        } else {
            text.to_string()
        }
    });
    // Downgrade dead rustdoc item-anchor links (`#method.X`,
    // `#associatedtype.X`, `#impl-...`) to their label; the rendered
    // markdown has no matching heading id, so the links go nowhere.
    let result = RUSTDOC_ITEM_ANCHOR_REGEX.replace_all(&result, Cow::Borrowed("$1"));
    let result = ANCHOR_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = FRAGMENT_TOGGLE_REGEX.replace_all(&result, |caps: &regex::Captures| {
        let label = &caps[1];
        // Keep crate/module names (which contain alphanumerics); drop bare
        // toggle markers such as the info circle or expand/collapse glyphs.
        if label.chars().any(|c| c.is_ascii_alphanumeric()) {
            label.to_string()
        } else {
            String::new()
        }
    });
    // Links matched here are replaced by their first capture group.
    let result = EMPTY_LINK_REGEX.replace_all(&result, Cow::Borrowed("$1"));
    // Remove leftover separator debris and trailing whitespace, then reduce
    // headings to their first capture group.
    let result = STRAY_COLON_LINE_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = STRAY_MIDDOT_LINE_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = TRAILING_MIDDOT_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = TRAILING_WS_REGEX.replace_all(&result, Cow::Borrowed(""));
    let result = HEADING_TRAILING_HASH_REGEX.replace_all(&result, Cow::Borrowed("$1"));
    // html2md pads blockquotes with empty `>` lines (e.g. a clap note renders
    // as `>\n>\n> text\n>\n>`); drop the noisy boundary/duplicate marker lines.
    let result = tidy_blockquotes(&result);
    // Runs last among the rewrites so blank lines left behind by the removals
    // above are collapsed too.
    let result = MULTIPLE_NEWLINES_REGEX.replace_all(&result, Cow::Borrowed("\n\n"));
    result.trim().to_string()
}
1642
/// Strip the padding `>` lines that `html2md` places around blockquote text.
///
/// `html2md` 0.2.15 renders `<blockquote><p>x</p></blockquote>` as
/// `>\n>\n> x\n>\n>`: empty quote lines before and after the content, plus
/// duplicates. A *quote line* is one whose first non-space character is `>`;
/// it is *empty* when it consists solely of `>` and whitespace.
///
/// For every maximal run of consecutive quote lines:
///
/// * empty quote lines at the start and end of the run are removed;
/// * consecutive empty quote lines inside the run are reduced to one, which
///   keeps real paragraph breaks in a multi-paragraph quote;
/// * a run made up only of empty quote lines disappears entirely.
///
/// All other lines pass through unchanged. The result is joined with `\n`.
#[must_use]
fn tidy_blockquotes(markdown: &str) -> String {
    fn quoted(line: &str) -> bool {
        line.trim_start().starts_with('>')
    }
    fn blank_quote(line: &str) -> bool {
        quoted(line) && line.chars().all(|ch| ch == '>' || ch.is_whitespace())
    }

    let all_lines: Vec<&str> = markdown.lines().collect();
    let mut kept: Vec<&str> = Vec::with_capacity(all_lines.len());
    let mut rest = all_lines.as_slice();

    while let Some((&head, tail)) = rest.split_first() {
        if !quoted(head) {
            kept.push(head);
            rest = tail;
            continue;
        }

        // Peel off the maximal run of quote lines starting at `head`.
        let run_len = rest.iter().take_while(|line| quoted(line)).count();
        let (run, remainder) = rest.split_at(run_len);
        rest = remainder;

        // Bounds of the real content; a run with none is dropped entirely.
        let Some(lo) = run.iter().position(|line| !blank_quote(line)) else {
            continue;
        };
        let hi = run
            .iter()
            .rposition(|line| !blank_quote(line))
            .unwrap_or(lo);

        let mut last_was_blank = false;
        for &line in &run[lo..=hi] {
            let blank = blank_quote(line);
            // Emit everything except the second and later blanks in a row.
            if !(blank && last_was_blank) {
                kept.push(line);
                last_was_blank = blank;
            }
        }
    }

    kept.join("\n")
}
1690
1691/// Extract main content from docs.rs HTML
1692///
1693/// Looks for `<section id="main-content">` which contains the actual documentation.
1694/// Falls back to full HTML if main content section is not found.
1695#[inline]
1696fn extract_main_content(html: &str) -> String {
1697    let document = Html::parse_document(html);
1698
1699    // Try to find main-content section (docs.rs structure) - using cached selector
1700    if let Some(main_section) = document.select(&MAIN_CONTENT_SELECTOR).next() {
1701        return main_section.html();
1702    }
1703
1704    // Fallback: try rustdoc_body_wrapper - using cached selector
1705    if let Some(wrapper) = document.select(&RUSTDOC_BODY_WRAPPER_SELECTOR).next() {
1706        return wrapper.html();
1707    }
1708
1709    // Last resort: return original HTML
1710    html.to_string()
1711}
1712
1713/// Extract the collapsed text of the page's primary `<h1>` heading.
1714///
1715/// rustdoc renders an item page heading as e.g. `<h1>Struct serde_json::Value</h1>`
1716/// (the item kind plus the fully-qualified path) and a crate landing page as
1717/// `<h1>Crate serde</h1>`. Returns the whitespace-collapsed text of the first
1718/// `<h1>` inside the main content area (falling back to any `<h1>`), or `None`
1719/// when the page has no heading.
1720#[must_use]
1721pub fn page_h1_text(html: &str) -> Option<String> {
1722    let document = Html::parse_document(html);
1723    let collapse = |element: scraper::ElementRef| -> String {
1724        clean_whitespace(&element.text().collect::<String>())
1725    };
1726    let h1 = document
1727        .select(&MAIN_CONTENT_SELECTOR)
1728        .next()
1729        .and_then(|main| main.select(&H1_SELECTOR).next().map(collapse))
1730        .or_else(|| document.select(&H1_SELECTOR).next().map(collapse));
1731    h1.filter(|s| !s.is_empty())
1732}
1733
/// Report whether `ident` occurs in `heading` as a complete identifier token.
///
/// The heading is cut at every character other than an ASCII letter, ASCII
/// digit or `_`, so `Struct serde_json::Value` produces the tokens `Struct`,
/// `serde_json` and `Value`. `ident` must equal one of those tokens exactly
/// (case-sensitive), which rules out partial hits such as `is` inside `this`.
fn heading_contains_identifier(heading: &str, ident: &str) -> bool {
    let is_ident_char = |ch: char| ch == '_' || ch.is_ascii_alphanumeric();
    for token in heading.split(|ch: char| !is_ident_char(ch)) {
        if token == ident {
            return true;
        }
    }
    false
}
1745
1746/// Determine whether a resolved rustdoc page is a *fallback* rather than the
1747/// dedicated page for `item_path`.
1748///
1749/// [`resolve_item_html`](super::lookup_item) probes the dedicated item page
1750/// first, then falls back to the containing type's page (e.g. the `Value` enum
1751/// page for `Value::is_null`, since methods have no standalone page) and
1752/// finally to the crate overview. A dedicated item page's `<h1>` always
1753/// contains the requested leaf identifier (the final `::` segment); a
1754/// parent-type or crate fallback heading does not. Returns `true` when the
1755/// page does not document the requested item directly, so callers can surface
1756/// an honest note in every output format.
1757///
1758/// This is content-based (not resolution-time state) so it stays correct on
1759/// cache hits, where only the raw HTML is replayed. When the page has no
1760/// heading at all, returns `false` to avoid over-warning.
1761#[must_use]
1762pub fn is_item_fallback_page(html: &str, item_path: &str) -> bool {
1763    let leaf = item_path.rsplit("::").next().unwrap_or(item_path).trim();
1764    if leaf.is_empty() {
1765        return false;
1766    }
1767    match page_h1_text(html) {
1768        Some(h1) => !heading_contains_identifier(&h1, leaf),
1769        None => false,
1770    }
1771}
1772
1773/// Extract search results from HTML
1774#[must_use]
1775pub fn extract_search_results(html: &str, item_path: &str) -> String {
1776    let main_content = extract_main_content(html);
1777    let cleaned_html = clean_html(&main_content);
1778    // Flatten links nested inside inline <code> (e.g. re-exports) so they do
1779    // not become unrenderable markdown links inside a backtick span.
1780    let cleaned_html = flatten_links_in_inline_code(&cleaned_html);
1781    // Preserve rustdoc code-block language hints (html2md drops class info);
1782    // see inject_code_fence_language / restore_code_fence_language.
1783    let cleaned_html = inject_code_fence_language(&cleaned_html);
1784    // Restore whitespace html2md would otherwise drop before inline elements.
1785    let cleaned_html = normalize_inline_leading_whitespace(&cleaned_html);
1786    let markdown = html2md::parse_html(&cleaned_html);
1787    let cleaned_markdown = clean_markdown(&markdown);
1788
1789    if cleaned_markdown.trim().is_empty() {
1790        return format!("Documentation for '{item_path}' not found");
1791    }
1792
1793    // Detect a fallback page (the containing type's page or the crate
1794    // overview) by comparing the requested leaf identifier against the page's
1795    // `<h1>` heading; a dedicated item page's heading always names the item.
1796    // Operating on the raw `html` keeps this correct on cache replays.
1797    if is_item_fallback_page(html, item_path) {
1798        format!(
1799            "## Documentation: {item_path}\n\n_No dedicated documentation page was found for `{item_path}`; showing the closest available page (its containing type or the crate overview) instead. It may be a method, associated item, or trait method, or it may not exist._\n\n{cleaned_markdown}"
1800        )
1801    } else {
1802        format!("## Documentation: {item_path}\n\n{cleaned_markdown}")
1803    }
1804}
1805
1806/// Extract documentation from HTML as plain text.
1807///
1808/// Mirrors [`extract_documentation`] but produces plain text: it isolates the
1809/// main content area (dropping navigation, sidebars and footers), runs the
1810/// shared [`clean_html`] pass (which strips scripts, styles, navigation,
1811/// buttons, `<details>` toggles and UI labels such as "Copy item path" and
1812/// "Expand description"), then flattens to text. Finally, leftover section
1813/// anchor markers are removed since they carry no meaning once hyperlinks are
1814/// gone.
1815#[must_use]
1816pub fn extract_documentation_as_text(html: &str) -> String {
1817    let main_content = extract_main_content(html);
1818    let cleaned_html = clean_html(&main_content);
1819    // Use the raw extraction so `<pre>` content stays encoded through the
1820    // line-normalisation pass; decode it back to real whitespace at the end.
1821    let text = html_to_text_raw(&cleaned_html);
1822    // Drop standalone section-sign markers, then re-collapse each line so the
1823    // newline-delimited block structure from `html_to_text_raw` is preserved.
1824    let normalized = normalize_lines(&text.replace('\u{00a7}', " "));
1825    // Strip the dangling middot separator left on out-of-band rows (e.g. the
1826    // stability line `1.0.0 \u{00b7}`) once the trailing source link is gone.
1827    let normalized = TRAILING_MIDDOT_REGEX.replace_all(&normalized, "");
1828    strip_trailing_line_whitespace(&decode_pre(&normalized))
1829}
1830
1831/// Collapse whitespace within each block segment and join blocks with newlines.
1832///
1833/// [`BLOCK_SEP`] markers delimit block-level boundaries. Within each segment all
1834/// whitespace runs (spaces, tabs, and incidental source newlines) collapse to a
1835/// single space, which preserves inline runs split across elements. Empty
1836/// segments are dropped so adjacent markers do not emit blank lines.
1837#[inline]
1838fn collapse_block_whitespace(text: &str) -> String {
1839    text.split(BLOCK_SEP)
1840        .map(|seg| {
1841            // Within a block segment, table cells are separated by CELL_SEP.
1842            // Each cell carries a single *leading* CELL_SEP delimiter, so the
1843            // fragment before the first delimiter is empty and is dropped; the
1844            // remaining cells (including genuinely empty ones, e.g. a blank
1845            // row-label header) are kept so columns stay aligned. Segments
1846            // without a CELL_SEP (the common case) collapse unchanged.
1847            if seg.contains(CELL_SEP) {
1848                let mut cells: Vec<String> = seg
1849                    .split(CELL_SEP)
1850                    .map(|cell| cell.split_whitespace().collect::<Vec<_>>().join(" "))
1851                    .collect();
1852                if cells.first().is_some_and(String::is_empty) {
1853                    cells.remove(0);
1854                }
1855                // Drop pure visual-spacer rows (every cell empty) so they do
1856                // not render as content-free `| |` noise between data rows.
1857                // Rows with any content keep their (possibly empty) cells so
1858                // columns stay aligned.
1859                if cells.iter().all(String::is_empty) {
1860                    String::new()
1861                } else {
1862                    cells.join(" | ")
1863                }
1864            } else {
1865                seg.split_whitespace().collect::<Vec<_>>().join(" ")
1866            }
1867        })
1868        .filter(|seg| !seg.is_empty())
1869        .collect::<Vec<_>>()
1870        .join("\n")
1871}
1872
/// Squeeze the whitespace inside every line and discard lines that end up
/// empty, keeping the newline-delimited block structure of the input.
///
/// Each surviving line has its words separated by exactly one space and carries
/// no leading or trailing whitespace; lines are joined with a single `\n`.
#[inline]
fn normalize_lines(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for raw_line in text.lines() {
        let mut words = raw_line.split_whitespace();
        // A line without any word is blank: drop it entirely.
        let Some(first) = words.next() else {
            continue;
        };
        // Emitted lines are never empty, so a non-empty buffer means at least
        // one line precedes this one and a separator is required.
        if !out.is_empty() {
            out.push('\n');
        }
        out.push_str(first);
        for word in words {
            out.push(' ');
            out.push_str(word);
        }
    }
    out
}
1883
/// Remove trailing whitespace from each line of the finalised text output.
///
/// A signature rendered inside an encoded `<pre>` may end with a space (e.g.
/// `-> StepBy<Self> ` right before a wrapped `where` clause). That space is
/// stored as a [`PRE_SPACE`] sentinel, so [`normalize_lines`] does not see it
/// and [`decode_pre`] later turns it back into a real space. Trimming the end
/// of every line here clears such leftovers while leaving indentation alone.
#[inline]
fn strip_trailing_line_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for (index, line) in text.split('\n').enumerate() {
        // Re-insert the separator consumed by `split` between lines, so the
        // line count (including empty lines) is unchanged.
        if index > 0 {
            out.push('\n');
        }
        out.push_str(line.trim_end());
    }
    out
}
1898
/// Collapse every whitespace run in `text` to a single space and trim both
/// ends, returning the result as one line.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word was
        // already written and needs a separating space.
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
1903
1904/// Encode the significant whitespace of `<pre>` content as control sentinels
1905/// ([`PRE_SPACE`], [`PRE_NEWLINE`], [`PRE_TAB`]) so it survives the
1906/// whitespace-collapsing passes. Carriage returns are dropped.
1907fn encode_pre(text: &str) -> String {
1908    let mut out = String::with_capacity(text.len());
1909    for ch in text.chars() {
1910        match ch {
1911            ' ' => out.push(PRE_SPACE),
1912            '\n' => out.push(PRE_NEWLINE),
1913            '\t' => out.push(PRE_TAB),
1914            '\r' => {}
1915            other => out.push(other),
1916        }
1917    }
1918    out
1919}
1920
/// Inverse of [`encode_pre`]: map the [`PRE_SPACE`], [`PRE_NEWLINE`] and
/// [`PRE_TAB`] sentinels back to a space, a newline and a tab respectively.
///
/// All other characters pass through unchanged.
fn decode_pre(text: &str) -> String {
    text.chars()
        .map(|ch| match ch {
            PRE_SPACE => ' ',
            PRE_NEWLINE => '\n',
            PRE_TAB => '\t',
            other => other,
        })
        .collect()
}
1935
1936#[cfg(test)]
1937mod tests {
1938    use super::*;
1939
1940    #[test]
1941    fn test_text_strips_old_rustdoc_src_and_toggle_anchors() {
1942        // Older rustdoc heading markup: a `javascript:` collapse-all toggle and a
1943        // single-quoted `srclink` source anchor. Neither must leak its bracketed
1944        // marker into the plain-text output.
1945        let html = concat!(
1946            "<html><body><section id=\"main-content\">",
1947            "<h1>Crate serde",
1948            "<a id=\"toggle-all-docs\" href=\"javascript:void(0)\" title=\"collapse all docs\">",
1949            "[<span class='inner'>TOGGLEMARK</span>]</a>",
1950            "<a class='srclink' href='../src/serde/lib.rs.html#9-267' title='goto source code'>[src]</a>",
1951            "</h1><p>Real doc.</p>",
1952            "</section></body></html>"
1953        );
1954        let text = extract_documentation_as_text(html);
1955        assert!(!text.contains("[src]"), "src link leaked: {text:?}");
1956        assert!(!text.contains("TOGGLEMARK"), "toggle leaked: {text:?}");
1957        assert!(text.contains("Crate serde"), "heading dropped: {text:?}");
1958        assert!(text.contains("Real doc."), "content dropped: {text:?}");
1959    }
1960
1961    #[test]
1962    fn test_markdown_strips_trailing_heading_hashes() {
1963        let html = concat!(
1964            "<html><body><section id=\"main-content\">",
1965            "<h3>Examples</h3>",
1966            "<h4>pub fn get(&amp;self)</h4>",
1967            "<p>Body text.</p>",
1968            "</section></body></html>"
1969        );
1970        let md = extract_documentation(html);
1971        assert!(md.contains("### Examples"), "h3 missing: {md:?}");
1972        assert!(!md.contains("Examples ###"), "trailing hashes left: {md:?}");
1973        assert!(md.contains("#### pub fn get(&self)"), "h4 missing: {md:?}");
1974        assert!(!md.contains(") ####"), "trailing hashes left: {md:?}");
1975    }
1976
1977    #[test]
1978    fn test_markdown_restores_space_before_inline_link() {
1979        // html2md drops newline whitespace before inline <a>, gluing the link
1980        // onto the preceding word. The HashMap docs trigger this with
1981        // "using the\n<a>...<code>default</code>...".
1982        let html = concat!(
1983            "<html><body><section id=\"main-content\">",
1984            "<p>replaced on a per-<code>HashMap</code> basis using the\n",
1985            "<a href=\"trait.Default.html#tymethod.default\"><code>default</code></a>, ",
1986            "<a href=\"struct.HashMap.html#method.with_hasher\"><code>with_hasher</code></a> methods.</p>",
1987            "</section></body></html>"
1988        );
1989        let md = extract_documentation(html);
1990        // The space before the (downgraded) link is restored.
1991        assert!(
1992            md.contains("using the `default`"),
1993            "missing space before inline link: {md:?}"
1994        );
1995        // The deliberately glued `per-<code>HashMap</code>` (no source
1996        // whitespace) stays glued.
1997        assert!(
1998            md.contains("per-`HashMap`"),
1999            "spurious space inserted into hyphenated code: {md:?}"
2000        );
2001    }
2002
2003    #[test]
2004    fn test_where_clause_detached_from_declaration() {
2005        // rustdoc's <div class="where"> has no literal line breaks, so the
2006        // declaration renders glued ("Global>where", "Allocator,{").
2007        let html = concat!(
2008            "<html><body><section id=\"main-content\">",
2009            "<pre class=\"rust item-decl\"><code>pub struct Vec&lt;T, A = ",
2010            "<a class=\"struct\" href=\"struct.Global.html\">Global</a>&gt;",
2011            "<div class=\"where\">where\n    A: <a class=\"trait\" href=\"trait.Allocator.html\">Allocator</a>,</div>",
2012            "{ <span class=\"comment\">/* private fields */</span> }</code></pre>",
2013            "<h4 class=\"code-header\">pub fn retain&lt;F&gt;(&amp;mut self, f: F)",
2014            "<div class=\"where\">where\n    F: <a class=\"trait\" href=\"trait.FnMut.html\">FnMut</a>,</div></h4>",
2015            "</section></body></html>"
2016        );
2017        let md = extract_documentation(html);
2018        // Inside the code block the clause breaks onto its own lines.
2019        assert!(
2020            md.contains("Global>\nwhere") && md.contains("Allocator,\n{"),
2021            "where clause not broken in code block: {md:?}"
2022        );
2023        assert!(!md.contains("Global>where"), "glued where survived: {md:?}");
2024        // In the single-line method header the clause is space-separated.
2025        assert!(
2026            md.contains("f: F) where F:"),
2027            "where not separated in header: {md:?}"
2028        );
2029
2030        // Plain-text format gets the same multi-line declaration.
2031        let text = extract_documentation_as_text(html);
2032        assert!(
2033            text.contains("Global>\nwhere"),
2034            "where clause not broken in text: {text:?}"
2035        );
2036        assert!(
2037            !text.contains("Global>where"),
2038            "glued where in text: {text:?}"
2039        );
2040    }
2041
2042    #[test]
2043    fn test_text_signature_has_no_trailing_whitespace_before_where() {
2044        // A signature inside a `<pre>` ends with a space immediately before the
2045        // `where` div; that space is held as a PRE_SPACE sentinel and survives
2046        // line normalisation, so `decode_pre` would otherwise restore a dangling
2047        // trailing space on the signature line in text output.
2048        let html = concat!(
2049            "<html><body><section id=\"main-content\">",
2050            "<pre class=\"rust item-decl\"><code>fn step_by(self, step: usize) -> ",
2051            "StepBy&lt;Self&gt; ",
2052            "<div class=\"where\">where\n    Self: Sized,</div>",
2053            "</code></pre>",
2054            "</section></body></html>"
2055        );
2056        let text = extract_documentation_as_text(html);
2057        assert!(
2058            !text.lines().any(|l| l.ends_with(' ') || l.ends_with('\t')),
2059            "text output has a line with trailing whitespace: {text:?}"
2060        );
2061        // The signature and the wrapped clause are still present and split.
2062        assert!(
2063            text.contains("-> StepBy<Self>") && text.contains("where"),
2064            "signature/where content lost: {text:?}"
2065        );
2066    }
2067
2068    #[test]
2069    fn test_ui_glyph_anchors_stripped() {
2070        // rustdoc decorates impl/method headers (inside <summary>) with a
2071        // section-sign anchor `<a class="anchor">\u{00a7}</a>` and a
2072        // notable-trait marker `<a class="tooltip">\u{24d8}</a>`. Both are pure
2073        // UI affordances and must not leak into markdown or text output.
2074        let html = concat!(
2075            "<html><body><section id=\"main-content\">",
2076            "<details class=\"toggle implementors-toggle\" open><summary>",
2077            "<section id=\"impl-Clone\" class=\"impl\">",
2078            "<a href=\"#impl-Clone\" class=\"anchor\">\u{00a7}</a>",
2079            "<h3 class=\"code-header\">impl Clone for Foo</h3></section></summary></details>",
2080            "<details class=\"toggle method-toggle\" open><summary>",
2081            "<section id=\"method.keys\" class=\"method\">",
2082            "<h4 class=\"code-header\">fn <a href=\"#method.keys\" class=\"fn\">keys</a>(&amp;self) -&gt; ",
2083            "<a class=\"struct\" href=\"struct.Keys.html\">Keys</a> ",
2084            "<a href=\"#\" class=\"tooltip\" data-notable-ty=\"Keys\">\u{24d8}</a></h4>",
2085            "</section></summary></details>",
2086            "</section></body></html>"
2087        );
2088        let md = extract_documentation(html);
2089        assert!(
2090            !md.contains('\u{00a7}'),
2091            "section-sign anchor leaked into markdown: {md:?}"
2092        );
2093        assert!(
2094            !md.contains('\u{24d8}'),
2095            "notable-trait marker leaked into markdown: {md:?}"
2096        );
2097        assert!(
2098            md.contains("impl Clone for Foo"),
2099            "impl header lost: {md:?}"
2100        );
2101        let text = extract_documentation_as_text(html);
2102        assert!(
2103            !text.contains('\u{00a7}') && !text.contains('\u{24d8}'),
2104            "UI glyph leaked into text: {text:?}"
2105        );
2106    }
2107
2108    #[test]
2109    fn test_scrape_help_question_mark_anchor_stripped() {
2110        // rustdoc adds a `<a class="scrape-help" href="...">?</a>` help link
2111        // beside the "Examples found in repository" heading of a scraped
2112        // example. It is pure UI chrome and its `?` glyph must not leak into
2113        // the rendered output (the heading text itself is preserved).
2114        let html = concat!(
2115            "<html><body><section id=\"main-content\">",
2116            "<div class=\"docblock scraped-example-list\"><span></span>",
2117            "<h5 id=\"scraped-examples\">",
2118            "<a href=\"#scraped-examples\">Examples found in repository</a>",
2119            "<a class=\"scrape-help\" href=\"../scrape-examples-help.html\">?</a>",
2120            "</h5></div>",
2121            "</section></body></html>"
2122        );
2123        for out in [
2124            extract_documentation(html),
2125            extract_documentation_as_text(html),
2126            extract_documentation_html(html),
2127        ] {
2128            assert!(
2129                !out.contains("?</a>") && !out.contains("scrape-help"),
2130                "scrape-help link leaked: {out:?}"
2131            );
2132            assert!(
2133                out.contains("Examples found in repository"),
2134                "scraped-example heading text lost: {out:?}"
2135            );
2136        }
2137        // The leaked `?` glyph must not survive as a trailing token in markdown.
2138        let md = extract_documentation(html);
2139        assert!(
2140            !md.contains("repository)?") && !md.contains("repository ?"),
2141            "stray scrape-help `?` leaked into markdown: {md:?}"
2142        );
2143    }
2144
2145    #[test]
2146    fn test_multiline_signature_collapsed_to_single_line() {
2147        // rustdoc wraps long signatures across lines inside the (non-<pre>)
2148        // <h4 class="code-header"> using literal newlines + indentation. That
2149        // otherwise yields a broken two-line markdown heading and stray spaces
2150        // in text (`( self: Arc<Self>, )`).
2151        let html = concat!(
2152            "<html><body><section id=\"main-content\">",
2153            "<section id=\"method.try_lock_owned\" class=\"method\">",
2154            "<h4 class=\"code-header\">pub fn <a href=\"#m\" class=\"fn\">try_lock_owned</a>(\n",
2155            "    self: <a class=\"struct\" href=\"struct.Arc.html\">Arc</a>&lt;Self&gt;,\n",
2156            ") -&gt; <a class=\"enum\" href=\"enum.Result.html\">Result</a>&lt;T&gt;</h4></section>",
2157            "</section></body></html>"
2158        );
2159        let md = extract_documentation(html);
2160        assert!(
2161            md.contains("(self: Arc<Self>) -> Result<T>"),
2162            "multi-line signature not collapsed cleanly (markdown): {md:?}"
2163        );
2164        assert!(
2165            !md.contains("( self") && !md.contains(", )") && !md.contains(",\n)"),
2166            "signature spacing artifacts survived: {md:?}"
2167        );
2168        // The collapsed heading stays on a single line.
2169        assert!(
2170            !md.contains("Result<T>\n)") && !md.contains(",\n)"),
2171            "heading split across lines: {md:?}"
2172        );
2173        let text = extract_documentation_as_text(html);
2174        assert!(
2175            text.contains("try_lock_owned(self: Arc<Self>) -> Result<T>"),
2176            "multi-line signature not collapsed cleanly (text): {text:?}"
2177        );
2178    }
2179
2180    #[test]
2181    fn test_rustdoc_ui_web_components_stripped() {
2182        // rustdoc emits <rustdoc-toolbar> (inside #main-content, rendered empty)
2183        // and <rustdoc-topbar> (a duplicate breadcrumb heading). Neither should
2184        // leak into the html output.
2185        let html = concat!(
2186            "<html><body><section id=\"main-content\">",
2187            "<div class=\"main-heading\"><h1>Struct Foo</h1>",
2188            "<rustdoc-toolbar></rustdoc-toolbar></div>",
2189            "<rustdoc-topbar><h2><a href=\"#\">Foo</a></h2></rustdoc-topbar>",
2190            "<p>Body.</p>",
2191            "</section></body></html>"
2192        );
2193        let out = extract_documentation_html(html);
2194        assert!(
2195            !out.contains("rustdoc-toolbar") && !out.contains("rustdoc-topbar"),
2196            "rustdoc UI web-component leaked into html: {out:?}"
2197        );
2198        assert!(out.contains("Body."), "body content lost: {out:?}");
2199    }
2200
2201    #[test]
2202    fn test_rustdoc_breadcrumbs_stripped() {
2203        // rustdoc renders a navigation breadcrumb above the item title. Its
2204        // links are page-relative, so without removal they leave a dangling
2205        // bare line (`std::vec`, or a lone `std` on macro pages) that merely
2206        // duplicates our own title. The whole element must be stripped in all
2207        // three formats.
2208        let html = concat!(
2209            "<html><body><section id=\"main-content\">",
2210            "<div class=\"main-heading\">",
2211            "<div class=\"rustdoc-breadcrumbs\"><a href=\"../index.html\">std</a>",
2212            "::<wbr><a href=\"index.html\">vec</a></div>",
2213            "<h1>Struct Vec</h1></div>",
2214            "<p>A contiguous growable array type.</p>",
2215            "</section></body></html>"
2216        );
2217        for out in [
2218            extract_documentation(html),
2219            extract_documentation_as_text(html),
2220            extract_documentation_html(html),
2221        ] {
2222            assert!(
2223                !out.contains("rustdoc-breadcrumbs"),
2224                "breadcrumb element leaked: {out:?}"
2225            );
2226            assert!(
2227                !out.contains("std::vec"),
2228                "dangling breadcrumb line leaked: {out:?}"
2229            );
2230            assert!(
2231                out.contains("Vec") && out.contains("contiguous growable"),
2232                "real content lost: {out:?}"
2233            );
2234        }
2235    }
2236
2237    #[test]
2238    fn test_prose_admonition_pre_becomes_blockquote_not_code() {
2239        // rustdoc renders "Warning"/"Note" callouts as a prose-styled <pre>
2240        // (white-space:normal); it must become a blockquote, not a bare code
2241        // fence, so the prose (and its inline code/links) renders correctly.
2242        let html = concat!(
2243            "<section id=\"main-content\">",
2244            "<p>Intro.</p>",
2245            "<div class=\"example-wrap\"><pre class=\"compile_fail\" ",
2246            "style=\"white-space:normal;font:inherit;\">",
2247            "<p><strong>Warning</strong>: Do not hold <code>Span::enter</code> ",
2248            "across an await point.</p></pre></div>",
2249            "<p>Outro.</p>",
2250            "</section>"
2251        );
2252        let md = extract_documentation(html);
2253        assert!(
2254            !md.contains("```"),
2255            "prose admonition rendered as code fence in markdown: {md:?}"
2256        );
2257        assert!(
2258            md.contains("> ") && md.contains("Warning"),
2259            "admonition not rendered as blockquote: {md:?}"
2260        );
2261        assert!(
2262            md.contains("`Span::enter`"),
2263            "inline code lost in admonition: {md:?}"
2264        );
2265        let html_out = extract_documentation_html(html);
2266        assert!(
2267            !html_out.contains("white-space:normal"),
2268            "prose pre survived in html output: {html_out:?}"
2269        );
2270        // A genuine code example (default white-space) must stay a code block.
2271        let code_html = concat!(
2272            "<section id=\"main-content\">",
2273            "<pre class=\"rust rust-example-rendered\"><code>let x = 1;</code></pre>",
2274            "</section>"
2275        );
2276        assert!(
2277            extract_documentation(code_html).contains("```"),
2278            "genuine code example lost its fence"
2279        );
2280    }
2281
2282    #[test]
2283    fn test_unsafe_function_marker_rendered_as_annotation() {
2284        // rustdoc marks unsafe fns in module lists with
2285        // `<sup title="unsafe function">WARN</sup>`; it must become a readable
2286        // ` (unsafe)` annotation, not a `^(...)` superscript glued to the name.
2287        let html = concat!(
2288            "<section id=\"main-content\"><dl class=\"item-table\">",
2289            "<dt><a class=\"fn\" href=\"fn.copy.html\">copy</a>",
2290            "<sup title=\"unsafe function\">\u{26a0}</sup></dt>",
2291            "<dd>Copies bytes.</dd></dl></section>"
2292        );
2293        for out in [
2294            extract_documentation(html),
2295            extract_documentation_as_text(html),
2296            extract_documentation_html(html),
2297        ] {
2298            assert!(
2299                !out.contains('\u{26a0}'),
2300                "unsafe marker glyph leaked: {out:?}"
2301            );
2302            assert!(
2303                !out.contains("^("),
2304                "unsafe marker rendered as superscript: {out:?}"
2305            );
2306            assert!(
2307                out.contains("(unsafe)"),
2308                "unsafe annotation missing: {out:?}"
2309            );
2310        }
2311    }
2312
2313    #[test]
2314    fn test_hideme_show_methods_toggle_stripped() {
2315        // rustdoc puts a "Show N methods" collapse toggle
2316        // (`<summary class="hideme">`) *inside* the trait declaration <pre>;
2317        // its label must not leak into the rendered signature in any format.
2318        // The surrounding details content (the method list) must survive.
2319        let html = concat!(
2320            "<html><body><section id=\"main-content\">",
2321            "<pre class=\"rust item-decl\"><code>pub trait Iterator {\n",
2322            "    type Item;\n",
2323            "<details class=\"toggle type-contents-toggle\">",
2324            "<summary class=\"hideme\"><span>Show 76 methods</span></summary>",
2325            "    // Required method\n",
2326            "    fn next(&amp;mut self) -&gt; Option&lt;Self::Item&gt;;\n",
2327            "</details>}</code></pre>",
2328            "</section></body></html>"
2329        );
2330        for out in [
2331            extract_documentation(html),
2332            extract_documentation_as_text(html),
2333            extract_documentation_html(html),
2334        ] {
2335            assert!(
2336                !out.contains("Show 76 methods"),
2337                "collapse toggle label leaked: {out:?}"
2338            );
2339            assert!(
2340                out.contains("// Required method"),
2341                "details content lost: {out:?}"
2342            );
2343        }
2344    }
2345
2346    #[test]
2347    fn test_impl_block_docblock_not_glued_to_declaration() {
2348        // rustdoc nests an impl block's own documentation
2349        // (`<div class="docblock">`) inside the `<summary>` that holds the
2350        // `impl ...` declaration. When the summary is flattened to text the
2351        // docblock heading otherwise glues onto the declaration
2352        // (e.g. `impl ArgBasic API`). It must be relocated so the declaration
2353        // stays clean and the docblock renders as its own content.
2354        let html = concat!(
2355            "<html><body><section id=\"main-content\">",
2356            "<div id=\"implementations-list\">",
2357            "<details class=\"toggle implementors-toggle\" open><summary>",
2358            "<section id=\"impl-Arg\" class=\"impl\">",
2359            "<h3 class=\"code-header\">impl Arg</h3>",
2360            "<div class=\"docblock\"><h4 id=\"basic-api\">Basic API</h4></div>",
2361            "</section></summary>",
2362            "<div class=\"impl-items\"><details class=\"toggle method-toggle\" open>",
2363            "<summary><section id=\"method.new\" class=\"method\">",
2364            "<h4 class=\"code-header\">pub fn new() -&gt; Arg</h4></section></summary>",
2365            "<div class=\"docblock\"><p>Create a new Arg.</p></div></details></div>",
2366            "</details></div>",
2367            "</section></body></html>"
2368        );
2369        for out in [
2370            extract_documentation(html),
2371            extract_documentation_as_text(html),
2372            extract_documentation_html(html),
2373        ] {
2374            assert!(
2375                !out.contains("ArgBasic API"),
2376                "impl declaration glued to docblock heading: {out:?}"
2377            );
2378            assert!(
2379                out.contains("Basic API"),
2380                "impl-block docblock heading lost: {out:?}"
2381            );
2382        }
2383    }
2384
2385    #[test]
2386    fn test_undocumented_assoc_item_not_rendered_as_heading() {
2387        // rustdoc wraps a *documented* associated item in
2388        // `<details><summary>...</summary><docblock></details>` (the signature
2389        // is flattened to plain text), but an *undocumented* sibling is a bare
2390        // `<section>` whose `<h4 class="code-header">` would otherwise survive
2391        // as a spurious `####` heading. Both must render as plain text so the
2392        // list is consistent.
2393        let html = concat!(
2394            "<html><body><section id=\"main-content\">",
2395            "<details class=\"toggle\" open><summary>",
2396            "<section id=\"associatedconstant.DOC\" class=\"associatedconstant\">",
2397            "<h4 class=\"code-header\">pub const DOC: Self</h4></section></summary>",
2398            "<div class=\"docblock\"><p>Documented constant.</p></div></details>",
2399            "<section id=\"associatedconstant.BARE\" class=\"associatedconstant\">",
2400            "<h4 class=\"code-header\">pub const BARE: Self</h4></section>",
2401            "</section></body></html>"
2402        );
2403        let md = extract_documentation(html);
2404        assert!(
2405            !md.contains("#### pub const BARE"),
2406            "undocumented assoc const rendered as a heading: {md:?}"
2407        );
2408        assert!(
2409            md.contains("pub const BARE: Self"),
2410            "undocumented assoc const signature lost: {md:?}"
2411        );
2412        assert!(
2413            md.contains("pub const DOC: Self") && md.contains("Documented constant."),
2414            "documented assoc const rendering changed: {md:?}"
2415        );
2416    }
2417
2418    #[test]
2419    fn test_multiline_signature_in_pre_block_preserved() {
2420        // A <pre> code example that legitimately wraps a call across lines must
2421        // not be touched by the code-header collapse.
2422        let html = concat!(
2423            "<html><body><section id=\"main-content\">",
2424            "<pre class=\"rust\"><code>foo(\n    a,\n    b,\n);</code></pre>",
2425            "</section></body></html>"
2426        );
2427        let text = extract_documentation_as_text(html);
2428        assert!(
2429            text.contains("foo(") && text.contains("a,") && text.contains("b,"),
2430            "pre-block example was altered: {text:?}"
2431        );
2432    }
2433
2434    #[test]
2435    fn test_emoji_badge_separated_from_text() {
2436        // rustdoc renders the nightly-API marker as
2437        // `<span class="emoji">\u{1f52c}</span><span>This is ...</span>` with no
2438        // separating whitespace, so html2md glues the flask onto "This".
2439        let html = concat!(
2440            "<html><body><section id=\"main-content\">",
2441            "<div class=\"stab unstable\">",
2442            "<span class=\"emoji\">\u{1f52c}</span>",
2443            "<span>This is a nightly-only experimental API.</span></div>",
2444            "</section></body></html>"
2445        );
2446        let md = extract_documentation(html);
2447        assert!(
2448            md.contains("\u{1f52c} This is a nightly-only"),
2449            "emoji not separated from text in markdown: {md:?}"
2450        );
2451        assert!(
2452            !md.contains("\u{1f52c}This"),
2453            "emoji still glued in markdown: {md:?}"
2454        );
2455    }
2456
2457    #[test]
2458    fn test_playground_run_button_stripped() {
2459        // rustdoc adds a "Run code" button to each example as an empty-text
2460        // anchor wrapping a long playground URL
2461        // (`<a class="test-arrow" href="https://play.rust-lang.org/...">`).
2462        // It must not leak as an empty-text markdown link.
2463        let html = concat!(
2464            "<html><body><section id=\"main-content\">",
2465            "<div class=\"example-wrap\"><pre class=\"rust\"><code>let x = 1;</code></pre>",
2466            "<a class=\"test-arrow\" target=\"_blank\" title=\"Run code\" ",
2467            "href=\"https://play.rust-lang.org/?code=fn+main()+%7B%7D\"></a></div>",
2468            "</section></body></html>"
2469        );
2470        let md = extract_documentation(html);
2471        assert!(
2472            !md.contains("play.rust-lang.org"),
2473            "playground run button leaked into markdown: {md:?}"
2474        );
2475        assert!(!md.contains("[]("), "empty-text link survived: {md:?}");
2476        assert!(md.contains("let x = 1;"), "example code lost: {md:?}");
2477    }
2478
2479    #[test]
2480    fn test_orphan_since_middot_collapsed() {
2481        // rustdoc puts `<span class="since">1.0.0</span> \u{00b7} <src>` in a
2482        // method's right-side metadata. Stripping the source link leaves a
2483        // dangling middot that, once the <summary> is flattened, glues onto the
2484        // signature (`1.0.0 \u{00b7} fn ...`). It should collapse to a space.
2485        let html = concat!(
2486            "<html><body><section id=\"main-content\">",
2487            "<details class=\"toggle method-toggle\" open><summary>",
2488            "<section id=\"method.next\" class=\"method\">",
2489            "<span class=\"rightside\"><span class=\"since\" title=\"Stable since Rust version 1.0.0\">1.0.0</span>",
2490            " \u{00b7} <a class=\"src\" href=\"../../src/x.html#1\">Source</a></span>",
2491            "<h4 class=\"code-header\">fn <a href=\"#method.next\" class=\"fn\">next</a>(&amp;mut self)</h4>",
2492            "</section></summary></details>",
2493            "</section></body></html>"
2494        );
2495        let md = extract_documentation(html);
2496        assert!(
2497            md.contains("1.0.0 fn next"),
2498            "version not cleanly separated from signature: {md:?}"
2499        );
2500        assert!(
2501            !md.contains("1.0.0 \u{00b7}") && !md.contains("\u{00b7} fn next"),
2502            "orphan middot survived: {md:?}"
2503        );
2504    }
2505
2506    #[test]
2507    fn test_since_badge_separated_from_signature() {
2508        // On FFI structs (e.g. libc) the provided trait methods carry a
2509        // `<span class="since">1.0.0</span>` badge directly abutting the
2510        // method code-header with no middot or source link in between. When the
2511        // <summary> is flattened the badge fuses onto the signature
2512        // (`1.0.0fn clone_from`). It must be separated by a space.
2513        let html = concat!(
2514            "<html><body><section id=\"main-content\">",
2515            "<details class=\"toggle method-toggle\" open><summary>",
2516            "<section id=\"method.clone_from\" class=\"method trait-impl\">",
2517            "<span class=\"rightside\"><span class=\"since\" title=\"Stable since Rust version 1.0.0\">1.0.0</span></span>",
2518            "<a href=\"#method.clone_from\" class=\"anchor\">\u{00a7}</a>",
2519            "<h4 class=\"code-header\">fn <a href=\"#method.clone_from\" class=\"fn\">clone_from</a>(&amp;mut self, source: &amp;Self)</h4>",
2520            "</section></summary></details>",
2521            "</section></body></html>"
2522        );
2523        let md = extract_documentation(html);
2524        let text = extract_documentation_as_text(html);
2525        assert!(
2526            md.contains("1.0.0 fn clone_from"),
2527            "since badge glued onto signature (markdown): {md:?}"
2528        );
2529        assert!(
2530            !md.contains("1.0.0fn"),
2531            "since badge still fused in markdown: {md:?}"
2532        );
2533        assert!(
2534            !text.contains("1.0.0fn"),
2535            "since badge still fused in text: {text:?}"
2536        );
2537    }
2538
2539    #[test]
2540    fn test_generics_survive_summary_method_header() {
2541        // rustdoc wraps method-detail signatures in <details><summary>. The
2542        // summary's decoded text turns `Option&lt;usize&gt;` into literal
2543        // `Option<usize>`; without re-escaping, the second parse drops the
2544        // `<usize>`/`<Self::Item>` as if they were unknown tags.
2545        let html = concat!(
2546            "<html><body><section id=\"main-content\">",
2547            "<details class=\"toggle method-toggle\" open><summary>",
2548            "<section id=\"method.size_hint\" class=\"method\">",
2549            "<span class=\"rightside\"><span class=\"since\" title=\"Stable since Rust version 1.0.0\">1.0.0</span>",
2550            " \u{00b7} <a class=\"src\" href=\"../../src/x.html#1\">Source</a></span>",
2551            "<h4 class=\"code-header\">fn <a href=\"#method.size_hint\" class=\"fn\">size_hint</a>",
2552            "(&amp;self) -&gt; (<a class=\"primitive\" href=\"../primitive.usize.html\">usize</a>, ",
2553            "<a class=\"enum\" href=\"../option/enum.Option.html\">Option</a>&lt;",
2554            "<a class=\"primitive\" href=\"../primitive.usize.html\">usize</a>&gt;)</h4>",
2555            "</section></summary></details>",
2556            "</section></body></html>"
2557        );
2558        let md = extract_documentation(html);
2559        assert!(
2560            md.contains("Option<usize>"),
2561            "generic args dropped from summary method header (markdown): {md:?}"
2562        );
2563        let text = extract_documentation_as_text(html);
2564        assert!(
2565            text.contains("Option<usize>"),
2566            "generic args dropped from summary method header (text): {text:?}"
2567        );
2568    }
2569
2570    #[test]
2571    fn test_escape_html_text_reescapes_special_chars() {
2572        assert_eq!(escape_html_text("Vec<u8>"), "Vec&lt;u8&gt;");
2573        assert_eq!(escape_html_text("a & b"), "a &amp; b");
2574        assert_eq!(escape_html_text("Option<&T>"), "Option&lt;&amp;T&gt;");
2575    }
2576
2577    #[test]
2578    fn test_portability_badge_separated_from_item_name() {
2579        // rustdoc glues feature pills onto item names ("fs`fs`"); they should
2580        // render as a clearly separated parenthetical from the badge title.
2581        let html = concat!(
2582            "<html><body><section id=\"main-content\">",
2583            "<dl class=\"item-table\">",
2584            "<dt><a class=\"mod\" href=\"fs/index.html\">fs</a>",
2585            "<span class=\"stab portability\" title=\"Available on crate feature `fs` only\">",
2586            "<code>fs</code></span></dt><dd>Async files.</dd>",
2587            "<dt><a class=\"mod\" href=\"io/index.html\">io</a></dt><dd>Async IO.</dd>",
2588            "</dl></section></body></html>"
2589        );
2590        let md = extract_documentation(html);
2591        assert!(
2592            md.contains("fs (Available on crate feature `fs` only)"),
2593            "feature badge not separated/labelled: {md:?}"
2594        );
2595        // The glued form must be gone.
2596        assert!(!md.contains("fs`fs`"), "glued badge survived: {md:?}");
2597        // Items without a badge are untouched (no stray parens).
2598        assert!(
2599            md.contains("io — Async IO.") || md.contains("io —"),
2600            "io item altered: {md:?}"
2601        );
2602
2603        // Same separation in the plain-text format. The feature name renders
2604        // as a real code element, so plain text shows it undecorated.
2605        let text = extract_documentation_as_text(html);
2606        assert!(
2607            text.contains("fs (Available on crate feature fs only)"),
2608            "text badge not separated: {text:?}"
2609        );
2610    }
2611
2612    #[test]
2613    fn test_code_attribute_on_own_line() {
2614        // rustdoc puts declaration attributes in block-level
2615        // `<div class="code-attribute">` elements inside the item-decl <pre>.
2616        // The attribute must keep its own line, not glue onto the declaration
2617        // (regression: `#[repr(i8)]pub enum Ordering`).
2618        let html = concat!(
2619            "<html><body><section id=\"main-content\">",
2620            "<pre class=\"rust item-decl\"><code>",
2621            "<div class=\"code-attribute\">#[repr(i8)]</div>",
2622            "<div class=\"code-attribute\">#[non_exhaustive]</div>",
2623            "pub enum Ordering {\n    Less = -1,\n}</code></pre>",
2624            "</section></body></html>"
2625        );
2626        let md = extract_documentation(html);
2627        assert!(
2628            md.contains("#[repr(i8)]\npub enum Ordering")
2629                || md.contains("#[non_exhaustive]\npub enum Ordering"),
2630            "attribute glued onto declaration in markdown: {md:?}"
2631        );
2632        assert!(
2633            !md.contains("]pub enum"),
2634            "attribute still glued in markdown: {md:?}"
2635        );
2636
2637        let text = extract_documentation_as_text(html);
2638        assert!(
2639            !text.contains("]pub enum"),
2640            "attribute still glued in text: {text:?}"
2641        );
2642        // Both attributes are present, each on its own line.
2643        assert!(
2644            text.contains("#[repr(i8)]") && text.contains("#[non_exhaustive]"),
2645            "an attribute was dropped: {text:?}"
2646        );
2647    }
2648
2649    #[test]
2650    fn test_reexport_link_flattened_in_inline_code() {
2651        // rustdoc renders re-exports as `<code>pub use <a ...>name</a>;</code>`.
2652        // In markdown an anchor inside a backtick span cannot render, so the
2653        // link wrapper must be flattened to its text (`pub use name;`). The
2654        // html output format must keep the anchor.
2655        let html = concat!(
2656            "<html><body><section id=\"main-content\">",
2657            "<h2 id=\"reexports\">Re-exports</h2>",
2658            "<dl class=\"item-table reexports\"><dt id=\"reexport.rand_core\">",
2659            "<code>pub use <a class=\"mod\" ",
2660            "href=\"https://docs.rs/rand_core/0.10.0/rand_core/index.html\" ",
2661            "title=\"mod rand_core\">rand_core</a>;</code></dt></dl>",
2662            "</section></body></html>"
2663        );
2664        let md = extract_documentation(html);
2665        assert!(
2666            md.contains("`pub use rand_core;`"),
2667            "re-export code span malformed: {md:?}"
2668        );
2669        assert!(
2670            !md.contains("[rand_core]"),
2671            "unrenderable link survived inside code span: {md:?}"
2672        );
2673
2674        // The html output format keeps the anchor (browsers render it fine).
2675        let html_out = extract_documentation_html(html);
2676        assert!(
2677            html_out.contains("href=\"https://docs.rs/rand_core/0.10.0/rand_core/index.html\""),
2678            "html output dropped the re-export link: {html_out:?}"
2679        );
2680    }
2681
2682    #[test]
2683    fn test_code_fence_language_preserved() {
2684        // rustdoc annotates code blocks with a class (`rust rust-example-rendered`
2685        // for Rust examples, `language-<name>` for other fenced languages).
2686        // html2md drops this, emitting a bare ``` fence and losing the language
2687        // hint. It must be preserved in markdown only; the text and html
2688        // formats must be unaffected and free of the internal sentinel char.
2689        let html = concat!(
2690            "<div class=\"docblock\">",
2691            "<pre class=\"rust rust-example-rendered\"><code>let x = 1;</code></pre>",
2692            "<pre class=\"language-toml\"><code>v = 1</code></pre>",
2693            "<pre><code>plain</code></pre>",
2694            "</div>"
2695        );
2696        let md = extract_documentation(html);
2697        assert!(md.contains("```rust"), "rust fence hint missing: {md:?}");
2698        assert!(md.contains("```toml"), "toml fence hint missing: {md:?}");
2699        assert!(
2700            !md.contains('\u{2}'),
2701            "internal sentinel leaked into markdown: {md:?}"
2702        );
2703
2704        // Text and html formats must not gain fence hints or the sentinel.
2705        let text = extract_documentation_as_text(html);
2706        assert!(
2707            !text.contains('\u{2}'),
2708            "sentinel leaked into text: {text:?}"
2709        );
2710        assert!(
2711            !text.contains("```rust"),
2712            "text format gained a fence hint: {text:?}"
2713        );
2714
2715        let html_out = extract_documentation_html(html);
2716        assert!(
2717            !html_out.contains('\u{2}'),
2718            "sentinel leaked into html: {html_out:?}"
2719        );
2720    }
2721
2722    #[test]
2723    fn test_portability_badge_feature_with_underscore_not_escaped() {
2724        // A feature name containing an underscore is embedded in the badge
2725        // title inside literal backticks. It must render as a genuine code
2726        // span in markdown (no stray `\_` escape) and as undecorated text in
2727        // the plain-text format. Regression: `thread\_rng` leaked previously.
2728        let html = concat!(
2729            "<html><body><section id=\"main-content\">",
2730            "<div class=\"item-name\">",
2731            "<a class=\"fn\" href=\"fn.fill.html\">fill</a>",
2732            "<span class=\"stab portability\" ",
2733            "title=\"Available on crate feature `thread_rng` only\">",
2734            "<code>thread_rng</code></span></div>",
2735            "<div class=\"desc\">Fill any type.</div>",
2736            "</section></body></html>"
2737        );
2738        let md = extract_documentation(html);
2739        assert!(
2740            md.contains("Available on crate feature `thread_rng` only"),
2741            "feature code span malformed: {md:?}"
2742        );
2743        assert!(
2744            !md.contains("thread\\_rng"),
2745            "stray underscore escape in feature name: {md:?}"
2746        );
2747
2748        let text = extract_documentation_as_text(html);
2749        assert!(
2750            text.contains("Available on crate feature thread_rng only"),
2751            "text feature name malformed: {text:?}"
2752        );
2753    }
2754
2755    #[test]
2756    fn test_stab_badge_separated_from_item_name() {
2757        // rustdoc glues a stability pill onto the item name in module index
2758        // tables (e.g. `TryReserveErrorKindExperimental`); the marker should
2759        // render as a clearly separated parenthetical instead.
2760        let html = concat!(
2761            "<html><body><section id=\"main-content\">",
2762            "<dl class=\"item-table\">",
2763            "<dt><a class=\"enum\" href=\"enum.TryReserveErrorKind.html\">",
2764            "TryReserve<wbr>Error<wbr>Kind</a><wbr>",
2765            "<span class=\"stab unstable\" title=\"\">Experimental</span></dt>",
2766            "<dd>Details of the allocation.</dd>",
2767            "<dt><a class=\"enum\" href=\"enum.Plain.html\">Plain</a></dt><dd>Stable item.</dd>",
2768            "</dl></section></body></html>"
2769        );
2770        let md = extract_documentation(html);
2771        assert!(
2772            md.contains("TryReserveErrorKind (Experimental)"),
2773            "stab badge not separated/labelled: {md:?}"
2774        );
2775        // The glued form must be gone.
2776        assert!(
2777            !md.contains("KindExperimental"),
2778            "glued stab badge survived: {md:?}"
2779        );
2780        // Items without a badge are untouched (no stray parens).
2781        assert!(
2782            md.contains("Plain — Stable item."),
2783            "unbadged item altered: {md:?}"
2784        );
2785
2786        // Same separation in the plain-text format.
2787        let text = extract_documentation_as_text(html);
2788        assert!(
2789            text.contains("TryReserveErrorKind (Experimental)"),
2790            "text stab badge not separated: {text:?}"
2791        );
2792    }
2793
2794    #[test]
2795    fn test_deprecation_badge_separated_from_signature() {
2796        // rustdoc places the deprecation/stability badge in a
2797        // `<span class="item-info">` immediately after the signature, with no
2798        // separating whitespace. Inside a collapsed `<summary>` the flattened
2799        // text glued the badge onto the signature (e.g. `-> &str\u{1f44e}
2800        // Deprecated since 1.42.0: ...`). It must be space-separated instead.
2801        let html = concat!(
2802            "<html><body><section id=\"main-content\">",
2803            "<details class=\"toggle method-toggle\" open><summary>",
2804            "<section id=\"method.description\" class=\"method\">",
2805            "<h4 class=\"code-header\">fn <a href=\"#method.description\" class=\"fn\">description</a>",
2806            "(&amp;self) -&gt; &amp;<a class=\"primitive\" href=\"../primitive.str.html\">str</a></h4></section>",
2807            "<span class=\"item-info\"><div class=\"stab deprecated\">",
2808            "<span class=\"emoji\">\u{1f44e}</span>",
2809            "<span>Deprecated since 1.42.0: <p>use the Display impl or to_string()</p></span>",
2810            "</div></span></summary></details>",
2811            "</section></body></html>"
2812        );
2813        let md = extract_documentation(html);
2814        // The glued form must be gone; a space must separate signature & badge.
2815        assert!(
2816            !md.contains("str\u{1f44e}"),
2817            "deprecation badge glued onto signature (markdown): {md:?}"
2818        );
2819        assert!(
2820            md.contains("str \u{1f44e}") || md.contains("&str \u{1f44e}"),
2821            "deprecation badge not space-separated (markdown): {md:?}"
2822        );
2823        // Plain-text format must also separate them.
2824        let text = extract_documentation_as_text(html);
2825        assert!(
2826            !text.contains("str\u{1f44e}"),
2827            "deprecation badge glued onto signature (text): {text:?}"
2828        );
2829    }
2830
2831    #[test]
2832    fn test_blockquote_empty_marker_lines_removed() {
2833        // html2md pads blockquotes with empty `>` lines; the boundary/duplicate
2834        // markers must be removed while genuine paragraph breaks are preserved.
2835        let single = concat!(
2836            "<html><body><section id=\"main-content\">",
2837            "<blockquote><p><strong>Note here</strong></p></blockquote>",
2838            "<p>after</p></section></body></html>"
2839        );
2840        let md = extract_documentation(single);
2841        assert!(
2842            md.contains("> **Note here**"),
2843            "blockquote content missing: {md:?}"
2844        );
2845        // No empty `>` marker lines should survive.
2846        assert!(
2847            !md.lines().any(|l| l.trim() == ">"),
2848            "empty blockquote marker line survived: {md:?}"
2849        );
2850
2851        // A multi-paragraph blockquote keeps its internal separator line.
2852        let multi = concat!(
2853            "<html><body><section id=\"main-content\">",
2854            "<blockquote><p>First para.</p><p>Second para.</p></blockquote>",
2855            "</section></body></html>"
2856        );
2857        let md = extract_documentation(multi);
2858        assert!(
2859            md.contains("> First para.\n>\n> Second para."),
2860            "multi-paragraph blockquote break not preserved: {md:?}"
2861        );
2862    }
2863
2864    #[test]
2865    fn test_superscript_footnote_converted_in_markdown() {
2866        // html2md has no handler for <sup>/<sub>, so rustdoc footnote
2867        // references leak as raw HTML into the markdown (e.g.
2868        // `<sup id="fnref1"><a href="#fn1">1</a></sup>`). They must be converted
2869        // to plain-text `^(...)` notation with nested tags stripped.
2870        let html = concat!(
2871            "<html><body><section id=\"main-content\">",
2872            "<p>zero-padded to 2 digits. ",
2873            "<sup id=\"fnref1\"><a href=\"#fn1\">1</a></sup></p>",
2874            "<p>water is H<sub>2</sub>O.</p>",
2875            "</section></body></html>"
2876        );
2877        let md = extract_documentation(html);
2878        assert!(
2879            !md.contains("<sup") && !md.contains("</sup>") && !md.contains("<a href"),
2880            "superscript/anchor HTML leaked into markdown: {md:?}"
2881        );
2882        assert!(
2883            md.contains("2 digits. ^(1)"),
2884            "footnote reference not converted to ^(1): {md:?}"
2885        );
2886        assert!(
2887            md.contains("H_(2)O"),
2888            "subscript not converted to _(...): {md:?}"
2889        );
2890
2891        // The HTML output format must keep <sup>/<sub> intact (valid markup).
2892        let html_out = extract_documentation_html(html);
2893        assert!(
2894            html_out.contains("<sup") && html_out.contains("<sub"),
2895            "html format wrongly stripped super/subscript: {html_out:?}"
2896        );
2897    }
2898
2899    #[test]
2900    fn test_markdown_restores_space_after_inline_link() {
2901        // html2md drops a newline after an inline </a>, gluing the next word
2902        // onto the (downgraded) link, e.g. tokio docs: "moved into the
2903        // <a>tokio-stream</a>\ncrate.".
2904        let html = concat!(
2905            "<html><body><section id=\"main-content\">",
2906            "<p>moved into the <a href=\"https://docs.rs/tokio-stream\">tokio-stream</a>\n",
2907            "crate. See <a href=\"struct.X.html\">X</a>\nfor details.</p>",
2908            "</section></body></html>"
2909        );
2910        let md = extract_documentation(html);
2911        // External link keeps its URL; a space now follows it.
2912        assert!(
2913            md.contains("tokio-stream) crate"),
2914            "missing space after external link: {md:?}"
2915        );
2916        // Downgraded relative link is followed by a space, not glued.
2917        assert!(
2918            md.contains("See X for details"),
2919            "missing space after downgraded link: {md:?}"
2920        );
2921
2922        // A wrapped run before an opening parenthesis (parenthetical aside) must
2923        // also gain a space; html2md otherwise glues the `(` onto the link, e.g.
2924        // std slice docs: "the rules of references</a>\n(though ...".
2925        let aside = concat!(
2926            "<html><body><section id=\"main-content\">",
2927            "<p>would violate <a href=\"x.html\">the rules of references</a>\n",
2928            "(though possible).</p>",
2929            "</section></body></html>"
2930        );
2931        let aside_md = extract_documentation(aside);
2932        assert!(
2933            aside_md.contains("references (though"),
2934            "missing space before parenthetical aside: {aside_md:?}"
2935        );
2936
2937        // Negative: a function-style link with no whitespace before `(` stays
2938        // glued (no spurious space inserted into a call expression).
2939        let call = concat!(
2940            "<html><body><section id=\"main-content\">",
2941            "<p>call <a href=\"x.html\">foo</a>(arg) now</p>",
2942            "</section></body></html>"
2943        );
2944        let call_md = extract_documentation(call);
2945        assert!(
2946            call_md.contains("foo(arg)"),
2947            "spurious space inserted into call expression: {call_md:?}"
2948        );
2949    }
2950
2951    #[test]
2952    fn test_markdown_preserves_code_block_whitespace() {
2953        // The inline-whitespace fix must not touch <pre> contents: highlighted
2954        // code blocks wrap <a>/<span> tokens across indented newlines.
2955        let html = concat!(
2956            "<html><body><section id=\"main-content\">",
2957            "<pre><code>fn main() {\n",
2958            "    let x =\n",
2959            "        <a href=\"x.html\">HashMap</a>::new();\n",
2960            "}</code></pre>",
2961            "</section></body></html>"
2962        );
2963        let md = extract_documentation(html);
2964        // Indentation inside the code block is preserved (not collapsed to a
2965        // single leading space).
2966        assert!(
2967            md.contains("    let x ="),
2968            "code block indentation collapsed: {md:?}"
2969        );
2970    }
2971
2972    #[test]
2973    fn test_markdown_unescapes_identifiers_outside_code() {
2974        let html = concat!(
2975            "<html><body><section id=\"main-content\">",
2976            "<h1>Crate serde_json</h1>",
2977            "<p>Use <code>serde_json::value</code> to build <code>Vec&lt;u8&gt;</code>.</p>",
2978            "<p>pub fn get(&amp;self) -&gt; Option&lt;&amp;Value&gt;</p>",
2979            "<pre><code>let v: Vec&lt;u8&gt; = path\\to;</code></pre>",
2980            "</section></body></html>"
2981        );
2982        let md = extract_documentation(html);
2983        // Escapes removed from ordinary text and signatures.
2984        assert!(
2985            md.contains("Crate serde_json"),
2986            "heading still escaped: {md:?}"
2987        );
2988        assert!(
2989            md.contains("-> Option<&Value>"),
2990            "signature still escaped: {md:?}"
2991        );
2992        assert!(!md.contains("\\_"), "stray underscore escape: {md:?}");
2993        assert!(
2994            !md.contains("\\<") && !md.contains("\\>"),
2995            "stray angle escape: {md:?}"
2996        );
2997        // Inline code span is preserved verbatim (no escaping introduced).
2998        assert!(
2999            md.contains("`serde_json::value`"),
3000            "inline code mangled: {md:?}"
3001        );
3002        // Fenced code content (a genuine backslash) is left untouched.
3003        assert!(md.contains("path\\to"), "fenced backslash altered: {md:?}");
3004    }
3005
3006    #[test]
3007    fn test_clean_html_strips_oddly_formatted_block_elements() {
3008        // Navigation/header/footer/aside elements must be removed even when
3009        // their source markup is not formatted the way html5ever serializes it
3010        // (e.g. extra whitespace inside the tag). Previously the cleanup relied
3011        // on string-matching the serialized element against the raw HTML, which
3012        // silently leaked such elements into the output.
3013        let html = concat!(
3014            "<html><body><section id=\"main-content\">",
3015            "<nav  class=\"sidebar\">NAVLEAK</nav>",
3016            "<header  data-x=\"1\">HEADERLEAK</header>",
3017            "<footer   >FOOTERLEAK</footer>",
3018            "<aside  role=\"note\">ASIDELEAK</aside>",
3019            "<p>Real doc.</p>",
3020            "</section></body></html>"
3021        );
3022        let cleaned = clean_html(html);
3023        for leak in ["NAVLEAK", "HEADERLEAK", "FOOTERLEAK", "ASIDELEAK"] {
3024            assert!(!cleaned.contains(leak), "{leak} leaked: {cleaned}");
3025        }
3026        assert!(cleaned.contains("Real doc."), "content dropped: {cleaned}");
3027    }
3028
3029    #[test]
3030    fn test_clean_html_removes_source_links() {
3031        let html = concat!(
3032            "<html><body><section id=\"main-content\">",
3033            "<a class=\"src rightside\" href=\"../src/foo/lib.rs.html#1-2\">Source</a>",
3034            "<a class=\"src\" href=\"../src/foo/lib.rs.html#5\">Source</a>",
3035            "<p>Real documentation text.</p>",
3036            "</section></body></html>"
3037        );
3038        // Plain-text extraction must not leak the "Source" link labels.
3039        let text = extract_documentation_as_text(html);
3040        assert!(text.contains("Real documentation text."));
3041        assert!(!text.contains("Source"), "source label leaked: {text}");
3042    }
3043
3044    #[test]
3045    fn test_html_to_text_superscript_uses_caret_notation() {
3046        // In plain text a bare footnote number is easily mistaken for body
3047        // text; <sup>/<sub> should render as `^(...)`/`_(...)`, matching the
3048        // markdown path.
3049        let html = "<p>zero-padded to 2 digits. <sup id=\"f\"><a href=\"#fn1\">1</a></sup></p>                    <p>water is H<sub>2</sub>O.</p>";
3050        let text = html_to_text(html);
3051        assert!(
3052            text.contains("2 digits. ^(1)"),
3053            "superscript not rendered as ^(1): {text:?}"
3054        );
3055        assert!(
3056            text.contains("H_(2)O"),
3057            "subscript not rendered as _(2): {text:?}"
3058        );
3059        // No bare anchor/tag leakage.
3060        assert!(
3061            !text.contains("<sup") && !text.contains("<a href"),
3062            "raw tags leaked into text: {text:?}"
3063        );
3064    }
3065
3066    #[test]
3067    fn test_html_to_text_table_rows_stay_on_one_line() {
3068        // Table cells in a row must render on a single line joined by ` | `
3069        // (not scattered one-cell-per-line), so the row's columns stay
3070        // associated in the plain-text output.
3071        let html = concat!(
3072            "<table><thead><tr><th>Spec.</th><th>Example</th><th>Description</th></tr></thead>",
3073            "<tbody><tr><td>%Y</td><td>2001</td><td>The full year.</td></tr>",
3074            "<tr><td>%m</td><td>07</td><td>Month number.</td></tr></tbody></table>"
3075        );
3076        let text = html_to_text(html);
3077        assert!(
3078            text.contains("Spec. | Example | Description"),
3079            "header row not joined with ` | `: {text:?}"
3080        );
3081        assert!(
3082            text.contains("%Y | 2001 | The full year."),
3083            "data row not joined with ` | `: {text:?}"
3084        );
3085        // Distinct rows remain on separate lines.
3086        assert!(
3087            text.contains("The full year.\n%m | 07 | Month number."),
3088            "rows not on separate lines: {text:?}"
3089        );
3090    }
3091
3092    #[test]
3093    fn test_html_to_text_table_preserves_empty_leading_cell() {
3094        // A table whose header has an empty leading (row-label) cell must keep
3095        // that empty cell in the text output so the header columns stay aligned
3096        // with the data rows (header and every data row keep the same column
3097        // count).
3098        let html = concat!(
3099            "<table><thead><tr><th></th><th>get(i)</th><th>insert(i)</th></tr></thead>",
3100            "<tbody><tr><td>Vec</td><td>O(1)</td><td>O(n-i)</td></tr></tbody></table>"
3101        );
3102        let text = html_to_text(html);
3103        let header = text
3104            .lines()
3105            .find(|l| l.contains("get(i)"))
3106            .expect("header row missing");
3107        let data = text
3108            .lines()
3109            .find(|l| l.contains("Vec"))
3110            .expect("data row missing");
3111        // Both rows must have the same number of ` | `-joined columns (3).
3112        assert_eq!(
3113            header.matches('|').count(),
3114            data.matches('|').count(),
3115            "header/data column counts misaligned: header={header:?} data={data:?}"
3116        );
3117        assert_eq!(
3118            data.trim(),
3119            "Vec | O(1) | O(n-i)",
3120            "data row not joined correctly: {data:?}"
3121        );
3122    }
3123
3124    #[test]
3125    fn test_html_to_text_drops_empty_spacer_rows() {
3126        // Some tables insert all-empty "visual spacer" rows between data rows.
3127        // In text these must be dropped, not rendered as content-free `| |`
3128        // noise; rows with any content are kept (with their column structure).
3129        let html = concat!(
3130            "<table><tbody>",
3131            "<tr><td>%h</td><td>Jul</td><td>Same as %b.</td></tr>",
3132            "<tr><td></td><td></td><td></td></tr>",
3133            "<tr><td>%d</td><td>08</td><td>Day number.</td></tr>",
3134            "</tbody></table>"
3135        );
3136        let text = html_to_text(html);
3137        assert!(
3138            !text.lines().any(|l| l.trim() == "| |" || l.trim() == "|"),
3139            "empty spacer row rendered as pipe noise: {text:?}"
3140        );
3141        // Genuine data rows are preserved.
3142        assert!(
3143            text.contains("%h | Jul | Same as %b.") && text.contains("%d | 08 | Day number."),
3144            "data rows lost: {text:?}"
3145        );
3146    }
3147
3148    #[test]
3149    fn test_structfield_spans_render_on_separate_lines() {
3150        // rustdoc emits one `<span class="structfield section-header">` per
3151        // field with no separating whitespace; adjacent spans must not glue
3152        // (markdown `a: A``b: B`) or fuse tokens in text (`A_tb`).
3153        let html = concat!(
3154            "<html><body><section id=\"main-content\">",
3155            "<h2>Fields</h2>",
3156            "<span id=\"structfield.sa_family\" class=\"structfield section-header\">",
3157            "<a href=\"#structfield.sa_family\" class=\"anchor field\">\u{a7}</a>",
3158            "<code>sa_family: <a class=\"type\" href=\"type.sa_family_t.html\">sa_family_t</a></code></span>",
3159            "<span id=\"structfield.sa_data\" class=\"structfield section-header\">",
3160            "<a href=\"#structfield.sa_data\" class=\"anchor field\">\u{a7}</a>",
3161            "<code>sa_data: [<a class=\"type\" href=\"type.c_char.html\">c_char</a>; 14]</code></span>",
3162            "</section></body></html>"
3163        );
3164        let text = extract_documentation_as_text(html);
3165        assert!(
3166            !text.contains("sa_family_tsa_data"),
3167            "struct field tokens fused in text: {text:?}"
3168        );
3169        assert!(
3170            text.contains("sa_family: sa_family_t") && text.contains("sa_data: [c_char; 14]"),
3171            "field declarations missing in text: {text:?}"
3172        );
3173        let md = extract_documentation(html);
3174        // Each field is on its own line (no two field decls share a line).
3175        assert!(
3176            !md.lines()
3177                .any(|l| l.contains("sa_family") && l.contains("sa_data")),
3178            "struct fields glued on one line in markdown: {md:?}"
3179        );
3180    }
3181
3182    #[test]
3183    fn test_html_to_text_separates_block_elements() {
3184        // Adjacent block elements (item-index entries, list items, table cells)
3185        // must not glue their text together in the plain-text output.
3186        let html = "<ul><li>Dl_info</li><li>Elf32_Chdr</li><li>Foo</li></ul>";
3187        let text = html_to_text(html);
3188        assert!(
3189            !text.contains("Dl_infoElf32"),
3190            "block text glued together: {text}"
3191        );
3192        assert!(
3193            text.contains("Dl_info\nElf32_Chdr\nFoo"),
3194            "blocks not on separate lines: {text}"
3195        );
3196    }
3197
3198    #[test]
3199    fn test_item_index_table_renders_as_separate_items() {
3200        // docs.rs renders crate/module overview item indexes as
3201        // <dl class="item-table"><dt>name</dt><dd>summary</dd>...</dl>.
3202        // Without rewriting, html2md concatenates every name onto one line.
3203        let html = concat!(
3204            "<html><body><section id=\"main-content\">",
3205            "<dl class=\"item-table\">",
3206            "<dt><a class=\"struct\" href=\"struct.Dl_info.html\">Dl_info</a></dt>",
3207            "<dt><a class=\"struct\" href=\"struct.Elf32_Chdr.html\">Elf32_Chdr</a></dt>",
3208            "<dt><a class=\"trait\" href=\"trait.Foo.html\">Foo</a></dt>",
3209            "<dd>A foo trait.</dd>",
3210            "</dl></section></body></html>"
3211        );
3212        let md = extract_documentation(html);
3213        // Item names must not be glued together (html2md escapes `_` as `\_`,
3214        // so the broken output would contain `info` directly before `Elf32`).
3215        assert!(!md.contains("infoElf32"), "item names concatenated: {md}");
3216        // Each item appears (allowing markdown underscore escaping), the
3217        // description is preserved, and entries are emitted as separate
3218        // markdown list items (one per line).
3219        assert!(
3220            md.contains("Dl") && md.contains("info"),
3221            "missing Dl_info: {md}"
3222        );
3223        assert!(md.contains("Elf32"), "missing Elf32_Chdr: {md}");
3224        assert!(md.contains("Foo"), "missing Foo: {md}");
3225        assert!(md.contains("A foo trait."), "missing description: {md}");
3226        assert!(
3227            md.matches("* ").count() >= 3,
3228            "expected separate list items, got: {md}"
3229        );
3230    }
3231
3232    #[test]
3233    fn test_extract_documentation_html_returns_clean_main_content() {
3234        let html = concat!(
3235            "<!DOCTYPE html><html><head><link rel=\"search\" href=\"/opensearch.xml\">",
3236            "<script>var x=1;</script></head><body><nav>Nav</nav>",
3237            "<section id=\"main-content\"><h1>Crate foo</h1><p>Body text.</p>",
3238            "<a class=\"src\" href=\"../src/foo.rs.html\">Source</a></section>",
3239            "<footer>Footer</footer></body></html>"
3240        );
3241        let out = extract_documentation_html(html);
3242        // Documentation body is preserved as HTML.
3243        assert!(out.contains("Body text."), "missing body: {out}");
3244        assert!(out.contains("<h1>") || out.contains("Crate foo"));
3245        // Page chrome and noise are gone.
3246        assert!(!out.contains("<!DOCTYPE"), "doctype leaked: {out}");
3247        assert!(!out.contains("opensearch"), "head link leaked: {out}");
3248        assert!(!out.contains("<script"), "script leaked: {out}");
3249        assert!(!out.contains("Nav"), "nav leaked: {out}");
3250        assert!(!out.contains("Footer"), "footer leaked: {out}");
3251        assert!(!out.contains("Source"), "src link leaked: {out}");
3252    }
3253
3254    #[test]
3255    fn test_clean_html_removes_script() {
3256        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
3257        let cleaned = clean_html(html);
3258        assert!(!cleaned.contains("script"));
3259        assert!(!cleaned.contains("var x"));
3260        assert!(cleaned.contains("Hello"));
3261    }
3262
3263    #[test]
3264    fn test_clean_html_strips_details_toggle_wrappers() {
3265        let html = r#"<html><body><section id="main-content"><details class="toggle top-doc" open=""><summary>Expand description</summary><h2>MyCrate</h2><p>Useful docs.</p></details></section></body></html>"#;
3266        let cleaned = clean_html(html);
3267        assert!(!cleaned.contains("<details"));
3268        assert!(!cleaned.contains("</details>"));
3269        assert!(!cleaned.contains("Expand description"));
3270        // Inner content must be preserved.
3271        assert!(cleaned.contains("MyCrate"));
3272        assert!(cleaned.contains("Useful docs."));
3273    }
3274
3275    #[test]
3276    fn test_extract_documentation_as_text_strips_ui_cruft() {
3277        let html = concat!(
3278            "<html><body><section id=\"main-content\">",
3279            "<button>Copy item path</button>",
3280            "<a class=\"anchor\" href=\"#x\">\u{00a7}</a>",
3281            "<details class=\"toggle top-doc\" open=\"\"><summary>Expand description</summary>",
3282            "<p>Real documentation text.</p></details>",
3283            "</section></body></html>"
3284        );
3285        let text = extract_documentation_as_text(html);
3286        assert!(text.contains("Real documentation text."));
3287        assert!(!text.contains("Copy item path"));
3288        assert!(!text.contains("Expand description"));
3289        assert!(!text.contains('\u{00a7}'));
3290    }
3291
3292    #[test]
3293    fn test_text_strips_trailing_orphan_middot() {
3294        // The out-of-band stability row (`1.0.0 \u{00b7} <source>`) leaves a
3295        // dangling middot once the source link is stripped.
3296        let html = concat!(
3297            "<html><body><section id=\"main-content\">",
3298            "<div class=\"out-of-band\">1.0.0 \u{00b7} ",
3299            "<a class=\"src\" href=\"../src/x.rs.html\">source</a></div>",
3300            "<p>Body text.</p>",
3301            "</section></body></html>"
3302        );
3303        let text = extract_documentation_as_text(html);
3304        assert!(text.contains("Body text."), "body dropped: {text:?}");
3305        assert!(
3306            !text.contains("1.0.0 \u{00b7}"),
3307            "orphan middot survived in text: {text:?}"
3308        );
3309    }
3310
3311    #[test]
3312    fn test_extract_documentation_has_no_details_markup() {
3313        let html = r#"<html><body><section id="main-content"><details class="toggle top-doc" open=""><summary>Expand description</summary><h2>MyCrate</h2><p>Hello world.</p></details></section></body></html>"#;
3314        let md = extract_documentation(html);
3315        assert!(!md.contains("<details"));
3316        assert!(!md.contains("Expand description"));
3317        assert!(md.contains("MyCrate"));
3318        assert!(md.contains("Hello world."));
3319    }
3320
3321    #[test]
3322    fn test_clean_html_removes_dangerous_elements_with_irregular_whitespace() {
3323        // html5ever normalizes `<script  defer >` to `<script defer>`, which
3324        // defeats the DOM serialize+string-replace pass. The regex pre-strip
3325        // must still remove these so no executable/style/embedded content leaks
3326        // into the html output format.
3327        let html = concat!(
3328            "<html><body><section id=\"main-content\">",
3329            "<script  defer >alert('xss')</script>",
3330            "<STYLE type=\"text/css\" >.evil{color:red}</STYLE>",
3331            "<noscript >NoScriptContent</noscript>",
3332            "<iframe  src=\"http://evil.example\"></iframe>",
3333            "<p>Safe documentation.</p>",
3334            "</section></body></html>"
3335        );
3336        let cleaned = clean_html(html);
3337        assert!(!cleaned.contains("alert"), "script leaked: {cleaned}");
3338        assert!(!cleaned.contains(".evil"), "style leaked: {cleaned}");
3339        assert!(
3340            !cleaned.contains("NoScriptContent"),
3341            "noscript leaked: {cleaned}"
3342        );
3343        assert!(
3344            !cleaned.contains("evil.example"),
3345            "iframe leaked: {cleaned}"
3346        );
3347        assert!(cleaned.contains("Safe documentation."));
3348    }
3349
3350    #[test]
3351    fn test_clean_html_removes_style() {
3352        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
3353        let cleaned = clean_html(html);
3354        assert!(!cleaned.contains("style"));
3355        assert!(!cleaned.contains(".foo"));
3356        assert!(cleaned.contains("Content"));
3357    }
3358
3359    #[test]
3360    fn test_html_to_text_removes_tags() {
3361        let html = "<p>Hello <strong>World</strong>!</p>";
3362        let text = html_to_text(html);
3363        assert!(!text.contains('<'));
3364        assert!(!text.contains('>'));
3365        assert!(text.contains("Hello"));
3366        assert!(text.contains("World"));
3367    }
3368
3369    #[test]
3370    fn test_html_to_text_excludes_script_and_style_recursively() {
3371        // Regression: skip-tag exclusion must be recursive. Script/style content
3372        // nested anywhere in the tree must not leak into the plain-text output.
3373        let html = "<body>Hello<script>var secret = 1;</script>                    <div><style>.x{color:red}</style>World</div>                    <noscript>NOSCRIPT</noscript></body>";
3374        let text = html_to_text(html);
3375        assert!(text.contains("Hello"), "text: {text}");
3376        assert!(text.contains("World"), "text: {text}");
3377        assert!(!text.contains("secret"), "script content leaked: {text}");
3378        assert!(!text.contains("color:red"), "style content leaked: {text}");
3379        assert!(
3380            !text.contains("NOSCRIPT"),
3381            "noscript content leaked: {text}"
3382        );
3383    }
3384
3385    #[test]
3386    fn test_html_to_text_preserves_inline_runs() {
3387        // Regression: words split across inline elements (e.g. docs.rs `<wbr>`
3388        // hints or syntax-highlight spans) and punctuation directly following an
3389        // inline element must not gain spurious spaces.
3390        let html = "<body><p>de<wbr>serializing data</p>\n<div><code>RandomState</code>, <code>Global</code>&gt;</div></body>";
3391        let text = html_to_text(html);
3392        assert!(text.contains("deserializing"), "split word: {text}");
3393        assert!(!text.contains("de serializing"), "spurious space: {text}");
3394        assert!(text.contains("RandomState,"), "space before comma: {text}");
3395        // Block elements are now separated by a newline rather than a space.
3396        assert!(
3397            text.contains("data\nRandomState"),
3398            "lost block separation: {text}"
3399        );
3400    }
3401
3402    #[test]
3403    fn test_html_to_text_handles_entities() {
3404        // Test that HTML entities are converted to their character equivalents
3405        // amp entity should be decoded to &
3406        let html = r"<p>Tom & Jerry</p>";
3407        let text = html_to_text(html);
3408        // The function should decode amp entity
3409        assert!(text.contains('&') || text.contains("Tom") || text.contains("Jerry"));
3410    }
3411
3412    #[test]
3413    fn test_clean_whitespace() {
3414        assert_eq!(clean_whitespace(" hello world "), "hello world");
3415        // Multi-space boundary test
3416        assert_eq!(clean_whitespace("  hello    world  "), "hello world");
3417        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
3418    }
3419
3420    #[test]
3421    fn test_extract_documentation() {
3422        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
3423        let docs = extract_documentation(html);
3424        assert!(docs.contains("Title"));
3425        assert!(docs.contains("Content"));
3426    }
3427
3428    #[test]
3429    fn test_extract_search_results_crate_fallback_adds_note() {
3430        // A crate-landing page (starts with "Crate ") used as fallback for an
3431        // item lookup must surface an honest note.
3432        let html = "<html><body><section id=\"main-content\"><h1>Crate serde</h1><p>Crate docs.</p></section></body></html>";
3433        let result = extract_search_results(html, "DoesNotExist");
3434        assert!(result.contains("## Documentation: DoesNotExist"));
3435        assert!(
3436            result.contains("No dedicated documentation page was found"),
3437            "missing fallback note: {result}"
3438        );
3439    }
3440
3441    #[test]
3442    fn test_extract_search_results_direct_item_no_note() {
3443        // A real item page (starts with its kind) must NOT get the fallback note.
3444        let html = "<html><body><section id=\"main-content\"><h1>Function spawn</h1><p>Spawns.</p></section></body></html>";
3445        let result = extract_search_results(html, "spawn");
3446        assert!(result.contains("## Documentation: spawn"));
3447        assert!(!result.contains("No dedicated documentation page was found"));
3448    }
3449
3450    #[test]
3451    fn test_extract_search_results_found() {
3452        let html = "<html><body><h1>Result</h1></body></html>";
3453        let result = extract_search_results(html, "serde::Serialize");
3454        assert!(result.contains("Documentation"));
3455        assert!(result.contains("serde::Serialize"));
3456        assert!(result.contains("Result"));
3457    }
3458
3459    #[test]
3460    fn test_extract_search_results_not_found() {
3461        let html = "<html><body></body></html>";
3462        let result = extract_search_results(html, "nonexistent");
3463        assert!(result.contains("not found"));
3464        assert!(result.contains("nonexistent"));
3465    }
3466
3467    #[test]
3468    fn test_is_item_fallback_page_parent_type_fallback() {
3469        // Requesting a method (`Value::is_null`) resolves to the containing
3470        // type's page (`Enum Value`); the heading names `Value`, not the
3471        // requested leaf `is_null`, so it must be flagged as a fallback.
3472        let html = "<html><body><section id=\"main-content\"><h1>Enum serde_json::Value</h1><p>An enum.</p></section></body></html>";
3473        assert!(is_item_fallback_page(html, "Value::is_null"));
3474        // The markdown path must surface the note for this parent fallback.
3475        let result = extract_search_results(html, "Value::is_null");
3476        assert!(
3477            result.contains("No dedicated documentation page was found"),
3478            "parent fallback note missing: {result}"
3479        );
3480    }
3481
3482    #[test]
3483    fn test_is_item_fallback_page_direct_hit_not_flagged() {
3484        // A dedicated item page's heading contains the requested leaf.
3485        let html = "<html><body><section id=\"main-content\"><h1>Trait serde::Serialize</h1><p>A trait.</p></section></body></html>";
3486        assert!(!is_item_fallback_page(html, "serde::Serialize"));
3487        assert!(!is_item_fallback_page(html, "Serialize"));
3488        // A re-exported function resolved at its canonical path still matches.
3489        let fn_html = "<html><body><section id=\"main-content\"><h1>Function tokio::task::spawn</h1></section></body></html>";
3490        assert!(!is_item_fallback_page(fn_html, "tokio::spawn"));
3491    }
3492
3493    #[test]
3494    fn test_is_item_fallback_page_crate_overview_fallback() {
3495        let html = "<html><body><section id=\"main-content\"><h1>Crate serde</h1><p>Docs.</p></section></body></html>";
3496        assert!(is_item_fallback_page(html, "DoesNotExist"));
3497    }
3498
3499    #[test]
3500    fn test_is_item_fallback_page_no_heading_does_not_warn() {
3501        // Without an <h1> we cannot tell; do not over-warn.
3502        let html = "<html><body><section id=\"main-content\"><p>No heading here.</p></section></body></html>";
3503        assert!(!is_item_fallback_page(html, "Foo::bar"));
3504    }
3505
3506    #[test]
3507    fn test_heading_contains_identifier_is_token_exact() {
3508        // Partial substring matches must not count.
3509        assert!(!heading_contains_identifier("Struct this::That", "is"));
3510        assert!(heading_contains_identifier(
3511            "Struct serde_json::Value",
3512            "Value"
3513        ));
3514        assert!(heading_contains_identifier("Method is_null", "is_null"));
3515    }
3516
3517    #[test]
3518    fn test_clean_html_removes_link_tags() {
3519        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
3520        let cleaned = clean_html(html);
3521        assert!(
3522            !cleaned.contains("link"),
3523            "link tag should be removed, got: {cleaned}"
3524        );
3525        assert!(
3526            !cleaned.contains("stylesheet"),
3527            "stylesheet should be removed, got: {cleaned}"
3528        );
3529        assert!(
3530            cleaned.contains("Hello"),
3531            "Body content should remain, got: {cleaned}"
3532        );
3533    }
3534
3535    #[test]
3536    fn test_clean_html_removes_meta_tags() {
3537        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
3538        let cleaned = clean_html(html);
3539        assert!(
3540            !cleaned.contains("meta"),
3541            "meta tag should be removed, got: {cleaned}"
3542        );
3543        assert!(
3544            cleaned.contains("Content"),
3545            "Body content should remain, got: {cleaned}"
3546        );
3547    }
3548
3549    #[test]
3550    fn test_relative_link_regex() {
3551        // Test that RELATIVE_LINK_REGEX only matches relative .html links
3552        let re = &RELATIVE_LINK_REGEX;
3553
3554        // Should match - relative .html links
3555        assert!(re.is_match("[module](module/index.html)"));
3556        assert!(re.is_match("[struct](struct.Struct.html)"));
3557        assert!(re.is_match("[tokio](../index.html)"));
3558        assert!(re.is_match("[crate](./index.html)"));
3559        assert!(re.is_match("[root](/serde/index.html)"));
3560        // Module paths beginning with `_` or digits (e.g. clap's `_derive`).
3561        assert!(re.is_match("[tutorial](_derive/_tutorial/index.html)"));
3562        assert!(re.is_match("[v2](2/index.html)"));
3563
3564        // Should NOT match
3565        assert!(!re.is_match("[Section](#section)")); // Anchor link
3566        assert!(
3567            !re.is_match("[External](https://example.com)"),
3568            "Should not match external URLs"
3569        ); // External URL
3570    }
3571
3572    #[test]
3573    fn test_clean_markdown_keeps_external_html_links() {
3574        // Absolute external links that happen to end in `.html` must keep their
3575        // URL rather than being downgraded to bare label text.
3576        let md = "See the [Guide](https://example.com/book/ch01.html) for details.";
3577        let out = clean_markdown(md);
3578        assert!(
3579            out.contains("[Guide](https://example.com/book/ch01.html)"),
3580            "external link should be preserved, got: {out}"
3581        );
3582    }
3583
3584    #[test]
3585    fn test_clean_markdown_relative_links_keep_text() {
3586        // clap-style underscore module links must be rewritten to their text,
3587        // not left as broken docs.rs-relative links.
3588        let md =
3589            "Derive [tutorial](_derive/_tutorial/index.html) and [reference](_derive/index.html).";
3590        let out = clean_markdown(md);
3591        assert!(!out.contains(".html"), "relative link survived: {out}");
3592        assert!(!out.contains("_derive"), "relative target survived: {out}");
3593        assert!(
3594            out.contains("Derive tutorial and reference."),
3595            "text not kept: {out}"
3596        );
3597    }
3598
3599    #[test]
3600    fn test_clean_markdown_relative_link_with_bracketed_label() {
3601        // Intra-doc links whose label contains `]` (Rust attribute syntax
3602        // `#[tokio::main]`, slice/array types `[u8]`, `[T; N]`) must still be
3603        // downgraded to their text. Previously the label pattern stopped at the
3604        // first `]`, leaving a broken docs.rs-relative `.html` link.
3605        let md = concat!(
3606            "Use [`#[tokio::main]`](attr.main.html) and the slice ",
3607            "[`[u8]`](primitive.slice.html) plus [Foo](struct.Foo.html)."
3608        );
3609        let out = clean_markdown(md);
3610        assert!(!out.contains(".html"), "relative link survived: {out}");
3611        assert!(
3612            !out.contains("](attr"),
3613            "bracketed-label link survived: {out}"
3614        );
3615        assert!(
3616            out.contains("`#[tokio::main]`"),
3617            "attribute label text dropped: {out}"
3618        );
3619        assert!(out.contains("`[u8]`"), "slice label text dropped: {out}");
3620        assert!(out.contains("Foo"), "plain label text dropped: {out}");
3621    }
3622
3623    #[test]
3624    fn test_negative_impl_trait_not_rendered_as_image() {
3625        // rustdoc negative auto-trait impls (`impl<T> !Freeze for Mutex<T>`)
3626        // place a text `!` directly before the linkified trait, which html2md
3627        // fuses into `![Freeze](url)` \u{2014} markdown image syntax that renders as
3628        // a broken embedded image. The `!` must be backslash-escaped so it stays
3629        // literal text.
3630        let input = concat!(
3631            "### impl<T> ![Freeze]",
3632            "(https://doc.rust-lang.org/nightly/core/marker/trait.Freeze.html)",
3633            " for Mutex<T>\n"
3634        );
3635        let md = clean_markdown(input);
3636        assert!(
3637            md.contains(r"\![Freeze]"),
3638            "negative-impl marker not escaped: {md:?}"
3639        );
3640        assert!(
3641            !md.contains("> ![Freeze]"),
3642            "unescaped image syntax survived: {md:?}"
3643        );
3644    }
3645
3646    #[test]
3647    fn test_clean_markdown_removes_old_rustdoc_artifacts() {
3648        // The minus sign below is U+2212 as emitted by older rustdoc toggles.
3649        let md = concat!(
3650            "Crate [serde]() [ [\u{2212}] ](javascript:void(0)) ",
3651            "[[src]](../src/serde/lib.rs.html#9-267) [\u{24d8}](#)\n\nReal content ",
3652            "[External](https://serde.rs/) [Quick start](#quick-start)."
3653        );
3654        let out = clean_markdown(md);
3655        assert!(!out.contains("javascript:"), "js link leaked: {out}");
3656        assert!(
3657            !out.contains("src/serde/lib.rs.html"),
3658            "src link leaked: {out}"
3659        );
3660        assert!(!out.contains("[[src]]"), "src label leaked: {out}");
3661        assert!(!out.contains("]()"), "empty link leaked: {out}");
3662        // Useful text is preserved (empty link label downgraded to text).
3663        assert!(out.contains("serde"));
3664        assert!(out.contains("Real content"));
3665        // External non-.html links are preserved.
3666        assert!(out.contains("https://serde.rs/"));
3667        // No-op fragment-only toggles are removed, real anchors preserved.
3668        assert!(!out.contains("(#)"), "fragment toggle leaked: {out}");
3669        assert!(out.contains("#quick-start"), "real anchor dropped: {out}");
3670    }
3671
3672    #[test]
3673    fn test_clean_markdown_keeps_named_fragment_link_text() {
3674        // Versioned docs.rs pages render the crate name in the h1 as
3675        // `<a class="mod" href="#">serde</a>`, which becomes `[serde](#)` in
3676        // markdown. The label must survive (only symbol toggles are dropped).
3677        let md = "Crate [serde](#) [ⓘ](#)\n\nbody";
3678        let out = clean_markdown(md);
3679        assert!(out.contains("Crate serde"), "crate name dropped: {out}");
3680        assert!(!out.contains("(#)"), "fragment link syntax leaked: {out}");
3681        assert!(!out.contains("ⓘ"), "symbol toggle leaked: {out}");
3682    }
3683
3684    #[test]
3685    fn test_clean_markdown_drops_relative_read_more_keeps_absolute() {
3686        // rustdoc appends a "Read more" link to inherited/derived method
3687        // summaries. A docs.rs-relative target is unreachable and would be
3688        // downgraded to a dangling "Read more"; it must be dropped entirely.
3689        // An absolute (scheme://) target stays a usable link.
3690        let md = "Returns a duplicate of the value. [Read more](../clone/trait.Clone.html#tymethod.clone)";
3691        let out = clean_markdown(md);
3692        assert_eq!(
3693            out.trim(),
3694            "Returns a duplicate of the value.",
3695            "relative Read more affordance not dropped cleanly: {out:?}"
3696        );
3697        let md2 = "Formats the value. [Read more](https://doc.rust-lang.org/core/fmt/trait.Debug.html#tymethod.fmt)";
3698        let out2 = clean_markdown(md2);
3699        assert!(
3700            out2.contains(
3701                "[Read more](https://doc.rust-lang.org/core/fmt/trait.Debug.html#tymethod.fmt)"
3702            ),
3703            "absolute Read more link wrongly dropped: {out2:?}"
3704        );
3705    }
3706
3707    #[test]
3708    fn test_clean_markdown_downgrades_rustdoc_item_anchors() {
3709        // rustdoc cross-links items with type-prefixed fragment anchors
3710        // (`#method.X`, `#associatedtype.X`, `#impl-...`). These ids do not
3711        // exist in the rendered markdown, so the links are dead and must be
3712        // downgraded to their label. Genuine section anchors must be kept.
3713        let md = concat!(
3714            "fn [parse](#method.parse)() -> Box and ",
3715            "[`Error`](#associatedtype.Error) plus ",
3716            "[here](#impl-Clone-for-Foo). See [Quick start](#quick-start)."
3717        );
3718        let out = clean_markdown(md);
3719        assert!(
3720            !out.contains("#method.parse"),
3721            "method anchor survived: {out}"
3722        );
3723        assert!(
3724            !out.contains("#associatedtype.Error"),
3725            "assoc-type anchor survived: {out}"
3726        );
3727        assert!(!out.contains("#impl-"), "impl anchor survived: {out}");
3728        // Labels are kept as text.
3729        assert!(out.contains("fn parse()"), "method label dropped: {out}");
3730        assert!(out.contains("`Error`"), "assoc-type label dropped: {out}");
3731        assert!(out.contains("here"), "impl label dropped: {out}");
3732        // Genuine section anchors are preserved.
3733        assert!(
3734            out.contains("[Quick start](#quick-start)"),
3735            "section anchor wrongly downgraded: {out}"
3736        );
3737    }
3738
3739    #[test]
3740    fn test_clean_markdown_removes_stray_middot_line() {
3741        // rustdoc out-of-band row leaves a lone middot after the source link
3742        // and collapse toggle are stripped.
3743        let md = "Crate serde\n==========\n\n\u{00b7}\n\nSerde is a framework.";
3744        let out = clean_markdown(md);
3745        assert!(
3746            !out.contains("\n\u{00b7}\n"),
3747            "stray middot line leaked: {out:?}"
3748        );
3749        assert!(out.contains("Crate serde"), "heading dropped: {out}");
3750        assert!(out.contains("Serde is a framework."), "body dropped: {out}");
3751        // Inline middots in prose are preserved.
3752        let inline = clean_markdown("a \u{00b7} b");
3753        assert!(
3754            inline.contains("\u{00b7}"),
3755            "inline middot wrongly dropped: {inline}"
3756        );
3757    }
3758
3759    #[test]
3760    fn test_clean_markdown_strips_trailing_middot_and_nbsp() {
3761        // The stability/out-of-band line keeps a dangling middot once the
3762        // trailing source link is stripped (e.g. "1.0.0 \u{00b7}"); and rustdoc
3763        // headings often end with a non-breaking space.
3764        let md = "Struct HashMap\u{00a0} \n==========\n\n1.0.0 \u{00b7}\n\nBody.";
3765        let out = clean_markdown(md);
3766        assert!(
3767            out.contains("Struct HashMap\n"),
3768            "trailing nbsp not trimmed from heading: {out:?}"
3769        );
3770        assert!(
3771            out.contains("1.0.0\n") || out.ends_with("1.0.0\n\nBody."),
3772            "trailing middot not stripped: {out:?}"
3773        );
3774        assert!(
3775            !out.contains("1.0.0 \u{00b7}"),
3776            "orphan middot survived: {out:?}"
3777        );
3778        // Inline middots between words on the same line are preserved.
3779        assert!(
3780            clean_markdown("a \u{00b7} b").contains('\u{00b7}'),
3781            "inline middot wrongly dropped"
3782        );
3783    }
3784
3785    #[test]
3786    fn test_clean_markdown_removes_breadcrumb_colon_lines() {
3787        let md = "## Documentation: spawn
3788
3789::
3790
3791Function spawn
3792
3793let x = S::Ok;";
3794        let out = clean_markdown(md);
3795        // The orphan breadcrumb separator line is gone.
3796        assert!(!out.contains("\n::\n"), "stray colon line leaked: {out}");
3797        // Inline `::` inside content is preserved.
3798        assert!(
3799            out.contains("S::Ok"),
3800            "inline path separator dropped: {out}"
3801        );
3802        assert!(out.contains("Function spawn"));
3803    }
3804
3805    #[test]
3806    fn test_clean_markdown_preserves_content() {
3807        // Test that clean_markdown doesn't remove too much content
3808        let markdown = r"# Dioxus
3809
3810## At a glance
3811
3812Dioxus is a framework for building cross-platform apps.
3813
3814## Quick start
3815
3816To get started with Dioxus:
3817
3818```
3819cargo install dioxus-cli
3820```
3821
3822[External Link](https://dioxuslabs.com)
3823
3824[Anchor](#quick-start)
3825";
3826        let cleaned = clean_markdown(markdown);
3827
3828        // Should preserve main content
3829        assert!(cleaned.contains("Dioxus is a framework"));
3830        assert!(cleaned.contains("At a glance"));
3831        assert!(cleaned.contains("Quick start"));
3832        assert!(cleaned.contains("cargo install"));
3833
3834        // Should preserve external links and anchor links
3835        assert!(
3836            cleaned.contains("[External Link](https://dioxuslabs.com)"),
3837            "Should preserve external links"
3838        );
3839        assert!(
3840            cleaned.contains("[Anchor](#quick-start)"),
3841            "Should preserve anchor links"
3842        );
3843    }
3844
3845    // ============================================================================
3846    // Performance optimization tests
3847    // ============================================================================
3848
3849    /// Test that `extract_documentation` handles complex HTML with main content
3850    /// This test verifies the single-pass optimization doesn't break extraction
3851    #[test]
3852    fn test_extract_documentation_single_pass_optimization() {
3853        let html = r#"
3854<!DOCTYPE html>
3855<html>
3856<head><title>Test Crate</title></head>
3857<body>
3858    <nav>Navigation content</nav>
3859    <section id="main-content">
3860        <h1>Test Crate</h1>
3861        <p>This is the main documentation.</p>
3862        <script>console.log('test');</script>
3863        <div class="docblock">
3864            <p>Docblock content here.</p>
3865        </div>
3866    </section>
3867    <footer>Footer content</footer>
3868</body>
3869</html>
3870"#;
3871        let docs = extract_documentation(html);
3872
3873        // Should extract main content
3874        assert!(docs.contains("Test Crate"), "Should contain title");
3875        assert!(
3876            docs.contains("main documentation"),
3877            "Should contain main content"
3878        );
3879        assert!(
3880            docs.contains("Docblock content"),
3881            "Should preserve docblock"
3882        );
3883
3884        // Should remove unwanted elements
3885        assert!(!docs.contains("Navigation content"), "Should remove nav");
3886        assert!(!docs.contains("Footer content"), "Should remove footer");
3887        assert!(!docs.contains("console.log"), "Should remove script");
3888    }
3889
3890    /// Test that `extract_search_results` handles complex HTML correctly
3891    /// This verifies the single-pass optimization for search results
3892    #[test]
3893    fn test_extract_search_results_single_pass_optimization() {
3894        let html = r#"
3895<!DOCTYPE html>
3896<html>
3897<body>
3898    <section id="main-content">
3899        <h1>serde::Serialize</h1>
3900        <pre><code>pub trait Serialize { }</code></pre>
3901        <p>Serialize trait documentation.</p>
3902    </section>
3903    <nav>Sidebar</nav>
3904</body>
3905</html>
3906"#;
3907        let result = extract_search_results(html, "serde::Serialize");
3908
3909        // Should extract search results correctly
3910        assert!(result.contains("Documentation"));
3911        assert!(result.contains("serde::Serialize"));
3912        assert!(result.contains("Serialize trait"));
3913
3914        // Should remove navigation
3915        assert!(!result.contains("Sidebar"));
3916    }
3917
3918    /// Test that multiple skip tags are handled efficiently
3919    #[test]
3920    fn test_clean_html_multiple_skip_tags() {
3921        let html = r"
3922<html>
3923<head>
3924    <style>.test { color: red; }</style>
3925    <script>var x = 1;</script>
3926</head>
3927<body>
3928    <nav>Navigation</nav>
3929    <article>
3930        <h1>Title</h1>
3931        <p>Content with <script>inline script</script> removed.</p>
3932        <footer>Article footer</footer>
3933    </article>
3934    <footer>Page footer</footer>
3935</body>
3936</html>
3937";
3938        let cleaned = clean_html(html);
3939
3940        // Should preserve content
3941        assert!(cleaned.contains("Title"));
3942        assert!(cleaned.contains("Content"));
3943
3944        // Should remove all unwanted elements
3945        assert!(!cleaned.contains("style"), "Should remove style tags");
3946        assert!(!cleaned.contains("script"), "Should remove script tags");
3947        assert!(!cleaned.contains("Navigation"), "Should remove nav");
3948        assert!(!cleaned.contains("footer"), "Should remove footer");
3949        assert!(!cleaned.contains(".test"), "Should remove CSS content");
3950        assert!(!cleaned.contains("var x"), "Should remove JS content");
3951    }
3952
3953    /// Test that cached selectors work correctly for all tag types
3954    #[test]
3955    fn test_cached_selectors_all_tag_types() {
3956        // Test each tag type defined in constants
3957        let test_cases = [
3958            (
3959                "<script>alert('test')</script><p>Content</p>",
3960                "script",
3961                "Content",
3962            ),
3963            ("<style>.x{}</style><p>Content</p>", "style", "Content"),
3964            (
3965                "<noscript>Enable JS</noscript><p>Content</p>",
3966                "noscript",
3967                "Content",
3968            ),
3969            (
3970                "<iframe src=\"x\"></iframe><p>Content</p>",
3971                "iframe",
3972                "Content",
3973            ),
3974            ("<nav><a>Link</a></nav><p>Content</p>", "nav", "Content"),
3975            ("<header>Head</header><p>Content</p>", "header", "Content"),
3976            ("<footer>Foot</footer><p>Content</p>", "footer", "Content"),
3977            ("<aside>Sidebar</aside><p>Content</p>", "aside", "Content"),
3978            ("<button>Click</button><p>Content</p>", "button", "Content"),
3979        ];
3980
3981        for (html, tag_to_remove, expected_content) in test_cases {
3982            let cleaned = clean_html(html);
3983            assert!(
3984                !cleaned.contains(tag_to_remove),
3985                "Should remove {tag_to_remove} tag"
3986            );
3987            assert!(
3988                cleaned.contains(expected_content),
3989                "Should preserve {expected_content}"
3990            );
3991        }
3992    }
3993}