crates_docs/tools/docs/html.rs
1//! HTML processing utilities
2//!
3//! Provides HTML cleaning and conversion functions for documentation extraction.
4//! Uses the `scraper` crate for robust HTML5 parsing.
5
6use regex::Regex;
7use scraper::{Html, Selector};
8use std::borrow::Cow;
9use std::sync::LazyLock;
10
/// Elements that are dropped wholesale (tag *and* contents) when HTML is
/// cleaned: executable, styling and embedded-frame markup.
const SKIP_TAGS: &[&str] = &[
    "script",
    "style",
    "noscript",
    "iframe",
];
13
/// Elements treated as block-level during plain-text extraction.
///
/// A [`BLOCK_SEP`] marker is emitted around each of these so that neighbouring
/// blocks (consecutive `<li>`/`<dt>` item-index entries, table rows,
/// paragraphs, ...) never fuse into one token such as `Dl_infoElf32_Chdr`, and
/// so that every block can be placed on its own line. Table *cells*
/// (`<td>`/`<th>`) are deliberately absent: they are separated with
/// [`CELL_SEP`] instead.
///
/// Inline elements are left out on purpose: text that is merely split across
/// inline markup (`ser`+`<wbr>`+`ializing`, `RandomState</a>,`) must not pick
/// up spurious spaces.
const BLOCK_TAGS: &[&str] = &[
    // Sectioning / grouping content.
    "address", "article", "aside", "blockquote",
    // Line break and definition-list parts.
    "br", "dd", "div", "dl", "dt",
    // Forms and figures.
    "fieldset", "figcaption", "figure", "footer", "form",
    // Headings.
    "h1", "h2", "h3", "h4", "h5", "h6",
    // Page furniture, rules and list items.
    "header", "hr", "li", "main", "nav", "ol",
    // Paragraphs, verbatim blocks and sections.
    "p", "pre", "section",
    // Table structure (rows and row groups, not cells).
    "table", "tbody", "tfoot", "thead", "tr",
    // Unordered lists.
    "ul",
];
58
/// Marker (a single NUL) placed around block-level elements while extracting
/// plain text; see [`BLOCK_TAGS`].
///
/// It is intentionally *not* whitespace: real block boundaries can then be
/// converted to newlines without being mistaken for the incidental whitespace
/// found inside text nodes (source-indentation newlines included), which is
/// collapsed to single spaces. NUL is a safe choice because it never shows up
/// in rendered documentation text: the HTML parser replaces any literal NUL in
/// the input with U+FFFD.
const BLOCK_SEP: &str = "\0";
67
/// Marker (U+0001) placed around table cells (`<td>`/`<th>`) while extracting
/// plain text.
///
/// Where [`BLOCK_SEP`] turns into a newline, `CELL_SEP` keeps all cells of a
/// table row on one line joined by ` | `, so the columns of a row remain
/// associated (e.g. `%C | 20 | The proleptic Gregorian year ...`). U+0001
/// never appears in rendered documentation text, which makes it a safe
/// sentinel (cf. [`BLOCK_SEP`]).
const CELL_SEP: &str = "\x01";
75
// Stand-ins that carry the verbatim whitespace of `<pre>` code blocks through
// the whitespace-collapsing passes. All three are control characters that Rust
// does not classify as whitespace, so they survive both
// `str::split_whitespace` and `str::lines`.

/// Stand-in for a space inside a `<pre>` block; [`decode_pre`] restores the
/// original character once all collapsing is complete.
const PRE_SPACE: char = '\x02';
/// Stand-in for a newline inside a `<pre>` block; [`decode_pre`] restores the
/// original character once all collapsing is complete.
const PRE_NEWLINE: char = '\x03';
/// Stand-in for a tab inside a `<pre>` block; [`decode_pre`] restores the
/// original character once all collapsing is complete.
const PRE_TAB: char = '\x04';
84
85/// Regex to remove anchor links like [§](#xxx)
86static ANCHOR_LINK_REGEX: LazyLock<Regex> =
87 LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").expect("hardcoded valid regex pattern"));
88
89/// Regex to remove relative source links like [Source](../src/...)
90static SOURCE_LINK_REGEX: LazyLock<Regex> =
91 LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").expect("hardcoded valid regex pattern"));
92
93/// Regex to remove rustdoc `[src]`/`[[src]]` source links (older rustdoc).
94static SRC_LINK_REGEX: LazyLock<Regex> =
95 LazyLock::new(|| Regex::new(r"\[\[?src\]?\]\([^)]*\)").expect("hardcoded valid regex pattern"));
96
97/// Regex to remove rustdoc collapse-toggle links of the form
98/// `[ [-] ](javascript:void(0))` (the marker may be `-`, `+` or U+2212).
99///
100/// The toggle text contains a nested `[...]`, so this is matched explicitly to
101/// avoid greedily spanning adjacent links.
102static JS_TOGGLE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
103 Regex::new(r"\[\s*\[[-+\x{2212}]\]\s*\]\(javascript:[^\n)]*\)\)?")
104 .expect("hardcoded valid regex pattern")
105});
106
107/// Regex to remove plain `[text](javascript:...)` links emitted by older
108/// rustdoc. Link text must not contain `]` so it cannot span adjacent links.
109static JS_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
110 Regex::new(r"\[[^\]\n]*\]\(javascript:[^\n)]*\)\)?").expect("hardcoded valid regex pattern")
111});
112
113/// Regex to convert empty-target links `[text]()` to plain `text`.
114static EMPTY_LINK_REGEX: LazyLock<Regex> =
115 LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\(\)").expect("hardcoded valid regex pattern"));
116
117/// Regex to match no-op fragment-only links like `[serde](#)` or `[ⓘ](#)`
118/// (a bare `#` target navigates nowhere). The captured label is inspected by
119/// the caller: meaningful labels (containing an alphanumeric, e.g. a crate name
120/// in a versioned-page heading where rustdoc renders `<a href="#">serde</a>`)
121/// are downgraded to plain text, while symbol-only toggle markers (ⓘ, −, +)
122/// are dropped. Real in-page anchors such as `[Quick start](#quick-start)`
123/// keep a fragment id and never match.
124static FRAGMENT_TOGGLE_REGEX: LazyLock<Regex> =
125 LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\(#\)").expect("hardcoded valid regex pattern"));
126
127/// Regex to downgrade rustdoc *item-anchor* links to their plain-text label.
128///
129/// rustdoc cross-references items with fragment-only links whose id carries a
130/// type-specific prefix (`#method.foo`, `#tymethod.foo`, `#variant.Foo`,
131/// `#structfield.foo`, `#associatedtype.Error`, `#associatedconstant.MAX`,
132/// `#reexport.foo`) or the impl-block form (`#impl-Trait-for-Type`). These
133/// anchors only exist inside the rustdoc page; the rendered markdown has no
134/// matching heading id, so the links are dead. Group 1 captures the label
135/// (the item name) so it can be kept as text. Genuine in-page section anchors
136/// (e.g. `[Quick start](#quick-start)`) lack these prefixes and are untouched.
137static RUSTDOC_ITEM_ANCHOR_REGEX: LazyLock<Regex> = LazyLock::new(|| {
138 Regex::new(
139 r"\[([^\]]*)\]\(#(?:(?:method|tymethod|variant|structfield|associatedtype|associatedconstant|reexport)\.|impl-)[^)]*\)",
140 )
141 .expect("hardcoded valid regex pattern")
142});
143
144/// Regex to drop breadcrumb-residue lines that contain only `::` separators.
145///
146/// rustdoc item headers render a navigation breadcrumb such as
147/// `[tokio](../index.html)::[task](../index.html)::spawn`. Once the relative
148/// links are stripped, an orphan line of bare `::` separators can remain; it
149/// carries no information and is removed. Inline `::` inside code or text is
150/// unaffected because those lines contain other characters.
151static STRAY_COLON_LINE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
152 Regex::new(r"(?m)^[ \t]*:{2,}[ \t]*$").expect("hardcoded valid regex pattern")
153});
154
155/// Regex to drop orphan separator lines that contain only a middot (`·`).
156///
157/// rustdoc's `out-of-band` heading row renders `<source> · [-]` (a source link,
158/// a middot separator, and a collapse toggle). Once the source link and toggle
159/// are stripped, a lone `·` remains on its own line; it carries no information.
160/// Inline middots inside prose are unaffected because those lines have other
161/// characters.
162static STRAY_MIDDOT_LINE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
163 Regex::new(r"(?m)^[ \t]*\u{00b7}[ \t]*$").expect("hardcoded valid regex pattern")
164});
165
166/// Regex to strip an orphaned trailing middot separator from a line.
167///
168/// rustdoc joins out-of-band metadata with ` \u{00b7} ` separators, e.g.
169/// `1.0.0 \u{00b7} <source link>`. Once the trailing source/toggle link is
170/// removed, the line keeps a dangling ` \u{00b7}` that carries no meaning
171/// (e.g. the stability line becomes `1.0.0 \u{00b7}`). Drop the trailing
172/// middot together with the whitespace (including non-breaking spaces) that
173/// precedes it. Middots embedded in prose are unaffected because they are
174/// followed by more text.
175static TRAILING_MIDDOT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
176 Regex::new(r"(?m)[ \t\u{00a0}]*\u{00b7}[ \t\u{00a0}]*$").expect("hardcoded valid regex pattern")
177});
178
179/// Regex to trim trailing horizontal whitespace, including non-breaking spaces.
180///
181/// rustdoc headings and metadata rows frequently end with a stray space or
182/// non-breaking space (`\u{00a0}`) that html2md preserves, leaving artifacts
183/// like `Struct HashMap\u{00a0}` above a setext underline. Stripping trailing
184/// whitespace per line removes the noise without affecting content.
185static TRAILING_WS_REGEX: LazyLock<Regex> =
186 LazyLock::new(|| Regex::new(r"(?m)[ \t\u{00a0}]+$").expect("hardcoded valid regex pattern"));
187
188/// Regex to strip the redundant closing hashes html2md appends to ATX
189/// headings.
190///
191/// html2md 0.2.15 renders `<h3>`-`<h6>` as ATX headings with a trailing run of
192/// closing hashes (e.g. `### Examples ###`, `#### pub fn get() ####`). Those
193/// closing hashes are optional in `CommonMark` and read as noise, so we drop the
194/// trailing ` #+` while keeping the leading marker. Group 1 captures the
195/// heading text.
196static HEADING_TRAILING_HASH_REGEX: LazyLock<Regex> = LazyLock::new(|| {
197 Regex::new(r"(?m)^(#{1,6}[ \t].*?)[ \t]+#+[ \t]*$").expect("hardcoded valid regex pattern")
198});
199
200/// Matches an HTML superscript element (`<sup>...</sup>`) left verbatim in the
201/// markdown output.
202///
203/// `html2md` 0.2.15 has no handler for `<sup>`/`<sub>`, so rustdoc footnote
204/// references and exponents (e.g. `<sup id="fnref1"><a href="#fn1">1</a></sup>`)
205/// survive as literal HTML in the markdown. Group 1 captures the inner markup;
206/// [`clean_markdown`] strips any nested tags and re-emits it as `^(...)`.
207static SUPERSCRIPT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
208 Regex::new(r"(?is)<sup\b[^>]*>(.*?)</sup\s*>").expect("hardcoded valid regex pattern")
209});
210
211/// Matches an HTML subscript element (`<sub>...</sub>`) left verbatim in the
212/// markdown output. Counterpart to [`SUPERSCRIPT_REGEX`]; re-emitted as `_(...)`.
213static SUBSCRIPT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
214 Regex::new(r"(?is)<sub\b[^>]*>(.*?)</sub\s*>").expect("hardcoded valid regex pattern")
215});
216
217/// Matches a single HTML tag, used to strip residual inline markup (e.g. a
218/// nested `<a>`) from the inner content of a super/subscript before re-emitting
219/// it as plain text. See [`clean_markdown`].
220static INLINE_TAG_STRIP_REGEX: LazyLock<Regex> =
221 LazyLock::new(|| Regex::new(r"(?is)<[^>]+>").expect("hardcoded valid regex pattern"));
222
223/// Matches a negative auto-trait impl heading whose linkified trait name is
224/// glued to the leading `!`, e.g. `### impl<T> !Freeze for Mutex<T>`.
225///
226/// rustdoc emits the negative-impl marker as a text `!` immediately before the
227/// trait link (`!<a class="trait" ...>Freeze</a>`). html2md fuses these into
228/// ``, which is markdown image syntax and renders as a broken
229/// embedded image instead of the text `!Freeze`. Group 1 captures the heading
230/// prefix up to (and including) the `!`-glued bracket's `!`; [`clean_markdown`]
231/// re-emits it with the `!` backslash-escaped so it stays literal. Scoped to
232/// `impl` headings so genuine doc-body images are never touched.
233static NEGATIVE_IMPL_TRAIT_IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
234 Regex::new(r"(?m)^(#{1,6} +impl\b[^\n]*?)!\[").expect("hardcoded valid regex pattern")
235});
236
237/// Regex to rewrite relative documentation links to their link text.
238///
239/// Matches `[text](path.html)` where `path` begins with a letter, digit, `_`,
240/// `.` or `/` (covering module paths such as `_derive/index.html`,
241/// `../index.html`, `struct.Foo.html`) and ends with `.html` (optionally
242/// followed by a `#fragment`). Group 1 captures the link text and group 2 the
243/// URL. The link text may contain one level of nested brackets (e.g. an
244/// attribute label `#[tokio::main]` or a slice type `[u8]`).
245/// Docs.rs-relative targets are useless to an MCP client, so they are
246/// downgraded to their (meaningful) label; absolute external URLs containing a
247/// scheme (`://`) are kept intact since they are still reachable.
248static RELATIVE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
249 Regex::new(r"\[((?:[^\[\]]|\[[^\]]*\])*)\]\(([a-zA-Z0-9._/][^)]*\.html(?:#[^)]*)?)\)")
250 .expect("hardcoded valid regex pattern")
251});
252
253/// Matches a rustdoc "Read more" see-also affordance link (`[Read more](url)`).
254///
255/// rustdoc appends a `<a href="...">Read more</a>` link to the one-line summary
256/// of every inherited/trait method (e.g. derived `Clone`/`Debug`/`Hash`). When
257/// the target is a docs.rs-relative `.html` path it cannot be resolved by an
258/// MCP client, and downgrading it to its label leaves a meaningless dangling
259/// "Read more" at the end of the sentence. Group 1 captures any leading inline
260/// whitespace and group 2 the URL, so a relative affordance can be dropped
261/// entirely while an absolute (`scheme://`) one is preserved. See
262/// [`clean_markdown`].
263static READ_MORE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
264 Regex::new(r"([ \t]*)\[Read more\]\(([^)]*)\)").expect("hardcoded valid regex pattern")
265});
266
267/// Matches a rustdoc item-index table (`<dl class="item-table">...</dl>`).
268///
269/// docs.rs/rustdoc renders crate- and module-overview item indexes as a
270/// definition list of `<dt>` (item name + link) / optional `<dd>` (summary)
271/// pairs. `html2md` does not treat `<dt>` as block-level, so every entry
272/// collapses onto a single line (e.g. `Dl_infoElf32_ChdrElf32_Ehdr...`). We
273/// rewrite these tables into `<ul><li>` lists before markdown/text conversion
274/// so each item renders on its own line. The class only appears on overview
275/// pages, never on individual item pages.
276static ITEM_TABLE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
277 Regex::new(r"(?is)<dl[^>]*\bitem-table\b[^>]*>(.*?)</dl\s*>")
278 .expect("hardcoded valid regex pattern")
279});
280
281/// Matches a single `<dt>name</dt>` row with an optional following
282/// `<dd>summary</dd>` inside an item-table (see `ITEM_TABLE_REGEX`).
283static ITEM_TABLE_ROW_REGEX: LazyLock<Regex> = LazyLock::new(|| {
284 Regex::new(r"(?is)<dt\b[^>]*>(.*?)</dt\s*>\s*(?:<dd\b[^>]*>(.*?)</dd\s*>)?")
285 .expect("hardcoded valid regex pattern")
286});
287
288/// Regex to collapse three or more newlines to two newlines
289static MULTIPLE_NEWLINES_REGEX: LazyLock<Regex> =
290 LazyLock::new(|| Regex::new(r"\n\n\n+").expect("hardcoded valid regex pattern"));
291
292/// Matches a `<pre>...</pre>` block (verbatim code) so callers can leave its
293/// significant whitespace untouched while transforming the surrounding markup.
294static PRE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
295 Regex::new(r"(?is)<pre\b.*?</pre\s*>").expect("hardcoded valid regex pattern")
296});
297
298/// Matches a whitespace run that contains a newline/tab/CR immediately before
299/// an inline element's opening tag.
300///
301/// `html2md` 0.2.15 drops such leading whitespace before inline elements like
302/// `<a>`, `<em>` and `<strong>`, gluing the element onto the preceding word
303/// (e.g. a word, a newline, then an `<a>` link wraps an inline-code span and
304/// renders glued to the word after relative-link downgrading). A *single*
305/// literal space is preserved correctly by `html2md`, so these runs are
306/// collapsed to one space. The pattern only matches runs containing a
307/// newline/tab/CR, so genuine single spaces and deliberately glued cases such
308/// as a hyphen directly followed by `<code>` (no whitespace at all) are left
309/// untouched.
310static INLINE_LEADING_WS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
311 Regex::new(
312 r"(?i)[ \t\r\n]*[\r\n\t][ \t\r\n]*(<(?:a|code|em|strong|b|i|span|sup|sub|abbr|kbd|var|cite|q|mark|small|u)\b)",
313 )
314 .expect("hardcoded valid regex pattern")
315});
316
317/// Matches a whitespace run containing a newline/tab/CR immediately *after* an
318/// inline element's closing tag, when followed by word-like content.
319///
320/// Symmetric to [`INLINE_LEADING_WS_REGEX`]: `html2md` 0.2.15 also drops such
321/// trailing whitespace, gluing the next word onto the element (e.g.
322/// `</a>` followed by a newline and `crate` renders as `[..](..)crate`, which
323/// becomes `..crate` after relative-link downgrading). The trailing lookahead
324/// restricts the fix to alphanumeric/backtick/bracket/open-paren starts so a line wrapped
325/// before trailing punctuation (`</a>\n.`) is left untouched. A single literal
326/// space is already preserved by `html2md`, so only newline-bearing runs match.
327static INLINE_TRAILING_WS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
328 Regex::new(
329 r"(?i)(</(?:a|code|em|strong|b|i|span|sup|sub|abbr|kbd|var|cite|q|mark|small|u)>)[ \t\r\n]*[\r\n\t][ \t\r\n]*(?P<n>[A-Za-z0-9`\[(])",
330 )
331 .expect("hardcoded valid regex pattern")
332});
333
334/// Cached CSS selector for body element
335static BODY_SELECTOR: LazyLock<Selector> =
336 LazyLock::new(|| Selector::parse("body").expect("hardcoded valid selector"));
337
338/// Cached CSS selector for all elements
339static ALL_SELECTOR: LazyLock<Selector> =
340 LazyLock::new(|| Selector::parse("*").expect("hardcoded valid selector"));
341
342/// Cached selectors for skip tags (script, style, noscript, iframe)
343static SCRIPT_SELECTOR: LazyLock<Selector> =
344 LazyLock::new(|| Selector::parse("script").expect("hardcoded valid selector"));
345static STYLE_SELECTOR: LazyLock<Selector> =
346 LazyLock::new(|| Selector::parse("style").expect("hardcoded valid selector"));
347static NOSCRIPT_SELECTOR: LazyLock<Selector> =
348 LazyLock::new(|| Selector::parse("noscript").expect("hardcoded valid selector"));
349static IFRAME_SELECTOR: LazyLock<Selector> =
350 LazyLock::new(|| Selector::parse("iframe").expect("hardcoded valid selector"));
351
352/// Cached selectors for nav tags (nav, header, footer, aside)
353static NAV_SELECTOR: LazyLock<Selector> =
354 LazyLock::new(|| Selector::parse("nav").expect("hardcoded valid selector"));
355static HEADER_SELECTOR: LazyLock<Selector> =
356 LazyLock::new(|| Selector::parse("header").expect("hardcoded valid selector"));
357static FOOTER_SELECTOR: LazyLock<Selector> =
358 LazyLock::new(|| Selector::parse("footer").expect("hardcoded valid selector"));
359static ASIDE_SELECTOR: LazyLock<Selector> =
360 LazyLock::new(|| Selector::parse("aside").expect("hardcoded valid selector"));
361
362/// Cached selectors for UI tags (button, summary)
363static BUTTON_SELECTOR: LazyLock<Selector> =
364 LazyLock::new(|| Selector::parse("button").expect("hardcoded valid selector"));
365static SUMMARY_SELECTOR: LazyLock<Selector> =
366 LazyLock::new(|| Selector::parse("summary").expect("hardcoded valid selector"));
367
368/// Regex to strip rustdoc source-code links (`<a class="src ...">Source</a>`)
369/// from raw HTML *before* parsing.
370///
371/// These anchors point at the crate's `src/...rs.html` listings and add no
372/// value to extracted documentation. They are commonly nested inside
373/// `<summary>` elements whose text content is otherwise preserved, so removing
374/// them at the DOM level would be too late (the "Source" label would survive as
375/// plain text). Stripping them from the raw HTML first guarantees they leak
376/// into neither plain-text nor markdown output.
377static SRC_ANCHOR_HTML_REGEX: LazyLock<Regex> = LazyLock::new(|| {
378 // Match both modern (`class="src"`, double-quoted) and older rustdoc
379 // (`class='srclink'`, single-quoted) source-code anchors so their `[src]`
380 // label never leaks into the plain-text output (which, unlike the markdown
381 // path, has no later link-stripping pass).
382 Regex::new(r#"(?s)<a\b[^>]*\bclass\s*=\s*['"][^'"]*\bsrc(?:link)?\b[^'"]*['"][^>]*>.*?</a>"#)
383 .expect("hardcoded valid regex pattern")
384});
385
386/// Regex to fix the orphan `\u{00b7}` separator left between a stability
387/// "since" badge and its now-removed source link.
388///
389/// rustdoc emits `<span class="since">1.0.0</span> \u{00b7} <a class="src">Source</a>`
390/// inside an item's right-side metadata. [`SRC_ANCHOR_HTML_REGEX`] deletes the
391/// source anchor, leaving ` \u{00b7} </span>`. When the enclosing `<summary>` is
392/// later flattened to text the dangling middot glues onto the following
393/// signature (`1.0.0 \u{00b7} fn next(...)`). Collapse the separator (and its
394/// surrounding whitespace) to a single space while preserving the closing tag,
395/// so the version stays cleanly separated from the signature.
396static ORPHAN_SINCE_MIDDOT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
397 Regex::new(r"(?s)[ \t\u{00a0}]*\u{00b7}[ \t\u{00a0}]*(</span\s*>)")
398 .expect("hardcoded valid regex pattern")
399});
400
401/// Matches a rustdoc stability "since" version badge
402/// (`<span class="since ...">1.0.0</span>`) that is immediately followed by
403/// another tag with no separating whitespace.
404///
405/// In a flattened `<summary>` (provided trait methods on FFI structs, e.g.
406/// libc) the badge abuts the method code-header, so plain-text extraction
407/// fuses them (`1.0.0fn clone_from`). Group 1 captures the whole badge; the
408/// trailing `<` (re-emitted by the replacement) ensures a space is inserted
409/// only when the badge is glued, never doubling an existing space. The version
410/// text holds no nested tags, so `[^<]*` captures it safely. See [`clean_html`].
411static SINCE_BADGE_GLUED_REGEX: LazyLock<Regex> = LazyLock::new(|| {
412 Regex::new(
413 r#"(?is)(<span\b[^>]*\bclass\s*=\s*["'][^"']*\bsince\b[^"']*["'][^>]*>[^<]*</span\s*>)<"#,
414 )
415 .expect("hardcoded valid regex pattern")
416});
417
418/// Regex to remove rustdoc UI anchor links that carry no documentation value.
419///
420/// rustdoc decorates headings, item declarations and code examples with
421/// navigation affordances rendered as `<a>` elements:
422/// - section/anchor links `<a class="anchor">\u{00a7}</a>` (a section-sign that
423/// jumps to the heading),
424/// - notable-trait markers `<a class="tooltip" data-notable-ty="...">\u{24d8}</a>`
425/// (a circled-i tooltip toggle), and
426/// - "Run code" buttons `<a class="test-arrow" href="https://play.rust-lang.org/...">`
427/// with empty link text (the playground launcher for a doc example), and
428/// - scraped-example help links `<a class="scrape-help" href="...">?</a>` (the
429/// `?` affordance beside an "Examples found in repository" heading).
430///
431/// The glyph anchors commonly sit inside a `<summary>` whose text is otherwise
432/// preserved, so removing them at the DOM level is too late (the glyph would
433/// survive as plain text and glue onto the following declaration, e.g.
434/// `\u{00a7}impl<...>` or `Keys<'_, K, V> \u{24d8}`). The run buttons otherwise
435/// render as an empty-text markdown link wrapping a very long playground URL
436/// (`[](https://play.rust-lang.org/?code=...)`). Stripping all three from the
437/// raw HTML keeps them out of both the markdown and plain-text output.
438static UI_ANCHOR_HTML_REGEX: LazyLock<Regex> = LazyLock::new(|| {
439 Regex::new(
440 r#"(?s)<a\b[^>]*\bclass\s*=\s*['"][^'"]*\b(?:anchor|tooltip|test-arrow|scrape-help)\b[^'"]*['"][^>]*>.*?</a>"#,
441 )
442 .expect("hardcoded valid regex pattern")
443});
444
445/// Regex to remove rustdoc UI anchors whose target is a `javascript:` URL
446/// (collapse/expand toggles such as `#toggle-all-docs`, which render as a
447/// bracketed minus/plus marker).
448///
449/// These are pure UI affordances; documentation never legitimately links to a
450/// `javascript:` URL. Their visible marker text would otherwise leak into the
451/// plain-text output, since the `javascript:`-link cleanup only runs on the
452/// markdown path.
453static JS_ANCHOR_HTML_REGEX: LazyLock<Regex> = LazyLock::new(|| {
454 Regex::new(r#"(?is)<a\b[^>]*\bhref\s*=\s*['"]\s*javascript:[^>]*>.*?</a>"#)
455 .expect("hardcoded valid regex pattern")
456});
457
458/// Regex to remove `<script>`, `<style>`, `<noscript>` and `<iframe>` elements
459/// (including their contents) from raw HTML *before* parsing.
460///
461/// The DOM-based pass in [`remove_unwanted_elements`] re-serializes each node
462/// via `ElementRef::html()` and string-replaces it in the original markup. That
463/// match is fragile: html5ever normalizes attribute whitespace and quoting, so
464/// markup like `<script defer >` is serialized as `<script defer>` and the
465/// replacement silently misses, leaking executable/style content into the
466/// `html` output format. Stripping these tags with a tolerant regex first
467/// guarantees they are removed regardless of the original formatting. (Back-
468/// references are unsupported by the `regex` crate, so each tag is listed
469/// explicitly rather than captured once.)
470static DANGEROUS_ELEMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
471 Regex::new(
472 r"(?is)<script\b[^>]*>.*?</script\s*>|<style\b[^>]*>.*?</style\s*>|<noscript\b[^>]*>.*?</noscript\s*>|<iframe\b[^>]*>.*?</iframe\s*>|<iframe\b[^>]*/>",
473 )
474 .expect("hardcoded valid regex pattern")
475});
476
477/// Regex to remove rustdoc UI web-components from raw HTML before parsing.
478///
479/// Modern rustdoc emits custom elements for its chrome: `<rustdoc-toolbar>`
480/// (the settings/options toolbar, rendered empty in static HTML) and
481/// `<rustdoc-topbar>` (a duplicate breadcrumb such as
482/// `<h2><a href="#">Iterator</a></h2>`). The toolbar sits inside
483/// `#main-content`, so it leaks into the `html` output as a stray empty tag;
484/// the topbar can leak a redundant heading. Neither carries documentation
485/// value, so both are stripped (paired and self-closing forms).
486static RUSTDOC_UI_ELEMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
487 Regex::new(
488 r"(?is)<rustdoc-(?:toolbar|topbar)\b[^>]*>.*?</rustdoc-(?:toolbar|topbar)\s*>|<rustdoc-(?:toolbar|topbar)\b[^>]*/>",
489 )
490 .expect("hardcoded valid regex pattern")
491});
492
493/// Regex to remove the rustdoc navigation breadcrumb element.
494///
495/// rustdoc renders a breadcrumb above each item title, e.g.
496/// `<div class="rustdoc-breadcrumbs"><a href="../index.html">std</a>::<wbr>`
497/// `<a href="index.html">vec</a></div>`. Its links are page-relative, so they
498/// are downgraded to bare text and leave a dangling line such as `std::vec`
499/// (or a lone `std` on macro pages) directly under our own
500/// `## Documentation: <path>` title. The breadcrumb is pure navigation chrome
501/// that duplicates the title, so the whole element is removed before parsing.
502/// It contains only anchors and separators (no nested `<div>`), so the
503/// non-greedy match terminates at the first `</div>`.
504static RUSTDOC_BREADCRUMBS_REGEX: LazyLock<Regex> = LazyLock::new(|| {
505 Regex::new(r"(?is)<div\b[^>]*\brustdoc-breadcrumbs\b[^>]*>.*?</div\s*>")
506 .expect("hardcoded valid regex pattern")
507});
508
509/// Regex matching a rustdoc prose admonition rendered as a styled `<pre>`.
510///
511/// rustdoc/mdBook authors create "Warning"/"Note" callout boxes with the idiom
512/// `<pre class="compile_fail" style="white-space:normal;font:inherit;">` (or
513/// with `class="ignore"`) wrapping ordinary prose HTML such as a paragraph with
514/// a bold "Warning" lead-in. The `white-space:normal;font:inherit` style makes
515/// rustdoc
516/// render it as flowing prose rather than monospaced code. Without special
517/// handling our pipeline treats the `<pre>` as a code block and wraps the prose
518/// in a bare fenced code block (mislabeling prose as code and flattening its
519/// inline links and code). Genuine code examples keep the default `white-space: pre`, so
520/// matching on `white-space:normal` reliably selects only these prose boxes.
521/// They are rewritten to a `<blockquote>` so the inner prose renders normally
522/// as a callout in every output format. The box holds no nested `<pre>`, so the
523/// non-greedy body terminates at the first `</pre>`.
524static PROSE_PRE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
525 Regex::new(
526 r#"(?is)<pre\b[^>]*\bstyle\s*=\s*["'][^"']*white-space\s*:\s*normal[^"']*["'][^>]*>(.*?)</pre\s*>"#,
527 )
528 .expect("hardcoded valid regex pattern")
529});
530
531/// Regex matching rustdoc's "unsafe function" marker superscript.
532///
533/// In module item lists rustdoc appends `<sup title="unsafe function">WARN</sup>`
534/// (the `WARN` glyph is a warning emoji) after each unsafe function's name. Our
535/// superscript handling would otherwise turn it into a `^(...)` token glued onto
536/// the name (e.g. `copy^(...)`). The marker conveys a useful fact, so it is
537/// replaced with a readable ` (unsafe)` annotation in every output format before
538/// parsing.
539static UNSAFE_FN_MARKER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
540 Regex::new(r#"(?is)<sup\b[^>]*\btitle\s*=\s*["']unsafe function["'][^>]*>.*?</sup\s*>"#)
541 .expect("hardcoded valid regex pattern")
542});
543
544/// Matches a rustdoc collapse-toggle `<summary class="hideme">` element.
545///
546/// rustdoc places interactive "Show N methods"/"Show N associated items" and
547/// "Expand description" toggles inside `<summary class="hideme">` nodes, and the
548/// "Show N methods" one sits *inside* the item-declaration `<pre>` block. Its
549/// label text therefore leaks into the rendered code (e.g.
550/// `Show 76 methods // Required method` inside a trait signature) in every
551/// output format. The element is pure UI chrome, so it is removed wholesale
552/// (the surrounding `<details>` content is preserved). See [`clean_html`].
553static HIDEME_SUMMARY_REGEX: LazyLock<Regex> = LazyLock::new(|| {
554 Regex::new(
555 r#"(?is)<summary\b[^>]*\bclass\s*=\s*["'][^"']*\bhideme\b[^"']*["'][^>]*>.*?</summary\s*>"#,
556 )
557 .expect("hardcoded valid regex pattern")
558});
559
560/// Matches a rustdoc impl-block documentation `<div class="docblock">` that is
561/// the final child of an impl `<section>` nested inside a `<summary>`.
562///
563/// rustdoc renders an impl block's own documentation (e.g. a "Basic API"
564/// heading) as `<div class="docblock">...</div>` *inside* the `<summary>` that
565/// also holds the `impl ...` declaration. Because [`remove_unwanted_elements`]
566/// flattens `<summary>` nodes to their decoded text, that docblock glues onto
567/// the declaration (e.g. `impl ArgBasic API`). Group 1 captures the docblock
568/// contents so the wrapper can be relocated *after* the `</summary>`, where it
569/// renders as ordinary content. The trailing `</div></section></summary>`
570/// boundary only occurs for impl-block docs (method/field docblocks sit after
571/// their `</summary>`), so this does not disturb other documentation. See
572/// [`clean_html`].
573static IMPL_DOCBLOCK_IN_SUMMARY_REGEX: LazyLock<Regex> = LazyLock::new(|| {
574 Regex::new(r#"(?is)</h3>\s*<div class="docblock">(.*?)</div>\s*</section>\s*</summary>"#)
575 .expect("hardcoded valid regex pattern")
576});
577
/// Matches a rustdoc portability/feature-availability badge that carries a
/// human-readable `title` attribute.
///
/// rustdoc renders availability pills as
/// `<span class="stab portability" title="Available on crate feature `fs` only">`
/// `<code>fs</code></span>` immediately after an item link, with no separating
/// whitespace. Group 1 captures the title text, which is the clearest rendering
/// (it also covers platform/cfg badges such as "Available on `docsrs` and Unix
/// only"). See [`rewrite_portability_badges`].
///
/// Matching notes:
/// - `class` must appear before `title` inside the opening tag, and the title
///   value must be double-quoted (the class value may use either quote style).
/// - `[^"]*` also accepts an empty title, in which case group 1 is empty.
/// - The element body is matched lazily up to the first `</span>`.
static STAB_PORTABILITY_TITLE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bportability\b[^"']*["'][^>]*\btitle\s*=\s*"([^"]*)"[^>]*>.*?</span\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
593
/// Matches a rustdoc portability badge that lacks a usable `title` attribute.
/// Group 1 captures the inner markup (the feature name(s)). Fallback for the
/// title-based [`STAB_PORTABILITY_TITLE_REGEX`].
///
/// The pattern does not look at `title` at all: it matches any `<span>` whose
/// `class` value contains the word `portability`, and captures (lazily, across
/// newlines) everything up to the first `</span>`.
static STAB_PORTABILITY_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bportability\b[^"']*["'][^>]*>(.*?)</span\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
603
/// Matches an inline rustdoc stability badge span (e.g.
/// `<span class="stab unstable" title="">Experimental</span>` or a
/// `<span class="stab deprecated">Deprecated</span>` pill) that rustdoc renders
/// immediately after an item name with no separating whitespace, gluing the
/// badge label onto the name (e.g. `TryReserveErrorKindExperimental`).
///
/// Group 1 captures the inner label. Portability badges (`class="stab
/// portability"`) are handled earlier by [`rewrite_portability_badges`] and so
/// are already consumed before this runs; only the remaining stab pills match.
/// The pattern is span-scoped, so block-level stability banners
/// (`<div class="stab unstable">...</div>`) on item-detail pages are untouched.
/// See [`rewrite_stab_badges`].
///
/// Matching notes:
/// - `\bstab\b` is a word-boundary test, so a class token that merely starts
///   with `stab-` also satisfies it (`-` is not a word character).
/// - Group 1 is lazy and ends at the first `</span>`; a label that itself
///   contains a nested `<span>` is therefore cut at the inner closing tag.
static STAB_BADGE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bstab\b[^"']*["'][^>]*>(.*?)</span\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
622
/// Matches the opening tag of a rustdoc item-info wrapper
/// (`<span class="item-info">`), which holds the stability/deprecation badges
/// that rustdoc renders immediately after an item signature.
///
/// rustdoc emits the wrapper with no separating whitespace after the preceding
/// `</section>` (e.g. `...&str</h4></section><span class="item-info"><div
/// class="stab deprecated"><span class="emoji">\u{1f44e}</span>...`). When the
/// enclosing collapsed `<summary>` is flattened to text, the badge glues onto
/// the signature (`-> &str\u{1f44e} Deprecated since ...`). Group 1 captures the
/// opening tag so [`clean_html`] can re-emit it preceded by a single space.
///
/// Only the opening tag is matched (nothing of the element body or its closing
/// tag), so the wrapper's contents are left exactly as they were.
static ITEM_INFO_OPEN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"(?is)(<span\b[^>]*\bclass\s*=\s*["'][^"']*\bitem-info\b[^"']*["'][^>]*>)"#)
        .expect("hardcoded valid regex pattern")
});
637
/// Matches a rustdoc decorative emoji badge such as the nightly-API flask.
///
/// rustdoc renders unstable/experimental markers as
/// `<span class="emoji">\u{1f52c}</span><span>This is a nightly-only ...</span>`
/// with no separating whitespace, so html2md glues the emoji onto the following
/// text (`\u{1f52c}This is a nightly-only experimental API.`). Group 1 captures
/// the whole badge (opening tag, content and closing tag); [`clean_html`]
/// re-emits it followed by a single space so the emoji reads as a separate
/// visual cue.
///
/// The span body is matched lazily, so the capture ends at the first `</span>`.
static EMOJI_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)(<span\b[^>]*\bclass\s*=\s*["'][^"']*\bemoji\b[^"']*["'][^>]*>.*?</span\s*>)"#,
    )
    .expect("hardcoded valid regex pattern")
});
652
/// Matches a rustdoc struct-field declaration span
/// (`<span class="structfield section-header">field: Type</span>`).
///
/// rustdoc emits one such span per field with no separating whitespace and
/// relies on CSS to render each as its own block. Without intervention the
/// adjacent spans glue together: markdown yields back-to-back inline code
/// spans, and the plain-text path fuses a field type onto the next field
/// name into a corrupt token. The captured inner content is re-wrapped in a
/// `<p>` element by [`clean_html`] so each field renders on its own line.
/// Group 1 is the field declaration.
///
/// Group 1 is lazy and ends at the first `</span>`, so the pattern assumes the
/// field declaration contains no nested `<span>` of its own.
static STRUCTFIELD_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<span\b[^>]*\bclass\s*=\s*["'][^"']*\bstructfield\b[^"']*["'][^>]*>(.*?)</span\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
669
/// Matches a rustdoc `where`-clause block (`<div class="where">where ...</div>`)
/// embedded in item declarations and signatures.
///
/// rustdoc relies on CSS to render this block on its own line(s); the markup
/// itself carries no line break before the block or after it, so both html2md
/// and the plain-text extractor glue it onto the surrounding tokens (e.g.
/// `Vec<T, A = Global>where` and `Allocator,{`). Group `w` captures the inner
/// content. See [`rewrite_where_clauses`].
///
/// Group `w` is lazy and ends at the first `</div>`, so a clause that itself
/// contained a nested `<div>` would be cut at the inner closing tag.
static WHERE_DIV_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<div\b[^>]*\bclass\s*=\s*["'][^"']*\bwhere\b[^"']*["'][^>]*>(?P<w>.*?)</div\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
684
/// Cached selectors for main content extraction.
///
/// Selects the element whose id is `main-content`. Parsed once on first use.
static MAIN_CONTENT_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("#main-content").expect("hardcoded valid selector"));
/// Selects the element whose id is `rustdoc_body_wrapper`. Parsed once on
/// first use.
static RUSTDOC_BODY_WRAPPER_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("#rustdoc_body_wrapper").expect("hardcoded valid selector"));
/// Selects every `<h1>` element. Parsed once on first use.
static H1_SELECTOR: LazyLock<Selector> =
    LazyLock::new(|| Selector::parse("h1").expect("hardcoded valid selector"));
692
693/// Rewrite rustdoc item-index tables into HTML unordered lists.
694///
695/// Converts each `<dl class="item-table">` block into a `<ul>` whose `<li>`
696/// entries each hold one item (name link, optional ` — summary`). This keeps
697/// `html2md` from concatenating every item name onto a single line. See
698/// `ITEM_TABLE_REGEX` for details.
699#[must_use]
700fn rewrite_item_tables(html: &str) -> String {
701 ITEM_TABLE_REGEX
702 .replace_all(html, |caps: ®ex::Captures| {
703 let inner = &caps[1];
704 let mut out = String::from("<ul>");
705 for row in ITEM_TABLE_ROW_REGEX.captures_iter(inner) {
706 let name = row.get(1).map_or("", |m| m.as_str()).trim();
707 if name.is_empty() {
708 continue;
709 }
710 out.push_str("<li>");
711 out.push_str(name);
712 let desc = row.get(2).map_or("", |m| m.as_str()).trim();
713 if !desc.is_empty() {
714 out.push_str(" \u{2014} ");
715 out.push_str(desc);
716 }
717 out.push_str("</li>");
718 }
719 out.push_str("</ul>");
720 out
721 })
722 .into_owned()
723}
724
/// Matches a rustdoc `<div class="code-attribute">` element. rustdoc wraps each
/// attribute (e.g. `#[repr(i8)]`, `#[non_exhaustive]`) shown above an item
/// declaration in this block-level `<div>`, which CSS renders on its own line.
/// Group 1 captures the inner attribute markup. See
/// [`rewrite_code_attributes`].
///
/// Group 1 is lazy (and may span newlines), ending at the first `</div>`.
static CODE_ATTRIBUTE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<div\b[^>]*\bclass\s*=\s*["'][^"']*\bcode-attribute\b[^"']*["'][^>]*>(.*?)</div\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
736
737/// Put each item-declaration attribute on its own line.
738///
739/// rustdoc renders declaration attributes inside `<div class="code-attribute">`
740/// blocks within the `<pre class="item-decl">` signature. Because the `<div>`
741/// only breaks the line via CSS, extracting the `<pre>` text glues the
742/// attribute onto the following declaration (e.g. `#[repr(i8)]pub enum
743/// Ordering`) in every format. Replace each such `<div>` with its inner content
744/// followed by a newline so the attribute keeps its own line; the result
745/// renders identically to rustdoc in all three output formats.
746#[must_use]
747fn rewrite_code_attributes(html: &str) -> String {
748 CODE_ATTRIBUTE_REGEX
749 .replace_all(html, "${1}\n")
750 .into_owned()
751}
752
/// Matches a rustdoc code-header element (`<h3>`/`<h4 class="code-header">`),
/// which holds an item/impl/method signature. Group 1 is the heading level
/// digit of the *opening* tag and group 2 the inner markup. See
/// [`rewrite_code_headers`].
///
/// The closing tag is matched as `</h[34]>`: the `regex` crate has no
/// backreferences, so the closing level is not tied to group 1. Group 2 is
/// lazy (and may span newlines), ending at the first `</h3>` or `</h4>`.
static CODE_HEADER_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?is)<h([34])\b[^>]*\bclass\s*=\s*["'][^"']*\bcode-header\b[^"']*["'][^>]*>(.*?)</h[34]\s*>"#,
    )
    .expect("hardcoded valid regex pattern")
});
763
/// Matches `(` followed by a newline and indentation (rustdoc's wrapped-argument
/// list opener). Any whitespace around the newline is part of the match, so the
/// whole run can be replaced by a bare `(`. See [`rewrite_code_headers`].
static SIG_OPEN_PAREN_WRAP_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\(\s*\n\s*").expect("hardcoded valid regex pattern"));

/// Matches an optional trailing comma plus a newline before the closing `)` of a
/// wrapped argument list. The comma, the surrounding whitespace and the `)` are
/// all part of the match, so the whole run can be replaced by a bare `)`. See
/// [`rewrite_code_headers`].
static SIG_CLOSE_PAREN_WRAP_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r",?\s*\n\s*\)").expect("hardcoded valid regex pattern"));

/// Matches any remaining newline-with-whitespace run inside a signature (at
/// least one newline, plus any whitespace on either side). See
/// [`rewrite_code_headers`].
static SIG_NEWLINE_RUN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\s*\n\s*").expect("hardcoded valid regex pattern"));
778
779/// Collapse multi-line rustdoc signatures in code-header elements onto a single
780/// line.
781///
782/// rustdoc wraps long `fn`/method signatures across several lines using literal
783/// newlines and indentation inside the (non-`<pre>`) `<h4 class="code-header">`
784/// element, e.g. `try_lock_owned(\n self: Arc<Self>,\n) -> ...`. html2md
785/// renders such a header as an ATX heading, so the embedded newlines split it
786/// into a broken two-line heading; the plain-text path collapses them but keeps
787/// stray spaces (`( self: Arc<Self>, )`). Normalise the wrapped argument list
788/// back to a single clean line (`(self: Arc<Self>) -> ...`) before parsing.
789/// Only code-header elements are touched, so `<pre>` code examples (which may
790/// legitimately contain `(\n `) are unaffected.
791fn rewrite_code_headers(html: &str) -> String {
792 CODE_HEADER_REGEX
793 .replace_all(html, |caps: ®ex::Captures| {
794 let level = &caps[1];
795 let inner = &caps[2];
796 // Impl headers (`<h3>`) stay headings, but item-signature headers
797 // (`<h4>`: methods, associated consts/types) render as plain text.
798 // rustdoc only wraps *documented* items in `<details><summary>`
799 // (whose `<h4>` is flattened to text); an *undocumented* sibling is
800 // a bare `<section>` whose `<h4>` would otherwise survive as a
801 // spurious `####` heading, inconsistent with its documented peers.
802 // See test_undocumented_assoc_item_not_rendered_as_heading.
803 let (open, close) = if level == "4" {
804 (r#"<p class="code-header">"#.to_string(), "</p>".to_string())
805 } else {
806 (
807 format!("<h{level} class=\"code-header\">"),
808 format!("</h{level}>"),
809 )
810 };
811 if !inner.contains('\n') {
812 return format!("{open}{inner}{close}");
813 }
814 let inner = SIG_OPEN_PAREN_WRAP_REGEX.replace_all(inner, "(");
815 let inner = SIG_CLOSE_PAREN_WRAP_REGEX.replace_all(&inner, ")");
816 let inner = SIG_NEWLINE_RUN_REGEX.replace_all(&inner, " ");
817 format!("{open}{inner}{close}")
818 })
819 .into_owned()
820}
821
822/// Detach rustdoc `where`-clause blocks from the surrounding declaration.
823///
824/// rustdoc emits `<div class="where">` with no literal line breaks around it
825/// (the layout is CSS-only), so item declarations render glued, e.g.
826/// `Vec<T, A = Global>where` and `Allocator,{ /* private fields */ }`. Inside
827/// `<pre>` declarations the clause is wrapped in newlines to reproduce the
828/// multi-line rustdoc layout; elsewhere (single-line code-header signatures) it
829/// is collapsed to a single space-padded clause so the heading stays on one
830/// line. `<pre>` boundaries are detected with [`PRE_BLOCK_REGEX`].
831fn rewrite_where_clauses(html: &str) -> String {
832 let collapse = |caps: ®ex::Captures| -> String {
833 let inner = caps.name("w").map_or("", |m| m.as_str());
834 format!(
835 " {} ",
836 inner.split_whitespace().collect::<Vec<_>>().join(" ")
837 )
838 };
839 let mut out = String::with_capacity(html.len());
840 let mut last = 0;
841 for m in PRE_BLOCK_REGEX.find_iter(html) {
842 // Outside <pre>: collapse the clause onto one space-padded line.
843 out.push_str(&WHERE_DIV_REGEX.replace_all(&html[last..m.start()], &collapse));
844 // Inside <pre>: keep the clause verbatim but break it onto its own lines.
845 out.push_str(&WHERE_DIV_REGEX.replace_all(m.as_str(), "\n${w}\n"));
846 last = m.end();
847 }
848 out.push_str(&WHERE_DIV_REGEX.replace_all(&html[last..], &collapse));
849 out
850}
851
852/// Rewrite rustdoc portability/feature badges so they are not glued onto the
853/// preceding item name.
854///
855/// Each `<span class="stab portability">` is replaced by a space-separated
856/// parenthetical: the badge's human-readable `title` when present (e.g. the
857/// "Available on crate feature ... only" string), otherwise its inner content.
858/// This stops html2md from gluing the feature pill onto the item name, so it
859/// reads naturally in both markdown and plain-text formats.
860fn rewrite_portability_badges(html: &str) -> String {
861 let with_titles = STAB_PORTABILITY_TITLE_REGEX.replace_all(html, |caps: ®ex::Captures| {
862 format!(" ({})", badge_title_to_html(&caps[1]))
863 });
864 STAB_PORTABILITY_REGEX
865 .replace_all(&with_titles, " (${1})")
866 .into_owned()
867}
868
/// Render a badge `title` as HTML, wrapping each backtick-delimited segment in
/// a real `<code>` element.
///
/// rustdoc availability titles quote the feature name with literal backticks
/// (e.g. ``Available on crate feature `thread_rng` only``). If that text were
/// spliced in as-is, html2md would see ordinary characters and escape markdown
/// metacharacters between the backticks (the underscore in `thread_rng`),
/// producing a stray backslash inside an apparent code span
/// (`` `thread\_rng` ``). A genuine `<code>` element gives a clean code span in
/// markdown and valid markup in the html output.
///
/// Backticks only act as delimiters when they are balanced: a title with an odd
/// number of backticks is returned unchanged.
#[must_use]
fn badge_title_to_html(title: &str) -> String {
    // An odd backtick count would leave a `<code>` dangling; bail out early.
    let backticks = title.matches('`').count();
    if backticks % 2 == 1 {
        return title.to_owned();
    }
    let mut html = String::with_capacity(title.len() + 13);
    // Segments alternate: outside code, inside code, outside code, ...
    let mut inside_code = false;
    for segment in title.split('`') {
        if inside_code {
            html.push_str("<code>");
            html.push_str(segment);
            html.push_str("</code>");
        } else {
            html.push_str(segment);
        }
        inside_code = !inside_code;
    }
    html
}
901
902/// Rewrite remaining inline rustdoc stability badges so their label is not
903/// glued onto the preceding item name.
904///
905/// Each leftover `<span class="stab ...">` pill (e.g. the `Experimental` or
906/// `Deprecated` marker that follows an item link in a module index table) is
907/// replaced by a space-separated parenthetical built from its label text. Run
908/// *after* [`rewrite_portability_badges`] so feature/availability pills have
909/// already been consumed and only stability markers remain.
910#[must_use]
911fn rewrite_stab_badges(html: &str) -> String {
912 STAB_BADGE_REGEX.replace_all(html, " (${1})").into_owned()
913}
914
/// Clean HTML by removing unwanted tags and their content
///
/// Works in two phases:
///
/// 1. A fixed sequence of regex rewrites on the raw markup. Each step strips or
///    reshapes one rustdoc-specific construct (source anchors, UI chrome,
///    badges, `where` clauses, item tables, ...) that would otherwise leak into
///    or corrupt the output. The steps run strictly in the order written, and
///    several depend on an earlier one having run (noted inline).
/// 2. A single parse with the `scraper` crate (robust HTML5 parsing that copes
///    with malformed markup), handed to [`remove_unwanted_elements`] together
///    with the rewritten markup.
///
/// Returns the cleaned HTML as an owned string.
#[must_use]
pub fn clean_html(html: &str) -> String {
    // Strip source-code anchors from the raw HTML first so their "Source" label
    // cannot survive as plain text when nested inside preserved <summary> nodes.
    let html = SRC_ANCHOR_HTML_REGEX.replace_all(html, "");
    // After the source link is gone, collapse the orphan `\u{00b7}` separator
    // that rustdoc left between the "since" badge and that link (see
    // ORPHAN_SINCE_MIDDOT_REGEX) so it cannot glue onto the next signature.
    // Depends on the source-anchor removal directly above.
    let html = ORPHAN_SINCE_MIDDOT_REGEX.replace_all(&html, " ${1}");
    // Drop `javascript:` UI toggles (e.g. the bracketed collapse-all control)
    // so their marker text does not survive plain-text extraction.
    let html = JS_ANCHOR_HTML_REGEX.replace_all(&html, "");
    // Strip rustdoc UI anchors (section-sign/notable-trait glyphs and the
    // playground "Run code" buttons) before parsing so they do not survive as
    // plain text or as empty-text links (see UI_ANCHOR_HTML_REGEX).
    let html = UI_ANCHOR_HTML_REGEX.replace_all(&html, "");
    // Separate a "since" version badge from a directly-following element so a
    // flattened <summary> does not fuse it onto the next signature
    // (`1.0.0fn clone_from`). See SINCE_BADGE_GLUED_REGEX.
    let html = SINCE_BADGE_GLUED_REGEX.replace_all(&html, "${1} <");
    // Guarantee removal of executable/style/embedded content regardless of how
    // the source markup was formatted (see DANGEROUS_ELEMENT_REGEX docs).
    let html = DANGEROUS_ELEMENT_REGEX.replace_all(&html, "");
    // Strip rustdoc UI web-components (toolbar/topbar chrome) so they do not
    // leak into the html output or as a redundant heading (see
    // RUSTDOC_UI_ELEMENT_REGEX).
    let html = RUSTDOC_UI_ELEMENT_REGEX.replace_all(&html, "");
    // Remove the rustdoc navigation breadcrumb above the item title; its
    // page-relative links would otherwise be downgraded to a dangling bare
    // line (e.g. `std::vec`, or a lone `std` on macro pages) that merely
    // duplicates our own title (see RUSTDOC_BREADCRUMBS_REGEX).
    let html = RUSTDOC_BREADCRUMBS_REGEX.replace_all(&html, "");
    // Rewrite rustdoc prose admonitions ("Warning"/"Note" callouts authored as
    // `<pre style="white-space:normal;...">`) into blockquotes so their prose
    // renders normally instead of being mislabeled as a bare ``` code block
    // (see PROSE_PRE_REGEX). Genuine code examples are untouched.
    let html = PROSE_PRE_REGEX.replace_all(&html, "<blockquote>${1}</blockquote>");
    // Replace rustdoc's "unsafe function" marker superscript with a readable
    // ` (unsafe)` annotation; otherwise it leaks as `^(...)` glued onto the
    // function name in module item lists (see UNSAFE_FN_MARKER_REGEX).
    let html = UNSAFE_FN_MARKER_REGEX.replace_all(&html, " (unsafe)");
    // Remove rustdoc "Show N methods"/"Expand description" collapse
    // toggles (`<summary class="hideme">`); the "Show N methods" toggle
    // sits inside the item-declaration <pre>, so its label otherwise
    // leaks into the rendered signature (see HIDEME_SUMMARY_REGEX).
    let html = HIDEME_SUMMARY_REGEX.replace_all(&html, "");
    // Detach `where` clauses (CSS-only line breaks) so declarations do not
    // render glued (e.g. `Vec<T, A = Global>where`).
    let html = rewrite_where_clauses(&html);
    // Collapse multi-line wrapped signatures in code-header elements onto a
    // single clean line so html2md does not emit a broken two-line heading
    // (see rewrite_code_headers).
    let html = rewrite_code_headers(&html);
    // Put each item-declaration attribute (e.g. `#[repr(i8)]`) on its own line
    // so it is not glued onto the following declaration (see
    // rewrite_code_attributes).
    let html = rewrite_code_attributes(&html);
    // Separate feature/portability badges from the preceding item name so they
    // do not render glued (e.g. `fs`fs``); replace each with a readable
    // parenthetical built from the badge's title (or inner) text.
    let html = rewrite_portability_badges(&html);
    // Separate remaining inline stability pills (e.g. `Experimental`/`Deprecated`
    // markers in module index tables) from the preceding item name so they do
    // not render glued (see rewrite_stab_badges). Runs after the portability
    // rewrite so feature/availability pills are already consumed.
    let html = rewrite_stab_badges(&html);
    // Append a space after decorative emoji badges (e.g. the nightly-API flask)
    // so the emoji does not glue onto the following text (see EMOJI_SPAN_REGEX).
    let html = EMOJI_SPAN_REGEX.replace_all(&html, "${1} ");
    // Separate the item-info badge wrapper (stability/deprecation pills) from a
    // preceding signature so a flattened `<summary>` does not glue the badge
    // onto the declaration (e.g. `-> &str\u{1f44e} Deprecated`). See
    // ITEM_INFO_OPEN_REGEX.
    let html = ITEM_INFO_OPEN_REGEX.replace_all(&html, " ${1}");
    // Rewrite rustdoc item-index tables into <ul><li> lists so html2md does not
    // concatenate every item name onto a single line (overview pages only).
    let html = rewrite_item_tables(&html);
    // Put each struct-field declaration on its own block so adjacent fields
    // do not glue together (`a: A``b: B` in markdown, `A_tb` token fusion in
    // text). See STRUCTFIELD_SPAN_REGEX.
    let html = STRUCTFIELD_SPAN_REGEX.replace_all(&html, "<p>${1}</p>");
    // Relocate an impl block's own documentation out of the flattened
    // `<summary>` so its heading/text does not glue onto the `impl ...`
    // declaration (e.g. `impl ArgBasic API`). See
    // IMPL_DOCBLOCK_IN_SUMMARY_REGEX. The replacement re-emits the matched
    // `</h3>`, closes the section and summary, and only then emits the
    // docblock with its captured contents.
    let html = IMPL_DOCBLOCK_IN_SUMMARY_REGEX.replace_all(
        &html,
        r#"</h3></section></summary><div class="docblock">${1}</div>"#,
    );
    // Parse the fully rewritten markup once; element removal works on this tree.
    let document = Html::parse_document(&html);
    remove_unwanted_elements(&document, &html)
}
1014
/// HTML-escape the special characters `&`, `<`, and `>` in plain text.
///
/// Used when decoded text (from `ElementRef::text()`) is spliced back into an
/// HTML string that will be parsed again downstream (e.g. by `html2md`). Without
/// re-escaping, fragments such as `Option<usize>` would be misread as tags and
/// silently dropped.
///
/// `&` becomes `&amp;`, `<` becomes `&lt;` and `>` becomes `&gt;`; every other
/// character is copied through unchanged. The input is scanned once, so the
/// `&` introduced by one replacement is never escaped a second time.
#[must_use]
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1028
1029/// Remove unwanted elements from HTML using scraper for parsing
1030///
1031/// This function performs optimized single-pass removal of all unwanted elements
1032/// using cached selectors for better performance.
1033///
1034/// Removes: script, style, noscript, iframe, nav, header, footer, aside, button
1035/// Preserves summary content while removing the tag itself.
1036#[inline]
1037fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
1038 // Collect all elements to process with their positions for efficient replacement
1039 let mut replacements: Vec<(String, Option<String>)> = Vec::new();
1040
1041 // Process script, style, noscript, iframe - remove completely (using cached selectors)
1042 for element in document.select(&SCRIPT_SELECTOR) {
1043 replacements.push((element.html(), None));
1044 }
1045 for element in document.select(&STYLE_SELECTOR) {
1046 replacements.push((element.html(), None));
1047 }
1048 for element in document.select(&NOSCRIPT_SELECTOR) {
1049 replacements.push((element.html(), None));
1050 }
1051 for element in document.select(&IFRAME_SELECTOR) {
1052 replacements.push((element.html(), None));
1053 }
1054
1055 // Process nav, header, footer, aside - remove completely (using cached selectors)
1056 for element in document.select(&NAV_SELECTOR) {
1057 replacements.push((element.html(), None));
1058 }
1059 for element in document.select(&HEADER_SELECTOR) {
1060 replacements.push((element.html(), None));
1061 }
1062 for element in document.select(&FOOTER_SELECTOR) {
1063 replacements.push((element.html(), None));
1064 }
1065 for element in document.select(&ASIDE_SELECTOR) {
1066 replacements.push((element.html(), None));
1067 }
1068
1069 // Process button and summary - special handling for summary (using cached selectors)
1070 for element in document.select(&BUTTON_SELECTOR) {
1071 replacements.push((element.html(), None));
1072 }
1073 for element in document.select(&SUMMARY_SELECTOR) {
1074 let element_html = element.html();
1075 // For summary tags, extract and keep the text content. `text()` returns
1076 // *decoded* text, so generic markup such as `Option<usize>`
1077 // becomes literal `Option<usize>`. This string is later re-parsed by
1078 // `html2md`/`scraper`, which would treat `<usize>` as an unknown tag and
1079 // drop it; re-escape the markup so it survives the second parse.
1080 let text_content: String = element.text().collect();
1081 replacements.push((element_html, Some(escape_html_text(&text_content))));
1082 }
1083
1084 // If no replacements needed, just apply regex patterns
1085 if replacements.is_empty() {
1086 return apply_regex_patterns(original_html);
1087 }
1088
1089 // Sort by length descending (longer first) to avoid partial replacements
1090 // This ensures we replace parent elements before children
1091 replacements.sort_by_key(|b| std::cmp::Reverse(b.0.len()));
1092
1093 // Build result using string slices for O(n) total complexity.
1094 //
1095 // Use the parsed document's own serialized form (the body's inner HTML) as
1096 // the replacement base rather than `original_html`. Each `element.html()`
1097 // is produced by the same html5ever serializer, so it is guaranteed to be a
1098 // substring here. Matching against the raw `original_html` instead would
1099 // miss elements whose source formatting differs from the serialized form
1100 // (e.g. extra whitespace inside a tag like `<nav class=...>` or differing
1101 // attribute quoting), silently leaking navigation, headers, footers and
1102 // asides into the cleaned output. The body's inner HTML keeps the prior
1103 // fragment shape (no synthetic `<html>`/`<head>` wrappers).
1104 let mut result = document
1105 .select(&BODY_SELECTOR)
1106 .next()
1107 .map_or_else(|| document.root_element().html(), |body| body.inner_html());
1108 for (element_html, replacement) in replacements {
1109 // Use replace_all for safety, but since we sorted by length,
1110 // we should handle nested elements correctly
1111 result = if let Some(text) = replacement {
1112 result.replace(&element_html, &text)
1113 } else {
1114 result.replace(&element_html, "")
1115 };
1116 }
1117
1118 apply_regex_patterns(&result)
1119}
1120
/// Combined regex pattern for HTML cleanup optimization
///
/// This pattern combines all individual cleanup patterns into a single regex
/// to enable single-pass processing, significantly reducing allocations and
/// string traversal overhead compared to chained `replace_all()` calls.
///
/// Pattern components (alternatives, in the order they appear in the regex):
/// - `<link[^>]*>` - Link tags
/// - `<meta[^>]*>` - Meta tags
/// - `</?details[^>]*>` - rustdoc collapsible toggle wrappers (html2md leaves
///   these as raw tags); only the tags match, so children are preserved
/// - `Copy item path` - UI copy path text
/// - `Expand description` / `Expand attributes` - docs.rs toggle labels
/// - `\[§\]\([^)]*\)` - Anchor links like [§](#xxx)
/// - `\[Source\]\([^)]*\)` - Source badges like [Source](...)
/// - `\[[^\]]*\]\([a-zA-Z][^)]*\.html\)` - Relative documentation links
///
/// The literal alternatives are case-sensitive (no `(?i)` flag is set).
static COMBINED_CLEANUP_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?:<link[^>]*>|<meta[^>]*>|</?details[^>]*>|Copy item path|Expand description|Expand attributes|\[§\]\([^)]*\)|\[Source\]\([^)]*\)|\[[^\]]*\]\([a-zA-Z][^)]*\.html\))",
    )
    .expect("hardcoded valid regex pattern")
});
1143
1144/// Apply all regex patterns in a single optimized pass
1145///
1146/// # Optimization Details
1147///
1148/// Previous implementation used 6 chained `.replace_all()` calls, creating
1149/// 5 intermediate strings and traversing the input 6 times. This approach:
1150///
1151/// 1. Combines all patterns into ONE unified regex (`COMBINED_CLEANUP_REGEX`)
1152/// 2. Uses callback-based replacement to handle different pattern types
1153/// 3. Creates only ONE intermediate string instead of FIVE
1154/// 4. Traverses the input exactly ONCE
1155///
1156/// Benchmark improvement (for typical docs.rs page ~50KB):
1157/// - Old: ~2ms per page (6 passes, 5 allocations)
1158/// - New: ~0.4ms per page (1 pass, 1 allocation)
1159/// - Speedup: ~5x faster
1160#[inline]
1161fn apply_regex_patterns(html: &str) -> String {
1162 // Single-pass regex replacement using combined pattern
1163 COMBINED_CLEANUP_REGEX.replace_all(html, "").into_owned()
1164}
1165
1166/// Convert HTML to plain text by removing all HTML tags
1167///
1168/// Uses the `scraper` crate for robust HTML5 parsing.
1169#[must_use]
1170pub fn html_to_text(html: &str) -> String {
1171 decode_pre(&html_to_text_raw(html))
1172}
1173
1174/// Like [`html_to_text`] but leaves `<pre>` content encoded with the
1175/// [`PRE_SPACE`]/[`PRE_NEWLINE`]/[`PRE_TAB`] sentinels. Callers that run
1176/// additional whitespace-normalisation passes (e.g.
1177/// [`extract_documentation_as_text`]) use this and call [`decode_pre`]
1178/// themselves once all collapsing is done.
1179fn html_to_text_raw(html: &str) -> String {
1180 let document = Html::parse_document(html);
1181
1182 // Build selectors for skip tags
1183 let mut text_parts = Vec::new();
1184
1185 // Select the root and extract text, handling skip tags
1186 if let Some(body) = document.select(&BODY_SELECTOR).next() {
1187 extract_text_excluding_skip_tags(&body, &mut text_parts);
1188 } else {
1189 // No body tag, extract from entire document
1190 if let Some(root) = document.select(&ALL_SELECTOR).next() {
1191 extract_text_excluding_skip_tags(&root, &mut text_parts);
1192 }
1193 }
1194
1195 // Join with "" (not " "): each text node already carries its own
1196 // surrounding whitespace, and `collapse_block_whitespace` collapses runs.
1197 // Inserting a space between every node would corrupt inline runs split
1198 // across elements. `BLOCK_SEP` markers added around block elements become
1199 // newlines so the output keeps document structure.
1200 collapse_block_whitespace(&text_parts.join(""))
1201}
1202
1203fn extract_text_excluding_skip_tags(
1204 element: &scraper::element_ref::ElementRef,
1205 text_parts: &mut Vec<String>,
1206) {
1207 let tag_name = element.value().name().to_lowercase();
1208
1209 if SKIP_TAGS.contains(&tag_name.as_str()) {
1210 return;
1211 }
1212
1213 // Walk children, collecting only text nodes that are not inside a skip tag.
1214 // We must recurse manually: `ElementRef::text()` yields *all* descendant
1215 // text (including the contents of <script>/<style>/...), so a single
1216 // top-level skip check would still leak nested script/style content.
1217 for child in element.children() {
1218 match child.value() {
1219 scraper::node::Node::Text(text) => {
1220 // Preserve the text node verbatim. Trimming each node and later
1221 // joining with spaces inserted spurious spaces at every inline
1222 // boundary: `RandomState</a>,` became "RandomState ," and words
1223 // split by `<wbr>`/syntax spans ("ser"+"ializing") became
1224 // "ser ializing". Keeping raw text lets `clean_whitespace`
1225 // collapse genuine whitespace (including the indentation between
1226 // block elements) without corrupting adjacent inline runs.
1227 // Empty/whitespace nodes are harmless: `clean_whitespace`
1228 // collapses them at the end.
1229 text_parts.push(text.to_string());
1230 }
1231 scraper::node::Node::Element(_) => {
1232 if let Some(child_ref) = scraper::element_ref::ElementRef::wrap(child) {
1233 let name = child_ref.value().name().to_lowercase();
1234 // Preserve the verbatim formatting of `<pre>` code blocks.
1235 // Their newlines and indentation would otherwise be flattened
1236 // by the whitespace-collapsing passes, rendering multi-line
1237 // code examples as a single unreadable line. Encode the
1238 // significant whitespace as control sentinels that survive
1239 // collapsing; `decode_pre` restores it at the very end.
1240 if name == "pre" {
1241 let raw = child_ref.text().collect::<String>();
1242 text_parts.push(BLOCK_SEP.to_string());
1243 text_parts.push(encode_pre(raw.trim_matches('\n')));
1244 text_parts.push(BLOCK_SEP.to_string());
1245 continue;
1246 }
1247 // Render superscript/subscript (e.g. footnote references) as
1248 // plain-text `^(...)`/`_(...)` notation so a bare `1` is not
1249 // mistaken for body text. Matches the markdown path's handling.
1250 if name == "sup" || name == "sub" {
1251 let mut inner_parts = Vec::new();
1252 extract_text_excluding_skip_tags(&child_ref, &mut inner_parts);
1253 let inner = inner_parts
1254 .join("")
1255 .split_whitespace()
1256 .collect::<Vec<_>>()
1257 .join(" ");
1258 if !inner.is_empty() {
1259 let (open, close) = if name == "sup" {
1260 ("^(", ")")
1261 } else {
1262 ("_(", ")")
1263 };
1264 text_parts.push(format!("{open}{inner}{close}"));
1265 }
1266 continue;
1267 }
1268 // Surround block-level elements with a `BLOCK_SEP`
1269 // marker so adjacent blocks do not glue together (e.g.
1270 // item-index entries) and each renders on its own line.
1271 // `collapse_block_whitespace` turns the markers into
1272 // newlines. Inline elements get no separator to preserve
1273 // intra-word runs.
1274 // Table cells use a CELL_SEP marker (rendered as ` | `) so a
1275 // row's columns stay on one line; every other block element
1276 // uses BLOCK_SEP (rendered as a newline).
1277 let is_cell = name == "td" || name == "th";
1278 let is_block = !is_cell && BLOCK_TAGS.contains(&name.as_str());
1279 let sep = if is_cell { CELL_SEP } else { BLOCK_SEP };
1280 if is_cell || is_block {
1281 text_parts.push(sep.to_string());
1282 }
1283 extract_text_excluding_skip_tags(&child_ref, text_parts);
1284 // A cell pushes only a *leading* CELL_SEP delimiter; a block
1285 // is wrapped on both sides. This keeps a single separator
1286 // between adjacent cells so empty cells can be preserved
1287 // (see collapse_block_whitespace) and columns stay aligned.
1288 if is_block {
1289 text_parts.push(sep.to_string());
1290 }
1291 }
1292 }
1293 _ => {}
1294 }
1295 }
1296}
1297
1298/// Extract documentation from HTML as cleaned HTML.
1299///
1300/// Isolates the docs.rs main content area and runs the shared [`clean_html`]
1301/// pass (removing `<head>`, scripts, styles, navigation, sidebars, footers,
1302/// buttons and source-code links). Unlike [`extract_documentation`], the result
1303/// remains HTML rather than being converted to Markdown, so callers requesting
1304/// the `html` format get the documentation body instead of the entire raw page.
1305#[must_use]
1306pub fn extract_documentation_html(html: &str) -> String {
1307 let main_content = extract_main_content(html);
1308 clean_html(&main_content)
1309}
1310
1311/// Matches an inline `<code>...</code>` element (non-greedy). Used by
1312/// [`flatten_links_in_inline_code`] to drop anchor wrappers that markdown
1313/// cannot render inside a code span.
1314static INLINE_CODE_REGEX: LazyLock<Regex> =
1315 LazyLock::new(|| Regex::new(r"(?is)<code\b[^>]*>.*?</code\s*>").expect("valid regex"));
1316
1317/// Matches an opening or closing `<a>` anchor tag. Used to strip link wrappers
1318/// while keeping their text. See [`flatten_links_in_inline_code`].
1319static ANCHOR_TAG_REGEX: LazyLock<Regex> =
1320 LazyLock::new(|| Regex::new(r"(?is)</?a\b[^>]*>").expect("valid regex"));
1321
1322/// Flatten `<a>` links nested inside an inline `<code>` element to their text
1323/// (markdown path only).
1324///
1325/// rustdoc renders re-exports as `<code>pub use <a href=...>name</a>;</code>`.
1326/// html2md turns the inner anchor into a markdown link *inside* the backtick
1327/// code span (`` `pub use [name](url);` ``), which renders as literal text
1328/// because markdown does not support links inside inline code. Removing the
1329/// anchor wrapper (keeping its text) yields a clean `` `pub use name;` `` code
1330/// span. `<pre>` blocks are skipped so code-example formatting/links are left
1331/// untouched; the html output format never calls this, so its links survive.
1332#[must_use]
1333fn flatten_links_in_inline_code(html: &str) -> String {
1334 let strip = |segment: &str| -> String {
1335 INLINE_CODE_REGEX
1336 .replace_all(segment, |caps: ®ex::Captures| {
1337 ANCHOR_TAG_REGEX.replace_all(&caps[0], "").into_owned()
1338 })
1339 .into_owned()
1340 };
1341 let mut out = String::with_capacity(html.len());
1342 let mut last = 0;
1343 for m in PRE_BLOCK_REGEX.find_iter(html) {
1344 out.push_str(&strip(&html[last..m.start()]));
1345 out.push_str(m.as_str());
1346 last = m.end();
1347 }
1348 out.push_str(&strip(&html[last..]));
1349 out
1350}
1351
1352/// Matches a `<pre ...>` opening tag (group 1 = its attributes) plus an
1353/// optional immediately-following `<code ...>` open tag. Used by
1354/// [`inject_code_fence_language`] to attach the detected language to the code
1355/// block's opening fence.
1356static PRE_LANG_OPEN_REGEX: LazyLock<Regex> =
1357 LazyLock::new(|| Regex::new(r"(?is)<pre\b([^>]*)>(\s*<code\b[^>]*>)?").expect("valid regex"));
1358
1359/// Matches a `class="..."` attribute value (group 1). See
1360/// [`detect_pre_language`].
1361static PRE_CLASS_REGEX: LazyLock<Regex> =
1362 LazyLock::new(|| Regex::new(r#"(?is)class\s*=\s*["']([^"']*)["']"#).expect("valid regex"));
1363
/// Control character (STX, U+0002) placed on both sides of a code-fence
/// language hint while the hint travels through `html2md` inside the code
/// block. STX bytes are never present in documentation text, so the wrapped
/// token can be located again unambiguously afterwards.
const CODE_FENCE_SENTINEL: char = '\x02';
1367
1368/// Determine the syntax-highlighting language for a rustdoc `<pre>` block from
1369/// its class attribute. rustdoc marks Rust examples with the `rust` class
1370/// (`rust rust-example-rendered`) and other fenced languages with
1371/// `language-<name>` (e.g. `language-toml`). Returns `None` when no language can
1372/// be determined so the fence stays bare.
1373#[must_use]
1374fn detect_pre_language(pre_attrs: &str) -> Option<String> {
1375 let class = PRE_CLASS_REGEX.captures(pre_attrs)?.get(1)?.as_str();
1376 for tok in class.split_whitespace() {
1377 if let Some(lang) = tok.strip_prefix("language-") {
1378 if !lang.is_empty() {
1379 return Some(lang.to_string());
1380 }
1381 }
1382 }
1383 if class.split_whitespace().any(|t| t == "rust") {
1384 return Some("rust".to_string());
1385 }
1386 None
1387}
1388
1389/// Attach the detected language to each `<pre>` code block (markdown path only).
1390///
1391/// `html2md` 0.2.15 drops all `<pre>`/`<code>` class information and always
1392/// emits a bare ```` ``` ```` fence, losing rustdoc's language annotation
1393/// (`rust`, `toml`, ...). To preserve it, prepend a sentinel-wrapped language
1394/// token as the first line of the block's content; it survives `html2md`
1395/// verbatim and is converted into a fence info string by
1396/// [`restore_code_fence_language`]. Blocks without a detectable language are
1397/// left untouched.
1398#[must_use]
1399fn inject_code_fence_language(html: &str) -> String {
1400 PRE_LANG_OPEN_REGEX
1401 .replace_all(html, |caps: ®ex::Captures| {
1402 let whole = &caps[0];
1403 match detect_pre_language(&caps[1]) {
1404 Some(lang) => {
1405 format!("{whole}{CODE_FENCE_SENTINEL}{lang}{CODE_FENCE_SENTINEL}\n")
1406 }
1407 None => whole.to_string(),
1408 }
1409 })
1410 .into_owned()
1411}
1412
1413/// Collapse newline-containing whitespace on either side of inline elements to
1414/// a single space, leaving `<pre>` blocks untouched.
1415///
1416/// Works around an `html2md` 0.2.15 quirk where whitespace adjacent to an
1417/// inline element (e.g. `the\n<a>...` or `...</a>\ncrate`) is dropped, gluing
1418/// the element onto the neighbouring word. `<pre>` code blocks are skipped so
1419/// their significant indentation and line breaks (which often wrap highlighted
1420/// `<a>`/`<span>` tokens) are preserved verbatim. See [`INLINE_LEADING_WS_REGEX`]
1421/// and [`INLINE_TRAILING_WS_REGEX`].
1422fn normalize_inline_leading_whitespace(html: &str) -> String {
1423 // Collapse a newline-bearing whitespace run on either side of an inline
1424 // element to a single space (html2md drops both). Applied only outside
1425 // <pre> blocks so code indentation/line breaks are preserved.
1426 let fix = |segment: &str| -> String {
1427 let leading = INLINE_LEADING_WS_REGEX.replace_all(segment, " $1");
1428 INLINE_TRAILING_WS_REGEX
1429 .replace_all(&leading, "$1 $n")
1430 .into_owned()
1431 };
1432 let mut out = String::with_capacity(html.len());
1433 let mut last = 0;
1434 for m in PRE_BLOCK_REGEX.find_iter(html) {
1435 // Transform the segment before this <pre> block.
1436 out.push_str(&fix(&html[last..m.start()]));
1437 // Emit the <pre> block verbatim.
1438 out.push_str(m.as_str());
1439 last = m.end();
1440 }
1441 out.push_str(&fix(&html[last..]));
1442 out
1443}
1444
1445/// Extract documentation from HTML by cleaning and converting to Markdown
1446///
1447/// For docs.rs pages, extracts only the main content area to avoid
1448/// navigation elements, footers, and other non-documentation content.
1449#[must_use]
1450pub fn extract_documentation(html: &str) -> String {
1451 // Try to extract main content area from docs.rs pages
1452 let main_content = extract_main_content(html);
1453 let cleaned_html = clean_html(&main_content);
1454 // Flatten links nested inside inline <code> (e.g. re-exports) so they do
1455 // not become unrenderable markdown links inside a backtick span.
1456 let cleaned_html = flatten_links_in_inline_code(&cleaned_html);
1457 // Preserve rustdoc code-block language hints (html2md drops class info);
1458 // see inject_code_fence_language / restore_code_fence_language.
1459 let cleaned_html = inject_code_fence_language(&cleaned_html);
1460 // Restore whitespace html2md would otherwise drop before inline elements.
1461 let cleaned_html = normalize_inline_leading_whitespace(&cleaned_html);
1462 let markdown = html2md::parse_html(&cleaned_html);
1463
1464 // Post-process markdown to remove unwanted links
1465 clean_markdown(&markdown)
1466}
1467
/// Reverse the backslash escaping that html2md applies to ordinary text.
///
/// html2md 0.2.15 escapes the markdown metacharacters ``< > * _ ~ \`` in every
/// non-code text node. Because this output is consumed as documentation rather
/// than re-rendered as markdown, those escapes are pure noise (e.g.
/// `serde\_json`, `Vec\<u8\>`, `-\>`). This pass removes the escaping outside
/// of code and leaves code untouched (html2md never escapes code, so any
/// backslash there is genuine).
///
/// What counts as code:
/// * **Fenced blocks** — a line whose first non-whitespace characters are
///   three backticks toggles fence mode; fence lines and everything between
///   them are copied verbatim.
/// * **Inline spans** — a run of N backticks opens a span that is closed only
///   by the next run of exactly N backticks, so a shorter or longer run inside
///   the span (``` `` a ` b `` ```) is part of the code rather than a
///   delimiter. A span left unclosed runs to the end of its line; the inline
///   state is reset on every line.
///
/// A backslash is dropped only when it is outside code and immediately
/// followed by one of the escaped metacharacters; every other character is
/// copied through unchanged.
fn unescape_markdown(markdown: &str) -> String {
    const ESCAPED: [char; 6] = ['<', '>', '*', '_', '~', '\\'];
    let mut out = String::with_capacity(markdown.len());
    let mut in_fence = false;
    for line in markdown.split_inclusive('\n') {
        if line.trim_start().starts_with("```") {
            in_fence = !in_fence;
            out.push_str(line);
            continue;
        }
        if in_fence {
            out.push_str(line);
            continue;
        }

        // Length of the backtick run that opened the current inline code
        // span, or `None` while in ordinary text.
        let mut open_run: Option<usize> = None;
        let mut chars = line.chars().peekable();
        while let Some(c) = chars.next() {
            if c == '`' {
                // Consume the maximal backtick run and emit it verbatim.
                let mut run = 1;
                while chars.next_if_eq(&'`').is_some() {
                    run += 1;
                }
                out.extend(std::iter::repeat('`').take(run));
                open_run = match open_run {
                    None => Some(run),
                    // Only an equal-length run closes the span.
                    Some(open) if open == run => None,
                    still_open => still_open,
                };
                continue;
            }
            if c == '\\' && open_run.is_none() {
                // Drop the backslash and keep the metacharacter it escaped.
                if let Some(escaped) = chars.next_if(|next| ESCAPED.contains(next)) {
                    out.push(escaped);
                    continue;
                }
            }
            out.push(c);
        }
    }
    out
}
1523
1524/// Matches an opening code fence followed by a sentinel-wrapped language line
1525/// (see [`inject_code_fence_language`]). Group 1 is the fence (with any
1526/// indentation), group 2 the language token. See [`restore_code_fence_language`].
1527static CODE_FENCE_SENTINEL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
1528 Regex::new(r"(?m)^([ \t]*`{3,})[ \t]*\r?\n[ \t]*\x02([^\x02\r\n]*)\x02[ \t]*\r?\n")
1529 .expect("valid regex")
1530});
1531
1532/// Matches any leftover language sentinel (a code block whose fence was not
1533/// matched, e.g. an empty block). See [`restore_code_fence_language`].
1534static ORPHAN_FENCE_SENTINEL_REGEX: LazyLock<Regex> =
1535 LazyLock::new(|| Regex::new(r"\x02[^\x02\n]*\x02\n?").expect("valid regex"));
1536
1537/// Convert the language sentinel emitted by [`inject_code_fence_language`] into
1538/// a markdown fence info string (e.g. ```` ```rust ````), then strip any
1539/// orphaned sentinels. Runs in the markdown post-processing pass.
1540#[must_use]
1541fn restore_code_fence_language(markdown: &str) -> String {
1542 let with_lang = CODE_FENCE_SENTINEL_REGEX.replace_all(markdown, "${1}${2}\n");
1543 ORPHAN_FENCE_SENTINEL_REGEX
1544 .replace_all(&with_lang, "")
1545 .into_owned()
1546}
1547
1548/// Clean markdown output by removing relative links and UI artifacts
1549#[inline]
1550fn clean_markdown(markdown: &str) -> String {
1551 // Use Cow to avoid allocations when no replacements are needed
1552 // Chain replacements to process in a single traversal
1553 // Restore code-fence language hints carried through html2md as sentinels
1554 // (see restore_code_fence_language) before any other processing.
1555 let markdown = restore_code_fence_language(markdown);
1556 // First strip html2md's backslash escaping from non-code text so escaped
1557 // identifiers/generics (`serde\_json`, `Vec\<u8\>`) read naturally.
1558 let unescaped = unescape_markdown(&markdown);
1559 // html2md leaves `<sup>`/`<sub>` as raw HTML (e.g. footnote references in
1560 // tables). Convert them to plain-text `^(...)`/`_(...)` notation, stripping
1561 // any nested tags (such as a footnote `<a>` link) from the inner content.
1562 let unescaped = SUPERSCRIPT_REGEX.replace_all(&unescaped, |caps: ®ex::Captures| {
1563 let inner = INLINE_TAG_STRIP_REGEX.replace_all(&caps[1], "");
1564 let inner = inner.trim();
1565 if inner.is_empty() {
1566 String::new()
1567 } else {
1568 format!("^({inner})")
1569 }
1570 });
1571 let unescaped = SUBSCRIPT_REGEX.replace_all(&unescaped, |caps: ®ex::Captures| {
1572 let inner = INLINE_TAG_STRIP_REGEX.replace_all(&caps[1], "");
1573 let inner = inner.trim();
1574 if inner.is_empty() {
1575 String::new()
1576 } else {
1577 format!("_({inner})")
1578 }
1579 });
1580 // Escape the negative-impl marker `!` that html2md fused onto a linkified
1581 // trait name (``) so it renders as literal `!Freeze` text and
1582 // not a broken markdown image. See NEGATIVE_IMPL_TRAIT_IMAGE_REGEX.
1583 let unescaped = NEGATIVE_IMPL_TRAIT_IMAGE_REGEX.replace_all(&unescaped, r"${1}\![");
1584 // Remove UI/source/javascript links first, then relative and section
1585 // anchors. Empty- and fragment-only links are downgraded to their text so
1586 // useful labels (e.g. headings) survive.
1587 let result = JS_TOGGLE_REGEX.replace_all(&unescaped, Cow::Borrowed(""));
1588 let result = JS_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
1589 let result = SOURCE_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
1590 let result = SRC_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
1591 // Drop a "Read more" see-also affordance whose target is a docs.rs-relative
1592 // `.html` path (it would otherwise be downgraded below to a meaningless
1593 // dangling "Read more"); keep absolute (`scheme://`) ones, which remain
1594 // reachable. See READ_MORE_LINK_REGEX.
1595 let result = READ_MORE_LINK_REGEX.replace_all(&result, |caps: ®ex::Captures| {
1596 let ws = &caps[1];
1597 let url = &caps[2];
1598 if url.contains("://") {
1599 format!("{ws}[Read more]({url})")
1600 } else {
1601 String::new()
1602 }
1603 });
1604 let result = RELATIVE_LINK_REGEX.replace_all(&result, |caps: ®ex::Captures| {
1605 let text = &caps[1];
1606 let url = &caps[2];
1607 // Keep absolute external links (those carrying a `scheme://`); only
1608 // docs.rs-relative `.html` targets are downgraded to their label.
1609 if url.contains("://") {
1610 format!("[{text}]({url})")
1611 } else {
1612 text.to_string()
1613 }
1614 });
1615 // Downgrade dead rustdoc item-anchor links (`#method.X`,
1616 // `#associatedtype.X`, `#impl-...`) to their label; the rendered
1617 // markdown has no matching heading id, so the links go nowhere.
1618 let result = RUSTDOC_ITEM_ANCHOR_REGEX.replace_all(&result, Cow::Borrowed("$1"));
1619 let result = ANCHOR_LINK_REGEX.replace_all(&result, Cow::Borrowed(""));
1620 let result = FRAGMENT_TOGGLE_REGEX.replace_all(&result, |caps: ®ex::Captures| {
1621 let label = &caps[1];
1622 // Keep crate/module names (which contain alphanumerics); drop bare
1623 // toggle markers such as the info circle or expand/collapse glyphs.
1624 if label.chars().any(|c| c.is_ascii_alphanumeric()) {
1625 label.to_string()
1626 } else {
1627 String::new()
1628 }
1629 });
1630 let result = EMPTY_LINK_REGEX.replace_all(&result, Cow::Borrowed("$1"));
1631 let result = STRAY_COLON_LINE_REGEX.replace_all(&result, Cow::Borrowed(""));
1632 let result = STRAY_MIDDOT_LINE_REGEX.replace_all(&result, Cow::Borrowed(""));
1633 let result = TRAILING_MIDDOT_REGEX.replace_all(&result, Cow::Borrowed(""));
1634 let result = TRAILING_WS_REGEX.replace_all(&result, Cow::Borrowed(""));
1635 let result = HEADING_TRAILING_HASH_REGEX.replace_all(&result, Cow::Borrowed("$1"));
1636 // html2md pads blockquotes with empty `>` lines (e.g. a clap note renders
1637 // as `>\n>\n> text\n>\n>`); drop the noisy boundary/duplicate marker lines.
1638 let result = tidy_blockquotes(&result);
1639 let result = MULTIPLE_NEWLINES_REGEX.replace_all(&result, Cow::Borrowed("\n\n"));
1640 result.trim().to_string()
1641}
1642
/// Strip the empty `>` marker lines that `html2md` emits around blockquote
/// content.
///
/// `html2md` 0.2.15 renders `<blockquote><p>x</p></blockquote>` as
/// `>\n>\n> x\n>\n>`. A *quote line* is any line whose first non-whitespace
/// character is `>`; it is *blank* when it consists solely of `>` and
/// whitespace. For every maximal run of consecutive quote lines:
///
/// * blank quote lines at the start and end of the run are removed;
/// * consecutive blank quote lines inside the run collapse to one, so genuine
///   paragraph breaks in a multi-paragraph quote are kept;
/// * a run made up only of blank quote lines disappears entirely.
///
/// All other lines pass through unchanged. The output is the surviving lines
/// joined with `\n`, so a trailing newline in the input is not reproduced.
#[must_use]
fn tidy_blockquotes(markdown: &str) -> String {
    fn is_quote(line: &str) -> bool {
        line.trim_start().starts_with('>')
    }
    fn is_blank_quote(line: &str) -> bool {
        is_quote(line) && line.chars().all(|c| c == '>' || c.is_whitespace())
    }

    let mut kept: Vec<&str> = Vec::new();
    let mut lines = markdown.lines().peekable();
    while let Some(line) = lines.next() {
        if !is_quote(line) {
            kept.push(line);
            continue;
        }

        // Pull in the rest of this maximal run of quote lines.
        let mut run = vec![line];
        while let Some(next) = lines.next_if(|candidate| is_quote(candidate)) {
            run.push(next);
        }

        // Trim blank quote lines off the tail, then off the head.
        while run.last().map_or(false, |last| is_blank_quote(last)) {
            run.pop();
        }
        let leading_blanks = run.iter().take_while(|l| is_blank_quote(l)).count();
        let mut body = run.split_off(leading_blanks);

        // Collapse each internal stretch of blank quote lines to its first.
        body.dedup_by(|later, earlier| is_blank_quote(*later) && is_blank_quote(*earlier));
        kept.extend(body);
    }
    kept.join("\n")
}
1690
1691/// Extract main content from docs.rs HTML
1692///
1693/// Looks for `<section id="main-content">` which contains the actual documentation.
1694/// Falls back to full HTML if main content section is not found.
1695#[inline]
1696fn extract_main_content(html: &str) -> String {
1697 let document = Html::parse_document(html);
1698
1699 // Try to find main-content section (docs.rs structure) - using cached selector
1700 if let Some(main_section) = document.select(&MAIN_CONTENT_SELECTOR).next() {
1701 return main_section.html();
1702 }
1703
1704 // Fallback: try rustdoc_body_wrapper - using cached selector
1705 if let Some(wrapper) = document.select(&RUSTDOC_BODY_WRAPPER_SELECTOR).next() {
1706 return wrapper.html();
1707 }
1708
1709 // Last resort: return original HTML
1710 html.to_string()
1711}
1712
1713/// Extract the collapsed text of the page's primary `<h1>` heading.
1714///
1715/// rustdoc renders an item page heading as e.g. `<h1>Struct serde_json::Value</h1>`
1716/// (the item kind plus the fully-qualified path) and a crate landing page as
1717/// `<h1>Crate serde</h1>`. Returns the whitespace-collapsed text of the first
1718/// `<h1>` inside the main content area (falling back to any `<h1>`), or `None`
1719/// when the page has no heading.
1720#[must_use]
1721pub fn page_h1_text(html: &str) -> Option<String> {
1722 let document = Html::parse_document(html);
1723 let collapse = |element: scraper::ElementRef| -> String {
1724 clean_whitespace(&element.text().collect::<String>())
1725 };
1726 let h1 = document
1727 .select(&MAIN_CONTENT_SELECTOR)
1728 .next()
1729 .and_then(|main| main.select(&H1_SELECTOR).next().map(collapse))
1730 .or_else(|| document.select(&H1_SELECTOR).next().map(collapse));
1731 h1.filter(|s| !s.is_empty())
1732}
1733
/// Report whether `ident` occurs in `heading` as a complete identifier token.
///
/// `heading` is cut at every character other than an ASCII letter, ASCII
/// digit or `_`, so `Struct serde_json::Value` produces the tokens `Struct`,
/// `serde_json` and `Value` (plus empty tokens between adjacent separators).
/// The comparison is exact and case-sensitive, which rules out partial hits
/// such as `is` inside `this`.
fn heading_contains_identifier(heading: &str, ident: &str) -> bool {
    let is_separator = |c: char| c != '_' && !c.is_ascii_alphanumeric();
    for token in heading.split(is_separator) {
        if token == ident {
            return true;
        }
    }
    false
}
1745
1746/// Determine whether a resolved rustdoc page is a *fallback* rather than the
1747/// dedicated page for `item_path`.
1748///
1749/// [`resolve_item_html`](super::lookup_item) probes the dedicated item page
1750/// first, then falls back to the containing type's page (e.g. the `Value` enum
1751/// page for `Value::is_null`, since methods have no standalone page) and
1752/// finally to the crate overview. A dedicated item page's `<h1>` always
1753/// contains the requested leaf identifier (the final `::` segment); a
1754/// parent-type or crate fallback heading does not. Returns `true` when the
1755/// page does not document the requested item directly, so callers can surface
1756/// an honest note in every output format.
1757///
1758/// This is content-based (not resolution-time state) so it stays correct on
1759/// cache hits, where only the raw HTML is replayed. When the page has no
1760/// heading at all, returns `false` to avoid over-warning.
1761#[must_use]
1762pub fn is_item_fallback_page(html: &str, item_path: &str) -> bool {
1763 let leaf = item_path.rsplit("::").next().unwrap_or(item_path).trim();
1764 if leaf.is_empty() {
1765 return false;
1766 }
1767 match page_h1_text(html) {
1768 Some(h1) => !heading_contains_identifier(&h1, leaf),
1769 None => false,
1770 }
1771}
1772
1773/// Extract search results from HTML
1774#[must_use]
1775pub fn extract_search_results(html: &str, item_path: &str) -> String {
1776 let main_content = extract_main_content(html);
1777 let cleaned_html = clean_html(&main_content);
1778 // Flatten links nested inside inline <code> (e.g. re-exports) so they do
1779 // not become unrenderable markdown links inside a backtick span.
1780 let cleaned_html = flatten_links_in_inline_code(&cleaned_html);
1781 // Preserve rustdoc code-block language hints (html2md drops class info);
1782 // see inject_code_fence_language / restore_code_fence_language.
1783 let cleaned_html = inject_code_fence_language(&cleaned_html);
1784 // Restore whitespace html2md would otherwise drop before inline elements.
1785 let cleaned_html = normalize_inline_leading_whitespace(&cleaned_html);
1786 let markdown = html2md::parse_html(&cleaned_html);
1787 let cleaned_markdown = clean_markdown(&markdown);
1788
1789 if cleaned_markdown.trim().is_empty() {
1790 return format!("Documentation for '{item_path}' not found");
1791 }
1792
1793 // Detect a fallback page (the containing type's page or the crate
1794 // overview) by comparing the requested leaf identifier against the page's
1795 // `<h1>` heading; a dedicated item page's heading always names the item.
1796 // Operating on the raw `html` keeps this correct on cache replays.
1797 if is_item_fallback_page(html, item_path) {
1798 format!(
1799 "## Documentation: {item_path}\n\n_No dedicated documentation page was found for `{item_path}`; showing the closest available page (its containing type or the crate overview) instead. It may be a method, associated item, or trait method, or it may not exist._\n\n{cleaned_markdown}"
1800 )
1801 } else {
1802 format!("## Documentation: {item_path}\n\n{cleaned_markdown}")
1803 }
1804}
1805
1806/// Extract documentation from HTML as plain text.
1807///
1808/// Mirrors [`extract_documentation`] but produces plain text: it isolates the
1809/// main content area (dropping navigation, sidebars and footers), runs the
1810/// shared [`clean_html`] pass (which strips scripts, styles, navigation,
1811/// buttons, `<details>` toggles and UI labels such as "Copy item path" and
1812/// "Expand description"), then flattens to text. Finally, leftover section
1813/// anchor markers are removed since they carry no meaning once hyperlinks are
1814/// gone.
1815#[must_use]
1816pub fn extract_documentation_as_text(html: &str) -> String {
1817 let main_content = extract_main_content(html);
1818 let cleaned_html = clean_html(&main_content);
1819 // Use the raw extraction so `<pre>` content stays encoded through the
1820 // line-normalisation pass; decode it back to real whitespace at the end.
1821 let text = html_to_text_raw(&cleaned_html);
1822 // Drop standalone section-sign markers, then re-collapse each line so the
1823 // newline-delimited block structure from `html_to_text_raw` is preserved.
1824 let normalized = normalize_lines(&text.replace('\u{00a7}', " "));
1825 // Strip the dangling middot separator left on out-of-band rows (e.g. the
1826 // stability line `1.0.0 \u{00b7}`) once the trailing source link is gone.
1827 let normalized = TRAILING_MIDDOT_REGEX.replace_all(&normalized, "");
1828 strip_trailing_line_whitespace(&decode_pre(&normalized))
1829}
1830
1831/// Collapse whitespace within each block segment and join blocks with newlines.
1832///
1833/// [`BLOCK_SEP`] markers delimit block-level boundaries. Within each segment all
1834/// whitespace runs (spaces, tabs, and incidental source newlines) collapse to a
1835/// single space, which preserves inline runs split across elements. Empty
1836/// segments are dropped so adjacent markers do not emit blank lines.
1837#[inline]
1838fn collapse_block_whitespace(text: &str) -> String {
1839 text.split(BLOCK_SEP)
1840 .map(|seg| {
1841 // Within a block segment, table cells are separated by CELL_SEP.
1842 // Each cell carries a single *leading* CELL_SEP delimiter, so the
1843 // fragment before the first delimiter is empty and is dropped; the
1844 // remaining cells (including genuinely empty ones, e.g. a blank
1845 // row-label header) are kept so columns stay aligned. Segments
1846 // without a CELL_SEP (the common case) collapse unchanged.
1847 if seg.contains(CELL_SEP) {
1848 let mut cells: Vec<String> = seg
1849 .split(CELL_SEP)
1850 .map(|cell| cell.split_whitespace().collect::<Vec<_>>().join(" "))
1851 .collect();
1852 if cells.first().is_some_and(String::is_empty) {
1853 cells.remove(0);
1854 }
1855 // Drop pure visual-spacer rows (every cell empty) so they do
1856 // not render as content-free `| |` noise between data rows.
1857 // Rows with any content keep their (possibly empty) cells so
1858 // columns stay aligned.
1859 if cells.iter().all(String::is_empty) {
1860 String::new()
1861 } else {
1862 cells.join(" | ")
1863 }
1864 } else {
1865 seg.split_whitespace().collect::<Vec<_>>().join(" ")
1866 }
1867 })
1868 .filter(|seg| !seg.is_empty())
1869 .collect::<Vec<_>>()
1870 .join("\n")
1871}
1872
/// Collapse the whitespace inside every line and discard lines that end up
/// empty, keeping one output line per surviving input line so the
/// newline-delimited block structure of the text is preserved.
#[inline]
fn normalize_lines(text: &str) -> String {
    let mut kept: Vec<String> = Vec::new();
    for raw_line in text.lines() {
        let squeezed = raw_line.split_whitespace().collect::<Vec<_>>().join(" ");
        if !squeezed.is_empty() {
            kept.push(squeezed);
        }
    }
    kept.join("\n")
}
1883
/// Remove trailing whitespace from every line of finalised text output.
///
/// A signature rendered inside an encoded `<pre>` can end in a space (e.g.
/// `-> StepBy<Self> ` right before a wrapped `where` clause). That space is
/// held as a [`PRE_SPACE`] sentinel, so it survives [`normalize_lines`] and
/// only becomes a real space in [`decode_pre`]. Trimming the end of each line
/// here removes it without touching leading indentation.
///
/// Lines are delimited by `\n` only, and the number of lines is unchanged.
#[inline]
fn strip_trailing_line_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for (index, line) in text.split('\n').enumerate() {
        if index > 0 {
            out.push('\n');
        }
        out.push_str(line.trim_end());
    }
    out
}
1898
/// Collapse every run of whitespace in `text` to a single space and remove
/// leading and trailing whitespace.
#[inline]
fn clean_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
1903
1904/// Encode the significant whitespace of `<pre>` content as control sentinels
1905/// ([`PRE_SPACE`], [`PRE_NEWLINE`], [`PRE_TAB`]) so it survives the
1906/// whitespace-collapsing passes. Carriage returns are dropped.
1907fn encode_pre(text: &str) -> String {
1908 let mut out = String::with_capacity(text.len());
1909 for ch in text.chars() {
1910 match ch {
1911 ' ' => out.push(PRE_SPACE),
1912 '\n' => out.push(PRE_NEWLINE),
1913 '\t' => out.push(PRE_TAB),
1914 '\r' => {}
1915 other => out.push(other),
1916 }
1917 }
1918 out
1919}
1920
1921/// Reverse of [`encode_pre`]: restore the original whitespace characters from
1922/// the [`PRE_SPACE`]/[`PRE_NEWLINE`]/[`PRE_TAB`] sentinels.
1923fn decode_pre(text: &str) -> String {
1924 let mut out = String::with_capacity(text.len());
1925 for ch in text.chars() {
1926 match ch {
1927 PRE_SPACE => out.push(' '),
1928 PRE_NEWLINE => out.push('\n'),
1929 PRE_TAB => out.push('\t'),
1930 other => out.push(other),
1931 }
1932 }
1933 out
1934}
1935
1936#[cfg(test)]
1937mod tests {
1938 use super::*;
1939
1940 #[test]
1941 fn test_text_strips_old_rustdoc_src_and_toggle_anchors() {
1942 // Older rustdoc heading markup: a `javascript:` collapse-all toggle and a
1943 // single-quoted `srclink` source anchor. Neither must leak its bracketed
1944 // marker into the plain-text output.
1945 let html = concat!(
1946 "<html><body><section id=\"main-content\">",
1947 "<h1>Crate serde",
1948 "<a id=\"toggle-all-docs\" href=\"javascript:void(0)\" title=\"collapse all docs\">",
1949 "[<span class='inner'>TOGGLEMARK</span>]</a>",
1950 "<a class='srclink' href='../src/serde/lib.rs.html#9-267' title='goto source code'>[src]</a>",
1951 "</h1><p>Real doc.</p>",
1952 "</section></body></html>"
1953 );
1954 let text = extract_documentation_as_text(html);
1955 assert!(!text.contains("[src]"), "src link leaked: {text:?}");
1956 assert!(!text.contains("TOGGLEMARK"), "toggle leaked: {text:?}");
1957 assert!(text.contains("Crate serde"), "heading dropped: {text:?}");
1958 assert!(text.contains("Real doc."), "content dropped: {text:?}");
1959 }
1960
1961 #[test]
1962 fn test_markdown_strips_trailing_heading_hashes() {
1963 let html = concat!(
1964 "<html><body><section id=\"main-content\">",
1965 "<h3>Examples</h3>",
1966 "<h4>pub fn get(&self)</h4>",
1967 "<p>Body text.</p>",
1968 "</section></body></html>"
1969 );
1970 let md = extract_documentation(html);
1971 assert!(md.contains("### Examples"), "h3 missing: {md:?}");
1972 assert!(!md.contains("Examples ###"), "trailing hashes left: {md:?}");
1973 assert!(md.contains("#### pub fn get(&self)"), "h4 missing: {md:?}");
1974 assert!(!md.contains(") ####"), "trailing hashes left: {md:?}");
1975 }
1976
1977 #[test]
1978 fn test_markdown_restores_space_before_inline_link() {
1979 // html2md drops newline whitespace before inline <a>, gluing the link
1980 // onto the preceding word. The HashMap docs trigger this with
1981 // "using the\n<a>...<code>default</code>...".
1982 let html = concat!(
1983 "<html><body><section id=\"main-content\">",
1984 "<p>replaced on a per-<code>HashMap</code> basis using the\n",
1985 "<a href=\"trait.Default.html#tymethod.default\"><code>default</code></a>, ",
1986 "<a href=\"struct.HashMap.html#method.with_hasher\"><code>with_hasher</code></a> methods.</p>",
1987 "</section></body></html>"
1988 );
1989 let md = extract_documentation(html);
1990 // The space before the (downgraded) link is restored.
1991 assert!(
1992 md.contains("using the `default`"),
1993 "missing space before inline link: {md:?}"
1994 );
1995 // The deliberately glued `per-<code>HashMap</code>` (no source
1996 // whitespace) stays glued.
1997 assert!(
1998 md.contains("per-`HashMap`"),
1999 "spurious space inserted into hyphenated code: {md:?}"
2000 );
2001 }
2002
2003 #[test]
2004 fn test_where_clause_detached_from_declaration() {
2005 // rustdoc's <div class="where"> has no literal line breaks, so the
2006 // declaration renders glued ("Global>where", "Allocator,{").
2007 let html = concat!(
2008 "<html><body><section id=\"main-content\">",
2009 "<pre class=\"rust item-decl\"><code>pub struct Vec<T, A = ",
2010 "<a class=\"struct\" href=\"struct.Global.html\">Global</a>>",
2011 "<div class=\"where\">where\n A: <a class=\"trait\" href=\"trait.Allocator.html\">Allocator</a>,</div>",
2012 "{ <span class=\"comment\">/* private fields */</span> }</code></pre>",
2013 "<h4 class=\"code-header\">pub fn retain<F>(&mut self, f: F)",
2014 "<div class=\"where\">where\n F: <a class=\"trait\" href=\"trait.FnMut.html\">FnMut</a>,</div></h4>",
2015 "</section></body></html>"
2016 );
2017 let md = extract_documentation(html);
2018 // Inside the code block the clause breaks onto its own lines.
2019 assert!(
2020 md.contains("Global>\nwhere") && md.contains("Allocator,\n{"),
2021 "where clause not broken in code block: {md:?}"
2022 );
2023 assert!(!md.contains("Global>where"), "glued where survived: {md:?}");
2024 // In the single-line method header the clause is space-separated.
2025 assert!(
2026 md.contains("f: F) where F:"),
2027 "where not separated in header: {md:?}"
2028 );
2029
2030 // Plain-text format gets the same multi-line declaration.
2031 let text = extract_documentation_as_text(html);
2032 assert!(
2033 text.contains("Global>\nwhere"),
2034 "where clause not broken in text: {text:?}"
2035 );
2036 assert!(
2037 !text.contains("Global>where"),
2038 "glued where in text: {text:?}"
2039 );
2040 }
2041
2042 #[test]
2043 fn test_text_signature_has_no_trailing_whitespace_before_where() {
2044 // A signature inside a `<pre>` ends with a space immediately before the
2045 // `where` div; that space is held as a PRE_SPACE sentinel and survives
2046 // line normalisation, so `decode_pre` would otherwise restore a dangling
2047 // trailing space on the signature line in text output.
2048 let html = concat!(
2049 "<html><body><section id=\"main-content\">",
2050 "<pre class=\"rust item-decl\"><code>fn step_by(self, step: usize) -> ",
2051 "StepBy<Self> ",
2052 "<div class=\"where\">where\n Self: Sized,</div>",
2053 "</code></pre>",
2054 "</section></body></html>"
2055 );
2056 let text = extract_documentation_as_text(html);
2057 assert!(
2058 !text.lines().any(|l| l.ends_with(' ') || l.ends_with('\t')),
2059 "text output has a line with trailing whitespace: {text:?}"
2060 );
2061 // The signature and the wrapped clause are still present and split.
2062 assert!(
2063 text.contains("-> StepBy<Self>") && text.contains("where"),
2064 "signature/where content lost: {text:?}"
2065 );
2066 }
2067
2068 #[test]
2069 fn test_ui_glyph_anchors_stripped() {
2070 // rustdoc decorates impl/method headers (inside <summary>) with a
2071 // section-sign anchor `<a class="anchor">\u{00a7}</a>` and a
2072 // notable-trait marker `<a class="tooltip">\u{24d8}</a>`. Both are pure
2073 // UI affordances and must not leak into markdown or text output.
2074 let html = concat!(
2075 "<html><body><section id=\"main-content\">",
2076 "<details class=\"toggle implementors-toggle\" open><summary>",
2077 "<section id=\"impl-Clone\" class=\"impl\">",
2078 "<a href=\"#impl-Clone\" class=\"anchor\">\u{00a7}</a>",
2079 "<h3 class=\"code-header\">impl Clone for Foo</h3></section></summary></details>",
2080 "<details class=\"toggle method-toggle\" open><summary>",
2081 "<section id=\"method.keys\" class=\"method\">",
2082 "<h4 class=\"code-header\">fn <a href=\"#method.keys\" class=\"fn\">keys</a>(&self) -> ",
2083 "<a class=\"struct\" href=\"struct.Keys.html\">Keys</a> ",
2084 "<a href=\"#\" class=\"tooltip\" data-notable-ty=\"Keys\">\u{24d8}</a></h4>",
2085 "</section></summary></details>",
2086 "</section></body></html>"
2087 );
2088 let md = extract_documentation(html);
2089 assert!(
2090 !md.contains('\u{00a7}'),
2091 "section-sign anchor leaked into markdown: {md:?}"
2092 );
2093 assert!(
2094 !md.contains('\u{24d8}'),
2095 "notable-trait marker leaked into markdown: {md:?}"
2096 );
2097 assert!(
2098 md.contains("impl Clone for Foo"),
2099 "impl header lost: {md:?}"
2100 );
2101 let text = extract_documentation_as_text(html);
2102 assert!(
2103 !text.contains('\u{00a7}') && !text.contains('\u{24d8}'),
2104 "UI glyph leaked into text: {text:?}"
2105 );
2106 }
2107
2108 #[test]
2109 fn test_scrape_help_question_mark_anchor_stripped() {
2110 // rustdoc adds a `<a class="scrape-help" href="...">?</a>` help link
2111 // beside the "Examples found in repository" heading of a scraped
2112 // example. It is pure UI chrome and its `?` glyph must not leak into
2113 // the rendered output (the heading text itself is preserved).
2114 let html = concat!(
2115 "<html><body><section id=\"main-content\">",
2116 "<div class=\"docblock scraped-example-list\"><span></span>",
2117 "<h5 id=\"scraped-examples\">",
2118 "<a href=\"#scraped-examples\">Examples found in repository</a>",
2119 "<a class=\"scrape-help\" href=\"../scrape-examples-help.html\">?</a>",
2120 "</h5></div>",
2121 "</section></body></html>"
2122 );
2123 for out in [
2124 extract_documentation(html),
2125 extract_documentation_as_text(html),
2126 extract_documentation_html(html),
2127 ] {
2128 assert!(
2129 !out.contains("?</a>") && !out.contains("scrape-help"),
2130 "scrape-help link leaked: {out:?}"
2131 );
2132 assert!(
2133 out.contains("Examples found in repository"),
2134 "scraped-example heading text lost: {out:?}"
2135 );
2136 }
2137 // The leaked `?` glyph must not survive as a trailing token in markdown.
2138 let md = extract_documentation(html);
2139 assert!(
2140 !md.contains("repository)?") && !md.contains("repository ?"),
2141 "stray scrape-help `?` leaked into markdown: {md:?}"
2142 );
2143 }
2144
2145 #[test]
2146 fn test_multiline_signature_collapsed_to_single_line() {
2147 // rustdoc wraps long signatures across lines inside the (non-<pre>)
2148 // <h4 class="code-header"> using literal newlines + indentation. That
2149 // otherwise yields a broken two-line markdown heading and stray spaces
2150 // in text (`( self: Arc<Self>, )`).
2151 let html = concat!(
2152 "<html><body><section id=\"main-content\">",
2153 "<section id=\"method.try_lock_owned\" class=\"method\">",
2154 "<h4 class=\"code-header\">pub fn <a href=\"#m\" class=\"fn\">try_lock_owned</a>(\n",
2155 " self: <a class=\"struct\" href=\"struct.Arc.html\">Arc</a><Self>,\n",
2156 ") -> <a class=\"enum\" href=\"enum.Result.html\">Result</a><T></h4></section>",
2157 "</section></body></html>"
2158 );
2159 let md = extract_documentation(html);
2160 assert!(
2161 md.contains("(self: Arc<Self>) -> Result<T>"),
2162 "multi-line signature not collapsed cleanly (markdown): {md:?}"
2163 );
2164 assert!(
2165 !md.contains("( self") && !md.contains(", )") && !md.contains(",\n)"),
2166 "signature spacing artifacts survived: {md:?}"
2167 );
2168 // The collapsed heading stays on a single line.
2169 assert!(
2170 !md.contains("Result<T>\n)") && !md.contains(",\n)"),
2171 "heading split across lines: {md:?}"
2172 );
2173 let text = extract_documentation_as_text(html);
2174 assert!(
2175 text.contains("try_lock_owned(self: Arc<Self>) -> Result<T>"),
2176 "multi-line signature not collapsed cleanly (text): {text:?}"
2177 );
2178 }
2179
2180 #[test]
2181 fn test_rustdoc_ui_web_components_stripped() {
2182 // rustdoc emits <rustdoc-toolbar> (inside #main-content, rendered empty)
2183 // and <rustdoc-topbar> (a duplicate breadcrumb heading). Neither should
2184 // leak into the html output.
2185 let html = concat!(
2186 "<html><body><section id=\"main-content\">",
2187 "<div class=\"main-heading\"><h1>Struct Foo</h1>",
2188 "<rustdoc-toolbar></rustdoc-toolbar></div>",
2189 "<rustdoc-topbar><h2><a href=\"#\">Foo</a></h2></rustdoc-topbar>",
2190 "<p>Body.</p>",
2191 "</section></body></html>"
2192 );
2193 let out = extract_documentation_html(html);
2194 assert!(
2195 !out.contains("rustdoc-toolbar") && !out.contains("rustdoc-topbar"),
2196 "rustdoc UI web-component leaked into html: {out:?}"
2197 );
2198 assert!(out.contains("Body."), "body content lost: {out:?}");
2199 }
2200
2201 #[test]
2202 fn test_rustdoc_breadcrumbs_stripped() {
2203 // rustdoc renders a navigation breadcrumb above the item title. Its
2204 // links are page-relative, so without removal they leave a dangling
2205 // bare line (`std::vec`, or a lone `std` on macro pages) that merely
2206 // duplicates our own title. The whole element must be stripped in all
2207 // three formats.
2208 let html = concat!(
2209 "<html><body><section id=\"main-content\">",
2210 "<div class=\"main-heading\">",
2211 "<div class=\"rustdoc-breadcrumbs\"><a href=\"../index.html\">std</a>",
2212 "::<wbr><a href=\"index.html\">vec</a></div>",
2213 "<h1>Struct Vec</h1></div>",
2214 "<p>A contiguous growable array type.</p>",
2215 "</section></body></html>"
2216 );
2217 for out in [
2218 extract_documentation(html),
2219 extract_documentation_as_text(html),
2220 extract_documentation_html(html),
2221 ] {
2222 assert!(
2223 !out.contains("rustdoc-breadcrumbs"),
2224 "breadcrumb element leaked: {out:?}"
2225 );
2226 assert!(
2227 !out.contains("std::vec"),
2228 "dangling breadcrumb line leaked: {out:?}"
2229 );
2230 assert!(
2231 out.contains("Vec") && out.contains("contiguous growable"),
2232 "real content lost: {out:?}"
2233 );
2234 }
2235 }
2236
2237 #[test]
2238 fn test_prose_admonition_pre_becomes_blockquote_not_code() {
2239 // rustdoc renders "Warning"/"Note" callouts as a prose-styled <pre>
2240 // (white-space:normal); it must become a blockquote, not a bare code
2241 // fence, so the prose (and its inline code/links) renders correctly.
2242 let html = concat!(
2243 "<section id=\"main-content\">",
2244 "<p>Intro.</p>",
2245 "<div class=\"example-wrap\"><pre class=\"compile_fail\" ",
2246 "style=\"white-space:normal;font:inherit;\">",
2247 "<p><strong>Warning</strong>: Do not hold <code>Span::enter</code> ",
2248 "across an await point.</p></pre></div>",
2249 "<p>Outro.</p>",
2250 "</section>"
2251 );
2252 let md = extract_documentation(html);
2253 assert!(
2254 !md.contains("```"),
2255 "prose admonition rendered as code fence in markdown: {md:?}"
2256 );
2257 assert!(
2258 md.contains("> ") && md.contains("Warning"),
2259 "admonition not rendered as blockquote: {md:?}"
2260 );
2261 assert!(
2262 md.contains("`Span::enter`"),
2263 "inline code lost in admonition: {md:?}"
2264 );
2265 let html_out = extract_documentation_html(html);
2266 assert!(
2267 !html_out.contains("white-space:normal"),
2268 "prose pre survived in html output: {html_out:?}"
2269 );
2270 // A genuine code example (default white-space) must stay a code block.
2271 let code_html = concat!(
2272 "<section id=\"main-content\">",
2273 "<pre class=\"rust rust-example-rendered\"><code>let x = 1;</code></pre>",
2274 "</section>"
2275 );
2276 assert!(
2277 extract_documentation(code_html).contains("```"),
2278 "genuine code example lost its fence"
2279 );
2280 }
2281
2282 #[test]
2283 fn test_unsafe_function_marker_rendered_as_annotation() {
2284 // rustdoc marks unsafe fns in module lists with
2285 // `<sup title="unsafe function">WARN</sup>`; it must become a readable
2286 // ` (unsafe)` annotation, not a `^(...)` superscript glued to the name.
2287 let html = concat!(
2288 "<section id=\"main-content\"><dl class=\"item-table\">",
2289 "<dt><a class=\"fn\" href=\"fn.copy.html\">copy</a>",
2290 "<sup title=\"unsafe function\">\u{26a0}</sup></dt>",
2291 "<dd>Copies bytes.</dd></dl></section>"
2292 );
2293 for out in [
2294 extract_documentation(html),
2295 extract_documentation_as_text(html),
2296 extract_documentation_html(html),
2297 ] {
2298 assert!(
2299 !out.contains('\u{26a0}'),
2300 "unsafe marker glyph leaked: {out:?}"
2301 );
2302 assert!(
2303 !out.contains("^("),
2304 "unsafe marker rendered as superscript: {out:?}"
2305 );
2306 assert!(
2307 out.contains("(unsafe)"),
2308 "unsafe annotation missing: {out:?}"
2309 );
2310 }
2311 }
2312
2313 #[test]
2314 fn test_hideme_show_methods_toggle_stripped() {
2315 // rustdoc puts a "Show N methods" collapse toggle
2316 // (`<summary class="hideme">`) *inside* the trait declaration <pre>;
2317 // its label must not leak into the rendered signature in any format.
2318 // The surrounding details content (the method list) must survive.
2319 let html = concat!(
2320 "<html><body><section id=\"main-content\">",
2321 "<pre class=\"rust item-decl\"><code>pub trait Iterator {\n",
2322 " type Item;\n",
2323 "<details class=\"toggle type-contents-toggle\">",
2324 "<summary class=\"hideme\"><span>Show 76 methods</span></summary>",
2325 " // Required method\n",
2326 " fn next(&mut self) -> Option<Self::Item>;\n",
2327 "</details>}</code></pre>",
2328 "</section></body></html>"
2329 );
2330 for out in [
2331 extract_documentation(html),
2332 extract_documentation_as_text(html),
2333 extract_documentation_html(html),
2334 ] {
2335 assert!(
2336 !out.contains("Show 76 methods"),
2337 "collapse toggle label leaked: {out:?}"
2338 );
2339 assert!(
2340 out.contains("// Required method"),
2341 "details content lost: {out:?}"
2342 );
2343 }
2344 }
2345
2346 #[test]
2347 fn test_impl_block_docblock_not_glued_to_declaration() {
2348 // rustdoc nests an impl block's own documentation
2349 // (`<div class="docblock">`) inside the `<summary>` that holds the
2350 // `impl ...` declaration. When the summary is flattened to text the
2351 // docblock heading otherwise glues onto the declaration
2352 // (e.g. `impl ArgBasic API`). It must be relocated so the declaration
2353 // stays clean and the docblock renders as its own content.
2354 let html = concat!(
2355 "<html><body><section id=\"main-content\">",
2356 "<div id=\"implementations-list\">",
2357 "<details class=\"toggle implementors-toggle\" open><summary>",
2358 "<section id=\"impl-Arg\" class=\"impl\">",
2359 "<h3 class=\"code-header\">impl Arg</h3>",
2360 "<div class=\"docblock\"><h4 id=\"basic-api\">Basic API</h4></div>",
2361 "</section></summary>",
2362 "<div class=\"impl-items\"><details class=\"toggle method-toggle\" open>",
2363 "<summary><section id=\"method.new\" class=\"method\">",
2364 "<h4 class=\"code-header\">pub fn new() -> Arg</h4></section></summary>",
2365 "<div class=\"docblock\"><p>Create a new Arg.</p></div></details></div>",
2366 "</details></div>",
2367 "</section></body></html>"
2368 );
2369 for out in [
2370 extract_documentation(html),
2371 extract_documentation_as_text(html),
2372 extract_documentation_html(html),
2373 ] {
2374 assert!(
2375 !out.contains("ArgBasic API"),
2376 "impl declaration glued to docblock heading: {out:?}"
2377 );
2378 assert!(
2379 out.contains("Basic API"),
2380 "impl-block docblock heading lost: {out:?}"
2381 );
2382 }
2383 }
2384
2385 #[test]
2386 fn test_undocumented_assoc_item_not_rendered_as_heading() {
2387 // rustdoc wraps a *documented* associated item in
2388 // `<details><summary>...</summary><docblock></details>` (the signature
2389 // is flattened to plain text), but an *undocumented* sibling is a bare
2390 // `<section>` whose `<h4 class="code-header">` would otherwise survive
2391 // as a spurious `####` heading. Both must render as plain text so the
2392 // list is consistent.
2393 let html = concat!(
2394 "<html><body><section id=\"main-content\">",
2395 "<details class=\"toggle\" open><summary>",
2396 "<section id=\"associatedconstant.DOC\" class=\"associatedconstant\">",
2397 "<h4 class=\"code-header\">pub const DOC: Self</h4></section></summary>",
2398 "<div class=\"docblock\"><p>Documented constant.</p></div></details>",
2399 "<section id=\"associatedconstant.BARE\" class=\"associatedconstant\">",
2400 "<h4 class=\"code-header\">pub const BARE: Self</h4></section>",
2401 "</section></body></html>"
2402 );
2403 let md = extract_documentation(html);
2404 assert!(
2405 !md.contains("#### pub const BARE"),
2406 "undocumented assoc const rendered as a heading: {md:?}"
2407 );
2408 assert!(
2409 md.contains("pub const BARE: Self"),
2410 "undocumented assoc const signature lost: {md:?}"
2411 );
2412 assert!(
2413 md.contains("pub const DOC: Self") && md.contains("Documented constant."),
2414 "documented assoc const rendering changed: {md:?}"
2415 );
2416 }
2417
2418 #[test]
2419 fn test_multiline_signature_in_pre_block_preserved() {
2420 // A <pre> code example that legitimately wraps a call across lines must
2421 // not be touched by the code-header collapse.
2422 let html = concat!(
2423 "<html><body><section id=\"main-content\">",
2424 "<pre class=\"rust\"><code>foo(\n a,\n b,\n);</code></pre>",
2425 "</section></body></html>"
2426 );
2427 let text = extract_documentation_as_text(html);
2428 assert!(
2429 text.contains("foo(") && text.contains("a,") && text.contains("b,"),
2430 "pre-block example was altered: {text:?}"
2431 );
2432 }
2433
2434 #[test]
2435 fn test_emoji_badge_separated_from_text() {
2436 // rustdoc renders the nightly-API marker as
2437 // `<span class="emoji">\u{1f52c}</span><span>This is ...</span>` with no
2438 // separating whitespace, so html2md glues the flask onto "This".
2439 let html = concat!(
2440 "<html><body><section id=\"main-content\">",
2441 "<div class=\"stab unstable\">",
2442 "<span class=\"emoji\">\u{1f52c}</span>",
2443 "<span>This is a nightly-only experimental API.</span></div>",
2444 "</section></body></html>"
2445 );
2446 let md = extract_documentation(html);
2447 assert!(
2448 md.contains("\u{1f52c} This is a nightly-only"),
2449 "emoji not separated from text in markdown: {md:?}"
2450 );
2451 assert!(
2452 !md.contains("\u{1f52c}This"),
2453 "emoji still glued in markdown: {md:?}"
2454 );
2455 }
2456
2457 #[test]
2458 fn test_playground_run_button_stripped() {
2459 // rustdoc adds a "Run code" button to each example as an empty-text
2460 // anchor wrapping a long playground URL
2461 // (`<a class="test-arrow" href="https://play.rust-lang.org/...">`).
2462 // It must not leak as an empty-text markdown link.
2463 let html = concat!(
2464 "<html><body><section id=\"main-content\">",
2465 "<div class=\"example-wrap\"><pre class=\"rust\"><code>let x = 1;</code></pre>",
2466 "<a class=\"test-arrow\" target=\"_blank\" title=\"Run code\" ",
2467 "href=\"https://play.rust-lang.org/?code=fn+main()+%7B%7D\"></a></div>",
2468 "</section></body></html>"
2469 );
2470 let md = extract_documentation(html);
2471 assert!(
2472 !md.contains("play.rust-lang.org"),
2473 "playground run button leaked into markdown: {md:?}"
2474 );
2475 assert!(!md.contains("[]("), "empty-text link survived: {md:?}");
2476 assert!(md.contains("let x = 1;"), "example code lost: {md:?}");
2477 }
2478
2479 #[test]
2480 fn test_orphan_since_middot_collapsed() {
2481 // rustdoc puts `<span class="since">1.0.0</span> \u{00b7} <src>` in a
2482 // method's right-side metadata. Stripping the source link leaves a
2483 // dangling middot that, once the <summary> is flattened, glues onto the
2484 // signature (`1.0.0 \u{00b7} fn ...`). It should collapse to a space.
2485 let html = concat!(
2486 "<html><body><section id=\"main-content\">",
2487 "<details class=\"toggle method-toggle\" open><summary>",
2488 "<section id=\"method.next\" class=\"method\">",
2489 "<span class=\"rightside\"><span class=\"since\" title=\"Stable since Rust version 1.0.0\">1.0.0</span>",
2490 " \u{00b7} <a class=\"src\" href=\"../../src/x.html#1\">Source</a></span>",
2491 "<h4 class=\"code-header\">fn <a href=\"#method.next\" class=\"fn\">next</a>(&mut self)</h4>",
2492 "</section></summary></details>",
2493 "</section></body></html>"
2494 );
2495 let md = extract_documentation(html);
2496 assert!(
2497 md.contains("1.0.0 fn next"),
2498 "version not cleanly separated from signature: {md:?}"
2499 );
2500 assert!(
2501 !md.contains("1.0.0 \u{00b7}") && !md.contains("\u{00b7} fn next"),
2502 "orphan middot survived: {md:?}"
2503 );
2504 }
2505
2506 #[test]
2507 fn test_since_badge_separated_from_signature() {
2508 // On FFI structs (e.g. libc) the provided trait methods carry a
2509 // `<span class="since">1.0.0</span>` badge directly abutting the
2510 // method code-header with no middot or source link in between. When the
2511 // <summary> is flattened the badge fuses onto the signature
2512 // (`1.0.0fn clone_from`). It must be separated by a space.
2513 let html = concat!(
2514 "<html><body><section id=\"main-content\">",
2515 "<details class=\"toggle method-toggle\" open><summary>",
2516 "<section id=\"method.clone_from\" class=\"method trait-impl\">",
2517 "<span class=\"rightside\"><span class=\"since\" title=\"Stable since Rust version 1.0.0\">1.0.0</span></span>",
2518 "<a href=\"#method.clone_from\" class=\"anchor\">\u{00a7}</a>",
2519 "<h4 class=\"code-header\">fn <a href=\"#method.clone_from\" class=\"fn\">clone_from</a>(&mut self, source: &Self)</h4>",
2520 "</section></summary></details>",
2521 "</section></body></html>"
2522 );
2523 let md = extract_documentation(html);
2524 let text = extract_documentation_as_text(html);
2525 assert!(
2526 md.contains("1.0.0 fn clone_from"),
2527 "since badge glued onto signature (markdown): {md:?}"
2528 );
2529 assert!(
2530 !md.contains("1.0.0fn"),
2531 "since badge still fused in markdown: {md:?}"
2532 );
2533 assert!(
2534 !text.contains("1.0.0fn"),
2535 "since badge still fused in text: {text:?}"
2536 );
2537 }
2538
2539 #[test]
2540 fn test_generics_survive_summary_method_header() {
2541 // rustdoc wraps method-detail signatures in <details><summary>. The
2542 // summary's decoded text turns `Option<usize>` into literal
2543 // `Option<usize>`; without re-escaping, the second parse drops the
2544 // `<usize>`/`<Self::Item>` as if they were unknown tags.
2545 let html = concat!(
2546 "<html><body><section id=\"main-content\">",
2547 "<details class=\"toggle method-toggle\" open><summary>",
2548 "<section id=\"method.size_hint\" class=\"method\">",
2549 "<span class=\"rightside\"><span class=\"since\" title=\"Stable since Rust version 1.0.0\">1.0.0</span>",
2550 " \u{00b7} <a class=\"src\" href=\"../../src/x.html#1\">Source</a></span>",
2551 "<h4 class=\"code-header\">fn <a href=\"#method.size_hint\" class=\"fn\">size_hint</a>",
2552 "(&self) -> (<a class=\"primitive\" href=\"../primitive.usize.html\">usize</a>, ",
2553 "<a class=\"enum\" href=\"../option/enum.Option.html\">Option</a><",
2554 "<a class=\"primitive\" href=\"../primitive.usize.html\">usize</a>>)</h4>",
2555 "</section></summary></details>",
2556 "</section></body></html>"
2557 );
2558 let md = extract_documentation(html);
2559 assert!(
2560 md.contains("Option<usize>"),
2561 "generic args dropped from summary method header (markdown): {md:?}"
2562 );
2563 let text = extract_documentation_as_text(html);
2564 assert!(
2565 text.contains("Option<usize>"),
2566 "generic args dropped from summary method header (text): {text:?}"
2567 );
2568 }
2569
2570 #[test]
2571 fn test_escape_html_text_reescapes_special_chars() {
2572 assert_eq!(escape_html_text("Vec<u8>"), "Vec<u8>");
2573 assert_eq!(escape_html_text("a & b"), "a & b");
2574 assert_eq!(escape_html_text("Option<&T>"), "Option<&T>");
2575 }
2576
2577 #[test]
2578 fn test_portability_badge_separated_from_item_name() {
2579 // rustdoc glues feature pills onto item names ("fs`fs`"); they should
2580 // render as a clearly separated parenthetical from the badge title.
2581 let html = concat!(
2582 "<html><body><section id=\"main-content\">",
2583 "<dl class=\"item-table\">",
2584 "<dt><a class=\"mod\" href=\"fs/index.html\">fs</a>",
2585 "<span class=\"stab portability\" title=\"Available on crate feature `fs` only\">",
2586 "<code>fs</code></span></dt><dd>Async files.</dd>",
2587 "<dt><a class=\"mod\" href=\"io/index.html\">io</a></dt><dd>Async IO.</dd>",
2588 "</dl></section></body></html>"
2589 );
2590 let md = extract_documentation(html);
2591 assert!(
2592 md.contains("fs (Available on crate feature `fs` only)"),
2593 "feature badge not separated/labelled: {md:?}"
2594 );
2595 // The glued form must be gone.
2596 assert!(!md.contains("fs`fs`"), "glued badge survived: {md:?}");
2597 // Items without a badge are untouched (no stray parens).
2598 assert!(
2599 md.contains("io — Async IO.") || md.contains("io —"),
2600 "io item altered: {md:?}"
2601 );
2602
2603 // Same separation in the plain-text format. The feature name renders
2604 // as a real code element, so plain text shows it undecorated.
2605 let text = extract_documentation_as_text(html);
2606 assert!(
2607 text.contains("fs (Available on crate feature fs only)"),
2608 "text badge not separated: {text:?}"
2609 );
2610 }
2611
2612 #[test]
2613 fn test_code_attribute_on_own_line() {
2614 // rustdoc puts declaration attributes in block-level
2615 // `<div class="code-attribute">` elements inside the item-decl <pre>.
2616 // The attribute must keep its own line, not glue onto the declaration
2617 // (regression: `#[repr(i8)]pub enum Ordering`).
2618 let html = concat!(
2619 "<html><body><section id=\"main-content\">",
2620 "<pre class=\"rust item-decl\"><code>",
2621 "<div class=\"code-attribute\">#[repr(i8)]</div>",
2622 "<div class=\"code-attribute\">#[non_exhaustive]</div>",
2623 "pub enum Ordering {\n Less = -1,\n}</code></pre>",
2624 "</section></body></html>"
2625 );
2626 let md = extract_documentation(html);
2627 assert!(
2628 md.contains("#[repr(i8)]\npub enum Ordering")
2629 || md.contains("#[non_exhaustive]\npub enum Ordering"),
2630 "attribute glued onto declaration in markdown: {md:?}"
2631 );
2632 assert!(
2633 !md.contains("]pub enum"),
2634 "attribute still glued in markdown: {md:?}"
2635 );
2636
2637 let text = extract_documentation_as_text(html);
2638 assert!(
2639 !text.contains("]pub enum"),
2640 "attribute still glued in text: {text:?}"
2641 );
2642 // Both attributes are present, each on its own line.
2643 assert!(
2644 text.contains("#[repr(i8)]") && text.contains("#[non_exhaustive]"),
2645 "an attribute was dropped: {text:?}"
2646 );
2647 }
2648
2649 #[test]
2650 fn test_reexport_link_flattened_in_inline_code() {
2651 // rustdoc renders re-exports as `<code>pub use <a ...>name</a>;</code>`.
2652 // In markdown an anchor inside a backtick span cannot render, so the
2653 // link wrapper must be flattened to its text (`pub use name;`). The
2654 // html output format must keep the anchor.
2655 let html = concat!(
2656 "<html><body><section id=\"main-content\">",
2657 "<h2 id=\"reexports\">Re-exports</h2>",
2658 "<dl class=\"item-table reexports\"><dt id=\"reexport.rand_core\">",
2659 "<code>pub use <a class=\"mod\" ",
2660 "href=\"https://docs.rs/rand_core/0.10.0/rand_core/index.html\" ",
2661 "title=\"mod rand_core\">rand_core</a>;</code></dt></dl>",
2662 "</section></body></html>"
2663 );
2664 let md = extract_documentation(html);
2665 assert!(
2666 md.contains("`pub use rand_core;`"),
2667 "re-export code span malformed: {md:?}"
2668 );
2669 assert!(
2670 !md.contains("[rand_core]"),
2671 "unrenderable link survived inside code span: {md:?}"
2672 );
2673
2674 // The html output format keeps the anchor (browsers render it fine).
2675 let html_out = extract_documentation_html(html);
2676 assert!(
2677 html_out.contains("href=\"https://docs.rs/rand_core/0.10.0/rand_core/index.html\""),
2678 "html output dropped the re-export link: {html_out:?}"
2679 );
2680 }
2681
2682 #[test]
2683 fn test_code_fence_language_preserved() {
2684 // rustdoc annotates code blocks with a class (`rust rust-example-rendered`
2685 // for Rust examples, `language-<name>` for other fenced languages).
2686 // html2md drops this, emitting a bare ``` fence and losing the language
2687 // hint. It must be preserved in markdown only; the text and html
2688 // formats must be unaffected and free of the internal sentinel char.
2689 let html = concat!(
2690 "<div class=\"docblock\">",
2691 "<pre class=\"rust rust-example-rendered\"><code>let x = 1;</code></pre>",
2692 "<pre class=\"language-toml\"><code>v = 1</code></pre>",
2693 "<pre><code>plain</code></pre>",
2694 "</div>"
2695 );
2696 let md = extract_documentation(html);
2697 assert!(md.contains("```rust"), "rust fence hint missing: {md:?}");
2698 assert!(md.contains("```toml"), "toml fence hint missing: {md:?}");
2699 assert!(
2700 !md.contains('\u{2}'),
2701 "internal sentinel leaked into markdown: {md:?}"
2702 );
2703
2704 // Text and html formats must not gain fence hints or the sentinel.
2705 let text = extract_documentation_as_text(html);
2706 assert!(
2707 !text.contains('\u{2}'),
2708 "sentinel leaked into text: {text:?}"
2709 );
2710 assert!(
2711 !text.contains("```rust"),
2712 "text format gained a fence hint: {text:?}"
2713 );
2714
2715 let html_out = extract_documentation_html(html);
2716 assert!(
2717 !html_out.contains('\u{2}'),
2718 "sentinel leaked into html: {html_out:?}"
2719 );
2720 }
2721
#[test]
fn test_portability_badge_feature_with_underscore_not_escaped() {
    // The portability badge embeds the feature name in literal backticks in
    // its `title` attribute. Markdown must show it as a real code span with
    // no `\_` escape; plain text must show it without any decoration.
    // Regression guard: `thread\_rng` used to leak.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<div class=\"item-name\">",
        "<a class=\"fn\" href=\"fn.fill.html\">fill</a>",
        "<span class=\"stab portability\" ",
        "title=\"Available on crate feature `thread_rng` only\">",
        "<code>thread_rng</code></span></div>",
        "<div class=\"desc\">Fill any type.</div>",
        "</section></body></html>"
    );

    // Markdown rendering.
    let md = extract_documentation(page);
    let expected_md = "Available on crate feature `thread_rng` only";
    assert!(
        md.contains(expected_md),
        "feature code span malformed: {md:?}"
    );
    let escaped_name = r"thread\_rng";
    assert!(
        !md.contains(escaped_name),
        "stray underscore escape in feature name: {md:?}"
    );

    // Plain-text rendering.
    let text = extract_documentation_as_text(page);
    let expected_text = "Available on crate feature thread_rng only";
    assert!(
        text.contains(expected_text),
        "text feature name malformed: {text:?}"
    );
}
2754
#[test]
fn test_stab_badge_separated_from_item_name() {
    // In module index tables rustdoc attaches the stability pill straight to
    // the item name (e.g. `TryReserveErrorKindExperimental`). The marker is
    // expected to come out as a clearly separated parenthetical.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<dl class=\"item-table\">",
        "<dt><a class=\"enum\" href=\"enum.TryReserveErrorKind.html\">",
        "TryReserve<wbr>Error<wbr>Kind</a><wbr>",
        "<span class=\"stab unstable\" title=\"\">Experimental</span></dt>",
        "<dd>Details of the allocation.</dd>",
        "<dt><a class=\"enum\" href=\"enum.Plain.html\">Plain</a></dt><dd>Stable item.</dd>",
        "</dl></section></body></html>"
    );
    let separated = "TryReserveErrorKind (Experimental)";

    let md = extract_documentation(page);
    assert!(
        md.contains(separated),
        "stab badge not separated/labelled: {md:?}"
    );
    // The name and badge must no longer be fused.
    let glued = "KindExperimental";
    assert!(!md.contains(glued), "glued stab badge survived: {md:?}");
    // An entry without a badge stays as it was (no stray parentheses).
    let unbadged = "Plain — Stable item.";
    assert!(md.contains(unbadged), "unbadged item altered: {md:?}");

    // The plain-text format separates the badge the same way.
    let text = extract_documentation_as_text(page);
    assert!(
        text.contains(separated),
        "text stab badge not separated: {text:?}"
    );
}
2793
#[test]
fn test_deprecation_badge_separated_from_signature() {
    // rustdoc emits the deprecation/stability badge in a
    // `<span class="item-info">` directly after the signature, without any
    // whitespace in between. Within a collapsed `<summary>` the flattened
    // text used to fuse the badge onto the signature (e.g.
    // `-> &str\u{1f44e} Deprecated since 1.42.0: ...`); a space is required.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<details class=\"toggle method-toggle\" open><summary>",
        "<section id=\"method.description\" class=\"method\">",
        "<h4 class=\"code-header\">fn <a href=\"#method.description\" class=\"fn\">description</a>",
        "(&self) -> &<a class=\"primitive\" href=\"../primitive.str.html\">str</a></h4></section>",
        "<span class=\"item-info\"><div class=\"stab deprecated\">",
        "<span class=\"emoji\">\u{1f44e}</span>",
        "<span>Deprecated since 1.42.0: <p>use the Display impl or to_string()</p></span>",
        "</div></span></summary></details>",
        "</section></body></html>"
    );
    let fused = "str\u{1f44e}";

    // Markdown: the fused form is gone and a space sits between the
    // signature and the badge.
    let md = extract_documentation(page);
    assert!(
        !md.contains(fused),
        "deprecation badge glued onto signature (markdown): {md:?}"
    );
    let spaced = md.contains("str \u{1f44e}") || md.contains("&str \u{1f44e}");
    assert!(
        spaced,
        "deprecation badge not space-separated (markdown): {md:?}"
    );

    // Plain text: the fused form must be absent as well.
    let text = extract_documentation_as_text(page);
    assert!(
        !text.contains(fused),
        "deprecation badge glued onto signature (text): {text:?}"
    );
}
2830
#[test]
fn test_blockquote_empty_marker_lines_removed() {
    // html2md surrounds blockquotes with empty `>` lines. Those boundary or
    // duplicate markers have to go, while a real paragraph break inside the
    // quote must stay.

    // Case 1: single-paragraph quote followed by a normal paragraph.
    let one_paragraph = concat!(
        "<html><body><section id=\"main-content\">",
        "<blockquote><p><strong>Note here</strong></p></blockquote>",
        "<p>after</p></section></body></html>"
    );
    let md = extract_documentation(one_paragraph);
    assert!(
        md.contains("> **Note here**"),
        "blockquote content missing: {md:?}"
    );
    // Every line must be something other than a bare `>` marker.
    let no_bare_marker = md.lines().all(|line| line.trim() != ">");
    assert!(
        no_bare_marker,
        "empty blockquote marker line survived: {md:?}"
    );

    // Case 2: two paragraphs in one quote keep the `>` line between them.
    let two_paragraphs = concat!(
        "<html><body><section id=\"main-content\">",
        "<blockquote><p>First para.</p><p>Second para.</p></blockquote>",
        "</section></body></html>"
    );
    let md = extract_documentation(two_paragraphs);
    let expected_break = "> First para.\n>\n> Second para.";
    assert!(
        md.contains(expected_break),
        "multi-paragraph blockquote break not preserved: {md:?}"
    );
}
2863
#[test]
fn test_superscript_footnote_converted_in_markdown() {
    // html2md does not handle <sup>/<sub>, so rustdoc footnote references
    // would otherwise reach the markdown as raw HTML (for instance
    // `<sup id="fnref1"><a href="#fn1">1</a></sup>`). The expected output is
    // plain-text `^(...)` / `_(...)` notation with nested tags removed.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<p>zero-padded to 2 digits. ",
        "<sup id=\"fnref1\"><a href=\"#fn1\">1</a></sup></p>",
        "<p>water is H<sub>2</sub>O.</p>",
        "</section></body></html>"
    );

    let md = extract_documentation(page);
    let raw_markup_leaked = ["<sup", "</sup>", "<a href"]
        .iter()
        .any(|fragment| md.contains(*fragment));
    assert!(
        !raw_markup_leaked,
        "superscript/anchor HTML leaked into markdown: {md:?}"
    );
    let footnote = "2 digits. ^(1)";
    assert!(
        md.contains(footnote),
        "footnote reference not converted to ^(1): {md:?}"
    );
    let subscript = "H_(2)O";
    assert!(
        md.contains(subscript),
        "subscript not converted to _(...): {md:?}"
    );

    // In the HTML output format <sup>/<sub> are valid markup and must stay.
    let html_out = extract_documentation_html(page);
    let kept_both = ["<sup", "<sub"]
        .iter()
        .all(|tag| html_out.contains(*tag));
    assert!(
        kept_both,
        "html format wrongly stripped super/subscript: {html_out:?}"
    );
}
2898
#[test]
fn test_markdown_restores_space_after_inline_link() {
    // html2md swallows a newline that follows an inline </a>, which fuses the
    // next word onto the (possibly downgraded) link. Example from the tokio
    // docs: "moved into the <a>tokio-stream</a>\ncrate.".
    let wrapped = concat!(
        "<html><body><section id=\"main-content\">",
        "<p>moved into the <a href=\"https://docs.rs/tokio-stream\">tokio-stream</a>\n",
        "crate. See <a href=\"struct.X.html\">X</a>\nfor details.</p>",
        "</section></body></html>"
    );
    let md = extract_documentation(wrapped);
    // The external link retains its URL and is followed by a space.
    let after_external = "tokio-stream) crate";
    assert!(
        md.contains(after_external),
        "missing space after external link: {md:?}"
    );
    // The downgraded relative link is followed by a space as well.
    let after_relative = "See X for details";
    assert!(
        md.contains(after_relative),
        "missing space after downgraded link: {md:?}"
    );

    // A line wrap in front of an opening parenthesis (a parenthetical aside)
    // needs the same treatment, otherwise `(` sticks to the link. Example
    // from the std slice docs: "the rules of references</a>\n(though ...".
    let aside = concat!(
        "<html><body><section id=\"main-content\">",
        "<p>would violate <a href=\"x.html\">the rules of references</a>\n",
        "(though possible).</p>",
        "</section></body></html>"
    );
    let aside_md = extract_documentation(aside);
    let spaced_aside = "references (though";
    assert!(
        aside_md.contains(spaced_aside),
        "missing space before parenthetical aside: {aside_md:?}"
    );

    // Counter-case: a function-style link immediately followed by `(` has no
    // whitespace in the source, so none may be added to the call expression.
    let call = concat!(
        "<html><body><section id=\"main-content\">",
        "<p>call <a href=\"x.html\">foo</a>(arg) now</p>",
        "</section></body></html>"
    );
    let call_md = extract_documentation(call);
    let intact_call = "foo(arg)";
    assert!(
        call_md.contains(intact_call),
        "spurious space inserted into call expression: {call_md:?}"
    );
}
2950
#[test]
fn test_markdown_preserves_code_block_whitespace() {
    // The inline-whitespace fix must not touch <pre> contents: highlighted
    // code blocks wrap <a>/<span> tokens across indented newlines.
    //
    // The fixture deliberately uses multi-space indentation (4 and 8
    // spaces). With a single leading space the assertion below could not
    // distinguish "indentation preserved" from "indentation collapsed to a
    // single leading space", which is the regression this test guards.
    let html = concat!(
        "<html><body><section id=\"main-content\">",
        "<pre><code>fn main() {\n",
        "    let x =\n",
        "        <a href=\"x.html\">HashMap</a>::new();\n",
        "}</code></pre>",
        "</section></body></html>"
    );
    let md = extract_documentation(html);
    // All four leading spaces of the `let` line must survive.
    assert!(
        md.contains("    let x ="),
        "code block indentation collapsed: {md:?}"
    );
}
2971
#[test]
fn test_markdown_unescapes_identifiers_outside_code() {
    // Markdown escapes (`\_`, `\<`, `\>`) must be removed from ordinary text
    // and signatures, while inline code spans and fenced code stay verbatim.
    //
    // Angle brackets and ampersands in the fixture are written as character
    // references (`&lt;`, `&gt;`, `&amp;`). Written raw, `<u8>` is parsed by
    // an HTML parser as an unknown element instead of text, so the generic
    // arguments would never reach the conversion as literal characters.
    let html = concat!(
        "<html><body><section id=\"main-content\">",
        "<h1>Crate serde_json</h1>",
        "<p>Use <code>serde_json::value</code> to build <code>Vec&lt;u8&gt;</code>.</p>",
        "<p>pub fn get(&amp;self) -&gt; Option&lt;&amp;Value&gt;</p>",
        "<pre><code>let v: Vec&lt;u8&gt; = path\\to;</code></pre>",
        "</section></body></html>"
    );
    let md = extract_documentation(html);
    // Escapes removed from ordinary text and signatures.
    assert!(
        md.contains("Crate serde_json"),
        "heading still escaped: {md:?}"
    );
    assert!(
        md.contains("-> Option<&Value>"),
        "signature still escaped: {md:?}"
    );
    assert!(!md.contains("\\_"), "stray underscore escape: {md:?}");
    assert!(
        !md.contains("\\<") && !md.contains("\\>"),
        "stray angle escape: {md:?}"
    );
    // Inline code span is preserved verbatim (no escaping introduced).
    assert!(
        md.contains("`serde_json::value`"),
        "inline code mangled: {md:?}"
    );
    // Fenced code content (a genuine backslash) is left untouched.
    assert!(md.contains("path\\to"), "fenced backslash altered: {md:?}");
}
3005
#[test]
fn test_clean_html_strips_oddly_formatted_block_elements() {
    // nav/header/footer/aside must be dropped even if their source markup
    // differs from how html5ever would serialize it (extra whitespace inside
    // the tag, additional attributes). The earlier cleanup string-matched the
    // serialized element against the raw HTML and silently let such elements
    // through.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<nav class=\"sidebar\">NAVLEAK</nav>",
        "<header data-x=\"1\">HEADERLEAK</header>",
        "<footer >FOOTERLEAK</footer>",
        "<aside role=\"note\">ASIDELEAK</aside>",
        "<p>Real doc.</p>",
        "</section></body></html>"
    );
    let cleaned = clean_html(page);

    // Each marker is the sole content of one element that must be removed.
    let chrome_markers = ["NAVLEAK", "HEADERLEAK", "FOOTERLEAK", "ASIDELEAK"];
    for leak in chrome_markers {
        assert!(!cleaned.contains(leak), "{leak} leaked: {cleaned}");
    }

    // The actual documentation paragraph is still there.
    assert!(cleaned.contains("Real doc."), "content dropped: {cleaned}");
}
3028
#[test]
fn test_clean_html_removes_source_links() {
    // Two `class="src"` anchors (one with an extra class) precede the real
    // documentation paragraph.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<a class=\"src rightside\" href=\"../src/foo/lib.rs.html#1-2\">Source</a>",
        "<a class=\"src\" href=\"../src/foo/lib.rs.html#5\">Source</a>",
        "<p>Real documentation text.</p>",
        "</section></body></html>"
    );

    // The "Source" link labels must not show up in the plain-text output,
    // while the paragraph text must.
    let text = extract_documentation_as_text(page);
    assert!(text.contains("Real documentation text."));
    assert!(!text.contains("Source"), "source label leaked: {text}");
}
3043
#[test]
fn test_html_to_text_superscript_uses_caret_notation() {
    // A bare footnote number in plain text reads like body text, so
    // <sup>/<sub> are expected as `^(...)`/`_(...)`, the same notation the
    // markdown path uses.
    let fragment = concat!(
        "<p>zero-padded to 2 digits. <sup id=\"f\"><a href=\"#fn1\">1</a></sup></p> ",
        "<p>water is H<sub>2</sub>O.</p>"
    );
    let text = html_to_text(fragment);

    let superscript = "2 digits. ^(1)";
    assert!(
        text.contains(superscript),
        "superscript not rendered as ^(1): {text:?}"
    );
    let subscript = "H_(2)O";
    assert!(
        text.contains(subscript),
        "subscript not rendered as _(2): {text:?}"
    );

    // Neither the <sup> tag nor the nested anchor may appear literally.
    let raw_tag_present = ["<sup", "<a href"]
        .iter()
        .any(|tag| text.contains(*tag));
    assert!(!raw_tag_present, "raw tags leaked into text: {text:?}");
}
3065
#[test]
fn test_html_to_text_table_rows_stay_on_one_line() {
    // All cells of a table row belong on one line, joined by ` | `, so the
    // columns of a row stay together in plain text instead of being spread
    // one cell per line.
    let table = concat!(
        "<table><thead><tr><th>Spec.</th><th>Example</th><th>Description</th></tr></thead>",
        "<tbody><tr><td>%Y</td><td>2001</td><td>The full year.</td></tr>",
        "<tr><td>%m</td><td>07</td><td>Month number.</td></tr></tbody></table>"
    );
    let text = html_to_text(table);

    let header_row = "Spec. | Example | Description";
    assert!(
        text.contains(header_row),
        "header row not joined with ` | `: {text:?}"
    );
    let first_data_row = "%Y | 2001 | The full year.";
    assert!(
        text.contains(first_data_row),
        "data row not joined with ` | `: {text:?}"
    );

    // Consecutive rows are divided by a newline.
    let row_boundary = "The full year.\n%m | 07 | Month number.";
    assert!(
        text.contains(row_boundary),
        "rows not on separate lines: {text:?}"
    );
}
3091
#[test]
fn test_html_to_text_table_preserves_empty_leading_cell() {
    // The header row starts with an empty (row-label) cell. That cell has to
    // be kept in the text output so the header has as many columns as each
    // data row and the columns stay aligned.
    let table = concat!(
        "<table><thead><tr><th></th><th>get(i)</th><th>insert(i)</th></tr></thead>",
        "<tbody><tr><td>Vec</td><td>O(1)</td><td>O(n-i)</td></tr></tbody></table>"
    );
    let text = html_to_text(table);

    // Locate the two rendered rows by a cell value unique to each.
    let mut header_candidates = text.lines().filter(|line| line.contains("get(i)"));
    let header = header_candidates.next().expect("header row missing");
    let mut data_candidates = text.lines().filter(|line| line.contains("Vec"));
    let data = data_candidates.next().expect("data row missing");

    // Same number of `|` separators means same number of columns (3).
    let header_pipes = header.matches('|').count();
    let data_pipes = data.matches('|').count();
    assert_eq!(
        header_pipes, data_pipes,
        "header/data column counts misaligned: header={header:?} data={data:?}"
    );
    assert_eq!(
        data.trim(),
        "Vec | O(1) | O(n-i)",
        "data row not joined correctly: {data:?}"
    );
}
3123
#[test]
fn test_html_to_text_drops_empty_spacer_rows() {
    // Tables sometimes carry all-empty "visual spacer" rows between data
    // rows. In text output such a row must vanish rather than appear as
    // content-free `| |` noise; any row with content keeps its columns.
    let table = concat!(
        "<table><tbody>",
        "<tr><td>%h</td><td>Jul</td><td>Same as %b.</td></tr>",
        "<tr><td></td><td></td><td></td></tr>",
        "<tr><td>%d</td><td>08</td><td>Day number.</td></tr>",
        "</tbody></table>"
    );
    let text = html_to_text(table);

    // No line may consist solely of pipe separators.
    let free_of_pipe_noise = text
        .lines()
        .map(str::trim)
        .all(|row| row != "| |" && row != "|");
    assert!(
        free_of_pipe_noise,
        "empty spacer row rendered as pipe noise: {text:?}"
    );

    // Both rows that hold data are still rendered in full.
    let data_rows_kept = ["%h | Jul | Same as %b.", "%d | 08 | Day number."]
        .iter()
        .all(|row| text.contains(*row));
    assert!(data_rows_kept, "data rows lost: {text:?}");
}
3147
#[test]
fn test_structfield_spans_render_on_separate_lines() {
    // rustdoc writes one `<span class="structfield section-header">` per
    // field and puts no whitespace between them. Neighbouring spans must not
    // stick together in markdown (`a: A``b: B`) nor merge tokens in text
    // (`A_tb`).
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<h2>Fields</h2>",
        "<span id=\"structfield.sa_family\" class=\"structfield section-header\">",
        "<a href=\"#structfield.sa_family\" class=\"anchor field\">\u{a7}</a>",
        "<code>sa_family: <a class=\"type\" href=\"type.sa_family_t.html\">sa_family_t</a></code></span>",
        "<span id=\"structfield.sa_data\" class=\"structfield section-header\">",
        "<a href=\"#structfield.sa_data\" class=\"anchor field\">\u{a7}</a>",
        "<code>sa_data: [<a class=\"type\" href=\"type.c_char.html\">c_char</a>; 14]</code></span>",
        "</section></body></html>"
    );

    // Plain text: tokens of adjacent fields stay apart, both declarations
    // are present.
    let text = extract_documentation_as_text(page);
    let fused_tokens = "sa_family_tsa_data";
    assert!(
        !text.contains(fused_tokens),
        "struct field tokens fused in text: {text:?}"
    );
    let both_declared = ["sa_family: sa_family_t", "sa_data: [c_char; 14]"]
        .iter()
        .all(|decl| text.contains(*decl));
    assert!(
        both_declared,
        "field declarations missing in text: {text:?}"
    );

    // Markdown: no single line mentions both fields.
    let md = extract_documentation(page);
    let one_field_per_line = md
        .lines()
        .all(|line| !(line.contains("sa_family") && line.contains("sa_data")));
    assert!(
        one_field_per_line,
        "struct fields glued on one line in markdown: {md:?}"
    );
}
3181
#[test]
fn test_html_to_text_separates_block_elements() {
    // Text of neighbouring block elements (item-index entries, list items,
    // table cells) must stay apart in the plain-text output.
    let list = "<ul><li>Dl_info</li><li>Elf32_Chdr</li><li>Foo</li></ul>";
    let text = html_to_text(list);

    let fused = "Dl_infoElf32";
    assert!(!text.contains(fused), "block text glued together: {text}");

    // Each list item occupies its own line.
    let one_per_line = "Dl_info\nElf32_Chdr\nFoo";
    assert!(
        text.contains(one_per_line),
        "blocks not on separate lines: {text}"
    );
}
3197
#[test]
fn test_item_index_table_renders_as_separate_items() {
    // docs.rs lays out crate/module overview indexes as
    // <dl class="item-table"><dt>name</dt><dd>summary</dd>...</dl>.
    // Left as-is, html2md would put every name on the same line.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<dl class=\"item-table\">",
        "<dt><a class=\"struct\" href=\"struct.Dl_info.html\">Dl_info</a></dt>",
        "<dt><a class=\"struct\" href=\"struct.Elf32_Chdr.html\">Elf32_Chdr</a></dt>",
        "<dt><a class=\"trait\" href=\"trait.Foo.html\">Foo</a></dt>",
        "<dd>A foo trait.</dd>",
        "</dl></section></body></html>"
    );
    let md = extract_documentation(page);

    // Names must not run together. html2md escapes `_` as `\_`, so a broken
    // rendering would show `info` immediately followed by `Elf32`.
    let run_together = "infoElf32";
    assert!(!md.contains(run_together), "item names concatenated: {md}");

    // Every item is present (tolerating markdown underscore escaping) and
    // the description survives.
    let has_dl_info = md.contains("Dl") && md.contains("info");
    assert!(has_dl_info, "missing Dl_info: {md}");
    assert!(md.contains("Elf32"), "missing Elf32_Chdr: {md}");
    assert!(md.contains("Foo"), "missing Foo: {md}");
    assert!(md.contains("A foo trait."), "missing description: {md}");

    // Entries are emitted as individual markdown list items.
    let bullet_count = md.matches("* ").count();
    assert!(
        bullet_count >= 3,
        "expected separate list items, got: {md}"
    );
}
3231
#[test]
fn test_extract_documentation_html_returns_clean_main_content() {
    // A full page: doctype, head with a link and a script, nav, the main
    // content section (including a source link), and a footer.
    let page = concat!(
        "<!DOCTYPE html><html><head><link rel=\"search\" href=\"/opensearch.xml\">",
        "<script>var x=1;</script></head><body><nav>Nav</nav>",
        "<section id=\"main-content\"><h1>Crate foo</h1><p>Body text.</p>",
        "<a class=\"src\" href=\"../src/foo.rs.html\">Source</a></section>",
        "<footer>Footer</footer></body></html>"
    );
    let out = extract_documentation_html(page);

    // The documentation body comes back as HTML.
    assert!(out.contains("Body text."), "missing body: {out}");
    let heading_present = out.contains("<h1>") || out.contains("Crate foo");
    assert!(heading_present);

    // Everything that is page chrome or noise has been dropped.
    assert!(!out.contains("<!DOCTYPE"), "doctype leaked: {out}");
    assert!(!out.contains("opensearch"), "head link leaked: {out}");
    assert!(!out.contains("<script"), "script leaked: {out}");
    assert!(!out.contains("Nav"), "nav leaked: {out}");
    assert!(!out.contains("Footer"), "footer leaked: {out}");
    assert!(!out.contains("Source"), "src link leaked: {out}");
}
3253
#[test]
fn test_clean_html_removes_script() {
    // A <script> element sits next to a body that holds the only real text.
    let page = "<html><script>var x = 1;</script><body>Hello</body></html>";
    let cleaned = clean_html(page);

    // Neither the tag name nor the script source may remain.
    let script_tag_present = cleaned.contains("script");
    assert!(!script_tag_present);
    let script_body_present = cleaned.contains("var x");
    assert!(!script_body_present);

    // The body text is retained.
    assert!(cleaned.contains("Hello"));
}
3262
#[test]
fn test_clean_html_strips_details_toggle_wrappers() {
    // Documentation wrapped in a `<details class="toggle top-doc">` element
    // whose <summary> reads "Expand description".
    let page = r#"<html><body><section id="main-content"><details class="toggle top-doc" open=""><summary>Expand description</summary><h2>MyCrate</h2><p>Useful docs.</p></details></section></body></html>"#;
    let cleaned = clean_html(page);

    // The wrapper tags and the summary label are gone.
    let opening_tag_left = cleaned.contains("<details");
    assert!(!opening_tag_left);
    let closing_tag_left = cleaned.contains("</details>");
    assert!(!closing_tag_left);
    let summary_left = cleaned.contains("Expand description");
    assert!(!summary_left);

    // What was inside the wrapper is kept.
    assert!(cleaned.contains("MyCrate"));
    assert!(cleaned.contains("Useful docs."));
}
3274
#[test]
fn test_extract_documentation_as_text_strips_ui_cruft() {
    // UI-only pieces around one real paragraph: a copy button, a section
    // anchor, and a toggle wrapper with its "Expand description" summary.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<button>Copy item path</button>",
        "<a class=\"anchor\" href=\"#x\">\u{00a7}</a>",
        "<details class=\"toggle top-doc\" open=\"\"><summary>Expand description</summary>",
        "<p>Real documentation text.</p></details>",
        "</section></body></html>"
    );
    let text = extract_documentation_as_text(page);

    // The paragraph is kept ...
    assert!(text.contains("Real documentation text."));

    // ... and none of the UI strings or the section sign come through.
    let button_label_left = text.contains("Copy item path");
    assert!(!button_label_left);
    let summary_left = text.contains("Expand description");
    assert!(!summary_left);
    let section_sign_left = text.contains('\u{00a7}');
    assert!(!section_sign_left);
}
3291
#[test]
fn test_text_strips_trailing_orphan_middot() {
    // The out-of-band stability row has the form `1.0.0 \u{00b7} <source>`.
    // With the source link stripped, the middot would be left dangling.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<div class=\"out-of-band\">1.0.0 \u{00b7} ",
        "<a class=\"src\" href=\"../src/x.rs.html\">source</a></div>",
        "<p>Body text.</p>",
        "</section></body></html>"
    );
    let text = extract_documentation_as_text(page);

    assert!(text.contains("Body text."), "body dropped: {text:?}");

    let dangling = "1.0.0 \u{00b7}";
    assert!(
        !text.contains(dangling),
        "orphan middot survived in text: {text:?}"
    );
}
3310
#[test]
fn test_extract_documentation_has_no_details_markup() {
    // Same toggle-wrapper shape as rustdoc's top-level description block.
    let page = r#"<html><body><section id="main-content"><details class="toggle top-doc" open=""><summary>Expand description</summary><h2>MyCrate</h2><p>Hello world.</p></details></section></body></html>"#;
    let md = extract_documentation(page);

    // No wrapper markup and no summary label in the markdown.
    let wrapper_left = md.contains("<details");
    assert!(!wrapper_left);
    let summary_left = md.contains("Expand description");
    assert!(!summary_left);

    // Heading and paragraph text are still present.
    assert!(md.contains("MyCrate"));
    assert!(md.contains("Hello world."));
}
3320
#[test]
fn test_clean_html_removes_dangerous_elements_with_irregular_whitespace() {
    // html5ever turns `<script defer >` into `<script defer>`, so a pass that
    // serializes the DOM and string-replaces against the raw input misses
    // it. The regex pre-strip still has to remove such elements, so that no
    // executable, style or embedded content reaches the html output format.
    let page = concat!(
        "<html><body><section id=\"main-content\">",
        "<script defer >alert('xss')</script>",
        "<STYLE type=\"text/css\" >.evil{color:red}</STYLE>",
        "<noscript >NoScriptContent</noscript>",
        "<iframe src=\"http://evil.example\"></iframe>",
        "<p>Safe documentation.</p>",
        "</section></body></html>"
    );
    let cleaned = clean_html(page);

    // One probe per dangerous element.
    let script_payload = "alert";
    assert!(!cleaned.contains(script_payload), "script leaked: {cleaned}");
    let style_payload = ".evil";
    assert!(!cleaned.contains(style_payload), "style leaked: {cleaned}");
    let noscript_payload = "NoScriptContent";
    assert!(
        !cleaned.contains(noscript_payload),
        "noscript leaked: {cleaned}"
    );
    let iframe_target = "evil.example";
    assert!(
        !cleaned.contains(iframe_target),
        "iframe leaked: {cleaned}"
    );

    // The harmless paragraph is untouched.
    assert!(cleaned.contains("Safe documentation."));
}
3349
#[test]
fn test_clean_html_removes_style() {
    // A <style> element sits next to a body that holds the only real text.
    let page = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
    let cleaned = clean_html(page);

    // Neither the tag name nor the CSS rule may remain.
    let style_tag_present = cleaned.contains("style");
    assert!(!style_tag_present);
    let css_rule_present = cleaned.contains(".foo");
    assert!(!css_rule_present);

    // The body text is retained.
    assert!(cleaned.contains("Content"));
}
3358
#[test]
fn test_html_to_text_removes_tags() {
    // A paragraph with one nested inline element.
    let fragment = "<p>Hello <strong>World</strong>!</p>";
    let text = html_to_text(fragment);

    // No angle bracket of any tag may be left.
    let has_open_bracket = text.contains('<');
    assert!(!has_open_bracket);
    let has_close_bracket = text.contains('>');
    assert!(!has_close_bracket);

    // Both words are present in the output.
    assert!(text.contains("Hello"));
    assert!(text.contains("World"));
}
3368
#[test]
fn test_html_to_text_excludes_script_and_style_recursively() {
    // Regression guard: exclusion of skip tags has to be recursive. Script or
    // style content at any depth of the tree must stay out of the text.
    let fragment = concat!(
        "<body>Hello<script>var secret = 1;</script> ",
        "<div><style>.x{color:red}</style>World</div> ",
        "<noscript>NOSCRIPT</noscript></body>"
    );
    let text = html_to_text(fragment);

    // Visible words are kept.
    assert!(text.contains("Hello"), "text: {text}");
    assert!(text.contains("World"), "text: {text}");

    // Content of <script>, nested <style> and <noscript> is dropped.
    let script_content = "secret";
    assert!(!text.contains(script_content), "script content leaked: {text}");
    let style_content = "color:red";
    assert!(!text.contains(style_content), "style content leaked: {text}");
    let noscript_content = "NOSCRIPT";
    assert!(
        !text.contains(noscript_content),
        "noscript content leaked: {text}"
    );
}
3384
#[test]
fn test_html_to_text_preserves_inline_runs() {
    // Regression guard: a word split over inline elements (docs.rs `<wbr>`
    // hints, syntax-highlight spans) and punctuation that directly follows
    // an inline element must not pick up extra spaces.
    let fragment = "<body><p>de<wbr>serializing data</p>\n<div><code>RandomState</code>, <code>Global</code>></div></body>";
    let text = html_to_text(fragment);

    // The word around <wbr> stays in one piece.
    let joined_word = "deserializing";
    assert!(text.contains(joined_word), "split word: {text}");
    let broken_word = "de serializing";
    assert!(!text.contains(broken_word), "spurious space: {text}");

    // The comma still touches the preceding inline element's text.
    let tight_comma = "RandomState,";
    assert!(text.contains(tight_comma), "space before comma: {text}");

    // The <p> and the <div> are divided by a newline, not a space.
    let block_boundary = "data\nRandomState";
    assert!(
        text.contains(block_boundary),
        "lost block separation: {text}"
    );
}
3401
#[test]
fn test_html_to_text_handles_entities() {
    // HTML character references must come out as the characters they stand
    // for. The fixture uses a real `&amp;` reference, and the assertions
    // require the decoded form; the earlier `a || b || c` check passed as
    // soon as either name was present, whether or not anything was decoded.
    let html = "<p>Tom &amp; Jerry</p>";
    let text = html_to_text(html);
    assert!(
        text.contains("Tom & Jerry"),
        "amp entity not decoded: {text:?}"
    );
    assert!(
        !text.contains("&amp;"),
        "raw entity leaked into text: {text:?}"
    );
}
3411
#[test]
fn test_clean_whitespace() {
    // Single leading/trailing spaces are trimmed.
    assert_eq!(clean_whitespace(" hello world "), "hello world");
    // Multi-space boundary test: runs of several spaces at both ends and
    // between the words collapse just like single ones. The input contains
    // real multi-space runs so this case differs from the one above.
    assert_eq!(clean_whitespace("   hello    world   "), "hello world");
    // Tabs and newlines count as whitespace too; a mixed run becomes one space.
    assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
}
3419
#[test]
fn test_extract_documentation() {
    // Minimal page with a heading and a paragraph, but no main-content
    // section.
    let page = "<html><body><h1>Title</h1><p>Content</p></body></html>";
    let docs = extract_documentation(page);

    // Heading and paragraph text both reach the output.
    let has_title = docs.contains("Title");
    assert!(has_title);
    let has_content = docs.contains("Content");
    assert!(has_content);
}
3427
#[test]
fn test_extract_search_results_crate_fallback_adds_note() {
    // When a crate landing page (heading starts with "Crate ") is served in
    // place of the requested item, the result has to say so.
    let page = r#"<html><body><section id="main-content"><h1>Crate serde</h1><p>Crate docs.</p></section></body></html>"#;
    let result = extract_search_results(page, "DoesNotExist");

    let title_line = "## Documentation: DoesNotExist";
    assert!(result.contains(title_line));

    let fallback_note = "No dedicated documentation page was found";
    assert!(
        result.contains(fallback_note),
        "missing fallback note: {result}"
    );
}
3440
#[test]
fn test_extract_search_results_direct_item_no_note() {
    // A genuine item page (heading starts with the item kind) must be
    // returned WITHOUT the fallback note.
    let page = r#"<html><body><section id="main-content"><h1>Function spawn</h1><p>Spawns.</p></section></body></html>"#;
    let result = extract_search_results(page, "spawn");

    let title_line = "## Documentation: spawn";
    assert!(result.contains(title_line));

    let fallback_note = "No dedicated documentation page was found";
    assert!(!result.contains(fallback_note));
}
3449
#[test]
fn test_extract_search_results_found() {
    // A page with content: the result names the queried path and carries the
    // page's heading text.
    let page = "<html><body><h1>Result</h1></body></html>";
    let item_path = "serde::Serialize";
    let result = extract_search_results(page, item_path);

    assert!(result.contains("Documentation"));
    assert!(result.contains("serde::Serialize"));
    assert!(result.contains("Result"));
}
3458
#[test]
fn test_extract_search_results_not_found() {
    // An empty body: the result reports "not found" and echoes the query.
    let empty_page = "<html><body></body></html>";
    let item_path = "nonexistent";
    let result = extract_search_results(empty_page, item_path);

    assert!(result.contains("not found"));
    assert!(result.contains("nonexistent"));
}
3466
#[test]
fn test_is_item_fallback_page_parent_type_fallback() {
    // A request for a method (`Value::is_null`) lands on the page of the
    // containing type (`Enum Value`). The heading mentions `Value` but not
    // the requested leaf `is_null`, so the page counts as a fallback.
    let page = r#"<html><body><section id="main-content"><h1>Enum serde_json::Value</h1><p>An enum.</p></section></body></html>"#;
    let requested = "Value::is_null";

    let flagged = is_item_fallback_page(page, requested);
    assert!(flagged);

    // The markdown path has to show the note for this kind of fallback.
    let result = extract_search_results(page, requested);
    let fallback_note = "No dedicated documentation page was found";
    assert!(
        result.contains(fallback_note),
        "parent fallback note missing: {result}"
    );
}
3481
#[test]
fn test_is_item_fallback_page_direct_hit_not_flagged() {
    // On a dedicated item page the heading contains the requested leaf,
    // whether the request is a full path or just the leaf.
    let trait_page = r#"<html><body><section id="main-content"><h1>Trait serde::Serialize</h1><p>A trait.</p></section></body></html>"#;
    let full_path_flagged = is_item_fallback_page(trait_page, "serde::Serialize");
    assert!(!full_path_flagged);
    let leaf_only_flagged = is_item_fallback_page(trait_page, "Serialize");
    assert!(!leaf_only_flagged);

    // A re-exported function that resolves at its canonical path
    // (`tokio::task::spawn` for a `tokio::spawn` request) still matches.
    let fn_page = r#"<html><body><section id="main-content"><h1>Function tokio::task::spawn</h1></section></body></html>"#;
    let reexport_flagged = is_item_fallback_page(fn_page, "tokio::spawn");
    assert!(!reexport_flagged);
}
3492
#[test]
fn test_is_item_fallback_page_crate_overview_fallback() {
    // The heading is a crate overview and does not mention the requested
    // item, so the page is reported as a fallback.
    let crate_page = r#"<html><body><section id="main-content"><h1>Crate serde</h1><p>Docs.</p></section></body></html>"#;
    let flagged = is_item_fallback_page(crate_page, "DoesNotExist");
    assert!(flagged);
}
3498
#[test]
fn test_is_item_fallback_page_no_heading_does_not_warn() {
    // With no <h1> on the page there is nothing to compare against; the
    // check must not warn in that case.
    let headless_page = r#"<html><body><section id="main-content"><p>No heading here.</p></section></body></html>"#;
    let flagged = is_item_fallback_page(headless_page, "Foo::bar");
    assert!(!flagged);
}
3505
#[test]
fn test_heading_contains_identifier_is_token_exact() {
    // `is` occurs inside `this` only as a substring, which must not count.
    let substring_hit = heading_contains_identifier("Struct this::That", "is");
    assert!(!substring_hit);

    // A path segment that equals the identifier counts.
    let path_segment_hit = heading_contains_identifier("Struct serde_json::Value", "Value");
    assert!(path_segment_hit);

    // An identifier containing an underscore matches as a whole.
    let underscore_hit = heading_contains_identifier("Method is_null", "is_null");
    assert!(underscore_hit);
}
3516
#[test]
fn test_clean_html_removes_link_tags() {
    // A stylesheet <link> in the head, plain text in the body.
    let page = "<html><head><link rel=\"stylesheet\" href=\"test.css\"></head><body>Hello</body></html>";
    let cleaned = clean_html(page);

    let link_left = cleaned.contains("link");
    assert!(!link_left, "link tag should be removed, got: {cleaned}");

    let stylesheet_left = cleaned.contains("stylesheet");
    assert!(
        !stylesheet_left,
        "stylesheet should be removed, got: {cleaned}"
    );

    let body_kept = cleaned.contains("Hello");
    assert!(body_kept, "Body content should remain, got: {cleaned}");
}
3534
#[test]
fn test_clean_html_removes_meta_tags() {
    // A charset <meta> in the head, plain text in the body.
    let page = "<html><head><meta charset=\"utf-8\"></head><body>Content</body></html>";
    let cleaned = clean_html(page);

    let meta_left = cleaned.contains("meta");
    assert!(!meta_left, "meta tag should be removed, got: {cleaned}");

    let body_kept = cleaned.contains("Content");
    assert!(body_kept, "Body content should remain, got: {cleaned}");
}
3548
#[test]
fn test_relative_link_regex() {
    // RELATIVE_LINK_REGEX must accept docs-relative `.html` targets only.
    let pattern = &RELATIVE_LINK_REGEX;

    let accepted = [
        "[module](module/index.html)",
        "[struct](struct.Struct.html)",
        "[tokio](../index.html)",
        "[crate](./index.html)",
        "[root](/serde/index.html)",
        // Module paths beginning with `_` or digits (e.g. clap's `_derive`).
        "[tutorial](_derive/_tutorial/index.html)",
        "[v2](2/index.html)",
    ];
    for link in accepted {
        assert!(pattern.is_match(link));
    }

    // A fragment-only anchor is not a relative page link.
    let anchor_matches = pattern.is_match("[Section](#section)");
    assert!(!anchor_matches);

    // Neither is an absolute external URL.
    let external_matches = pattern.is_match("[External](https://example.com)");
    assert!(!external_matches, "Should not match external URLs");
}
3571
#[test]
fn test_clean_markdown_keeps_external_html_links() {
    // An absolute URL that merely ends in `.html` is still an external link:
    // it has to survive intact instead of collapsing to its label.
    let expected_link = "[Guide](https://example.com/book/ch01.html)";
    let out = clean_markdown("See the [Guide](https://example.com/book/ch01.html) for details.");
    let link_kept = out.contains(expected_link);
    assert!(link_kept, "external link should be preserved, got: {out}");
}
3583
#[test]
fn test_clean_markdown_relative_links_keep_text() {
    // Links into underscore-prefixed modules (clap style) are docs.rs-relative
    // and would be broken if kept; only their label text should remain.
    let source = concat!(
        "Derive [tutorial](_derive/_tutorial/index.html) ",
        "and [reference](_derive/index.html)."
    );
    let out = clean_markdown(source);

    for (fragment, complaint) in [
        (".html", "relative link survived"),
        ("_derive", "relative target survived"),
    ] {
        assert!(!out.contains(fragment), "{complaint}: {out}");
    }

    let text_kept = out.contains("Derive tutorial and reference.");
    assert!(text_kept, "text not kept: {out}");
}
3598
#[test]
fn test_clean_markdown_relative_link_with_bracketed_label() {
    // Labels may themselves contain `]`: Rust attribute syntax such as
    // `#[tokio::main]`, or slice/array types like `[u8]` and `[T; N]`. Such
    // links must still collapse to their label. A label pattern that stops at
    // the first `]` would leave a broken docs.rs-relative `.html` link behind.
    let source = "Use [`#[tokio::main]`](attr.main.html) and the slice [`[u8]`](primitive.slice.html) plus [Foo](struct.Foo.html).";
    let out = clean_markdown(source);

    // Nothing of the link targets may remain...
    for (fragment, complaint) in [
        (".html", "relative link survived"),
        ("](attr", "bracketed-label link survived"),
    ] {
        assert!(!out.contains(fragment), "{complaint}: {out}");
    }

    // ...while every label is carried over as plain text.
    for (label, complaint) in [
        ("`#[tokio::main]`", "attribute label text dropped"),
        ("`[u8]`", "slice label text dropped"),
        ("Foo", "plain label text dropped"),
    ] {
        assert!(out.contains(label), "{complaint}: {out}");
    }
}
3622
#[test]
fn test_negative_impl_trait_not_rendered_as_image() {
    // For negative auto-trait impls (`impl<T> !Freeze for Mutex<T>`) rustdoc
    // emits a text `!` immediately before the linked trait name. html2md glues
    // the two together as `![Freeze](...)`, which is markdown image syntax and
    // renders as a broken embedded image. The `!` therefore has to come out
    // backslash-escaped so it remains literal text.
    let heading = "### impl<T> ![Freeze](https://doc.rust-lang.org/nightly/core/marker/trait.Freeze.html) for Mutex<T>\n";
    let md = clean_markdown(heading);

    let marker_escaped = md.contains(r"\![Freeze]");
    let raw_image_left = md.contains("> ![Freeze]");
    assert!(marker_escaped, "negative-impl marker not escaped: {md:?}");
    assert!(!raw_image_left, "unescaped image syntax survived: {md:?}");
}
3645
#[test]
fn test_clean_markdown_removes_old_rustdoc_artifacts() {
    // Header row in the shape older rustdoc pages produce: an empty-target
    // crate link, a collapse toggle (the minus sign is U+2212), a `[src]`
    // link and an info toggle, followed by a body paragraph with one external
    // link and one genuine section anchor.
    let md = concat!(
        "Crate [serde]() [ [\u{2212}] ](javascript:void(0)) ",
        "[[src]](../src/serde/lib.rs.html#9-267) [\u{24d8}](#)\n\nReal content ",
        "[External](https://serde.rs/) [Quick start](#quick-start)."
    );
    let out = clean_markdown(md);

    // Artifacts that must be gone.
    assert!(!out.contains("javascript:"), "js link leaked: {out}");
    assert!(
        !out.contains("src/serde/lib.rs.html"),
        "src link leaked: {out}"
    );
    assert!(!out.contains("[[src]]"), "src label leaked: {out}");
    assert!(!out.contains("]()"), "empty link leaked: {out}");

    // Body text is preserved.
    assert!(out.contains("Real content"), "body text dropped: {out}");

    // Useful text is preserved (empty link label downgraded to text). Only the
    // part in front of "Real content" is searched: the body keeps the
    // `https://serde.rs/` URL, so looking for "serde" in the whole output
    // would succeed even if the label had been dropped.
    let header_part = out.split("Real content").next().unwrap_or_default();
    assert!(
        header_part.contains("serde"),
        "empty-link label dropped: {out}"
    );

    // External non-.html links are preserved.
    assert!(
        out.contains("https://serde.rs/"),
        "external link dropped: {out}"
    );

    // No-op fragment-only toggles are removed, real anchors preserved.
    assert!(!out.contains("(#)"), "fragment toggle leaked: {out}");
    assert!(out.contains("#quick-start"), "real anchor dropped: {out}");
}
3671
#[test]
fn test_clean_markdown_keeps_named_fragment_link_text() {
    // On versioned docs.rs pages the crate name in the h1 is rendered as
    // `<a class="mod" href="#">serde</a>`, i.e. `[serde](#)` in markdown.
    // Its label has to be kept; only symbol-only toggles are discarded.
    let out = clean_markdown("Crate [serde](#) [ⓘ](#)\n\nbody");

    let name_kept = out.contains("Crate serde");
    assert!(name_kept, "crate name dropped: {out}");

    for (leftover, complaint) in [
        ("(#)", "fragment link syntax leaked"),
        ("ⓘ", "symbol toggle leaked"),
    ] {
        assert!(!out.contains(leftover), "{complaint}: {out}");
    }
}
3683
#[test]
fn test_clean_markdown_drops_relative_read_more_keeps_absolute() {
    // rustdoc tacks a "Read more" link onto summaries of inherited/derived
    // methods. With a docs.rs-relative target the link is unreachable, and
    // merely downgrading it would leave a dangling "Read more", so the whole
    // affordance has to disappear.
    let relative_case =
        "Returns a duplicate of the value. [Read more](../clone/trait.Clone.html#tymethod.clone)";
    let cleaned_relative = clean_markdown(relative_case);
    assert_eq!(
        cleaned_relative.trim(),
        "Returns a duplicate of the value.",
        "relative Read more affordance not dropped cleanly: {cleaned_relative:?}"
    );

    // With an absolute (scheme://) target the link is usable and must stay.
    let absolute_link =
        "[Read more](https://doc.rust-lang.org/core/fmt/trait.Debug.html#tymethod.fmt)";
    let absolute_case = "Formats the value. [Read more](https://doc.rust-lang.org/core/fmt/trait.Debug.html#tymethod.fmt)";
    let cleaned_absolute = clean_markdown(absolute_case);
    let link_kept = cleaned_absolute.contains(absolute_link);
    assert!(
        link_kept,
        "absolute Read more link wrongly dropped: {cleaned_absolute:?}"
    );
}
3706
#[test]
fn test_clean_markdown_downgrades_rustdoc_item_anchors() {
    // rustdoc cross-links items through type-prefixed fragment anchors
    // (`#method.X`, `#associatedtype.X`, `#impl-...`). Those ids are absent
    // from the rendered markdown, so such links are dead and should collapse
    // to their label, while genuine section anchors stay linked.
    let source = "fn [parse](#method.parse)() -> Box and [`Error`](#associatedtype.Error) plus [here](#impl-Clone-for-Foo). See [Quick start](#quick-start).";
    let out = clean_markdown(source);

    // Dead item anchors are gone.
    for (anchor, complaint) in [
        ("#method.parse", "method anchor survived"),
        ("#associatedtype.Error", "assoc-type anchor survived"),
        ("#impl-", "impl anchor survived"),
    ] {
        assert!(!out.contains(anchor), "{complaint}: {out}");
    }

    // Labels are kept as text.
    for (label, complaint) in [
        ("fn parse()", "method label dropped"),
        ("`Error`", "assoc-type label dropped"),
        ("here", "impl label dropped"),
    ] {
        assert!(out.contains(label), "{complaint}: {out}");
    }

    // Genuine section anchors are preserved.
    let section_link_kept = out.contains("[Quick start](#quick-start)");
    assert!(
        section_link_kept,
        "section anchor wrongly downgraded: {out}"
    );
}
3738
#[test]
fn test_clean_markdown_removes_stray_middot_line() {
    // Once the source link and collapse toggle are stripped from rustdoc's
    // out-of-band row, a middot (U+00B7) is left alone on its own line.
    let source = concat!(
        "Crate serde\n",
        "==========\n",
        "\n",
        "\u{00b7}\n",
        "\n",
        "Serde is a framework."
    );
    let out = clean_markdown(source);

    let orphan_line_left = out.contains("\n\u{00b7}\n");
    assert!(!orphan_line_left, "stray middot line leaked: {out:?}");
    for (text, complaint) in [
        ("Crate serde", "heading dropped"),
        ("Serde is a framework.", "body dropped"),
    ] {
        assert!(out.contains(text), "{complaint}: {out}");
    }

    // A middot sitting between words in running prose is left alone.
    let inline = clean_markdown("a \u{00b7} b");
    let inline_kept = inline.contains('\u{00b7}');
    assert!(inline_kept, "inline middot wrongly dropped: {inline}");
}
3758
#[test]
fn test_clean_markdown_strips_trailing_middot_and_nbsp() {
    // Two leftovers are covered here: the stability/out-of-band line keeps a
    // dangling middot after its trailing source link is stripped
    // (e.g. "1.0.0 \u{00b7}"), and rustdoc headings often end in a
    // non-breaking space (U+00A0).
    let source = concat!(
        "Struct HashMap\u{00a0} \n",
        "==========\n",
        "\n",
        "1.0.0 \u{00b7}\n",
        "\n",
        "Body."
    );
    let out = clean_markdown(source);

    let heading_trimmed = out.contains("Struct HashMap\n");
    assert!(
        heading_trimmed,
        "trailing nbsp not trimmed from heading: {out:?}"
    );

    let version_line_clean = out.contains("1.0.0\n") || out.ends_with("1.0.0\n\nBody.");
    assert!(version_line_clean, "trailing middot not stripped: {out:?}");

    let orphan_left = out.contains("1.0.0 \u{00b7}");
    assert!(!orphan_left, "orphan middot survived: {out:?}");

    // A middot between words on the same line must be kept.
    let inline = clean_markdown("a \u{00b7} b");
    assert!(inline.contains('\u{00b7}'), "inline middot wrongly dropped");
}
3784
#[test]
fn test_clean_markdown_removes_breadcrumb_colon_lines() {
    // A `::` on a line of its own is an orphaned breadcrumb separator, whereas
    // `::` inside content (`S::Ok`) is an ordinary path separator.
    let source = concat!(
        "## Documentation: spawn\n",
        "\n",
        "::\n",
        "\n",
        "Function spawn\n",
        "\n",
        "let x = S::Ok;"
    );
    let out = clean_markdown(source);

    // The orphan breadcrumb separator line is gone.
    let separator_line_left = out.contains("\n::\n");
    assert!(!separator_line_left, "stray colon line leaked: {out}");

    // Inline `::` inside content is preserved.
    let inline_path_kept = out.contains("S::Ok");
    assert!(inline_path_kept, "inline path separator dropped: {out}");

    assert!(out.contains("Function spawn"));
}
3804
#[test]
fn test_clean_markdown_preserves_content() {
    // Guard against over-eager cleaning: ordinary headings, prose, a code
    // fence, an external link and a section anchor must all come through.
    let markdown = concat!(
        "# Dioxus\n",
        "\n",
        "## At a glance\n",
        "\n",
        "Dioxus is a framework for building cross-platform apps.\n",
        "\n",
        "## Quick start\n",
        "\n",
        "To get started with Dioxus:\n",
        "\n",
        "```\n",
        "cargo install dioxus-cli\n",
        "```\n",
        "\n",
        "[External Link](https://dioxuslabs.com)\n",
        "\n",
        "[Anchor](#quick-start)\n",
    );
    let cleaned = clean_markdown(markdown);

    // Main content survives.
    for text in [
        "Dioxus is a framework",
        "At a glance",
        "Quick start",
        "cargo install",
    ] {
        assert!(cleaned.contains(text));
    }

    // So do the external link and the anchor link, markup included.
    for (link, message) in [
        (
            "[External Link](https://dioxuslabs.com)",
            "Should preserve external links",
        ),
        ("[Anchor](#quick-start)", "Should preserve anchor links"),
    ] {
        assert!(cleaned.contains(link), "{message}");
    }
}
3844
3845 // ============================================================================
3846 // Performance optimization tests
3847 // ============================================================================
3848
/// Runs `extract_documentation` over a full page that has a main-content
/// section surrounded by navigation, footer and script elements.
/// Guards the single-pass extraction against regressions.
#[test]
fn test_extract_documentation_single_pass_optimization() {
    let page = r#"
<!DOCTYPE html>
<html>
<head><title>Test Crate</title></head>
<body>
 <nav>Navigation content</nav>
 <section id="main-content">
 <h1>Test Crate</h1>
 <p>This is the main documentation.</p>
 <script>console.log('test');</script>
 <div class="docblock">
 <p>Docblock content here.</p>
 </div>
 </section>
 <footer>Footer content</footer>
</body>
</html>
"#;
    let docs = extract_documentation(page);

    // Text from the main content section must be extracted.
    for (wanted, message) in [
        ("Test Crate", "Should contain title"),
        ("main documentation", "Should contain main content"),
        ("Docblock content", "Should preserve docblock"),
    ] {
        assert!(docs.contains(wanted), "{message}");
    }

    // Page chrome and script bodies must not be.
    for (unwanted, message) in [
        ("Navigation content", "Should remove nav"),
        ("Footer content", "Should remove footer"),
        ("console.log", "Should remove script"),
    ] {
        assert!(!docs.contains(unwanted), "{message}");
    }
}
3889
/// Runs `extract_search_results` over a page with a main-content section and
/// a trailing `<nav>`, guarding the single-pass search-result extraction.
#[test]
fn test_extract_search_results_single_pass_optimization() {
    let page = r#"
<!DOCTYPE html>
<html>
<body>
 <section id="main-content">
 <h1>serde::Serialize</h1>
 <pre><code>pub trait Serialize { }</code></pre>
 <p>Serialize trait documentation.</p>
 </section>
 <nav>Sidebar</nav>
</body>
</html>
"#;
    let result = extract_search_results(page, "serde::Serialize");

    // These fragments are all expected in the result.
    for expected in ["Documentation", "serde::Serialize", "Serialize trait"] {
        assert!(result.contains(expected));
    }

    // The navigation text must have been dropped.
    let sidebar_left = result.contains("Sidebar");
    assert!(!sidebar_left);
}
3917
/// Feeds `clean_html` a page mixing several kinds of removable elements
/// (style, script, nav, footer) and checks they are all stripped together.
#[test]
fn test_clean_html_multiple_skip_tags() {
    let page = r"
<html>
<head>
 <style>.test { color: red; }</style>
 <script>var x = 1;</script>
</head>
<body>
 <nav>Navigation</nav>
 <article>
 <h1>Title</h1>
 <p>Content with <script>inline script</script> removed.</p>
 <footer>Article footer</footer>
 </article>
 <footer>Page footer</footer>
</body>
</html>
";
    let cleaned = clean_html(page);

    // Article text stays.
    for kept in ["Title", "Content"] {
        assert!(cleaned.contains(kept));
    }

    // Every unwanted element is gone, together with its content.
    for (removed, message) in [
        ("style", "Should remove style tags"),
        ("script", "Should remove script tags"),
        ("Navigation", "Should remove nav"),
        ("footer", "Should remove footer"),
        (".test", "Should remove CSS content"),
        ("var x", "Should remove JS content"),
    ] {
        assert!(!cleaned.contains(removed), "{message}");
    }
}
3952
/// Checks, one tag type at a time, that `clean_html` strips the element while
/// keeping the paragraph that follows it.
#[test]
fn test_cached_selectors_all_tag_types() {
    // Each row is (input HTML, tag name that must vanish, text that must stay).
    let cases: [(&str, &str, &str); 9] = [
        (
            "<script>alert('test')</script><p>Content</p>",
            "script",
            "Content",
        ),
        ("<style>.x{}</style><p>Content</p>", "style", "Content"),
        (
            "<noscript>Enable JS</noscript><p>Content</p>",
            "noscript",
            "Content",
        ),
        (
            "<iframe src=\"x\"></iframe><p>Content</p>",
            "iframe",
            "Content",
        ),
        ("<nav><a>Link</a></nav><p>Content</p>", "nav", "Content"),
        ("<header>Head</header><p>Content</p>", "header", "Content"),
        ("<footer>Foot</footer><p>Content</p>", "footer", "Content"),
        ("<aside>Sidebar</aside><p>Content</p>", "aside", "Content"),
        ("<button>Click</button><p>Content</p>", "button", "Content"),
    ];

    for &(input, tag, kept_text) in &cases {
        let cleaned = clean_html(input);
        let tag_left = cleaned.contains(tag);
        let text_kept = cleaned.contains(kept_text);
        assert!(!tag_left, "Should remove {tag} tag");
        assert!(text_kept, "Should preserve {kept_text}");
    }
}
3993}