Skip to main content

docgen_core/
headings.rs

1//! Extract the `h2`/`h3` heading outline of a document and stamp matching
2//! `id` anchors onto the rendered heading tags.
3//!
4//! The right-rail "On this page" table of contents and the scroll-spy island
5//! both key off `id` attributes on `<h2>`/`<h3>` in the rendered article. Comrak
6//! *can* emit heading ids, but it places them on a nested
7//! `<a class="anchor" id="…">` element rather than the heading itself, which the
8//! `h2[id]` / `h3[id]` selectors the scroll-spy uses would never match. So we
9//! anchorize the heading text ourselves (with comrak's own [`Anchorizer`], so
10//! the slugs are byte-for-byte what comrak would have produced) and inject the
11//! `id` directly onto the heading's opening tag.
12
13use comrak::html::collect_text;
14use comrak::nodes::{AstNode, NodeValue};
15use comrak::Anchorizer;
16use serde::Serialize;
17
18/// One entry in a page's heading outline. Only `h2`/`h3` are collected — `h1`
19/// is the (hidden) page title and `h4`+ are too deep for the rail TOC.
20#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
21pub struct Heading {
22    /// Stable anchor id (matches the `id` stamped on the rendered heading tag).
23    pub id: String,
24    /// Human-readable heading text.
25    pub text: String,
26    /// Heading level: `2` or `3`.
27    pub depth: u8,
28}
29
30/// Walk the AST in document order and collect the `h2`/`h3` outline, anchorizing
31/// each heading's text into a unique id. One [`Anchorizer`] per call guarantees
32/// the `-1`, `-2`, … de-duplication suffixes match comrak's own scheme.
33pub fn collect_headings<'a>(root: &'a AstNode<'a>) -> Vec<Heading> {
34    let mut anchorizer = Anchorizer::new();
35    let mut out = Vec::new();
36    for node in root.descendants() {
37        if let NodeValue::Heading(h) = &node.data.borrow().value {
38            if h.level == 2 || h.level == 3 {
39                let text = collect_text(node);
40                let id = anchorizer.anchorize(&text);
41                out.push(Heading {
42                    id,
43                    text: text.trim().to_string(),
44                    depth: h.level,
45                });
46            }
47        }
48    }
49    out
50}
51
52/// Inject `id="…"` onto the `<h2>`/`<h3>` opening tags of `html`, in document
53/// order, using the ids from [`collect_headings`].
54///
55/// `headings` MUST be in the same order the tags appear in `html` (it is, since
56/// both derive from one AST walk). Each heading consumes the next matching
57/// `<h2>`/`<h3>` occurrence. Comrak emits bare `<h2>` / `<h3>` (no sourcepos,
58/// no existing id), so a plain ordered text rewrite is exact and unambiguous.
59pub fn stamp_heading_ids(html: &str, headings: &[Heading]) -> String {
60    let mut out = String::with_capacity(html.len() + headings.len() * 24);
61    let mut rest = html;
62    let mut iter = headings.iter();
63
64    loop {
65        // Find the next `<h2>` or `<h3>` opening tag.
66        let h2 = rest.find("<h2>");
67        let h3 = rest.find("<h3>");
68        let next = match (h2, h3) {
69            (None, None) => None,
70            (Some(a), None) => Some((a, 2u8)),
71            (None, Some(b)) => Some((b, 3u8)),
72            (Some(a), Some(b)) => {
73                if a < b {
74                    Some((a, 2))
75                } else {
76                    Some((b, 3))
77                }
78            }
79        };
80
81        let Some((pos, level)) = next else {
82            out.push_str(rest);
83            break;
84        };
85
86        let tag_len = 4; // "<hN>"
87        out.push_str(&rest[..pos]);
88        match iter.next() {
89            Some(h) if h.depth == level => {
90                out.push_str(&format!("<h{} id=\"{}\">", level, escape_attr(&h.id)));
91            }
92            // Misalignment (shouldn't happen): leave the tag untouched.
93            _ => out.push_str(&rest[pos..pos + tag_len]),
94        }
95        rest = &rest[pos + tag_len..];
96    }
97
98    out
99}
100
/// Escape an anchorized id for use inside a double-quoted HTML attribute.
///
/// Rewrites `&` to `&amp;`, `"` to `&quot;` and `<` to `&lt;`; every other
/// character is copied through unchanged. Anchorizing already strips most
/// markup-significant characters, so this only guards whatever is left.
fn escape_attr(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '"' => escaped.push_str("&quot;"),
            '<' => escaped.push_str("&lt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
108
#[cfg(test)]
mod tests {
    use super::*;
    use comrak::{parse_document, Arena};

    /// Parse `markdown` with the crate's comrak options and collect its outline.
    fn outline_of(markdown: &str) -> Vec<Heading> {
        let arena = Arena::new();
        let root = parse_document(&arena, markdown, &crate::markdown::comrak_options());
        collect_headings(root)
    }

    /// Shorthand for building an expected or input [`Heading`].
    fn heading(id: &str, text: &str, depth: u8) -> Heading {
        Heading {
            id: id.into(),
            text: text.into(),
            depth,
        }
    }

    // Only levels 2 and 3 make it into the outline.
    #[test]
    fn collects_h2_and_h3_skips_h1_and_h4() {
        let outline = outline_of("# Title\n\n## Alpha\n\n### Beta\n\n#### Deep\n");
        assert_eq!(outline.len(), 2);
        assert_eq!(outline[0], heading("alpha", "Alpha", 2));
        assert_eq!(outline[1], heading("beta", "Beta", 3));
    }

    // Repeated titles must not collide on the same anchor.
    #[test]
    fn duplicate_headings_get_unique_suffixes() {
        let outline = outline_of("## Notes\n\n## Notes\n");
        assert_eq!(outline[0].id, "notes");
        assert_eq!(outline[1].id, "notes-1");
    }

    // Each heading's id lands on the matching opening tag.
    #[test]
    fn stamps_ids_onto_heading_tags_in_order() {
        let headings = vec![heading("alpha", "Alpha", 2), heading("beta", "Beta", 3)];
        let stamped = stamp_heading_ids("<h2>Alpha</h2>\n<p>x</p>\n<h3>Beta</h3>\n", &headings);
        assert!(stamped.contains(r#"<h2 id="alpha">Alpha</h2>"#));
        assert!(stamped.contains(r#"<h3 id="beta">Beta</h3>"#));
    }

    // HTML without heading tags passes through byte-for-byte.
    #[test]
    fn stamp_is_noop_without_headings() {
        let html = "<p>no headings here</p>";
        assert_eq!(stamp_heading_ids(html, &[]), html);
    }
}