Skip to main content

marco_core/intelligence/
toc.rs

1//! Table of Contents extraction and Markdown generation.
2//!
3//! # Workflow
4//! 1. Parse the document with `core::parse()`
5//! 2. Call [`extract_toc`] to get a flat list of [`TocEntry`] items
6//! 3. Call [`generate_toc_markdown`] to produce the fenced TOC block
7//! 4. Call [`replace_toc_in_text`] to update or insert the block in the source
8
9use crate::parser::{Document, NodeKind};
10use std::collections::HashMap;
11
/// One heading as it appears in the extracted Table of Contents.
///
/// Entries are produced by [`extract_toc`] and consumed by
/// [`generate_toc_markdown`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct TocEntry {
    /// Heading depth, `1` for the outermost level through `6` for the deepest.
    pub level: u8,
    /// Heading text exactly as stored on the AST node.
    pub text: String,
    /// Anchor slug: the explicit heading id when present, otherwise derived
    /// from `text`, with a numeric suffix appended for repeated slugs.
    pub slug: String,
    /// 1-based source line of the heading taken from the AST span;
    /// `0` when the node carries no span.
    pub line: usize,
}
24
/// Outcome of [`replace_toc_in_text`].
#[derive(Debug)]
pub enum TocReplaceResult {
    /// The text has no usable `<!-- TOC -->` … `<!-- /TOC -->` pair; the caller
    /// is expected to insert a fresh block (e.g. at the cursor) instead.
    NoMarkers,
    /// The block already present is byte-for-byte equal to the new one, so the
    /// text does not need to be touched.
    NoChange,
    /// The block was swapped out; carries the complete updated source text.
    Updated(String),
}
35
/// Derive a URL anchor slug from heading text.
///
/// The text is lowercased and then cut into runs of alphanumeric characters
/// (as defined by [`char::is_alphanumeric`], so non-ASCII letters and digits
/// are kept). Everything else — spaces, hyphens, underscores, punctuation —
/// acts as a separator. The runs are joined with a single `-`, which means:
///
/// - consecutive separators collapse into one hyphen, and
/// - the slug never starts or ends with a hyphen.
///
/// Text without any alphanumeric character yields an empty string.
pub fn heading_slug(text: &str) -> String {
    text.to_lowercase()
        .split(|c: char| !c.is_alphanumeric())
        // Adjacent separators and leading/trailing separators produce empty
        // pieces; dropping them is what collapses and trims the hyphens.
        .filter(|piece| !piece.is_empty())
        .collect::<Vec<_>>()
        .join("-")
}
68
69/// Extract all top-level headings from a document as a flat list of [`TocEntry`] items.
70///
71/// - Explicit `{#id}` from the AST `id` field is used as-is; otherwise the slug is
72///   derived from the heading text via [`heading_slug`].
73/// - Duplicate slugs receive `-1`, `-2`, … suffixes in document order.
74///
75/// **Slug parity with the renderer**: the renderer increments its own slug counter for
76/// *every* heading it encounters during the depth-first tree walk — including headings
77/// that live inside blockquotes, admonitions, etc.  This function replicates that walk
78/// so that the counters stay in sync.  Nested headings are not listed in the output
79/// (callers expect a "structural" TOC), but their slugs are still counted so that the
80/// numbers assigned to top-level headings match the `id` attributes in the rendered HTML.
81pub fn extract_toc(doc: &Document) -> Vec<TocEntry> {
82    let mut entries = Vec::new();
83    let mut slug_counts: HashMap<String, usize> = HashMap::new();
84    walk_toc_nodes(&doc.children, true, &mut entries, &mut slug_counts);
85    entries
86}
87
88/// Depth-first walk that mirrors the renderer's heading slug counter.
89///
90/// `include` is `true` only for top-level document children.  At deeper levels it is
91/// `false` so those headings affect the counter but are not added to `entries`.
92fn walk_toc_nodes(
93    nodes: &[crate::parser::Node],
94    include: bool,
95    entries: &mut Vec<TocEntry>,
96    slug_counts: &mut HashMap<String, usize>,
97) {
98    for node in nodes {
99        if let NodeKind::Heading { level, text, id } = &node.kind {
100            let base = id
101                .as_deref()
102                .map(|s| s.to_string())
103                .unwrap_or_else(|| heading_slug(text));
104
105            let count = slug_counts.entry(base.clone()).or_insert(0);
106            let slug = if *count == 0 {
107                base.clone()
108            } else {
109                format!("{}-{}", base, count)
110            };
111            *count += 1;
112
113            if include {
114                entries.push(TocEntry {
115                    level: *level,
116                    text: text.clone(),
117                    slug,
118                    line: node.span.map(|s| s.start.line).unwrap_or(0),
119                });
120            }
121        }
122
123        // Recurse into block containers (blockquotes, admonitions, list items, …).
124        // Their headings are counted for slug deduplication but not shown in the TOC.
125        if !node.children.is_empty() {
126            walk_toc_nodes(&node.children, false, entries, slug_counts);
127        }
128    }
129}
130
131/// Generate a Markdown TOC block wrapped in `<!-- TOC -->` / `<!-- /TOC -->` markers.
132///
133/// Returns an empty string when `entries` is empty.
134/// Indentation is relative to the minimum heading level present.
135pub fn generate_toc_markdown(entries: &[TocEntry]) -> String {
136    if entries.is_empty() {
137        return String::new();
138    }
139
140    let min_level = entries.iter().map(|e| e.level).min().unwrap_or(1);
141    let mut lines = vec!["<!-- TOC -->".to_string()];
142
143    for entry in entries {
144        let indent = "  ".repeat((entry.level - min_level) as usize);
145        lines.push(format!("{}- [{}](#{})", indent, entry.text, entry.slug));
146    }
147
148    lines.push(String::new()); // blank line terminates the list before the closing marker
149    lines.push("<!-- /TOC -->".to_string());
150    lines.join("\n")
151}
152
153/// Attempt to find and replace an existing `<!-- TOC -->...<!-- /TOC -->` block in
154/// `current_text` with `new_toc`.
155///
156/// Returns:
157/// - [`TocReplaceResult::NoMarkers`] when no markers were found
158/// - [`TocReplaceResult::NoChange`] when the existing block equals `new_toc`
159/// - [`TocReplaceResult::Updated`] with the new full source text otherwise
160pub fn replace_toc_in_text(current_text: &str, new_toc: &str) -> TocReplaceResult {
161    const START_MARKER: &str = "<!-- TOC -->";
162    const END_MARKER: &str = "<!-- /TOC -->";
163
164    let Some(start_pos) = current_text.find(START_MARKER) else {
165        return TocReplaceResult::NoMarkers;
166    };
167    let Some(end_pos) = current_text.find(END_MARKER) else {
168        return TocReplaceResult::NoMarkers;
169    };
170
171    if end_pos < start_pos {
172        return TocReplaceResult::NoMarkers;
173    }
174
175    let end_of_block = end_pos + END_MARKER.len();
176    let existing = &current_text[start_pos..end_of_block];
177
178    if existing == new_toc {
179        return TocReplaceResult::NoChange;
180    }
181
182    let mut result = String::with_capacity(current_text.len());
183    result.push_str(&current_text[..start_pos]);
184    result.push_str(new_toc);
185    result.push_str(&current_text[end_of_block..]);
186    TocReplaceResult::Updated(result)
187}
188
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::{Document, Node};

    /// Heading node without span or children.
    fn heading(level: u8, text: &str, id: Option<&str>) -> Node {
        Node {
            kind: NodeKind::Heading {
                level,
                text: text.to_string(),
                id: id.map(str::to_string),
            },
            span: None,
            children: vec![],
        }
    }

    /// Document whose top-level children are `children`; all other fields default.
    fn document(children: Vec<Node>) -> Document {
        Document {
            children,
            ..Default::default()
        }
    }

    /// TOC entry with an unknown (`0`) source line.
    fn toc_entry(level: u8, text: &str, slug: &str) -> TocEntry {
        TocEntry {
            level,
            text: text.to_string(),
            slug: slug.to_string(),
            line: 0,
        }
    }

    /// Run `heading_slug` over `(input, expected)` pairs.
    fn assert_slugs(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(heading_slug(input), *expected);
        }
    }

    #[test]
    fn smoke_heading_slug_basic() {
        assert_slugs(&[
            ("Hello World", "hello-world"),
            ("Introduction", "introduction"),
            ("Getting Started Guide", "getting-started-guide"),
        ]);
    }

    #[test]
    fn smoke_heading_slug_special_chars() {
        assert_slugs(&[
            ("Code <example> & test", "code-example-test"),
            ("A/B Testing", "a-b-testing"),
            ("Hello---World", "hello-world"),
        ]);
    }

    #[test]
    fn smoke_heading_slug_empty() {
        assert_slugs(&[("", ""), ("---", ""), ("!@#", "")]);
    }

    #[test]
    fn smoke_extract_toc_basic() {
        let doc = document(vec![
            heading(1, "Title", None),
            heading(2, "Getting Started", None),
            heading(2, "Installation", None),
        ]);

        let toc = extract_toc(&doc);
        assert_eq!(toc.len(), 3);
        assert_eq!(toc[0].slug, "title");
        assert_eq!(toc[1].slug, "getting-started");
        assert_eq!(toc[2].slug, "installation");
    }

    #[test]
    fn smoke_extract_toc_explicit_id_wins() {
        let doc = document(vec![heading(2, "My Title", Some("custom-id"))]);

        let toc = extract_toc(&doc);
        assert_eq!(toc[0].slug, "custom-id");
    }

    #[test]
    fn smoke_extract_toc_duplicate_slugs() {
        let doc = document(vec![
            heading(2, "Introduction", None),
            heading(2, "Introduction", None),
            heading(2, "Introduction", None),
        ]);

        let toc = extract_toc(&doc);
        assert_eq!(toc[0].slug, "introduction");
        assert_eq!(toc[1].slug, "introduction-1");
        assert_eq!(toc[2].slug, "introduction-2");
    }

    #[test]
    fn smoke_generate_toc_markdown_basic() {
        let toc = vec![
            toc_entry(1, "Title", "title"),
            toc_entry(2, "Getting Started", "getting-started"),
            toc_entry(3, "Installation", "installation"),
        ];

        let rendered = generate_toc_markdown(&toc);
        assert!(rendered.starts_with("<!-- TOC -->"));
        assert!(rendered.ends_with("<!-- /TOC -->"));
        assert!(rendered.contains("- [Title](#title)"));
        assert!(rendered.contains("  - [Getting Started](#getting-started)"));
        assert!(rendered.contains("    - [Installation](#installation)"));
    }

    #[test]
    fn smoke_generate_toc_markdown_empty() {
        assert_eq!(generate_toc_markdown(&[]), "");
    }

    #[test]
    fn smoke_replace_toc_no_markers() {
        let source = "# Hello\n\nSome content.\n";
        let block = "<!-- TOC -->\n- [Hello](#hello)\n<!-- /TOC -->";
        let outcome = replace_toc_in_text(source, block);
        assert!(matches!(outcome, TocReplaceResult::NoMarkers));
    }

    #[test]
    fn smoke_replace_toc_updates_existing() {
        let source = "# Hello\n\n<!-- TOC -->\n- [Old](#old)\n<!-- /TOC -->\n\nContent.\n";
        let block = "<!-- TOC -->\n- [Hello](#hello)\n<!-- /TOC -->";
        match replace_toc_in_text(source, block) {
            TocReplaceResult::Updated(updated) => {
                assert!(updated.contains("- [Hello](#hello)"));
                assert!(!updated.contains("- [Old](#old)"));
                assert!(updated.contains("# Hello"));
                assert!(updated.contains("Content."));
            }
            other => panic!("expected Updated, got {:?}", other),
        }
    }

    #[test]
    fn smoke_replace_toc_no_change() {
        let block = "<!-- TOC -->\n- [Hello](#hello)\n<!-- /TOC -->";
        let source = format!("# Hello\n\n{}\n\nContent.\n", block);
        let outcome = replace_toc_in_text(&source, block);
        assert!(matches!(outcome, TocReplaceResult::NoChange));
    }

    /// A heading nested in a blockquote has to use up a slug counter slot, so
    /// that a top-level heading with the same text gets the suffix matching the
    /// `id` the renderer would give it.
    #[test]
    fn smoke_extract_toc_nested_heading_syncs_slug_counter() {
        // Document structure:
        //   > ## Introduction   ← inside blockquote, NOT in TOC output
        //   ## Introduction     ← top-level, slug must be "introduction-1"
        let quoted = Node {
            kind: NodeKind::Blockquote,
            span: None,
            children: vec![heading(2, "Introduction", None)],
        };
        let doc = document(vec![quoted, heading(2, "Introduction", None)]);

        let toc = extract_toc(&doc);
        // Only the top-level heading is listed.
        assert_eq!(toc.len(), 1);
        // The blockquote heading already consumed the un-suffixed slug.
        assert_eq!(toc[0].slug, "introduction-1");
    }
}
442
#[cfg(test)]
mod parse_roundtrip {
    /// Parses and renders a TOC block, then checks that the markers were not
    /// HTML-escaped into visible text.
    #[test]
    fn toc_block_renders_as_invisible_html_comments() {
        // The blank line ahead of the closing marker matters: without it the
        // block parser would treat <!-- /TOC --> as inline text of the list
        // instead of recognising it as an HTML comment.
        let source = "<!-- TOC -->\n- [Title](#title)\n  - [Sub](#sub)\n\n<!-- /TOC -->\n";
        let doc = crate::parser::parse(source).expect("parse failed");

        // Diagnostic output, useful when the assertion below fails.
        let node_kinds: Vec<_> = doc
            .children
            .iter()
            .map(|node| format!("{:?}", node.kind))
            .collect();
        eprintln!("Parsed nodes: {:?}", node_kinds);

        let options = crate::render::RenderOptions::default();
        let html = crate::render::render(&doc, &options).expect("render failed");
        eprintln!("HTML output:\n{}", html);

        // An escaped "<!" would mean a marker was emitted as text rather than
        // passed through as an HTML comment.
        assert!(
            !html.contains("&lt;!"),
            "markers were escaped as text, not passed through as HTML"
        );
    }
}
466}