Skip to main content

marco_core/intelligence/
toc.rs

1//! Table of Contents extraction and Markdown generation.
2//!
3//! # Workflow
4//! 1. Parse the document with `core::parse()`
5//! 2. Call [`extract_toc`] to get a flat list of [`TocEntry`] items
6//! 3. Call [`generate_toc_markdown`] to produce the fenced TOC block
7//! 4. Call [`replace_toc_in_text`] to update or insert the block in the source
8
9use crate::parser::{Document, NodeKind};
10use std::collections::HashMap;
11
/// One heading as it appears in the extracted Table of Contents.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct TocEntry {
    /// Heading level copied from the AST heading node.
    pub level: u8,
    /// Heading text; used verbatim as the link label in the generated block.
    pub text: String,
    /// Anchor the TOC link points at (explicit id or derived slug, de-duplicated).
    pub slug: String,
    /// 1-based source line number of the heading (from the AST span), or 0 when unknown.
    pub line: usize,
}
21
/// Result of attempting to replace an existing TOC block in source text.
///
/// Derives `Clone`, `PartialEq` and `Eq` (in addition to `Debug`) so that callers
/// and tests can compare outcomes directly with `==` / `assert_eq!` instead of
/// having to pattern-match every time.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TocReplaceResult {
    /// No `<!-- TOC -->` markers found; caller should insert the block at cursor.
    NoMarkers,
    /// Existing block is identical to the new one — no change needed.
    NoChange,
    /// Carries the full source text with the TOC block replaced.
    Updated(String),
}
32
/// Turn heading text into a GitHub-style URL slug.
///
/// The text is lowercased and then cut into runs of alphanumeric characters
/// (Unicode-aware, via [`char::is_alphanumeric`]).  Every other character —
/// spaces, hyphens, underscores, punctuation — acts purely as a separator.
/// The runs are joined with a single `-`, which means:
///
/// - consecutive separators never produce `--`
/// - the slug never starts or ends with `-`
/// - text without any alphanumeric character yields an empty string
pub fn heading_slug(text: &str) -> String {
    text.to_lowercase()
        .split(|ch: char| !ch.is_alphanumeric())
        // Adjacent / leading / trailing separators produce empty pieces; dropping
        // them is what collapses and trims the hyphens.
        .filter(|run| !run.is_empty())
        .collect::<Vec<_>>()
        .join("-")
}
65
66/// Extract all top-level headings from a document as a flat list of [`TocEntry`] items.
67///
68/// - Explicit `{#id}` from the AST `id` field is used as-is; otherwise the slug is
69///   derived from the heading text via [`heading_slug`].
70/// - Duplicate slugs receive `-1`, `-2`, … suffixes in document order.
71///
72/// **Slug parity with the renderer**: the renderer increments its own slug counter for
73/// *every* heading it encounters during the depth-first tree walk — including headings
74/// that live inside blockquotes, admonitions, etc.  This function replicates that walk
75/// so that the counters stay in sync.  Nested headings are not listed in the output
76/// (callers expect a "structural" TOC), but their slugs are still counted so that the
77/// numbers assigned to top-level headings match the `id` attributes in the rendered HTML.
78pub fn extract_toc(doc: &Document) -> Vec<TocEntry> {
79    let mut entries = Vec::new();
80    let mut slug_counts: HashMap<String, usize> = HashMap::new();
81    walk_toc_nodes(&doc.children, true, &mut entries, &mut slug_counts);
82    entries
83}
84
85/// Depth-first walk that mirrors the renderer's heading slug counter.
86///
87/// `include` is `true` only for top-level document children.  At deeper levels it is
88/// `false` so those headings affect the counter but are not added to `entries`.
89fn walk_toc_nodes(
90    nodes: &[crate::parser::Node],
91    include: bool,
92    entries: &mut Vec<TocEntry>,
93    slug_counts: &mut HashMap<String, usize>,
94) {
95    for node in nodes {
96        if let NodeKind::Heading { level, text, id } = &node.kind {
97            let base = id
98                .as_deref()
99                .map(|s| s.to_string())
100                .unwrap_or_else(|| heading_slug(text));
101
102            let count = slug_counts.entry(base.clone()).or_insert(0);
103            let slug = if *count == 0 {
104                base.clone()
105            } else {
106                format!("{}-{}", base, count)
107            };
108            *count += 1;
109
110            if include {
111                entries.push(TocEntry {
112                    level: *level,
113                    text: text.clone(),
114                    slug,
115                    line: node.span.map(|s| s.start.line).unwrap_or(0),
116                });
117            }
118        }
119
120        // Recurse into block containers (blockquotes, admonitions, list items, …).
121        // Their headings are counted for slug deduplication but not shown in the TOC.
122        if !node.children.is_empty() {
123            walk_toc_nodes(&node.children, false, entries, slug_counts);
124        }
125    }
126}
127
128/// Generate a Markdown TOC block wrapped in `<!-- TOC -->` / `<!-- /TOC -->` markers.
129///
130/// Returns an empty string when `entries` is empty.
131/// Indentation is relative to the minimum heading level present.
132pub fn generate_toc_markdown(entries: &[TocEntry]) -> String {
133    if entries.is_empty() {
134        return String::new();
135    }
136
137    let min_level = entries.iter().map(|e| e.level).min().unwrap_or(1);
138    let mut lines = vec!["<!-- TOC -->".to_string()];
139
140    for entry in entries {
141        let indent = "  ".repeat((entry.level - min_level) as usize);
142        lines.push(format!("{}- [{}](#{})", indent, entry.text, entry.slug));
143    }
144
145    lines.push(String::new()); // blank line terminates the list before the closing marker
146    lines.push("<!-- /TOC -->".to_string());
147    lines.join("\n")
148}
149
150/// Attempt to find and replace an existing `<!-- TOC -->...<!-- /TOC -->` block in
151/// `current_text` with `new_toc`.
152///
153/// Returns:
154/// - [`TocReplaceResult::NoMarkers`] when no markers were found
155/// - [`TocReplaceResult::NoChange`] when the existing block equals `new_toc`
156/// - [`TocReplaceResult::Updated`] with the new full source text otherwise
157pub fn replace_toc_in_text(current_text: &str, new_toc: &str) -> TocReplaceResult {
158    const START_MARKER: &str = "<!-- TOC -->";
159    const END_MARKER: &str = "<!-- /TOC -->";
160
161    let Some(start_pos) = current_text.find(START_MARKER) else {
162        return TocReplaceResult::NoMarkers;
163    };
164    let Some(end_pos) = current_text.find(END_MARKER) else {
165        return TocReplaceResult::NoMarkers;
166    };
167
168    if end_pos < start_pos {
169        return TocReplaceResult::NoMarkers;
170    }
171
172    let end_of_block = end_pos + END_MARKER.len();
173    let existing = &current_text[start_pos..end_of_block];
174
175    if existing == new_toc {
176        return TocReplaceResult::NoChange;
177    }
178
179    let mut result = String::with_capacity(current_text.len());
180    result.push_str(&current_text[..start_pos]);
181    result.push_str(new_toc);
182    result.push_str(&current_text[end_of_block..]);
183    TocReplaceResult::Updated(result)
184}
185
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::{Document, Node};

    /// Build a childless heading node with no span information.
    fn heading(level: u8, text: &str, id: Option<&str>) -> Node {
        Node {
            kind: NodeKind::Heading {
                level,
                text: text.to_string(),
                id: id.map(|s| s.to_string()),
            },
            span: None,
            children: vec![],
        }
    }

    /// Wrap `children` in a document, leaving every other field at its default.
    fn document(children: Vec<Node>) -> Document {
        Document {
            children,
            ..Default::default()
        }
    }

    /// Build a TOC entry with an unknown (0) source line.
    fn entry(level: u8, text: &str, slug: &str) -> TocEntry {
        TocEntry {
            level,
            text: text.to_string(),
            slug: slug.to_string(),
            line: 0,
        }
    }

    #[test]
    fn smoke_heading_slug_basic() {
        let cases = [
            ("Hello World", "hello-world"),
            ("Introduction", "introduction"),
            ("Getting Started Guide", "getting-started-guide"),
        ];
        for (input, expected) in cases {
            assert_eq!(heading_slug(input), expected);
        }
    }

    #[test]
    fn smoke_heading_slug_special_chars() {
        let cases = [
            ("Code <example> & test", "code-example-test"),
            ("A/B Testing", "a-b-testing"),
            ("Hello---World", "hello-world"),
        ];
        for (input, expected) in cases {
            assert_eq!(heading_slug(input), expected);
        }
    }

    #[test]
    fn smoke_heading_slug_empty() {
        for input in ["", "---", "!@#"] {
            assert_eq!(heading_slug(input), "");
        }
    }

    #[test]
    fn smoke_extract_toc_basic() {
        let doc = document(vec![
            heading(1, "Title", None),
            heading(2, "Getting Started", None),
            heading(2, "Installation", None),
        ]);

        let entries = extract_toc(&doc);
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].slug, "title");
        assert_eq!(entries[1].slug, "getting-started");
        assert_eq!(entries[2].slug, "installation");
    }

    #[test]
    fn smoke_extract_toc_explicit_id_wins() {
        let doc = document(vec![heading(2, "My Title", Some("custom-id"))]);

        let entries = extract_toc(&doc);
        assert_eq!(entries[0].slug, "custom-id");
    }

    #[test]
    fn smoke_extract_toc_duplicate_slugs() {
        let doc = document(vec![
            heading(2, "Introduction", None),
            heading(2, "Introduction", None),
            heading(2, "Introduction", None),
        ]);

        let entries = extract_toc(&doc);
        assert_eq!(entries[0].slug, "introduction");
        assert_eq!(entries[1].slug, "introduction-1");
        assert_eq!(entries[2].slug, "introduction-2");
    }

    #[test]
    fn smoke_generate_toc_markdown_basic() {
        let entries = vec![
            entry(1, "Title", "title"),
            entry(2, "Getting Started", "getting-started"),
            entry(3, "Installation", "installation"),
        ];

        let md = generate_toc_markdown(&entries);
        assert!(md.starts_with("<!-- TOC -->"));
        assert!(md.ends_with("<!-- /TOC -->"));
        assert!(md.contains("- [Title](#title)"));
        assert!(md.contains("  - [Getting Started](#getting-started)"));
        assert!(md.contains("    - [Installation](#installation)"));
    }

    #[test]
    fn smoke_generate_toc_markdown_empty() {
        assert_eq!(generate_toc_markdown(&[]), "");
    }

    #[test]
    fn smoke_replace_toc_no_markers() {
        let text = "# Hello\n\nSome content.\n";
        let toc = "<!-- TOC -->\n- [Hello](#hello)\n<!-- /TOC -->";
        let outcome = replace_toc_in_text(text, toc);
        assert!(matches!(outcome, TocReplaceResult::NoMarkers));
    }

    #[test]
    fn smoke_replace_toc_updates_existing() {
        let text = "# Hello\n\n<!-- TOC -->\n- [Old](#old)\n<!-- /TOC -->\n\nContent.\n";
        let new_toc = "<!-- TOC -->\n- [Hello](#hello)\n<!-- /TOC -->";

        let result = match replace_toc_in_text(text, new_toc) {
            TocReplaceResult::Updated(result) => result,
            other => panic!("expected Updated, got {:?}", other),
        };

        assert!(result.contains("- [Hello](#hello)"));
        assert!(!result.contains("- [Old](#old)"));
        assert!(result.contains("# Hello"));
        assert!(result.contains("Content."));
    }

    #[test]
    fn smoke_replace_toc_no_change() {
        let toc = "<!-- TOC -->\n- [Hello](#hello)\n<!-- /TOC -->";
        let text = format!("# Hello\n\n{}\n\nContent.\n", toc);
        let outcome = replace_toc_in_text(&text, toc);
        assert!(matches!(outcome, TocReplaceResult::NoChange));
    }

    /// A heading inside a blockquote must consume a slug counter slot so that a
    /// same-text heading at the top level receives the correct suffix — matching
    /// the `id` the renderer would assign to it.
    #[test]
    fn smoke_extract_toc_nested_heading_syncs_slug_counter() {
        // Document structure:
        //   > ## Introduction   ← inside blockquote, NOT in TOC output
        //   ## Introduction     ← top-level, slug must be "introduction-1"
        let blockquote_node = Node {
            kind: NodeKind::Blockquote,
            span: None,
            children: vec![heading(2, "Introduction", None)],
        };
        let doc = document(vec![blockquote_node, heading(2, "Introduction", None)]);

        let entries = extract_toc(&doc);
        // Only the top-level heading is in the TOC.
        assert_eq!(entries.len(), 1);
        // Its slug must be "introduction-1" (counter was consumed by the blockquote heading).
        assert_eq!(entries[0].slug, "introduction-1");
    }
}
439
#[cfg(test)]
mod parse_roundtrip {
    /// Parses and renders a generated-style TOC block and checks that neither
    /// marker ends up HTML-escaped in the output.
    #[test]
    fn toc_block_renders_as_invisible_html_comments() {
        // The blank line ahead of the closing marker is required: without it the
        // block parser treats <!-- /TOC --> as list inline text instead of an
        // HTML comment.
        let input = "<!-- TOC -->\n- [Title](#title)\n  - [Sub](#sub)\n\n<!-- /TOC -->\n";

        let doc = crate::parser::parse(input).expect("parse failed");
        let kinds = doc
            .children
            .iter()
            .map(|n| format!("{:?}", n.kind))
            .collect::<Vec<_>>();
        // Diagnostic output only; shown by the test harness when the test fails.
        eprintln!("Parsed nodes: {:?}", kinds);

        let options = crate::render::RenderOptions::default();
        let html = crate::render::render(&doc, &options).expect("render failed");
        eprintln!("HTML output:\n{}", html);

        // An escaped `<!` would mean a marker was emitted as visible text rather
        // than passed through as an HTML comment.
        assert!(
            !html.contains("&lt;!"),
            "markers were escaped as text, not passed through as HTML"
        );
    }
}