terraphim_markdown_parser/
lib.rs

1use std::collections::HashSet;
2use std::ops::Range;
3use std::str::FromStr;
4
5use markdown::ParseOptions;
6use markdown::mdast::Node;
7use terraphim_types::{Document, DocumentType};
8use thiserror::Error;
9use ulid::Ulid;
10
11pub mod chunk;
12pub mod heading;
13
14pub use chunk::{ContentChunk, chunk_by_headings};
15pub use heading::{
16    HeadingNode, HeadingTree, MatchStrategy, SectionConfig, SectionPattern, SectionType,
17    build_heading_tree, classify_sections,
18};
19
20pub const TERRAPHIM_BLOCK_ID_PREFIX: &str = "terraphim:block-id:";
21
22/// Extract the first H1 heading from markdown content using AST parsing.
23///
24/// Returns the heading text with original case preserved, or `None` if no
25/// `# Heading` is found. Only matches depth-1 headings (`#`, not `##`).
26pub fn extract_first_heading(content: &str) -> Option<String> {
27    let ast = markdown::to_mdast(content, &ParseOptions::gfm()).ok()?;
28    find_first_h1(&ast)
29}
30
31/// Walk the AST to find the first depth-1 heading and collect its text content.
32fn find_first_h1(node: &Node) -> Option<String> {
33    match node {
34        Node::Heading(h) if h.depth == 1 => {
35            let text = collect_text_content(&h.children);
36            if text.is_empty() { None } else { Some(text) }
37        }
38        _ => {
39            if let Some(children) = children(node) {
40                for child in children {
41                    if let Some(heading) = find_first_h1(child) {
42                        return Some(heading);
43                    }
44                }
45            }
46            None
47        }
48    }
49}
50
51/// Recursively collect all text content from AST nodes.
52pub(crate) fn collect_text_content(nodes: &[Node]) -> String {
53    let mut text = String::new();
54    for node in nodes {
55        match node {
56            Node::Text(t) => text.push_str(&t.value),
57            Node::InlineCode(c) => text.push_str(&c.value),
58            other => {
59                if let Some(children) = children(other) {
60                    text.push_str(&collect_text_content(children));
61                }
62            }
63        }
64    }
65    text
66}
67
68#[derive(Debug, Clone, Copy, PartialEq, Eq)]
69pub enum BlockKind {
70    Paragraph,
71    ListItem,
72}
73
74#[derive(Debug, Clone, PartialEq, Eq)]
75pub struct Block {
76    pub id: Ulid,
77    pub kind: BlockKind,
78
79    /// Byte span of the block in the markdown buffer.
80    ///
81    /// For paragraphs, this includes the block-id comment line plus the paragraph content.
82    /// For list items, this includes the full list item (including nested content).
83    pub span: Range<usize>,
84
85    /// Byte span of the block-id anchor.
86    ///
87    /// For paragraphs, this is the full comment line (including any leading quote/indent prefix).
88    /// For list items, this is the inline HTML comment inside the list item’s first line.
89    pub id_span: Range<usize>,
90}
91
92#[derive(Debug, Clone)]
93pub struct NormalizedMarkdown {
94    pub markdown: String,
95    pub blocks: Vec<Block>,
96    pub ast: Option<markdown::mdast::Node>,
97}
98
99#[derive(Debug, Error)]
100pub enum MarkdownParserError {
101    #[error("failed to parse markdown: {0}")]
102    Markdown(String),
103
104    #[error("missing or invalid terraphim block id for {0:?} at byte offset {1}")]
105    MissingOrInvalidBlockId(BlockKind, usize),
106}
107
108impl From<markdown::message::Message> for MarkdownParserError {
109    fn from(value: markdown::message::Message) -> Self {
110        Self::Markdown(format!("{value:?}"))
111    }
112}
113
114#[derive(Debug, Clone)]
115struct Edit {
116    range: Range<usize>,
117    replacement: String,
118}
119
120impl Edit {
121    fn insert(at: usize, text: String) -> Self {
122        Self {
123            range: at..at,
124            replacement: text,
125        }
126    }
127}
128
129/// Ensure every list item and paragraph has a stable Terraphim block id.
130///
131/// Canonical forms:
132/// - Paragraph: `<!-- terraphim:block-id:<ULID> -->` on its own line immediately before the paragraph
133/// - List item: inline after the marker (and optional task checkbox), e.g. `- <!-- terraphim:block-id:<ULID> --> text`
134pub fn ensure_terraphim_block_ids(markdown: &str) -> Result<String, MarkdownParserError> {
135    let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
136    let mut edits: Vec<Edit> = Vec::new();
137    ensure_block_ids_in_children(&ast, markdown, &mut edits, ParentKind::Other);
138
139    if edits.is_empty() {
140        return Ok(markdown.to_string());
141    }
142
143    // Apply edits from the end of the buffer to the beginning so byte offsets stay valid.
144    edits.sort_by_key(|e| std::cmp::Reverse(e.range.start));
145    let mut out = markdown.to_string();
146    for edit in edits {
147        out.replace_range(edit.range, &edit.replacement);
148    }
149    Ok(out)
150}
151
152/// Normalize markdown into canonical Terraphim form and return the extracted blocks.
153///
154/// Uses an iterative fixed-point loop (up to 4 passes) because inserting inline
155/// block IDs into list items can change how the markdown crate re-parses the
156/// result, exposing adjacent paragraphs as root-level paragraphs without block
157/// IDs. Repeating until the output stabilizes ensures every block is tagged.
158pub fn normalize_markdown(markdown: &str) -> Result<NormalizedMarkdown, MarkdownParserError> {
159    let mut current = ensure_terraphim_block_ids(markdown)?;
160    for _ in 0..4 {
161        let next = ensure_terraphim_block_ids(&current)?;
162        if next == current {
163            break;
164        }
165        current = next;
166    }
167    let blocks = extract_blocks(&current)?;
168    let ast = markdown::to_mdast(&current, &ParseOptions::gfm()).ok();
169    Ok(NormalizedMarkdown {
170        markdown: current,
171        blocks,
172        ast,
173    })
174}
175
176/// Convert extracted blocks into Terraphim `Document`s so downstream graph tooling can be reused.
177pub fn blocks_to_documents(source_id: &str, normalized: &NormalizedMarkdown) -> Vec<Document> {
178    normalized
179        .blocks
180        .iter()
181        .map(|block| {
182            let block_id = block.id.to_string();
183            let id = format!("{source_id}#{block_id}");
184            let body = strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()])
185                .trim()
186                .to_string();
187            let title = first_nonempty_line(&body).unwrap_or_else(|| "Untitled".to_string());
188            Document {
189                id,
190                url: source_id.to_string(),
191                title,
192                body,
193                description: None,
194                summarization: None,
195                stub: None,
196                tags: None,
197                rank: None,
198                source_haystack: None,
199                doc_type: DocumentType::KgEntry,
200                synonyms: None,
201                route: None,
202                priority: None,
203            }
204        })
205        .collect()
206}
207
208#[derive(Debug, Clone, Copy, PartialEq, Eq)]
209enum ParentKind {
210    ListItem,
211    Other,
212}
213
214fn ensure_block_ids_in_children(
215    node: &Node,
216    source: &str,
217    edits: &mut Vec<Edit>,
218    parent: ParentKind,
219) {
220    match node {
221        Node::Root(root) => {
222            ensure_block_ids_in_list(&root.children, source, edits, ParentKind::Other)
223        }
224        Node::Blockquote(bq) => ensure_block_ids_in_list(&bq.children, source, edits, parent),
225        Node::List(list) => ensure_block_ids_in_list(&list.children, source, edits, parent),
226        Node::ListItem(li) => {
227            if let Some(pos) = node.position() {
228                ensure_list_item_inline_id(source, pos.start.offset, edits);
229            }
230            ensure_block_ids_in_list(&li.children, source, edits, ParentKind::ListItem);
231        }
232        _ => {
233            if let Some(children) = children(node) {
234                ensure_block_ids_in_list(children, source, edits, parent);
235            }
236        }
237    }
238}
239
240fn ensure_block_ids_in_list(
241    children: &[Node],
242    source: &str,
243    edits: &mut Vec<Edit>,
244    parent: ParentKind,
245) {
246    let mut first_direct_paragraph_in_list_item = false;
247
248    for (idx, child) in children.iter().enumerate() {
249        match child {
250            Node::ListItem(_) => ensure_block_ids_in_children(child, source, edits, parent),
251            Node::Paragraph(_) => {
252                // The first direct paragraph of a list item is considered owned by the list item’s
253                // inline block id, so we do not insert a separate comment line for it.
254                if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
255                    first_direct_paragraph_in_list_item = true;
256                } else if let Some(pos) = child.position() {
257                    let has_prev_block_id = idx
258                        .checked_sub(1)
259                        .and_then(|prev| parse_block_id_from_html_node(&children[prev]))
260                        .is_some();
261                    if !has_prev_block_id {
262                        edits.push(insert_paragraph_id_comment(source, pos.start.offset));
263                    }
264                }
265            }
266            _ => ensure_block_ids_in_children(child, source, edits, parent),
267        }
268    }
269}
270
271fn insert_paragraph_id_comment(source: &str, paragraph_start: usize) -> Edit {
272    let (line_start, prefix) = line_prefix_at(source, paragraph_start);
273    let id = Ulid::new();
274    Edit::insert(
275        line_start,
276        format!("{prefix}<!-- terraphim:block-id:{id} -->\n"),
277    )
278}
279
280fn ensure_list_item_inline_id(source: &str, list_item_start: usize, edits: &mut Vec<Edit>) {
281    let (line_start, line_end) = line_bounds_at(source, list_item_start);
282    let line = &source[line_start..line_end];
283
284    if let Some((comment_start, comment_end, parsed)) = find_inline_block_id_comment(line) {
285        if parsed.is_some() {
286            return;
287        }
288
289        // Replace invalid block id comment with a fresh one.
290        let replacement = format!("<!-- terraphim:block-id:{} -->", Ulid::new());
291        edits.push(Edit {
292            range: (line_start + comment_start)..(line_start + comment_end),
293            replacement,
294        });
295        return;
296    }
297
298    // No existing comment on the first line; insert it after the list marker and optional checkbox.
299    if let Some(insert_at) = list_item_inline_insert_point(source, list_item_start) {
300        let trailing_space = match source.as_bytes().get(insert_at) {
301            None | Some(b'\n') | Some(b'\r') => "",
302            _ => " ",
303        };
304        edits.push(Edit::insert(
305            insert_at,
306            format!(
307                "<!-- terraphim:block-id:{} -->{trailing_space}",
308                Ulid::new()
309            ),
310        ));
311    }
312}
313
314fn list_item_inline_insert_point(source: &str, list_item_start: usize) -> Option<usize> {
315    let bytes = source.as_bytes();
316    let mut i = list_item_start;
317
318    // Skip indentation and blockquote markers on this line (e.g. "> " prefixes).
319    // We only do a shallow pass to handle common cases like "> - item".
320    loop {
321        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
322            i += 1;
323        }
324        if bytes.get(i..i + 2) == Some(b"> ") {
325            i += 2;
326            continue;
327        }
328        break;
329    }
330
331    // Unordered list marker
332    if matches!(bytes.get(i), Some(b'-' | b'*' | b'+')) {
333        i += 1;
334        if matches!(bytes.get(i), Some(b' ' | b'\t')) {
335            i += 1;
336        } else {
337            return None;
338        }
339    } else if matches!(bytes.get(i), Some(b'0'..=b'9')) {
340        // Ordered list marker: digits + '.' or ')' + whitespace
341        while matches!(bytes.get(i), Some(b'0'..=b'9')) {
342            i += 1;
343        }
344        if matches!(bytes.get(i), Some(b'.' | b')')) {
345            i += 1;
346        } else {
347            return None;
348        }
349        if matches!(bytes.get(i), Some(b' ' | b'\t')) {
350            i += 1;
351        } else {
352            return None;
353        }
354    } else {
355        return None;
356    }
357
358    // Optional task list checkbox: [ ] / [x] / [X]
359    if bytes.get(i) == Some(&b'[')
360        && matches!(bytes.get(i + 1), Some(b' ' | b'x' | b'X'))
361        && bytes.get(i + 2) == Some(&b']')
362        && matches!(bytes.get(i + 3), Some(b' ' | b'\t'))
363    {
364        i += 4;
365    }
366
367    Some(i)
368}
369
370fn extract_blocks(markdown: &str) -> Result<Vec<Block>, MarkdownParserError> {
371    let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
372    let mut blocks = Vec::new();
373    extract_blocks_from_children(&ast, markdown, &mut blocks, ParentKind::Other)?;
374
375    // Validate uniqueness: ids should be stable and non-duplicated.
376    let mut seen = HashSet::new();
377    for b in &blocks {
378        let id = b.id.to_string();
379        if !seen.insert(id) {
380            // If duplicates exist, it is safer to surface an error rather than silently re-ID.
381            return Err(MarkdownParserError::MissingOrInvalidBlockId(
382                b.kind,
383                b.span.start,
384            ));
385        }
386    }
387
388    Ok(blocks)
389}
390
391fn extract_blocks_from_children(
392    node: &Node,
393    source: &str,
394    blocks: &mut Vec<Block>,
395    parent: ParentKind,
396) -> Result<(), MarkdownParserError> {
397    match node {
398        Node::Root(root) => {
399            extract_blocks_from_list(&root.children, source, blocks, ParentKind::Other)?;
400        }
401        Node::Blockquote(bq) => {
402            extract_blocks_from_list(&bq.children, source, blocks, parent)?;
403        }
404        Node::List(list) => {
405            extract_blocks_from_list(&list.children, source, blocks, parent)?;
406        }
407        Node::ListItem(li) => {
408            let Some(pos) = node.position() else {
409                return Ok(());
410            };
411
412            let Some((id, id_span)) = extract_list_item_id(source, pos.start.offset) else {
413                return Err(MarkdownParserError::MissingOrInvalidBlockId(
414                    BlockKind::ListItem,
415                    pos.start.offset,
416                ));
417            };
418            let start = line_bounds_at(source, pos.start.offset).0;
419            let end = pos.end.offset;
420            blocks.push(Block {
421                id,
422                kind: BlockKind::ListItem,
423                span: start..end,
424                id_span,
425            });
426            extract_blocks_from_list(&li.children, source, blocks, ParentKind::ListItem)?;
427        }
428        _ => {
429            if let Some(children) = children(node) {
430                extract_blocks_from_list(children, source, blocks, parent)?;
431            }
432        }
433    }
434    Ok(())
435}
436
437fn extract_blocks_from_list(
438    children: &[Node],
439    source: &str,
440    blocks: &mut Vec<Block>,
441    parent: ParentKind,
442) -> Result<(), MarkdownParserError> {
443    let mut first_direct_paragraph_in_list_item = false;
444
445    for (idx, child) in children.iter().enumerate() {
446        match child {
447            Node::ListItem(_) => extract_blocks_from_children(child, source, blocks, parent)?,
448            Node::Paragraph(_) => {
449                if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
450                    first_direct_paragraph_in_list_item = true;
451                    continue;
452                }
453
454                let Some(pos) = child.position() else {
455                    continue;
456                };
457
458                let Some((id, anchor_span)) = idx
459                    .checked_sub(1)
460                    .and_then(|prev| {
461                        parse_block_id_from_html_node_with_span(source, &children[prev])
462                    })
463                    .and_then(|(id, span)| id.map(|id| (id, span)))
464                else {
465                    return Err(MarkdownParserError::MissingOrInvalidBlockId(
466                        BlockKind::Paragraph,
467                        pos.start.offset,
468                    ));
469                };
470
471                blocks.push(Block {
472                    id,
473                    kind: BlockKind::Paragraph,
474                    span: anchor_span.start..pos.end.offset,
475                    id_span: anchor_span,
476                })
477            }
478            _ => extract_blocks_from_children(child, source, blocks, parent)?,
479        }
480    }
481
482    Ok(())
483}
484
485fn extract_list_item_id(source: &str, list_item_start: usize) -> Option<(Ulid, Range<usize>)> {
486    let (line_start, line_end) = line_bounds_at(source, list_item_start);
487    let line = &source[line_start..line_end];
488    let (comment_start, comment_end, parsed) = find_inline_block_id_comment(line)?;
489    let id = parsed?;
490    Some((id, (line_start + comment_start)..(line_start + comment_end)))
491}
492
493fn parse_block_id_from_html_node(node: &Node) -> Option<Ulid> {
494    match node {
495        Node::Html(val) => parse_block_id_comment(&val.value),
496        _ => None,
497    }
498}
499
500fn parse_block_id_from_html_node_with_span(
501    source: &str,
502    node: &Node,
503) -> Option<(Option<Ulid>, Range<usize>)> {
504    let Node::Html(val) = node else { return None };
505    let id = parse_block_id_comment(&val.value);
506
507    let Some(pos) = node.position() else {
508        return Some((id, 0..0));
509    };
510
511    let (line_start, line_end) = line_bounds_at(source, pos.start.offset);
512    Some((id, line_start..line_end))
513}
514
515fn parse_block_id_comment(raw_html: &str) -> Option<Ulid> {
516    let html = raw_html.trim();
517    let inner = html
518        .strip_prefix("<!--")
519        .and_then(|s| s.strip_suffix("-->"))?;
520    let inner = inner.trim();
521    let id_str = inner.strip_prefix(TERRAPHIM_BLOCK_ID_PREFIX)?;
522    Ulid::from_str(id_str.trim()).ok()
523}
524
525fn find_inline_block_id_comment(line: &str) -> Option<(usize, usize, Option<Ulid>)> {
526    let start = line.find("<!--")?;
527    let marker = line[start..].find(TERRAPHIM_BLOCK_ID_PREFIX)? + start;
528    let end = line[marker..].find("-->")? + marker + 3;
529
530    let comment_start = start;
531    let comment_end = end;
532    let comment = &line[comment_start..comment_end];
533    Some((comment_start, comment_end, parse_block_id_comment(comment)))
534}
535
536fn line_bounds_at(source: &str, offset: usize) -> (usize, usize) {
537    let line_start = source[..offset].rfind('\n').map(|i| i + 1).unwrap_or(0);
538    let line_end = source[offset..]
539        .find('\n')
540        .map(|i| offset + i)
541        .unwrap_or_else(|| source.len());
542    (line_start, line_end)
543}
544
545fn line_prefix_at(source: &str, offset: usize) -> (usize, String) {
546    let (line_start, _line_end) = line_bounds_at(source, offset);
547    let prefix = &source[line_start..offset];
548    (line_start, prefix.to_string())
549}
550
551fn children(node: &Node) -> Option<&Vec<Node>> {
552    match node {
553        Node::Root(root) => Some(&root.children),
554        Node::Blockquote(bq) => Some(&bq.children),
555        Node::List(list) => Some(&list.children),
556        Node::ListItem(li) => Some(&li.children),
557        Node::Paragraph(p) => Some(&p.children),
558        Node::Heading(h) => Some(&h.children),
559        _ => None,
560    }
561}
562
563fn strip_terraphim_block_id_comments(text: &str) -> String {
564    let mut out = String::with_capacity(text.len());
565    for line in text.lines() {
566        let mut remaining = line;
567        let mut cleaned = String::new();
568        loop {
569            let Some((start, end, _)) = find_inline_block_id_comment(remaining) else {
570                cleaned.push_str(remaining);
571                break;
572            };
573            cleaned.push_str(&remaining[..start]);
574            remaining = &remaining[end..];
575        }
576
577        if cleaned.trim().is_empty() {
578            continue;
579        }
580
581        out.push_str(cleaned.trim_end());
582        out.push('\n')
583    }
584    out
585}
586
587fn first_nonempty_line(text: &str) -> Option<String> {
588    text.lines()
589        .map(|l| l.trim())
590        .find(|l| !l.is_empty())
591        .map(|l| l.chars().take(80).collect::<String>())
592}
593
594#[cfg(test)]
595mod tests {
596    use super::*;
597
598    fn count_block_ids(s: &str) -> usize {
599        s.lines()
600            .filter(|l| l.contains("<!-- terraphim:block-id:"))
601            .count()
602    }
603
604    #[test]
605    fn inserts_paragraph_ids() {
606        let input = "Hello world\n\nSecond paragraph\n";
607        let out = ensure_terraphim_block_ids(input).unwrap();
608        // 2 paragraphs => 2 id comment lines
609        assert_eq!(count_block_ids(&out), 2);
610        assert!(out.contains("Hello world"));
611        assert!(out.contains("Second paragraph"));
612    }
613
614    #[test]
615    fn inserts_list_item_inline_ids() {
616        let input = "- first\n- second\n";
617        let out = ensure_terraphim_block_ids(input).unwrap();
618        assert_eq!(count_block_ids(&out), 2);
619        assert!(out.contains("- <!-- terraphim:block-id:"));
620    }
621
622    #[test]
623    fn normalize_returns_blocks() {
624        let input = "- item\n\nPara\n";
625        let normalized = normalize_markdown(input).unwrap();
626        assert!(normalized.blocks.len() >= 2);
627    }
628
629    #[test]
630    fn extract_first_heading_h1() {
631        let input = "# Bun Package Manager\n\nsynonyms:: npm, yarn\n";
632        assert_eq!(
633            extract_first_heading(input),
634            Some("Bun Package Manager".to_string())
635        );
636    }
637
638    #[test]
639    fn extract_first_heading_skips_h2() {
640        let input = "## Not This\n\n# This One\n";
641        assert_eq!(extract_first_heading(input), Some("This One".to_string()));
642    }
643
644    #[test]
645    fn extract_first_heading_none_when_absent() {
646        let input = "Just some text\n\n## Only H2\n";
647        assert_eq!(extract_first_heading(input), None);
648    }
649
650    #[test]
651    fn extract_first_heading_with_inline_code() {
652        let input = "# The `bun` Runtime\n";
653        assert_eq!(
654            extract_first_heading(input),
655            Some("The bun Runtime".to_string())
656        );
657    }
658
659    #[test]
660    fn iterative_normalization_stabilizes() {
661        // After inserting inline block IDs into list items, the markdown parser
662        // may re-parse the result differently, exposing adjacent text as a new
663        // paragraph without a block ID. The iterative loop must handle this.
664        //
665        // This input has a list item immediately followed by a paragraph with
666        // no blank line separating them, which can trigger re-parsing shifts.
667        let input = "- item one\n- item two\n\nA paragraph after the list.\n\nAnother paragraph.\n";
668
669        // Single pass
670        let pass1 = ensure_terraphim_block_ids(input).unwrap();
671
672        // The iterative normalize_markdown must produce a stable result
673        let normalized = normalize_markdown(input).unwrap();
674
675        // Verify stability: running ensure_terraphim_block_ids again should
676        // produce the same output (the fixed-point was reached).
677        let again = ensure_terraphim_block_ids(&normalized.markdown).unwrap();
678        assert_eq!(
679            again, normalized.markdown,
680            "normalization should be stable after normalize_markdown"
681        );
682
683        // All blocks must have IDs: 2 list items + 2 paragraphs = 4
684        let id_count = normalized
685            .markdown
686            .lines()
687            .filter(|l| l.contains("<!-- terraphim:block-id:"))
688            .count();
689        assert!(
690            id_count >= 4,
691            "expected at least 4 block IDs, got {id_count}; pass1 had {} IDs",
692            count_block_ids(&pass1),
693        );
694    }
695}
terraphim_markdown_parser/lib.rs

terraphim_markdown_parser/
lib.rs