Skip to main content

terraphim_markdown_parser/
lib.rs

1//! Markdown parser for Terraphim knowledge-graph documents.
2//!
3//! Converts markdown files into typed [`terraphim_types::Document`] values,
4//! extracting titles, tags, and body text for indexing and search.
5use std::collections::HashSet;
6use std::ops::Range;
7use std::str::FromStr;
8
9use markdown::ParseOptions;
10use markdown::mdast::Node;
11use terraphim_types::{Document, DocumentType};
12use thiserror::Error;
13use ulid::Ulid;
14
15pub mod chunk;
16pub mod heading;
17
18pub use chunk::{ContentChunk, chunk_by_headings};
19pub use heading::{
20    HeadingNode, HeadingTree, MatchStrategy, SectionConfig, SectionPattern, SectionType,
21    build_heading_tree, classify_sections,
22};
23
24pub const TERRAPHIM_BLOCK_ID_PREFIX: &str = "terraphim:block-id:";
25
26/// Extract the first H1 heading from markdown content using AST parsing.
27///
28/// Returns the heading text with original case preserved, or `None` if no
29/// `# Heading` is found. Only matches depth-1 headings (`#`, not `##`).
30pub fn extract_first_heading(content: &str) -> Option<String> {
31    let ast = markdown::to_mdast(content, &ParseOptions::gfm()).ok()?;
32    find_first_h1(&ast)
33}
34
35/// Walk the AST to find the first depth-1 heading and collect its text content.
36fn find_first_h1(node: &Node) -> Option<String> {
37    match node {
38        Node::Heading(h) if h.depth == 1 => {
39            let text = collect_text_content(&h.children);
40            if text.is_empty() { None } else { Some(text) }
41        }
42        _ => {
43            if let Some(children) = children(node) {
44                for child in children {
45                    if let Some(heading) = find_first_h1(child) {
46                        return Some(heading);
47                    }
48                }
49            }
50            None
51        }
52    }
53}
54
55/// Recursively collect all text content from AST nodes.
56pub(crate) fn collect_text_content(nodes: &[Node]) -> String {
57    let mut text = String::new();
58    for node in nodes {
59        match node {
60            Node::Text(t) => text.push_str(&t.value),
61            Node::InlineCode(c) => text.push_str(&c.value),
62            other => {
63                if let Some(children) = children(other) {
64                    text.push_str(&collect_text_content(children));
65                }
66            }
67        }
68    }
69    text
70}
71
72#[derive(Debug, Clone, Copy, PartialEq, Eq)]
73pub enum BlockKind {
74    Paragraph,
75    ListItem,
76}
77
78#[derive(Debug, Clone, PartialEq, Eq)]
79pub struct Block {
80    pub id: Ulid,
81    pub kind: BlockKind,
82
83    /// Byte span of the block in the markdown buffer.
84    ///
85    /// For paragraphs, this includes the block-id comment line plus the paragraph content.
86    /// For list items, this includes the full list item (including nested content).
87    pub span: Range<usize>,
88
89    /// Byte span of the block-id anchor.
90    ///
91    /// For paragraphs, this is the full comment line (including any leading quote/indent prefix).
92    /// For list items, this is the inline HTML comment inside the list item’s first line.
93    pub id_span: Range<usize>,
94}
95
96#[derive(Debug, Clone)]
97pub struct NormalizedMarkdown {
98    pub markdown: String,
99    pub blocks: Vec<Block>,
100    pub ast: Option<markdown::mdast::Node>,
101}
102
103#[derive(Debug, Error)]
104pub enum MarkdownParserError {
105    #[error("failed to parse markdown: {0}")]
106    Markdown(String),
107
108    #[error("missing or invalid terraphim block id for {0:?} at byte offset {1}")]
109    MissingOrInvalidBlockId(BlockKind, usize),
110}
111
112impl From<markdown::message::Message> for MarkdownParserError {
113    fn from(value: markdown::message::Message) -> Self {
114        Self::Markdown(format!("{value:?}"))
115    }
116}
117
118#[derive(Debug, Clone)]
119struct Edit {
120    range: Range<usize>,
121    replacement: String,
122}
123
124impl Edit {
125    fn insert(at: usize, text: String) -> Self {
126        Self {
127            range: at..at,
128            replacement: text,
129        }
130    }
131}
132
133/// Ensure every list item and paragraph has a stable Terraphim block id.
134///
135/// Canonical forms:
136/// - Paragraph: `<!-- terraphim:block-id:<ULID> -->` on its own line immediately before the paragraph
137/// - List item: inline after the marker (and optional task checkbox), e.g. `- <!-- terraphim:block-id:<ULID> --> text`
138pub fn ensure_terraphim_block_ids(markdown: &str) -> Result<String, MarkdownParserError> {
139    let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
140    let mut edits: Vec<Edit> = Vec::new();
141    ensure_block_ids_in_children(&ast, markdown, &mut edits, ParentKind::Other);
142
143    if edits.is_empty() {
144        return Ok(markdown.to_string());
145    }
146
147    // Apply edits from the end of the buffer to the beginning so byte offsets stay valid.
148    edits.sort_by_key(|e| std::cmp::Reverse(e.range.start));
149    let mut out = markdown.to_string();
150    for edit in edits {
151        out.replace_range(edit.range, &edit.replacement);
152    }
153    Ok(out)
154}
155
156/// Normalize markdown into canonical Terraphim form and return the extracted blocks.
157///
158/// Uses an iterative fixed-point loop (up to 4 passes) because inserting inline
159/// block IDs into list items can change how the markdown crate re-parses the
160/// result, exposing adjacent paragraphs as root-level paragraphs without block
161/// IDs. Repeating until the output stabilizes ensures every block is tagged.
162pub fn normalize_markdown(markdown: &str) -> Result<NormalizedMarkdown, MarkdownParserError> {
163    let mut current = ensure_terraphim_block_ids(markdown)?;
164    for _ in 0..4 {
165        let next = ensure_terraphim_block_ids(&current)?;
166        if next == current {
167            break;
168        }
169        current = next;
170    }
171    let blocks = extract_blocks(&current)?;
172    let ast = markdown::to_mdast(&current, &ParseOptions::gfm()).ok();
173    Ok(NormalizedMarkdown {
174        markdown: current,
175        blocks,
176        ast,
177    })
178}
179
180/// Convert extracted blocks into Terraphim `Document`s so downstream graph tooling can be reused.
181pub fn blocks_to_documents(source_id: &str, normalized: &NormalizedMarkdown) -> Vec<Document> {
182    normalized
183        .blocks
184        .iter()
185        .map(|block| {
186            let block_id = block.id.to_string();
187            let id = format!("{source_id}#{block_id}");
188            let body = strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()])
189                .trim()
190                .to_string();
191            let title = first_nonempty_line(&body).unwrap_or_else(|| "Untitled".to_string());
192            Document {
193                id,
194                url: source_id.to_string(),
195                title,
196                body,
197                description: None,
198                summarization: None,
199                stub: None,
200                tags: None,
201                rank: None,
202                source_haystack: None,
203                doc_type: DocumentType::KgEntry,
204                synonyms: None,
205                route: None,
206                priority: None,
207                quality_score: None,
208            }
209        })
210        .collect()
211}
212
213#[derive(Debug, Clone, Copy, PartialEq, Eq)]
214enum ParentKind {
215    ListItem,
216    Other,
217}
218
219fn ensure_block_ids_in_children(
220    node: &Node,
221    source: &str,
222    edits: &mut Vec<Edit>,
223    parent: ParentKind,
224) {
225    match node {
226        Node::Root(root) => {
227            ensure_block_ids_in_list(&root.children, source, edits, ParentKind::Other)
228        }
229        Node::Blockquote(bq) => ensure_block_ids_in_list(&bq.children, source, edits, parent),
230        Node::List(list) => ensure_block_ids_in_list(&list.children, source, edits, parent),
231        Node::ListItem(li) => {
232            if let Some(pos) = node.position() {
233                ensure_list_item_inline_id(source, pos.start.offset, edits);
234            }
235            ensure_block_ids_in_list(&li.children, source, edits, ParentKind::ListItem);
236        }
237        _ => {
238            if let Some(children) = children(node) {
239                ensure_block_ids_in_list(children, source, edits, parent);
240            }
241        }
242    }
243}
244
245fn ensure_block_ids_in_list(
246    children: &[Node],
247    source: &str,
248    edits: &mut Vec<Edit>,
249    parent: ParentKind,
250) {
251    let mut first_direct_paragraph_in_list_item = false;
252
253    for (idx, child) in children.iter().enumerate() {
254        match child {
255            Node::ListItem(_) => ensure_block_ids_in_children(child, source, edits, parent),
256            Node::Paragraph(_) => {
257                // The first direct paragraph of a list item is considered owned by the list item’s
258                // inline block id, so we do not insert a separate comment line for it.
259                if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
260                    first_direct_paragraph_in_list_item = true;
261                } else if let Some(pos) = child.position() {
262                    let has_prev_block_id = idx
263                        .checked_sub(1)
264                        .and_then(|prev| parse_block_id_from_html_node(&children[prev]))
265                        .is_some();
266                    if !has_prev_block_id {
267                        edits.push(insert_paragraph_id_comment(source, pos.start.offset));
268                    }
269                }
270            }
271            _ => ensure_block_ids_in_children(child, source, edits, parent),
272        }
273    }
274}
275
276fn insert_paragraph_id_comment(source: &str, paragraph_start: usize) -> Edit {
277    let (line_start, prefix) = line_prefix_at(source, paragraph_start);
278    let id = Ulid::new();
279    Edit::insert(
280        line_start,
281        format!("{prefix}<!-- terraphim:block-id:{id} -->\n"),
282    )
283}
284
285fn ensure_list_item_inline_id(source: &str, list_item_start: usize, edits: &mut Vec<Edit>) {
286    let (line_start, line_end) = line_bounds_at(source, list_item_start);
287    let line = &source[line_start..line_end];
288
289    if let Some((comment_start, comment_end, parsed)) = find_inline_block_id_comment(line) {
290        if parsed.is_some() {
291            return;
292        }
293
294        // Replace invalid block id comment with a fresh one.
295        let replacement = format!("<!-- terraphim:block-id:{} -->", Ulid::new());
296        edits.push(Edit {
297            range: (line_start + comment_start)..(line_start + comment_end),
298            replacement,
299        });
300        return;
301    }
302
303    // No existing comment on the first line; insert it after the list marker and optional checkbox.
304    if let Some(insert_at) = list_item_inline_insert_point(source, list_item_start) {
305        let trailing_space = match source.as_bytes().get(insert_at) {
306            None | Some(b'\n') | Some(b'\r') => "",
307            _ => " ",
308        };
309        edits.push(Edit::insert(
310            insert_at,
311            format!(
312                "<!-- terraphim:block-id:{} -->{trailing_space}",
313                Ulid::new()
314            ),
315        ));
316    }
317}
318
319fn list_item_inline_insert_point(source: &str, list_item_start: usize) -> Option<usize> {
320    let bytes = source.as_bytes();
321    let mut i = list_item_start;
322
323    // Skip indentation and blockquote markers on this line (e.g. "> " prefixes).
324    // We only do a shallow pass to handle common cases like "> - item".
325    loop {
326        while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
327            i += 1;
328        }
329        if bytes.get(i..i + 2) == Some(b"> ") {
330            i += 2;
331            continue;
332        }
333        break;
334    }
335
336    // Unordered list marker
337    if matches!(bytes.get(i), Some(b'-' | b'*' | b'+')) {
338        i += 1;
339        if matches!(bytes.get(i), Some(b' ' | b'\t')) {
340            i += 1;
341        } else {
342            return None;
343        }
344    } else if matches!(bytes.get(i), Some(b'0'..=b'9')) {
345        // Ordered list marker: digits + '.' or ')' + whitespace
346        while matches!(bytes.get(i), Some(b'0'..=b'9')) {
347            i += 1;
348        }
349        if matches!(bytes.get(i), Some(b'.' | b')')) {
350            i += 1;
351        } else {
352            return None;
353        }
354        if matches!(bytes.get(i), Some(b' ' | b'\t')) {
355            i += 1;
356        } else {
357            return None;
358        }
359    } else {
360        return None;
361    }
362
363    // Optional task list checkbox: [ ] / [x] / [X]
364    if bytes.get(i) == Some(&b'[')
365        && matches!(bytes.get(i + 1), Some(b' ' | b'x' | b'X'))
366        && bytes.get(i + 2) == Some(&b']')
367        && matches!(bytes.get(i + 3), Some(b' ' | b'\t'))
368    {
369        i += 4;
370    }
371
372    Some(i)
373}
374
375fn extract_blocks(markdown: &str) -> Result<Vec<Block>, MarkdownParserError> {
376    let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
377    let mut blocks = Vec::new();
378    extract_blocks_from_children(&ast, markdown, &mut blocks, ParentKind::Other)?;
379
380    // Validate uniqueness: ids should be stable and non-duplicated.
381    let mut seen = HashSet::new();
382    for b in &blocks {
383        let id = b.id.to_string();
384        if !seen.insert(id) {
385            // If duplicates exist, it is safer to surface an error rather than silently re-ID.
386            return Err(MarkdownParserError::MissingOrInvalidBlockId(
387                b.kind,
388                b.span.start,
389            ));
390        }
391    }
392
393    Ok(blocks)
394}
395
396fn extract_blocks_from_children(
397    node: &Node,
398    source: &str,
399    blocks: &mut Vec<Block>,
400    parent: ParentKind,
401) -> Result<(), MarkdownParserError> {
402    match node {
403        Node::Root(root) => {
404            extract_blocks_from_list(&root.children, source, blocks, ParentKind::Other)?;
405        }
406        Node::Blockquote(bq) => {
407            extract_blocks_from_list(&bq.children, source, blocks, parent)?;
408        }
409        Node::List(list) => {
410            extract_blocks_from_list(&list.children, source, blocks, parent)?;
411        }
412        Node::ListItem(li) => {
413            let Some(pos) = node.position() else {
414                return Ok(());
415            };
416
417            let Some((id, id_span)) = extract_list_item_id(source, pos.start.offset) else {
418                return Err(MarkdownParserError::MissingOrInvalidBlockId(
419                    BlockKind::ListItem,
420                    pos.start.offset,
421                ));
422            };
423            let start = line_bounds_at(source, pos.start.offset).0;
424            let end = pos.end.offset;
425            blocks.push(Block {
426                id,
427                kind: BlockKind::ListItem,
428                span: start..end,
429                id_span,
430            });
431            extract_blocks_from_list(&li.children, source, blocks, ParentKind::ListItem)?;
432        }
433        _ => {
434            if let Some(children) = children(node) {
435                extract_blocks_from_list(children, source, blocks, parent)?;
436            }
437        }
438    }
439    Ok(())
440}
441
442fn extract_blocks_from_list(
443    children: &[Node],
444    source: &str,
445    blocks: &mut Vec<Block>,
446    parent: ParentKind,
447) -> Result<(), MarkdownParserError> {
448    let mut first_direct_paragraph_in_list_item = false;
449
450    for (idx, child) in children.iter().enumerate() {
451        match child {
452            Node::ListItem(_) => extract_blocks_from_children(child, source, blocks, parent)?,
453            Node::Paragraph(_) => {
454                if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
455                    first_direct_paragraph_in_list_item = true;
456                    continue;
457                }
458
459                let Some(pos) = child.position() else {
460                    continue;
461                };
462
463                let Some((id, anchor_span)) = idx
464                    .checked_sub(1)
465                    .and_then(|prev| {
466                        parse_block_id_from_html_node_with_span(source, &children[prev])
467                    })
468                    .and_then(|(id, span)| id.map(|id| (id, span)))
469                else {
470                    return Err(MarkdownParserError::MissingOrInvalidBlockId(
471                        BlockKind::Paragraph,
472                        pos.start.offset,
473                    ));
474                };
475
476                blocks.push(Block {
477                    id,
478                    kind: BlockKind::Paragraph,
479                    span: anchor_span.start..pos.end.offset,
480                    id_span: anchor_span,
481                })
482            }
483            _ => extract_blocks_from_children(child, source, blocks, parent)?,
484        }
485    }
486
487    Ok(())
488}
489
490fn extract_list_item_id(source: &str, list_item_start: usize) -> Option<(Ulid, Range<usize>)> {
491    let (line_start, line_end) = line_bounds_at(source, list_item_start);
492    let line = &source[line_start..line_end];
493    let (comment_start, comment_end, parsed) = find_inline_block_id_comment(line)?;
494    let id = parsed?;
495    Some((id, (line_start + comment_start)..(line_start + comment_end)))
496}
497
498fn parse_block_id_from_html_node(node: &Node) -> Option<Ulid> {
499    match node {
500        Node::Html(val) => parse_block_id_comment(&val.value),
501        _ => None,
502    }
503}
504
505fn parse_block_id_from_html_node_with_span(
506    source: &str,
507    node: &Node,
508) -> Option<(Option<Ulid>, Range<usize>)> {
509    let Node::Html(val) = node else { return None };
510    let id = parse_block_id_comment(&val.value);
511
512    let Some(pos) = node.position() else {
513        return Some((id, 0..0));
514    };
515
516    let (line_start, line_end) = line_bounds_at(source, pos.start.offset);
517    Some((id, line_start..line_end))
518}
519
520fn parse_block_id_comment(raw_html: &str) -> Option<Ulid> {
521    let html = raw_html.trim();
522    let inner = html
523        .strip_prefix("<!--")
524        .and_then(|s| s.strip_suffix("-->"))?;
525    let inner = inner.trim();
526    let id_str = inner.strip_prefix(TERRAPHIM_BLOCK_ID_PREFIX)?;
527    Ulid::from_str(id_str.trim()).ok()
528}
529
530fn find_inline_block_id_comment(line: &str) -> Option<(usize, usize, Option<Ulid>)> {
531    let start = line.find("<!--")?;
532    let marker = line[start..].find(TERRAPHIM_BLOCK_ID_PREFIX)? + start;
533    let end = line[marker..].find("-->")? + marker + 3;
534
535    let comment_start = start;
536    let comment_end = end;
537    let comment = &line[comment_start..comment_end];
538    Some((comment_start, comment_end, parse_block_id_comment(comment)))
539}
540
541fn line_bounds_at(source: &str, offset: usize) -> (usize, usize) {
542    let line_start = source[..offset].rfind('\n').map(|i| i + 1).unwrap_or(0);
543    let line_end = source[offset..]
544        .find('\n')
545        .map(|i| offset + i)
546        .unwrap_or_else(|| source.len());
547    (line_start, line_end)
548}
549
550fn line_prefix_at(source: &str, offset: usize) -> (usize, String) {
551    let (line_start, _line_end) = line_bounds_at(source, offset);
552    let prefix = &source[line_start..offset];
553    (line_start, prefix.to_string())
554}
555
556fn children(node: &Node) -> Option<&Vec<Node>> {
557    match node {
558        Node::Root(root) => Some(&root.children),
559        Node::Blockquote(bq) => Some(&bq.children),
560        Node::List(list) => Some(&list.children),
561        Node::ListItem(li) => Some(&li.children),
562        Node::Paragraph(p) => Some(&p.children),
563        Node::Heading(h) => Some(&h.children),
564        _ => None,
565    }
566}
567
568fn strip_terraphim_block_id_comments(text: &str) -> String {
569    let mut out = String::with_capacity(text.len());
570    for line in text.lines() {
571        let mut remaining = line;
572        let mut cleaned = String::new();
573        loop {
574            let Some((start, end, _)) = find_inline_block_id_comment(remaining) else {
575                cleaned.push_str(remaining);
576                break;
577            };
578            cleaned.push_str(&remaining[..start]);
579            remaining = &remaining[end..];
580        }
581
582        if cleaned.trim().is_empty() {
583            continue;
584        }
585
586        out.push_str(cleaned.trim_end());
587        out.push('\n')
588    }
589    out
590}
591
592fn first_nonempty_line(text: &str) -> Option<String> {
593    text.lines()
594        .map(|l| l.trim())
595        .find(|l| !l.is_empty())
596        .map(|l| l.chars().take(80).collect::<String>())
597}
598
599#[cfg(test)]
600mod tests {
601    use super::*;
602
603    fn count_block_ids(s: &str) -> usize {
604        s.lines()
605            .filter(|l| l.contains("<!-- terraphim:block-id:"))
606            .count()
607    }
608
609    #[test]
610    fn inserts_paragraph_ids() {
611        let input = "Hello world\n\nSecond paragraph\n";
612        let out = ensure_terraphim_block_ids(input).unwrap();
613        // 2 paragraphs => 2 id comment lines
614        assert_eq!(count_block_ids(&out), 2);
615        assert!(out.contains("Hello world"));
616        assert!(out.contains("Second paragraph"));
617    }
618
619    #[test]
620    fn inserts_list_item_inline_ids() {
621        let input = "- first\n- second\n";
622        let out = ensure_terraphim_block_ids(input).unwrap();
623        assert_eq!(count_block_ids(&out), 2);
624        assert!(out.contains("- <!-- terraphim:block-id:"));
625    }
626
627    #[test]
628    fn normalize_returns_blocks() {
629        let input = "- item\n\nPara\n";
630        let normalized = normalize_markdown(input).unwrap();
631        assert!(normalized.blocks.len() >= 2);
632    }
633
634    #[test]
635    fn extract_first_heading_h1() {
636        let input = "# Bun Package Manager\n\nsynonyms:: npm, yarn\n";
637        assert_eq!(
638            extract_first_heading(input),
639            Some("Bun Package Manager".to_string())
640        );
641    }
642
643    #[test]
644    fn extract_first_heading_skips_h2() {
645        let input = "## Not This\n\n# This One\n";
646        assert_eq!(extract_first_heading(input), Some("This One".to_string()));
647    }
648
649    #[test]
650    fn extract_first_heading_none_when_absent() {
651        let input = "Just some text\n\n## Only H2\n";
652        assert_eq!(extract_first_heading(input), None);
653    }
654
655    #[test]
656    fn extract_first_heading_with_inline_code() {
657        let input = "# The `bun` Runtime\n";
658        assert_eq!(
659            extract_first_heading(input),
660            Some("The bun Runtime".to_string())
661        );
662    }
663
664    #[test]
665    fn iterative_normalization_stabilizes() {
666        // After inserting inline block IDs into list items, the markdown parser
667        // may re-parse the result differently, exposing adjacent text as a new
668        // paragraph without a block ID. The iterative loop must handle this.
669        //
670        // This input has a list item immediately followed by a paragraph with
671        // no blank line separating them, which can trigger re-parsing shifts.
672        let input = "- item one\n- item two\n\nA paragraph after the list.\n\nAnother paragraph.\n";
673
674        // Single pass
675        let pass1 = ensure_terraphim_block_ids(input).unwrap();
676
677        // The iterative normalize_markdown must produce a stable result
678        let normalized = normalize_markdown(input).unwrap();
679
680        // Verify stability: running ensure_terraphim_block_ids again should
681        // produce the same output (the fixed-point was reached).
682        let again = ensure_terraphim_block_ids(&normalized.markdown).unwrap();
683        assert_eq!(
684            again, normalized.markdown,
685            "normalization should be stable after normalize_markdown"
686        );
687
688        // All blocks must have IDs: 2 list items + 2 paragraphs = 4
689        let id_count = normalized
690            .markdown
691            .lines()
692            .filter(|l| l.contains("<!-- terraphim:block-id:"))
693            .count();
694        assert!(
695            id_count >= 4,
696            "expected at least 4 block IDs, got {id_count}; pass1 had {} IDs",
697            count_block_ids(&pass1),
698        );
699    }
700}