Skip to main content

terraphim_markdown_parser/
lib.rs

1use std::collections::HashSet;
2use std::ops::Range;
3use std::str::FromStr;
4
5use markdown::ParseOptions;
6use markdown::mdast::Node;
7use terraphim_types::{Document, DocumentType};
8use thiserror::Error;
9use ulid::Ulid;
10
/// Marker text that precedes the ULID inside a Terraphim block-id HTML comment,
/// i.e. the comment body has the form `terraphim:block-id:<ULID>`.
pub const TERRAPHIM_BLOCK_ID_PREFIX: &str = "terraphim:block-id:";
12
13/// Extract the first H1 heading from markdown content using AST parsing.
14///
15/// Returns the heading text with original case preserved, or `None` if no
16/// `# Heading` is found. Only matches depth-1 headings (`#`, not `##`).
17pub fn extract_first_heading(content: &str) -> Option<String> {
18    let ast = markdown::to_mdast(content, &ParseOptions::gfm()).ok()?;
19    find_first_h1(&ast)
20}
21
22/// Walk the AST to find the first depth-1 heading and collect its text content.
23fn find_first_h1(node: &Node) -> Option<String> {
24    match node {
25        Node::Heading(h) if h.depth == 1 => {
26            let text = collect_text_content(&h.children);
27            if text.is_empty() { None } else { Some(text) }
28        }
29        _ => {
30            if let Some(children) = children(node) {
31                for child in children {
32                    if let Some(heading) = find_first_h1(child) {
33                        return Some(heading);
34                    }
35                }
36            }
37            None
38        }
39    }
40}
41
42/// Recursively collect all text content from AST nodes.
43fn collect_text_content(nodes: &[Node]) -> String {
44    let mut text = String::new();
45    for node in nodes {
46        match node {
47            Node::Text(t) => text.push_str(&t.value),
48            Node::InlineCode(c) => text.push_str(&c.value),
49            other => {
50                if let Some(children) = children(other) {
51                    text.push_str(&collect_text_content(children));
52                }
53            }
54        }
55    }
56    text
57}
58
/// The kind of markdown construct a [`Block`] was extracted from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// A paragraph anchored by a block-id comment on the line before it.
    Paragraph,
    /// A list item anchored by an inline block-id comment on its first line.
    ListItem,
}
64
/// A paragraph or list item identified by a Terraphim block id, located by byte
/// spans within the normalized markdown buffer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Block {
    /// The ULID parsed from the block's `terraphim:block-id` comment.
    pub id: Ulid,
    /// Whether this block is a paragraph or a list item.
    pub kind: BlockKind,

    /// Byte span of the block in the markdown buffer.
    ///
    /// For paragraphs, this includes the block-id comment line plus the paragraph content.
    /// For list items, this includes the full list item (including nested content).
    pub span: Range<usize>,

    /// Byte span of the block-id anchor.
    ///
    /// For paragraphs, this is the full comment line (including any leading quote/indent prefix).
    /// For list items, this is the inline HTML comment inside the list item's first line.
    pub id_span: Range<usize>,
}
82
/// Result of [`normalize_markdown`]: the markdown text with block ids ensured,
/// together with the blocks extracted from that text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedMarkdown {
    /// The markdown after block-id comments have been inserted or repaired.
    pub markdown: String,
    /// Blocks found in `markdown`; their spans are byte ranges into `markdown`.
    pub blocks: Vec<Block>,
}
88
/// Errors produced while normalizing markdown or extracting blocks from it.
#[derive(Debug, Error)]
pub enum MarkdownParserError {
    /// The underlying markdown parser rejected the input; the payload is the
    /// debug rendering of the parser's message.
    #[error("failed to parse markdown: {0}")]
    Markdown(String),

    /// A block of the given kind, starting at the given byte offset, has no
    /// usable block id. Block extraction also reports this variant when the same
    /// id appears on more than one block.
    #[error("missing or invalid terraphim block id for {0:?} at byte offset {1}")]
    MissingOrInvalidBlockId(BlockKind, usize),
}
97
98impl From<markdown::message::Message> for MarkdownParserError {
99    fn from(value: markdown::message::Message) -> Self {
100        Self::Markdown(format!("{value:?}"))
101    }
102}
103
/// A pending change to the markdown buffer: the bytes in `range` are to be
/// replaced by `replacement`.
#[derive(Debug, Clone)]
struct Edit {
    /// Byte range of the original buffer that is replaced.
    range: Range<usize>,
    /// Text that takes the place of `range`.
    replacement: String,
}

impl Edit {
    /// Build a pure insertion of `text` at byte offset `at` (an empty range).
    fn insert(at: usize, text: String) -> Self {
        Edit {
            range: Range { start: at, end: at },
            replacement: text,
        }
    }
}
118
119/// Ensure every list item and paragraph has a stable Terraphim block id.
120///
121/// Canonical forms:
122/// - Paragraph: `<!-- terraphim:block-id:<ULID> -->` on its own line immediately before the paragraph
123/// - List item: inline after the marker (and optional task checkbox), e.g. `- <!-- terraphim:block-id:<ULID> --> text`
124pub fn ensure_terraphim_block_ids(markdown: &str) -> Result<String, MarkdownParserError> {
125    let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
126    let mut edits: Vec<Edit> = Vec::new();
127    ensure_block_ids_in_children(&ast, markdown, &mut edits, ParentKind::Other);
128
129    if edits.is_empty() {
130        return Ok(markdown.to_string());
131    }
132
133    // Apply edits from the end of the buffer to the beginning so byte offsets stay valid.
134    edits.sort_by(|a, b| b.range.start.cmp(&a.range.start));
135    let mut out = markdown.to_string();
136    for edit in edits {
137        out.replace_range(edit.range, &edit.replacement);
138    }
139    Ok(out)
140}
141
142/// Normalize markdown into canonical Terraphim form and return the extracted blocks.
143pub fn normalize_markdown(markdown: &str) -> Result<NormalizedMarkdown, MarkdownParserError> {
144    let normalized = ensure_terraphim_block_ids(markdown)?;
145    let blocks = extract_blocks(&normalized)?;
146    Ok(NormalizedMarkdown {
147        markdown: normalized,
148        blocks,
149    })
150}
151
152/// Convert extracted blocks into Terraphim `Document`s so downstream graph tooling can be reused.
153pub fn blocks_to_documents(source_id: &str, normalized: &NormalizedMarkdown) -> Vec<Document> {
154    normalized
155        .blocks
156        .iter()
157        .map(|block| {
158            let block_id = block.id.to_string();
159            let id = format!("{source_id}#{block_id}");
160            let body = strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()])
161                .trim()
162                .to_string();
163            let title = first_nonempty_line(&body).unwrap_or_else(|| "Untitled".to_string());
164            Document {
165                id,
166                url: source_id.to_string(),
167                title,
168                body,
169                description: None,
170                summarization: None,
171                stub: None,
172                tags: None,
173                rank: None,
174                source_haystack: None,
175                doc_type: DocumentType::KgEntry,
176                synonyms: None,
177                route: None,
178                priority: None,
179            }
180        })
181        .collect()
182}
183
/// Kind of container whose children are being walked; used to recognise the
/// first direct paragraph of a list item, which is covered by the list item's
/// inline block id.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParentKind {
    /// The children belong directly to a list item.
    ListItem,
    /// Any other container (root, blockquote, list, ...).
    Other,
}
189
190fn ensure_block_ids_in_children(
191    node: &Node,
192    source: &str,
193    edits: &mut Vec<Edit>,
194    parent: ParentKind,
195) {
196    match node {
197        Node::Root(root) => {
198            ensure_block_ids_in_list(&root.children, source, edits, ParentKind::Other)
199        }
200        Node::Blockquote(bq) => ensure_block_ids_in_list(&bq.children, source, edits, parent),
201        Node::List(list) => ensure_block_ids_in_list(&list.children, source, edits, parent),
202        Node::ListItem(li) => {
203            if let Some(pos) = node.position() {
204                ensure_list_item_inline_id(source, pos.start.offset, edits);
205            }
206            ensure_block_ids_in_list(&li.children, source, edits, ParentKind::ListItem);
207        }
208        _ => {
209            if let Some(children) = children(node) {
210                ensure_block_ids_in_list(children, source, edits, parent);
211            }
212        }
213    }
214}
215
216fn ensure_block_ids_in_list(
217    children: &[Node],
218    source: &str,
219    edits: &mut Vec<Edit>,
220    parent: ParentKind,
221) {
222    let mut first_direct_paragraph_in_list_item = false;
223
224    for (idx, child) in children.iter().enumerate() {
225        match child {
226            Node::ListItem(_) => ensure_block_ids_in_children(child, source, edits, parent),
227            Node::Paragraph(_) => {
228                // The first direct paragraph of a list item is considered owned by the list item’s
229                // inline block id, so we do not insert a separate comment line for it.
230                if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
231                    first_direct_paragraph_in_list_item = true;
232                } else if let Some(pos) = child.position() {
233                    let has_prev_block_id = idx
234                        .checked_sub(1)
235                        .and_then(|prev| parse_block_id_from_html_node(&children[prev]))
236                        .is_some();
237                    if !has_prev_block_id {
238                        edits.push(insert_paragraph_id_comment(source, pos.start.offset));
239                    }
240                }
241            }
242            _ => ensure_block_ids_in_children(child, source, edits, parent),
243        }
244    }
245}
246
247fn insert_paragraph_id_comment(source: &str, paragraph_start: usize) -> Edit {
248    let (line_start, prefix) = line_prefix_at(source, paragraph_start);
249    let id = Ulid::new();
250    Edit::insert(
251        line_start,
252        format!("{prefix}<!-- terraphim:block-id:{id} -->\n"),
253    )
254}
255
256fn ensure_list_item_inline_id(source: &str, list_item_start: usize, edits: &mut Vec<Edit>) {
257    let (line_start, line_end) = line_bounds_at(source, list_item_start);
258    let line = &source[line_start..line_end];
259
260    if let Some((comment_start, comment_end, parsed)) = find_inline_block_id_comment(line) {
261        if parsed.is_some() {
262            return;
263        }
264
265        // Replace invalid block id comment with a fresh one.
266        let replacement = format!("<!-- terraphim:block-id:{} -->", Ulid::new());
267        edits.push(Edit {
268            range: (line_start + comment_start)..(line_start + comment_end),
269            replacement,
270        });
271        return;
272    }
273
274    // No existing comment on the first line; insert it after the list marker and optional checkbox.
275    if let Some(insert_at) = list_item_inline_insert_point(source, list_item_start) {
276        let trailing_space = match source.as_bytes().get(insert_at) {
277            None | Some(b'\n') | Some(b'\r') => "",
278            _ => " ",
279        };
280        edits.push(Edit::insert(
281            insert_at,
282            format!(
283                "<!-- terraphim:block-id:{} -->{trailing_space}",
284                Ulid::new()
285            ),
286        ));
287    }
288}
289
/// Find the byte offset at which an inline block-id comment should be inserted
/// for the list item beginning at `list_item_start`.
///
/// Scanning from `list_item_start`, this skips spaces/tabs and `"> "` quote
/// markers, then expects a list marker (`-`, `*`, `+`, or digits followed by `.`
/// or `)`) followed by one space or tab. An immediately following task checkbox
/// (`[ ]`, `[x]`, `[X]` plus a space or tab) is skipped as well. Returns `None`
/// when no such marker is found.
fn list_item_inline_insert_point(source: &str, list_item_start: usize) -> Option<usize> {
    let bytes = source.as_bytes();
    let at = |idx: usize| bytes.get(idx).copied();
    let is_blank = |b: Option<u8>| matches!(b, Some(b' ' | b'\t'));

    // Shallow pass over indentation and blockquote prefixes such as "> - item".
    let mut cursor = list_item_start;
    loop {
        while is_blank(at(cursor)) {
            cursor += 1;
        }
        if at(cursor) == Some(b'>') && at(cursor + 1) == Some(b' ') {
            cursor += 2;
        } else {
            break;
        }
    }

    // The marker itself: a bullet character, or digits plus '.' / ')'.
    match at(cursor) {
        Some(b'-' | b'*' | b'+') => cursor += 1,
        Some(b'0'..=b'9') => {
            while matches!(at(cursor), Some(b'0'..=b'9')) {
                cursor += 1;
            }
            if !matches!(at(cursor), Some(b'.' | b')')) {
                return None;
            }
            cursor += 1;
        }
        _ => return None,
    }

    // Exactly one blank must separate the marker from the item content.
    if !is_blank(at(cursor)) {
        return None;
    }
    cursor += 1;

    // Optional task list checkbox: [ ] / [x] / [X]
    let has_checkbox = at(cursor) == Some(b'[')
        && matches!(at(cursor + 1), Some(b' ' | b'x' | b'X'))
        && at(cursor + 2) == Some(b']')
        && is_blank(at(cursor + 3));

    Some(if has_checkbox { cursor + 4 } else { cursor })
}
345
346fn extract_blocks(markdown: &str) -> Result<Vec<Block>, MarkdownParserError> {
347    let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
348    let mut blocks = Vec::new();
349    extract_blocks_from_children(&ast, markdown, &mut blocks, ParentKind::Other)?;
350
351    // Validate uniqueness: ids should be stable and non-duplicated.
352    let mut seen = HashSet::new();
353    for b in &blocks {
354        let id = b.id.to_string();
355        if !seen.insert(id) {
356            // If duplicates exist, it is safer to surface an error rather than silently re-ID.
357            return Err(MarkdownParserError::MissingOrInvalidBlockId(
358                b.kind,
359                b.span.start,
360            ));
361        }
362    }
363
364    Ok(blocks)
365}
366
367fn extract_blocks_from_children(
368    node: &Node,
369    source: &str,
370    blocks: &mut Vec<Block>,
371    parent: ParentKind,
372) -> Result<(), MarkdownParserError> {
373    match node {
374        Node::Root(root) => {
375            extract_blocks_from_list(&root.children, source, blocks, ParentKind::Other)?;
376        }
377        Node::Blockquote(bq) => {
378            extract_blocks_from_list(&bq.children, source, blocks, parent)?;
379        }
380        Node::List(list) => {
381            extract_blocks_from_list(&list.children, source, blocks, parent)?;
382        }
383        Node::ListItem(li) => {
384            let Some(pos) = node.position() else {
385                return Ok(());
386            };
387
388            let Some((id, id_span)) = extract_list_item_id(source, pos.start.offset) else {
389                return Err(MarkdownParserError::MissingOrInvalidBlockId(
390                    BlockKind::ListItem,
391                    pos.start.offset,
392                ));
393            };
394            let start = line_bounds_at(source, pos.start.offset).0;
395            let end = pos.end.offset;
396            blocks.push(Block {
397                id,
398                kind: BlockKind::ListItem,
399                span: start..end,
400                id_span,
401            });
402            extract_blocks_from_list(&li.children, source, blocks, ParentKind::ListItem)?;
403        }
404        _ => {
405            if let Some(children) = children(node) {
406                extract_blocks_from_list(children, source, blocks, parent)?;
407            }
408        }
409    }
410    Ok(())
411}
412
413fn extract_blocks_from_list(
414    children: &[Node],
415    source: &str,
416    blocks: &mut Vec<Block>,
417    parent: ParentKind,
418) -> Result<(), MarkdownParserError> {
419    let mut first_direct_paragraph_in_list_item = false;
420
421    for (idx, child) in children.iter().enumerate() {
422        match child {
423            Node::ListItem(_) => extract_blocks_from_children(child, source, blocks, parent)?,
424            Node::Paragraph(_) => {
425                if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
426                    first_direct_paragraph_in_list_item = true;
427                    continue;
428                }
429
430                let Some(pos) = child.position() else {
431                    continue;
432                };
433
434                let Some((id, anchor_span)) = idx
435                    .checked_sub(1)
436                    .and_then(|prev| {
437                        parse_block_id_from_html_node_with_span(source, &children[prev])
438                    })
439                    .and_then(|(id, span)| id.map(|id| (id, span)))
440                else {
441                    return Err(MarkdownParserError::MissingOrInvalidBlockId(
442                        BlockKind::Paragraph,
443                        pos.start.offset,
444                    ));
445                };
446
447                blocks.push(Block {
448                    id,
449                    kind: BlockKind::Paragraph,
450                    span: anchor_span.start..pos.end.offset,
451                    id_span: anchor_span,
452                })
453            }
454            _ => extract_blocks_from_children(child, source, blocks, parent)?,
455        }
456    }
457
458    Ok(())
459}
460
461fn extract_list_item_id(source: &str, list_item_start: usize) -> Option<(Ulid, Range<usize>)> {
462    let (line_start, line_end) = line_bounds_at(source, list_item_start);
463    let line = &source[line_start..line_end];
464    let (comment_start, comment_end, parsed) = find_inline_block_id_comment(line)?;
465    let id = parsed?;
466    Some((id, (line_start + comment_start)..(line_start + comment_end)))
467}
468
469fn parse_block_id_from_html_node(node: &Node) -> Option<Ulid> {
470    match node {
471        Node::Html(val) => parse_block_id_comment(&val.value),
472        _ => None,
473    }
474}
475
476fn parse_block_id_from_html_node_with_span(
477    source: &str,
478    node: &Node,
479) -> Option<(Option<Ulid>, Range<usize>)> {
480    let Node::Html(val) = node else { return None };
481    let id = parse_block_id_comment(&val.value);
482
483    let Some(pos) = node.position() else {
484        return Some((id, 0..0));
485    };
486
487    let (line_start, line_end) = line_bounds_at(source, pos.start.offset);
488    Some((id, line_start..line_end))
489}
490
491fn parse_block_id_comment(raw_html: &str) -> Option<Ulid> {
492    let html = raw_html.trim();
493    let inner = html
494        .strip_prefix("<!--")
495        .and_then(|s| s.strip_suffix("-->"))?;
496    let inner = inner.trim();
497    let id_str = inner.strip_prefix(TERRAPHIM_BLOCK_ID_PREFIX)?;
498    Ulid::from_str(id_str.trim()).ok()
499}
500
501fn find_inline_block_id_comment(line: &str) -> Option<(usize, usize, Option<Ulid>)> {
502    let start = line.find("<!--")?;
503    let marker = line[start..].find(TERRAPHIM_BLOCK_ID_PREFIX)? + start;
504    let end = line[marker..].find("-->")? + marker + 3;
505
506    let comment_start = start;
507    let comment_end = end;
508    let comment = &line[comment_start..comment_end];
509    Some((comment_start, comment_end, parse_block_id_comment(comment)))
510}
511
/// Return the byte bounds `(start, end)` of the line containing `offset`.
///
/// `start` is the index just after the previous `'\n'` (or 0), and `end` is the
/// index of the next `'\n'` at or after `offset` (or `source.len()`), so the
/// newline itself is excluded.
///
/// # Panics
///
/// Panics if `offset` is past the end of `source` or not on a char boundary.
fn line_bounds_at(source: &str, offset: usize) -> (usize, usize) {
    let (before, after) = source.split_at(offset);

    let line_start = match before.rfind('\n') {
        Some(newline) => newline + 1,
        None => 0,
    };
    let line_end = match after.find('\n') {
        Some(newline) => offset + newline,
        None => source.len(),
    };

    (line_start, line_end)
}
520
521fn line_prefix_at(source: &str, offset: usize) -> (usize, String) {
522    let (line_start, _line_end) = line_bounds_at(source, offset);
523    let prefix = &source[line_start..offset];
524    (line_start, prefix.to_string())
525}
526
527fn children(node: &Node) -> Option<&Vec<Node>> {
528    match node {
529        Node::Root(root) => Some(&root.children),
530        Node::Blockquote(bq) => Some(&bq.children),
531        Node::List(list) => Some(&list.children),
532        Node::ListItem(li) => Some(&li.children),
533        Node::Paragraph(p) => Some(&p.children),
534        Node::Heading(h) => Some(&h.children),
535        _ => None,
536    }
537}
538
539fn strip_terraphim_block_id_comments(text: &str) -> String {
540    let mut out = String::with_capacity(text.len());
541    for line in text.lines() {
542        let mut remaining = line;
543        let mut cleaned = String::new();
544        loop {
545            let Some((start, end, _)) = find_inline_block_id_comment(remaining) else {
546                cleaned.push_str(remaining);
547                break;
548            };
549            cleaned.push_str(&remaining[..start]);
550            remaining = &remaining[end..];
551        }
552
553        if cleaned.trim().is_empty() {
554            continue;
555        }
556
557        out.push_str(cleaned.trim_end());
558        out.push('\n')
559    }
560    out
561}
562
/// Return the first line of `text` that is not blank, trimmed and limited to its
/// first 80 characters; `None` when every line is blank.
fn first_nonempty_line(text: &str) -> Option<String> {
    for raw in text.lines() {
        let line = raw.trim();
        if !line.is_empty() {
            // Truncate by characters, not bytes, so multi-byte text stays valid.
            return Some(line.chars().take(80).collect());
        }
    }
    None
}
569
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of lines in `s` that carry a Terraphim block-id comment.
    fn count_block_ids(s: &str) -> usize {
        let mut total = 0;
        for line in s.lines() {
            if line.contains("<!-- terraphim:block-id:") {
                total += 1;
            }
        }
        total
    }

    /// Two plain paragraphs each receive their own id comment line.
    #[test]
    fn inserts_paragraph_ids() {
        let normalized =
            ensure_terraphim_block_ids("Hello world\n\nSecond paragraph\n").unwrap();
        assert_eq!(count_block_ids(&normalized), 2);
        for expected in ["Hello world", "Second paragraph"] {
            assert!(normalized.contains(expected));
        }
    }

    /// List items receive their id inline, right after the marker.
    #[test]
    fn inserts_list_item_inline_ids() {
        let normalized = ensure_terraphim_block_ids("- first\n- second\n").unwrap();
        assert_eq!(count_block_ids(&normalized), 2);
        assert!(normalized.contains("- <!-- terraphim:block-id:"));
    }

    /// Normalizing a list item followed by a paragraph yields at least two blocks.
    #[test]
    fn normalize_returns_blocks() {
        let result = normalize_markdown("- item\n\nPara\n").unwrap();
        assert!(result.blocks.len() >= 2);
    }

    /// A leading H1 is returned verbatim.
    #[test]
    fn extract_first_heading_h1() {
        let heading = extract_first_heading("# Bun Package Manager\n\nsynonyms:: npm, yarn\n");
        assert_eq!(heading, Some("Bun Package Manager".to_string()));
    }

    /// An H2 before the first H1 is ignored.
    #[test]
    fn extract_first_heading_skips_h2() {
        let heading = extract_first_heading("## Not This\n\n# This One\n");
        assert_eq!(heading, Some("This One".to_string()));
    }

    /// Without any H1 there is no heading to return.
    #[test]
    fn extract_first_heading_none_when_absent() {
        let heading = extract_first_heading("Just some text\n\n## Only H2\n");
        assert_eq!(heading, None);
    }

    /// Inline code inside the heading contributes its text without backticks.
    #[test]
    fn extract_first_heading_with_inline_code() {
        let heading = extract_first_heading("# The `bun` Runtime\n");
        assert_eq!(heading, Some("The bun Runtime".to_string()));
    }
}