1use std::collections::HashSet;
2use std::ops::Range;
3use std::str::FromStr;
4
5use markdown::ParseOptions;
6use markdown::mdast::Node;
7use terraphim_types::{Document, DocumentType};
8use thiserror::Error;
9use ulid::Ulid;
10
11pub mod chunk;
12pub mod heading;
13
14pub use chunk::{ContentChunk, chunk_by_headings};
15pub use heading::{
16 HeadingNode, HeadingTree, MatchStrategy, SectionConfig, SectionPattern, SectionType,
17 build_heading_tree, classify_sections,
18};
19
/// Marker text that precedes the ULID inside a Terraphim block-id HTML comment,
/// i.e. `<!-- terraphim:block-id:<ULID> -->`.
pub const TERRAPHIM_BLOCK_ID_PREFIX: &str = "terraphim:block-id:";
21
22pub fn extract_first_heading(content: &str) -> Option<String> {
27 let ast = markdown::to_mdast(content, &ParseOptions::gfm()).ok()?;
28 find_first_h1(&ast)
29}
30
31fn find_first_h1(node: &Node) -> Option<String> {
33 match node {
34 Node::Heading(h) if h.depth == 1 => {
35 let text = collect_text_content(&h.children);
36 if text.is_empty() { None } else { Some(text) }
37 }
38 _ => {
39 if let Some(children) = children(node) {
40 for child in children {
41 if let Some(heading) = find_first_h1(child) {
42 return Some(heading);
43 }
44 }
45 }
46 None
47 }
48 }
49}
50
51pub(crate) fn collect_text_content(nodes: &[Node]) -> String {
53 let mut text = String::new();
54 for node in nodes {
55 match node {
56 Node::Text(t) => text.push_str(&t.value),
57 Node::InlineCode(c) => text.push_str(&c.value),
58 other => {
59 if let Some(children) = children(other) {
60 text.push_str(&collect_text_content(children));
61 }
62 }
63 }
64 }
65 text
66}
67
/// The kind of markdown element a [`Block`] was extracted from.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BlockKind {
    /// A paragraph anchored by a block-id comment on the line before it.
    Paragraph,
    /// A list item carrying an inline block-id comment on its first line.
    ListItem,
}
73
74#[derive(Debug, Clone, PartialEq, Eq)]
75pub struct Block {
76 pub id: Ulid,
77 pub kind: BlockKind,
78
79 pub span: Range<usize>,
84
85 pub id_span: Range<usize>,
90}
91
92#[derive(Debug, Clone)]
93pub struct NormalizedMarkdown {
94 pub markdown: String,
95 pub blocks: Vec<Block>,
96 pub ast: Option<markdown::mdast::Node>,
97}
98
99#[derive(Debug, Error)]
100pub enum MarkdownParserError {
101 #[error("failed to parse markdown: {0}")]
102 Markdown(String),
103
104 #[error("missing or invalid terraphim block id for {0:?} at byte offset {1}")]
105 MissingOrInvalidBlockId(BlockKind, usize),
106}
107
108impl From<markdown::message::Message> for MarkdownParserError {
109 fn from(value: markdown::message::Message) -> Self {
110 Self::Markdown(format!("{value:?}"))
111 }
112}
113
/// A pending textual change: replace the bytes in `range` with `replacement`.
#[derive(Clone, Debug)]
struct Edit {
    /// Byte range of the source text to replace (empty for pure insertions).
    range: Range<usize>,
    /// Text written in place of `range`.
    replacement: String,
}

impl Edit {
    /// Builds an edit that inserts `text` at byte offset `at` without removing
    /// anything.
    fn insert(at: usize, text: String) -> Self {
        Edit {
            range: at..at,
            replacement: text,
        }
    }
}
128
129pub fn ensure_terraphim_block_ids(markdown: &str) -> Result<String, MarkdownParserError> {
135 let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
136 let mut edits: Vec<Edit> = Vec::new();
137 ensure_block_ids_in_children(&ast, markdown, &mut edits, ParentKind::Other);
138
139 if edits.is_empty() {
140 return Ok(markdown.to_string());
141 }
142
143 edits.sort_by_key(|e| std::cmp::Reverse(e.range.start));
145 let mut out = markdown.to_string();
146 for edit in edits {
147 out.replace_range(edit.range, &edit.replacement);
148 }
149 Ok(out)
150}
151
152pub fn normalize_markdown(markdown: &str) -> Result<NormalizedMarkdown, MarkdownParserError> {
159 let mut current = ensure_terraphim_block_ids(markdown)?;
160 for _ in 0..4 {
161 let next = ensure_terraphim_block_ids(¤t)?;
162 if next == current {
163 break;
164 }
165 current = next;
166 }
167 let blocks = extract_blocks(¤t)?;
168 let ast = markdown::to_mdast(¤t, &ParseOptions::gfm()).ok();
169 Ok(NormalizedMarkdown {
170 markdown: current,
171 blocks,
172 ast,
173 })
174}
175
176pub fn blocks_to_documents(source_id: &str, normalized: &NormalizedMarkdown) -> Vec<Document> {
178 normalized
179 .blocks
180 .iter()
181 .map(|block| {
182 let block_id = block.id.to_string();
183 let id = format!("{source_id}#{block_id}");
184 let body = strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()])
185 .trim()
186 .to_string();
187 let title = first_nonempty_line(&body).unwrap_or_else(|| "Untitled".to_string());
188 Document {
189 id,
190 url: source_id.to_string(),
191 title,
192 body,
193 description: None,
194 summarization: None,
195 stub: None,
196 tags: None,
197 rank: None,
198 source_haystack: None,
199 doc_type: DocumentType::KgEntry,
200 synonyms: None,
201 route: None,
202 priority: None,
203 }
204 })
205 .collect()
206}
207
/// Context passed down the tree walk: whether the nodes being visited sit
/// under a list item (whose first paragraph is covered by the item's own id).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ParentKind {
    /// The enclosing container is a list item.
    ListItem,
    /// Any other enclosing container (including the document root).
    Other,
}
213
214fn ensure_block_ids_in_children(
215 node: &Node,
216 source: &str,
217 edits: &mut Vec<Edit>,
218 parent: ParentKind,
219) {
220 match node {
221 Node::Root(root) => {
222 ensure_block_ids_in_list(&root.children, source, edits, ParentKind::Other)
223 }
224 Node::Blockquote(bq) => ensure_block_ids_in_list(&bq.children, source, edits, parent),
225 Node::List(list) => ensure_block_ids_in_list(&list.children, source, edits, parent),
226 Node::ListItem(li) => {
227 if let Some(pos) = node.position() {
228 ensure_list_item_inline_id(source, pos.start.offset, edits);
229 }
230 ensure_block_ids_in_list(&li.children, source, edits, ParentKind::ListItem);
231 }
232 _ => {
233 if let Some(children) = children(node) {
234 ensure_block_ids_in_list(children, source, edits, parent);
235 }
236 }
237 }
238}
239
240fn ensure_block_ids_in_list(
241 children: &[Node],
242 source: &str,
243 edits: &mut Vec<Edit>,
244 parent: ParentKind,
245) {
246 let mut first_direct_paragraph_in_list_item = false;
247
248 for (idx, child) in children.iter().enumerate() {
249 match child {
250 Node::ListItem(_) => ensure_block_ids_in_children(child, source, edits, parent),
251 Node::Paragraph(_) => {
252 if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
255 first_direct_paragraph_in_list_item = true;
256 } else if let Some(pos) = child.position() {
257 let has_prev_block_id = idx
258 .checked_sub(1)
259 .and_then(|prev| parse_block_id_from_html_node(&children[prev]))
260 .is_some();
261 if !has_prev_block_id {
262 edits.push(insert_paragraph_id_comment(source, pos.start.offset));
263 }
264 }
265 }
266 _ => ensure_block_ids_in_children(child, source, edits, parent),
267 }
268 }
269}
270
271fn insert_paragraph_id_comment(source: &str, paragraph_start: usize) -> Edit {
272 let (line_start, prefix) = line_prefix_at(source, paragraph_start);
273 let id = Ulid::new();
274 Edit::insert(
275 line_start,
276 format!("{prefix}<!-- terraphim:block-id:{id} -->\n"),
277 )
278}
279
280fn ensure_list_item_inline_id(source: &str, list_item_start: usize, edits: &mut Vec<Edit>) {
281 let (line_start, line_end) = line_bounds_at(source, list_item_start);
282 let line = &source[line_start..line_end];
283
284 if let Some((comment_start, comment_end, parsed)) = find_inline_block_id_comment(line) {
285 if parsed.is_some() {
286 return;
287 }
288
289 let replacement = format!("<!-- terraphim:block-id:{} -->", Ulid::new());
291 edits.push(Edit {
292 range: (line_start + comment_start)..(line_start + comment_end),
293 replacement,
294 });
295 return;
296 }
297
298 if let Some(insert_at) = list_item_inline_insert_point(source, list_item_start) {
300 let trailing_space = match source.as_bytes().get(insert_at) {
301 None | Some(b'\n') | Some(b'\r') => "",
302 _ => " ",
303 };
304 edits.push(Edit::insert(
305 insert_at,
306 format!(
307 "<!-- terraphim:block-id:{} -->{trailing_space}",
308 Ulid::new()
309 ),
310 ));
311 }
312}
313
/// Finds the byte offset at which an inline block-id comment should be
/// inserted for the list item beginning at `list_item_start`.
///
/// Scanning from `list_item_start`, the function skips indentation and any
/// block-quote markers, then requires a list marker: a bullet (`-`, `*`, `+`)
/// or an ordered marker (digits followed by `.` or `)`), in both cases followed
/// by one space or tab. A GFM task checkbox (`[ ]`, `[x]`, `[X]`) followed by a
/// space or tab is skipped as well. The returned offset is the position right
/// after all of that.
///
/// A block-quote marker is a `>` optionally followed by whitespace, so
/// `>- item` and `>\t- item` are accepted as well as `> - item`.
///
/// Returns `None` when no well-formed list marker is found at that position
/// (including when `list_item_start` lies beyond the end of `source`).
fn list_item_inline_insert_point(source: &str, list_item_start: usize) -> Option<usize> {
    let bytes = source.as_bytes();
    let mut i = list_item_start;

    // Skip leading indentation and block-quote markers, in any interleaving.
    loop {
        while matches!(bytes.get(i), Some(b' ' | b'\t')) {
            i += 1;
        }
        if bytes.get(i) == Some(&b'>') {
            // The space after `>` is optional; the whitespace skip above
            // consumes it on the next iteration when present.
            i += 1;
            continue;
        }
        break;
    }

    match bytes.get(i) {
        // Bullet marker.
        Some(b'-' | b'*' | b'+') => i += 1,
        // Ordered marker: one or more digits, then `.` or `)`.
        Some(b'0'..=b'9') => {
            while matches!(bytes.get(i), Some(b'0'..=b'9')) {
                i += 1;
            }
            if !matches!(bytes.get(i), Some(b'.' | b')')) {
                return None;
            }
            i += 1;
        }
        _ => return None,
    }

    // Every marker must be followed by a single space or tab.
    if !matches!(bytes.get(i), Some(b' ' | b'\t')) {
        return None;
    }
    i += 1;

    // Optional task-list checkbox: the id goes after it so the checkbox keeps
    // its position directly behind the marker.
    if bytes.get(i) == Some(&b'[')
        && matches!(bytes.get(i + 1), Some(b' ' | b'x' | b'X'))
        && bytes.get(i + 2) == Some(&b']')
        && matches!(bytes.get(i + 3), Some(b' ' | b'\t'))
    {
        i += 4;
    }

    Some(i)
}
369
370fn extract_blocks(markdown: &str) -> Result<Vec<Block>, MarkdownParserError> {
371 let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
372 let mut blocks = Vec::new();
373 extract_blocks_from_children(&ast, markdown, &mut blocks, ParentKind::Other)?;
374
375 let mut seen = HashSet::new();
377 for b in &blocks {
378 let id = b.id.to_string();
379 if !seen.insert(id) {
380 return Err(MarkdownParserError::MissingOrInvalidBlockId(
382 b.kind,
383 b.span.start,
384 ));
385 }
386 }
387
388 Ok(blocks)
389}
390
391fn extract_blocks_from_children(
392 node: &Node,
393 source: &str,
394 blocks: &mut Vec<Block>,
395 parent: ParentKind,
396) -> Result<(), MarkdownParserError> {
397 match node {
398 Node::Root(root) => {
399 extract_blocks_from_list(&root.children, source, blocks, ParentKind::Other)?;
400 }
401 Node::Blockquote(bq) => {
402 extract_blocks_from_list(&bq.children, source, blocks, parent)?;
403 }
404 Node::List(list) => {
405 extract_blocks_from_list(&list.children, source, blocks, parent)?;
406 }
407 Node::ListItem(li) => {
408 let Some(pos) = node.position() else {
409 return Ok(());
410 };
411
412 let Some((id, id_span)) = extract_list_item_id(source, pos.start.offset) else {
413 return Err(MarkdownParserError::MissingOrInvalidBlockId(
414 BlockKind::ListItem,
415 pos.start.offset,
416 ));
417 };
418 let start = line_bounds_at(source, pos.start.offset).0;
419 let end = pos.end.offset;
420 blocks.push(Block {
421 id,
422 kind: BlockKind::ListItem,
423 span: start..end,
424 id_span,
425 });
426 extract_blocks_from_list(&li.children, source, blocks, ParentKind::ListItem)?;
427 }
428 _ => {
429 if let Some(children) = children(node) {
430 extract_blocks_from_list(children, source, blocks, parent)?;
431 }
432 }
433 }
434 Ok(())
435}
436
437fn extract_blocks_from_list(
438 children: &[Node],
439 source: &str,
440 blocks: &mut Vec<Block>,
441 parent: ParentKind,
442) -> Result<(), MarkdownParserError> {
443 let mut first_direct_paragraph_in_list_item = false;
444
445 for (idx, child) in children.iter().enumerate() {
446 match child {
447 Node::ListItem(_) => extract_blocks_from_children(child, source, blocks, parent)?,
448 Node::Paragraph(_) => {
449 if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
450 first_direct_paragraph_in_list_item = true;
451 continue;
452 }
453
454 let Some(pos) = child.position() else {
455 continue;
456 };
457
458 let Some((id, anchor_span)) = idx
459 .checked_sub(1)
460 .and_then(|prev| {
461 parse_block_id_from_html_node_with_span(source, &children[prev])
462 })
463 .and_then(|(id, span)| id.map(|id| (id, span)))
464 else {
465 return Err(MarkdownParserError::MissingOrInvalidBlockId(
466 BlockKind::Paragraph,
467 pos.start.offset,
468 ));
469 };
470
471 blocks.push(Block {
472 id,
473 kind: BlockKind::Paragraph,
474 span: anchor_span.start..pos.end.offset,
475 id_span: anchor_span,
476 })
477 }
478 _ => extract_blocks_from_children(child, source, blocks, parent)?,
479 }
480 }
481
482 Ok(())
483}
484
485fn extract_list_item_id(source: &str, list_item_start: usize) -> Option<(Ulid, Range<usize>)> {
486 let (line_start, line_end) = line_bounds_at(source, list_item_start);
487 let line = &source[line_start..line_end];
488 let (comment_start, comment_end, parsed) = find_inline_block_id_comment(line)?;
489 let id = parsed?;
490 Some((id, (line_start + comment_start)..(line_start + comment_end)))
491}
492
493fn parse_block_id_from_html_node(node: &Node) -> Option<Ulid> {
494 match node {
495 Node::Html(val) => parse_block_id_comment(&val.value),
496 _ => None,
497 }
498}
499
500fn parse_block_id_from_html_node_with_span(
501 source: &str,
502 node: &Node,
503) -> Option<(Option<Ulid>, Range<usize>)> {
504 let Node::Html(val) = node else { return None };
505 let id = parse_block_id_comment(&val.value);
506
507 let Some(pos) = node.position() else {
508 return Some((id, 0..0));
509 };
510
511 let (line_start, line_end) = line_bounds_at(source, pos.start.offset);
512 Some((id, line_start..line_end))
513}
514
515fn parse_block_id_comment(raw_html: &str) -> Option<Ulid> {
516 let html = raw_html.trim();
517 let inner = html
518 .strip_prefix("<!--")
519 .and_then(|s| s.strip_suffix("-->"))?;
520 let inner = inner.trim();
521 let id_str = inner.strip_prefix(TERRAPHIM_BLOCK_ID_PREFIX)?;
522 Ulid::from_str(id_str.trim()).ok()
523}
524
525fn find_inline_block_id_comment(line: &str) -> Option<(usize, usize, Option<Ulid>)> {
526 let start = line.find("<!--")?;
527 let marker = line[start..].find(TERRAPHIM_BLOCK_ID_PREFIX)? + start;
528 let end = line[marker..].find("-->")? + marker + 3;
529
530 let comment_start = start;
531 let comment_end = end;
532 let comment = &line[comment_start..comment_end];
533 Some((comment_start, comment_end, parse_block_id_comment(comment)))
534}
535
/// Returns the byte range `(start, end)` of the line containing `offset`.
///
/// `start` is just after the previous `'\n'` (or 0) and `end` is the position
/// of the next `'\n'` at or after `offset` (or `source.len()`); the newline
/// itself is not included.
///
/// # Panics
///
/// Panics if `offset` is past the end of `source` or not on a character
/// boundary.
fn line_bounds_at(source: &str, offset: usize) -> (usize, usize) {
    let line_start = match source[..offset].rfind('\n') {
        Some(newline) => newline + 1,
        None => 0,
    };
    let line_end = match source[offset..].find('\n') {
        Some(relative) => offset + relative,
        None => source.len(),
    };
    (line_start, line_end)
}
544
545fn line_prefix_at(source: &str, offset: usize) -> (usize, String) {
546 let (line_start, _line_end) = line_bounds_at(source, offset);
547 let prefix = &source[line_start..offset];
548 (line_start, prefix.to_string())
549}
550
551fn children(node: &Node) -> Option<&Vec<Node>> {
552 match node {
553 Node::Root(root) => Some(&root.children),
554 Node::Blockquote(bq) => Some(&bq.children),
555 Node::List(list) => Some(&list.children),
556 Node::ListItem(li) => Some(&li.children),
557 Node::Paragraph(p) => Some(&p.children),
558 Node::Heading(h) => Some(&h.children),
559 _ => None,
560 }
561}
562
563fn strip_terraphim_block_id_comments(text: &str) -> String {
564 let mut out = String::with_capacity(text.len());
565 for line in text.lines() {
566 let mut remaining = line;
567 let mut cleaned = String::new();
568 loop {
569 let Some((start, end, _)) = find_inline_block_id_comment(remaining) else {
570 cleaned.push_str(remaining);
571 break;
572 };
573 cleaned.push_str(&remaining[..start]);
574 remaining = &remaining[end..];
575 }
576
577 if cleaned.trim().is_empty() {
578 continue;
579 }
580
581 out.push_str(cleaned.trim_end());
582 out.push('\n')
583 }
584 out
585}
586
/// Returns the first line of `text` that is not blank, trimmed and cut to at
/// most 80 characters; `None` when every line is blank.
fn first_nonempty_line(text: &str) -> Option<String> {
    for line in text.lines() {
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            return Some(trimmed.chars().take(80).collect());
        }
    }
    None
}
593
#[cfg(test)]
mod tests {
    use super::*;

    /// Counts the lines of `s` that contain a block-id comment opener.
    fn count_block_ids(s: &str) -> usize {
        let mut total = 0;
        for line in s.lines() {
            if line.contains("<!-- terraphim:block-id:") {
                total += 1;
            }
        }
        total
    }

    /// Asserts the first level-1 heading extracted from `input`.
    fn assert_first_heading(input: &str, expected: Option<&str>) {
        assert_eq!(extract_first_heading(input), expected.map(str::to_string));
    }

    #[test]
    fn inserts_paragraph_ids() {
        let out = ensure_terraphim_block_ids("Hello world\n\nSecond paragraph\n").unwrap();
        assert_eq!(count_block_ids(&out), 2);
        // The original text must survive the rewrite.
        for needle in ["Hello world", "Second paragraph"] {
            assert!(out.contains(needle));
        }
    }

    #[test]
    fn inserts_list_item_inline_ids() {
        let out = ensure_terraphim_block_ids("- first\n- second\n").unwrap();
        assert_eq!(count_block_ids(&out), 2);
        assert!(out.contains("- <!-- terraphim:block-id:"));
    }

    #[test]
    fn normalize_returns_blocks() {
        let normalized = normalize_markdown("- item\n\nPara\n").unwrap();
        assert!(normalized.blocks.len() >= 2);
    }

    #[test]
    fn extract_first_heading_h1() {
        assert_first_heading(
            "# Bun Package Manager\n\nsynonyms:: npm, yarn\n",
            Some("Bun Package Manager"),
        );
    }

    #[test]
    fn extract_first_heading_skips_h2() {
        assert_first_heading("## Not This\n\n# This One\n", Some("This One"));
    }

    #[test]
    fn extract_first_heading_none_when_absent() {
        assert_first_heading("Just some text\n\n## Only H2\n", None);
    }

    #[test]
    fn extract_first_heading_with_inline_code() {
        assert_first_heading("# The `bun` Runtime\n", Some("The bun Runtime"));
    }

    #[test]
    fn iterative_normalization_stabilizes() {
        let input =
            "- item one\n- item two\n\nA paragraph after the list.\n\nAnother paragraph.\n";

        // A single pass, kept only for the diagnostic message below.
        let pass1 = ensure_terraphim_block_ids(input).unwrap();

        let normalized = normalize_markdown(input).unwrap();

        // Running one more pass over the normalized text must be a no-op.
        let again = ensure_terraphim_block_ids(&normalized.markdown).unwrap();
        assert_eq!(
            again, normalized.markdown,
            "normalization should be stable after normalize_markdown"
        );

        let id_count = count_block_ids(&normalized.markdown);
        assert!(
            id_count >= 4,
            "expected at least 4 block IDs, got {id_count}; pass1 had {} IDs",
            count_block_ids(&pass1),
        );
    }
}