1use std::collections::HashSet;
6use std::ops::Range;
7use std::str::FromStr;
8
9use markdown::ParseOptions;
10use markdown::mdast::Node;
11use terraphim_types::{Document, DocumentType};
12use thiserror::Error;
13use ulid::Ulid;
14
15pub mod chunk;
16pub mod heading;
17
18pub use chunk::{ContentChunk, chunk_by_headings};
19pub use heading::{
20 HeadingNode, HeadingTree, MatchStrategy, SectionConfig, SectionPattern, SectionType,
21 build_heading_tree, classify_sections,
22};
23
24pub const TERRAPHIM_BLOCK_ID_PREFIX: &str = "terraphim:block-id:";
25
26pub fn extract_first_heading(content: &str) -> Option<String> {
31 let ast = markdown::to_mdast(content, &ParseOptions::gfm()).ok()?;
32 find_first_h1(&ast)
33}
34
35fn find_first_h1(node: &Node) -> Option<String> {
37 match node {
38 Node::Heading(h) if h.depth == 1 => {
39 let text = collect_text_content(&h.children);
40 if text.is_empty() { None } else { Some(text) }
41 }
42 _ => {
43 if let Some(children) = children(node) {
44 for child in children {
45 if let Some(heading) = find_first_h1(child) {
46 return Some(heading);
47 }
48 }
49 }
50 None
51 }
52 }
53}
54
55pub(crate) fn collect_text_content(nodes: &[Node]) -> String {
57 let mut text = String::new();
58 for node in nodes {
59 match node {
60 Node::Text(t) => text.push_str(&t.value),
61 Node::InlineCode(c) => text.push_str(&c.value),
62 other => {
63 if let Some(children) = children(other) {
64 text.push_str(&collect_text_content(children));
65 }
66 }
67 }
68 }
69 text
70}
71
72#[derive(Debug, Clone, Copy, PartialEq, Eq)]
73pub enum BlockKind {
74 Paragraph,
75 ListItem,
76}
77
78#[derive(Debug, Clone, PartialEq, Eq)]
79pub struct Block {
80 pub id: Ulid,
81 pub kind: BlockKind,
82
83 pub span: Range<usize>,
88
89 pub id_span: Range<usize>,
94}
95
96#[derive(Debug, Clone)]
97pub struct NormalizedMarkdown {
98 pub markdown: String,
99 pub blocks: Vec<Block>,
100 pub ast: Option<markdown::mdast::Node>,
101}
102
103#[derive(Debug, Error)]
104pub enum MarkdownParserError {
105 #[error("failed to parse markdown: {0}")]
106 Markdown(String),
107
108 #[error("missing or invalid terraphim block id for {0:?} at byte offset {1}")]
109 MissingOrInvalidBlockId(BlockKind, usize),
110}
111
112impl From<markdown::message::Message> for MarkdownParserError {
113 fn from(value: markdown::message::Message) -> Self {
114 Self::Markdown(format!("{value:?}"))
115 }
116}
117
118#[derive(Debug, Clone)]
119struct Edit {
120 range: Range<usize>,
121 replacement: String,
122}
123
124impl Edit {
125 fn insert(at: usize, text: String) -> Self {
126 Self {
127 range: at..at,
128 replacement: text,
129 }
130 }
131}
132
133pub fn ensure_terraphim_block_ids(markdown: &str) -> Result<String, MarkdownParserError> {
139 let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
140 let mut edits: Vec<Edit> = Vec::new();
141 ensure_block_ids_in_children(&ast, markdown, &mut edits, ParentKind::Other);
142
143 if edits.is_empty() {
144 return Ok(markdown.to_string());
145 }
146
147 edits.sort_by_key(|e| std::cmp::Reverse(e.range.start));
149 let mut out = markdown.to_string();
150 for edit in edits {
151 out.replace_range(edit.range, &edit.replacement);
152 }
153 Ok(out)
154}
155
156pub fn normalize_markdown(markdown: &str) -> Result<NormalizedMarkdown, MarkdownParserError> {
163 let mut current = ensure_terraphim_block_ids(markdown)?;
164 for _ in 0..4 {
165 let next = ensure_terraphim_block_ids(¤t)?;
166 if next == current {
167 break;
168 }
169 current = next;
170 }
171 let blocks = extract_blocks(¤t)?;
172 let ast = markdown::to_mdast(¤t, &ParseOptions::gfm()).ok();
173 Ok(NormalizedMarkdown {
174 markdown: current,
175 blocks,
176 ast,
177 })
178}
179
180pub fn blocks_to_documents(source_id: &str, normalized: &NormalizedMarkdown) -> Vec<Document> {
182 normalized
183 .blocks
184 .iter()
185 .map(|block| {
186 let block_id = block.id.to_string();
187 let id = format!("{source_id}#{block_id}");
188 let body = strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()])
189 .trim()
190 .to_string();
191 let title = first_nonempty_line(&body).unwrap_or_else(|| "Untitled".to_string());
192 Document {
193 id,
194 url: source_id.to_string(),
195 title,
196 body,
197 description: None,
198 summarization: None,
199 stub: None,
200 tags: None,
201 rank: None,
202 source_haystack: None,
203 doc_type: DocumentType::KgEntry,
204 synonyms: None,
205 route: None,
206 priority: None,
207 quality_score: None,
208 }
209 })
210 .collect()
211}
212
213#[derive(Debug, Clone, Copy, PartialEq, Eq)]
214enum ParentKind {
215 ListItem,
216 Other,
217}
218
219fn ensure_block_ids_in_children(
220 node: &Node,
221 source: &str,
222 edits: &mut Vec<Edit>,
223 parent: ParentKind,
224) {
225 match node {
226 Node::Root(root) => {
227 ensure_block_ids_in_list(&root.children, source, edits, ParentKind::Other)
228 }
229 Node::Blockquote(bq) => ensure_block_ids_in_list(&bq.children, source, edits, parent),
230 Node::List(list) => ensure_block_ids_in_list(&list.children, source, edits, parent),
231 Node::ListItem(li) => {
232 if let Some(pos) = node.position() {
233 ensure_list_item_inline_id(source, pos.start.offset, edits);
234 }
235 ensure_block_ids_in_list(&li.children, source, edits, ParentKind::ListItem);
236 }
237 _ => {
238 if let Some(children) = children(node) {
239 ensure_block_ids_in_list(children, source, edits, parent);
240 }
241 }
242 }
243}
244
245fn ensure_block_ids_in_list(
246 children: &[Node],
247 source: &str,
248 edits: &mut Vec<Edit>,
249 parent: ParentKind,
250) {
251 let mut first_direct_paragraph_in_list_item = false;
252
253 for (idx, child) in children.iter().enumerate() {
254 match child {
255 Node::ListItem(_) => ensure_block_ids_in_children(child, source, edits, parent),
256 Node::Paragraph(_) => {
257 if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
260 first_direct_paragraph_in_list_item = true;
261 } else if let Some(pos) = child.position() {
262 let has_prev_block_id = idx
263 .checked_sub(1)
264 .and_then(|prev| parse_block_id_from_html_node(&children[prev]))
265 .is_some();
266 if !has_prev_block_id {
267 edits.push(insert_paragraph_id_comment(source, pos.start.offset));
268 }
269 }
270 }
271 _ => ensure_block_ids_in_children(child, source, edits, parent),
272 }
273 }
274}
275
276fn insert_paragraph_id_comment(source: &str, paragraph_start: usize) -> Edit {
277 let (line_start, prefix) = line_prefix_at(source, paragraph_start);
278 let id = Ulid::new();
279 Edit::insert(
280 line_start,
281 format!("{prefix}<!-- terraphim:block-id:{id} -->\n"),
282 )
283}
284
285fn ensure_list_item_inline_id(source: &str, list_item_start: usize, edits: &mut Vec<Edit>) {
286 let (line_start, line_end) = line_bounds_at(source, list_item_start);
287 let line = &source[line_start..line_end];
288
289 if let Some((comment_start, comment_end, parsed)) = find_inline_block_id_comment(line) {
290 if parsed.is_some() {
291 return;
292 }
293
294 let replacement = format!("<!-- terraphim:block-id:{} -->", Ulid::new());
296 edits.push(Edit {
297 range: (line_start + comment_start)..(line_start + comment_end),
298 replacement,
299 });
300 return;
301 }
302
303 if let Some(insert_at) = list_item_inline_insert_point(source, list_item_start) {
305 let trailing_space = match source.as_bytes().get(insert_at) {
306 None | Some(b'\n') | Some(b'\r') => "",
307 _ => " ",
308 };
309 edits.push(Edit::insert(
310 insert_at,
311 format!(
312 "<!-- terraphim:block-id:{} -->{trailing_space}",
313 Ulid::new()
314 ),
315 ));
316 }
317}
318
319fn list_item_inline_insert_point(source: &str, list_item_start: usize) -> Option<usize> {
320 let bytes = source.as_bytes();
321 let mut i = list_item_start;
322
323 loop {
326 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
327 i += 1;
328 }
329 if bytes.get(i..i + 2) == Some(b"> ") {
330 i += 2;
331 continue;
332 }
333 break;
334 }
335
336 if matches!(bytes.get(i), Some(b'-' | b'*' | b'+')) {
338 i += 1;
339 if matches!(bytes.get(i), Some(b' ' | b'\t')) {
340 i += 1;
341 } else {
342 return None;
343 }
344 } else if matches!(bytes.get(i), Some(b'0'..=b'9')) {
345 while matches!(bytes.get(i), Some(b'0'..=b'9')) {
347 i += 1;
348 }
349 if matches!(bytes.get(i), Some(b'.' | b')')) {
350 i += 1;
351 } else {
352 return None;
353 }
354 if matches!(bytes.get(i), Some(b' ' | b'\t')) {
355 i += 1;
356 } else {
357 return None;
358 }
359 } else {
360 return None;
361 }
362
363 if bytes.get(i) == Some(&b'[')
365 && matches!(bytes.get(i + 1), Some(b' ' | b'x' | b'X'))
366 && bytes.get(i + 2) == Some(&b']')
367 && matches!(bytes.get(i + 3), Some(b' ' | b'\t'))
368 {
369 i += 4;
370 }
371
372 Some(i)
373}
374
375fn extract_blocks(markdown: &str) -> Result<Vec<Block>, MarkdownParserError> {
376 let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
377 let mut blocks = Vec::new();
378 extract_blocks_from_children(&ast, markdown, &mut blocks, ParentKind::Other)?;
379
380 let mut seen = HashSet::new();
382 for b in &blocks {
383 let id = b.id.to_string();
384 if !seen.insert(id) {
385 return Err(MarkdownParserError::MissingOrInvalidBlockId(
387 b.kind,
388 b.span.start,
389 ));
390 }
391 }
392
393 Ok(blocks)
394}
395
396fn extract_blocks_from_children(
397 node: &Node,
398 source: &str,
399 blocks: &mut Vec<Block>,
400 parent: ParentKind,
401) -> Result<(), MarkdownParserError> {
402 match node {
403 Node::Root(root) => {
404 extract_blocks_from_list(&root.children, source, blocks, ParentKind::Other)?;
405 }
406 Node::Blockquote(bq) => {
407 extract_blocks_from_list(&bq.children, source, blocks, parent)?;
408 }
409 Node::List(list) => {
410 extract_blocks_from_list(&list.children, source, blocks, parent)?;
411 }
412 Node::ListItem(li) => {
413 let Some(pos) = node.position() else {
414 return Ok(());
415 };
416
417 let Some((id, id_span)) = extract_list_item_id(source, pos.start.offset) else {
418 return Err(MarkdownParserError::MissingOrInvalidBlockId(
419 BlockKind::ListItem,
420 pos.start.offset,
421 ));
422 };
423 let start = line_bounds_at(source, pos.start.offset).0;
424 let end = pos.end.offset;
425 blocks.push(Block {
426 id,
427 kind: BlockKind::ListItem,
428 span: start..end,
429 id_span,
430 });
431 extract_blocks_from_list(&li.children, source, blocks, ParentKind::ListItem)?;
432 }
433 _ => {
434 if let Some(children) = children(node) {
435 extract_blocks_from_list(children, source, blocks, parent)?;
436 }
437 }
438 }
439 Ok(())
440}
441
442fn extract_blocks_from_list(
443 children: &[Node],
444 source: &str,
445 blocks: &mut Vec<Block>,
446 parent: ParentKind,
447) -> Result<(), MarkdownParserError> {
448 let mut first_direct_paragraph_in_list_item = false;
449
450 for (idx, child) in children.iter().enumerate() {
451 match child {
452 Node::ListItem(_) => extract_blocks_from_children(child, source, blocks, parent)?,
453 Node::Paragraph(_) => {
454 if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
455 first_direct_paragraph_in_list_item = true;
456 continue;
457 }
458
459 let Some(pos) = child.position() else {
460 continue;
461 };
462
463 let Some((id, anchor_span)) = idx
464 .checked_sub(1)
465 .and_then(|prev| {
466 parse_block_id_from_html_node_with_span(source, &children[prev])
467 })
468 .and_then(|(id, span)| id.map(|id| (id, span)))
469 else {
470 return Err(MarkdownParserError::MissingOrInvalidBlockId(
471 BlockKind::Paragraph,
472 pos.start.offset,
473 ));
474 };
475
476 blocks.push(Block {
477 id,
478 kind: BlockKind::Paragraph,
479 span: anchor_span.start..pos.end.offset,
480 id_span: anchor_span,
481 })
482 }
483 _ => extract_blocks_from_children(child, source, blocks, parent)?,
484 }
485 }
486
487 Ok(())
488}
489
490fn extract_list_item_id(source: &str, list_item_start: usize) -> Option<(Ulid, Range<usize>)> {
491 let (line_start, line_end) = line_bounds_at(source, list_item_start);
492 let line = &source[line_start..line_end];
493 let (comment_start, comment_end, parsed) = find_inline_block_id_comment(line)?;
494 let id = parsed?;
495 Some((id, (line_start + comment_start)..(line_start + comment_end)))
496}
497
498fn parse_block_id_from_html_node(node: &Node) -> Option<Ulid> {
499 match node {
500 Node::Html(val) => parse_block_id_comment(&val.value),
501 _ => None,
502 }
503}
504
505fn parse_block_id_from_html_node_with_span(
506 source: &str,
507 node: &Node,
508) -> Option<(Option<Ulid>, Range<usize>)> {
509 let Node::Html(val) = node else { return None };
510 let id = parse_block_id_comment(&val.value);
511
512 let Some(pos) = node.position() else {
513 return Some((id, 0..0));
514 };
515
516 let (line_start, line_end) = line_bounds_at(source, pos.start.offset);
517 Some((id, line_start..line_end))
518}
519
520fn parse_block_id_comment(raw_html: &str) -> Option<Ulid> {
521 let html = raw_html.trim();
522 let inner = html
523 .strip_prefix("<!--")
524 .and_then(|s| s.strip_suffix("-->"))?;
525 let inner = inner.trim();
526 let id_str = inner.strip_prefix(TERRAPHIM_BLOCK_ID_PREFIX)?;
527 Ulid::from_str(id_str.trim()).ok()
528}
529
530fn find_inline_block_id_comment(line: &str) -> Option<(usize, usize, Option<Ulid>)> {
531 let start = line.find("<!--")?;
532 let marker = line[start..].find(TERRAPHIM_BLOCK_ID_PREFIX)? + start;
533 let end = line[marker..].find("-->")? + marker + 3;
534
535 let comment_start = start;
536 let comment_end = end;
537 let comment = &line[comment_start..comment_end];
538 Some((comment_start, comment_end, parse_block_id_comment(comment)))
539}
540
541fn line_bounds_at(source: &str, offset: usize) -> (usize, usize) {
542 let line_start = source[..offset].rfind('\n').map(|i| i + 1).unwrap_or(0);
543 let line_end = source[offset..]
544 .find('\n')
545 .map(|i| offset + i)
546 .unwrap_or_else(|| source.len());
547 (line_start, line_end)
548}
549
550fn line_prefix_at(source: &str, offset: usize) -> (usize, String) {
551 let (line_start, _line_end) = line_bounds_at(source, offset);
552 let prefix = &source[line_start..offset];
553 (line_start, prefix.to_string())
554}
555
556fn children(node: &Node) -> Option<&Vec<Node>> {
557 match node {
558 Node::Root(root) => Some(&root.children),
559 Node::Blockquote(bq) => Some(&bq.children),
560 Node::List(list) => Some(&list.children),
561 Node::ListItem(li) => Some(&li.children),
562 Node::Paragraph(p) => Some(&p.children),
563 Node::Heading(h) => Some(&h.children),
564 _ => None,
565 }
566}
567
568fn strip_terraphim_block_id_comments(text: &str) -> String {
569 let mut out = String::with_capacity(text.len());
570 for line in text.lines() {
571 let mut remaining = line;
572 let mut cleaned = String::new();
573 loop {
574 let Some((start, end, _)) = find_inline_block_id_comment(remaining) else {
575 cleaned.push_str(remaining);
576 break;
577 };
578 cleaned.push_str(&remaining[..start]);
579 remaining = &remaining[end..];
580 }
581
582 if cleaned.trim().is_empty() {
583 continue;
584 }
585
586 out.push_str(cleaned.trim_end());
587 out.push('\n')
588 }
589 out
590}
591
592fn first_nonempty_line(text: &str) -> Option<String> {
593 text.lines()
594 .map(|l| l.trim())
595 .find(|l| !l.is_empty())
596 .map(|l| l.chars().take(80).collect::<String>())
597}
598
599#[cfg(test)]
600mod tests {
601 use super::*;
602
603 fn count_block_ids(s: &str) -> usize {
604 s.lines()
605 .filter(|l| l.contains("<!-- terraphim:block-id:"))
606 .count()
607 }
608
609 #[test]
610 fn inserts_paragraph_ids() {
611 let input = "Hello world\n\nSecond paragraph\n";
612 let out = ensure_terraphim_block_ids(input).unwrap();
613 assert_eq!(count_block_ids(&out), 2);
615 assert!(out.contains("Hello world"));
616 assert!(out.contains("Second paragraph"));
617 }
618
619 #[test]
620 fn inserts_list_item_inline_ids() {
621 let input = "- first\n- second\n";
622 let out = ensure_terraphim_block_ids(input).unwrap();
623 assert_eq!(count_block_ids(&out), 2);
624 assert!(out.contains("- <!-- terraphim:block-id:"));
625 }
626
627 #[test]
628 fn normalize_returns_blocks() {
629 let input = "- item\n\nPara\n";
630 let normalized = normalize_markdown(input).unwrap();
631 assert!(normalized.blocks.len() >= 2);
632 }
633
634 #[test]
635 fn extract_first_heading_h1() {
636 let input = "# Bun Package Manager\n\nsynonyms:: npm, yarn\n";
637 assert_eq!(
638 extract_first_heading(input),
639 Some("Bun Package Manager".to_string())
640 );
641 }
642
643 #[test]
644 fn extract_first_heading_skips_h2() {
645 let input = "## Not This\n\n# This One\n";
646 assert_eq!(extract_first_heading(input), Some("This One".to_string()));
647 }
648
649 #[test]
650 fn extract_first_heading_none_when_absent() {
651 let input = "Just some text\n\n## Only H2\n";
652 assert_eq!(extract_first_heading(input), None);
653 }
654
655 #[test]
656 fn extract_first_heading_with_inline_code() {
657 let input = "# The `bun` Runtime\n";
658 assert_eq!(
659 extract_first_heading(input),
660 Some("The bun Runtime".to_string())
661 );
662 }
663
664 #[test]
665 fn iterative_normalization_stabilizes() {
666 let input = "- item one\n- item two\n\nA paragraph after the list.\n\nAnother paragraph.\n";
673
674 let pass1 = ensure_terraphim_block_ids(input).unwrap();
676
677 let normalized = normalize_markdown(input).unwrap();
679
680 let again = ensure_terraphim_block_ids(&normalized.markdown).unwrap();
683 assert_eq!(
684 again, normalized.markdown,
685 "normalization should be stable after normalize_markdown"
686 );
687
688 let id_count = normalized
690 .markdown
691 .lines()
692 .filter(|l| l.contains("<!-- terraphim:block-id:"))
693 .count();
694 assert!(
695 id_count >= 4,
696 "expected at least 4 block IDs, got {id_count}; pass1 had {} IDs",
697 count_block_ids(&pass1),
698 );
699 }
700}