1use std::collections::HashSet;
2use std::ops::Range;
3use std::str::FromStr;
4
5use markdown::ParseOptions;
6use markdown::mdast::Node;
7use terraphim_types::{Document, DocumentType};
8use thiserror::Error;
9use ulid::Ulid;
10
/// Marker that precedes the ULID inside a Terraphim block-id HTML comment,
/// i.e. the comment has the shape `<!-- terraphim:block-id:<ULID> -->`.
pub const TERRAPHIM_BLOCK_ID_PREFIX: &str = "terraphim:block-id:";
12
13pub fn extract_first_heading(content: &str) -> Option<String> {
18 let ast = markdown::to_mdast(content, &ParseOptions::gfm()).ok()?;
19 find_first_h1(&ast)
20}
21
22fn find_first_h1(node: &Node) -> Option<String> {
24 match node {
25 Node::Heading(h) if h.depth == 1 => {
26 let text = collect_text_content(&h.children);
27 if text.is_empty() { None } else { Some(text) }
28 }
29 _ => {
30 if let Some(children) = children(node) {
31 for child in children {
32 if let Some(heading) = find_first_h1(child) {
33 return Some(heading);
34 }
35 }
36 }
37 None
38 }
39 }
40}
41
42fn collect_text_content(nodes: &[Node]) -> String {
44 let mut text = String::new();
45 for node in nodes {
46 match node {
47 Node::Text(t) => text.push_str(&t.value),
48 Node::InlineCode(c) => text.push_str(&c.value),
49 other => {
50 if let Some(children) = children(other) {
51 text.push_str(&collect_text_content(children));
52 }
53 }
54 }
55 }
56 text
57}
58
/// The kinds of markdown block that carry a Terraphim block id.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// A paragraph whose id lives in an HTML comment node directly before it.
    Paragraph,
    /// A list item whose id lives in an inline comment on the item's first line.
    ListItem,
}
64
/// One identified block extracted from normalized markdown.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Block {
    /// ULID parsed from the block's id comment.
    pub id: Ulid,
    /// Whether the block is a paragraph or a list item.
    pub kind: BlockKind,

    /// Byte range of the block within the normalized markdown.
    ///
    /// For a list item it runs from the start of the line on which the item
    /// begins to the end of the item. For a paragraph it runs from the start
    /// of the anchor comment's line to the end of the paragraph.
    pub span: Range<usize>,

    /// Byte range locating the block-id comment.
    ///
    /// For a list item this is the inline comment itself. For a paragraph it
    /// is the whole line on which the anchor comment node starts.
    pub id_span: Range<usize>,
}
82
/// Markdown in which block-id comments have been ensured, together with the
/// blocks extracted from it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedMarkdown {
    /// The markdown text after block-id comments have been inserted or repaired.
    pub markdown: String,
    /// Blocks found in `markdown`; their spans are byte ranges into that string.
    pub blocks: Vec<Block>,
}
88
/// Errors produced while normalizing markdown or extracting blocks from it.
#[derive(Debug, Error)]
pub enum MarkdownParserError {
    /// The markdown parser rejected the input; holds the parser's message text.
    #[error("failed to parse markdown: {0}")]
    Markdown(String),

    /// A block of the given kind, at the given byte offset, has no usable id.
    ///
    /// Block extraction also returns this variant when two blocks share the
    /// same id.
    #[error("missing or invalid terraphim block id for {0:?} at byte offset {1}")]
    MissingOrInvalidBlockId(BlockKind, usize),
}
97
/// Lets `?` turn a parser message from the `markdown` crate into
/// [`MarkdownParserError::Markdown`].
impl From<markdown::message::Message> for MarkdownParserError {
    fn from(value: markdown::message::Message) -> Self {
        // The message is stored using its `Debug` rendering.
        Self::Markdown(format!("{value:?}"))
    }
}
103
/// A pending textual change to the source markdown.
#[derive(Debug, Clone)]
struct Edit {
    /// Byte range of the source that gets replaced (empty for a pure insertion).
    range: Range<usize>,
    /// Text that takes the place of `range`.
    replacement: String,
}

impl Edit {
    /// Builds an edit that inserts `text` at byte offset `at` without removing anything.
    fn insert(at: usize, text: String) -> Self {
        let range = Range { start: at, end: at };
        Edit {
            range,
            replacement: text,
        }
    }
}
118
119pub fn ensure_terraphim_block_ids(markdown: &str) -> Result<String, MarkdownParserError> {
125 let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
126 let mut edits: Vec<Edit> = Vec::new();
127 ensure_block_ids_in_children(&ast, markdown, &mut edits, ParentKind::Other);
128
129 if edits.is_empty() {
130 return Ok(markdown.to_string());
131 }
132
133 edits.sort_by(|a, b| b.range.start.cmp(&a.range.start));
135 let mut out = markdown.to_string();
136 for edit in edits {
137 out.replace_range(edit.range, &edit.replacement);
138 }
139 Ok(out)
140}
141
142pub fn normalize_markdown(markdown: &str) -> Result<NormalizedMarkdown, MarkdownParserError> {
144 let normalized = ensure_terraphim_block_ids(markdown)?;
145 let blocks = extract_blocks(&normalized)?;
146 Ok(NormalizedMarkdown {
147 markdown: normalized,
148 blocks,
149 })
150}
151
152pub fn blocks_to_documents(source_id: &str, normalized: &NormalizedMarkdown) -> Vec<Document> {
154 normalized
155 .blocks
156 .iter()
157 .map(|block| {
158 let block_id = block.id.to_string();
159 let id = format!("{source_id}#{block_id}");
160 let body = strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()])
161 .trim()
162 .to_string();
163 let title = first_nonempty_line(&body).unwrap_or_else(|| "Untitled".to_string());
164 Document {
165 id,
166 url: source_id.to_string(),
167 title,
168 body,
169 description: None,
170 summarization: None,
171 stub: None,
172 tags: None,
173 rank: None,
174 source_haystack: None,
175 doc_type: DocumentType::KgEntry,
176 synonyms: None,
177 route: None,
178 priority: None,
179 }
180 })
181 .collect()
182}
183
/// Context passed down the tree walk: whether the nodes being visited sit
/// inside a list item.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParentKind {
    /// The enclosing context is a list item; the first paragraph seen in each
    /// child list under this context is not treated as a separate block.
    ListItem,
    /// Any other context (the root, or containers not nested in a list item).
    Other,
}
189
190fn ensure_block_ids_in_children(
191 node: &Node,
192 source: &str,
193 edits: &mut Vec<Edit>,
194 parent: ParentKind,
195) {
196 match node {
197 Node::Root(root) => {
198 ensure_block_ids_in_list(&root.children, source, edits, ParentKind::Other)
199 }
200 Node::Blockquote(bq) => ensure_block_ids_in_list(&bq.children, source, edits, parent),
201 Node::List(list) => ensure_block_ids_in_list(&list.children, source, edits, parent),
202 Node::ListItem(li) => {
203 if let Some(pos) = node.position() {
204 ensure_list_item_inline_id(source, pos.start.offset, edits);
205 }
206 ensure_block_ids_in_list(&li.children, source, edits, ParentKind::ListItem);
207 }
208 _ => {
209 if let Some(children) = children(node) {
210 ensure_block_ids_in_list(children, source, edits, parent);
211 }
212 }
213 }
214}
215
216fn ensure_block_ids_in_list(
217 children: &[Node],
218 source: &str,
219 edits: &mut Vec<Edit>,
220 parent: ParentKind,
221) {
222 let mut first_direct_paragraph_in_list_item = false;
223
224 for (idx, child) in children.iter().enumerate() {
225 match child {
226 Node::ListItem(_) => ensure_block_ids_in_children(child, source, edits, parent),
227 Node::Paragraph(_) => {
228 if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
231 first_direct_paragraph_in_list_item = true;
232 } else if let Some(pos) = child.position() {
233 let has_prev_block_id = idx
234 .checked_sub(1)
235 .and_then(|prev| parse_block_id_from_html_node(&children[prev]))
236 .is_some();
237 if !has_prev_block_id {
238 edits.push(insert_paragraph_id_comment(source, pos.start.offset));
239 }
240 }
241 }
242 _ => ensure_block_ids_in_children(child, source, edits, parent),
243 }
244 }
245}
246
247fn insert_paragraph_id_comment(source: &str, paragraph_start: usize) -> Edit {
248 let (line_start, prefix) = line_prefix_at(source, paragraph_start);
249 let id = Ulid::new();
250 Edit::insert(
251 line_start,
252 format!("{prefix}<!-- terraphim:block-id:{id} -->\n"),
253 )
254}
255
256fn ensure_list_item_inline_id(source: &str, list_item_start: usize, edits: &mut Vec<Edit>) {
257 let (line_start, line_end) = line_bounds_at(source, list_item_start);
258 let line = &source[line_start..line_end];
259
260 if let Some((comment_start, comment_end, parsed)) = find_inline_block_id_comment(line) {
261 if parsed.is_some() {
262 return;
263 }
264
265 let replacement = format!("<!-- terraphim:block-id:{} -->", Ulid::new());
267 edits.push(Edit {
268 range: (line_start + comment_start)..(line_start + comment_end),
269 replacement,
270 });
271 return;
272 }
273
274 if let Some(insert_at) = list_item_inline_insert_point(source, list_item_start) {
276 let trailing_space = match source.as_bytes().get(insert_at) {
277 None | Some(b'\n') | Some(b'\r') => "",
278 _ => " ",
279 };
280 edits.push(Edit::insert(
281 insert_at,
282 format!(
283 "<!-- terraphim:block-id:{} -->{trailing_space}",
284 Ulid::new()
285 ),
286 ));
287 }
288}
289
/// Finds the byte offset at which an inline block-id comment should be
/// inserted for the list item beginning at `list_item_start`.
///
/// Scanning from `list_item_start`, the function skips blanks and `"> "`
/// blockquote markers, then requires a list marker: one of `-`, `*`, `+`, or
/// a run of digits followed by `.` or `)`. The marker must be followed by a
/// space or tab. A task checkbox (`[ ]`, `[x]`, `[X]`) followed by a space or
/// tab is skipped as well. Returns `None` when no such marker is found.
fn list_item_inline_insert_point(source: &str, list_item_start: usize) -> Option<usize> {
    let bytes = source.as_bytes();
    let at = |idx: usize| bytes.get(idx).copied();
    let is_blank = |byte: Option<u8>| matches!(byte, Some(b' ' | b'\t'));

    // Skip indentation and any number of "> " blockquote prefixes.
    let mut pos = list_item_start;
    loop {
        while is_blank(at(pos)) {
            pos += 1;
        }
        if at(pos) == Some(b'>') && at(pos + 1) == Some(b' ') {
            pos += 2;
        } else {
            break;
        }
    }

    // Consume the list marker itself.
    match at(pos)? {
        b'-' | b'*' | b'+' => pos += 1,
        b'0'..=b'9' => {
            while matches!(at(pos), Some(b'0'..=b'9')) {
                pos += 1;
            }
            if !matches!(at(pos), Some(b'.' | b')')) {
                return None;
            }
            pos += 1;
        }
        _ => return None,
    }

    // Exactly one blank after the marker is consumed; it is mandatory.
    if !is_blank(at(pos)) {
        return None;
    }
    pos += 1;

    // Keep a task-list checkbox adjacent to the marker by inserting after it.
    let has_checkbox = at(pos) == Some(b'[')
        && matches!(at(pos + 1), Some(b' ' | b'x' | b'X'))
        && at(pos + 2) == Some(b']')
        && is_blank(at(pos + 3));

    Some(if has_checkbox { pos + 4 } else { pos })
}
345
346fn extract_blocks(markdown: &str) -> Result<Vec<Block>, MarkdownParserError> {
347 let ast = markdown::to_mdast(markdown, &ParseOptions::gfm())?;
348 let mut blocks = Vec::new();
349 extract_blocks_from_children(&ast, markdown, &mut blocks, ParentKind::Other)?;
350
351 let mut seen = HashSet::new();
353 for b in &blocks {
354 let id = b.id.to_string();
355 if !seen.insert(id) {
356 return Err(MarkdownParserError::MissingOrInvalidBlockId(
358 b.kind,
359 b.span.start,
360 ));
361 }
362 }
363
364 Ok(blocks)
365}
366
367fn extract_blocks_from_children(
368 node: &Node,
369 source: &str,
370 blocks: &mut Vec<Block>,
371 parent: ParentKind,
372) -> Result<(), MarkdownParserError> {
373 match node {
374 Node::Root(root) => {
375 extract_blocks_from_list(&root.children, source, blocks, ParentKind::Other)?;
376 }
377 Node::Blockquote(bq) => {
378 extract_blocks_from_list(&bq.children, source, blocks, parent)?;
379 }
380 Node::List(list) => {
381 extract_blocks_from_list(&list.children, source, blocks, parent)?;
382 }
383 Node::ListItem(li) => {
384 let Some(pos) = node.position() else {
385 return Ok(());
386 };
387
388 let Some((id, id_span)) = extract_list_item_id(source, pos.start.offset) else {
389 return Err(MarkdownParserError::MissingOrInvalidBlockId(
390 BlockKind::ListItem,
391 pos.start.offset,
392 ));
393 };
394 let start = line_bounds_at(source, pos.start.offset).0;
395 let end = pos.end.offset;
396 blocks.push(Block {
397 id,
398 kind: BlockKind::ListItem,
399 span: start..end,
400 id_span,
401 });
402 extract_blocks_from_list(&li.children, source, blocks, ParentKind::ListItem)?;
403 }
404 _ => {
405 if let Some(children) = children(node) {
406 extract_blocks_from_list(children, source, blocks, parent)?;
407 }
408 }
409 }
410 Ok(())
411}
412
413fn extract_blocks_from_list(
414 children: &[Node],
415 source: &str,
416 blocks: &mut Vec<Block>,
417 parent: ParentKind,
418) -> Result<(), MarkdownParserError> {
419 let mut first_direct_paragraph_in_list_item = false;
420
421 for (idx, child) in children.iter().enumerate() {
422 match child {
423 Node::ListItem(_) => extract_blocks_from_children(child, source, blocks, parent)?,
424 Node::Paragraph(_) => {
425 if parent == ParentKind::ListItem && !first_direct_paragraph_in_list_item {
426 first_direct_paragraph_in_list_item = true;
427 continue;
428 }
429
430 let Some(pos) = child.position() else {
431 continue;
432 };
433
434 let Some((id, anchor_span)) = idx
435 .checked_sub(1)
436 .and_then(|prev| {
437 parse_block_id_from_html_node_with_span(source, &children[prev])
438 })
439 .and_then(|(id, span)| id.map(|id| (id, span)))
440 else {
441 return Err(MarkdownParserError::MissingOrInvalidBlockId(
442 BlockKind::Paragraph,
443 pos.start.offset,
444 ));
445 };
446
447 blocks.push(Block {
448 id,
449 kind: BlockKind::Paragraph,
450 span: anchor_span.start..pos.end.offset,
451 id_span: anchor_span,
452 })
453 }
454 _ => extract_blocks_from_children(child, source, blocks, parent)?,
455 }
456 }
457
458 Ok(())
459}
460
461fn extract_list_item_id(source: &str, list_item_start: usize) -> Option<(Ulid, Range<usize>)> {
462 let (line_start, line_end) = line_bounds_at(source, list_item_start);
463 let line = &source[line_start..line_end];
464 let (comment_start, comment_end, parsed) = find_inline_block_id_comment(line)?;
465 let id = parsed?;
466 Some((id, (line_start + comment_start)..(line_start + comment_end)))
467}
468
469fn parse_block_id_from_html_node(node: &Node) -> Option<Ulid> {
470 match node {
471 Node::Html(val) => parse_block_id_comment(&val.value),
472 _ => None,
473 }
474}
475
476fn parse_block_id_from_html_node_with_span(
477 source: &str,
478 node: &Node,
479) -> Option<(Option<Ulid>, Range<usize>)> {
480 let Node::Html(val) = node else { return None };
481 let id = parse_block_id_comment(&val.value);
482
483 let Some(pos) = node.position() else {
484 return Some((id, 0..0));
485 };
486
487 let (line_start, line_end) = line_bounds_at(source, pos.start.offset);
488 Some((id, line_start..line_end))
489}
490
491fn parse_block_id_comment(raw_html: &str) -> Option<Ulid> {
492 let html = raw_html.trim();
493 let inner = html
494 .strip_prefix("<!--")
495 .and_then(|s| s.strip_suffix("-->"))?;
496 let inner = inner.trim();
497 let id_str = inner.strip_prefix(TERRAPHIM_BLOCK_ID_PREFIX)?;
498 Ulid::from_str(id_str.trim()).ok()
499}
500
501fn find_inline_block_id_comment(line: &str) -> Option<(usize, usize, Option<Ulid>)> {
502 let start = line.find("<!--")?;
503 let marker = line[start..].find(TERRAPHIM_BLOCK_ID_PREFIX)? + start;
504 let end = line[marker..].find("-->")? + marker + 3;
505
506 let comment_start = start;
507 let comment_end = end;
508 let comment = &line[comment_start..comment_end];
509 Some((comment_start, comment_end, parse_block_id_comment(comment)))
510}
511
/// Returns the byte range `(start, end)` of the line containing `offset`.
///
/// `start` is the position just after the preceding `'\n'` (or 0), and `end`
/// is the position of the next `'\n'` at or after `offset` (or the end of
/// `source`), so the newline itself is excluded.
///
/// # Panics
///
/// Panics if `offset` is past the end of `source` or not on a character
/// boundary.
fn line_bounds_at(source: &str, offset: usize) -> (usize, usize) {
    let (before, after) = source.split_at(offset);
    let start = before.rfind('\n').map_or(0, |newline| newline + 1);
    let end = after
        .find('\n')
        .map_or(source.len(), |newline| offset + newline);
    (start, end)
}
520
521fn line_prefix_at(source: &str, offset: usize) -> (usize, String) {
522 let (line_start, _line_end) = line_bounds_at(source, offset);
523 let prefix = &source[line_start..offset];
524 (line_start, prefix.to_string())
525}
526
527fn children(node: &Node) -> Option<&Vec<Node>> {
528 match node {
529 Node::Root(root) => Some(&root.children),
530 Node::Blockquote(bq) => Some(&bq.children),
531 Node::List(list) => Some(&list.children),
532 Node::ListItem(li) => Some(&li.children),
533 Node::Paragraph(p) => Some(&p.children),
534 Node::Heading(h) => Some(&h.children),
535 _ => None,
536 }
537}
538
539fn strip_terraphim_block_id_comments(text: &str) -> String {
540 let mut out = String::with_capacity(text.len());
541 for line in text.lines() {
542 let mut remaining = line;
543 let mut cleaned = String::new();
544 loop {
545 let Some((start, end, _)) = find_inline_block_id_comment(remaining) else {
546 cleaned.push_str(remaining);
547 break;
548 };
549 cleaned.push_str(&remaining[..start]);
550 remaining = &remaining[end..];
551 }
552
553 if cleaned.trim().is_empty() {
554 continue;
555 }
556
557 out.push_str(cleaned.trim_end());
558 out.push('\n')
559 }
560 out
561}
562
/// Returns the first line of `text` that is not blank, trimmed and cut to at
/// most 80 characters, or `None` when every line is blank.
fn first_nonempty_line(text: &str) -> Option<String> {
    for raw in text.lines() {
        let line = raw.trim();
        if !line.is_empty() {
            // Truncate by characters, not bytes, to stay on valid boundaries.
            return Some(line.chars().take(80).collect());
        }
    }
    None
}
569
#[cfg(test)]
mod tests {
    use super::*;

    /// Counts the lines of `s` that contain a block-id comment opener.
    fn count_block_ids(s: &str) -> usize {
        s.lines()
            .filter(|l| l.contains("<!-- terraphim:block-id:"))
            .count()
    }

    // Two top-level paragraphs should each receive an id, with their text kept.
    #[test]
    fn inserts_paragraph_ids() {
        let input = "Hello world\n\nSecond paragraph\n";
        let out = ensure_terraphim_block_ids(input).unwrap();
        assert_eq!(count_block_ids(&out), 2);
        assert!(out.contains("Hello world"));
        assert!(out.contains("Second paragraph"));
    }

    // List items get their id inline, directly after the list marker.
    #[test]
    fn inserts_list_item_inline_ids() {
        let input = "- first\n- second\n";
        let out = ensure_terraphim_block_ids(input).unwrap();
        assert_eq!(count_block_ids(&out), 2);
        assert!(out.contains("- <!-- terraphim:block-id:"));
    }

    // A list item followed by a paragraph yields at least one block for each.
    #[test]
    fn normalize_returns_blocks() {
        let input = "- item\n\nPara\n";
        let normalized = normalize_markdown(input).unwrap();
        assert!(normalized.blocks.len() >= 2);
    }

    #[test]
    fn extract_first_heading_h1() {
        let input = "# Bun Package Manager\n\nsynonyms:: npm, yarn\n";
        assert_eq!(
            extract_first_heading(input),
            Some("Bun Package Manager".to_string())
        );
    }

    // A level-2 heading appearing earlier must not be picked.
    #[test]
    fn extract_first_heading_skips_h2() {
        let input = "## Not This\n\n# This One\n";
        assert_eq!(extract_first_heading(input), Some("This One".to_string()));
    }

    #[test]
    fn extract_first_heading_none_when_absent() {
        let input = "Just some text\n\n## Only H2\n";
        assert_eq!(extract_first_heading(input), None);
    }

    // Inline code contributes its literal text, without the backticks.
    #[test]
    fn extract_first_heading_with_inline_code() {
        let input = "# The `bun` Runtime\n";
        assert_eq!(
            extract_first_heading(input),
            Some("The bun Runtime".to_string())
        );
    }
}