terraphim-markdown-parser 1.19.0

use ulid::Ulid;

use crate::NormalizedMarkdown;
use crate::heading::{HeadingNode, HeadingTree, SectionType};

/// A contiguous span of block content owned by a single heading node.
#[derive(Debug, Clone)]
pub struct ContentChunk {
    /// Composite id: `{content_id}#{section_path}#{first_block_ulid}`.
    pub chunk_id: String,
    /// Identifier of the source document.
    pub content_id: String,
    /// ULIDs of the normalized blocks that make up this chunk.
    pub block_ids: Vec<Ulid>,
    /// Top-level chapter number (1-based), taken from the first path component.
    pub chapter_number: Option<u8>,
    /// Dotted heading path, e.g. "1.2" for the second section of chapter 1.
    pub section_path: String,
    /// Classification of the owning heading (main text, sidebar, career, ...).
    pub chunk_type: SectionType,
    /// Chunk text with Terraphim block-id comments stripped.
    pub text: String,
    /// Approximate size as a whitespace-separated word count.
    pub token_count: u32,
}
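
// Illustratively (mirroring `chunk_single_chapter` in the tests below): for
// the input "# Chapter 1\n\nFirst paragraph.\n" the single emitted chunk
// carries chapter_number == Some(1), section_path == "1", and
// chunk_type == SectionType::Main.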

/// Mutable traversal state shared across roots: numbers top-level chapters.
struct ChunkState {
    chapter_counter: u8,
}

/// Walks the heading tree depth-first and emits one `ContentChunk` per
/// heading node that owns at least one block.
pub fn chunk_by_headings(
    content_id: &str,
    tree: &HeadingTree,
    normalized: &NormalizedMarkdown,
) -> Vec<ContentChunk> {
    let mut chunks = Vec::new();
    let mut state = ChunkState { chapter_counter: 0 };

    for root in &tree.roots {
        collect_chunks(
            root,
            content_id,
            normalized,
            &mut chunks,
            &mut state,
            &mut Vec::new(),
            0,
        );
    }

    chunks
}
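
// A minimal end-to-end sketch of the intended call pattern (a sketch,
// mirroring the tests below): `crate::normalize_markdown` and
// `crate::heading::build_heading_tree` are the same entry points the tests
// import, and the document id "doc" is illustrative.
#[allow(dead_code)]
fn example_pipeline() -> Vec<ContentChunk> {
    let input = "# Chapter 1\n\nFirst paragraph.\n";
    let normalized = crate::normalize_markdown(input).expect("normalize");
    let tree = crate::heading::build_heading_tree(&normalized).expect("build tree");
    chunk_by_headings("doc", &tree, &normalized)
}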

/// Depth-first walk of one heading subtree. `path` accumulates the numeric
/// section path: top-level nodes consume the shared chapter counter, while
/// nested nodes push their 1-based `sibling_index`.
fn collect_chunks(
    node: &HeadingNode,
    content_id: &str,
    normalized: &NormalizedMarkdown,
    chunks: &mut Vec<ContentChunk>,
    state: &mut ChunkState,
    path: &mut Vec<u8>,
    sibling_index: u8,
) {
    let is_chapter_root = path.is_empty();

    if is_chapter_root {
        // Top-level headings get a fresh, document-wide chapter number.
        state.chapter_counter += 1;
        path.push(state.chapter_counter);
    } else {
        // Nested headings use their 1-based sibling index.
        path.push(sibling_index);
    }

    let section_path = format_path(path);

    if !node.blocks.is_empty() {
        let text = extract_block_text(&node.blocks, normalized);
        // Approximation: whitespace-separated word count, not model tokens.
        let token_count = text.split_whitespace().count() as u32;

        let chunk_id = format!(
            "{}#{}#{}",
            content_id,
            section_path,
            node.blocks
                .first()
                .map(|id| id.to_string())
                .unwrap_or_default()
        );

        chunks.push(ContentChunk {
            chunk_id,
            content_id: content_id.to_string(),
            block_ids: node.blocks.clone(),
            chapter_number: path.first().copied(),
            section_path: section_path.clone(),
            chunk_type: node.section_type.clone(),
            text,
            token_count,
        });
    }

    for (i, child) in node.children.iter().enumerate() {
        collect_chunks(
            child,
            content_id,
            normalized,
            chunks,
            state,
            path,
            i as u8 + 1,
        );
    }

    // Unwind this node's path component before returning to the parent.
    path.pop();
}

/// Joins path components with dots, e.g. `[1, 2]` becomes "1.2".
fn format_path(components: &[u8]) -> String {
    components
        .iter()
        .map(|c| c.to_string())
        .collect::<Vec<_>>()
        .join(".")
}

/// Concatenates the text of the given blocks in order, separated by blank
/// lines, stripping Terraphim block-id comments and skipping empty blocks.
fn extract_block_text(block_ids: &[Ulid], normalized: &NormalizedMarkdown) -> String {
    let mut parts = Vec::new();
    for id in block_ids {
        if let Some(block) = normalized.blocks.iter().find(|b| b.id == *id) {
            let text =
                crate::strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()]);
            let trimmed = text.trim();
            if !trimmed.is_empty() {
                parts.push(trimmed.to_string());
            }
        }
    }
    parts.join("\n\n")
}
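
// Sketch of the joining behavior (hypothetical block bodies for
// illustration): two paragraph blocks whose trimmed bodies are "First." and
// "Second." yield the single string "First.\n\nSecond.", i.e.
// blank-line-separated text with Terraphim block-id comments removed.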

#[cfg(test)]
mod tests {
    use super::*;
    use crate::heading::{SectionConfig, build_heading_tree, classify_sections};
    use crate::normalize_markdown;

    #[test]
    fn chunk_single_chapter() {
        let input = "# Chapter 1\n\nFirst paragraph.\n\nSecond paragraph.\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("test-doc", &tree, &normalized);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content_id, "test-doc");
        assert_eq!(chunks[0].chapter_number, Some(1));
        assert_eq!(chunks[0].section_path, "1");
        assert_eq!(chunks[0].chunk_type, SectionType::Main);
        assert!(chunks[0].text.contains("First paragraph"));
        assert!(chunks[0].text.contains("Second paragraph"));
    }

    #[test]
    fn chunk_preserves_block_ulids() {
        let input = "# Chapter\n\nParagraph one\n\nParagraph two\n";
        let normalized = normalize_markdown(input).unwrap();
        let original_ids: Vec<Ulid> = normalized.blocks.iter().map(|b| b.id).collect();

        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("doc", &tree, &normalized);

        assert_eq!(chunks[0].block_ids.len(), 2);
        assert_eq!(chunks[0].block_ids, original_ids);
    }

    #[test]
    fn chunk_composite_ids() {
        let input = "# Chapter\n\nText\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("my-doc", &tree, &normalized);

        assert!(chunks[0].chunk_id.starts_with("my-doc#1#"));
    }

    #[test]
    fn chunk_nested_headings() {
        let input = "# Chapter\n\nIntro\n\n## Section A\n\nText A\n\n## Section B\n\nText B\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("book", &tree, &normalized);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].section_path, "1");
        assert_eq!(chunks[1].section_path, "1.1");
        assert_eq!(chunks[2].section_path, "1.2");
    }

    #[test]
    fn chunk_with_section_types() {
        let input =
            "# Chapter\n\nIntro\n\n## Power Selling: Tips\n\nTip\n\n## Selling U\n\nCareer\n";
        let normalized = normalize_markdown(input).unwrap();
        let mut tree = build_heading_tree(&normalized).unwrap();
        classify_sections(&mut tree, &SectionConfig::textbook_default());
        let chunks = chunk_by_headings("book", &tree, &normalized);

        assert_eq!(chunks[0].chunk_type, SectionType::Main);
        assert_eq!(
            chunks[1].chunk_type,
            SectionType::Sidebar("PowerSelling".to_string())
        );
        assert_eq!(chunks[2].chunk_type, SectionType::Career);
    }

    #[test]
    fn chunk_token_count() {
        let input = "# Chapter\n\nOne two three four five.\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("doc", &tree, &normalized);

        assert!(chunks[0].token_count > 0);
    }

    #[test]
    fn chunk_multiple_chapters() {
        let input = "# Chapter 1\n\nText 1\n\n# Chapter 2\n\nText 2\n\n# Chapter 3\n\nText 3\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("book", &tree, &normalized);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].chapter_number, Some(1));
        assert_eq!(chunks[1].chapter_number, Some(2));
        assert_eq!(chunks[2].chapter_number, Some(3));
    }
}