// terraphim_markdown_parser/chunk.rs

1use ulid::Ulid;
2
3use crate::NormalizedMarkdown;
4use crate::heading::{HeadingNode, HeadingTree, SectionType};
5
/// A retrieval-ready chunk of markdown content, grouped under one heading.
#[derive(Debug, Clone)]
pub struct ContentChunk {
    /// Composite identifier: `{content_id}#{section_path}#{first_block_ulid}`.
    pub chunk_id: String,
    /// Identifier of the source document this chunk belongs to.
    pub content_id: String,
    /// ULIDs of the normalized blocks whose text makes up this chunk.
    pub block_ids: Vec<Ulid>,
    /// 1-based chapter number (first component of `section_path`), if any.
    pub chapter_number: Option<u8>,
    /// Dotted heading path, e.g. `"1.2"` for the second section of chapter 1.
    pub section_path: String,
    /// Classification of the section this chunk came from (main, sidebar, ...).
    pub chunk_type: SectionType,
    /// Concatenated block text, blank-line separated, block-id comments stripped.
    pub text: String,
    /// Whitespace-separated word count of `text` (approximate token measure).
    pub token_count: u32,
}
17
/// Mutable traversal state shared across all heading roots; assigns
/// sequential chapter numbers to top-level headings.
struct ChunkState {
    // NOTE(review): u8 caps a document at 255 chapters — confirm this bound
    // is acceptable for the expected inputs.
    chapter_counter: u8,
}
21
22pub fn chunk_by_headings(
23    content_id: &str,
24    tree: &HeadingTree,
25    normalized: &NormalizedMarkdown,
26) -> Vec<ContentChunk> {
27    let mut chunks = Vec::new();
28    let mut state = ChunkState { chapter_counter: 0 };
29
30    for root in &tree.roots {
31        collect_chunks(
32            root,
33            content_id,
34            normalized,
35            &mut chunks,
36            &mut state,
37            &mut Vec::new(),
38            0,
39        );
40    }
41
42    chunks
43}
44
/// Recursive worker for `chunk_by_headings`.
///
/// `path` holds the dotted-path components accumulated so far; this function
/// pushes its own component on entry and pops it before returning, leaving the
/// caller's buffer unchanged. `sibling_index` is the 1-based position among
/// siblings (ignored for roots, which draw from `state.chapter_counter`).
fn collect_chunks(
    node: &HeadingNode,
    content_id: &str,
    normalized: &NormalizedMarkdown,
    chunks: &mut Vec<ContentChunk>,
    state: &mut ChunkState,
    path: &mut Vec<u8>,
    sibling_index: u8,
) {
    // An empty path means this node is a top-level heading (chapter root).
    let is_chapter_root = path.is_empty();

    if is_chapter_root {
        // NOTE(review): u8 counter overflows past 255 chapters — confirm bound.
        state.chapter_counter += 1;
        path.push(state.chapter_counter);
    } else {
        path.push(sibling_index);
    }

    let section_path = format_path(path);

    // Only headings that directly own blocks emit a chunk; pure container
    // headings still contribute a component to their descendants' paths.
    if !node.blocks.is_empty() {
        let text = extract_block_text(&node.blocks, normalized);
        // Approximate token count: whitespace-separated words.
        let token_count = text.split_whitespace().count() as u32;

        // Composite id: document id, dotted section path, first block's ULID.
        let chunk_id = format!(
            "{}#{}#{}",
            content_id,
            section_path,
            node.blocks
                .first()
                .map(|id| id.to_string())
                .unwrap_or_default()
        );

        chunks.push(ContentChunk {
            chunk_id,
            content_id: content_id.to_string(),
            block_ids: node.blocks.clone(),
            // First path component is always the chapter number.
            chapter_number: path.first().copied(),
            section_path: section_path.clone(),
            chunk_type: node.section_type.clone(),
            text,
            token_count,
        });
    }

    // Children are numbered 1-based within their parent.
    // NOTE(review): `i as u8` truncates past 255 siblings — confirm bound.
    for (i, child) in node.children.iter().enumerate() {
        collect_chunks(
            child,
            content_id,
            normalized,
            chunks,
            state,
            path,
            i as u8 + 1,
        );
    }

    path.pop();
}
105
/// Joins path components with `.` into a dotted section path,
/// e.g. `[1, 2]` -> `"1.2"`. Returns an empty string for an empty slice.
fn format_path(components: &[u8]) -> String {
    use std::fmt::Write;

    // Build in place: avoids the intermediate Vec<String> (one heap String
    // per component) that a map/collect/join pipeline would allocate.
    let mut path = String::with_capacity(components.len() * 4);
    for (i, component) in components.iter().enumerate() {
        if i > 0 {
            path.push('.');
        }
        // Writing into a String cannot fail.
        let _ = write!(path, "{component}");
    }
    path
}
113
114fn extract_block_text(block_ids: &[Ulid], normalized: &NormalizedMarkdown) -> String {
115    let mut parts = Vec::new();
116    for id in block_ids {
117        if let Some(block) = normalized.blocks.iter().find(|b| b.id == *id) {
118            let text =
119                crate::strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()]);
120            let trimmed = text.trim();
121            if !trimmed.is_empty() {
122                parts.push(trimmed.to_string());
123            }
124        }
125    }
126    parts.join("\n\n")
127}
128
#[cfg(test)]
mod tests {
    use super::*;
    use crate::heading::{SectionConfig, build_heading_tree, classify_sections};
    use crate::normalize_markdown;

    // A single chapter with two paragraphs yields one chunk containing both.
    #[test]
    fn chunk_single_chapter() {
        let input = "# Chapter 1\n\nFirst paragraph.\n\nSecond paragraph.\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("test-doc", &tree, &normalized);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content_id, "test-doc");
        assert_eq!(chunks[0].chapter_number, Some(1));
        assert_eq!(chunks[0].section_path, "1");
        assert_eq!(chunks[0].chunk_type, SectionType::Main);
        assert!(chunks[0].text.contains("First paragraph"));
        assert!(chunks[0].text.contains("Second paragraph"));
    }

    // Chunk block_ids must carry the exact ULIDs (and order) from normalization.
    #[test]
    fn chunk_preserves_block_ulids() {
        let input = "# Chapter\n\nParagraph one\n\nParagraph two\n";
        let normalized = normalize_markdown(input).unwrap();
        let original_ids: Vec<Ulid> = normalized.blocks.iter().map(|b| b.id).collect();

        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("doc", &tree, &normalized);

        assert_eq!(chunks[0].block_ids.len(), 2);
        assert_eq!(chunks[0].block_ids, original_ids);
    }

    // chunk_id is composed as `{content_id}#{section_path}#{first_block_ulid}`.
    #[test]
    fn chunk_composite_ids() {
        let input = "# Chapter\n\nText\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("my-doc", &tree, &normalized);

        assert!(chunks[0].chunk_id.starts_with("my-doc#1#"));
    }

    // Nested H2 sections get dotted paths ("1.1", "1.2") under the chapter.
    #[test]
    fn chunk_nested_headings() {
        let input = "# Chapter\n\nIntro\n\n## Section A\n\nText A\n\n## Section B\n\nText B\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("book", &tree, &normalized);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].section_path, "1");
        assert_eq!(chunks[1].section_path, "1.1");
        assert_eq!(chunks[2].section_path, "1.2");
    }

    // Section classification performed before chunking flows into chunk_type.
    #[test]
    fn chunk_with_section_types() {
        let input =
            "# Chapter\n\nIntro\n\n## Power Selling: Tips\n\nTip\n\n## Selling U\n\nCareer\n";
        let normalized = normalize_markdown(input).unwrap();
        let mut tree = build_heading_tree(&normalized).unwrap();
        classify_sections(&mut tree, &SectionConfig::textbook_default());
        let chunks = chunk_by_headings("book", &tree, &normalized);

        assert_eq!(chunks[0].chunk_type, SectionType::Main);
        assert_eq!(
            chunks[1].chunk_type,
            SectionType::Sidebar("PowerSelling".to_string())
        );
        assert_eq!(chunks[2].chunk_type, SectionType::Career);
    }

    // token_count is a whitespace-word count; non-empty text must be > 0.
    #[test]
    fn chunk_token_count() {
        let input = "# Chapter\n\nOne two three four five.\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("doc", &tree, &normalized);

        assert!(chunks[0].token_count > 0);
    }

    // Root headings receive sequential chapter numbers: 1, 2, 3.
    #[test]
    fn chunk_multiple_chapters() {
        let input = "# Chapter 1\n\nText 1\n\n# Chapter 2\n\nText 2\n\n# Chapter 3\n\nText 3\n";
        let normalized = normalize_markdown(input).unwrap();
        let tree = build_heading_tree(&normalized).unwrap();
        let chunks = chunk_by_headings("book", &tree, &normalized);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].chapter_number, Some(1));
        assert_eq!(chunks[1].chapter_number, Some(2));
        assert_eq!(chunks[2].chapter_number, Some(3));
    }
}