// terraphim_markdown_parser/chunk.rs
use ulid::Ulid;
2
3use crate::NormalizedMarkdown;
4use crate::heading::{HeadingNode, HeadingTree, SectionType};
5
/// A retrieval-ready slice of a document: the text of all normalized
/// blocks that sit directly under one heading, plus addressing metadata.
#[derive(Debug, Clone)]
pub struct ContentChunk {
    /// Composite identifier: `{content_id}#{section_path}#{first_block_ulid}`.
    pub chunk_id: String,
    /// Identifier of the source document this chunk was cut from.
    pub content_id: String,
    /// ULIDs of the normalized blocks whose text makes up this chunk.
    pub block_ids: Vec<Ulid>,
    /// First component of `section_path` (the top-level heading number), if any.
    pub chapter_number: Option<u8>,
    /// Dotted numeric heading path, e.g. `"1.2"` for chapter 1, second subsection.
    pub section_path: String,
    /// Classification of the heading this chunk belongs to.
    pub chunk_type: SectionType,
    /// Concatenated block text, with terraphim block-id comments stripped,
    /// blocks joined by blank lines.
    pub text: String,
    /// Whitespace-delimited word count of `text` (approximate token count).
    pub token_count: u32,
}
17
/// Mutable bookkeeping threaded through the recursive chunk walk.
struct ChunkState {
    // Running count of top-level headings seen so far; becomes the first
    // component of each chunk's section path.
    chapter_counter: u8,
}
21
22pub fn chunk_by_headings(
23 content_id: &str,
24 tree: &HeadingTree,
25 normalized: &NormalizedMarkdown,
26) -> Vec<ContentChunk> {
27 let mut chunks = Vec::new();
28 let mut state = ChunkState { chapter_counter: 0 };
29
30 for root in &tree.roots {
31 collect_chunks(
32 root,
33 content_id,
34 normalized,
35 &mut chunks,
36 &mut state,
37 &mut Vec::new(),
38 0,
39 );
40 }
41
42 chunks
43}
44
45fn collect_chunks(
46 node: &HeadingNode,
47 content_id: &str,
48 normalized: &NormalizedMarkdown,
49 chunks: &mut Vec<ContentChunk>,
50 state: &mut ChunkState,
51 path: &mut Vec<u8>,
52 sibling_index: u8,
53) {
54 let is_chapter_root = path.is_empty();
55
56 if is_chapter_root {
57 state.chapter_counter += 1;
58 path.push(state.chapter_counter);
59 } else {
60 path.push(sibling_index);
61 }
62
63 let section_path = format_path(path);
64
65 if !node.blocks.is_empty() {
66 let text = extract_block_text(&node.blocks, normalized);
67 let token_count = text.split_whitespace().count() as u32;
68
69 let chunk_id = format!(
70 "{}#{}#{}",
71 content_id,
72 section_path,
73 node.blocks
74 .first()
75 .map(|id| id.to_string())
76 .unwrap_or_default()
77 );
78
79 chunks.push(ContentChunk {
80 chunk_id,
81 content_id: content_id.to_string(),
82 block_ids: node.blocks.clone(),
83 chapter_number: path.first().copied(),
84 section_path: section_path.clone(),
85 chunk_type: node.section_type.clone(),
86 text,
87 token_count,
88 });
89 }
90
91 for (i, child) in node.children.iter().enumerate() {
92 collect_chunks(
93 child,
94 content_id,
95 normalized,
96 chunks,
97 state,
98 path,
99 i as u8 + 1,
100 );
101 }
102
103 path.pop();
104}
105
/// Renders a numeric section path as a dot-separated string,
/// e.g. `[1, 2]` -> `"1.2"`. An empty slice yields `""`.
fn format_path(components: &[u8]) -> String {
    let mut rendered = String::new();
    for (i, component) in components.iter().enumerate() {
        if i > 0 {
            rendered.push('.');
        }
        rendered.push_str(&component.to_string());
    }
    rendered
}
113
114fn extract_block_text(block_ids: &[Ulid], normalized: &NormalizedMarkdown) -> String {
115 let mut parts = Vec::new();
116 for id in block_ids {
117 if let Some(block) = normalized.blocks.iter().find(|b| b.id == *id) {
118 let text =
119 crate::strip_terraphim_block_id_comments(&normalized.markdown[block.span.clone()]);
120 let trimmed = text.trim();
121 if !trimmed.is_empty() {
122 parts.push(trimmed.to_string());
123 }
124 }
125 }
126 parts.join("\n\n")
127}
128
#[cfg(test)]
mod tests {
    use super::*;
    use crate::heading::{SectionConfig, build_heading_tree, classify_sections};
    use crate::normalize_markdown;

    #[test]
    fn chunk_single_chapter() {
        let md = "# Chapter 1\n\nFirst paragraph.\n\nSecond paragraph.\n";
        let norm = normalize_markdown(md).unwrap();
        let tree = build_heading_tree(&norm).unwrap();
        let out = chunk_by_headings("test-doc", &tree, &norm);

        assert_eq!(out.len(), 1);
        let only = &out[0];
        assert_eq!(only.content_id, "test-doc");
        assert_eq!(only.chapter_number, Some(1));
        assert_eq!(only.section_path, "1");
        assert_eq!(only.chunk_type, SectionType::Main);
        assert!(only.text.contains("First paragraph"));
        assert!(only.text.contains("Second paragraph"));
    }

    #[test]
    fn chunk_preserves_block_ulids() {
        let md = "# Chapter\n\nParagraph one\n\nParagraph two\n";
        let norm = normalize_markdown(md).unwrap();
        let expected_ids: Vec<Ulid> = norm.blocks.iter().map(|b| b.id).collect();

        let tree = build_heading_tree(&norm).unwrap();
        let out = chunk_by_headings("doc", &tree, &norm);

        assert_eq!(out[0].block_ids.len(), 2);
        assert_eq!(out[0].block_ids, expected_ids);
    }

    #[test]
    fn chunk_composite_ids() {
        let md = "# Chapter\n\nText\n";
        let norm = normalize_markdown(md).unwrap();
        let tree = build_heading_tree(&norm).unwrap();
        let out = chunk_by_headings("my-doc", &tree, &norm);

        assert!(out[0].chunk_id.starts_with("my-doc#1#"));
    }

    #[test]
    fn chunk_nested_headings() {
        let md = "# Chapter\n\nIntro\n\n## Section A\n\nText A\n\n## Section B\n\nText B\n";
        let norm = normalize_markdown(md).unwrap();
        let tree = build_heading_tree(&norm).unwrap();
        let out = chunk_by_headings("book", &tree, &norm);

        assert_eq!(out.len(), 3);
        let paths: Vec<&str> = out.iter().map(|c| c.section_path.as_str()).collect();
        assert_eq!(paths, ["1", "1.1", "1.2"]);
    }

    #[test]
    fn chunk_with_section_types() {
        let md =
            "# Chapter\n\nIntro\n\n## Power Selling: Tips\n\nTip\n\n## Selling U\n\nCareer\n";
        let norm = normalize_markdown(md).unwrap();
        let mut tree = build_heading_tree(&norm).unwrap();
        classify_sections(&mut tree, &SectionConfig::textbook_default());
        let out = chunk_by_headings("book", &tree, &norm);

        assert_eq!(out[0].chunk_type, SectionType::Main);
        assert_eq!(
            out[1].chunk_type,
            SectionType::Sidebar("PowerSelling".to_string())
        );
        assert_eq!(out[2].chunk_type, SectionType::Career);
    }

    #[test]
    fn chunk_token_count() {
        let md = "# Chapter\n\nOne two three four five.\n";
        let norm = normalize_markdown(md).unwrap();
        let tree = build_heading_tree(&norm).unwrap();
        let out = chunk_by_headings("doc", &tree, &norm);

        assert!(out[0].token_count > 0);
    }

    #[test]
    fn chunk_multiple_chapters() {
        let md = "# Chapter 1\n\nText 1\n\n# Chapter 2\n\nText 2\n\n# Chapter 3\n\nText 3\n";
        let norm = normalize_markdown(md).unwrap();
        let tree = build_heading_tree(&norm).unwrap();
        let out = chunk_by_headings("book", &tree, &norm);

        assert_eq!(out.len(), 3);
        for (i, chunk) in out.iter().enumerate() {
            assert_eq!(chunk.chapter_number, Some(i as u8 + 1));
        }
    }
}
226}