argyph_parse/structural/
markdown.rs1use super::{byte_to_line_range, line_starts, NodeKind, StructuralNode};
2use pulldown_cmark::{Event, HeadingLevel, Parser, Tag, TagEnd};
3
4fn heading_level_num(level: &HeadingLevel) -> u32 {
5 match level {
6 HeadingLevel::H1 => 1,
7 HeadingLevel::H2 => 2,
8 HeadingLevel::H3 => 3,
9 HeadingLevel::H4 => 4,
10 HeadingLevel::H5 => 5,
11 HeadingLevel::H6 => 6,
12 }
13}
14
15pub fn parse(file_id: u64, source: &str) -> Vec<StructuralNode> {
17 let ls = line_starts(source);
18 let events: Vec<(Event<'_>, std::ops::Range<usize>)> =
19 Parser::new(source).into_offset_iter().collect();
20
21 struct HeadingInfo {
22 level: u32,
23 title: String,
24 start_byte: usize,
25 }
26
27 let mut headings: Vec<HeadingInfo> = Vec::new();
28 let eof = events.last().map_or(source.len(), |(_, range)| range.end);
29
30 let mut i = 0;
31 while i < events.len() {
32 if let Event::Start(Tag::Heading { level, .. }) = &events[i].0 {
33 let h_level = heading_level_num(level);
34 let start = events[i].1.start;
35 let mut title = String::new();
36 let mut j = i + 1;
37 while j < events.len() && !matches!(&events[j].0, Event::End(TagEnd::Heading(..))) {
38 match &events[j].0 {
39 Event::Text(t) | Event::Code(t) => title.push_str(t.as_ref()),
40 _ => {}
41 }
42 j += 1;
43 }
44 headings.push(HeadingInfo {
45 level: h_level,
46 title: title.clone(),
47 start_byte: start,
48 });
49 i = j + 1;
50 } else {
51 i += 1;
52 }
53 }
54
55 let mut nodes: Vec<StructuralNode> = Vec::new();
56
57 for (idx, h) in headings.iter().enumerate() {
58 let section_end = if idx + 1 < headings.len() {
59 let mut end = eof;
60 for next in &headings[idx + 1..] {
61 if next.level <= h.level {
62 end = next.start_byte;
63 break;
64 }
65 }
66 end
67 } else {
68 eof
69 };
70
71 let path = vec![h.title.clone()];
72 let id = StructuralNode::make_id(file_id, NodeKind::MdSection, &path);
73 let (line_s, line_e) = byte_to_line_range(&ls, h.start_byte, section_end);
74
75 nodes.push(StructuralNode {
76 id,
77 file_id,
78 kind: NodeKind::MdSection,
79 label: h.title.clone(),
80 path,
81 byte_range: (h.start_byte, section_end),
82 line_range: (line_s, line_e),
83 parent: None,
84 depth: h.level,
85 });
86 }
87
88 i = 0;
89 while i < events.len() {
90 if let Event::Start(Tag::CodeBlock(kind)) = &events[i].0 {
91 let info = match kind {
92 pulldown_cmark::CodeBlockKind::Fenced(lang) => lang.as_ref().to_string(),
93 pulldown_cmark::CodeBlockKind::Indented => String::new(),
94 };
95 let start = events[i].1.start;
96 let mut j = i + 1;
97 while j < events.len() && !matches!(&events[j].0, Event::End(TagEnd::CodeBlock)) {
98 j += 1;
99 }
100 let end = if j < events.len() {
101 events[j].1.end
102 } else {
103 start
104 };
105
106 let label = if info.is_empty() {
107 "code".to_string()
108 } else {
109 info.clone()
110 };
111 let path = vec![label.clone()];
112 let id = StructuralNode::make_id(file_id, NodeKind::MdCodeBlock, &path);
113 let (line_s, line_e) = byte_to_line_range(&ls, start, end);
114
115 nodes.push(StructuralNode {
116 id,
117 file_id,
118 kind: NodeKind::MdCodeBlock,
119 label,
120 path,
121 byte_range: (start, end),
122 line_range: (line_s, line_e),
123 parent: None,
124 depth: 0,
125 });
126 i = j + 1;
127 } else {
128 i += 1;
129 }
130 }
131
132 assign_parents(&mut nodes);
133 nodes
134}
135
136fn assign_parents(nodes: &mut [StructuralNode]) {
137 let n = nodes.len();
138 for i in 0..n {
139 if nodes[i].kind != NodeKind::MdSection {
140 continue;
141 }
142 let depth = nodes[i].depth;
143 for j in (0..i).rev() {
144 if nodes[j].kind == NodeKind::MdSection && nodes[j].depth < depth {
145 nodes[i].parent = Some(nodes[j].id);
146 break;
147 }
148 }
149 }
150
151 let parent_info: Vec<(usize, Option<super::NodeId>, u32)> = nodes
152 .iter()
153 .enumerate()
154 .filter(|(_, child)| child.kind == NodeKind::MdCodeBlock)
155 .map(|(ci, child)| {
156 let parent = nodes
157 .iter()
158 .rev()
159 .find(|n| n.kind == NodeKind::MdSection && n.byte_range.0 <= child.byte_range.0);
160 let pid = parent.map(|p| p.id);
161 let new_depth = parent.map_or(0, |p| p.depth + 1);
162 (ci, pid, new_depth)
163 })
164 .collect();
165
166 for (ci, pid, new_depth) in parent_info {
167 nodes[ci].parent = pid;
168 nodes[ci].depth = new_depth;
169 }
170}
171
#[cfg(test)]
mod tests {
    use super::*;
    use crate::structural::NodeKind;

    /// Fixture: one H1 with two H2 children; the second child holds a fenced
    /// `rust` code block.
    const SAMPLE: &str =
        "# Top\n\nintro\n\n## Sub A\n\nbody a\n\n## Sub B\n\n```rust\nfn x() {}\n```\n\nbody b\n";

    /// Returns references to every node of the requested kind, in order.
    fn nodes_of_kind(nodes: &[StructuralNode], kind: NodeKind) -> Vec<&StructuralNode> {
        nodes.iter().filter(|node| node.kind == kind).collect()
    }

    /// Looks up a node by label, panicking if it is absent.
    fn with_label<'a>(candidates: &[&'a StructuralNode], label: &str) -> &'a StructuralNode {
        candidates
            .iter()
            .copied()
            .find(|node| node.label == label)
            .unwrap()
    }

    #[test]
    fn extracts_nested_headings() {
        let nodes = parse(1, SAMPLE);
        let sections = nodes_of_kind(&nodes, NodeKind::MdSection);
        assert!(sections.len() >= 3, "expected at least 3 sections");

        // The H1 is a root.
        let top = with_label(&sections, "Top");
        assert_eq!(top.depth, 1);
        assert!(top.parent.is_none());

        // Both H2 headings hang off the H1.
        for label in ["Sub A", "Sub B"] {
            let sub = with_label(&sections, label);
            assert_eq!(sub.depth, 2);
            assert_eq!(sub.parent, Some(top.id));
        }
    }

    #[test]
    fn extracts_code_block() {
        let nodes = parse(1, SAMPLE);
        let blocks = nodes_of_kind(&nodes, NodeKind::MdCodeBlock);
        let first = blocks.first().expect("expected at least one code block");
        assert!(
            first.label.contains("rust"),
            "label should mention rust: {}",
            first.label
        );
    }

    #[test]
    fn section_byte_range_covers_body() {
        let nodes = parse(1, SAMPLE);
        let sections = nodes_of_kind(&nodes, NodeKind::MdSection);
        let sub_a = with_label(&sections, "Sub A");

        let (start, end) = sub_a.byte_range;
        let body_range = &SAMPLE[start..end];
        assert!(
            body_range.contains("body a"),
            "Section Sub A should cover 'body a', got: {body_range:?}"
        );
    }
}