Skip to main content

argyph_parse/structural/
markdown.rs

1use super::{byte_to_line_range, line_starts, NodeKind, StructuralNode};
2use pulldown_cmark::{Event, HeadingLevel, Parser, Tag, TagEnd};
3
4fn heading_level_num(level: &HeadingLevel) -> u32 {
5    match level {
6        HeadingLevel::H1 => 1,
7        HeadingLevel::H2 => 2,
8        HeadingLevel::H3 => 3,
9        HeadingLevel::H4 => 4,
10        HeadingLevel::H5 => 5,
11        HeadingLevel::H6 => 6,
12    }
13}
14
15/// Parse a markdown source into structural nodes (sections and code blocks).
16pub fn parse(file_id: u64, source: &str) -> Vec<StructuralNode> {
17    let ls = line_starts(source);
18    let events: Vec<(Event<'_>, std::ops::Range<usize>)> =
19        Parser::new(source).into_offset_iter().collect();
20
21    struct HeadingInfo {
22        level: u32,
23        title: String,
24        start_byte: usize,
25    }
26
27    let mut headings: Vec<HeadingInfo> = Vec::new();
28    let eof = events.last().map_or(source.len(), |(_, range)| range.end);
29
30    let mut i = 0;
31    while i < events.len() {
32        if let Event::Start(Tag::Heading { level, .. }) = &events[i].0 {
33            let h_level = heading_level_num(level);
34            let start = events[i].1.start;
35            let mut title = String::new();
36            let mut j = i + 1;
37            while j < events.len() && !matches!(&events[j].0, Event::End(TagEnd::Heading(..))) {
38                match &events[j].0 {
39                    Event::Text(t) | Event::Code(t) => title.push_str(t.as_ref()),
40                    _ => {}
41                }
42                j += 1;
43            }
44            headings.push(HeadingInfo {
45                level: h_level,
46                title: title.clone(),
47                start_byte: start,
48            });
49            i = j + 1;
50        } else {
51            i += 1;
52        }
53    }
54
55    let mut nodes: Vec<StructuralNode> = Vec::new();
56
57    for (idx, h) in headings.iter().enumerate() {
58        let section_end = if idx + 1 < headings.len() {
59            let mut end = eof;
60            for next in &headings[idx + 1..] {
61                if next.level <= h.level {
62                    end = next.start_byte;
63                    break;
64                }
65            }
66            end
67        } else {
68            eof
69        };
70
71        let path = vec![h.title.clone()];
72        let id = StructuralNode::make_id(file_id, NodeKind::MdSection, &path);
73        let (line_s, line_e) = byte_to_line_range(&ls, h.start_byte, section_end);
74
75        nodes.push(StructuralNode {
76            id,
77            file_id,
78            kind: NodeKind::MdSection,
79            label: h.title.clone(),
80            path,
81            byte_range: (h.start_byte, section_end),
82            line_range: (line_s, line_e),
83            parent: None,
84            depth: h.level,
85        });
86    }
87
88    i = 0;
89    while i < events.len() {
90        if let Event::Start(Tag::CodeBlock(kind)) = &events[i].0 {
91            let info = match kind {
92                pulldown_cmark::CodeBlockKind::Fenced(lang) => lang.as_ref().to_string(),
93                pulldown_cmark::CodeBlockKind::Indented => String::new(),
94            };
95            let start = events[i].1.start;
96            let mut j = i + 1;
97            while j < events.len() && !matches!(&events[j].0, Event::End(TagEnd::CodeBlock)) {
98                j += 1;
99            }
100            let end = if j < events.len() {
101                events[j].1.end
102            } else {
103                start
104            };
105
106            let label = if info.is_empty() {
107                "code".to_string()
108            } else {
109                info.clone()
110            };
111            let path = vec![label.clone()];
112            let id = StructuralNode::make_id(file_id, NodeKind::MdCodeBlock, &path);
113            let (line_s, line_e) = byte_to_line_range(&ls, start, end);
114
115            nodes.push(StructuralNode {
116                id,
117                file_id,
118                kind: NodeKind::MdCodeBlock,
119                label,
120                path,
121                byte_range: (start, end),
122                line_range: (line_s, line_e),
123                parent: None,
124                depth: 0,
125            });
126            i = j + 1;
127        } else {
128            i += 1;
129        }
130    }
131
132    assign_parents(&mut nodes);
133    nodes
134}
135
136fn assign_parents(nodes: &mut [StructuralNode]) {
137    let n = nodes.len();
138    for i in 0..n {
139        if nodes[i].kind != NodeKind::MdSection {
140            continue;
141        }
142        let depth = nodes[i].depth;
143        for j in (0..i).rev() {
144            if nodes[j].kind == NodeKind::MdSection && nodes[j].depth < depth {
145                nodes[i].parent = Some(nodes[j].id);
146                break;
147            }
148        }
149    }
150
151    let parent_info: Vec<(usize, Option<super::NodeId>, u32)> = nodes
152        .iter()
153        .enumerate()
154        .filter(|(_, child)| child.kind == NodeKind::MdCodeBlock)
155        .map(|(ci, child)| {
156            let parent = nodes
157                .iter()
158                .rev()
159                .find(|n| n.kind == NodeKind::MdSection && n.byte_range.0 <= child.byte_range.0);
160            let pid = parent.map(|p| p.id);
161            let new_depth = parent.map_or(0, |p| p.depth + 1);
162            (ci, pid, new_depth)
163        })
164        .collect();
165
166    for (ci, pid, new_depth) in parent_info {
167        nodes[ci].parent = pid;
168        nodes[ci].depth = new_depth;
169    }
170}
171
#[cfg(test)]
mod tests {
    use super::*;
    use crate::structural::NodeKind;

    /// Fixture: one H1 ("Top") with two H2 children ("Sub A", "Sub B"); the
    /// second child contains a fenced `rust` code block.
    const SAMPLE: &str =
        "# Top\n\nintro\n\n## Sub A\n\nbody a\n\n## Sub B\n\n```rust\nfn x() {}\n```\n\nbody b\n";

    /// Returns references to the nodes of the given kind, in their original order.
    fn of_kind(nodes: &[StructuralNode], kind: NodeKind) -> Vec<&StructuralNode> {
        nodes.iter().filter(|n| n.kind == kind).collect()
    }

    /// Finds the first node in `candidates` with the given label; panics if absent.
    fn labelled<'a>(candidates: &[&'a StructuralNode], label: &str) -> &'a StructuralNode {
        candidates
            .iter()
            .copied()
            .find(|n| n.label == label)
            .unwrap()
    }

    #[test]
    fn extracts_nested_headings() {
        let nodes = parse(1, SAMPLE);
        let sections = of_kind(&nodes, NodeKind::MdSection);
        assert!(sections.len() >= 3, "expected at least 3 sections");

        // The H1 is a root node.
        let top = labelled(&sections, "Top");
        assert_eq!(top.depth, 1);
        assert!(top.parent.is_none());

        // Both H2 headings hang off the H1.
        let sub_a = labelled(&sections, "Sub A");
        assert_eq!(sub_a.depth, 2);
        assert_eq!(sub_a.parent, Some(top.id));

        let sub_b = labelled(&sections, "Sub B");
        assert_eq!(sub_b.depth, 2);
        assert_eq!(sub_b.parent, Some(top.id));
    }

    #[test]
    fn extracts_code_block() {
        let nodes = parse(1, SAMPLE);
        let code = of_kind(&nodes, NodeKind::MdCodeBlock);
        assert!(!code.is_empty(), "expected at least one code block");

        let cb = code[0];
        assert!(
            cb.label.contains("rust"),
            "label should mention rust: {}",
            cb.label
        );
    }

    #[test]
    fn section_byte_range_covers_body() {
        let nodes = parse(1, SAMPLE);
        let sub_a = nodes
            .iter()
            .find(|n| n.kind == NodeKind::MdSection && n.label == "Sub A")
            .unwrap();

        // Slice the fixture with the reported byte range and check the body text is inside.
        let (start, end) = sub_a.byte_range;
        let body_range = &SAMPLE[start..end];
        assert!(
            body_range.contains("body a"),
            "Section Sub A should cover 'body a', got: {body_range:?}"
        );
    }
}