Skip to main content

sem_core/parser/plugins/
markdown.rs

1use regex::Regex;
2use std::collections::HashMap;
3
4use crate::model::entity::{build_entity_id, build_entity_id_disambiguated, SemanticEntity};
5use crate::parser::plugin::SemanticParserPlugin;
6use crate::utils::hash::content_hash;
7
/// Parser plugin that turns Markdown (`.md` / `.mdx`) documents into semantic entities.
///
/// The type carries no state; all behaviour lives in its `SemanticParserPlugin`
/// implementation. The derives make it printable in diagnostics and trivially
/// constructible via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParserPlugin;
9
10impl SemanticParserPlugin for MarkdownParserPlugin {
11    fn id(&self) -> &str {
12        "markdown"
13    }
14
15    fn extensions(&self) -> &[&str] {
16        &[".md", ".mdx"]
17    }
18
19    fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
20        let mut entities = Vec::new();
21        let lines: Vec<&str> = content.lines().collect();
22        let heading_re = Regex::new(r"^(#{1,6})\s+(.+)").unwrap();
23
24        struct Section {
25            level: usize,
26            name: String,
27            start_line: usize,
28            lines: Vec<String>,
29            base_id: String,
30            parent_index: Option<usize>,
31        }
32
33        let mut sections: Vec<Section> = Vec::new();
34        let mut current_section: Option<usize> = None;
35        let mut section_stack: Vec<(usize, usize)> = Vec::new(); // (level, section index)
36
37        for (i, &line) in lines.iter().enumerate() {
38            if let Some(caps) = heading_re.captures(line) {
39                let level = caps[1].len();
40                let name = caps[2].trim().to_string();
41
42                // Find parent: pop headings with >= level
43                while section_stack.last().map_or(false, |(l, _)| *l >= level) {
44                    section_stack.pop();
45                }
46
47                let parent_index = section_stack.last().map(|(_, index)| *index);
48
49                sections.push(Section {
50                    level,
51                    name: name.clone(),
52                    start_line: i + 1,
53                    lines: vec![line.to_string()],
54                    base_id: build_entity_id(file_path, "heading", &name, None),
55                    parent_index,
56                });
57                let section_index = sections.len() - 1;
58
59                current_section = Some(section_index);
60                section_stack.push((level, section_index));
61            } else if let Some(index) = current_section {
62                sections[index].lines.push(line.to_string());
63            } else {
64                // Content before first heading — preamble
65                if !line.trim().is_empty() {
66                    if current_section.is_none() {
67                        sections.push(Section {
68                            level: 0,
69                            name: "(preamble)".to_string(),
70                            start_line: i + 1,
71                            lines: vec![line.to_string()],
72                            base_id: build_entity_id(file_path, "preamble", "(preamble)", None),
73                            parent_index: None,
74                        });
75                        current_section = Some(sections.len() - 1);
76                    }
77                }
78            }
79        }
80
81        let mut id_counts: HashMap<&str, usize> = HashMap::new();
82        for section in &sections {
83            *id_counts.entry(section.base_id.as_str()).or_default() += 1;
84        }
85
86        let section_ids: Vec<String> = sections
87            .iter()
88            .map(|section| {
89                if id_counts[section.base_id.as_str()] > 1 {
90                    let entity_type = if section.level == 0 {
91                        "preamble"
92                    } else {
93                        "heading"
94                    };
95                    build_entity_id_disambiguated(
96                        file_path,
97                        entity_type,
98                        &section.name,
99                        None,
100                        section.start_line,
101                    )
102                } else {
103                    section.base_id.clone()
104                }
105            })
106            .collect();
107
108        for (index, section) in sections.iter().enumerate() {
109            let section_content = section.lines.join("\n").trim().to_string();
110            if section_content.is_empty() {
111                continue;
112            }
113
114            let entity_type = if section.level == 0 {
115                "preamble"
116            } else {
117                "heading"
118            };
119
120            entities.push(SemanticEntity {
121                id: section_ids[index].clone(),
122                file_path: file_path.to_string(),
123                entity_type: entity_type.to_string(),
124                name: section.name.clone(),
125                parent_id: section
126                    .parent_index
127                    .map(|parent_index| section_ids[parent_index].clone()),
128                content_hash: content_hash(&section_content),
129                structural_hash: None,
130                content: section_content,
131                start_line: section.start_line,
132                end_line: section.start_line + section.lines.len() - 1,
133                metadata: None,
134            });
135        }
136
137        entities
138    }
139}
140
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the Markdown plugin over `content` as if it were stored at `path`.
    fn parse(content: &str, path: &str) -> Vec<SemanticEntity> {
        MarkdownParserPlugin.extract_entities(content, path)
    }

    /// Collects, in document order, every entity whose name equals `name`.
    fn named<'a>(entities: &'a [SemanticEntity], name: &str) -> Vec<&'a SemanticEntity> {
        entities
            .iter()
            .filter(|entity| entity.name == name)
            .collect()
    }

    // A heading that occurs once keeps the plain, un-suffixed id.
    #[test]
    fn unique_heading_keeps_legacy_id() {
        let entities = parse("# Overview\n\nbody\n", "doc.md");

        assert_eq!(entities.len(), 1);
        assert_eq!(entities[0].id, "doc.md::heading::Overview");
    }

    // Two headings with the same text each receive an id suffixed with their line.
    #[test]
    fn duplicate_heading_names_get_line_disambiguated_ids() {
        let entities = parse(
            "# Same Title\n\nfirst body\n\n# Same Title\n\nsecond body\n",
            "doc.md",
        );

        let mut headings: Vec<&SemanticEntity> = Vec::new();
        for entity in &entities {
            if entity.entity_type == "heading" {
                headings.push(entity);
            }
        }

        assert_eq!(headings.len(), 2);
        assert_eq!(headings[0].id, "doc.md::heading::Same Title@L1");
        assert_eq!(headings[1].id, "doc.md::heading::Same Title@L5");
        assert_ne!(headings[0].content_hash, headings[1].content_hash);
    }

    // Children of duplicated parents must reference the parent's disambiguated id.
    #[test]
    fn duplicate_parent_headings_disambiguate_child_parent_ids() {
        let entities = parse(
            "# Release\n## Fixed\nfirst fix\n# Release\n## Fixed\nsecond fix\n",
            "CHANGELOG.md",
        );
        let fixed_sections = named(&entities, "Fixed");

        assert_eq!(fixed_sections.len(), 2);

        let first_parent = fixed_sections[0].parent_id.as_deref();
        let second_parent = fixed_sections[1].parent_id.as_deref();
        assert_eq!(first_parent, Some("CHANGELOG.md::heading::Release@L1"));
        assert_eq!(second_parent, Some("CHANGELOG.md::heading::Release@L4"));
    }

    // Duplicated children are disambiguated while their unique parents keep plain ids.
    #[test]
    fn duplicate_child_headings_under_unique_parents_keep_distinct_parents() {
        let entities = parse(
            "# Product A\n## Usage\nfirst usage\n# Product B\n## Usage\nsecond usage\n",
            "README.md",
        );
        let usage_sections = named(&entities, "Usage");

        assert_eq!(usage_sections.len(), 2);

        let (first, second) = (usage_sections[0], usage_sections[1]);
        assert_eq!(first.id, "README.md::heading::Usage@L2");
        assert_eq!(second.id, "README.md::heading::Usage@L5");
        assert_eq!(
            first.parent_id.as_deref(),
            Some("README.md::heading::Product A")
        );
        assert_eq!(
            second.parent_id.as_deref(),
            Some("README.md::heading::Product B")
        );
    }
}