sem_core/parser/plugins/
markdown.rs1use regex::Regex;
2use std::collections::HashMap;
3
4use crate::model::entity::{build_entity_id, build_entity_id_disambiguated, SemanticEntity};
5use crate::parser::plugin::SemanticParserPlugin;
6use crate::utils::hash::content_hash;
7
8pub struct MarkdownParserPlugin;
9
10impl SemanticParserPlugin for MarkdownParserPlugin {
11 fn id(&self) -> &str {
12 "markdown"
13 }
14
15 fn extensions(&self) -> &[&str] {
16 &[".md", ".mdx"]
17 }
18
19 fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
20 let mut entities = Vec::new();
21 let lines: Vec<&str> = content.lines().collect();
22 let heading_re = Regex::new(r"^(#{1,6})\s+(.+)").unwrap();
23
24 struct Section {
25 level: usize,
26 name: String,
27 start_line: usize,
28 lines: Vec<String>,
29 base_id: String,
30 parent_index: Option<usize>,
31 }
32
33 let mut sections: Vec<Section> = Vec::new();
34 let mut current_section: Option<usize> = None;
35 let mut section_stack: Vec<(usize, usize)> = Vec::new(); for (i, &line) in lines.iter().enumerate() {
38 if let Some(caps) = heading_re.captures(line) {
39 let level = caps[1].len();
40 let name = caps[2].trim().to_string();
41
42 while section_stack.last().map_or(false, |(l, _)| *l >= level) {
44 section_stack.pop();
45 }
46
47 let parent_index = section_stack.last().map(|(_, index)| *index);
48
49 sections.push(Section {
50 level,
51 name: name.clone(),
52 start_line: i + 1,
53 lines: vec![line.to_string()],
54 base_id: build_entity_id(file_path, "heading", &name, None),
55 parent_index,
56 });
57 let section_index = sections.len() - 1;
58
59 current_section = Some(section_index);
60 section_stack.push((level, section_index));
61 } else if let Some(index) = current_section {
62 sections[index].lines.push(line.to_string());
63 } else {
64 if !line.trim().is_empty() {
66 if current_section.is_none() {
67 sections.push(Section {
68 level: 0,
69 name: "(preamble)".to_string(),
70 start_line: i + 1,
71 lines: vec![line.to_string()],
72 base_id: build_entity_id(file_path, "preamble", "(preamble)", None),
73 parent_index: None,
74 });
75 current_section = Some(sections.len() - 1);
76 }
77 }
78 }
79 }
80
81 let mut id_counts: HashMap<&str, usize> = HashMap::new();
82 for section in §ions {
83 *id_counts.entry(section.base_id.as_str()).or_default() += 1;
84 }
85
86 let section_ids: Vec<String> = sections
87 .iter()
88 .map(|section| {
89 if id_counts[section.base_id.as_str()] > 1 {
90 let entity_type = if section.level == 0 {
91 "preamble"
92 } else {
93 "heading"
94 };
95 build_entity_id_disambiguated(
96 file_path,
97 entity_type,
98 §ion.name,
99 None,
100 section.start_line,
101 )
102 } else {
103 section.base_id.clone()
104 }
105 })
106 .collect();
107
108 for (index, section) in sections.iter().enumerate() {
109 let section_content = section.lines.join("\n").trim().to_string();
110 if section_content.is_empty() {
111 continue;
112 }
113
114 let entity_type = if section.level == 0 {
115 "preamble"
116 } else {
117 "heading"
118 };
119
120 entities.push(SemanticEntity {
121 id: section_ids[index].clone(),
122 file_path: file_path.to_string(),
123 entity_type: entity_type.to_string(),
124 name: section.name.clone(),
125 parent_id: section
126 .parent_index
127 .map(|parent_index| section_ids[parent_index].clone()),
128 content_hash: content_hash(§ion_content),
129 structural_hash: None,
130 content: section_content,
131 start_line: section.start_line,
132 end_line: section.start_line + section.lines.len() - 1,
133 metadata: None,
134 });
135 }
136
137 entities
138 }
139}
140
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the Markdown plugin over `content` as if it were stored at `path`.
    fn parse(content: &str, path: &str) -> Vec<SemanticEntity> {
        MarkdownParserPlugin.extract_entities(content, path)
    }

    /// Entities named `name`, in document order.
    fn named<'a>(entities: &'a [SemanticEntity], name: &str) -> Vec<&'a SemanticEntity> {
        entities.iter().filter(|entity| entity.name == name).collect()
    }

    /// Ids of the given entities, in order.
    fn ids<'a>(entities: &[&'a SemanticEntity]) -> Vec<&'a str> {
        entities.iter().map(|entity| entity.id.as_str()).collect()
    }

    /// Parent ids of the given entities, in order.
    fn parent_ids<'a>(entities: &[&'a SemanticEntity]) -> Vec<Option<&'a str>> {
        entities
            .iter()
            .map(|entity| entity.parent_id.as_deref())
            .collect()
    }

    #[test]
    fn unique_heading_keeps_legacy_id() {
        let entities = parse("# Overview\n\nbody\n", "doc.md");
        let all: Vec<&SemanticEntity> = entities.iter().collect();

        // Exactly one entity, carrying the undecorated id.
        assert_eq!(ids(&all), ["doc.md::heading::Overview"]);
    }

    #[test]
    fn duplicate_heading_names_get_line_disambiguated_ids() {
        let entities = parse(
            "# Same Title\n\nfirst body\n\n# Same Title\n\nsecond body\n",
            "doc.md",
        );
        let headings: Vec<&SemanticEntity> = entities
            .iter()
            .filter(|entity| entity.entity_type == "heading")
            .collect();

        assert_eq!(
            ids(&headings),
            [
                "doc.md::heading::Same Title@L1",
                "doc.md::heading::Same Title@L5",
            ]
        );
        assert_ne!(headings[0].content_hash, headings[1].content_hash);
    }

    #[test]
    fn duplicate_parent_headings_disambiguate_child_parent_ids() {
        let entities = parse(
            "# Release\n## Fixed\nfirst fix\n# Release\n## Fixed\nsecond fix\n",
            "CHANGELOG.md",
        );
        let fixed_sections = named(&entities, "Fixed");

        assert_eq!(
            parent_ids(&fixed_sections),
            [
                Some("CHANGELOG.md::heading::Release@L1"),
                Some("CHANGELOG.md::heading::Release@L4"),
            ]
        );
    }

    #[test]
    fn duplicate_child_headings_under_unique_parents_keep_distinct_parents() {
        let entities = parse(
            "# Product A\n## Usage\nfirst usage\n# Product B\n## Usage\nsecond usage\n",
            "README.md",
        );
        let usage_sections = named(&entities, "Usage");

        assert_eq!(
            ids(&usage_sections),
            [
                "README.md::heading::Usage@L2",
                "README.md::heading::Usage@L5",
            ]
        );
        assert_eq!(
            parent_ids(&usage_sections),
            [
                Some("README.md::heading::Product A"),
                Some("README.md::heading::Product B"),
            ]
        );
    }
}