sem_core/parser/plugins/
markdown.rs1use regex::Regex;
2
3use crate::model::entity::{build_entity_id, SemanticEntity};
4use crate::parser::plugin::SemanticParserPlugin;
5use crate::utils::hash::content_hash;
6
/// Parser plugin that splits Markdown (`.md` / `.mdx`) documents into
/// heading-delimited semantic entities.
///
/// The type carries no state; all behavior lives in its
/// `SemanticParserPlugin` implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParserPlugin;
8
9impl SemanticParserPlugin for MarkdownParserPlugin {
10 fn id(&self) -> &str {
11 "markdown"
12 }
13
14 fn extensions(&self) -> &[&str] {
15 &[".md", ".mdx"]
16 }
17
18 fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
19 let mut entities = Vec::new();
20 let lines: Vec<&str> = content.lines().collect();
21 let heading_re = Regex::new(r"^(#{1,6})\s+(.+)").unwrap();
22
23 struct Section {
24 level: usize,
25 name: String,
26 start_line: usize,
27 lines: Vec<String>,
28 parent_id: Option<String>,
29 }
30
31 let mut sections: Vec<Section> = Vec::new();
32 let mut current_section: Option<Section> = None;
33 let mut section_stack: Vec<(usize, String)> = Vec::new(); for (i, &line) in lines.iter().enumerate() {
36 if let Some(caps) = heading_re.captures(line) {
37 if let Some(sec) = current_section.take() {
39 sections.push(sec);
40 }
41
42 let level = caps[1].len();
43 let name = caps[2].trim().to_string();
44
45 while section_stack
47 .last()
48 .map_or(false, |(l, _)| *l >= level)
49 {
50 section_stack.pop();
51 }
52
53 let parent_id = section_stack.last().map(|(_, parent_name)| {
54 build_entity_id(file_path, "heading", parent_name, None)
55 });
56
57 current_section = Some(Section {
58 level,
59 name: name.clone(),
60 start_line: i + 1,
61 lines: vec![line.to_string()],
62 parent_id,
63 });
64
65 section_stack.push((level, name));
66 } else if let Some(ref mut sec) = current_section {
67 sec.lines.push(line.to_string());
68 } else {
69 if !line.trim().is_empty() {
71 if current_section.is_none() {
72 current_section = Some(Section {
73 level: 0,
74 name: "(preamble)".to_string(),
75 start_line: i + 1,
76 lines: vec![line.to_string()],
77 parent_id: None,
78 });
79 }
80 }
81 }
82 }
83
84 if let Some(sec) = current_section {
85 sections.push(sec);
86 }
87
88 for section in §ions {
89 let section_content = section.lines.join("\n").trim().to_string();
90 if section_content.is_empty() {
91 continue;
92 }
93
94 let entity_type = if section.level == 0 {
95 "preamble"
96 } else {
97 "heading"
98 };
99
100 entities.push(SemanticEntity {
101 id: build_entity_id(file_path, entity_type, §ion.name, None),
102 file_path: file_path.to_string(),
103 entity_type: entity_type.to_string(),
104 name: section.name.clone(),
105 parent_id: section.parent_id.clone(),
106 content_hash: content_hash(§ion_content),
107 structural_hash: None,
108 content: section_content,
109 start_line: section.start_line,
110 end_line: section.start_line + section.lines.len() - 1,
111 metadata: None,
112 });
113 }
114
115 entities
116 }
117}