Skip to main content

sem_core/parser/plugins/
markdown.rs

1use regex::Regex;
2
3use crate::model::entity::{build_entity_id, SemanticEntity};
4use crate::parser::plugin::SemanticParserPlugin;
5use crate::utils::hash::content_hash;
6
/// Parser plugin that extracts semantic entities from Markdown documents.
///
/// The type carries no state; all behavior lives in its
/// `SemanticParserPlugin` implementation.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParserPlugin;
8
9impl SemanticParserPlugin for MarkdownParserPlugin {
10    fn id(&self) -> &str {
11        "markdown"
12    }
13
14    fn extensions(&self) -> &[&str] {
15        &[".md", ".mdx"]
16    }
17
18    fn extract_entities(&self, content: &str, file_path: &str) -> Vec<SemanticEntity> {
19        let mut entities = Vec::new();
20        let lines: Vec<&str> = content.lines().collect();
21        let heading_re = Regex::new(r"^(#{1,6})\s+(.+)").unwrap();
22
23        struct Section {
24            level: usize,
25            name: String,
26            start_line: usize,
27            lines: Vec<String>,
28            parent_id: Option<String>,
29        }
30
31        let mut sections: Vec<Section> = Vec::new();
32        let mut current_section: Option<Section> = None;
33        let mut section_stack: Vec<(usize, String)> = Vec::new(); // (level, name)
34
35        for (i, &line) in lines.iter().enumerate() {
36            if let Some(caps) = heading_re.captures(line) {
37                // Close previous section
38                if let Some(sec) = current_section.take() {
39                    sections.push(sec);
40                }
41
42                let level = caps[1].len();
43                let name = caps[2].trim().to_string();
44
45                // Find parent: pop headings with >= level
46                while section_stack
47                    .last()
48                    .map_or(false, |(l, _)| *l >= level)
49                {
50                    section_stack.pop();
51                }
52
53                let parent_id = section_stack.last().map(|(_, parent_name)| {
54                    build_entity_id(file_path, "heading", parent_name, None)
55                });
56
57                current_section = Some(Section {
58                    level,
59                    name: name.clone(),
60                    start_line: i + 1,
61                    lines: vec![line.to_string()],
62                    parent_id,
63                });
64
65                section_stack.push((level, name));
66            } else if let Some(ref mut sec) = current_section {
67                sec.lines.push(line.to_string());
68            } else {
69                // Content before first heading — preamble
70                if !line.trim().is_empty() {
71                    if current_section.is_none() {
72                        current_section = Some(Section {
73                            level: 0,
74                            name: "(preamble)".to_string(),
75                            start_line: i + 1,
76                            lines: vec![line.to_string()],
77                            parent_id: None,
78                        });
79                    }
80                }
81            }
82        }
83
84        if let Some(sec) = current_section {
85            sections.push(sec);
86        }
87
88        for section in &sections {
89            let section_content = section.lines.join("\n").trim().to_string();
90            if section_content.is_empty() {
91                continue;
92            }
93
94            let entity_type = if section.level == 0 {
95                "preamble"
96            } else {
97                "heading"
98            };
99
100            entities.push(SemanticEntity {
101                id: build_entity_id(file_path, entity_type, &section.name, None),
102                file_path: file_path.to_string(),
103                entity_type: entity_type.to_string(),
104                name: section.name.clone(),
105                parent_id: section.parent_id.clone(),
106                content_hash: content_hash(&section_content),
107                structural_hash: None,
108                content: section_content,
109                start_line: section.start_line,
110                end_line: section.start_line + section.lines.len() - 1,
111                metadata: None,
112            });
113        }
114
115        entities
116    }
117}