Skip to main content

mdql_core/
parser.rs

1//! Parse markdown files into structured representations.
2//!
3//! Handles frontmatter extraction, H1/H2 detection, code fence tracking,
4//! and numbered heading normalization.
5
6use std::path::Path;
7
8use regex::Regex;
9use std::sync::LazyLock;
10
11use crate::errors::MdqlError;
12
/// One `##` section of a parsed markdown document.
///
/// All fields are plain owned values, so the type is `Eq` as well as
/// `PartialEq` and can be compared exactly in tests and collections.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// Heading text as written after the `##` marker, trimmed.
    pub raw_heading: String,
    /// Heading text with a leading `N. ` prefix removed when numbered-heading
    /// normalization was requested; otherwise identical to `raw_heading`.
    pub normalized_heading: String,
    /// Lines between this heading and the next `##` heading (or end of
    /// input), joined with `\n` and trimmed.
    pub body: String,
    /// 1-indexed line of the heading within the whole input, frontmatter
    /// included.
    pub line_number: usize,
}
20
21#[derive(Debug, Clone)]
22pub struct ParsedFile {
23    pub path: String,
24    pub raw_frontmatter: serde_yaml::Value,
25    pub h1: Option<String>,
26    pub h1_line_number: Option<usize>,
27    pub sections: Vec<Section>,
28    pub parse_errors: Vec<String>,
29}
30
31static NUMBERED_HEADING_RE: LazyLock<Regex> =
32    LazyLock::new(|| Regex::new(r"^\d+\.\s+").unwrap());
33static FENCE_OPEN_RE: LazyLock<Regex> =
34    LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
35static H1_RE: LazyLock<Regex> =
36    LazyLock::new(|| Regex::new(r"^#\s+(.+)$").unwrap());
37static H2_RE: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r"^##\s+(.+)$").unwrap());
39
40pub fn normalize_heading(raw: &str) -> String {
41    NUMBERED_HEADING_RE.replace(raw, "").trim().to_string()
42}
43
44pub fn parse_file(
45    path: &Path,
46    relative_to: Option<&Path>,
47    normalize_numbered: bool,
48) -> crate::errors::Result<ParsedFile> {
49    let rel_path = if let Some(base) = relative_to {
50        path.strip_prefix(base)
51            .unwrap_or(path)
52            .to_string_lossy()
53            .to_string()
54    } else {
55        path.to_string_lossy().to_string()
56    };
57
58    let text = std::fs::read_to_string(path).map_err(|e| {
59        MdqlError::Parse(format!("Cannot read {}: {}", rel_path, e))
60    })?;
61
62    Ok(parse_text(&text, &rel_path, normalize_numbered))
63}
64
65/// Parse markdown text directly (useful for testing and when content is already in memory).
66pub fn parse_text(text: &str, rel_path: &str, normalize_numbered: bool) -> ParsedFile {
67    let lines: Vec<&str> = text.split('\n').collect();
68    let mut raw_frontmatter = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
69    let mut body_start: usize = 0;
70    let mut parse_errors: Vec<String> = Vec::new();
71
72    // --- Parse frontmatter ---
73    if !lines.is_empty() && lines[0].trim() == "---" {
74        let mut closing = None;
75        for i in 1..lines.len() {
76            if lines[i].trim() == "---" {
77                closing = Some(i);
78                break;
79            }
80        }
81
82        if let Some(close_idx) = closing {
83            let fm_text: String = lines[1..close_idx].join("\n");
84            match serde_yaml::from_str::<serde_yaml::Value>(&fm_text) {
85                Ok(serde_yaml::Value::Null) => {
86                    // Empty frontmatter
87                }
88                Ok(val @ serde_yaml::Value::Mapping(_)) => {
89                    raw_frontmatter = val;
90                }
91                Ok(val) => {
92                    let type_name = match &val {
93                        serde_yaml::Value::Bool(_) => "bool",
94                        serde_yaml::Value::Number(_) => "number",
95                        serde_yaml::Value::String(_) => "str",
96                        serde_yaml::Value::Sequence(_) => "list",
97                        _ => "unknown",
98                    };
99                    parse_errors.push(format!(
100                        "Frontmatter is not a mapping (got {})",
101                        type_name
102                    ));
103                }
104                Err(e) => {
105                    parse_errors.push(format!("Malformed YAML in frontmatter: {}", e));
106                }
107            }
108            body_start = close_idx + 1;
109        } else {
110            parse_errors.push("Unclosed frontmatter (no closing '---')".to_string());
111            body_start = 1;
112        }
113    } else {
114        parse_errors.push("No frontmatter found (file must start with '---')".to_string());
115    }
116
117    // --- Parse body: H1, H2 sections ---
118    let mut h1: Option<String> = None;
119    let mut h1_line_number: Option<usize> = None;
120    let mut sections: Vec<Section> = Vec::new();
121
122    let mut in_fence = false;
123    let mut fence_char: Option<char> = None;
124    let mut fence_width: usize = 0;
125
126    let mut current_heading: Option<String> = None;
127    let mut current_heading_normalized: Option<String> = None;
128    let mut current_heading_line: Option<usize> = None;
129    let mut current_body_lines: Vec<&str> = Vec::new();
130
131    let finalize_section = |heading: &mut Option<String>,
132                                heading_norm: &mut Option<String>,
133                                heading_line: &mut Option<usize>,
134                                body_lines: &mut Vec<&str>,
135                                sections: &mut Vec<Section>| {
136        if let Some(raw_h) = heading.take() {
137            let norm_h = heading_norm.take().unwrap_or_else(|| raw_h.clone());
138            let body = body_lines.join("\n").trim().to_string();
139            sections.push(Section {
140                raw_heading: raw_h,
141                normalized_heading: norm_h,
142                body,
143                line_number: heading_line.take().unwrap_or(0),
144            });
145            body_lines.clear();
146        }
147    };
148
149    for i in body_start..lines.len() {
150        let line = lines[i];
151        let line_num = i + 1; // 1-indexed
152
153        // --- Code fence tracking ---
154        if let Some(caps) = FENCE_OPEN_RE.captures(line) {
155            let marker = caps.get(1).unwrap().as_str();
156            let char = marker.chars().next().unwrap();
157            let width = marker.len();
158
159            if !in_fence {
160                in_fence = true;
161                fence_char = Some(char);
162                fence_width = width;
163                if current_heading.is_some() {
164                    current_body_lines.push(line);
165                }
166                continue;
167            } else if Some(char) == fence_char
168                && width >= fence_width
169                && line.trim() == marker
170            {
171                // Closing fence
172                in_fence = false;
173                fence_char = None;
174                fence_width = 0;
175                if current_heading.is_some() {
176                    current_body_lines.push(line);
177                }
178                continue;
179            }
180        }
181
182        if in_fence {
183            if current_heading.is_some() {
184                current_body_lines.push(line);
185            }
186            continue;
187        }
188
189        // --- H1 detection ---
190        if let Some(caps) = H1_RE.captures(line) {
191            if h1.is_none() {
192                h1 = Some(caps.get(1).unwrap().as_str().trim().to_string());
193                h1_line_number = Some(line_num);
194            } else {
195                parse_errors.push(format!(
196                    "Duplicate H1 at line {} (first was at line {})",
197                    line_num,
198                    h1_line_number.unwrap_or(0)
199                ));
200            }
201            continue;
202        }
203
204        // --- H2 detection ---
205        if let Some(caps) = H2_RE.captures(line) {
206            finalize_section(
207                &mut current_heading,
208                &mut current_heading_normalized,
209                &mut current_heading_line,
210                &mut current_body_lines,
211                &mut sections,
212            );
213            let raw_h = caps.get(1).unwrap().as_str().trim().to_string();
214            let norm_h = if normalize_numbered {
215                normalize_heading(&raw_h)
216            } else {
217                raw_h.clone()
218            };
219            current_heading = Some(raw_h);
220            current_heading_normalized = Some(norm_h);
221            current_heading_line = Some(line_num);
222            current_body_lines.clear();
223            continue;
224        }
225
226        // --- Regular content ---
227        if current_heading.is_some() {
228            current_body_lines.push(line);
229        }
230    }
231
232    finalize_section(
233        &mut current_heading,
234        &mut current_heading_normalized,
235        &mut current_heading_line,
236        &mut current_body_lines,
237        &mut sections,
238    );
239
240    ParsedFile {
241        path: rel_path.to_string(),
242        raw_frontmatter,
243        h1,
244        h1_line_number,
245        sections,
246        parse_errors,
247    }
248}
249
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `text` under the fixed path label used by every test.
    fn parse(text: &str, normalize_numbered: bool) -> ParsedFile {
        parse_text(text, "test.md", normalize_numbered)
    }

    /// True when at least one recorded parse error contains `needle`.
    fn reports(parsed: &ParsedFile, needle: &str) -> bool {
        parsed.parse_errors.iter().any(|e| e.contains(needle))
    }

    /// Look up a string-keyed frontmatter entry, panicking if it is absent.
    fn frontmatter_entry<'a>(parsed: &'a ParsedFile, key: &str) -> &'a serde_yaml::Value {
        let mapping = parsed.raw_frontmatter.as_mapping().unwrap();
        mapping
            .get(&serde_yaml::Value::String(key.into()))
            .unwrap()
    }

    /// Two sections are split at their `##` headings and bodies are trimmed.
    #[test]
    fn test_basic_parse() {
        let doc = parse(
            "---\ntitle: \"Hello\"\nstatus: \"active\"\n---\n\n## Summary\n\nA summary.\n\n## Details\n\nSome details.\n",
            false,
        );
        assert!(doc.parse_errors.is_empty());
        assert_eq!(doc.sections.len(), 2);

        let summary = &doc.sections[0];
        assert_eq!(summary.normalized_heading, "Summary");
        assert_eq!(summary.body, "A summary.");

        let details = &doc.sections[1];
        assert_eq!(details.normalized_heading, "Details");
        assert_eq!(details.body, "Some details.");
    }

    /// Scalar frontmatter values keep their YAML types.
    #[test]
    fn test_frontmatter_extraction() {
        let doc = parse("---\ntitle: \"Test\"\ncount: 42\n---\n\nBody text.\n", false);
        assert!(doc.parse_errors.is_empty());

        let title = frontmatter_entry(&doc, "title").as_str().unwrap();
        assert_eq!(title, "Test");

        let count = frontmatter_entry(&doc, "count").as_u64().unwrap();
        assert_eq!(count, 42);
    }

    /// A document without a leading `---` yields exactly one error.
    #[test]
    fn test_no_frontmatter() {
        let doc = parse("Just some text.\n", false);
        assert_eq!(doc.parse_errors.len(), 1);
        assert!(doc.parse_errors[0].contains("No frontmatter"));
    }

    /// A frontmatter block that is never closed is reported.
    #[test]
    fn test_unclosed_frontmatter() {
        let doc = parse("---\ntitle: Test\nNo closing delimiter.\n", false);
        assert!(reports(&doc, "Unclosed"));
    }

    /// The H1 text and its 1-indexed line number are captured.
    #[test]
    fn test_h1_detection() {
        let doc = parse(
            "---\ntitle: \"Test\"\n---\n\n# My Title\n\n## Section\n\nBody.\n",
            false,
        );
        assert!(doc.parse_errors.is_empty());
        assert_eq!(doc.h1.as_deref(), Some("My Title"));
        assert_eq!(doc.h1_line_number, Some(5));
    }

    /// A second H1 is reported as a duplicate.
    #[test]
    fn test_duplicate_h1() {
        let doc = parse("---\ntitle: \"Test\"\n---\n\n# First\n\n# Second\n", false);
        assert!(reports(&doc, "Duplicate H1"));
    }

    /// Heading-like lines inside a backtick fence stay in the section body.
    #[test]
    fn test_code_fence_ignores_headings() {
        let doc = parse(
            "---\ntitle: \"Test\"\n---\n\n## Section\n\n```\n# Not a heading\n## Also not\n```\n\nAfter fence.\n",
            false,
        );
        assert!(doc.parse_errors.is_empty());
        assert!(doc.h1.is_none());
        assert_eq!(doc.sections.len(), 1);
        assert!(doc.sections[0].body.contains("# Not a heading"));
    }

    /// With normalization on, the `N. ` prefix is dropped from the
    /// normalized heading but kept in the raw one.
    #[test]
    fn test_numbered_heading_normalization() {
        let doc = parse(
            "---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n\n## 2. Method\n\nMore.\n",
            true,
        );
        assert!(doc.parse_errors.is_empty());

        let first = &doc.sections[0];
        assert_eq!(first.raw_heading, "1. Hypothesis");
        assert_eq!(first.normalized_heading, "Hypothesis");

        let second = &doc.sections[1];
        assert_eq!(second.normalized_heading, "Method");
    }

    /// With normalization off, the numbering prefix is preserved.
    #[test]
    fn test_numbered_heading_no_normalization() {
        let doc = parse("---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n", false);
        assert_eq!(doc.sections[0].normalized_heading, "1. Hypothesis");
    }

    /// Tilde fences hide headings just like backtick fences.
    #[test]
    fn test_tilde_fence() {
        let doc = parse(
            "---\ntitle: \"Test\"\n---\n\n## Section\n\n~~~\n## fake heading\n~~~\n\nReal content.\n",
            false,
        );
        assert_eq!(doc.sections.len(), 1);
        assert!(doc.sections[0].body.contains("## fake heading"));
    }

    /// Section line numbers count from the top of the input.
    #[test]
    fn test_section_line_numbers() {
        let doc = parse(
            "---\ntitle: \"Test\"\n---\n\n## First\n\nBody 1.\n\n## Second\n\nBody 2.\n",
            false,
        );
        assert_eq!(doc.sections[0].line_number, 5);
        assert_eq!(doc.sections[1].line_number, 9);
    }

    /// Headings with nothing beneath them produce empty bodies.
    #[test]
    fn test_empty_sections() {
        let doc = parse("---\ntitle: \"Test\"\n---\n\n## Empty\n\n## Also Empty\n", false);
        assert_eq!(doc.sections.len(), 2);
        assert_eq!(doc.sections[0].body, "");
        assert_eq!(doc.sections[1].body, "");
    }

    /// Invalid YAML between the delimiters is reported, not propagated.
    #[test]
    fn test_malformed_yaml() {
        let doc = parse("---\n: [invalid yaml\n---\n", false);
        assert!(reports(&doc, "Malformed YAML"));
    }

    /// Frontmatter that parses to a YAML list is rejected.
    #[test]
    fn test_non_mapping_frontmatter() {
        let doc = parse("---\n- a list\n- not a mapping\n---\n", false);
        assert!(reports(&doc, "not a mapping"));
    }
}
384}