Skip to main content

mdql_core/
parser.rs

//! Parse markdown files into structured representations.
//!
//! Handles frontmatter extraction, H1/H2 detection, code fence tracking,
//! and numbered heading normalization.
5
6use std::path::Path;
7
8use regex::Regex;
9use std::sync::LazyLock;
10
11use crate::errors::MdqlError;
12
/// One `## ` (H2) section of a parsed markdown file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// Heading text exactly as written after the `## ` marker (trimmed).
    pub raw_heading: String,
    /// Heading with a leading `N. ` ordinal removed when numbered-heading
    /// normalization was requested; otherwise identical to `raw_heading`.
    pub normalized_heading: String,
    /// Text between this heading and the next H2 (or end of file), with
    /// leading and trailing whitespace trimmed.
    pub body: String,
    /// 1-indexed line of the heading within the whole file, frontmatter included.
    pub line_number: usize,
}
20
21#[derive(Debug, Clone)]
22pub struct ParsedFile {
23    pub path: String,
24    pub raw_frontmatter: serde_yaml::Value,
25    pub h1: Option<String>,
26    pub h1_line_number: Option<usize>,
27    pub sections: Vec<Section>,
28    pub has_loose_body: bool,
29    pub parse_errors: Vec<String>,
30}
31
32static NUMBERED_HEADING_RE: LazyLock<Regex> =
33    LazyLock::new(|| Regex::new(r"^\d+\.\s+").unwrap());
34static FENCE_OPEN_RE: LazyLock<Regex> =
35    LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
36static H1_RE: LazyLock<Regex> =
37    LazyLock::new(|| Regex::new(r"^#\s+(.+)$").unwrap());
38static H2_RE: LazyLock<Regex> =
39    LazyLock::new(|| Regex::new(r"^##\s+(.+)$").unwrap());
40
41pub fn normalize_heading(raw: &str) -> String {
42    NUMBERED_HEADING_RE.replace(raw, "").trim().to_string()
43}
44
45pub fn parse_file(
46    path: &Path,
47    relative_to: Option<&Path>,
48    normalize_numbered: bool,
49) -> crate::errors::Result<ParsedFile> {
50    let rel_path = if let Some(base) = relative_to {
51        path.strip_prefix(base)
52            .unwrap_or(path)
53            .to_string_lossy()
54            .to_string()
55    } else {
56        path.to_string_lossy().to_string()
57    };
58
59    let text = std::fs::read_to_string(path).map_err(|e| {
60        MdqlError::Parse(format!("Cannot read {}: {}", rel_path, e))
61    })?;
62
63    Ok(parse_text(&text, &rel_path, normalize_numbered))
64}
65
66/// Parse markdown text directly (useful for testing and when content is already in memory).
67pub(crate) fn parse_text(text: &str, rel_path: &str, normalize_numbered: bool) -> ParsedFile {
68    let lines: Vec<&str> = text.split('\n').collect();
69    let mut raw_frontmatter = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
70    let mut body_start: usize = 0;
71    let mut parse_errors: Vec<String> = Vec::new();
72
73    // --- Parse frontmatter ---
74    if !lines.is_empty() && lines[0].trim() == "---" {
75        let mut closing = None;
76        for i in 1..lines.len() {
77            if lines[i].trim() == "---" {
78                closing = Some(i);
79                break;
80            }
81        }
82
83        if let Some(close_idx) = closing {
84            let fm_text: String = lines[1..close_idx].join("\n");
85            match serde_yaml::from_str::<serde_yaml::Value>(&fm_text) {
86                Ok(serde_yaml::Value::Null) => {
87                    // Empty frontmatter
88                }
89                Ok(val @ serde_yaml::Value::Mapping(_)) => {
90                    raw_frontmatter = val;
91                }
92                Ok(val) => {
93                    let type_name = match &val {
94                        serde_yaml::Value::Bool(_) => "bool",
95                        serde_yaml::Value::Number(_) => "number",
96                        serde_yaml::Value::String(_) => "str",
97                        serde_yaml::Value::Sequence(_) => "list",
98                        _ => "unknown",
99                    };
100                    parse_errors.push(format!(
101                        "Frontmatter is not a mapping (got {})",
102                        type_name
103                    ));
104                }
105                Err(e) => {
106                    parse_errors.push(format!("Malformed YAML in frontmatter: {}", e));
107                }
108            }
109            body_start = close_idx + 1;
110        } else {
111            parse_errors.push("Unclosed frontmatter (no closing '---')".to_string());
112            body_start = 1;
113        }
114    } else {
115        parse_errors.push("No frontmatter found (file must start with '---')".to_string());
116    }
117
118    // --- Parse body: H1, H2 sections ---
119    let mut h1: Option<String> = None;
120    let mut h1_line_number: Option<usize> = None;
121    let mut sections: Vec<Section> = Vec::new();
122
123    let mut in_fence = false;
124    let mut fence_char: Option<char> = None;
125    let mut fence_width: usize = 0;
126
127    let mut current_heading: Option<String> = None;
128    let mut current_heading_normalized: Option<String> = None;
129    let mut current_heading_line: Option<usize> = None;
130    let mut current_body_lines: Vec<&str> = Vec::new();
131    let mut has_loose_body = false;
132
133    let finalize_section = |heading: &mut Option<String>,
134                                heading_norm: &mut Option<String>,
135                                heading_line: &mut Option<usize>,
136                                body_lines: &mut Vec<&str>,
137                                sections: &mut Vec<Section>| {
138        if let Some(raw_h) = heading.take() {
139            let norm_h = heading_norm.take().unwrap_or_else(|| raw_h.clone());
140            let body = body_lines.join("\n").trim().to_string();
141            sections.push(Section {
142                raw_heading: raw_h,
143                normalized_heading: norm_h,
144                body,
145                line_number: heading_line.take().unwrap_or(0),
146            });
147            body_lines.clear();
148        }
149    };
150
151    for i in body_start..lines.len() {
152        let line = lines[i];
153        let line_num = i + 1; // 1-indexed
154
155        // --- Code fence tracking ---
156        if let Some(caps) = FENCE_OPEN_RE.captures(line) {
157            let marker = caps.get(1).unwrap().as_str();
158            let char = marker.chars().next().unwrap();
159            let width = marker.len();
160
161            if !in_fence {
162                in_fence = true;
163                fence_char = Some(char);
164                fence_width = width;
165                if current_heading.is_some() {
166                    current_body_lines.push(line);
167                }
168                continue;
169            } else if Some(char) == fence_char
170                && width >= fence_width
171                && line.trim() == marker
172            {
173                // Closing fence
174                in_fence = false;
175                fence_char = None;
176                fence_width = 0;
177                if current_heading.is_some() {
178                    current_body_lines.push(line);
179                }
180                continue;
181            }
182        }
183
184        if in_fence {
185            if current_heading.is_some() {
186                current_body_lines.push(line);
187            }
188            continue;
189        }
190
191        // --- H1 detection ---
192        if let Some(caps) = H1_RE.captures(line) {
193            if h1.is_none() {
194                h1 = Some(caps.get(1).unwrap().as_str().trim().to_string());
195                h1_line_number = Some(line_num);
196            } else {
197                parse_errors.push(format!(
198                    "Duplicate H1 at line {} (first was at line {})",
199                    line_num,
200                    h1_line_number.unwrap_or(0)
201                ));
202            }
203            continue;
204        }
205
206        // --- H2 detection ---
207        if let Some(caps) = H2_RE.captures(line) {
208            finalize_section(
209                &mut current_heading,
210                &mut current_heading_normalized,
211                &mut current_heading_line,
212                &mut current_body_lines,
213                &mut sections,
214            );
215            let raw_h = caps.get(1).unwrap().as_str().trim().to_string();
216            let norm_h = if normalize_numbered {
217                normalize_heading(&raw_h)
218            } else {
219                raw_h.clone()
220            };
221            current_heading = Some(raw_h);
222            current_heading_normalized = Some(norm_h);
223            current_heading_line = Some(line_num);
224            current_body_lines.clear();
225            continue;
226        }
227
228        // --- Regular content ---
229        if current_heading.is_some() {
230            current_body_lines.push(line);
231        } else if !has_loose_body && !line.trim().is_empty() {
232            has_loose_body = true;
233        }
234    }
235
236    finalize_section(
237        &mut current_heading,
238        &mut current_heading_normalized,
239        &mut current_heading_line,
240        &mut current_body_lines,
241        &mut sections,
242    );
243
244    ParsedFile {
245        path: rel_path.to_string(),
246        raw_frontmatter,
247        h1,
248        h1_line_number,
249        sections,
250        has_loose_body,
251        parse_errors,
252    }
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `text` as `test.md` without numbered-heading normalization.
    fn parse(text: &str) -> ParsedFile {
        parse_text(text, "test.md", false)
    }

    /// True when any collected parse error contains `needle`.
    fn has_error(parsed: &ParsedFile, needle: &str) -> bool {
        parsed.parse_errors.iter().any(|e| e.contains(needle))
    }

    #[test]
    fn test_basic_parse() {
        let parsed = parse(
            "---\ntitle: \"Hello\"\nstatus: \"active\"\n---\n\n## Summary\n\nA summary.\n\n## Details\n\nSome details.\n",
        );
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.sections.len(), 2);

        let summary = &parsed.sections[0];
        assert_eq!(summary.normalized_heading, "Summary");
        assert_eq!(summary.body, "A summary.");

        let details = &parsed.sections[1];
        assert_eq!(details.normalized_heading, "Details");
        assert_eq!(details.body, "Some details.");
    }

    #[test]
    fn test_frontmatter_extraction() {
        let parsed = parse("---\ntitle: \"Test\"\ncount: 42\n---\n\nBody text.\n");
        assert!(parsed.parse_errors.is_empty());

        let fm = parsed.raw_frontmatter.as_mapping().unwrap();
        let key = |name: &str| serde_yaml::Value::String(name.into());

        let title = fm.get(&key("title")).unwrap().as_str().unwrap();
        assert_eq!(title, "Test");

        let count = fm.get(&key("count")).unwrap().as_u64().unwrap();
        assert_eq!(count, 42);
    }

    #[test]
    fn test_no_frontmatter() {
        let parsed = parse("Just some text.\n");
        assert_eq!(parsed.parse_errors.len(), 1);
        assert!(parsed.parse_errors[0].contains("No frontmatter"));
    }

    #[test]
    fn test_unclosed_frontmatter() {
        let parsed = parse("---\ntitle: Test\nNo closing delimiter.\n");
        assert!(has_error(&parsed, "Unclosed"));
    }

    #[test]
    fn test_h1_detection() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n# My Title\n\n## Section\n\nBody.\n");
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.h1.as_deref(), Some("My Title"));
        assert_eq!(parsed.h1_line_number, Some(5));
    }

    #[test]
    fn test_duplicate_h1() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n# First\n\n# Second\n");
        assert!(has_error(&parsed, "Duplicate H1"));
    }

    #[test]
    fn test_code_fence_ignores_headings() {
        let parsed = parse(
            "---\ntitle: \"Test\"\n---\n\n## Section\n\n```\n# Not a heading\n## Also not\n```\n\nAfter fence.\n",
        );
        assert!(parsed.parse_errors.is_empty());
        assert!(parsed.h1.is_none());
        assert_eq!(parsed.sections.len(), 1);
        assert!(parsed.sections[0].body.contains("# Not a heading"));
    }

    #[test]
    fn test_numbered_heading_normalization() {
        let text = "---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n\n## 2. Method\n\nMore.\n";
        let parsed = parse_text(text, "test.md", true);
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.sections[0].raw_heading, "1. Hypothesis");
        assert_eq!(parsed.sections[0].normalized_heading, "Hypothesis");
        assert_eq!(parsed.sections[1].normalized_heading, "Method");
    }

    #[test]
    fn test_numbered_heading_no_normalization() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n");
        assert_eq!(parsed.sections[0].normalized_heading, "1. Hypothesis");
    }

    #[test]
    fn test_tilde_fence() {
        let parsed = parse(
            "---\ntitle: \"Test\"\n---\n\n## Section\n\n~~~\n## fake heading\n~~~\n\nReal content.\n",
        );
        assert_eq!(parsed.sections.len(), 1);
        assert!(parsed.sections[0].body.contains("## fake heading"));
    }

    #[test]
    fn test_section_line_numbers() {
        let parsed = parse(
            "---\ntitle: \"Test\"\n---\n\n## First\n\nBody 1.\n\n## Second\n\nBody 2.\n",
        );
        let line_numbers: Vec<usize> = parsed.sections.iter().map(|s| s.line_number).collect();
        assert_eq!(line_numbers[0], 5);
        assert_eq!(line_numbers[1], 9);
    }

    #[test]
    fn test_empty_sections() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n## Empty\n\n## Also Empty\n");
        assert_eq!(parsed.sections.len(), 2);
        assert_eq!(parsed.sections[0].body, "");
        assert_eq!(parsed.sections[1].body, "");
    }

    #[test]
    fn test_malformed_yaml() {
        let parsed = parse("---\n: [invalid yaml\n---\n");
        assert!(has_error(&parsed, "Malformed YAML"));
    }

    #[test]
    fn test_non_mapping_frontmatter() {
        let parsed = parse("---\n- a list\n- not a mapping\n---\n");
        assert!(has_error(&parsed, "not a mapping"));
    }
}