1use std::path::Path;
7
8use regex::Regex;
9use std::sync::LazyLock;
10
11use crate::errors::MdqlError;
12
/// One `##`-level section of a parsed markdown file.
///
/// All fields are plain owned values, so the type is totally comparable and
/// derives `Eq` alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// Heading text exactly as written after the `##` marker, trimmed.
    pub raw_heading: String,
    /// Heading text used for lookups: the raw heading with a leading
    /// `"<digits>. "` prefix removed when numbered-heading normalization is
    /// enabled, otherwise identical to `raw_heading`.
    pub normalized_heading: String,
    /// Lines between this heading and the next `##` heading (or the end of the
    /// input), joined with `\n` and trimmed. `#` heading lines are not included.
    pub body: String,
    /// 1-based line number of the heading within the whole input text.
    pub line_number: usize,
}
20
21#[derive(Debug, Clone)]
22pub struct ParsedFile {
23 pub path: String,
24 pub raw_frontmatter: serde_yaml::Value,
25 pub h1: Option<String>,
26 pub h1_line_number: Option<usize>,
27 pub sections: Vec<Section>,
28 pub has_loose_body: bool,
29 pub parse_errors: Vec<String>,
30}
31
32static NUMBERED_HEADING_RE: LazyLock<Regex> =
33 LazyLock::new(|| Regex::new(r"^\d+\.\s+").unwrap());
34static FENCE_OPEN_RE: LazyLock<Regex> =
35 LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
36static H1_RE: LazyLock<Regex> =
37 LazyLock::new(|| Regex::new(r"^#\s+(.+)$").unwrap());
38static H2_RE: LazyLock<Regex> =
39 LazyLock::new(|| Regex::new(r"^##\s+(.+)$").unwrap());
40
41pub fn normalize_heading(raw: &str) -> String {
42 NUMBERED_HEADING_RE.replace(raw, "").trim().to_string()
43}
44
45pub fn parse_file(
46 path: &Path,
47 relative_to: Option<&Path>,
48 normalize_numbered: bool,
49) -> crate::errors::Result<ParsedFile> {
50 let rel_path = if let Some(base) = relative_to {
51 path.strip_prefix(base)
52 .unwrap_or(path)
53 .to_string_lossy()
54 .to_string()
55 } else {
56 path.to_string_lossy().to_string()
57 };
58
59 let text = std::fs::read_to_string(path).map_err(|e| {
60 MdqlError::Parse(format!("Cannot read {}: {}", rel_path, e))
61 })?;
62
63 Ok(parse_text(&text, &rel_path, normalize_numbered))
64}
65
66pub(crate) fn parse_text(text: &str, rel_path: &str, normalize_numbered: bool) -> ParsedFile {
68 let lines: Vec<&str> = text.split('\n').collect();
69 let mut raw_frontmatter = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
70 let mut body_start: usize = 0;
71 let mut parse_errors: Vec<String> = Vec::new();
72
73 if !lines.is_empty() && lines[0].trim() == "---" {
75 let mut closing = None;
76 for i in 1..lines.len() {
77 if lines[i].trim() == "---" {
78 closing = Some(i);
79 break;
80 }
81 }
82
83 if let Some(close_idx) = closing {
84 let fm_text: String = lines[1..close_idx].join("\n");
85 match serde_yaml::from_str::<serde_yaml::Value>(&fm_text) {
86 Ok(serde_yaml::Value::Null) => {
87 }
89 Ok(val @ serde_yaml::Value::Mapping(_)) => {
90 raw_frontmatter = val;
91 }
92 Ok(val) => {
93 let type_name = match &val {
94 serde_yaml::Value::Bool(_) => "bool",
95 serde_yaml::Value::Number(_) => "number",
96 serde_yaml::Value::String(_) => "str",
97 serde_yaml::Value::Sequence(_) => "list",
98 _ => "unknown",
99 };
100 parse_errors.push(format!(
101 "Frontmatter is not a mapping (got {})",
102 type_name
103 ));
104 }
105 Err(e) => {
106 parse_errors.push(format!("Malformed YAML in frontmatter: {}", e));
107 }
108 }
109 body_start = close_idx + 1;
110 } else {
111 parse_errors.push("Unclosed frontmatter (no closing '---')".to_string());
112 body_start = 1;
113 }
114 } else {
115 parse_errors.push("No frontmatter found (file must start with '---')".to_string());
116 }
117
118 let mut h1: Option<String> = None;
120 let mut h1_line_number: Option<usize> = None;
121 let mut sections: Vec<Section> = Vec::new();
122
123 let mut in_fence = false;
124 let mut fence_char: Option<char> = None;
125 let mut fence_width: usize = 0;
126
127 let mut current_heading: Option<String> = None;
128 let mut current_heading_normalized: Option<String> = None;
129 let mut current_heading_line: Option<usize> = None;
130 let mut current_body_lines: Vec<&str> = Vec::new();
131 let mut has_loose_body = false;
132
133 let finalize_section = |heading: &mut Option<String>,
134 heading_norm: &mut Option<String>,
135 heading_line: &mut Option<usize>,
136 body_lines: &mut Vec<&str>,
137 sections: &mut Vec<Section>| {
138 if let Some(raw_h) = heading.take() {
139 let norm_h = heading_norm.take().unwrap_or_else(|| raw_h.clone());
140 let body = body_lines.join("\n").trim().to_string();
141 sections.push(Section {
142 raw_heading: raw_h,
143 normalized_heading: norm_h,
144 body,
145 line_number: heading_line.take().unwrap_or(0),
146 });
147 body_lines.clear();
148 }
149 };
150
151 for i in body_start..lines.len() {
152 let line = lines[i];
153 let line_num = i + 1; if let Some(caps) = FENCE_OPEN_RE.captures(line) {
157 let marker = caps.get(1).unwrap().as_str();
158 let char = marker.chars().next().unwrap();
159 let width = marker.len();
160
161 if !in_fence {
162 in_fence = true;
163 fence_char = Some(char);
164 fence_width = width;
165 if current_heading.is_some() {
166 current_body_lines.push(line);
167 }
168 continue;
169 } else if Some(char) == fence_char
170 && width >= fence_width
171 && line.trim() == marker
172 {
173 in_fence = false;
175 fence_char = None;
176 fence_width = 0;
177 if current_heading.is_some() {
178 current_body_lines.push(line);
179 }
180 continue;
181 }
182 }
183
184 if in_fence {
185 if current_heading.is_some() {
186 current_body_lines.push(line);
187 }
188 continue;
189 }
190
191 if let Some(caps) = H1_RE.captures(line) {
193 if h1.is_none() {
194 h1 = Some(caps.get(1).unwrap().as_str().trim().to_string());
195 h1_line_number = Some(line_num);
196 } else {
197 parse_errors.push(format!(
198 "Duplicate H1 at line {} (first was at line {})",
199 line_num,
200 h1_line_number.unwrap_or(0)
201 ));
202 }
203 continue;
204 }
205
206 if let Some(caps) = H2_RE.captures(line) {
208 finalize_section(
209 &mut current_heading,
210 &mut current_heading_normalized,
211 &mut current_heading_line,
212 &mut current_body_lines,
213 &mut sections,
214 );
215 let raw_h = caps.get(1).unwrap().as_str().trim().to_string();
216 let norm_h = if normalize_numbered {
217 normalize_heading(&raw_h)
218 } else {
219 raw_h.clone()
220 };
221 current_heading = Some(raw_h);
222 current_heading_normalized = Some(norm_h);
223 current_heading_line = Some(line_num);
224 current_body_lines.clear();
225 continue;
226 }
227
228 if current_heading.is_some() {
230 current_body_lines.push(line);
231 } else if !has_loose_body && !line.trim().is_empty() {
232 has_loose_body = true;
233 }
234 }
235
236 finalize_section(
237 &mut current_heading,
238 &mut current_heading_normalized,
239 &mut current_heading_line,
240 &mut current_body_lines,
241 &mut sections,
242 );
243
244 ParsedFile {
245 path: rel_path.to_string(),
246 raw_frontmatter,
247 h1,
248 h1_line_number,
249 sections,
250 has_loose_body,
251 parse_errors,
252 }
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `text` as `test.md` with numbered-heading normalization disabled.
    fn parse(text: &str) -> ParsedFile {
        parse_text(text, "test.md", false)
    }

    /// Reports whether any recorded parse error contains `needle`.
    fn has_error(parsed: &ParsedFile, needle: &str) -> bool {
        parsed.parse_errors.iter().any(|msg| msg.contains(needle))
    }

    /// Collects the section bodies in document order.
    fn bodies(parsed: &ParsedFile) -> Vec<&str> {
        parsed.sections.iter().map(|s| s.body.as_str()).collect()
    }

    #[test]
    fn test_basic_parse() {
        let parsed = parse(
            "---\ntitle: \"Hello\"\nstatus: \"active\"\n---\n\n## Summary\n\nA summary.\n\n## Details\n\nSome details.\n",
        );
        assert!(parsed.parse_errors.is_empty());

        let found: Vec<(&str, &str)> = parsed
            .sections
            .iter()
            .map(|s| (s.normalized_heading.as_str(), s.body.as_str()))
            .collect();
        assert_eq!(
            found,
            vec![("Summary", "A summary."), ("Details", "Some details.")]
        );
    }

    #[test]
    fn test_frontmatter_extraction() {
        let parsed = parse("---\ntitle: \"Test\"\ncount: 42\n---\n\nBody text.\n");
        assert!(parsed.parse_errors.is_empty());

        let fm = parsed.raw_frontmatter.as_mapping().unwrap();
        let title_key = serde_yaml::Value::String("title".into());
        let count_key = serde_yaml::Value::String("count".into());
        assert_eq!(fm.get(&title_key).and_then(|v| v.as_str()), Some("Test"));
        assert_eq!(fm.get(&count_key).and_then(|v| v.as_u64()), Some(42));
    }

    #[test]
    fn test_no_frontmatter() {
        let parsed = parse("Just some text.\n");
        match parsed.parse_errors.as_slice() {
            [only] => assert!(only.contains("No frontmatter")),
            other => panic!("expected exactly one parse error, got {:?}", other),
        }
    }

    #[test]
    fn test_unclosed_frontmatter() {
        let parsed = parse("---\ntitle: Test\nNo closing delimiter.\n");
        assert!(has_error(&parsed, "Unclosed"));
    }

    #[test]
    fn test_h1_detection() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n# My Title\n\n## Section\n\nBody.\n");
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.h1.as_deref(), Some("My Title"));
        assert_eq!(parsed.h1_line_number, Some(5));
    }

    #[test]
    fn test_duplicate_h1() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n# First\n\n# Second\n");
        assert!(has_error(&parsed, "Duplicate H1"));
    }

    #[test]
    fn test_code_fence_ignores_headings() {
        let parsed = parse(
            "---\ntitle: \"Test\"\n---\n\n## Section\n\n```\n# Not a heading\n## Also not\n```\n\nAfter fence.\n",
        );
        assert!(parsed.parse_errors.is_empty());
        assert!(parsed.h1.is_none());

        let section_bodies = bodies(&parsed);
        assert_eq!(section_bodies.len(), 1);
        assert!(section_bodies[0].contains("# Not a heading"));
    }

    #[test]
    fn test_numbered_heading_normalization() {
        let parsed = parse_text(
            "---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n\n## 2. Method\n\nMore.\n",
            "test.md",
            true,
        );
        assert!(parsed.parse_errors.is_empty());

        let first = &parsed.sections[0];
        assert_eq!(first.raw_heading, "1. Hypothesis");
        assert_eq!(first.normalized_heading, "Hypothesis");
        assert_eq!(parsed.sections[1].normalized_heading, "Method");
    }

    #[test]
    fn test_numbered_heading_no_normalization() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n");
        assert_eq!(parsed.sections[0].normalized_heading, "1. Hypothesis");
    }

    #[test]
    fn test_tilde_fence() {
        let parsed = parse(
            "---\ntitle: \"Test\"\n---\n\n## Section\n\n~~~\n## fake heading\n~~~\n\nReal content.\n",
        );
        let section_bodies = bodies(&parsed);
        assert_eq!(section_bodies.len(), 1);
        assert!(section_bodies[0].contains("## fake heading"));
    }

    #[test]
    fn test_section_line_numbers() {
        let parsed = parse(
            "---\ntitle: \"Test\"\n---\n\n## First\n\nBody 1.\n\n## Second\n\nBody 2.\n",
        );
        assert_eq!(parsed.sections[0].line_number, 5);
        assert_eq!(parsed.sections[1].line_number, 9);
    }

    #[test]
    fn test_empty_sections() {
        let parsed = parse("---\ntitle: \"Test\"\n---\n\n## Empty\n\n## Also Empty\n");
        assert_eq!(bodies(&parsed), vec!["", ""]);
    }

    #[test]
    fn test_malformed_yaml() {
        let parsed = parse("---\n: [invalid yaml\n---\n");
        assert!(has_error(&parsed, "Malformed YAML"));
    }

    #[test]
    fn test_non_mapping_frontmatter() {
        let parsed = parse("---\n- a list\n- not a mapping\n---\n");
        assert!(has_error(&parsed, "not a mapping"));
    }
}