1use std::path::Path;
7
8use regex::Regex;
9use std::sync::LazyLock;
10
11use crate::errors::MdqlError;
12
/// One `##`-level section of a Markdown document.
///
/// All fields are plain owned data, so the type supports full equality (`Eq`)
/// in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// Heading text as written after the `##` marker, with surrounding whitespace trimmed.
    pub raw_heading: String,
    /// Heading used for lookups: the raw heading with a leading `N. ` ordinal removed when
    /// numbered-heading normalization was requested, otherwise identical to `raw_heading`.
    pub normalized_heading: String,
    /// Lines collected under the heading, joined with `\n` and trimmed.
    pub body: String,
    /// 1-based line number of the heading within the parsed text.
    pub line_number: usize,
}
20
/// Everything extracted from a single Markdown file, together with any problems found.
#[derive(Debug, Clone)]
pub struct ParsedFile {
    /// Path label for the file, exactly as it was handed to the parser.
    pub path: String,
    /// Parsed YAML frontmatter. Stays an empty mapping when the frontmatter is missing,
    /// unclosed, empty, malformed, or not a mapping.
    pub raw_frontmatter: serde_yaml::Value,
    /// Text of the first `#` heading found outside code fences, if any.
    pub h1: Option<String>,
    /// 1-based line number of that first `#` heading.
    pub h1_line_number: Option<usize>,
    /// `##` sections in document order.
    pub sections: Vec<Section>,
    /// Human-readable descriptions of structural problems encountered while parsing.
    pub parse_errors: Vec<String>,
}
30
/// Matches a leading ordinal at the start of a heading: digits, a dot, then whitespace
/// (for example `1. `).
static NUMBERED_HEADING_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\d+\.\s+").unwrap());
/// Matches a run of three or more backticks or tildes at the very start of a line.
/// Capture group 1 is the run itself.
static FENCE_OPEN_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
/// Matches a line that starts with a single `#` followed by whitespace.
/// Capture group 1 is the heading text.
static H1_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^#\s+(.+)$").unwrap());
/// Matches a line that starts with `##` followed by whitespace.
/// Capture group 1 is the heading text.
static H2_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^##\s+(.+)$").unwrap());
39
40pub fn normalize_heading(raw: &str) -> String {
41 NUMBERED_HEADING_RE.replace(raw, "").trim().to_string()
42}
43
44pub fn parse_file(
45 path: &Path,
46 relative_to: Option<&Path>,
47 normalize_numbered: bool,
48) -> crate::errors::Result<ParsedFile> {
49 let rel_path = if let Some(base) = relative_to {
50 path.strip_prefix(base)
51 .unwrap_or(path)
52 .to_string_lossy()
53 .to_string()
54 } else {
55 path.to_string_lossy().to_string()
56 };
57
58 let text = std::fs::read_to_string(path).map_err(|e| {
59 MdqlError::Parse(format!("Cannot read {}: {}", rel_path, e))
60 })?;
61
62 Ok(parse_text(&text, &rel_path, normalize_numbered))
63}
64
65pub fn parse_text(text: &str, rel_path: &str, normalize_numbered: bool) -> ParsedFile {
67 let lines: Vec<&str> = text.split('\n').collect();
68 let mut raw_frontmatter = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
69 let mut body_start: usize = 0;
70 let mut parse_errors: Vec<String> = Vec::new();
71
72 if !lines.is_empty() && lines[0].trim() == "---" {
74 let mut closing = None;
75 for i in 1..lines.len() {
76 if lines[i].trim() == "---" {
77 closing = Some(i);
78 break;
79 }
80 }
81
82 if let Some(close_idx) = closing {
83 let fm_text: String = lines[1..close_idx].join("\n");
84 match serde_yaml::from_str::<serde_yaml::Value>(&fm_text) {
85 Ok(serde_yaml::Value::Null) => {
86 }
88 Ok(val @ serde_yaml::Value::Mapping(_)) => {
89 raw_frontmatter = val;
90 }
91 Ok(val) => {
92 let type_name = match &val {
93 serde_yaml::Value::Bool(_) => "bool",
94 serde_yaml::Value::Number(_) => "number",
95 serde_yaml::Value::String(_) => "str",
96 serde_yaml::Value::Sequence(_) => "list",
97 _ => "unknown",
98 };
99 parse_errors.push(format!(
100 "Frontmatter is not a mapping (got {})",
101 type_name
102 ));
103 }
104 Err(e) => {
105 parse_errors.push(format!("Malformed YAML in frontmatter: {}", e));
106 }
107 }
108 body_start = close_idx + 1;
109 } else {
110 parse_errors.push("Unclosed frontmatter (no closing '---')".to_string());
111 body_start = 1;
112 }
113 } else {
114 parse_errors.push("No frontmatter found (file must start with '---')".to_string());
115 }
116
117 let mut h1: Option<String> = None;
119 let mut h1_line_number: Option<usize> = None;
120 let mut sections: Vec<Section> = Vec::new();
121
122 let mut in_fence = false;
123 let mut fence_char: Option<char> = None;
124 let mut fence_width: usize = 0;
125
126 let mut current_heading: Option<String> = None;
127 let mut current_heading_normalized: Option<String> = None;
128 let mut current_heading_line: Option<usize> = None;
129 let mut current_body_lines: Vec<&str> = Vec::new();
130
131 let finalize_section = |heading: &mut Option<String>,
132 heading_norm: &mut Option<String>,
133 heading_line: &mut Option<usize>,
134 body_lines: &mut Vec<&str>,
135 sections: &mut Vec<Section>| {
136 if let Some(raw_h) = heading.take() {
137 let norm_h = heading_norm.take().unwrap_or_else(|| raw_h.clone());
138 let body = body_lines.join("\n").trim().to_string();
139 sections.push(Section {
140 raw_heading: raw_h,
141 normalized_heading: norm_h,
142 body,
143 line_number: heading_line.take().unwrap_or(0),
144 });
145 body_lines.clear();
146 }
147 };
148
149 for i in body_start..lines.len() {
150 let line = lines[i];
151 let line_num = i + 1; if let Some(caps) = FENCE_OPEN_RE.captures(line) {
155 let marker = caps.get(1).unwrap().as_str();
156 let char = marker.chars().next().unwrap();
157 let width = marker.len();
158
159 if !in_fence {
160 in_fence = true;
161 fence_char = Some(char);
162 fence_width = width;
163 if current_heading.is_some() {
164 current_body_lines.push(line);
165 }
166 continue;
167 } else if Some(char) == fence_char
168 && width >= fence_width
169 && line.trim() == marker
170 {
171 in_fence = false;
173 fence_char = None;
174 fence_width = 0;
175 if current_heading.is_some() {
176 current_body_lines.push(line);
177 }
178 continue;
179 }
180 }
181
182 if in_fence {
183 if current_heading.is_some() {
184 current_body_lines.push(line);
185 }
186 continue;
187 }
188
189 if let Some(caps) = H1_RE.captures(line) {
191 if h1.is_none() {
192 h1 = Some(caps.get(1).unwrap().as_str().trim().to_string());
193 h1_line_number = Some(line_num);
194 } else {
195 parse_errors.push(format!(
196 "Duplicate H1 at line {} (first was at line {})",
197 line_num,
198 h1_line_number.unwrap_or(0)
199 ));
200 }
201 continue;
202 }
203
204 if let Some(caps) = H2_RE.captures(line) {
206 finalize_section(
207 &mut current_heading,
208 &mut current_heading_normalized,
209 &mut current_heading_line,
210 &mut current_body_lines,
211 &mut sections,
212 );
213 let raw_h = caps.get(1).unwrap().as_str().trim().to_string();
214 let norm_h = if normalize_numbered {
215 normalize_heading(&raw_h)
216 } else {
217 raw_h.clone()
218 };
219 current_heading = Some(raw_h);
220 current_heading_normalized = Some(norm_h);
221 current_heading_line = Some(line_num);
222 current_body_lines.clear();
223 continue;
224 }
225
226 if current_heading.is_some() {
228 current_body_lines.push(line);
229 }
230 }
231
232 finalize_section(
233 &mut current_heading,
234 &mut current_heading_normalized,
235 &mut current_heading_line,
236 &mut current_body_lines,
237 &mut sections,
238 );
239
240 ParsedFile {
241 path: rel_path.to_string(),
242 raw_frontmatter,
243 h1,
244 h1_line_number,
245 sections,
246 parse_errors,
247 }
248}
249
/// Unit tests for `parse_text`, driven by inline Markdown fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    /// Two `##` sections are split out and their bodies are trimmed.
    #[test]
    fn test_basic_parse() {
        let text = "---\ntitle: \"Hello\"\nstatus: \"active\"\n---\n\n## Summary\n\nA summary.\n\n## Details\n\nSome details.\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.sections.len(), 2);
        assert_eq!(parsed.sections[0].normalized_heading, "Summary");
        assert_eq!(parsed.sections[0].body, "A summary.");
        assert_eq!(parsed.sections[1].normalized_heading, "Details");
        assert_eq!(parsed.sections[1].body, "Some details.");
    }

    /// Frontmatter keys are available as a YAML mapping with typed values.
    #[test]
    fn test_frontmatter_extraction() {
        let text = "---\ntitle: \"Test\"\ncount: 42\n---\n\nBody text.\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.is_empty());
        let fm = parsed.raw_frontmatter.as_mapping().unwrap();
        assert_eq!(
            fm.get(&serde_yaml::Value::String("title".into()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Test"
        );
        assert_eq!(
            fm.get(&serde_yaml::Value::String("count".into()))
                .unwrap()
                .as_u64()
                .unwrap(),
            42
        );
    }

    /// A file that does not start with `---` yields exactly one "No frontmatter" error.
    #[test]
    fn test_no_frontmatter() {
        let text = "Just some text.\n";
        let parsed = parse_text(text, "test.md", false);
        assert_eq!(parsed.parse_errors.len(), 1);
        assert!(parsed.parse_errors[0].contains("No frontmatter"));
    }

    /// An opening `---` without a closing one is reported as unclosed.
    #[test]
    fn test_unclosed_frontmatter() {
        let text = "---\ntitle: Test\nNo closing delimiter.\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.iter().any(|e| e.contains("Unclosed")));
    }

    /// The H1 text is captured and its line number is 1-based, counting frontmatter lines.
    #[test]
    fn test_h1_detection() {
        let text = "---\ntitle: \"Test\"\n---\n\n# My Title\n\n## Section\n\nBody.\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.h1.as_deref(), Some("My Title"));
        assert_eq!(parsed.h1_line_number, Some(5));
    }

    /// A second `#` heading produces a "Duplicate H1" error.
    #[test]
    fn test_duplicate_h1() {
        let text = "---\ntitle: \"Test\"\n---\n\n# First\n\n# Second\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.iter().any(|e| e.contains("Duplicate H1")));
    }

    /// Heading-like lines inside a backtick fence stay in the section body.
    #[test]
    fn test_code_fence_ignores_headings() {
        let text = "---\ntitle: \"Test\"\n---\n\n## Section\n\n```\n# Not a heading\n## Also not\n```\n\nAfter fence.\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.is_empty());
        assert!(parsed.h1.is_none());
        assert_eq!(parsed.sections.len(), 1);
        assert!(parsed.sections[0].body.contains("# Not a heading"));
    }

    /// With normalization on, the `N. ` ordinal is removed from the normalized heading only.
    #[test]
    fn test_numbered_heading_normalization() {
        let text = "---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n\n## 2. Method\n\nMore.\n";
        let parsed = parse_text(text, "test.md", true);
        assert!(parsed.parse_errors.is_empty());
        assert_eq!(parsed.sections[0].raw_heading, "1. Hypothesis");
        assert_eq!(parsed.sections[0].normalized_heading, "Hypothesis");
        assert_eq!(parsed.sections[1].normalized_heading, "Method");
    }

    /// With normalization off, the normalized heading keeps its ordinal.
    #[test]
    fn test_numbered_heading_no_normalization() {
        let text = "---\ntitle: \"Test\"\n---\n\n## 1. Hypothesis\n\nContent.\n";
        let parsed = parse_text(text, "test.md", false);
        assert_eq!(parsed.sections[0].normalized_heading, "1. Hypothesis");
    }

    /// Tilde fences hide headings the same way backtick fences do.
    #[test]
    fn test_tilde_fence() {
        let text = "---\ntitle: \"Test\"\n---\n\n## Section\n\n~~~\n## fake heading\n~~~\n\nReal content.\n";
        let parsed = parse_text(text, "test.md", false);
        assert_eq!(parsed.sections.len(), 1);
        assert!(parsed.sections[0].body.contains("## fake heading"));
    }

    /// Each section records the 1-based line number of its heading.
    #[test]
    fn test_section_line_numbers() {
        let text = "---\ntitle: \"Test\"\n---\n\n## First\n\nBody 1.\n\n## Second\n\nBody 2.\n";
        let parsed = parse_text(text, "test.md", false);
        assert_eq!(parsed.sections[0].line_number, 5);
        assert_eq!(parsed.sections[1].line_number, 9);
    }

    /// Sections with no content are kept and have an empty body.
    #[test]
    fn test_empty_sections() {
        let text = "---\ntitle: \"Test\"\n---\n\n## Empty\n\n## Also Empty\n";
        let parsed = parse_text(text, "test.md", false);
        assert_eq!(parsed.sections.len(), 2);
        assert_eq!(parsed.sections[0].body, "");
        assert_eq!(parsed.sections[1].body, "");
    }

    /// Unparseable YAML is reported rather than causing a panic.
    #[test]
    fn test_malformed_yaml() {
        let text = "---\n: [invalid yaml\n---\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.iter().any(|e| e.contains("Malformed YAML")));
    }

    /// Valid YAML that is not a mapping (here a list) is rejected with an error.
    #[test]
    fn test_non_mapping_frontmatter() {
        let text = "---\n- a list\n- not a mapping\n---\n";
        let parsed = parse_text(text, "test.md", false);
        assert!(parsed.parse_errors.iter().any(|e| e.contains("not a mapping")));
    }
}