1use std::collections::HashMap;
2
/// Name of the field under which document (and per-item) body text is stored.
///
/// The name is reserved: it cannot be used as a `!tag` directive.
pub const BODY_FIELD: &str = "body";
5
6#[derive(Debug, Clone)]
8pub struct ParsedDocument {
9 fields: HashMap<String, serde_yaml::Value>,
10}
11
12impl ParsedDocument {
13 pub fn new(fields: HashMap<String, serde_yaml::Value>) -> Self {
15 Self { fields }
16 }
17
18 pub fn body(&self) -> Option<&str> {
20 self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
21 }
22
23 pub fn get_field(&self, name: &str) -> Option<&serde_yaml::Value> {
25 self.fields.get(name)
26 }
27
28 pub fn fields(&self) -> &HashMap<String, serde_yaml::Value> {
30 &self.fields
31 }
32}
33
/// One `---`-delimited metadata block located inside the markdown source.
#[derive(Debug)]
struct MetadataBlock {
    /// Byte offset of the opening `---`.
    start: usize,
    /// Byte offset just past the closing delimiter (including its line break,
    /// when one follows).
    end: usize,
    /// Text between the delimiters, with any `!tag` directive line removed.
    yaml_content: String,
    /// Tag directive name, or `None` for an untagged (global) block.
    tag: Option<String>,
}
41
/// Returns `true` when `name` matches `[a-z_][a-z0-9_]*`.
///
/// The empty string is rejected. Only ASCII lowercase letters, ASCII digits
/// and `_` are accepted, and the first character may not be a digit.
fn is_valid_tag_name(name: &str) -> bool {
    let mut chars = name.chars();
    match chars.next() {
        // The first character decides whether the rest is worth scanning.
        Some(first) if first.is_ascii_lowercase() || first == '_' => {
            chars.all(|ch| ch.is_ascii_lowercase() || ch.is_ascii_digit() || ch == '_')
        }
        // Empty input or a disallowed leading character.
        _ => false,
    }
}
63
64fn find_metadata_blocks(markdown: &str) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
66 let mut blocks = Vec::new();
67 let mut pos = 0;
68
69 while pos < markdown.len() {
70 let search_str = &markdown[pos..];
72 let delimiter_result = if let Some(p) = search_str.find("---\n") {
73 Some((p, 4, "\n"))
74 } else if let Some(p) = search_str.find("---\r\n") {
75 Some((p, 5, "\r\n"))
76 } else {
77 None
78 };
79
80 if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
81 let abs_pos = pos + delimiter_pos;
82 let content_start = abs_pos + delimiter_len; let followed_by_blank = if content_start < markdown.len() {
86 markdown[content_start..].starts_with('\n') || markdown[content_start..].starts_with("\r\n")
87 } else {
88 false
89 };
90
91 if followed_by_blank {
92 pos = abs_pos + 3; continue;
95 }
96
97 let rest = &markdown[content_start..];
100
101 let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
103 let closing_with_newline = closing_patterns
104 .iter()
105 .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
106 .min_by_key(|(p, _)| *p);
107
108 let closing_at_eof = ["\n---", "\r\n---"]
110 .iter()
111 .filter_map(|delim| {
112 rest.find(delim).and_then(|p| {
113 if p + delim.len() == rest.len() {
114 Some((p, delim.len()))
115 } else {
116 None
117 }
118 })
119 })
120 .min_by_key(|(p, _)| *p);
121
122 let closing_result = match (closing_with_newline, closing_at_eof) {
123 (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
124 (Some(_), Some(_)) => closing_with_newline,
125 (Some(_), None) => closing_with_newline,
126 (None, Some(_)) => closing_at_eof,
127 (None, None) => None,
128 };
129
130 if let Some((closing_pos, closing_len)) = closing_result {
131 let abs_closing_pos = content_start + closing_pos;
132 let content = &markdown[content_start..abs_closing_pos];
133
134 if content.contains("\n\n") || content.contains("\r\n\r\n") {
136 if abs_pos == 0 {
138 return Err("Frontmatter started but not closed with ---".into());
140 }
141 pos = abs_pos + 3;
143 continue;
144 }
145
146 let (tag, yaml_content) = if content.starts_with('!') {
148 if let Some(newline_pos) = content.find(|c| c == '\n' || c == '\r') {
149 let tag_line = &content[1..newline_pos];
150 let yaml_start = if content[newline_pos..].starts_with("\r\n") {
152 newline_pos + 2
153 } else {
154 newline_pos + 1
155 };
156 let yaml = if yaml_start < content.len() {
157 &content[yaml_start..]
158 } else {
159 ""
160 };
161 (Some(tag_line.trim().to_string()), yaml.to_string())
162 } else {
163 (Some(content[1..].trim().to_string()), String::new())
165 }
166 } else {
167 (None, content.to_string())
168 };
169
170 if let Some(ref tag_name) = tag {
172 if !is_valid_tag_name(tag_name) {
173 return Err(format!("Invalid tag name '{}': must match pattern [a-z_][a-z0-9_]*", tag_name).into());
174 }
175 if tag_name == BODY_FIELD {
176 return Err(format!("Cannot use reserved field name '{}' as tag directive", BODY_FIELD).into());
177 }
178 }
179
180 blocks.push(MetadataBlock {
181 start: abs_pos,
182 end: abs_closing_pos + closing_len, yaml_content,
184 tag,
185 });
186
187 pos = abs_closing_pos + closing_len;
188 } else if abs_pos == 0 {
189 return Err("Frontmatter started but not closed with ---".into());
191 } else {
192 pos = abs_pos + 3;
194 }
195 } else {
196 break;
197 }
198 }
199
200 Ok(blocks)
201}
202
203pub fn decompose(
205 markdown: &str,
206) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
207 let mut fields = HashMap::new();
208
209 let blocks = find_metadata_blocks(markdown)?;
211
212 if blocks.is_empty() {
213 fields.insert(
215 BODY_FIELD.to_string(),
216 serde_yaml::Value::String(markdown.to_string()),
217 );
218 return Ok(ParsedDocument::new(fields));
219 }
220
221 let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
223 let mut has_global_frontmatter = false;
224 let mut global_frontmatter_index: Option<usize> = None;
225
226 for (idx, block) in blocks.iter().enumerate() {
228 if block.tag.is_none() {
229 if has_global_frontmatter {
230 return Err("Multiple global frontmatter blocks found: only one untagged block allowed".into());
231 }
232 has_global_frontmatter = true;
233 global_frontmatter_index = Some(idx);
234 }
235 }
236
237 if let Some(idx) = global_frontmatter_index {
239 let block = &blocks[idx];
240
241 let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
243 HashMap::new()
244 } else {
245 serde_yaml::from_str(&block.yaml_content)
246 .map_err(|e| format!("Invalid YAML frontmatter: {}", e))?
247 };
248
249 for other_block in &blocks {
251 if let Some(ref tag) = other_block.tag {
252 if yaml_fields.contains_key(tag) {
253 return Err(format!("Name collision: global field '{}' conflicts with tagged attribute", tag).into());
254 }
255 }
256 }
257
258 fields.extend(yaml_fields);
259 }
260
261 for (idx, block) in blocks.iter().enumerate() {
263 if let Some(ref tag_name) = block.tag {
264 if fields.contains_key(tag_name) {
266 return Err(format!("Name collision: tagged attribute '{}' conflicts with global field", tag_name).into());
267 }
268
269 let mut item_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
271 HashMap::new()
272 } else {
273 serde_yaml::from_str(&block.yaml_content)
274 .map_err(|e| format!("Invalid YAML in tagged block '{}': {}", tag_name, e))?
275 };
276
277 let body_start = block.end;
279 let body_end = if idx + 1 < blocks.len() {
280 blocks[idx + 1].start
281 } else {
282 markdown.len()
283 };
284 let body = &markdown[body_start..body_end];
285
286 item_fields.insert(
288 BODY_FIELD.to_string(),
289 serde_yaml::Value::String(body.to_string()),
290 );
291
292 let item_value = serde_yaml::to_value(item_fields)?;
294
295 tagged_attributes.entry(tag_name.clone())
297 .or_insert_with(Vec::new)
298 .push(item_value);
299 }
300 }
301
302 let (body_start, body_end) = if let Some(idx) = global_frontmatter_index {
304 let start = blocks[idx].end;
306
307 let end = blocks.iter()
309 .skip(idx + 1)
310 .find(|b| b.tag.is_some())
311 .map(|b| b.start)
312 .unwrap_or(markdown.len());
313
314 (start, end)
315 } else {
316 let end = blocks.iter()
318 .find(|b| b.tag.is_some())
319 .map(|b| b.start)
320 .unwrap_or(0);
321
322 (0, end)
323 };
324
325 let global_body = &markdown[body_start..body_end];
326
327 fields.insert(
328 BODY_FIELD.to_string(),
329 serde_yaml::Value::String(global_body.to_string()),
330 );
331
332 for (tag_name, items) in tagged_attributes {
334 fields.insert(tag_name, serde_yaml::Value::Sequence(items));
335 }
336
337 Ok(ParsedDocument::new(fields))
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Top-level field as a string; panics when absent or not a string.
    fn text<'a>(doc: &'a ParsedDocument, name: &str) -> &'a str {
        doc.get_field(name).unwrap().as_str().unwrap()
    }

    /// Top-level field as a sequence; panics when absent or not a sequence.
    fn list<'a>(doc: &'a ParsedDocument, name: &str) -> &'a [serde_yaml::Value] {
        doc.get_field(name).unwrap().as_sequence().unwrap()
    }

    /// Value stored under `key` in a mapping item; panics when missing.
    fn entry<'a>(item: &'a serde_yaml::Value, key: &str) -> &'a serde_yaml::Value {
        item.as_mapping()
            .unwrap()
            .get(&serde_yaml::Value::String(key.to_string()))
            .unwrap()
    }

    /// Error message produced for `markdown`; panics when parsing succeeds.
    fn error_text(markdown: &str) -> String {
        decompose(markdown).unwrap_err().to_string()
    }

    #[test]
    fn test_no_frontmatter() {
        let markdown = "# Hello World\n\nThis is a test.";
        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some(markdown));
        assert_eq!(doc.fields().len(), 1);
    }

    #[test]
    fn test_with_frontmatter() {
        let markdown = r#"---
title: Test Document
author: Test Author
---

# Hello World

This is the body."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
        assert_eq!(text(&doc, "title"), "Test Document");
        assert_eq!(text(&doc, "author"), "Test Author");
        // title + author + body
        assert_eq!(doc.fields().len(), 3);
    }

    #[test]
    fn test_complex_yaml_frontmatter() {
        let markdown = r#"---
title: Complex Document
tags:
  - test
  - yaml
metadata:
  version: 1.0
  nested:
    field: value
---

Content here."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\nContent here."));
        assert_eq!(text(&doc, "title"), "Complex Document");

        let tags = list(&doc, "tags");
        assert_eq!(tags.len(), 2);
        assert_eq!(tags[0].as_str().unwrap(), "test");
        assert_eq!(tags[1].as_str().unwrap(), "yaml");
    }

    #[test]
    fn test_invalid_yaml() {
        let markdown = r#"---
title: [invalid yaml
author: missing close bracket
---

Content here."#;

        assert!(error_text(markdown).contains("Invalid YAML frontmatter"));
    }

    #[test]
    fn test_unclosed_frontmatter() {
        let markdown = r#"---
title: Test
author: Test Author

Content without closing ---"#;

        assert!(error_text(markdown).contains("not closed"));
    }

    #[test]
    fn test_basic_tagged_block() {
        let markdown = r#"---
title: Main Document
---

Main body content.

---
!items
name: Item 1
---

Body of item 1."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
        assert_eq!(text(&doc, "title"), "Main Document");

        let items = list(&doc, "items");
        assert_eq!(items.len(), 1);
        assert_eq!(entry(&items[0], "name").as_str().unwrap(), "Item 1");
        assert_eq!(
            entry(&items[0], "body").as_str().unwrap(),
            "\nBody of item 1."
        );
    }

    #[test]
    fn test_multiple_tagged_blocks() {
        let markdown = r#"---
!items
name: Item 1
tags: [a, b]
---

First item body.

---
!items
name: Item 2
tags: [c, d]
---

Second item body."#;

        let doc = decompose(markdown).unwrap();

        let items = list(&doc, "items");
        assert_eq!(items.len(), 2);
        assert_eq!(entry(&items[0], "name").as_str().unwrap(), "Item 1");
        assert_eq!(entry(&items[1], "name").as_str().unwrap(), "Item 2");
    }

    #[test]
    fn test_mixed_global_and_tagged() {
        let markdown = r#"---
title: Global
author: John Doe
---

Global body.

---
!sections
title: Section 1
---

Section 1 content.

---
!sections
title: Section 2
---

Section 2 content."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(text(&doc, "title"), "Global");
        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));
        assert_eq!(list(&doc, "sections").len(), 2);
    }

    #[test]
    fn test_empty_tagged_metadata() {
        let markdown = r#"---
!items
---

Body without metadata."#;

        let doc = decompose(markdown).unwrap();

        let items = list(&doc, "items");
        assert_eq!(items.len(), 1);
        assert_eq!(
            entry(&items[0], "body").as_str().unwrap(),
            "\nBody without metadata."
        );
    }

    #[test]
    fn test_tagged_block_without_body() {
        let markdown = r#"---
!items
name: Item
---"#;

        let doc = decompose(markdown).unwrap();

        let items = list(&doc, "items");
        assert_eq!(items.len(), 1);
        assert_eq!(entry(&items[0], "body").as_str().unwrap(), "");
    }

    #[test]
    fn test_name_collision_global_and_tagged() {
        let markdown = r#"---
items: "global value"
---

Body

---
!items
name: Item
---

Item body"#;

        assert!(error_text(markdown).contains("collision"));
    }

    #[test]
    fn test_reserved_field_name() {
        let markdown = r#"---
!body
content: Test
---"#;

        assert!(error_text(markdown).contains("reserved"));
    }

    #[test]
    fn test_invalid_tag_syntax() {
        let markdown = r#"---
!Invalid-Name
title: Test
---"#;

        assert!(error_text(markdown).contains("Invalid tag name"));
    }

    #[test]
    fn test_multiple_global_frontmatter_blocks() {
        let markdown = r#"---
title: First
---

Body

---
author: Second
---

More body"#;

        assert!(error_text(markdown).contains("Multiple global frontmatter"));
    }

    #[test]
    fn test_adjacent_blocks_different_tags() {
        let markdown = r#"---
!items
name: Item 1
---

Item 1 body

---
!sections
title: Section 1
---

Section 1 body"#;

        let doc = decompose(markdown).unwrap();

        assert!(doc.get_field("items").is_some());
        assert!(doc.get_field("sections").is_some());
        assert_eq!(list(&doc, "items").len(), 1);
        assert_eq!(list(&doc, "sections").len(), 1);
    }

    #[test]
    fn test_order_preservation() {
        let markdown = r#"---
!items
id: 1
---

First

---
!items
id: 2
---

Second

---
!items
id: 3
---

Third"#;

        let doc = decompose(markdown).unwrap();

        let items = list(&doc, "items");
        assert_eq!(items.len(), 3);

        // Items must come back in the order they appear in the document.
        for (i, item) in items.iter().enumerate() {
            let id = entry(item, "id").as_i64().unwrap();
            assert_eq!(id, (i + 1) as i64);
        }
    }

    #[test]
    fn test_product_catalog_integration() {
        let markdown = r#"---
title: Product Catalog
author: John Doe
date: 2024-01-01
---

This is the main catalog description.

---
!products
name: Widget A
price: 19.99
sku: WID-001
---

The **Widget A** is our most popular product.

---
!products
name: Gadget B
price: 29.99
sku: GAD-002
---

The **Gadget B** is perfect for professionals.

---
!reviews
product: Widget A
rating: 5
---

"Excellent product! Highly recommended."

---
!reviews
product: Gadget B
rating: 4
---

"Very good, but a bit pricey.""#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(text(&doc, "title"), "Product Catalog");
        assert_eq!(text(&doc, "author"), "John Doe");
        assert_eq!(text(&doc, "date"), "2024-01-01");

        assert!(doc.body().unwrap().contains("main catalog description"));

        let products = list(&doc, "products");
        assert_eq!(products.len(), 2);
        assert_eq!(entry(&products[0], "name").as_str().unwrap(), "Widget A");
        assert_eq!(entry(&products[0], "price").as_f64().unwrap(), 19.99);

        let reviews = list(&doc, "reviews");
        assert_eq!(reviews.len(), 2);
        assert_eq!(entry(&reviews[0], "product").as_str().unwrap(), "Widget A");
        assert_eq!(entry(&reviews[0], "rating").as_i64().unwrap(), 5);

        // title, author, date, body, products, reviews
        assert_eq!(doc.fields().len(), 6);
    }
}
#[cfg(test)]
mod demo_file_test {
    use super::*;

    /// Parses the bundled demo fixture and checks its global fields and
    /// tagged collections.
    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        let field = |name: &str| doc.get_field(name).unwrap();

        // Global front matter.
        assert_eq!(field("title").as_str().unwrap(), "Extended Metadata Demo");
        assert_eq!(field("author").as_str().unwrap(), "Quillmark Team");
        assert_eq!(field("version").as_f64().unwrap(), 1.0);

        assert!(doc
            .body()
            .unwrap()
            .contains("extended YAML metadata standard"));

        // Tagged collections.
        let features = field("features").as_sequence().unwrap();
        assert_eq!(features.len(), 3);

        let use_cases = field("use_cases").as_sequence().unwrap();
        assert_eq!(use_cases.len(), 2);

        let first_feature = features[0].as_mapping().unwrap();
        let name_key = serde_yaml::Value::String("name".to_string());
        assert_eq!(
            first_feature.get(&name_key).unwrap().as_str().unwrap(),
            "Tag Directives"
        );
    }
}