1use std::collections::HashMap;
85
/// Name of the reserved field under which a document's body text is stored.
///
/// [`ParsedDocument::body`] looks this key up, and the parser refuses to accept
/// it as the name of a `!tag` directive.
pub const BODY_FIELD: &str = "body";
88
/// A markdown document decomposed into named fields.
///
/// Frontmatter entries, tagged-block collections and the body text all live in
/// one map keyed by field name; the body is stored under [`BODY_FIELD`].
#[derive(Debug, Clone)]
pub struct ParsedDocument {
    /// Every parsed field, keyed by its name.
    fields: HashMap<String, serde_yaml::Value>,
}
94
95impl ParsedDocument {
96 pub fn new(fields: HashMap<String, serde_yaml::Value>) -> Self {
98 Self { fields }
99 }
100
101 pub fn body(&self) -> Option<&str> {
103 self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
104 }
105
106 pub fn get_field(&self, name: &str) -> Option<&serde_yaml::Value> {
108 self.fields.get(name)
109 }
110
111 pub fn fields(&self) -> &HashMap<String, serde_yaml::Value> {
113 &self.fields
114 }
115}
116
/// One `---`-delimited metadata block located by `find_metadata_blocks`.
#[derive(Debug)]
struct MetadataBlock {
    /// Byte offset in the source markdown at which the opening `---` begins.
    start: usize,
    /// Byte offset just past the closing delimiter.
    end: usize,
    /// Text between the delimiters, with any leading `!tag` line removed.
    yaml_content: String,
    /// Name taken from a leading `!tag` directive; `None` for an untagged block.
    tag: Option<String>,
}
124
/// Reports whether `name` matches the pattern `[a-z_][a-z0-9_]*`.
///
/// The empty string is rejected. Only ASCII lowercase letters, ASCII digits
/// and underscores are accepted, and the first character may not be a digit.
fn is_valid_tag_name(name: &str) -> bool {
    let mut remaining = name.chars();
    match remaining.next() {
        Some(head) if head.is_ascii_lowercase() || head == '_' => {
            remaining.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
        }
        // Empty input, or a first character outside `[a-z_]`.
        _ => false,
    }
}
146
147fn find_metadata_blocks(
149 markdown: &str,
150) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
151 let mut blocks = Vec::new();
152 let mut pos = 0;
153
154 while pos < markdown.len() {
155 let search_str = &markdown[pos..];
157 let delimiter_result = if let Some(p) = search_str.find("---\n") {
158 Some((p, 4, "\n"))
159 } else if let Some(p) = search_str.find("---\r\n") {
160 Some((p, 5, "\r\n"))
161 } else {
162 None
163 };
164
165 if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
166 let abs_pos = pos + delimiter_pos;
167 let content_start = abs_pos + delimiter_len; let followed_by_blank = if content_start < markdown.len() {
171 markdown[content_start..].starts_with('\n')
172 || markdown[content_start..].starts_with("\r\n")
173 } else {
174 false
175 };
176
177 if followed_by_blank {
178 pos = abs_pos + 3; continue;
181 }
182
183 let rest = &markdown[content_start..];
186
187 let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
189 let closing_with_newline = closing_patterns
190 .iter()
191 .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
192 .min_by_key(|(p, _)| *p);
193
194 let closing_at_eof = ["\n---", "\r\n---"]
196 .iter()
197 .filter_map(|delim| {
198 rest.find(delim).and_then(|p| {
199 if p + delim.len() == rest.len() {
200 Some((p, delim.len()))
201 } else {
202 None
203 }
204 })
205 })
206 .min_by_key(|(p, _)| *p);
207
208 let closing_result = match (closing_with_newline, closing_at_eof) {
209 (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
210 (Some(_), Some(_)) => closing_with_newline,
211 (Some(_), None) => closing_with_newline,
212 (None, Some(_)) => closing_at_eof,
213 (None, None) => None,
214 };
215
216 if let Some((closing_pos, closing_len)) = closing_result {
217 let abs_closing_pos = content_start + closing_pos;
218 let content = &markdown[content_start..abs_closing_pos];
219
220 if content.len() > crate::error::MAX_YAML_SIZE {
222 return Err(format!(
223 "YAML block too large: {} bytes (max: {} bytes)",
224 content.len(),
225 crate::error::MAX_YAML_SIZE
226 )
227 .into());
228 }
229
230 if content.contains("\n\n") || content.contains("\r\n\r\n") {
232 if abs_pos == 0 {
234 return Err("Frontmatter started but not closed with ---".into());
236 }
237 pos = abs_pos + 3;
239 continue;
240 }
241
242 let (tag, yaml_content) = if content.starts_with('!') {
244 if let Some(newline_pos) = content.find(|c| c == '\n' || c == '\r') {
245 let tag_line = &content[1..newline_pos];
246 let yaml_start = if content[newline_pos..].starts_with("\r\n") {
248 newline_pos + 2
249 } else {
250 newline_pos + 1
251 };
252 let yaml = if yaml_start < content.len() {
253 &content[yaml_start..]
254 } else {
255 ""
256 };
257 (Some(tag_line.trim().to_string()), yaml.to_string())
258 } else {
259 (Some(content[1..].trim().to_string()), String::new())
261 }
262 } else {
263 (None, content.to_string())
264 };
265
266 if let Some(ref tag_name) = tag {
268 if !is_valid_tag_name(tag_name) {
269 return Err(format!(
270 "Invalid tag name '{}': must match pattern [a-z_][a-z0-9_]*",
271 tag_name
272 )
273 .into());
274 }
275 if tag_name == BODY_FIELD {
276 return Err(format!(
277 "Cannot use reserved field name '{}' as tag directive",
278 BODY_FIELD
279 )
280 .into());
281 }
282 }
283
284 blocks.push(MetadataBlock {
285 start: abs_pos,
286 end: abs_closing_pos + closing_len, yaml_content,
288 tag,
289 });
290
291 pos = abs_closing_pos + closing_len;
292 } else if abs_pos == 0 {
293 return Err("Frontmatter started but not closed with ---".into());
295 } else {
296 pos = abs_pos + 3;
298 }
299 } else {
300 break;
301 }
302 }
303
304 Ok(blocks)
305}
306
307pub fn decompose(
309 markdown: &str,
310) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
311 if markdown.len() > crate::error::MAX_INPUT_SIZE {
313 return Err(format!(
314 "Input too large: {} bytes (max: {} bytes)",
315 markdown.len(),
316 crate::error::MAX_INPUT_SIZE
317 )
318 .into());
319 }
320
321 let mut fields = HashMap::new();
322
323 let blocks = find_metadata_blocks(markdown)?;
325
326 if blocks.is_empty() {
327 fields.insert(
329 BODY_FIELD.to_string(),
330 serde_yaml::Value::String(markdown.to_string()),
331 );
332 return Ok(ParsedDocument::new(fields));
333 }
334
335 let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
337 let mut has_global_frontmatter = false;
338 let mut global_frontmatter_index: Option<usize> = None;
339
340 for (idx, block) in blocks.iter().enumerate() {
342 if block.tag.is_none() {
343 if has_global_frontmatter {
344 return Err(
345 "Multiple global frontmatter blocks found: only one untagged block allowed"
346 .into(),
347 );
348 }
349 has_global_frontmatter = true;
350 global_frontmatter_index = Some(idx);
351 }
352 }
353
354 if let Some(idx) = global_frontmatter_index {
356 let block = &blocks[idx];
357
358 let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
360 HashMap::new()
361 } else {
362 serde_yaml::from_str(&block.yaml_content)
363 .map_err(|e| format!("Invalid YAML frontmatter: {}", e))?
364 };
365
366 for other_block in &blocks {
368 if let Some(ref tag) = other_block.tag {
369 if yaml_fields.contains_key(tag) {
370 return Err(format!(
371 "Name collision: global field '{}' conflicts with tagged attribute",
372 tag
373 )
374 .into());
375 }
376 }
377 }
378
379 fields.extend(yaml_fields);
380 }
381
382 for (idx, block) in blocks.iter().enumerate() {
384 if let Some(ref tag_name) = block.tag {
385 if fields.contains_key(tag_name) {
387 return Err(format!(
388 "Name collision: tagged attribute '{}' conflicts with global field",
389 tag_name
390 )
391 .into());
392 }
393
394 let mut item_fields: HashMap<String, serde_yaml::Value> =
396 if block.yaml_content.is_empty() {
397 HashMap::new()
398 } else {
399 serde_yaml::from_str(&block.yaml_content).map_err(|e| {
400 format!("Invalid YAML in tagged block '{}': {}", tag_name, e)
401 })?
402 };
403
404 let body_start = block.end;
406 let body_end = if idx + 1 < blocks.len() {
407 blocks[idx + 1].start
408 } else {
409 markdown.len()
410 };
411 let body = &markdown[body_start..body_end];
412
413 item_fields.insert(
415 BODY_FIELD.to_string(),
416 serde_yaml::Value::String(body.to_string()),
417 );
418
419 let item_value = serde_yaml::to_value(item_fields)?;
421
422 tagged_attributes
424 .entry(tag_name.clone())
425 .or_insert_with(Vec::new)
426 .push(item_value);
427 }
428 }
429
430 let (body_start, body_end) = if let Some(idx) = global_frontmatter_index {
432 let start = blocks[idx].end;
434
435 let end = blocks
437 .iter()
438 .skip(idx + 1)
439 .find(|b| b.tag.is_some())
440 .map(|b| b.start)
441 .unwrap_or(markdown.len());
442
443 (start, end)
444 } else {
445 let end = blocks
447 .iter()
448 .find(|b| b.tag.is_some())
449 .map(|b| b.start)
450 .unwrap_or(0);
451
452 (0, end)
453 };
454
455 let global_body = &markdown[body_start..body_end];
456
457 fields.insert(
458 BODY_FIELD.to_string(),
459 serde_yaml::Value::String(global_body.to_string()),
460 );
461
462 for (tag_name, items) in tagged_attributes {
464 fields.insert(tag_name, serde_yaml::Value::Sequence(items));
465 }
466
467 Ok(ParsedDocument::new(fields))
468}
469
// Unit tests for `decompose`: plain frontmatter, tagged blocks, and the error
// paths for malformed input.
#[cfg(test)]
mod tests {
    use super::*;

    // Input without any `---` delimiter is returned unchanged as the body.
    #[test]
    fn test_no_frontmatter() {
        let markdown = "# Hello World\n\nThis is a test.";
        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some(markdown));
        assert_eq!(doc.fields().len(), 1);
    }

    // A single untagged block contributes its keys as top-level fields.
    #[test]
    fn test_with_frontmatter() {
        let markdown = r#"---
title: Test Document
author: Test Author
---

# Hello World

This is the body."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Test Document"
        );
        assert_eq!(
            doc.get_field("author").unwrap().as_str().unwrap(),
            "Test Author"
        );
        // title, author and body.
        assert_eq!(doc.fields().len(), 3);
    }

    // Nested YAML (sequences and mappings) survives decomposition.
    #[test]
    fn test_complex_yaml_frontmatter() {
        let markdown = r#"---
title: Complex Document
tags:
  - test
  - yaml
metadata:
  version: 1.0
  nested:
    field: value
---

Content here."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\nContent here."));
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Complex Document"
        );

        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
        assert_eq!(tags.len(), 2);
        assert_eq!(tags[0].as_str().unwrap(), "test");
        assert_eq!(tags[1].as_str().unwrap(), "yaml");
    }

    // Malformed YAML in the global block is reported as a frontmatter error.
    #[test]
    fn test_invalid_yaml() {
        let markdown = r#"---
title: [invalid yaml
author: missing close bracket
---

Content here."#;

        let result = decompose(markdown);
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("Invalid YAML frontmatter"));
    }

    // A block opened at the very start of the input must be closed.
    #[test]
    fn test_unclosed_frontmatter() {
        let markdown = r#"---
title: Test
author: Test Author

Content without closing ---"#;

        let result = decompose(markdown);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("not closed"));
    }

    // A `!tag` block becomes a one-element sequence whose item carries its
    // own body.
    #[test]
    fn test_basic_tagged_block() {
        let markdown = r#"---
title: Main Document
---

Main body content.

---
!items
name: Item 1
---

Body of item 1."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Main Document"
        );

        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
        assert_eq!(items.len(), 1);

        let item = items[0].as_mapping().unwrap();
        assert_eq!(
            item.get(&serde_yaml::Value::String("name".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Item 1"
        );
        assert_eq!(
            item.get(&serde_yaml::Value::String("body".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "\nBody of item 1."
        );
    }

    // Blocks sharing a tag are collected into a single sequence.
    #[test]
    fn test_multiple_tagged_blocks() {
        let markdown = r#"---
!items
name: Item 1
tags: [a, b]
---

First item body.

---
!items
name: Item 2
tags: [c, d]
---

Second item body."#;

        let doc = decompose(markdown).unwrap();

        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
        assert_eq!(items.len(), 2);

        let item1 = items[0].as_mapping().unwrap();
        assert_eq!(
            item1
                .get(&serde_yaml::Value::String("name".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Item 1"
        );

        let item2 = items[1].as_mapping().unwrap();
        assert_eq!(
            item2
                .get(&serde_yaml::Value::String("name".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Item 2"
        );
    }

    // Global frontmatter and tagged blocks can coexist; the global body stops
    // at the first tagged block.
    #[test]
    fn test_mixed_global_and_tagged() {
        let markdown = r#"---
title: Global
author: John Doe
---

Global body.

---
!sections
title: Section 1
---

Section 1 content.

---
!sections
title: Section 2
---

Section 2 content."#;

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Global");
        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));

        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
        assert_eq!(sections.len(), 2);
    }

    // A tag directive with no YAML still yields an item holding just a body.
    #[test]
    fn test_empty_tagged_metadata() {
        let markdown = r#"---
!items
---

Body without metadata."#;

        let doc = decompose(markdown).unwrap();

        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
        assert_eq!(items.len(), 1);

        let item = items[0].as_mapping().unwrap();
        assert_eq!(
            item.get(&serde_yaml::Value::String("body".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "\nBody without metadata."
        );
    }

    // A closing `---` at end of input gives the item an empty body.
    #[test]
    fn test_tagged_block_without_body() {
        let markdown = r#"---
!items
name: Item
---"#;

        let doc = decompose(markdown).unwrap();

        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
        assert_eq!(items.len(), 1);

        let item = items[0].as_mapping().unwrap();
        assert_eq!(
            item.get(&serde_yaml::Value::String("body".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            ""
        );
    }

    // A tag may not reuse the name of a global frontmatter key.
    #[test]
    fn test_name_collision_global_and_tagged() {
        let markdown = r#"---
items: "global value"
---

Body

---
!items
name: Item
---

Item body"#;

        let result = decompose(markdown);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("collision"));
    }

    // `body` is reserved and cannot be used as a tag directive.
    #[test]
    fn test_reserved_field_name() {
        let markdown = r#"---
!body
content: Test
---"#;

        let result = decompose(markdown);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("reserved"));
    }

    // Tag names must match `[a-z_][a-z0-9_]*`.
    #[test]
    fn test_invalid_tag_syntax() {
        let markdown = r#"---
!Invalid-Name
title: Test
---"#;

        let result = decompose(markdown);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid tag name"));
    }

    // Only one untagged block is allowed per document.
    #[test]
    fn test_multiple_global_frontmatter_blocks() {
        let markdown = r#"---
title: First
---

Body

---
author: Second
---

More body"#;

        let result = decompose(markdown);
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("Multiple global frontmatter"));
    }

    // Different tags produce separate sequences.
    #[test]
    fn test_adjacent_blocks_different_tags() {
        let markdown = r#"---
!items
name: Item 1
---

Item 1 body

---
!sections
title: Section 1
---

Section 1 body"#;

        let doc = decompose(markdown).unwrap();

        assert!(doc.get_field("items").is_some());
        assert!(doc.get_field("sections").is_some());

        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
        assert_eq!(items.len(), 1);

        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
        assert_eq!(sections.len(), 1);
    }

    // Items within a tag keep their document order.
    #[test]
    fn test_order_preservation() {
        let markdown = r#"---
!items
id: 1
---

First

---
!items
id: 2
---

Second

---
!items
id: 3
---

Third"#;

        let doc = decompose(markdown).unwrap();

        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
        assert_eq!(items.len(), 3);

        for (i, item) in items.iter().enumerate() {
            let mapping = item.as_mapping().unwrap();
            let id = mapping
                .get(&serde_yaml::Value::String("id".to_string()))
                .unwrap()
                .as_i64()
                .unwrap();
            assert_eq!(id, (i + 1) as i64);
        }
    }

    // End-to-end check: global frontmatter plus two tagged collections.
    #[test]
    fn test_product_catalog_integration() {
        let markdown = r#"---
title: Product Catalog
author: John Doe
date: 2024-01-01
---

This is the main catalog description.

---
!products
name: Widget A
price: 19.99
sku: WID-001
---

The **Widget A** is our most popular product.

---
!products
name: Gadget B
price: 29.99
sku: GAD-002
---

The **Gadget B** is perfect for professionals.

---
!reviews
product: Widget A
rating: 5
---

"Excellent product! Highly recommended."

---
!reviews
product: Gadget B
rating: 4
---

"Very good, but a bit pricey.""#;

        let doc = decompose(markdown).unwrap();

        // Global fields.
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Product Catalog"
        );
        assert_eq!(
            doc.get_field("author").unwrap().as_str().unwrap(),
            "John Doe"
        );
        assert_eq!(
            doc.get_field("date").unwrap().as_str().unwrap(),
            "2024-01-01"
        );

        // Global body.
        assert!(doc.body().unwrap().contains("main catalog description"));

        // Products collection.
        let products = doc.get_field("products").unwrap().as_sequence().unwrap();
        assert_eq!(products.len(), 2);

        let product1 = products[0].as_mapping().unwrap();
        assert_eq!(
            product1
                .get(&serde_yaml::Value::String("name".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Widget A"
        );
        assert_eq!(
            product1
                .get(&serde_yaml::Value::String("price".to_string()))
                .unwrap()
                .as_f64()
                .unwrap(),
            19.99
        );

        // Reviews collection.
        let reviews = doc.get_field("reviews").unwrap().as_sequence().unwrap();
        assert_eq!(reviews.len(), 2);

        let review1 = reviews[0].as_mapping().unwrap();
        assert_eq!(
            review1
                .get(&serde_yaml::Value::String("product".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Widget A"
        );
        assert_eq!(
            review1
                .get(&serde_yaml::Value::String("rating".to_string()))
                .unwrap()
                .as_i64()
                .unwrap(),
            5
        );

        // title, author, date, body, products and reviews.
        assert_eq!(doc.fields().len(), 6);
    }
}
// Tests that run `decompose` against the bundled demo fixture and exercise the
// input/YAML size limits.
#[cfg(test)]
mod demo_file_test {
    use super::*;

    // Parses the fixture file embedded at compile time and spot-checks its
    // global fields and tagged collections.
    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        // Global fields.
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Extended Metadata Demo"
        );
        assert_eq!(
            doc.get_field("author").unwrap().as_str().unwrap(),
            "Quillmark Team"
        );
        assert_eq!(doc.get_field("version").unwrap().as_f64().unwrap(), 1.0);

        // Global body.
        assert!(doc
            .body()
            .unwrap()
            .contains("extended YAML metadata standard"));

        // Tagged collections.
        let features = doc.get_field("features").unwrap().as_sequence().unwrap();
        assert_eq!(features.len(), 3);

        let use_cases = doc.get_field("use_cases").unwrap().as_sequence().unwrap();
        assert_eq!(use_cases.len(), 2);

        // First item of the `features` collection.
        let feature1 = features[0].as_mapping().unwrap();
        assert_eq!(
            feature1
                .get(&serde_yaml::Value::String("name".to_string()))
                .unwrap()
                .as_str()
                .unwrap(),
            "Tag Directives"
        );
    }

    // Input one byte over `MAX_INPUT_SIZE` is rejected.
    #[test]
    fn test_input_size_limit() {
        let size = crate::error::MAX_INPUT_SIZE + 1;
        let large_markdown = "a".repeat(size);

        let result = decompose(&large_markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("Input too large"));
    }

    // A YAML block whose content exceeds `MAX_YAML_SIZE` is rejected.
    #[test]
    fn test_yaml_size_limit() {
        let mut markdown = String::from("---\n");

        let size = crate::error::MAX_YAML_SIZE + 1;
        markdown.push_str("data: \"");
        markdown.push_str(&"x".repeat(size));
        markdown.push_str("\"\n---\n\nBody");

        let result = decompose(&markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("YAML block too large"));
    }

    // A modest body parses without tripping the input limit.
    #[test]
    fn test_input_within_size_limit() {
        let size = 1000;
        let markdown = format!("---\ntitle: Test\n---\n\n{}", "a".repeat(size));

        let result = decompose(&markdown);
        assert!(result.is_ok());
    }

    // A small YAML block parses without tripping the YAML limit.
    #[test]
    fn test_yaml_within_size_limit() {
        let markdown = "---\ntitle: Test\nauthor: John Doe\n---\n\nBody content";

        let result = decompose(&markdown);
        assert!(result.is_ok());
    }
}