quillmark_core/
parse.rs

1use std::collections::HashMap;
2
/// Key under which the markdown body text is stored in a field map.
///
/// The same key is used for the document-level body and for the body of each
/// tagged block, which is why it is rejected as a tag directive name.
pub const BODY_FIELD: &str = "body";
5
6/// A parsed markdown document with frontmatter
7#[derive(Debug, Clone)]
8pub struct ParsedDocument {
9    fields: HashMap<String, serde_yaml::Value>,
10}
11
12impl ParsedDocument {
13    /// Create a new ParsedDocument with the given fields
14    pub fn new(fields: HashMap<String, serde_yaml::Value>) -> Self {
15        Self { fields }
16    }
17
18    /// Get the document body
19    pub fn body(&self) -> Option<&str> {
20        self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
21    }
22
23    /// Get a specific field
24    pub fn get_field(&self, name: &str) -> Option<&serde_yaml::Value> {
25        self.fields.get(name)
26    }
27
28    /// Get all fields (including body)
29    pub fn fields(&self) -> &HashMap<String, serde_yaml::Value> {
30        &self.fields
31    }
32}
33
/// One `---`-delimited metadata block located in the markdown source.
#[derive(Debug)]
struct MetadataBlock {
    /// Byte offset of the opening `---`.
    start: usize,
    /// Byte offset just past the closing delimiter (including its trailing
    /// line ending when there is one).
    end: usize,
    /// Text between the delimiters, with any tag directive line removed.
    yaml_content: String,
    /// Name from a leading `!tag` directive, if the block has one.
    tag: Option<String>,
}
41
/// Check that `name` matches `[a-z_][a-z0-9_]*`.
///
/// The first character must be an ASCII lowercase letter or `_`; every
/// following character may additionally be an ASCII digit. The empty string
/// is rejected.
fn is_valid_tag_name(name: &str) -> bool {
    let mut remaining = name.chars();
    match remaining.next() {
        Some(head) if head == '_' || head.is_ascii_lowercase() => {
            remaining.all(|c| c == '_' || c.is_ascii_lowercase() || c.is_ascii_digit())
        }
        _ => false,
    }
}
63
64/// Find all metadata blocks in the document
65fn find_metadata_blocks(markdown: &str) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
66    let mut blocks = Vec::new();
67    let mut pos = 0;
68    
69    while pos < markdown.len() {
70        // Look for opening "---\n" or "---\r\n"
71        let search_str = &markdown[pos..];
72        let delimiter_result = if let Some(p) = search_str.find("---\n") {
73            Some((p, 4, "\n"))
74        } else if let Some(p) = search_str.find("---\r\n") {
75            Some((p, 5, "\r\n"))
76        } else {
77            None
78        };
79        
80        if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
81            let abs_pos = pos + delimiter_pos;
82            let content_start = abs_pos + delimiter_len; // After "---\n" or "---\r\n"
83            
84            // Check if opening --- is followed by a blank line (horizontal rule, not metadata)
85            let followed_by_blank = if content_start < markdown.len() {
86                markdown[content_start..].starts_with('\n') || markdown[content_start..].starts_with("\r\n")
87            } else {
88                false
89            };
90            
91            if followed_by_blank {
92                // This is a horizontal rule in the body, skip it
93                pos = abs_pos + 3; // Skip past "---"
94                continue;
95            }
96            
97            // Found potential metadata block opening
98            // Look for closing "\n---\n" or "\r\n---\r\n" etc., OR "\n---" / "\r\n---" at end of document
99            let rest = &markdown[content_start..];
100            
101            // First try to find delimiters with trailing newlines
102            let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
103            let closing_with_newline = closing_patterns
104                .iter()
105                .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
106                .min_by_key(|(p, _)| *p);
107            
108            // Also check for closing at end of document (no trailing newline)
109            let closing_at_eof = ["\n---", "\r\n---"]
110                .iter()
111                .filter_map(|delim| {
112                    rest.find(delim).and_then(|p| {
113                        if p + delim.len() == rest.len() {
114                            Some((p, delim.len()))
115                        } else {
116                            None
117                        }
118                    })
119                })
120                .min_by_key(|(p, _)| *p);
121            
122            let closing_result = match (closing_with_newline, closing_at_eof) {
123                (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
124                (Some(_), Some(_)) => closing_with_newline,
125                (Some(_), None) => closing_with_newline,
126                (None, Some(_)) => closing_at_eof,
127                (None, None) => None,
128            };
129            
130            if let Some((closing_pos, closing_len)) = closing_result {
131                let abs_closing_pos = content_start + closing_pos;
132                let content = &markdown[content_start..abs_closing_pos];
133                
134                // Check if the block is contiguous (no blank lines in the YAML content)
135                if content.contains("\n\n") || content.contains("\r\n\r\n") {
136                    // Not a contiguous block
137                    if abs_pos == 0 {
138                        // Started at beginning but has blank lines - this is an error
139                        return Err("Frontmatter started but not closed with ---".into());
140                    }
141                    // Otherwise treat as horizontal rule in body
142                    pos = abs_pos + 3;
143                    continue;
144                }
145                
146                // Extract tag directive if present
147                let (tag, yaml_content) = if content.starts_with('!') {
148                    if let Some(newline_pos) = content.find(|c| c == '\n' || c == '\r') {
149                        let tag_line = &content[1..newline_pos];
150                        // Skip newline(s) after tag
151                        let yaml_start = if content[newline_pos..].starts_with("\r\n") {
152                            newline_pos + 2
153                        } else {
154                            newline_pos + 1
155                        };
156                        let yaml = if yaml_start < content.len() {
157                            &content[yaml_start..]
158                        } else {
159                            ""
160                        };
161                        (Some(tag_line.trim().to_string()), yaml.to_string())
162                    } else {
163                        // Tag directive with no YAML content (entire content is just tag)
164                        (Some(content[1..].trim().to_string()), String::new())
165                    }
166                } else {
167                    (None, content.to_string())
168                };
169                
170                // Validate tag name if present
171                if let Some(ref tag_name) = tag {
172                    if !is_valid_tag_name(tag_name) {
173                        return Err(format!("Invalid tag name '{}': must match pattern [a-z_][a-z0-9_]*", tag_name).into());
174                    }
175                    if tag_name == BODY_FIELD {
176                        return Err(format!("Cannot use reserved field name '{}' as tag directive", BODY_FIELD).into());
177                    }
178                }
179                
180                blocks.push(MetadataBlock {
181                    start: abs_pos,
182                    end: abs_closing_pos + closing_len, // After closing delimiter
183                    yaml_content,
184                    tag,
185                });
186                
187                pos = abs_closing_pos + closing_len;
188            } else if abs_pos == 0 {
189                // Frontmatter started but not closed
190                return Err("Frontmatter started but not closed with ---".into());
191            } else {
192                // Not a valid metadata block, skip this position
193                pos = abs_pos + 3;
194            }
195        } else {
196            break;
197        }
198    }
199    
200    Ok(blocks)
201}
202
203/// Decompose markdown into frontmatter fields and body
204pub fn decompose(
205    markdown: &str,
206) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
207    let mut fields = HashMap::new();
208    
209    // Find all metadata blocks
210    let blocks = find_metadata_blocks(markdown)?;
211    
212    if blocks.is_empty() {
213        // No metadata blocks, entire content is body
214        fields.insert(
215            BODY_FIELD.to_string(),
216            serde_yaml::Value::String(markdown.to_string()),
217        );
218        return Ok(ParsedDocument::new(fields));
219    }
220    
221    // Track which attributes are used for tagged blocks
222    let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
223    let mut has_global_frontmatter = false;
224    let mut global_frontmatter_index: Option<usize> = None;
225    
226    // First pass: identify global frontmatter and validate
227    for (idx, block) in blocks.iter().enumerate() {
228        if block.tag.is_none() {
229            if has_global_frontmatter {
230                return Err("Multiple global frontmatter blocks found: only one untagged block allowed".into());
231            }
232            has_global_frontmatter = true;
233            global_frontmatter_index = Some(idx);
234        }
235    }
236    
237    // Parse global frontmatter if present
238    if let Some(idx) = global_frontmatter_index {
239        let block = &blocks[idx];
240        
241        // Parse YAML frontmatter
242        let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
243            HashMap::new()
244        } else {
245            serde_yaml::from_str(&block.yaml_content)
246                .map_err(|e| format!("Invalid YAML frontmatter: {}", e))?
247        };
248        
249        // Check that all tagged blocks don't conflict with global fields
250        for other_block in &blocks {
251            if let Some(ref tag) = other_block.tag {
252                if yaml_fields.contains_key(tag) {
253                    return Err(format!("Name collision: global field '{}' conflicts with tagged attribute", tag).into());
254                }
255            }
256        }
257        
258        fields.extend(yaml_fields);
259    }
260    
261    // Parse tagged blocks
262    for (idx, block) in blocks.iter().enumerate() {
263        if let Some(ref tag_name) = block.tag {
264            // Check if this conflicts with global fields
265            if fields.contains_key(tag_name) {
266                return Err(format!("Name collision: tagged attribute '{}' conflicts with global field", tag_name).into());
267            }
268            
269            // Parse YAML metadata
270            let mut item_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
271                HashMap::new()
272            } else {
273                serde_yaml::from_str(&block.yaml_content)
274                    .map_err(|e| format!("Invalid YAML in tagged block '{}': {}", tag_name, e))?
275            };
276            
277            // Extract body for this tagged block
278            let body_start = block.end;
279            let body_end = if idx + 1 < blocks.len() {
280                blocks[idx + 1].start
281            } else {
282                markdown.len()
283            };
284            let body = &markdown[body_start..body_end];
285            
286            // Add body to item fields
287            item_fields.insert(
288                BODY_FIELD.to_string(),
289                serde_yaml::Value::String(body.to_string()),
290            );
291            
292            // Convert HashMap to serde_yaml::Value::Mapping
293            let item_value = serde_yaml::to_value(item_fields)?;
294            
295            // Add to collection
296            tagged_attributes.entry(tag_name.clone())
297                .or_insert_with(Vec::new)
298                .push(item_value);
299        }
300    }
301    
302    // Extract global body
303    let (body_start, body_end) = if let Some(idx) = global_frontmatter_index {
304        // Global body starts after frontmatter
305        let start = blocks[idx].end;
306        
307        // Global body ends at the first tagged block after the frontmatter, or EOF
308        let end = blocks.iter()
309            .skip(idx + 1)
310            .find(|b| b.tag.is_some())
311            .map(|b| b.start)
312            .unwrap_or(markdown.len());
313        
314        (start, end)
315    } else {
316        // No global frontmatter - body is everything before the first tagged block
317        let end = blocks.iter()
318            .find(|b| b.tag.is_some())
319            .map(|b| b.start)
320            .unwrap_or(0);
321        
322        (0, end)
323    };
324    
325    let global_body = &markdown[body_start..body_end];
326    
327    fields.insert(
328        BODY_FIELD.to_string(),
329        serde_yaml::Value::String(global_body.to_string()),
330    );
331    
332    // Add all tagged collections to fields
333    for (tag_name, items) in tagged_attributes {
334        fields.insert(tag_name, serde_yaml::Value::Sequence(items));
335    }
336    
337    Ok(ParsedDocument::new(fields))
338}
339
#[cfg(test)]
mod tests {
    use super::*;

    /// Fetch a top-level field and unwrap it as a string.
    fn text<'a>(doc: &'a ParsedDocument, name: &str) -> &'a str {
        doc.get_field(name).unwrap().as_str().unwrap()
    }

    /// Fetch a top-level field and unwrap it as a sequence.
    fn collection<'a>(doc: &'a ParsedDocument, name: &str) -> &'a serde_yaml::Sequence {
        doc.get_field(name).unwrap().as_sequence().unwrap()
    }

    /// Unwrap `item` as a mapping and fetch the value stored under `key`.
    fn entry<'a>(item: &'a serde_yaml::Value, key: &str) -> &'a serde_yaml::Value {
        item.as_mapping()
            .unwrap()
            .get(&serde_yaml::Value::String(key.to_string()))
            .unwrap()
    }

    /// Assert that decomposition fails and return the error text.
    fn failure(markdown: &str) -> String {
        let result = decompose(markdown);
        assert!(result.is_err());
        result.unwrap_err().to_string()
    }

    #[test]
    fn test_no_frontmatter() {
        let markdown = "# Hello World\n\nThis is a test.";
        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some(markdown));
        assert_eq!(doc.fields().len(), 1);
    }

    #[test]
    fn test_with_frontmatter() {
        let markdown = concat!(
            "---\n",
            "title: Test Document\n",
            "author: Test Author\n",
            "---\n",
            "\n",
            "# Hello World\n",
            "\n",
            "This is the body.",
        );

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
        assert_eq!(text(&doc, "title"), "Test Document");
        assert_eq!(text(&doc, "author"), "Test Author");
        assert_eq!(doc.fields().len(), 3); // title, author, body
    }

    #[test]
    fn test_complex_yaml_frontmatter() {
        let markdown = concat!(
            "---\n",
            "title: Complex Document\n",
            "tags:\n",
            "  - test\n",
            "  - yaml\n",
            "metadata:\n",
            "  version: 1.0\n",
            "  nested:\n",
            "    field: value\n",
            "---\n",
            "\n",
            "Content here.",
        );

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\nContent here."));
        assert_eq!(text(&doc, "title"), "Complex Document");

        let tags = collection(&doc, "tags");
        assert_eq!(tags.len(), 2);
        assert_eq!(tags[0].as_str().unwrap(), "test");
        assert_eq!(tags[1].as_str().unwrap(), "yaml");
    }

    #[test]
    fn test_invalid_yaml() {
        let markdown = concat!(
            "---\n",
            "title: [invalid yaml\n",
            "author: missing close bracket\n",
            "---\n",
            "\n",
            "Content here.",
        );

        assert!(failure(markdown).contains("Invalid YAML frontmatter"));
    }

    #[test]
    fn test_unclosed_frontmatter() {
        let markdown = concat!(
            "---\n",
            "title: Test\n",
            "author: Test Author\n",
            "\n",
            "Content without closing ---",
        );

        assert!(failure(markdown).contains("not closed"));
    }

    // Extended metadata tests

    #[test]
    fn test_basic_tagged_block() {
        let markdown = concat!(
            "---\n",
            "title: Main Document\n",
            "---\n",
            "\n",
            "Main body content.\n",
            "\n",
            "---\n",
            "!items\n",
            "name: Item 1\n",
            "---\n",
            "\n",
            "Body of item 1.",
        );

        let doc = decompose(markdown).unwrap();

        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
        assert_eq!(text(&doc, "title"), "Main Document");

        let items = collection(&doc, "items");
        assert_eq!(items.len(), 1);

        assert_eq!(entry(&items[0], "name").as_str().unwrap(), "Item 1");
        assert_eq!(
            entry(&items[0], "body").as_str().unwrap(),
            "\nBody of item 1."
        );
    }

    #[test]
    fn test_multiple_tagged_blocks() {
        let markdown = concat!(
            "---\n",
            "!items\n",
            "name: Item 1\n",
            "tags: [a, b]\n",
            "---\n",
            "\n",
            "First item body.\n",
            "\n",
            "---\n",
            "!items\n",
            "name: Item 2\n",
            "tags: [c, d]\n",
            "---\n",
            "\n",
            "Second item body.",
        );

        let doc = decompose(markdown).unwrap();

        let items = collection(&doc, "items");
        assert_eq!(items.len(), 2);

        assert_eq!(entry(&items[0], "name").as_str().unwrap(), "Item 1");
        assert_eq!(entry(&items[1], "name").as_str().unwrap(), "Item 2");
    }

    #[test]
    fn test_mixed_global_and_tagged() {
        let markdown = concat!(
            "---\n",
            "title: Global\n",
            "author: John Doe\n",
            "---\n",
            "\n",
            "Global body.\n",
            "\n",
            "---\n",
            "!sections\n",
            "title: Section 1\n",
            "---\n",
            "\n",
            "Section 1 content.\n",
            "\n",
            "---\n",
            "!sections\n",
            "title: Section 2\n",
            "---\n",
            "\n",
            "Section 2 content.",
        );

        let doc = decompose(markdown).unwrap();

        assert_eq!(text(&doc, "title"), "Global");
        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));

        let sections = collection(&doc, "sections");
        assert_eq!(sections.len(), 2);
    }

    #[test]
    fn test_empty_tagged_metadata() {
        let markdown = concat!(
            "---\n",
            "!items\n",
            "---\n",
            "\n",
            "Body without metadata.",
        );

        let doc = decompose(markdown).unwrap();

        let items = collection(&doc, "items");
        assert_eq!(items.len(), 1);

        assert_eq!(
            entry(&items[0], "body").as_str().unwrap(),
            "\nBody without metadata."
        );
    }

    #[test]
    fn test_tagged_block_without_body() {
        let markdown = concat!("---\n", "!items\n", "name: Item\n", "---");

        let doc = decompose(markdown).unwrap();

        let items = collection(&doc, "items");
        assert_eq!(items.len(), 1);

        assert_eq!(entry(&items[0], "body").as_str().unwrap(), "");
    }

    #[test]
    fn test_name_collision_global_and_tagged() {
        let markdown = concat!(
            "---\n",
            "items: \"global value\"\n",
            "---\n",
            "\n",
            "Body\n",
            "\n",
            "---\n",
            "!items\n",
            "name: Item\n",
            "---\n",
            "\n",
            "Item body",
        );

        assert!(failure(markdown).contains("collision"));
    }

    #[test]
    fn test_reserved_field_name() {
        let markdown = concat!("---\n", "!body\n", "content: Test\n", "---");

        assert!(failure(markdown).contains("reserved"));
    }

    #[test]
    fn test_invalid_tag_syntax() {
        let markdown = concat!("---\n", "!Invalid-Name\n", "title: Test\n", "---");

        assert!(failure(markdown).contains("Invalid tag name"));
    }

    #[test]
    fn test_multiple_global_frontmatter_blocks() {
        let markdown = concat!(
            "---\n",
            "title: First\n",
            "---\n",
            "\n",
            "Body\n",
            "\n",
            "---\n",
            "author: Second\n",
            "---\n",
            "\n",
            "More body",
        );

        assert!(failure(markdown).contains("Multiple global frontmatter"));
    }

    #[test]
    fn test_adjacent_blocks_different_tags() {
        let markdown = concat!(
            "---\n",
            "!items\n",
            "name: Item 1\n",
            "---\n",
            "\n",
            "Item 1 body\n",
            "\n",
            "---\n",
            "!sections\n",
            "title: Section 1\n",
            "---\n",
            "\n",
            "Section 1 body",
        );

        let doc = decompose(markdown).unwrap();

        assert!(doc.get_field("items").is_some());
        assert!(doc.get_field("sections").is_some());

        assert_eq!(collection(&doc, "items").len(), 1);
        assert_eq!(collection(&doc, "sections").len(), 1);
    }

    #[test]
    fn test_order_preservation() {
        let markdown = concat!(
            "---\n",
            "!items\n",
            "id: 1\n",
            "---\n",
            "\n",
            "First\n",
            "\n",
            "---\n",
            "!items\n",
            "id: 2\n",
            "---\n",
            "\n",
            "Second\n",
            "\n",
            "---\n",
            "!items\n",
            "id: 3\n",
            "---\n",
            "\n",
            "Third",
        );

        let doc = decompose(markdown).unwrap();

        let items = collection(&doc, "items");
        assert_eq!(items.len(), 3);

        for (i, item) in items.iter().enumerate() {
            let id = entry(item, "id").as_i64().unwrap();
            assert_eq!(id, (i + 1) as i64);
        }
    }

    #[test]
    fn test_product_catalog_integration() {
        let markdown = concat!(
            "---\n",
            "title: Product Catalog\n",
            "author: John Doe\n",
            "date: 2024-01-01\n",
            "---\n",
            "\n",
            "This is the main catalog description.\n",
            "\n",
            "---\n",
            "!products\n",
            "name: Widget A\n",
            "price: 19.99\n",
            "sku: WID-001\n",
            "---\n",
            "\n",
            "The **Widget A** is our most popular product.\n",
            "\n",
            "---\n",
            "!products\n",
            "name: Gadget B\n",
            "price: 29.99\n",
            "sku: GAD-002\n",
            "---\n",
            "\n",
            "The **Gadget B** is perfect for professionals.\n",
            "\n",
            "---\n",
            "!reviews\n",
            "product: Widget A\n",
            "rating: 5\n",
            "---\n",
            "\n",
            "\"Excellent product! Highly recommended.\"\n",
            "\n",
            "---\n",
            "!reviews\n",
            "product: Gadget B\n",
            "rating: 4\n",
            "---\n",
            "\n",
            "\"Very good, but a bit pricey.\"",
        );

        let doc = decompose(markdown).unwrap();

        // Verify global fields
        assert_eq!(text(&doc, "title"), "Product Catalog");
        assert_eq!(text(&doc, "author"), "John Doe");
        assert_eq!(text(&doc, "date"), "2024-01-01");

        // Verify global body
        assert!(doc.body().unwrap().contains("main catalog description"));

        // Verify products collection
        let products = collection(&doc, "products");
        assert_eq!(products.len(), 2);

        assert_eq!(entry(&products[0], "name").as_str().unwrap(), "Widget A");
        assert_eq!(entry(&products[0], "price").as_f64().unwrap(), 19.99);

        // Verify reviews collection
        let reviews = collection(&doc, "reviews");
        assert_eq!(reviews.len(), 2);

        assert_eq!(entry(&reviews[0], "product").as_str().unwrap(), "Widget A");
        assert_eq!(entry(&reviews[0], "rating").as_i64().unwrap(), 5);

        // Total fields: title, author, date, body, products, reviews = 6
        assert_eq!(doc.fields().len(), 6);
    }
}
#[cfg(test)]
mod demo_file_test {
    use super::*;

    /// Runs `decompose` over the demo document shipped with the fixtures
    /// crate and checks its global fields, body text and tagged collections.
    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        let field = |name: &str| doc.get_field(name).unwrap();

        // Global fields
        assert_eq!(field("title").as_str().unwrap(), "Extended Metadata Demo");
        assert_eq!(field("author").as_str().unwrap(), "Quillmark Team");
        // `version` is checked as a float, not a string.
        assert_eq!(field("version").as_f64().unwrap(), 1.0);

        // Global body
        let body = doc.body().unwrap();
        assert!(body.contains("extended YAML metadata standard"));

        // Tagged collections
        let features = field("features").as_sequence().unwrap();
        assert_eq!(features.len(), 3);

        let use_cases = field("use_cases").as_sequence().unwrap();
        assert_eq!(use_cases.len(), 2);

        // First feature entry
        let first_feature = features[0].as_mapping().unwrap();
        let name_key = serde_yaml::Value::String("name".to_string());
        assert_eq!(
            first_feature.get(&name_key).unwrap().as_str().unwrap(),
            "Tag Directives"
        );
    }
}