quillmark_core/
parse.rs

1//! # Parsing Module
2//!
3//! Parsing functionality for markdown documents with YAML frontmatter.
4//!
5//! ## Overview
6//!
//! The `parse` module provides the [`ParsedDocument::from_markdown`] function for parsing markdown documents
//! with YAML frontmatter into a [`ParsedDocument`].
8//!
9//! ## Key Types
10//!
11//! - [`ParsedDocument`]: Container for parsed frontmatter fields and body content
12//! - [`BODY_FIELD`]: Constant for the field name storing document body
13//!
14//! ## Examples
15//!
16//! ### Basic Parsing
17//!
18//! ```
19//! use quillmark_core::ParsedDocument;
20//!
21//! let markdown = r#"---
22//! title: My Document
23//! author: John Doe
24//! ---
25//!
26//! # Introduction
27//!
28//! Document content here.
29//! "#;
30//!
31//! let doc = ParsedDocument::from_markdown(markdown).unwrap();
32//! let title = doc.get_field("title")
33//!     .and_then(|v| v.as_str())
34//!     .unwrap_or("Untitled");
35//! ```
36//!
37//! ## Error Handling
38//!
39//! The [`ParsedDocument::from_markdown`] function returns errors for:
40//! - Malformed YAML syntax
41//! - Unclosed frontmatter blocks
42//! - Multiple global frontmatter blocks
43//! - Both QUILL and SCOPE specified in the same block
44//! - Reserved field name usage
45//! - Name collisions
46//!
47//! See [PARSE.md](https://github.com/nibsbin/quillmark/blob/main/designs/PARSE.md) for comprehensive documentation of the Extended YAML Metadata Standard.
48
49use std::collections::HashMap;
50
51use crate::value::QuillValue;
52
/// The field name used to store the document body.
///
/// The parser stores the global body under this key, adds a value under the
/// same key to every item produced by a `SCOPE` block, and rejects it as a
/// `SCOPE` value.
pub const BODY_FIELD: &str = "body";
55
56/// Helper function to convert serde_yaml::Error with location extraction
57fn yaml_error_to_string(e: serde_yaml::Error, context: &str) -> String {
58    let mut msg = format!("{}: {}", context, e);
59
60    if let Some(loc) = e.location() {
61        msg.push_str(&format!(" at line {}, column {}", loc.line(), loc.column()));
62    }
63
64    msg
65}
66
/// Reserved tag name for quill specification.
///
/// NOTE(review): this constant is not referenced in the part of the file
/// reviewed here; the parser matches the upper-case `QUILL` YAML key instead.
/// Confirm where this lower-case name is consumed.
pub const QUILL_TAG: &str = "quill";
69
/// A parsed markdown document with frontmatter
#[derive(Debug, Clone)]
pub struct ParsedDocument {
    // Frontmatter fields keyed by name; the parser also stores the body here
    // under `BODY_FIELD`.
    fields: HashMap<String, QuillValue>,
    // Quill name taken from a `QUILL` key, if any block declared one.
    quill_tag: Option<String>,
}
76
77impl ParsedDocument {
78    /// Create a new ParsedDocument with the given fields
79    pub fn new(fields: HashMap<String, QuillValue>) -> Self {
80        Self {
81            fields,
82            quill_tag: None,
83        }
84    }
85
86    /// Create a ParsedDocument from fields and optional quill tag
87    pub fn with_quill_tag(fields: HashMap<String, QuillValue>, quill_tag: Option<String>) -> Self {
88        Self { fields, quill_tag }
89    }
90
91    /// Create a ParsedDocument from markdown string
92    pub fn from_markdown(markdown: &str) -> Result<Self, crate::error::ParseError> {
93        decompose(markdown).map_err(|e| crate::error::ParseError::from(e))
94    }
95
96    /// Get the quill tag if specified (from QUILL key)
97    pub fn quill_tag(&self) -> Option<&str> {
98        self.quill_tag.as_deref()
99    }
100
101    /// Get the document body
102    pub fn body(&self) -> Option<&str> {
103        self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
104    }
105
106    /// Get a specific field
107    pub fn get_field(&self, name: &str) -> Option<&QuillValue> {
108        self.fields.get(name)
109    }
110
111    /// Get all fields (including body)
112    pub fn fields(&self) -> &HashMap<String, QuillValue> {
113        &self.fields
114    }
115}
116
/// One `---`-delimited metadata block located in the markdown source.
#[derive(Debug)]
struct MetadataBlock {
    start: usize, // Byte offset of the opening "---"
    end: usize,   // Byte offset just past the closing delimiter
    // YAML text of the block; when a QUILL or SCOPE key was present this is
    // the re-serialized mapping with that key removed.
    yaml_content: String,
    tag: Option<String>,        // Field name from SCOPE key
    quill_name: Option<String>, // Quill name from QUILL key
}
125
/// Check that `name` matches the pattern `[a-z_][a-z0-9_]*`.
///
/// The empty string is rejected. Only ASCII lowercase letters, ASCII digits
/// and `_` are accepted, and the first character must not be a digit.
fn is_valid_tag_name(name: &str) -> bool {
    let mut rest = name.chars();

    // `next()` yields `None` for an empty name, which fails the match.
    let starts_ok = matches!(rest.next(), Some(c) if c.is_ascii_lowercase() || c == '_');

    starts_ok && rest.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
}
147
148/// Find all metadata blocks in the document
149fn find_metadata_blocks(
150    markdown: &str,
151) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
152    let mut blocks = Vec::new();
153    let mut pos = 0;
154
155    while pos < markdown.len() {
156        // Look for opening "---\n" or "---\r\n"
157        let search_str = &markdown[pos..];
158        let delimiter_result = if let Some(p) = search_str.find("---\n") {
159            Some((p, 4, "\n"))
160        } else if let Some(p) = search_str.find("---\r\n") {
161            Some((p, 5, "\r\n"))
162        } else {
163            None
164        };
165
166        if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
167            let abs_pos = pos + delimiter_pos;
168            let content_start = abs_pos + delimiter_len; // After "---\n" or "---\r\n"
169
170            // Check if this --- is a horizontal rule (blank lines above AND below)
171            let preceded_by_blank = if abs_pos > 0 {
172                // Check if there's a blank line before the ---
173                let before = &markdown[..abs_pos];
174                before.ends_with("\n\n") || before.ends_with("\r\n\r\n")
175            } else {
176                false
177            };
178
179            let followed_by_blank = if content_start < markdown.len() {
180                markdown[content_start..].starts_with('\n')
181                    || markdown[content_start..].starts_with("\r\n")
182            } else {
183                false
184            };
185
186            // Horizontal rule: blank lines both above and below
187            if preceded_by_blank && followed_by_blank {
188                // This is a horizontal rule in the body, skip it
189                pos = abs_pos + 3; // Skip past "---"
190                continue;
191            }
192
193            // Check if followed by non-blank line (or if we're at document start)
194            // This starts a metadata block
195            if followed_by_blank {
196                // --- followed by blank line but NOT preceded by blank line
197                // This is NOT a metadata block opening, skip it
198                pos = abs_pos + 3;
199                continue;
200            }
201
202            // Found potential metadata block opening (followed by non-blank line)
203            // Look for closing "\n---\n" or "\r\n---\r\n" etc., OR "\n---" / "\r\n---" at end of document
204            let rest = &markdown[content_start..];
205
206            // First try to find delimiters with trailing newlines
207            let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
208            let closing_with_newline = closing_patterns
209                .iter()
210                .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
211                .min_by_key(|(p, _)| *p);
212
213            // Also check for closing at end of document (no trailing newline)
214            let closing_at_eof = ["\n---", "\r\n---"]
215                .iter()
216                .filter_map(|delim| {
217                    rest.find(delim).and_then(|p| {
218                        if p + delim.len() == rest.len() {
219                            Some((p, delim.len()))
220                        } else {
221                            None
222                        }
223                    })
224                })
225                .min_by_key(|(p, _)| *p);
226
227            let closing_result = match (closing_with_newline, closing_at_eof) {
228                (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
229                (Some(_), Some(_)) => closing_with_newline,
230                (Some(_), None) => closing_with_newline,
231                (None, Some(_)) => closing_at_eof,
232                (None, None) => None,
233            };
234
235            if let Some((closing_pos, closing_len)) = closing_result {
236                let abs_closing_pos = content_start + closing_pos;
237                let content = &markdown[content_start..abs_closing_pos];
238
239                // Check YAML size limit
240                if content.len() > crate::error::MAX_YAML_SIZE {
241                    return Err(format!(
242                        "YAML block too large: {} bytes (max: {} bytes)",
243                        content.len(),
244                        crate::error::MAX_YAML_SIZE
245                    )
246                    .into());
247                }
248
249                // Parse YAML content to check for reserved keys (QUILL, SCOPE)
250                // First, try to parse as YAML
251                let (tag, quill_name, yaml_content) = if !content.is_empty() {
252                    // Try to parse the YAML to check for reserved keys
253                    match serde_yaml::from_str::<serde_yaml::Value>(content) {
254                        Ok(yaml_value) => {
255                            if let Some(mapping) = yaml_value.as_mapping() {
256                                let quill_key = serde_yaml::Value::String("QUILL".to_string());
257                                let scope_key = serde_yaml::Value::String("SCOPE".to_string());
258
259                                let has_quill = mapping.contains_key(&quill_key);
260                                let has_scope = mapping.contains_key(&scope_key);
261
262                                if has_quill && has_scope {
263                                    return Err(
264                                        "Cannot specify both QUILL and SCOPE in the same block"
265                                            .into(),
266                                    );
267                                }
268
269                                if has_quill {
270                                    // Extract quill name
271                                    let quill_value = mapping.get(&quill_key).unwrap();
272                                    let quill_name_str = quill_value
273                                        .as_str()
274                                        .ok_or_else(|| "QUILL value must be a string")?;
275
276                                    if !is_valid_tag_name(quill_name_str) {
277                                        return Err(format!(
278                                            "Invalid quill name '{}': must match pattern [a-z_][a-z0-9_]*",
279                                            quill_name_str
280                                        )
281                                        .into());
282                                    }
283
284                                    // Remove QUILL from the YAML content for processing
285                                    let mut new_mapping = mapping.clone();
286                                    new_mapping.remove(&quill_key);
287                                    let new_yaml = serde_yaml::to_string(&new_mapping)
288                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
289
290                                    (None, Some(quill_name_str.to_string()), new_yaml)
291                                } else if has_scope {
292                                    // Extract scope field name
293                                    let scope_value = mapping.get(&scope_key).unwrap();
294                                    let field_name = scope_value
295                                        .as_str()
296                                        .ok_or_else(|| "SCOPE value must be a string")?;
297
298                                    if !is_valid_tag_name(field_name) {
299                                        return Err(format!(
300                                            "Invalid field name '{}': must match pattern [a-z_][a-z0-9_]*",
301                                            field_name
302                                        )
303                                        .into());
304                                    }
305
306                                    if field_name == BODY_FIELD {
307                                        return Err(format!(
308                                            "Cannot use reserved field name '{}' as SCOPE value",
309                                            BODY_FIELD
310                                        )
311                                        .into());
312                                    }
313
314                                    // Remove SCOPE from the YAML content for processing
315                                    let mut new_mapping = mapping.clone();
316                                    new_mapping.remove(&scope_key);
317                                    let new_yaml = serde_yaml::to_string(&new_mapping)
318                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
319
320                                    (Some(field_name.to_string()), None, new_yaml)
321                                } else {
322                                    // No reserved keys, treat as normal YAML
323                                    (None, None, content.to_string())
324                                }
325                            } else {
326                                // Not a mapping, treat as normal YAML
327                                (None, None, content.to_string())
328                            }
329                        }
330                        Err(_) => {
331                            // If YAML parsing fails here, we'll catch it later
332                            (None, None, content.to_string())
333                        }
334                    }
335                } else {
336                    (None, None, content.to_string())
337                };
338
339                blocks.push(MetadataBlock {
340                    start: abs_pos,
341                    end: abs_closing_pos + closing_len, // After closing delimiter
342                    yaml_content,
343                    tag,
344                    quill_name,
345                });
346
347                pos = abs_closing_pos + closing_len;
348            } else if abs_pos == 0 {
349                // Frontmatter started but not closed
350                return Err("Frontmatter started but not closed with ---".into());
351            } else {
352                // Not a valid metadata block, skip this position
353                pos = abs_pos + 3;
354            }
355        } else {
356            break;
357        }
358    }
359
360    Ok(blocks)
361}
362
363/// Decompose markdown into frontmatter fields and body
364fn decompose(markdown: &str) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
365    // Check input size limit
366    if markdown.len() > crate::error::MAX_INPUT_SIZE {
367        return Err(format!(
368            "Input too large: {} bytes (max: {} bytes)",
369            markdown.len(),
370            crate::error::MAX_INPUT_SIZE
371        )
372        .into());
373    }
374
375    let mut fields = HashMap::new();
376
377    // Find all metadata blocks
378    let blocks = find_metadata_blocks(markdown)?;
379
380    if blocks.is_empty() {
381        // No metadata blocks, entire content is body
382        fields.insert(
383            BODY_FIELD.to_string(),
384            QuillValue::from_json(serde_json::Value::String(markdown.to_string())),
385        );
386        return Ok(ParsedDocument::new(fields));
387    }
388
389    // Track which attributes are used for tagged blocks
390    let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
391    let mut has_global_frontmatter = false;
392    let mut global_frontmatter_index: Option<usize> = None;
393    let mut quill_name: Option<String> = None;
394
395    // First pass: identify global frontmatter, quill directive, and validate
396    for (idx, block) in blocks.iter().enumerate() {
397        // Check for quill directive
398        if let Some(ref name) = block.quill_name {
399            if quill_name.is_some() {
400                return Err("Multiple quill directives found: only one allowed".into());
401            }
402            quill_name = Some(name.clone());
403        }
404
405        // Check for global frontmatter (no tag and no quill directive)
406        if block.tag.is_none() && block.quill_name.is_none() {
407            if has_global_frontmatter {
408                return Err(
409                    "Multiple global frontmatter blocks found: only one untagged block allowed"
410                        .into(),
411                );
412            }
413            has_global_frontmatter = true;
414            global_frontmatter_index = Some(idx);
415        }
416    }
417
418    // Parse global frontmatter if present
419    if let Some(idx) = global_frontmatter_index {
420        let block = &blocks[idx];
421
422        // Parse YAML frontmatter
423        let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
424            HashMap::new()
425        } else {
426            serde_yaml::from_str(&block.yaml_content)
427                .map_err(|e| yaml_error_to_string(e, "Invalid YAML frontmatter"))?
428        };
429
430        // Check that all tagged blocks don't conflict with global fields
431        // Exception: if the global field is an array, allow it (we'll merge later)
432        for other_block in &blocks {
433            if let Some(ref tag) = other_block.tag {
434                if let Some(global_value) = yaml_fields.get(tag) {
435                    // Check if the global value is an array
436                    if global_value.as_sequence().is_none() {
437                        return Err(format!(
438                            "Name collision: global field '{}' conflicts with tagged attribute",
439                            tag
440                        )
441                        .into());
442                    }
443                }
444            }
445        }
446
447        // Convert YAML values to QuillValue at boundary
448        for (key, value) in yaml_fields {
449            fields.insert(key, QuillValue::from_yaml(value)?);
450        }
451    }
452
453    // Process blocks with quill directives
454    for block in &blocks {
455        if block.quill_name.is_some() {
456            // Quill directive blocks can have YAML content (becomes part of frontmatter)
457            if !block.yaml_content.is_empty() {
458                let yaml_fields: HashMap<String, serde_yaml::Value> =
459                    serde_yaml::from_str(&block.yaml_content)
460                        .map_err(|e| yaml_error_to_string(e, "Invalid YAML in quill block"))?;
461
462                // Check for conflicts with existing fields
463                for key in yaml_fields.keys() {
464                    if fields.contains_key(key) {
465                        return Err(format!(
466                            "Name collision: quill block field '{}' conflicts with existing field",
467                            key
468                        )
469                        .into());
470                    }
471                }
472
473                // Convert YAML values to QuillValue at boundary
474                for (key, value) in yaml_fields {
475                    fields.insert(key, QuillValue::from_yaml(value)?);
476                }
477            }
478        }
479    }
480
481    // Parse tagged blocks
482    for (idx, block) in blocks.iter().enumerate() {
483        if let Some(ref tag_name) = block.tag {
484            // Check if this conflicts with global fields
485            // Exception: if the global field is an array, allow it (we'll merge later)
486            if let Some(existing_value) = fields.get(tag_name) {
487                if existing_value.as_array().is_none() {
488                    return Err(format!(
489                        "Name collision: tagged attribute '{}' conflicts with global field",
490                        tag_name
491                    )
492                    .into());
493                }
494            }
495
496            // Parse YAML metadata
497            let mut item_fields: HashMap<String, serde_yaml::Value> = if block
498                .yaml_content
499                .is_empty()
500            {
501                HashMap::new()
502            } else {
503                serde_yaml::from_str(&block.yaml_content).map_err(|e| {
504                    yaml_error_to_string(e, &format!("Invalid YAML in tagged block '{}'", tag_name))
505                })?
506            };
507
508            // Extract body for this tagged block
509            let body_start = block.end;
510            let body_end = if idx + 1 < blocks.len() {
511                blocks[idx + 1].start
512            } else {
513                markdown.len()
514            };
515            let body = &markdown[body_start..body_end];
516
517            // Add body to item fields
518            item_fields.insert(
519                BODY_FIELD.to_string(),
520                serde_yaml::Value::String(body.to_string()),
521            );
522
523            // Convert HashMap to serde_yaml::Value::Mapping
524            let item_value = serde_yaml::to_value(item_fields)?;
525
526            // Add to collection
527            tagged_attributes
528                .entry(tag_name.clone())
529                .or_insert_with(Vec::new)
530                .push(item_value);
531        }
532    }
533
534    // Extract global body
535    // Body starts after global frontmatter or quill block (whichever comes first)
536    // Body ends at the first scope block or EOF
537    let first_non_scope_block_idx = blocks
538        .iter()
539        .position(|b| b.tag.is_none() && b.quill_name.is_none())
540        .or_else(|| blocks.iter().position(|b| b.quill_name.is_some()));
541
542    let (body_start, body_end) = if let Some(idx) = first_non_scope_block_idx {
543        // Body starts after the first non-scope block (global frontmatter or quill)
544        let start = blocks[idx].end;
545
546        // Body ends at the first scope block after this, or EOF
547        let end = blocks
548            .iter()
549            .skip(idx + 1)
550            .find(|b| b.tag.is_some())
551            .map(|b| b.start)
552            .unwrap_or(markdown.len());
553
554        (start, end)
555    } else {
556        // No global frontmatter or quill block - body is everything before the first scope block
557        let end = blocks
558            .iter()
559            .find(|b| b.tag.is_some())
560            .map(|b| b.start)
561            .unwrap_or(0);
562
563        (0, end)
564    };
565
566    let global_body = &markdown[body_start..body_end];
567
568    fields.insert(
569        BODY_FIELD.to_string(),
570        QuillValue::from_json(serde_json::Value::String(global_body.to_string())),
571    );
572
573    // Add all tagged collections to fields (convert to QuillValue)
574    // If a field already exists and is an array, merge the new items into it
575    for (tag_name, items) in tagged_attributes {
576        if let Some(existing_value) = fields.get(&tag_name) {
577            // The existing value must be an array (checked earlier)
578            if let Some(existing_array) = existing_value.as_array() {
579                // Convert new items from YAML to JSON
580                let new_items_json: Vec<serde_json::Value> = items
581                    .into_iter()
582                    .map(|yaml_val| {
583                        serde_json::to_value(&yaml_val)
584                            .map_err(|e| format!("Failed to convert YAML to JSON: {}", e))
585                    })
586                    .collect::<Result<Vec<_>, _>>()?;
587
588                // Combine existing and new items
589                let mut merged_array = existing_array.clone();
590                merged_array.extend(new_items_json);
591
592                // Create QuillValue from merged JSON array
593                let quill_value = QuillValue::from_json(serde_json::Value::Array(merged_array));
594                fields.insert(tag_name, quill_value);
595            } else {
596                // This should not happen due to earlier validation, but handle it gracefully
597                return Err(format!(
598                    "Internal error: field '{}' exists but is not an array",
599                    tag_name
600                )
601                .into());
602            }
603        } else {
604            // No existing field, just create a new sequence
605            let quill_value = QuillValue::from_yaml(serde_yaml::Value::Sequence(items))?;
606            fields.insert(tag_name, quill_value);
607        }
608    }
609
610    let mut parsed = ParsedDocument::new(fields);
611
612    // Set quill tag if present
613    if let Some(name) = quill_name {
614        parsed.quill_tag = Some(name);
615    }
616
617    Ok(parsed)
618}
619
620#[cfg(test)]
621mod tests {
622    use super::*;
623
624    #[test]
625    fn test_no_frontmatter() {
626        let markdown = "# Hello World\n\nThis is a test.";
627        let doc = decompose(markdown).unwrap();
628
629        assert_eq!(doc.body(), Some(markdown));
630        assert_eq!(doc.fields().len(), 1);
631    }
632
633    #[test]
634    fn test_with_frontmatter() {
635        let markdown = r#"---
636title: Test Document
637author: Test Author
638---
639
640# Hello World
641
642This is the body."#;
643
644        let doc = decompose(markdown).unwrap();
645
646        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
647        assert_eq!(
648            doc.get_field("title").unwrap().as_str().unwrap(),
649            "Test Document"
650        );
651        assert_eq!(
652            doc.get_field("author").unwrap().as_str().unwrap(),
653            "Test Author"
654        );
655        assert_eq!(doc.fields().len(), 3); // title, author, body
656    }
657
658    #[test]
659    fn test_complex_yaml_frontmatter() {
660        let markdown = r#"---
661title: Complex Document
662tags:
663  - test
664  - yaml
665metadata:
666  version: 1.0
667  nested:
668    field: value
669---
670
671Content here."#;
672
673        let doc = decompose(markdown).unwrap();
674
675        assert_eq!(doc.body(), Some("\nContent here."));
676        assert_eq!(
677            doc.get_field("title").unwrap().as_str().unwrap(),
678            "Complex Document"
679        );
680
681        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
682        assert_eq!(tags.len(), 2);
683        assert_eq!(tags[0].as_str().unwrap(), "test");
684        assert_eq!(tags[1].as_str().unwrap(), "yaml");
685    }
686
687    #[test]
688    fn test_invalid_yaml() {
689        let markdown = r#"---
690title: [invalid yaml
691author: missing close bracket
692---
693
694Content here."#;
695
696        let result = decompose(markdown);
697        assert!(result.is_err());
698        assert!(result
699            .unwrap_err()
700            .to_string()
701            .contains("Invalid YAML frontmatter"));
702    }
703
704    #[test]
705    fn test_unclosed_frontmatter() {
706        let markdown = r#"---
707title: Test
708author: Test Author
709
710Content without closing ---"#;
711
712        let result = decompose(markdown);
713        assert!(result.is_err());
714        assert!(result.unwrap_err().to_string().contains("not closed"));
715    }
716
717    // Extended metadata tests
718
719    #[test]
720    fn test_basic_tagged_block() {
721        let markdown = r#"---
722title: Main Document
723---
724
725Main body content.
726
727---
728SCOPE: items
729name: Item 1
730---
731
732Body of item 1."#;
733
734        let doc = decompose(markdown).unwrap();
735
736        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
737        assert_eq!(
738            doc.get_field("title").unwrap().as_str().unwrap(),
739            "Main Document"
740        );
741
742        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
743        assert_eq!(items.len(), 1);
744
745        let item = items[0].as_object().unwrap();
746        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
747        assert_eq!(
748            item.get("body").unwrap().as_str().unwrap(),
749            "\nBody of item 1."
750        );
751    }
752
753    #[test]
754    fn test_multiple_tagged_blocks() {
755        let markdown = r#"---
756SCOPE: items
757name: Item 1
758tags: [a, b]
759---
760
761First item body.
762
763---
764SCOPE: items
765name: Item 2
766tags: [c, d]
767---
768
769Second item body."#;
770
771        let doc = decompose(markdown).unwrap();
772
773        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
774        assert_eq!(items.len(), 2);
775
776        let item1 = items[0].as_object().unwrap();
777        assert_eq!(item1.get("name").unwrap().as_str().unwrap(), "Item 1");
778
779        let item2 = items[1].as_object().unwrap();
780        assert_eq!(item2.get("name").unwrap().as_str().unwrap(), "Item 2");
781    }
782
783    #[test]
784    fn test_mixed_global_and_tagged() {
785        let markdown = r#"---
786title: Global
787author: John Doe
788---
789
790Global body.
791
792---
793SCOPE: sections
794title: Section 1
795---
796
797Section 1 content.
798
799---
800SCOPE: sections
801title: Section 2
802---
803
804Section 2 content."#;
805
806        let doc = decompose(markdown).unwrap();
807
808        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Global");
809        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));
810
811        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
812        assert_eq!(sections.len(), 2);
813    }
814
815    #[test]
816    fn test_empty_tagged_metadata() {
817        let markdown = r#"---
818SCOPE: items
819---
820
821Body without metadata."#;
822
823        let doc = decompose(markdown).unwrap();
824
825        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
826        assert_eq!(items.len(), 1);
827
828        let item = items[0].as_object().unwrap();
829        assert_eq!(
830            item.get("body").unwrap().as_str().unwrap(),
831            "\nBody without metadata."
832        );
833    }
834
835    #[test]
836    fn test_tagged_block_without_body() {
837        let markdown = r#"---
838SCOPE: items
839name: Item
840---"#;
841
842        let doc = decompose(markdown).unwrap();
843
844        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
845        assert_eq!(items.len(), 1);
846
847        let item = items[0].as_object().unwrap();
848        assert_eq!(item.get("body").unwrap().as_str().unwrap(), "");
849    }
850
851    #[test]
852    fn test_name_collision_global_and_tagged() {
853        let markdown = r#"---
854items: "global value"
855---
856
857Body
858
859---
860SCOPE: items
861name: Item
862---
863
864Item body"#;
865
866        let result = decompose(markdown);
867        assert!(result.is_err());
868        assert!(result.unwrap_err().to_string().contains("collision"));
869    }
870
871    #[test]
872    fn test_global_array_merged_with_scope() {
873        // When global frontmatter has an array field with the same name as a SCOPE,
874        // the SCOPE items should be added to the array
875        let markdown = r#"---
876items:
877  - name: Global Item 1
878    value: 100
879  - name: Global Item 2
880    value: 200
881---
882
883Global body
884
885---
886SCOPE: items
887name: Scope Item 1
888value: 300
889---
890
891Scope item 1 body
892
893---
894SCOPE: items
895name: Scope Item 2
896value: 400
897---
898
899Scope item 2 body"#;
900
901        let doc = decompose(markdown).unwrap();
902
903        // Verify the items array has all 4 items (2 from global + 2 from SCOPE)
904        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
905        assert_eq!(items.len(), 4);
906
907        // Verify first two items (from global array)
908        let item1 = items[0].as_object().unwrap();
909        assert_eq!(
910            item1.get("name").unwrap().as_str().unwrap(),
911            "Global Item 1"
912        );
913        assert_eq!(item1.get("value").unwrap().as_i64().unwrap(), 100);
914
915        let item2 = items[1].as_object().unwrap();
916        assert_eq!(
917            item2.get("name").unwrap().as_str().unwrap(),
918            "Global Item 2"
919        );
920        assert_eq!(item2.get("value").unwrap().as_i64().unwrap(), 200);
921
922        // Verify last two items (from SCOPE blocks)
923        let item3 = items[2].as_object().unwrap();
924        assert_eq!(item3.get("name").unwrap().as_str().unwrap(), "Scope Item 1");
925        assert_eq!(item3.get("value").unwrap().as_i64().unwrap(), 300);
926        assert_eq!(
927            item3.get("body").unwrap().as_str().unwrap(),
928            "\nScope item 1 body\n\n"
929        );
930
931        let item4 = items[3].as_object().unwrap();
932        assert_eq!(item4.get("name").unwrap().as_str().unwrap(), "Scope Item 2");
933        assert_eq!(item4.get("value").unwrap().as_i64().unwrap(), 400);
934        assert_eq!(
935            item4.get("body").unwrap().as_str().unwrap(),
936            "\nScope item 2 body"
937        );
938    }
939
940    #[test]
941    fn test_empty_global_array_with_scope() {
942        // Edge case: global frontmatter has an empty array
943        let markdown = r#"---
944items: []
945---
946
947Global body
948
949---
950SCOPE: items
951name: Item 1
952---
953
954Item 1 body"#;
955
956        let doc = decompose(markdown).unwrap();
957
958        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
959        assert_eq!(items.len(), 1);
960
961        let item = items[0].as_object().unwrap();
962        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
963    }
964
965    #[test]
966    fn test_reserved_field_name() {
967        let markdown = r#"---
968SCOPE: body
969content: Test
970---"#;
971
972        let result = decompose(markdown);
973        assert!(result.is_err());
974        assert!(result.unwrap_err().to_string().contains("reserved"));
975    }
976
977    #[test]
978    fn test_invalid_tag_syntax() {
979        let markdown = r#"---
980SCOPE: Invalid-Name
981title: Test
982---"#;
983
984        let result = decompose(markdown);
985        assert!(result.is_err());
986        assert!(result
987            .unwrap_err()
988            .to_string()
989            .contains("Invalid field name"));
990    }
991
992    #[test]
993    fn test_multiple_global_frontmatter_blocks() {
994        let markdown = r#"---
995title: First
996---
997
998Body
999
1000---
1001author: Second
1002---
1003
1004More body"#;
1005
1006        let result = decompose(markdown);
1007        assert!(result.is_err());
1008        assert!(result
1009            .unwrap_err()
1010            .to_string()
1011            .contains("Multiple global frontmatter"));
1012    }
1013
1014    #[test]
1015    fn test_adjacent_blocks_different_tags() {
1016        let markdown = r#"---
1017SCOPE: items
1018name: Item 1
1019---
1020
1021Item 1 body
1022
1023---
1024SCOPE: sections
1025title: Section 1
1026---
1027
1028Section 1 body"#;
1029
1030        let doc = decompose(markdown).unwrap();
1031
1032        assert!(doc.get_field("items").is_some());
1033        assert!(doc.get_field("sections").is_some());
1034
1035        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1036        assert_eq!(items.len(), 1);
1037
1038        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1039        assert_eq!(sections.len(), 1);
1040    }
1041
1042    #[test]
1043    fn test_order_preservation() {
1044        let markdown = r#"---
1045SCOPE: items
1046id: 1
1047---
1048
1049First
1050
1051---
1052SCOPE: items
1053id: 2
1054---
1055
1056Second
1057
1058---
1059SCOPE: items
1060id: 3
1061---
1062
1063Third"#;
1064
1065        let doc = decompose(markdown).unwrap();
1066
1067        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1068        assert_eq!(items.len(), 3);
1069
1070        for (i, item) in items.iter().enumerate() {
1071            let mapping = item.as_object().unwrap();
1072            let id = mapping.get("id").unwrap().as_i64().unwrap();
1073            assert_eq!(id, (i + 1) as i64);
1074        }
1075    }
1076
1077    #[test]
1078    fn test_product_catalog_integration() {
1079        let markdown = r#"---
1080title: Product Catalog
1081author: John Doe
1082date: 2024-01-01
1083---
1084
1085This is the main catalog description.
1086
1087---
1088SCOPE: products
1089name: Widget A
1090price: 19.99
1091sku: WID-001
1092---
1093
1094The **Widget A** is our most popular product.
1095
1096---
1097SCOPE: products
1098name: Gadget B
1099price: 29.99
1100sku: GAD-002
1101---
1102
1103The **Gadget B** is perfect for professionals.
1104
1105---
1106SCOPE: reviews
1107product: Widget A
1108rating: 5
1109---
1110
1111"Excellent product! Highly recommended."
1112
1113---
1114SCOPE: reviews
1115product: Gadget B
1116rating: 4
1117---
1118
1119"Very good, but a bit pricey.""#;
1120
1121        let doc = decompose(markdown).unwrap();
1122
1123        // Verify global fields
1124        assert_eq!(
1125            doc.get_field("title").unwrap().as_str().unwrap(),
1126            "Product Catalog"
1127        );
1128        assert_eq!(
1129            doc.get_field("author").unwrap().as_str().unwrap(),
1130            "John Doe"
1131        );
1132        assert_eq!(
1133            doc.get_field("date").unwrap().as_str().unwrap(),
1134            "2024-01-01"
1135        );
1136
1137        // Verify global body
1138        assert!(doc.body().unwrap().contains("main catalog description"));
1139
1140        // Verify products collection
1141        let products = doc.get_field("products").unwrap().as_sequence().unwrap();
1142        assert_eq!(products.len(), 2);
1143
1144        let product1 = products[0].as_object().unwrap();
1145        assert_eq!(product1.get("name").unwrap().as_str().unwrap(), "Widget A");
1146        assert_eq!(product1.get("price").unwrap().as_f64().unwrap(), 19.99);
1147
1148        // Verify reviews collection
1149        let reviews = doc.get_field("reviews").unwrap().as_sequence().unwrap();
1150        assert_eq!(reviews.len(), 2);
1151
1152        let review1 = reviews[0].as_object().unwrap();
1153        assert_eq!(
1154            review1.get("product").unwrap().as_str().unwrap(),
1155            "Widget A"
1156        );
1157        assert_eq!(review1.get("rating").unwrap().as_i64().unwrap(), 5);
1158
1159        // Total fields: title, author, date, body, products, reviews = 6
1160        assert_eq!(doc.fields().len(), 6);
1161    }
1162
1163    #[test]
1164    fn taro_quill_directive() {
1165        let markdown = r#"---
1166QUILL: usaf_memo
1167memo_for: [ORG/SYMBOL]
1168memo_from: [ORG/SYMBOL]
1169---
1170
1171This is the memo body."#;
1172
1173        let doc = decompose(markdown).unwrap();
1174
1175        // Verify quill tag is set
1176        assert_eq!(doc.quill_tag(), Some("usaf_memo"));
1177
1178        // Verify fields from quill block become frontmatter
1179        assert_eq!(
1180            doc.get_field("memo_for").unwrap().as_sequence().unwrap()[0]
1181                .as_str()
1182                .unwrap(),
1183            "ORG/SYMBOL"
1184        );
1185
1186        // Verify body
1187        assert_eq!(doc.body(), Some("\nThis is the memo body."));
1188    }
1189
1190    #[test]
1191    fn test_quill_with_scope_blocks() {
1192        let markdown = r#"---
1193QUILL: document
1194title: Test Document
1195---
1196
1197Main body.
1198
1199---
1200SCOPE: sections
1201name: Section 1
1202---
1203
1204Section 1 body."#;
1205
1206        let doc = decompose(markdown).unwrap();
1207
1208        // Verify quill tag
1209        assert_eq!(doc.quill_tag(), Some("document"));
1210
1211        // Verify global field from quill block
1212        assert_eq!(
1213            doc.get_field("title").unwrap().as_str().unwrap(),
1214            "Test Document"
1215        );
1216
1217        // Verify scope blocks work
1218        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1219        assert_eq!(sections.len(), 1);
1220
1221        // Verify body
1222        assert_eq!(doc.body(), Some("\nMain body.\n\n"));
1223    }
1224
1225    #[test]
1226    fn test_multiple_quill_directives_error() {
1227        let markdown = r#"---
1228QUILL: first
1229---
1230
1231---
1232QUILL: second
1233---"#;
1234
1235        let result = decompose(markdown);
1236        assert!(result.is_err());
1237        assert!(result
1238            .unwrap_err()
1239            .to_string()
1240            .contains("Multiple quill directives"));
1241    }
1242
1243    #[test]
1244    fn test_invalid_quill_name() {
1245        let markdown = r#"---
1246QUILL: Invalid-Name
1247---"#;
1248
1249        let result = decompose(markdown);
1250        assert!(result.is_err());
1251        assert!(result
1252            .unwrap_err()
1253            .to_string()
1254            .contains("Invalid quill name"));
1255    }
1256
1257    #[test]
1258    fn test_quill_wrong_value_type() {
1259        let markdown = r#"---
1260QUILL: 123
1261---"#;
1262
1263        let result = decompose(markdown);
1264        assert!(result.is_err());
1265        assert!(result
1266            .unwrap_err()
1267            .to_string()
1268            .contains("QUILL value must be a string"));
1269    }
1270
1271    #[test]
1272    fn test_scope_wrong_value_type() {
1273        let markdown = r#"---
1274SCOPE: 123
1275---"#;
1276
1277        let result = decompose(markdown);
1278        assert!(result.is_err());
1279        assert!(result
1280            .unwrap_err()
1281            .to_string()
1282            .contains("SCOPE value must be a string"));
1283    }
1284
1285    #[test]
1286    fn test_both_quill_and_scope_error() {
1287        let markdown = r#"---
1288QUILL: test
1289SCOPE: items
1290---"#;
1291
1292        let result = decompose(markdown);
1293        assert!(result.is_err());
1294        assert!(result
1295            .unwrap_err()
1296            .to_string()
1297            .contains("Cannot specify both QUILL and SCOPE"));
1298    }
1299
1300    #[test]
1301    fn test_blank_lines_in_frontmatter() {
1302        // New parsing standard: blank lines are allowed within YAML blocks
1303        let markdown = r#"---
1304title: Test Document
1305author: Test Author
1306
1307description: This has a blank line above it
1308tags:
1309  - one
1310  - two
1311---
1312
1313# Hello World
1314
1315This is the body."#;
1316
1317        let doc = decompose(markdown).unwrap();
1318
1319        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
1320        assert_eq!(
1321            doc.get_field("title").unwrap().as_str().unwrap(),
1322            "Test Document"
1323        );
1324        assert_eq!(
1325            doc.get_field("author").unwrap().as_str().unwrap(),
1326            "Test Author"
1327        );
1328        assert_eq!(
1329            doc.get_field("description").unwrap().as_str().unwrap(),
1330            "This has a blank line above it"
1331        );
1332
1333        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
1334        assert_eq!(tags.len(), 2);
1335    }
1336
1337    #[test]
1338    fn test_blank_lines_in_scope_blocks() {
1339        // Blank lines should be allowed in SCOPE blocks too
1340        let markdown = r#"---
1341SCOPE: items
1342name: Item 1
1343
1344price: 19.99
1345
1346tags:
1347  - electronics
1348  - gadgets
1349---
1350
1351Body of item 1."#;
1352
1353        let doc = decompose(markdown).unwrap();
1354
1355        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1356        assert_eq!(items.len(), 1);
1357
1358        let item = items[0].as_object().unwrap();
1359        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
1360        assert_eq!(item.get("price").unwrap().as_f64().unwrap(), 19.99);
1361
1362        let tags = item.get("tags").unwrap().as_array().unwrap();
1363        assert_eq!(tags.len(), 2);
1364    }
1365
1366    #[test]
1367    fn test_horizontal_rule_with_blank_lines_above_and_below() {
1368        // Horizontal rule: blank lines both above AND below the ---
1369        let markdown = r#"---
1370title: Test
1371---
1372
1373First paragraph.
1374
1375---
1376
1377Second paragraph."#;
1378
1379        let doc = decompose(markdown).unwrap();
1380
1381        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Test");
1382
1383        // The body should contain the horizontal rule (---) as part of the content
1384        let body = doc.body().unwrap();
1385        assert!(body.contains("First paragraph."));
1386        assert!(body.contains("---"));
1387        assert!(body.contains("Second paragraph."));
1388    }
1389
1390    #[test]
1391    fn test_horizontal_rule_not_preceded_by_blank() {
1392        // --- not preceded by blank line but followed by blank line is NOT a horizontal rule
1393        // It's also NOT a valid metadata block opening (since it's followed by blank)
1394        let markdown = r#"---
1395title: Test
1396---
1397
1398First paragraph.
1399---
1400
1401Second paragraph."#;
1402
1403        let doc = decompose(markdown).unwrap();
1404
1405        let body = doc.body().unwrap();
1406        // The second --- should be in the body as text (not a horizontal rule since no blank above)
1407        assert!(body.contains("---"));
1408    }
1409
1410    #[test]
1411    fn test_multiple_blank_lines_in_yaml() {
1412        // Multiple blank lines should also be allowed
1413        let markdown = r#"---
1414title: Test
1415
1416
1417author: John Doe
1418
1419
1420version: 1.0
1421---
1422
1423Body content."#;
1424
1425        let doc = decompose(markdown).unwrap();
1426
1427        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Test");
1428        assert_eq!(
1429            doc.get_field("author").unwrap().as_str().unwrap(),
1430            "John Doe"
1431        );
1432        assert_eq!(doc.get_field("version").unwrap().as_f64().unwrap(), 1.0);
1433    }
1434}
// Tests driven by the bundled demo fixture, plus checks of the input-size and
// YAML-size guards enforced by `decompose`.
#[cfg(test)]
mod demo_file_test {
    use super::*;

    /// Parses the bundled extended-metadata demo document and spot-checks its
    /// global fields, body text and SCOPE collections.
    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        // Verify global fields
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Extended Metadata Demo"
        );
        assert_eq!(
            doc.get_field("author").unwrap().as_str().unwrap(),
            "Quillmark Team"
        );
        // version is parsed as a number by YAML
        assert_eq!(doc.get_field("version").unwrap().as_f64().unwrap(), 1.0);

        // Verify body
        assert!(doc
            .body()
            .unwrap()
            .contains("extended YAML metadata standard"));

        // Verify features collection
        let features = doc.get_field("features").unwrap().as_sequence().unwrap();
        assert_eq!(features.len(), 3);

        // Verify use_cases collection
        let use_cases = doc.get_field("use_cases").unwrap().as_sequence().unwrap();
        assert_eq!(use_cases.len(), 2);

        // Check first feature
        let feature1 = features[0].as_object().unwrap();
        assert_eq!(
            feature1.get("name").unwrap().as_str().unwrap(),
            "Tag Directives"
        );
    }

    /// Input one byte longer than `MAX_INPUT_SIZE` must be rejected with an
    /// "Input too large" error.
    #[test]
    fn test_input_size_limit() {
        let size = crate::error::MAX_INPUT_SIZE + 1;
        let large_markdown = "a".repeat(size);

        let result = decompose(&large_markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("Input too large"));
    }

    /// A frontmatter block holding a value longer than `MAX_YAML_SIZE` must be
    /// rejected with a "YAML block too large" error.
    #[test]
    fn test_yaml_size_limit() {
        let mut markdown = String::from("---\n");

        // A single quoted scalar that alone exceeds the YAML size limit.
        let size = crate::error::MAX_YAML_SIZE + 1;
        markdown.push_str("data: \"");
        markdown.push_str(&"x".repeat(size));
        markdown.push_str("\"\n---\n\nBody");

        let result = decompose(&markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("YAML block too large"));
    }

    /// A small document (about 1 KB of body text) parses successfully.
    #[test]
    fn test_input_within_size_limit() {
        // Deliberately far below the limit; this is not a boundary test.
        let size = 1000;
        let markdown = format!("---\ntitle: Test\n---\n\n{}", "a".repeat(size));

        let result = decompose(&markdown);
        assert!(result.is_ok());
    }

    /// A small frontmatter block parses successfully.
    #[test]
    fn test_yaml_within_size_limit() {
        let markdown = "---\ntitle: Test\nauthor: John Doe\n---\n\nBody content";

        // `markdown` is already a `&str`, so it is passed without re-borrowing.
        let result = decompose(markdown);
        assert!(result.is_ok());
    }
}