quillmark_core/
parse.rs

1//! # Parsing Module
2//!
3//! Parsing functionality for markdown documents with YAML frontmatter.
4//!
5//! ## Overview
6//!
7//! The `parse` module provides the [`decompose`] function for parsing markdown documents
8//! and the [`ParsedDocument`] type for accessing parsed content.
9//!
10//! ## Key Types
11//!
12//! - [`ParsedDocument`]: Container for parsed frontmatter fields and body content
13//! - [`BODY_FIELD`]: Constant for the field name storing document body
14//!
15//! ## Examples
16//!
17//! ### Basic Parsing
18//!
19//! ```
20//! use quillmark_core::decompose;
21//!
22//! let markdown = r#"---
23//! title: My Document
24//! author: John Doe
25//! ---
26//!
27//! # Introduction
28//!
29//! Document content here.
30//! "#;
31//!
32//! let doc = decompose(markdown).unwrap();
33//! let title = doc.get_field("title")
34//!     .and_then(|v| v.as_str())
35//!     .unwrap_or("Untitled");
36//! ```
37//!
38//! ### Extended Metadata with Tags
39//!
40//! ```
41//! use quillmark_core::decompose;
42//!
43//! let markdown = r#"---
44//! catalog_title: Product Catalog
45//! ---
46//!
47//! # Products
48//!
49//! ---
50//! SCOPE: products
51//! name: Widget
52//! price: 19.99
53//! ---
54//!
55//! A versatile widget for all occasions.
56//! "#;
57//!
58//! let doc = decompose(markdown).unwrap();
59//!
60//! // Access tagged collections
61//! if let Some(products) = doc.get_field("products")
62//!     .and_then(|v| v.as_sequence())
63//! {
64//!     for product in products {
65//!         let name = product.get("name").and_then(|v| v.as_str()).unwrap();
66//!         let price = product.get("price").and_then(|v| v.as_f64()).unwrap();
67//!         println!("{}: ${}", name, price);
68//!     }
69//! }
70//! ```
71//!
72//! ## Error Handling
73//!
74//! The [`decompose`] function returns errors for:
75//! - Malformed YAML syntax
76//! - Unclosed frontmatter blocks
77//! - Multiple global frontmatter blocks
78//! - Both QUILL and SCOPE specified in the same block
79//! - Reserved field name usage
80//! - Name collisions
81//!
82//! See [PARSE.md](https://github.com/nibsbin/quillmark/blob/main/designs/PARSE.md) for comprehensive documentation of the Extended YAML Metadata Standard.
83
84use std::collections::HashMap;
85
86use crate::value::QuillValue;
87
/// The field name used to store the document body.
///
/// `decompose` inserts the global body under this key, and each SCOPE-tagged
/// item also carries its own body under the same key. Because of that,
/// `find_metadata_blocks` rejects `SCOPE: body`.
pub const BODY_FIELD: &str = "body";

/// Reserved tag name for quill specification.
///
/// NOTE(review): not referenced by the parsing code visible in this file;
/// presumably consumed by other modules — confirm before removing.
pub const QUILL_TAG: &str = "quill";
93
94/// A parsed markdown document with frontmatter
95#[derive(Debug, Clone)]
96pub struct ParsedDocument {
97    fields: HashMap<String, QuillValue>,
98    quill_tag: Option<String>,
99}
100
101impl ParsedDocument {
102    /// Create a new ParsedDocument with the given fields
103    pub fn new(fields: HashMap<String, QuillValue>) -> Self {
104        Self {
105            fields,
106            quill_tag: None,
107        }
108    }
109
110    /// Create a ParsedDocument from fields and optional quill tag
111    pub fn with_quill_tag(fields: HashMap<String, QuillValue>, quill_tag: Option<String>) -> Self {
112        Self { fields, quill_tag }
113    }
114
115    /// Create a ParsedDocument from markdown string
116    pub fn from_markdown(markdown: &str) -> Result<Self, crate::error::ParseError> {
117        decompose(markdown).map_err(|e| crate::error::ParseError::from(e))
118    }
119
120    /// Get the quill tag if specified (from QUILL key)
121    pub fn quill_tag(&self) -> Option<&str> {
122        self.quill_tag.as_deref()
123    }
124
125    /// Get the document body
126    pub fn body(&self) -> Option<&str> {
127        self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
128    }
129
130    /// Get a specific field
131    pub fn get_field(&self, name: &str) -> Option<&QuillValue> {
132        self.fields.get(name)
133    }
134
135    /// Get all fields (including body)
136    pub fn fields(&self) -> &HashMap<String, QuillValue> {
137        &self.fields
138    }
139}
140
/// One `---`-delimited metadata block located by `find_metadata_blocks`.
///
/// `start` and `end` are byte offsets into the markdown string that was scanned.
#[derive(Debug)]
struct MetadataBlock {
    start: usize, // Byte offset of the opening "---"
    end: usize,   // Byte offset just past the closing delimiter (including its trailing line ending, when present)
    yaml_content: String, // YAML between the delimiters; re-serialized without the QUILL/SCOPE key when one was present
    tag: Option<String>,        // Field name from SCOPE key
    quill_name: Option<String>, // Quill name from QUILL key
}
149
/// Check that `name` matches the pattern `[a-z_][a-z0-9_]*`.
///
/// The first character must be an ASCII lowercase letter or `_`; every
/// following character must be an ASCII lowercase letter, an ASCII digit, or
/// `_`. The empty string is rejected.
fn is_valid_tag_name(name: &str) -> bool {
    let mut rest = name.chars();

    match rest.next() {
        Some(head) if head.is_ascii_lowercase() || head == '_' => {
            rest.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
        }
        // Empty input, or a leading character outside `[a-z_]`.
        _ => false,
    }
}
171
172/// Find all metadata blocks in the document
173fn find_metadata_blocks(
174    markdown: &str,
175) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
176    let mut blocks = Vec::new();
177    let mut pos = 0;
178
179    while pos < markdown.len() {
180        // Look for opening "---\n" or "---\r\n"
181        let search_str = &markdown[pos..];
182        let delimiter_result = if let Some(p) = search_str.find("---\n") {
183            Some((p, 4, "\n"))
184        } else if let Some(p) = search_str.find("---\r\n") {
185            Some((p, 5, "\r\n"))
186        } else {
187            None
188        };
189
190        if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
191            let abs_pos = pos + delimiter_pos;
192            let content_start = abs_pos + delimiter_len; // After "---\n" or "---\r\n"
193
194            // Check if this --- is a horizontal rule (blank lines above AND below)
195            let preceded_by_blank = if abs_pos > 0 {
196                // Check if there's a blank line before the ---
197                let before = &markdown[..abs_pos];
198                before.ends_with("\n\n") || before.ends_with("\r\n\r\n")
199            } else {
200                false
201            };
202
203            let followed_by_blank = if content_start < markdown.len() {
204                markdown[content_start..].starts_with('\n')
205                    || markdown[content_start..].starts_with("\r\n")
206            } else {
207                false
208            };
209
210            // Horizontal rule: blank lines both above and below
211            if preceded_by_blank && followed_by_blank {
212                // This is a horizontal rule in the body, skip it
213                pos = abs_pos + 3; // Skip past "---"
214                continue;
215            }
216
217            // Check if followed by non-blank line (or if we're at document start)
218            // This starts a metadata block
219            if followed_by_blank {
220                // --- followed by blank line but NOT preceded by blank line
221                // This is NOT a metadata block opening, skip it
222                pos = abs_pos + 3;
223                continue;
224            }
225
226            // Found potential metadata block opening (followed by non-blank line)
227            // Look for closing "\n---\n" or "\r\n---\r\n" etc., OR "\n---" / "\r\n---" at end of document
228            let rest = &markdown[content_start..];
229
230            // First try to find delimiters with trailing newlines
231            let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
232            let closing_with_newline = closing_patterns
233                .iter()
234                .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
235                .min_by_key(|(p, _)| *p);
236
237            // Also check for closing at end of document (no trailing newline)
238            let closing_at_eof = ["\n---", "\r\n---"]
239                .iter()
240                .filter_map(|delim| {
241                    rest.find(delim).and_then(|p| {
242                        if p + delim.len() == rest.len() {
243                            Some((p, delim.len()))
244                        } else {
245                            None
246                        }
247                    })
248                })
249                .min_by_key(|(p, _)| *p);
250
251            let closing_result = match (closing_with_newline, closing_at_eof) {
252                (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
253                (Some(_), Some(_)) => closing_with_newline,
254                (Some(_), None) => closing_with_newline,
255                (None, Some(_)) => closing_at_eof,
256                (None, None) => None,
257            };
258
259            if let Some((closing_pos, closing_len)) = closing_result {
260                let abs_closing_pos = content_start + closing_pos;
261                let content = &markdown[content_start..abs_closing_pos];
262
263                // Check YAML size limit
264                if content.len() > crate::error::MAX_YAML_SIZE {
265                    return Err(format!(
266                        "YAML block too large: {} bytes (max: {} bytes)",
267                        content.len(),
268                        crate::error::MAX_YAML_SIZE
269                    )
270                    .into());
271                }
272
273                // Parse YAML content to check for reserved keys (QUILL, SCOPE)
274                // First, try to parse as YAML
275                let (tag, quill_name, yaml_content) = if !content.is_empty() {
276                    // Try to parse the YAML to check for reserved keys
277                    match serde_yaml::from_str::<serde_yaml::Value>(content) {
278                        Ok(yaml_value) => {
279                            if let Some(mapping) = yaml_value.as_mapping() {
280                                let quill_key = serde_yaml::Value::String("QUILL".to_string());
281                                let scope_key = serde_yaml::Value::String("SCOPE".to_string());
282
283                                let has_quill = mapping.contains_key(&quill_key);
284                                let has_scope = mapping.contains_key(&scope_key);
285
286                                if has_quill && has_scope {
287                                    return Err(
288                                        "Cannot specify both QUILL and SCOPE in the same block"
289                                            .into(),
290                                    );
291                                }
292
293                                if has_quill {
294                                    // Extract quill name
295                                    let quill_value = mapping.get(&quill_key).unwrap();
296                                    let quill_name_str = quill_value
297                                        .as_str()
298                                        .ok_or_else(|| "QUILL value must be a string")?;
299
300                                    if !is_valid_tag_name(quill_name_str) {
301                                        return Err(format!(
302                                            "Invalid quill name '{}': must match pattern [a-z_][a-z0-9_]*",
303                                            quill_name_str
304                                        )
305                                        .into());
306                                    }
307
308                                    // Remove QUILL from the YAML content for processing
309                                    let mut new_mapping = mapping.clone();
310                                    new_mapping.remove(&quill_key);
311                                    let new_yaml = serde_yaml::to_string(&new_mapping)
312                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
313
314                                    (None, Some(quill_name_str.to_string()), new_yaml)
315                                } else if has_scope {
316                                    // Extract scope field name
317                                    let scope_value = mapping.get(&scope_key).unwrap();
318                                    let field_name = scope_value
319                                        .as_str()
320                                        .ok_or_else(|| "SCOPE value must be a string")?;
321
322                                    if !is_valid_tag_name(field_name) {
323                                        return Err(format!(
324                                            "Invalid field name '{}': must match pattern [a-z_][a-z0-9_]*",
325                                            field_name
326                                        )
327                                        .into());
328                                    }
329
330                                    if field_name == BODY_FIELD {
331                                        return Err(format!(
332                                            "Cannot use reserved field name '{}' as SCOPE value",
333                                            BODY_FIELD
334                                        )
335                                        .into());
336                                    }
337
338                                    // Remove SCOPE from the YAML content for processing
339                                    let mut new_mapping = mapping.clone();
340                                    new_mapping.remove(&scope_key);
341                                    let new_yaml = serde_yaml::to_string(&new_mapping)
342                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
343
344                                    (Some(field_name.to_string()), None, new_yaml)
345                                } else {
346                                    // No reserved keys, treat as normal YAML
347                                    (None, None, content.to_string())
348                                }
349                            } else {
350                                // Not a mapping, treat as normal YAML
351                                (None, None, content.to_string())
352                            }
353                        }
354                        Err(_) => {
355                            // If YAML parsing fails here, we'll catch it later
356                            (None, None, content.to_string())
357                        }
358                    }
359                } else {
360                    (None, None, content.to_string())
361                };
362
363                blocks.push(MetadataBlock {
364                    start: abs_pos,
365                    end: abs_closing_pos + closing_len, // After closing delimiter
366                    yaml_content,
367                    tag,
368                    quill_name,
369                });
370
371                pos = abs_closing_pos + closing_len;
372            } else if abs_pos == 0 {
373                // Frontmatter started but not closed
374                return Err("Frontmatter started but not closed with ---".into());
375            } else {
376                // Not a valid metadata block, skip this position
377                pos = abs_pos + 3;
378            }
379        } else {
380            break;
381        }
382    }
383
384    Ok(blocks)
385}
386
387/// Decompose markdown into frontmatter fields and body
388pub fn decompose(
389    markdown: &str,
390) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
391    // Check input size limit
392    if markdown.len() > crate::error::MAX_INPUT_SIZE {
393        return Err(format!(
394            "Input too large: {} bytes (max: {} bytes)",
395            markdown.len(),
396            crate::error::MAX_INPUT_SIZE
397        )
398        .into());
399    }
400
401    let mut fields = HashMap::new();
402
403    // Find all metadata blocks
404    let blocks = find_metadata_blocks(markdown)?;
405
406    if blocks.is_empty() {
407        // No metadata blocks, entire content is body
408        fields.insert(
409            BODY_FIELD.to_string(),
410            QuillValue::from_json(serde_json::Value::String(markdown.to_string())),
411        );
412        return Ok(ParsedDocument::new(fields));
413    }
414
415    // Track which attributes are used for tagged blocks
416    let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
417    let mut has_global_frontmatter = false;
418    let mut global_frontmatter_index: Option<usize> = None;
419    let mut quill_name: Option<String> = None;
420
421    // First pass: identify global frontmatter, quill directive, and validate
422    for (idx, block) in blocks.iter().enumerate() {
423        // Check for quill directive
424        if let Some(ref name) = block.quill_name {
425            if quill_name.is_some() {
426                return Err("Multiple quill directives found: only one allowed".into());
427            }
428            quill_name = Some(name.clone());
429        }
430
431        // Check for global frontmatter (no tag and no quill directive)
432        if block.tag.is_none() && block.quill_name.is_none() {
433            if has_global_frontmatter {
434                return Err(
435                    "Multiple global frontmatter blocks found: only one untagged block allowed"
436                        .into(),
437                );
438            }
439            has_global_frontmatter = true;
440            global_frontmatter_index = Some(idx);
441        }
442    }
443
444    // Parse global frontmatter if present
445    if let Some(idx) = global_frontmatter_index {
446        let block = &blocks[idx];
447
448        // Parse YAML frontmatter
449        let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
450            HashMap::new()
451        } else {
452            serde_yaml::from_str(&block.yaml_content)
453                .map_err(|e| format!("Invalid YAML frontmatter: {}", e))?
454        };
455
456        // Check that all tagged blocks don't conflict with global fields
457        // Exception: if the global field is an array, allow it (we'll merge later)
458        for other_block in &blocks {
459            if let Some(ref tag) = other_block.tag {
460                if let Some(global_value) = yaml_fields.get(tag) {
461                    // Check if the global value is an array
462                    if global_value.as_sequence().is_none() {
463                        return Err(format!(
464                            "Name collision: global field '{}' conflicts with tagged attribute",
465                            tag
466                        )
467                        .into());
468                    }
469                }
470            }
471        }
472
473        // Convert YAML values to QuillValue at boundary
474        for (key, value) in yaml_fields {
475            fields.insert(key, QuillValue::from_yaml(value)?);
476        }
477    }
478
479    // Process blocks with quill directives
480    for block in &blocks {
481        if block.quill_name.is_some() {
482            // Quill directive blocks can have YAML content (becomes part of frontmatter)
483            if !block.yaml_content.is_empty() {
484                let yaml_fields: HashMap<String, serde_yaml::Value> =
485                    serde_yaml::from_str(&block.yaml_content)
486                        .map_err(|e| format!("Invalid YAML in quill block: {}", e))?;
487
488                // Check for conflicts with existing fields
489                for key in yaml_fields.keys() {
490                    if fields.contains_key(key) {
491                        return Err(format!(
492                            "Name collision: quill block field '{}' conflicts with existing field",
493                            key
494                        )
495                        .into());
496                    }
497                }
498
499                // Convert YAML values to QuillValue at boundary
500                for (key, value) in yaml_fields {
501                    fields.insert(key, QuillValue::from_yaml(value)?);
502                }
503            }
504        }
505    }
506
507    // Parse tagged blocks
508    for (idx, block) in blocks.iter().enumerate() {
509        if let Some(ref tag_name) = block.tag {
510            // Check if this conflicts with global fields
511            // Exception: if the global field is an array, allow it (we'll merge later)
512            if let Some(existing_value) = fields.get(tag_name) {
513                if existing_value.as_array().is_none() {
514                    return Err(format!(
515                        "Name collision: tagged attribute '{}' conflicts with global field",
516                        tag_name
517                    )
518                    .into());
519                }
520            }
521
522            // Parse YAML metadata
523            let mut item_fields: HashMap<String, serde_yaml::Value> =
524                if block.yaml_content.is_empty() {
525                    HashMap::new()
526                } else {
527                    serde_yaml::from_str(&block.yaml_content).map_err(|e| {
528                        format!("Invalid YAML in tagged block '{}': {}", tag_name, e)
529                    })?
530                };
531
532            // Extract body for this tagged block
533            let body_start = block.end;
534            let body_end = if idx + 1 < blocks.len() {
535                blocks[idx + 1].start
536            } else {
537                markdown.len()
538            };
539            let body = &markdown[body_start..body_end];
540
541            // Add body to item fields
542            item_fields.insert(
543                BODY_FIELD.to_string(),
544                serde_yaml::Value::String(body.to_string()),
545            );
546
547            // Convert HashMap to serde_yaml::Value::Mapping
548            let item_value = serde_yaml::to_value(item_fields)?;
549
550            // Add to collection
551            tagged_attributes
552                .entry(tag_name.clone())
553                .or_insert_with(Vec::new)
554                .push(item_value);
555        }
556    }
557
558    // Extract global body
559    // Body starts after global frontmatter or quill block (whichever comes first)
560    // Body ends at the first scope block or EOF
561    let first_non_scope_block_idx = blocks
562        .iter()
563        .position(|b| b.tag.is_none() && b.quill_name.is_none())
564        .or_else(|| blocks.iter().position(|b| b.quill_name.is_some()));
565
566    let (body_start, body_end) = if let Some(idx) = first_non_scope_block_idx {
567        // Body starts after the first non-scope block (global frontmatter or quill)
568        let start = blocks[idx].end;
569
570        // Body ends at the first scope block after this, or EOF
571        let end = blocks
572            .iter()
573            .skip(idx + 1)
574            .find(|b| b.tag.is_some())
575            .map(|b| b.start)
576            .unwrap_or(markdown.len());
577
578        (start, end)
579    } else {
580        // No global frontmatter or quill block - body is everything before the first scope block
581        let end = blocks
582            .iter()
583            .find(|b| b.tag.is_some())
584            .map(|b| b.start)
585            .unwrap_or(0);
586
587        (0, end)
588    };
589
590    let global_body = &markdown[body_start..body_end];
591
592    fields.insert(
593        BODY_FIELD.to_string(),
594        QuillValue::from_json(serde_json::Value::String(global_body.to_string())),
595    );
596
597    // Add all tagged collections to fields (convert to QuillValue)
598    // If a field already exists and is an array, merge the new items into it
599    for (tag_name, items) in tagged_attributes {
600        if let Some(existing_value) = fields.get(&tag_name) {
601            // The existing value must be an array (checked earlier)
602            if let Some(existing_array) = existing_value.as_array() {
603                // Convert new items from YAML to JSON
604                let new_items_json: Vec<serde_json::Value> = items
605                    .into_iter()
606                    .map(|yaml_val| {
607                        serde_json::to_value(&yaml_val)
608                            .map_err(|e| format!("Failed to convert YAML to JSON: {}", e))
609                    })
610                    .collect::<Result<Vec<_>, _>>()?;
611
612                // Combine existing and new items
613                let mut merged_array = existing_array.clone();
614                merged_array.extend(new_items_json);
615
616                // Create QuillValue from merged JSON array
617                let quill_value = QuillValue::from_json(serde_json::Value::Array(merged_array));
618                fields.insert(tag_name, quill_value);
619            } else {
620                // This should not happen due to earlier validation, but handle it gracefully
621                return Err(format!(
622                    "Internal error: field '{}' exists but is not an array",
623                    tag_name
624                )
625                .into());
626            }
627        } else {
628            // No existing field, just create a new sequence
629            let quill_value = QuillValue::from_yaml(serde_yaml::Value::Sequence(items))?;
630            fields.insert(tag_name, quill_value);
631        }
632    }
633
634    let mut parsed = ParsedDocument::new(fields);
635
636    // Set quill tag if present
637    if let Some(name) = quill_name {
638        parsed.quill_tag = Some(name);
639    }
640
641    Ok(parsed)
642}
643
644#[cfg(test)]
645mod tests {
646    use super::*;
647
648    #[test]
649    fn test_no_frontmatter() {
650        let markdown = "# Hello World\n\nThis is a test.";
651        let doc = decompose(markdown).unwrap();
652
653        assert_eq!(doc.body(), Some(markdown));
654        assert_eq!(doc.fields().len(), 1);
655    }
656
657    #[test]
658    fn test_with_frontmatter() {
659        let markdown = r#"---
660title: Test Document
661author: Test Author
662---
663
664# Hello World
665
666This is the body."#;
667
668        let doc = decompose(markdown).unwrap();
669
670        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
671        assert_eq!(
672            doc.get_field("title").unwrap().as_str().unwrap(),
673            "Test Document"
674        );
675        assert_eq!(
676            doc.get_field("author").unwrap().as_str().unwrap(),
677            "Test Author"
678        );
679        assert_eq!(doc.fields().len(), 3); // title, author, body
680    }
681
682    #[test]
683    fn test_complex_yaml_frontmatter() {
684        let markdown = r#"---
685title: Complex Document
686tags:
687  - test
688  - yaml
689metadata:
690  version: 1.0
691  nested:
692    field: value
693---
694
695Content here."#;
696
697        let doc = decompose(markdown).unwrap();
698
699        assert_eq!(doc.body(), Some("\nContent here."));
700        assert_eq!(
701            doc.get_field("title").unwrap().as_str().unwrap(),
702            "Complex Document"
703        );
704
705        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
706        assert_eq!(tags.len(), 2);
707        assert_eq!(tags[0].as_str().unwrap(), "test");
708        assert_eq!(tags[1].as_str().unwrap(), "yaml");
709    }
710
711    #[test]
712    fn test_invalid_yaml() {
713        let markdown = r#"---
714title: [invalid yaml
715author: missing close bracket
716---
717
718Content here."#;
719
720        let result = decompose(markdown);
721        assert!(result.is_err());
722        assert!(result
723            .unwrap_err()
724            .to_string()
725            .contains("Invalid YAML frontmatter"));
726    }
727
728    #[test]
729    fn test_unclosed_frontmatter() {
730        let markdown = r#"---
731title: Test
732author: Test Author
733
734Content without closing ---"#;
735
736        let result = decompose(markdown);
737        assert!(result.is_err());
738        assert!(result.unwrap_err().to_string().contains("not closed"));
739    }
740
741    // Extended metadata tests
742
743    #[test]
744    fn test_basic_tagged_block() {
745        let markdown = r#"---
746title: Main Document
747---
748
749Main body content.
750
751---
752SCOPE: items
753name: Item 1
754---
755
756Body of item 1."#;
757
758        let doc = decompose(markdown).unwrap();
759
760        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
761        assert_eq!(
762            doc.get_field("title").unwrap().as_str().unwrap(),
763            "Main Document"
764        );
765
766        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
767        assert_eq!(items.len(), 1);
768
769        let item = items[0].as_object().unwrap();
770        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
771        assert_eq!(
772            item.get("body").unwrap().as_str().unwrap(),
773            "\nBody of item 1."
774        );
775    }
776
777    #[test]
778    fn test_multiple_tagged_blocks() {
779        let markdown = r#"---
780SCOPE: items
781name: Item 1
782tags: [a, b]
783---
784
785First item body.
786
787---
788SCOPE: items
789name: Item 2
790tags: [c, d]
791---
792
793Second item body."#;
794
795        let doc = decompose(markdown).unwrap();
796
797        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
798        assert_eq!(items.len(), 2);
799
800        let item1 = items[0].as_object().unwrap();
801        assert_eq!(item1.get("name").unwrap().as_str().unwrap(), "Item 1");
802
803        let item2 = items[1].as_object().unwrap();
804        assert_eq!(item2.get("name").unwrap().as_str().unwrap(), "Item 2");
805    }
806
807    #[test]
808    fn test_mixed_global_and_tagged() {
809        let markdown = r#"---
810title: Global
811author: John Doe
812---
813
814Global body.
815
816---
817SCOPE: sections
818title: Section 1
819---
820
821Section 1 content.
822
823---
824SCOPE: sections
825title: Section 2
826---
827
828Section 2 content."#;
829
830        let doc = decompose(markdown).unwrap();
831
832        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Global");
833        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));
834
835        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
836        assert_eq!(sections.len(), 2);
837    }
838
839    #[test]
840    fn test_empty_tagged_metadata() {
841        let markdown = r#"---
842SCOPE: items
843---
844
845Body without metadata."#;
846
847        let doc = decompose(markdown).unwrap();
848
849        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
850        assert_eq!(items.len(), 1);
851
852        let item = items[0].as_object().unwrap();
853        assert_eq!(
854            item.get("body").unwrap().as_str().unwrap(),
855            "\nBody without metadata."
856        );
857    }
858
859    #[test]
860    fn test_tagged_block_without_body() {
861        let markdown = r#"---
862SCOPE: items
863name: Item
864---"#;
865
866        let doc = decompose(markdown).unwrap();
867
868        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
869        assert_eq!(items.len(), 1);
870
871        let item = items[0].as_object().unwrap();
872        assert_eq!(item.get("body").unwrap().as_str().unwrap(), "");
873    }
874
875    #[test]
876    fn test_name_collision_global_and_tagged() {
877        let markdown = r#"---
878items: "global value"
879---
880
881Body
882
883---
884SCOPE: items
885name: Item
886---
887
888Item body"#;
889
890        let result = decompose(markdown);
891        assert!(result.is_err());
892        assert!(result.unwrap_err().to_string().contains("collision"));
893    }
894
895    #[test]
896    fn test_global_array_merged_with_scope() {
897        // When global frontmatter has an array field with the same name as a SCOPE,
898        // the SCOPE items should be added to the array
899        let markdown = r#"---
900items:
901  - name: Global Item 1
902    value: 100
903  - name: Global Item 2
904    value: 200
905---
906
907Global body
908
909---
910SCOPE: items
911name: Scope Item 1
912value: 300
913---
914
915Scope item 1 body
916
917---
918SCOPE: items
919name: Scope Item 2
920value: 400
921---
922
923Scope item 2 body"#;
924
925        let doc = decompose(markdown).unwrap();
926
927        // Verify the items array has all 4 items (2 from global + 2 from SCOPE)
928        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
929        assert_eq!(items.len(), 4);
930
931        // Verify first two items (from global array)
932        let item1 = items[0].as_object().unwrap();
933        assert_eq!(
934            item1.get("name").unwrap().as_str().unwrap(),
935            "Global Item 1"
936        );
937        assert_eq!(item1.get("value").unwrap().as_i64().unwrap(), 100);
938
939        let item2 = items[1].as_object().unwrap();
940        assert_eq!(
941            item2.get("name").unwrap().as_str().unwrap(),
942            "Global Item 2"
943        );
944        assert_eq!(item2.get("value").unwrap().as_i64().unwrap(), 200);
945
946        // Verify last two items (from SCOPE blocks)
947        let item3 = items[2].as_object().unwrap();
948        assert_eq!(item3.get("name").unwrap().as_str().unwrap(), "Scope Item 1");
949        assert_eq!(item3.get("value").unwrap().as_i64().unwrap(), 300);
950        assert_eq!(
951            item3.get("body").unwrap().as_str().unwrap(),
952            "\nScope item 1 body\n\n"
953        );
954
955        let item4 = items[3].as_object().unwrap();
956        assert_eq!(item4.get("name").unwrap().as_str().unwrap(), "Scope Item 2");
957        assert_eq!(item4.get("value").unwrap().as_i64().unwrap(), 400);
958        assert_eq!(
959            item4.get("body").unwrap().as_str().unwrap(),
960            "\nScope item 2 body"
961        );
962    }
963
964    #[test]
965    fn test_empty_global_array_with_scope() {
966        // Edge case: global frontmatter has an empty array
967        let markdown = r#"---
968items: []
969---
970
971Global body
972
973---
974SCOPE: items
975name: Item 1
976---
977
978Item 1 body"#;
979
980        let doc = decompose(markdown).unwrap();
981
982        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
983        assert_eq!(items.len(), 1);
984
985        let item = items[0].as_object().unwrap();
986        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
987    }
988
989    #[test]
990    fn test_reserved_field_name() {
991        let markdown = r#"---
992SCOPE: body
993content: Test
994---"#;
995
996        let result = decompose(markdown);
997        assert!(result.is_err());
998        assert!(result.unwrap_err().to_string().contains("reserved"));
999    }
1000
1001    #[test]
1002    fn test_invalid_tag_syntax() {
1003        let markdown = r#"---
1004SCOPE: Invalid-Name
1005title: Test
1006---"#;
1007
1008        let result = decompose(markdown);
1009        assert!(result.is_err());
1010        assert!(result
1011            .unwrap_err()
1012            .to_string()
1013            .contains("Invalid field name"));
1014    }
1015
1016    #[test]
1017    fn test_multiple_global_frontmatter_blocks() {
1018        let markdown = r#"---
1019title: First
1020---
1021
1022Body
1023
1024---
1025author: Second
1026---
1027
1028More body"#;
1029
1030        let result = decompose(markdown);
1031        assert!(result.is_err());
1032        assert!(result
1033            .unwrap_err()
1034            .to_string()
1035            .contains("Multiple global frontmatter"));
1036    }
1037
1038    #[test]
1039    fn test_adjacent_blocks_different_tags() {
1040        let markdown = r#"---
1041SCOPE: items
1042name: Item 1
1043---
1044
1045Item 1 body
1046
1047---
1048SCOPE: sections
1049title: Section 1
1050---
1051
1052Section 1 body"#;
1053
1054        let doc = decompose(markdown).unwrap();
1055
1056        assert!(doc.get_field("items").is_some());
1057        assert!(doc.get_field("sections").is_some());
1058
1059        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1060        assert_eq!(items.len(), 1);
1061
1062        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1063        assert_eq!(sections.len(), 1);
1064    }
1065
1066    #[test]
1067    fn test_order_preservation() {
1068        let markdown = r#"---
1069SCOPE: items
1070id: 1
1071---
1072
1073First
1074
1075---
1076SCOPE: items
1077id: 2
1078---
1079
1080Second
1081
1082---
1083SCOPE: items
1084id: 3
1085---
1086
1087Third"#;
1088
1089        let doc = decompose(markdown).unwrap();
1090
1091        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1092        assert_eq!(items.len(), 3);
1093
1094        for (i, item) in items.iter().enumerate() {
1095            let mapping = item.as_object().unwrap();
1096            let id = mapping.get("id").unwrap().as_i64().unwrap();
1097            assert_eq!(id, (i + 1) as i64);
1098        }
1099    }
1100
1101    #[test]
1102    fn test_product_catalog_integration() {
1103        let markdown = r#"---
1104title: Product Catalog
1105author: John Doe
1106date: 2024-01-01
1107---
1108
1109This is the main catalog description.
1110
1111---
1112SCOPE: products
1113name: Widget A
1114price: 19.99
1115sku: WID-001
1116---
1117
1118The **Widget A** is our most popular product.
1119
1120---
1121SCOPE: products
1122name: Gadget B
1123price: 29.99
1124sku: GAD-002
1125---
1126
1127The **Gadget B** is perfect for professionals.
1128
1129---
1130SCOPE: reviews
1131product: Widget A
1132rating: 5
1133---
1134
1135"Excellent product! Highly recommended."
1136
1137---
1138SCOPE: reviews
1139product: Gadget B
1140rating: 4
1141---
1142
1143"Very good, but a bit pricey.""#;
1144
1145        let doc = decompose(markdown).unwrap();
1146
1147        // Verify global fields
1148        assert_eq!(
1149            doc.get_field("title").unwrap().as_str().unwrap(),
1150            "Product Catalog"
1151        );
1152        assert_eq!(
1153            doc.get_field("author").unwrap().as_str().unwrap(),
1154            "John Doe"
1155        );
1156        assert_eq!(
1157            doc.get_field("date").unwrap().as_str().unwrap(),
1158            "2024-01-01"
1159        );
1160
1161        // Verify global body
1162        assert!(doc.body().unwrap().contains("main catalog description"));
1163
1164        // Verify products collection
1165        let products = doc.get_field("products").unwrap().as_sequence().unwrap();
1166        assert_eq!(products.len(), 2);
1167
1168        let product1 = products[0].as_object().unwrap();
1169        assert_eq!(product1.get("name").unwrap().as_str().unwrap(), "Widget A");
1170        assert_eq!(product1.get("price").unwrap().as_f64().unwrap(), 19.99);
1171
1172        // Verify reviews collection
1173        let reviews = doc.get_field("reviews").unwrap().as_sequence().unwrap();
1174        assert_eq!(reviews.len(), 2);
1175
1176        let review1 = reviews[0].as_object().unwrap();
1177        assert_eq!(
1178            review1.get("product").unwrap().as_str().unwrap(),
1179            "Widget A"
1180        );
1181        assert_eq!(review1.get("rating").unwrap().as_i64().unwrap(), 5);
1182
1183        // Total fields: title, author, date, body, products, reviews = 6
1184        assert_eq!(doc.fields().len(), 6);
1185    }
1186
1187    #[test]
1188    fn taro_quill_directive() {
1189        let markdown = r#"---
1190QUILL: usaf_memo
1191memo_for: [ORG/SYMBOL]
1192memo_from: [ORG/SYMBOL]
1193---
1194
1195This is the memo body."#;
1196
1197        let doc = decompose(markdown).unwrap();
1198
1199        // Verify quill tag is set
1200        assert_eq!(doc.quill_tag(), Some("usaf_memo"));
1201
1202        // Verify fields from quill block become frontmatter
1203        assert_eq!(
1204            doc.get_field("memo_for").unwrap().as_sequence().unwrap()[0]
1205                .as_str()
1206                .unwrap(),
1207            "ORG/SYMBOL"
1208        );
1209
1210        // Verify body
1211        assert_eq!(doc.body(), Some("\nThis is the memo body."));
1212    }
1213
1214    #[test]
1215    fn test_quill_with_scope_blocks() {
1216        let markdown = r#"---
1217QUILL: document
1218title: Test Document
1219---
1220
1221Main body.
1222
1223---
1224SCOPE: sections
1225name: Section 1
1226---
1227
1228Section 1 body."#;
1229
1230        let doc = decompose(markdown).unwrap();
1231
1232        // Verify quill tag
1233        assert_eq!(doc.quill_tag(), Some("document"));
1234
1235        // Verify global field from quill block
1236        assert_eq!(
1237            doc.get_field("title").unwrap().as_str().unwrap(),
1238            "Test Document"
1239        );
1240
1241        // Verify scope blocks work
1242        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1243        assert_eq!(sections.len(), 1);
1244
1245        // Verify body
1246        assert_eq!(doc.body(), Some("\nMain body.\n\n"));
1247    }
1248
1249    #[test]
1250    fn test_multiple_quill_directives_error() {
1251        let markdown = r#"---
1252QUILL: first
1253---
1254
1255---
1256QUILL: second
1257---"#;
1258
1259        let result = decompose(markdown);
1260        assert!(result.is_err());
1261        assert!(result
1262            .unwrap_err()
1263            .to_string()
1264            .contains("Multiple quill directives"));
1265    }
1266
1267    #[test]
1268    fn test_invalid_quill_name() {
1269        let markdown = r#"---
1270QUILL: Invalid-Name
1271---"#;
1272
1273        let result = decompose(markdown);
1274        assert!(result.is_err());
1275        assert!(result
1276            .unwrap_err()
1277            .to_string()
1278            .contains("Invalid quill name"));
1279    }
1280
1281    #[test]
1282    fn test_quill_wrong_value_type() {
1283        let markdown = r#"---
1284QUILL: 123
1285---"#;
1286
1287        let result = decompose(markdown);
1288        assert!(result.is_err());
1289        assert!(result
1290            .unwrap_err()
1291            .to_string()
1292            .contains("QUILL value must be a string"));
1293    }
1294
1295    #[test]
1296    fn test_scope_wrong_value_type() {
1297        let markdown = r#"---
1298SCOPE: 123
1299---"#;
1300
1301        let result = decompose(markdown);
1302        assert!(result.is_err());
1303        assert!(result
1304            .unwrap_err()
1305            .to_string()
1306            .contains("SCOPE value must be a string"));
1307    }
1308
1309    #[test]
1310    fn test_both_quill_and_scope_error() {
1311        let markdown = r#"---
1312QUILL: test
1313SCOPE: items
1314---"#;
1315
1316        let result = decompose(markdown);
1317        assert!(result.is_err());
1318        assert!(result
1319            .unwrap_err()
1320            .to_string()
1321            .contains("Cannot specify both QUILL and SCOPE"));
1322    }
1323
1324    #[test]
1325    fn test_blank_lines_in_frontmatter() {
1326        // New parsing standard: blank lines are allowed within YAML blocks
1327        let markdown = r#"---
1328title: Test Document
1329author: Test Author
1330
1331description: This has a blank line above it
1332tags:
1333  - one
1334  - two
1335---
1336
1337# Hello World
1338
1339This is the body."#;
1340
1341        let doc = decompose(markdown).unwrap();
1342
1343        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
1344        assert_eq!(
1345            doc.get_field("title").unwrap().as_str().unwrap(),
1346            "Test Document"
1347        );
1348        assert_eq!(
1349            doc.get_field("author").unwrap().as_str().unwrap(),
1350            "Test Author"
1351        );
1352        assert_eq!(
1353            doc.get_field("description").unwrap().as_str().unwrap(),
1354            "This has a blank line above it"
1355        );
1356
1357        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
1358        assert_eq!(tags.len(), 2);
1359    }
1360
1361    #[test]
1362    fn test_blank_lines_in_scope_blocks() {
1363        // Blank lines should be allowed in SCOPE blocks too
1364        let markdown = r#"---
1365SCOPE: items
1366name: Item 1
1367
1368price: 19.99
1369
1370tags:
1371  - electronics
1372  - gadgets
1373---
1374
1375Body of item 1."#;
1376
1377        let doc = decompose(markdown).unwrap();
1378
1379        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1380        assert_eq!(items.len(), 1);
1381
1382        let item = items[0].as_object().unwrap();
1383        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
1384        assert_eq!(item.get("price").unwrap().as_f64().unwrap(), 19.99);
1385
1386        let tags = item.get("tags").unwrap().as_array().unwrap();
1387        assert_eq!(tags.len(), 2);
1388    }
1389
1390    #[test]
1391    fn test_horizontal_rule_with_blank_lines_above_and_below() {
1392        // Horizontal rule: blank lines both above AND below the ---
1393        let markdown = r#"---
1394title: Test
1395---
1396
1397First paragraph.
1398
1399---
1400
1401Second paragraph."#;
1402
1403        let doc = decompose(markdown).unwrap();
1404
1405        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Test");
1406
1407        // The body should contain the horizontal rule (---) as part of the content
1408        let body = doc.body().unwrap();
1409        assert!(body.contains("First paragraph."));
1410        assert!(body.contains("---"));
1411        assert!(body.contains("Second paragraph."));
1412    }
1413
1414    #[test]
1415    fn test_horizontal_rule_not_preceded_by_blank() {
1416        // --- not preceded by blank line but followed by blank line is NOT a horizontal rule
1417        // It's also NOT a valid metadata block opening (since it's followed by blank)
1418        let markdown = r#"---
1419title: Test
1420---
1421
1422First paragraph.
1423---
1424
1425Second paragraph."#;
1426
1427        let doc = decompose(markdown).unwrap();
1428
1429        let body = doc.body().unwrap();
1430        // The second --- should be in the body as text (not a horizontal rule since no blank above)
1431        assert!(body.contains("---"));
1432    }
1433
1434    #[test]
1435    fn test_multiple_blank_lines_in_yaml() {
1436        // Multiple blank lines should also be allowed
1437        let markdown = r#"---
1438title: Test
1439
1440
1441author: John Doe
1442
1443
1444version: 1.0
1445---
1446
1447Body content."#;
1448
1449        let doc = decompose(markdown).unwrap();
1450
1451        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Test");
1452        assert_eq!(
1453            doc.get_field("author").unwrap().as_str().unwrap(),
1454            "John Doe"
1455        );
1456        assert_eq!(doc.get_field("version").unwrap().as_f64().unwrap(), 1.0);
1457    }
1458}
#[cfg(test)]
mod demo_file_test {
    // Tests against the bundled extended-metadata demo fixture, plus checks of
    // the input-size and YAML-size limits reported by `decompose`.
    use super::*;

    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        // Global string fields
        assert_eq!(
            doc.get_field("title").and_then(|v| v.as_str()),
            Some("Extended Metadata Demo")
        );
        assert_eq!(
            doc.get_field("author").and_then(|v| v.as_str()),
            Some("Quillmark Team")
        );
        // `version` is read back as a number rather than a string.
        assert_eq!(doc.get_field("version").and_then(|v| v.as_f64()), Some(1.0));

        // Global body
        let body = doc.body().unwrap();
        assert!(body.contains("extended YAML metadata standard"));

        // `features` collection
        let features = doc
            .get_field("features")
            .and_then(|v| v.as_sequence())
            .unwrap();
        assert_eq!(features.len(), 3);

        // `use_cases` collection
        let use_cases = doc
            .get_field("use_cases")
            .and_then(|v| v.as_sequence())
            .unwrap();
        assert_eq!(use_cases.len(), 2);

        // Spot-check the first feature entry.
        let first_feature = features[0].as_object().unwrap();
        assert_eq!(
            first_feature.get("name").and_then(|v| v.as_str()),
            Some("Tag Directives")
        );
    }

    #[test]
    fn test_input_size_limit() {
        // One byte over `MAX_INPUT_SIZE` must be rejected.
        let size = crate::error::MAX_INPUT_SIZE + 1;
        let large_markdown = "a".repeat(size);

        let err_msg = decompose(&large_markdown).unwrap_err().to_string();
        assert!(err_msg.contains("Input too large"));
    }

    #[test]
    fn test_yaml_size_limit() {
        // A single quoted YAML value that alone is longer than `MAX_YAML_SIZE`,
        // so the whole YAML block exceeds the limit.
        let size = crate::error::MAX_YAML_SIZE + 1;
        let markdown = format!("---\ndata: \"{}\"\n---\n\nBody", "x".repeat(size));

        let err_msg = decompose(&markdown).unwrap_err().to_string();
        assert!(err_msg.contains("YAML block too large"));
    }

    #[test]
    fn test_input_within_size_limit() {
        // Roughly 1 KB of input: well within the limit (this is not a
        // boundary test).
        let size = 1000;
        let markdown = format!("---\ntitle: Test\n---\n\n{}", "a".repeat(size));

        assert!(decompose(&markdown).is_ok());
    }

    #[test]
    fn test_yaml_within_size_limit() {
        // A small YAML block, well within the limit.
        let markdown = "---\ntitle: Test\nauthor: John Doe\n---\n\nBody content";

        // `markdown` is already a `&str`, so it is passed without re-borrowing.
        assert!(decompose(markdown).is_ok());
    }
}