quillmark_core/
parse.rs

1//! # Parsing Module
2//!
3//! Parsing functionality for markdown documents with YAML frontmatter.
4//!
5//! ## Overview
6//!
7//! The `parse` module provides the [`decompose`] function for parsing markdown documents
8//! and the [`ParsedDocument`] type for accessing parsed content.
9//!
10//! ## Key Types
11//!
12//! - [`ParsedDocument`]: Container for parsed frontmatter fields and body content
13//! - [`BODY_FIELD`]: Constant for the field name storing document body
14//!
15//! ## Examples
16//!
17//! ### Basic Parsing
18//!
19//! ```
20//! use quillmark_core::decompose;
21//!
22//! let markdown = r#"---
23//! title: My Document
24//! author: John Doe
25//! ---
26//!
27//! # Introduction
28//!
29//! Document content here.
30//! "#;
31//!
32//! let doc = decompose(markdown).unwrap();
33//! let title = doc.get_field("title")
34//!     .and_then(|v| v.as_str())
35//!     .unwrap_or("Untitled");
36//! ```
37//!
38//! ### Extended Metadata with Tags
39//!
40//! ```
41//! use quillmark_core::decompose;
42//!
43//! let markdown = r#"---
44//! catalog_title: Product Catalog
45//! ---
46//!
47//! # Products
48//!
49//! ---
50//! SCOPE: products
51//! name: Widget
52//! price: 19.99
53//! ---
54//!
55//! A versatile widget for all occasions.
56//! "#;
57//!
58//! let doc = decompose(markdown).unwrap();
59//!
60//! // Access tagged collections
61//! if let Some(products) = doc.get_field("products")
62//!     .and_then(|v| v.as_sequence())
63//! {
64//!     for product in products {
65//!         let name = product.get("name").and_then(|v| v.as_str()).unwrap();
66//!         let price = product.get("price").and_then(|v| v.as_f64()).unwrap();
67//!         println!("{}: ${}", name, price);
68//!     }
69//! }
70//! ```
71//!
72//! ## Error Handling
73//!
74//! The [`decompose`] function returns errors for:
75//! - Malformed YAML syntax
76//! - Unclosed frontmatter blocks
77//! - Multiple global frontmatter blocks
78//! - Both QUILL and SCOPE specified in the same block
79//! - Reserved field name usage
80//! - Name collisions
81//!
82//! See [PARSE.md](https://github.com/nibsbin/quillmark/blob/main/designs/PARSE.md) for comprehensive documentation of the Extended YAML Metadata Standard.
83
84use std::collections::HashMap;
85
86use crate::value::QuillValue;
87
/// Key under which the markdown body text is stored in a document's field map.
pub const BODY_FIELD: &str = "body";

/// Reserved tag name for the quill specification.
pub const QUILL_TAG: &str = "quill";
93
94/// A parsed markdown document with frontmatter
95#[derive(Debug, Clone)]
96pub struct ParsedDocument {
97    fields: HashMap<String, QuillValue>,
98    quill_tag: Option<String>,
99}
100
101impl ParsedDocument {
102    /// Create a new ParsedDocument with the given fields
103    pub fn new(fields: HashMap<String, QuillValue>) -> Self {
104        Self {
105            fields,
106            quill_tag: None,
107        }
108    }
109
110    /// Create a ParsedDocument from markdown string
111    pub fn from_markdown(markdown: &str) -> Result<Self, crate::error::ParseError> {
112        decompose(markdown).map_err(|e| crate::error::ParseError::from(e))
113    }
114
115    /// Get the quill tag if specified (from QUILL key)
116    pub fn quill_tag(&self) -> Option<&str> {
117        self.quill_tag.as_deref()
118    }
119
120    /// Get the document body
121    pub fn body(&self) -> Option<&str> {
122        self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
123    }
124
125    /// Get a specific field
126    pub fn get_field(&self, name: &str) -> Option<&QuillValue> {
127        self.fields.get(name)
128    }
129
130    /// Get all fields (including body)
131    pub fn fields(&self) -> &HashMap<String, QuillValue> {
132        &self.fields
133    }
134}
135
/// One `---`-delimited metadata block located inside the markdown source.
#[derive(Debug)]
struct MetadataBlock {
    /// Byte offset of the opening `---`.
    start: usize,
    /// Byte offset just past the closing delimiter (including its line ending, if any).
    end: usize,
    /// YAML text of the block, with any `QUILL` / `SCOPE` key already removed.
    yaml_content: String,
    /// Field name taken from the `SCOPE` key.
    tag: Option<String>,
    /// Quill name taken from the `QUILL` key.
    quill_name: Option<String>,
}
144
/// Report whether `name` matches the pattern `[a-z_][a-z0-9_]*`.
///
/// The empty string is rejected, and only ASCII lowercase letters, ASCII
/// digits and `_` are accepted (a digit may not come first).
fn is_valid_tag_name(name: &str) -> bool {
    let mut remaining = name.chars();

    // An empty name yields `None` here and therefore fails the leading-character test.
    let leading_ok = matches!(remaining.next(), Some(c) if c.is_ascii_lowercase() || c == '_');

    leading_ok && remaining.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
}
166
167/// Find all metadata blocks in the document
168fn find_metadata_blocks(
169    markdown: &str,
170) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
171    let mut blocks = Vec::new();
172    let mut pos = 0;
173
174    while pos < markdown.len() {
175        // Look for opening "---\n" or "---\r\n"
176        let search_str = &markdown[pos..];
177        let delimiter_result = if let Some(p) = search_str.find("---\n") {
178            Some((p, 4, "\n"))
179        } else if let Some(p) = search_str.find("---\r\n") {
180            Some((p, 5, "\r\n"))
181        } else {
182            None
183        };
184
185        if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
186            let abs_pos = pos + delimiter_pos;
187            let content_start = abs_pos + delimiter_len; // After "---\n" or "---\r\n"
188
189            // Check if opening --- is followed by a blank line (horizontal rule, not metadata)
190            let followed_by_blank = if content_start < markdown.len() {
191                markdown[content_start..].starts_with('\n')
192                    || markdown[content_start..].starts_with("\r\n")
193            } else {
194                false
195            };
196
197            if followed_by_blank {
198                // This is a horizontal rule in the body, skip it
199                pos = abs_pos + 3; // Skip past "---"
200                continue;
201            }
202
203            // Found potential metadata block opening
204            // Look for closing "\n---\n" or "\r\n---\r\n" etc., OR "\n---" / "\r\n---" at end of document
205            let rest = &markdown[content_start..];
206
207            // First try to find delimiters with trailing newlines
208            let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
209            let closing_with_newline = closing_patterns
210                .iter()
211                .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
212                .min_by_key(|(p, _)| *p);
213
214            // Also check for closing at end of document (no trailing newline)
215            let closing_at_eof = ["\n---", "\r\n---"]
216                .iter()
217                .filter_map(|delim| {
218                    rest.find(delim).and_then(|p| {
219                        if p + delim.len() == rest.len() {
220                            Some((p, delim.len()))
221                        } else {
222                            None
223                        }
224                    })
225                })
226                .min_by_key(|(p, _)| *p);
227
228            let closing_result = match (closing_with_newline, closing_at_eof) {
229                (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
230                (Some(_), Some(_)) => closing_with_newline,
231                (Some(_), None) => closing_with_newline,
232                (None, Some(_)) => closing_at_eof,
233                (None, None) => None,
234            };
235
236            if let Some((closing_pos, closing_len)) = closing_result {
237                let abs_closing_pos = content_start + closing_pos;
238                let content = &markdown[content_start..abs_closing_pos];
239
240                // Check YAML size limit
241                if content.len() > crate::error::MAX_YAML_SIZE {
242                    return Err(format!(
243                        "YAML block too large: {} bytes (max: {} bytes)",
244                        content.len(),
245                        crate::error::MAX_YAML_SIZE
246                    )
247                    .into());
248                }
249
250                // Check if the block is contiguous (no blank lines in the YAML content)
251                if content.contains("\n\n") || content.contains("\r\n\r\n") {
252                    // Not a contiguous block
253                    if abs_pos == 0 {
254                        // Started at beginning but has blank lines - this is an error
255                        return Err("Frontmatter started but not closed with ---".into());
256                    }
257                    // Otherwise treat as horizontal rule in body
258                    pos = abs_pos + 3;
259                    continue;
260                }
261
262                // Parse YAML content to check for reserved keys (QUILL, SCOPE)
263                // First, try to parse as YAML
264                let (tag, quill_name, yaml_content) = if !content.is_empty() {
265                    // Try to parse the YAML to check for reserved keys
266                    match serde_yaml::from_str::<serde_yaml::Value>(content) {
267                        Ok(yaml_value) => {
268                            if let Some(mapping) = yaml_value.as_mapping() {
269                                let quill_key = serde_yaml::Value::String("QUILL".to_string());
270                                let scope_key = serde_yaml::Value::String("SCOPE".to_string());
271
272                                let has_quill = mapping.contains_key(&quill_key);
273                                let has_scope = mapping.contains_key(&scope_key);
274
275                                if has_quill && has_scope {
276                                    return Err(
277                                        "Cannot specify both QUILL and SCOPE in the same block"
278                                            .into(),
279                                    );
280                                }
281
282                                if has_quill {
283                                    // Extract quill name
284                                    let quill_value = mapping.get(&quill_key).unwrap();
285                                    let quill_name_str = quill_value
286                                        .as_str()
287                                        .ok_or_else(|| "QUILL value must be a string")?;
288
289                                    if !is_valid_tag_name(quill_name_str) {
290                                        return Err(format!(
291                                            "Invalid quill name '{}': must match pattern [a-z_][a-z0-9_]*",
292                                            quill_name_str
293                                        )
294                                        .into());
295                                    }
296
297                                    // Remove QUILL from the YAML content for processing
298                                    let mut new_mapping = mapping.clone();
299                                    new_mapping.remove(&quill_key);
300                                    let new_yaml = serde_yaml::to_string(&new_mapping)
301                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
302
303                                    (None, Some(quill_name_str.to_string()), new_yaml)
304                                } else if has_scope {
305                                    // Extract scope field name
306                                    let scope_value = mapping.get(&scope_key).unwrap();
307                                    let field_name = scope_value
308                                        .as_str()
309                                        .ok_or_else(|| "SCOPE value must be a string")?;
310
311                                    if !is_valid_tag_name(field_name) {
312                                        return Err(format!(
313                                            "Invalid field name '{}': must match pattern [a-z_][a-z0-9_]*",
314                                            field_name
315                                        )
316                                        .into());
317                                    }
318
319                                    if field_name == BODY_FIELD {
320                                        return Err(format!(
321                                            "Cannot use reserved field name '{}' as SCOPE value",
322                                            BODY_FIELD
323                                        )
324                                        .into());
325                                    }
326
327                                    // Remove SCOPE from the YAML content for processing
328                                    let mut new_mapping = mapping.clone();
329                                    new_mapping.remove(&scope_key);
330                                    let new_yaml = serde_yaml::to_string(&new_mapping)
331                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
332
333                                    (Some(field_name.to_string()), None, new_yaml)
334                                } else {
335                                    // No reserved keys, treat as normal YAML
336                                    (None, None, content.to_string())
337                                }
338                            } else {
339                                // Not a mapping, treat as normal YAML
340                                (None, None, content.to_string())
341                            }
342                        }
343                        Err(_) => {
344                            // If YAML parsing fails here, we'll catch it later
345                            (None, None, content.to_string())
346                        }
347                    }
348                } else {
349                    (None, None, content.to_string())
350                };
351
352                blocks.push(MetadataBlock {
353                    start: abs_pos,
354                    end: abs_closing_pos + closing_len, // After closing delimiter
355                    yaml_content,
356                    tag,
357                    quill_name,
358                });
359
360                pos = abs_closing_pos + closing_len;
361            } else if abs_pos == 0 {
362                // Frontmatter started but not closed
363                return Err("Frontmatter started but not closed with ---".into());
364            } else {
365                // Not a valid metadata block, skip this position
366                pos = abs_pos + 3;
367            }
368        } else {
369            break;
370        }
371    }
372
373    Ok(blocks)
374}
375
376/// Decompose markdown into frontmatter fields and body
377pub fn decompose(
378    markdown: &str,
379) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
380    // Check input size limit
381    if markdown.len() > crate::error::MAX_INPUT_SIZE {
382        return Err(format!(
383            "Input too large: {} bytes (max: {} bytes)",
384            markdown.len(),
385            crate::error::MAX_INPUT_SIZE
386        )
387        .into());
388    }
389
390    let mut fields = HashMap::new();
391
392    // Find all metadata blocks
393    let blocks = find_metadata_blocks(markdown)?;
394
395    if blocks.is_empty() {
396        // No metadata blocks, entire content is body
397        fields.insert(
398            BODY_FIELD.to_string(),
399            QuillValue::from_json(serde_json::Value::String(markdown.to_string())),
400        );
401        return Ok(ParsedDocument::new(fields));
402    }
403
404    // Track which attributes are used for tagged blocks
405    let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
406    let mut has_global_frontmatter = false;
407    let mut global_frontmatter_index: Option<usize> = None;
408    let mut quill_name: Option<String> = None;
409
410    // First pass: identify global frontmatter, quill directive, and validate
411    for (idx, block) in blocks.iter().enumerate() {
412        // Check for quill directive
413        if let Some(ref name) = block.quill_name {
414            if quill_name.is_some() {
415                return Err("Multiple quill directives found: only one allowed".into());
416            }
417            quill_name = Some(name.clone());
418        }
419
420        // Check for global frontmatter (no tag and no quill directive)
421        if block.tag.is_none() && block.quill_name.is_none() {
422            if has_global_frontmatter {
423                return Err(
424                    "Multiple global frontmatter blocks found: only one untagged block allowed"
425                        .into(),
426                );
427            }
428            has_global_frontmatter = true;
429            global_frontmatter_index = Some(idx);
430        }
431    }
432
433    // Parse global frontmatter if present
434    if let Some(idx) = global_frontmatter_index {
435        let block = &blocks[idx];
436
437        // Parse YAML frontmatter
438        let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
439            HashMap::new()
440        } else {
441            serde_yaml::from_str(&block.yaml_content)
442                .map_err(|e| format!("Invalid YAML frontmatter: {}", e))?
443        };
444
445        // Check that all tagged blocks don't conflict with global fields
446        // Exception: if the global field is an array, allow it (we'll merge later)
447        for other_block in &blocks {
448            if let Some(ref tag) = other_block.tag {
449                if let Some(global_value) = yaml_fields.get(tag) {
450                    // Check if the global value is an array
451                    if global_value.as_sequence().is_none() {
452                        return Err(format!(
453                            "Name collision: global field '{}' conflicts with tagged attribute",
454                            tag
455                        )
456                        .into());
457                    }
458                }
459            }
460        }
461
462        // Convert YAML values to QuillValue at boundary
463        for (key, value) in yaml_fields {
464            fields.insert(key, QuillValue::from_yaml(value)?);
465        }
466    }
467
468    // Process blocks with quill directives
469    for block in &blocks {
470        if block.quill_name.is_some() {
471            // Quill directive blocks can have YAML content (becomes part of frontmatter)
472            if !block.yaml_content.is_empty() {
473                let yaml_fields: HashMap<String, serde_yaml::Value> =
474                    serde_yaml::from_str(&block.yaml_content)
475                        .map_err(|e| format!("Invalid YAML in quill block: {}", e))?;
476
477                // Check for conflicts with existing fields
478                for key in yaml_fields.keys() {
479                    if fields.contains_key(key) {
480                        return Err(format!(
481                            "Name collision: quill block field '{}' conflicts with existing field",
482                            key
483                        )
484                        .into());
485                    }
486                }
487
488                // Convert YAML values to QuillValue at boundary
489                for (key, value) in yaml_fields {
490                    fields.insert(key, QuillValue::from_yaml(value)?);
491                }
492            }
493        }
494    }
495
496    // Parse tagged blocks
497    for (idx, block) in blocks.iter().enumerate() {
498        if let Some(ref tag_name) = block.tag {
499            // Check if this conflicts with global fields
500            // Exception: if the global field is an array, allow it (we'll merge later)
501            if let Some(existing_value) = fields.get(tag_name) {
502                if existing_value.as_array().is_none() {
503                    return Err(format!(
504                        "Name collision: tagged attribute '{}' conflicts with global field",
505                        tag_name
506                    )
507                    .into());
508                }
509            }
510
511            // Parse YAML metadata
512            let mut item_fields: HashMap<String, serde_yaml::Value> =
513                if block.yaml_content.is_empty() {
514                    HashMap::new()
515                } else {
516                    serde_yaml::from_str(&block.yaml_content).map_err(|e| {
517                        format!("Invalid YAML in tagged block '{}': {}", tag_name, e)
518                    })?
519                };
520
521            // Extract body for this tagged block
522            let body_start = block.end;
523            let body_end = if idx + 1 < blocks.len() {
524                blocks[idx + 1].start
525            } else {
526                markdown.len()
527            };
528            let body = &markdown[body_start..body_end];
529
530            // Add body to item fields
531            item_fields.insert(
532                BODY_FIELD.to_string(),
533                serde_yaml::Value::String(body.to_string()),
534            );
535
536            // Convert HashMap to serde_yaml::Value::Mapping
537            let item_value = serde_yaml::to_value(item_fields)?;
538
539            // Add to collection
540            tagged_attributes
541                .entry(tag_name.clone())
542                .or_insert_with(Vec::new)
543                .push(item_value);
544        }
545    }
546
547    // Extract global body
548    // Body starts after global frontmatter or quill block (whichever comes first)
549    // Body ends at the first scope block or EOF
550    let first_non_scope_block_idx = blocks
551        .iter()
552        .position(|b| b.tag.is_none() && b.quill_name.is_none())
553        .or_else(|| blocks.iter().position(|b| b.quill_name.is_some()));
554
555    let (body_start, body_end) = if let Some(idx) = first_non_scope_block_idx {
556        // Body starts after the first non-scope block (global frontmatter or quill)
557        let start = blocks[idx].end;
558
559        // Body ends at the first scope block after this, or EOF
560        let end = blocks
561            .iter()
562            .skip(idx + 1)
563            .find(|b| b.tag.is_some())
564            .map(|b| b.start)
565            .unwrap_or(markdown.len());
566
567        (start, end)
568    } else {
569        // No global frontmatter or quill block - body is everything before the first scope block
570        let end = blocks
571            .iter()
572            .find(|b| b.tag.is_some())
573            .map(|b| b.start)
574            .unwrap_or(0);
575
576        (0, end)
577    };
578
579    let global_body = &markdown[body_start..body_end];
580
581    fields.insert(
582        BODY_FIELD.to_string(),
583        QuillValue::from_json(serde_json::Value::String(global_body.to_string())),
584    );
585
586    // Add all tagged collections to fields (convert to QuillValue)
587    // If a field already exists and is an array, merge the new items into it
588    for (tag_name, items) in tagged_attributes {
589        if let Some(existing_value) = fields.get(&tag_name) {
590            // The existing value must be an array (checked earlier)
591            if let Some(existing_array) = existing_value.as_array() {
592                // Convert new items from YAML to JSON
593                let new_items_json: Vec<serde_json::Value> = items
594                    .into_iter()
595                    .map(|yaml_val| {
596                        serde_json::to_value(&yaml_val)
597                            .map_err(|e| format!("Failed to convert YAML to JSON: {}", e))
598                    })
599                    .collect::<Result<Vec<_>, _>>()?;
600
601                // Combine existing and new items
602                let mut merged_array = existing_array.clone();
603                merged_array.extend(new_items_json);
604
605                // Create QuillValue from merged JSON array
606                let quill_value = QuillValue::from_json(serde_json::Value::Array(merged_array));
607                fields.insert(tag_name, quill_value);
608            } else {
609                // This should not happen due to earlier validation, but handle it gracefully
610                return Err(format!(
611                    "Internal error: field '{}' exists but is not an array",
612                    tag_name
613                )
614                .into());
615            }
616        } else {
617            // No existing field, just create a new sequence
618            let quill_value = QuillValue::from_yaml(serde_yaml::Value::Sequence(items))?;
619            fields.insert(tag_name, quill_value);
620        }
621    }
622
623    let mut parsed = ParsedDocument::new(fields);
624
625    // Set quill tag if present
626    if let Some(name) = quill_name {
627        parsed.quill_tag = Some(name);
628    }
629
630    Ok(parsed)
631}
632
633#[cfg(test)]
634mod tests {
635    use super::*;
636
637    #[test]
638    fn test_no_frontmatter() {
639        let markdown = "# Hello World\n\nThis is a test.";
640        let doc = decompose(markdown).unwrap();
641
642        assert_eq!(doc.body(), Some(markdown));
643        assert_eq!(doc.fields().len(), 1);
644    }
645
646    #[test]
647    fn test_with_frontmatter() {
648        let markdown = r#"---
649title: Test Document
650author: Test Author
651---
652
653# Hello World
654
655This is the body."#;
656
657        let doc = decompose(markdown).unwrap();
658
659        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
660        assert_eq!(
661            doc.get_field("title").unwrap().as_str().unwrap(),
662            "Test Document"
663        );
664        assert_eq!(
665            doc.get_field("author").unwrap().as_str().unwrap(),
666            "Test Author"
667        );
668        assert_eq!(doc.fields().len(), 3); // title, author, body
669    }
670
671    #[test]
672    fn test_complex_yaml_frontmatter() {
673        let markdown = r#"---
674title: Complex Document
675tags:
676  - test
677  - yaml
678metadata:
679  version: 1.0
680  nested:
681    field: value
682---
683
684Content here."#;
685
686        let doc = decompose(markdown).unwrap();
687
688        assert_eq!(doc.body(), Some("\nContent here."));
689        assert_eq!(
690            doc.get_field("title").unwrap().as_str().unwrap(),
691            "Complex Document"
692        );
693
694        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
695        assert_eq!(tags.len(), 2);
696        assert_eq!(tags[0].as_str().unwrap(), "test");
697        assert_eq!(tags[1].as_str().unwrap(), "yaml");
698    }
699
700    #[test]
701    fn test_invalid_yaml() {
702        let markdown = r#"---
703title: [invalid yaml
704author: missing close bracket
705---
706
707Content here."#;
708
709        let result = decompose(markdown);
710        assert!(result.is_err());
711        assert!(result
712            .unwrap_err()
713            .to_string()
714            .contains("Invalid YAML frontmatter"));
715    }
716
717    #[test]
718    fn test_unclosed_frontmatter() {
719        let markdown = r#"---
720title: Test
721author: Test Author
722
723Content without closing ---"#;
724
725        let result = decompose(markdown);
726        assert!(result.is_err());
727        assert!(result.unwrap_err().to_string().contains("not closed"));
728    }
729
730    // Extended metadata tests
731
732    #[test]
733    fn test_basic_tagged_block() {
734        let markdown = r#"---
735title: Main Document
736---
737
738Main body content.
739
740---
741SCOPE: items
742name: Item 1
743---
744
745Body of item 1."#;
746
747        let doc = decompose(markdown).unwrap();
748
749        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
750        assert_eq!(
751            doc.get_field("title").unwrap().as_str().unwrap(),
752            "Main Document"
753        );
754
755        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
756        assert_eq!(items.len(), 1);
757
758        let item = items[0].as_object().unwrap();
759        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
760        assert_eq!(
761            item.get("body").unwrap().as_str().unwrap(),
762            "\nBody of item 1."
763        );
764    }
765
766    #[test]
767    fn test_multiple_tagged_blocks() {
768        let markdown = r#"---
769SCOPE: items
770name: Item 1
771tags: [a, b]
772---
773
774First item body.
775
776---
777SCOPE: items
778name: Item 2
779tags: [c, d]
780---
781
782Second item body."#;
783
784        let doc = decompose(markdown).unwrap();
785
786        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
787        assert_eq!(items.len(), 2);
788
789        let item1 = items[0].as_object().unwrap();
790        assert_eq!(item1.get("name").unwrap().as_str().unwrap(), "Item 1");
791
792        let item2 = items[1].as_object().unwrap();
793        assert_eq!(item2.get("name").unwrap().as_str().unwrap(), "Item 2");
794    }
795
796    #[test]
797    fn test_mixed_global_and_tagged() {
798        let markdown = r#"---
799title: Global
800author: John Doe
801---
802
803Global body.
804
805---
806SCOPE: sections
807title: Section 1
808---
809
810Section 1 content.
811
812---
813SCOPE: sections
814title: Section 2
815---
816
817Section 2 content."#;
818
819        let doc = decompose(markdown).unwrap();
820
821        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Global");
822        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));
823
824        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
825        assert_eq!(sections.len(), 2);
826    }
827
828    #[test]
829    fn test_empty_tagged_metadata() {
830        let markdown = r#"---
831SCOPE: items
832---
833
834Body without metadata."#;
835
836        let doc = decompose(markdown).unwrap();
837
838        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
839        assert_eq!(items.len(), 1);
840
841        let item = items[0].as_object().unwrap();
842        assert_eq!(
843            item.get("body").unwrap().as_str().unwrap(),
844            "\nBody without metadata."
845        );
846    }
847
848    #[test]
849    fn test_tagged_block_without_body() {
850        let markdown = r#"---
851SCOPE: items
852name: Item
853---"#;
854
855        let doc = decompose(markdown).unwrap();
856
857        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
858        assert_eq!(items.len(), 1);
859
860        let item = items[0].as_object().unwrap();
861        assert_eq!(item.get("body").unwrap().as_str().unwrap(), "");
862    }
863
864    #[test]
865    fn test_name_collision_global_and_tagged() {
866        let markdown = r#"---
867items: "global value"
868---
869
870Body
871
872---
873SCOPE: items
874name: Item
875---
876
877Item body"#;
878
879        let result = decompose(markdown);
880        assert!(result.is_err());
881        assert!(result.unwrap_err().to_string().contains("collision"));
882    }
883
884    #[test]
885    fn test_global_array_merged_with_scope() {
886        // When global frontmatter has an array field with the same name as a SCOPE,
887        // the SCOPE items should be added to the array
888        let markdown = r#"---
889items:
890  - name: Global Item 1
891    value: 100
892  - name: Global Item 2
893    value: 200
894---
895
896Global body
897
898---
899SCOPE: items
900name: Scope Item 1
901value: 300
902---
903
904Scope item 1 body
905
906---
907SCOPE: items
908name: Scope Item 2
909value: 400
910---
911
912Scope item 2 body"#;
913
914        let doc = decompose(markdown).unwrap();
915
916        // Verify the items array has all 4 items (2 from global + 2 from SCOPE)
917        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
918        assert_eq!(items.len(), 4);
919
920        // Verify first two items (from global array)
921        let item1 = items[0].as_object().unwrap();
922        assert_eq!(
923            item1.get("name").unwrap().as_str().unwrap(),
924            "Global Item 1"
925        );
926        assert_eq!(item1.get("value").unwrap().as_i64().unwrap(), 100);
927
928        let item2 = items[1].as_object().unwrap();
929        assert_eq!(
930            item2.get("name").unwrap().as_str().unwrap(),
931            "Global Item 2"
932        );
933        assert_eq!(item2.get("value").unwrap().as_i64().unwrap(), 200);
934
935        // Verify last two items (from SCOPE blocks)
936        let item3 = items[2].as_object().unwrap();
937        assert_eq!(item3.get("name").unwrap().as_str().unwrap(), "Scope Item 1");
938        assert_eq!(item3.get("value").unwrap().as_i64().unwrap(), 300);
939        assert_eq!(
940            item3.get("body").unwrap().as_str().unwrap(),
941            "\nScope item 1 body\n\n"
942        );
943
944        let item4 = items[3].as_object().unwrap();
945        assert_eq!(item4.get("name").unwrap().as_str().unwrap(), "Scope Item 2");
946        assert_eq!(item4.get("value").unwrap().as_i64().unwrap(), 400);
947        assert_eq!(
948            item4.get("body").unwrap().as_str().unwrap(),
949            "\nScope item 2 body"
950        );
951    }
952
953    #[test]
954    fn test_empty_global_array_with_scope() {
955        // Edge case: global frontmatter has an empty array
956        let markdown = r#"---
957items: []
958---
959
960Global body
961
962---
963SCOPE: items
964name: Item 1
965---
966
967Item 1 body"#;
968
969        let doc = decompose(markdown).unwrap();
970
971        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
972        assert_eq!(items.len(), 1);
973
974        let item = items[0].as_object().unwrap();
975        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
976    }
977
978    #[test]
979    fn test_reserved_field_name() {
980        let markdown = r#"---
981SCOPE: body
982content: Test
983---"#;
984
985        let result = decompose(markdown);
986        assert!(result.is_err());
987        assert!(result.unwrap_err().to_string().contains("reserved"));
988    }
989
990    #[test]
991    fn test_invalid_tag_syntax() {
992        let markdown = r#"---
993SCOPE: Invalid-Name
994title: Test
995---"#;
996
997        let result = decompose(markdown);
998        assert!(result.is_err());
999        assert!(result
1000            .unwrap_err()
1001            .to_string()
1002            .contains("Invalid field name"));
1003    }
1004
1005    #[test]
1006    fn test_multiple_global_frontmatter_blocks() {
1007        let markdown = r#"---
1008title: First
1009---
1010
1011Body
1012
1013---
1014author: Second
1015---
1016
1017More body"#;
1018
1019        let result = decompose(markdown);
1020        assert!(result.is_err());
1021        assert!(result
1022            .unwrap_err()
1023            .to_string()
1024            .contains("Multiple global frontmatter"));
1025    }
1026
1027    #[test]
1028    fn test_adjacent_blocks_different_tags() {
1029        let markdown = r#"---
1030SCOPE: items
1031name: Item 1
1032---
1033
1034Item 1 body
1035
1036---
1037SCOPE: sections
1038title: Section 1
1039---
1040
1041Section 1 body"#;
1042
1043        let doc = decompose(markdown).unwrap();
1044
1045        assert!(doc.get_field("items").is_some());
1046        assert!(doc.get_field("sections").is_some());
1047
1048        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1049        assert_eq!(items.len(), 1);
1050
1051        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1052        assert_eq!(sections.len(), 1);
1053    }
1054
1055    #[test]
1056    fn test_order_preservation() {
1057        let markdown = r#"---
1058SCOPE: items
1059id: 1
1060---
1061
1062First
1063
1064---
1065SCOPE: items
1066id: 2
1067---
1068
1069Second
1070
1071---
1072SCOPE: items
1073id: 3
1074---
1075
1076Third"#;
1077
1078        let doc = decompose(markdown).unwrap();
1079
1080        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1081        assert_eq!(items.len(), 3);
1082
1083        for (i, item) in items.iter().enumerate() {
1084            let mapping = item.as_object().unwrap();
1085            let id = mapping.get("id").unwrap().as_i64().unwrap();
1086            assert_eq!(id, (i + 1) as i64);
1087        }
1088    }
1089
1090    #[test]
1091    fn test_product_catalog_integration() {
1092        let markdown = r#"---
1093title: Product Catalog
1094author: John Doe
1095date: 2024-01-01
1096---
1097
1098This is the main catalog description.
1099
1100---
1101SCOPE: products
1102name: Widget A
1103price: 19.99
1104sku: WID-001
1105---
1106
1107The **Widget A** is our most popular product.
1108
1109---
1110SCOPE: products
1111name: Gadget B
1112price: 29.99
1113sku: GAD-002
1114---
1115
1116The **Gadget B** is perfect for professionals.
1117
1118---
1119SCOPE: reviews
1120product: Widget A
1121rating: 5
1122---
1123
1124"Excellent product! Highly recommended."
1125
1126---
1127SCOPE: reviews
1128product: Gadget B
1129rating: 4
1130---
1131
1132"Very good, but a bit pricey.""#;
1133
1134        let doc = decompose(markdown).unwrap();
1135
1136        // Verify global fields
1137        assert_eq!(
1138            doc.get_field("title").unwrap().as_str().unwrap(),
1139            "Product Catalog"
1140        );
1141        assert_eq!(
1142            doc.get_field("author").unwrap().as_str().unwrap(),
1143            "John Doe"
1144        );
1145        assert_eq!(
1146            doc.get_field("date").unwrap().as_str().unwrap(),
1147            "2024-01-01"
1148        );
1149
1150        // Verify global body
1151        assert!(doc.body().unwrap().contains("main catalog description"));
1152
1153        // Verify products collection
1154        let products = doc.get_field("products").unwrap().as_sequence().unwrap();
1155        assert_eq!(products.len(), 2);
1156
1157        let product1 = products[0].as_object().unwrap();
1158        assert_eq!(product1.get("name").unwrap().as_str().unwrap(), "Widget A");
1159        assert_eq!(product1.get("price").unwrap().as_f64().unwrap(), 19.99);
1160
1161        // Verify reviews collection
1162        let reviews = doc.get_field("reviews").unwrap().as_sequence().unwrap();
1163        assert_eq!(reviews.len(), 2);
1164
1165        let review1 = reviews[0].as_object().unwrap();
1166        assert_eq!(
1167            review1.get("product").unwrap().as_str().unwrap(),
1168            "Widget A"
1169        );
1170        assert_eq!(review1.get("rating").unwrap().as_i64().unwrap(), 5);
1171
1172        // Total fields: title, author, date, body, products, reviews = 6
1173        assert_eq!(doc.fields().len(), 6);
1174    }
1175
1176    #[test]
1177    fn taro_quill_directive() {
1178        let markdown = r#"---
1179QUILL: usaf_memo
1180memo_for: [ORG/SYMBOL]
1181memo_from: [ORG/SYMBOL]
1182---
1183
1184This is the memo body."#;
1185
1186        let doc = decompose(markdown).unwrap();
1187
1188        // Verify quill tag is set
1189        assert_eq!(doc.quill_tag(), Some("usaf_memo"));
1190
1191        // Verify fields from quill block become frontmatter
1192        assert_eq!(
1193            doc.get_field("memo_for").unwrap().as_sequence().unwrap()[0]
1194                .as_str()
1195                .unwrap(),
1196            "ORG/SYMBOL"
1197        );
1198
1199        // Verify body
1200        assert_eq!(doc.body(), Some("\nThis is the memo body."));
1201    }
1202
1203    #[test]
1204    fn test_quill_with_scope_blocks() {
1205        let markdown = r#"---
1206QUILL: document
1207title: Test Document
1208---
1209
1210Main body.
1211
1212---
1213SCOPE: sections
1214name: Section 1
1215---
1216
1217Section 1 body."#;
1218
1219        let doc = decompose(markdown).unwrap();
1220
1221        // Verify quill tag
1222        assert_eq!(doc.quill_tag(), Some("document"));
1223
1224        // Verify global field from quill block
1225        assert_eq!(
1226            doc.get_field("title").unwrap().as_str().unwrap(),
1227            "Test Document"
1228        );
1229
1230        // Verify scope blocks work
1231        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1232        assert_eq!(sections.len(), 1);
1233
1234        // Verify body
1235        assert_eq!(doc.body(), Some("\nMain body.\n\n"));
1236    }
1237
1238    #[test]
1239    fn test_multiple_quill_directives_error() {
1240        let markdown = r#"---
1241QUILL: first
1242---
1243
1244---
1245QUILL: second
1246---"#;
1247
1248        let result = decompose(markdown);
1249        assert!(result.is_err());
1250        assert!(result
1251            .unwrap_err()
1252            .to_string()
1253            .contains("Multiple quill directives"));
1254    }
1255
1256    #[test]
1257    fn test_invalid_quill_name() {
1258        let markdown = r#"---
1259QUILL: Invalid-Name
1260---"#;
1261
1262        let result = decompose(markdown);
1263        assert!(result.is_err());
1264        assert!(result
1265            .unwrap_err()
1266            .to_string()
1267            .contains("Invalid quill name"));
1268    }
1269
1270    #[test]
1271    fn test_quill_wrong_value_type() {
1272        let markdown = r#"---
1273QUILL: 123
1274---"#;
1275
1276        let result = decompose(markdown);
1277        assert!(result.is_err());
1278        assert!(result
1279            .unwrap_err()
1280            .to_string()
1281            .contains("QUILL value must be a string"));
1282    }
1283
1284    #[test]
1285    fn test_scope_wrong_value_type() {
1286        let markdown = r#"---
1287SCOPE: 123
1288---"#;
1289
1290        let result = decompose(markdown);
1291        assert!(result.is_err());
1292        assert!(result
1293            .unwrap_err()
1294            .to_string()
1295            .contains("SCOPE value must be a string"));
1296    }
1297
1298    #[test]
1299    fn test_both_quill_and_scope_error() {
1300        let markdown = r#"---
1301QUILL: test
1302SCOPE: items
1303---"#;
1304
1305        let result = decompose(markdown);
1306        assert!(result.is_err());
1307        assert!(result
1308            .unwrap_err()
1309            .to_string()
1310            .contains("Cannot specify both QUILL and SCOPE"));
1311    }
1312}
#[cfg(test)]
mod demo_file_test {
    //! Tests that run `decompose` against the bundled extended-metadata demo
    //! fixture, plus checks of the input-size and YAML-size limits.

    use super::*;

    /// The demo fixture parses and exposes its global fields, its body, and
    /// the `features` / `use_cases` collections.
    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        // Verify global fields
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Extended Metadata Demo"
        );
        assert_eq!(
            doc.get_field("author").unwrap().as_str().unwrap(),
            "Quillmark Team"
        );
        // version is parsed as a number by YAML
        assert_eq!(doc.get_field("version").unwrap().as_f64().unwrap(), 1.0);

        // Verify body
        assert!(doc
            .body()
            .unwrap()
            .contains("extended YAML metadata standard"));

        // Verify features collection
        let features = doc.get_field("features").unwrap().as_sequence().unwrap();
        assert_eq!(features.len(), 3);

        // Verify use_cases collection
        let use_cases = doc.get_field("use_cases").unwrap().as_sequence().unwrap();
        assert_eq!(use_cases.len(), 2);

        // Check first feature
        let feature1 = features[0].as_object().unwrap();
        assert_eq!(
            feature1.get("name").unwrap().as_str().unwrap(),
            "Tag Directives"
        );
    }

    /// Input one byte longer than `MAX_INPUT_SIZE` is rejected with an
    /// "Input too large" error.
    #[test]
    fn test_input_size_limit() {
        // Create markdown one byte over MAX_INPUT_SIZE
        let size = crate::error::MAX_INPUT_SIZE + 1;
        let large_markdown = "a".repeat(size);

        let result = decompose(&large_markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("Input too large"));
    }

    /// A frontmatter block whose single field value alone exceeds
    /// `MAX_YAML_SIZE` is rejected with a "YAML block too large" error.
    #[test]
    fn test_yaml_size_limit() {
        // Create YAML block larger than MAX_YAML_SIZE
        let mut markdown = String::from("---\n");

        // Create a very large YAML field
        let size = crate::error::MAX_YAML_SIZE + 1;
        markdown.push_str("data: \"");
        markdown.push_str(&"x".repeat(size));
        markdown.push_str("\"\n---\n\nBody");

        let result = decompose(&markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("YAML block too large"));
    }

    /// A small document with a 1000-byte body parses without error.
    #[test]
    fn test_input_within_size_limit() {
        // Deliberately small body (1000 bytes); this is a sanity check that
        // ordinary input is accepted, not a boundary test of the limit.
        let size = 1000;
        let markdown = format!("---\ntitle: Test\n---\n\n{}", "a".repeat(size));

        let result = decompose(&markdown);
        assert!(result.is_ok());
    }

    /// A document with a short frontmatter block parses without error.
    #[test]
    fn test_yaml_within_size_limit() {
        // Create YAML block well within the limit
        let markdown = "---\ntitle: Test\nauthor: John Doe\n---\n\nBody content";

        // `markdown` is already a `&str`; pass it directly instead of `&&str`.
        let result = decompose(markdown);
        assert!(result.is_ok());
    }
}