quillmark_core/
parse.rs

1//! # Parsing Module
2//!
3//! Parsing functionality for markdown documents with YAML frontmatter.
4//!
5//! ## Overview
6//!
7//! The `parse` module provides the [`decompose`] function for parsing markdown documents
8//! and the [`ParsedDocument`] type for accessing parsed content.
9//!
10//! ## Key Types
11//!
12//! - [`ParsedDocument`]: Container for parsed frontmatter fields and body content
13//! - [`BODY_FIELD`]: Constant for the field name storing document body
14//!
15//! ## Examples
16//!
17//! ### Basic Parsing
18//!
19//! ```
20//! use quillmark_core::decompose;
21//!
22//! let markdown = r#"---
23//! title: My Document
24//! author: John Doe
25//! ---
26//!
27//! # Introduction
28//!
29//! Document content here.
30//! "#;
31//!
32//! let doc = decompose(markdown).unwrap();
33//! let title = doc.get_field("title")
34//!     .and_then(|v| v.as_str())
35//!     .unwrap_or("Untitled");
36//! ```
37//!
38//! ### Extended Metadata with Tags
39//!
40//! ```
41//! use quillmark_core::decompose;
42//!
43//! let markdown = r#"---
44//! catalog_title: Product Catalog
45//! ---
46//!
47//! # Products
48//!
49//! ---
50//! SCOPE: products
51//! name: Widget
52//! price: 19.99
53//! ---
54//!
55//! A versatile widget for all occasions.
56//! "#;
57//!
58//! let doc = decompose(markdown).unwrap();
59//!
60//! // Access tagged collections
61//! if let Some(products) = doc.get_field("products")
62//!     .and_then(|v| v.as_sequence())
63//! {
64//!     for product in products {
65//!         let name = product.get("name").and_then(|v| v.as_str()).unwrap();
66//!         let price = product.get("price").and_then(|v| v.as_f64()).unwrap();
67//!         println!("{}: ${}", name, price);
68//!     }
69//! }
70//! ```
71//!
72//! ## Error Handling
73//!
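//! The [`decompose`] function returns errors for:
//! - Input larger than `MAX_INPUT_SIZE`, or a single YAML block larger than `MAX_YAML_SIZE`
//! - Malformed YAML syntax
//! - Unclosed frontmatter blocks
//! - Multiple global frontmatter blocks
//! - Multiple QUILL directives
//! - Both QUILL and SCOPE specified in the same block
//! - QUILL or SCOPE values that are not strings matching `[a-z_][a-z0-9_]*`
//! - Reserved field name usage (`SCOPE: body`)
//! - Name collisions (a SCOPE name that matches a non-array field, or a QUILL-block
//!   field that matches an already-defined field)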
81//!
82//! See [PARSE.md](https://github.com/nibsbin/quillmark/blob/main/designs/PARSE.md) for comprehensive documentation of the Extended YAML Metadata Standard.
83
84use std::collections::HashMap;
85
86use crate::value::QuillValue;
87
/// The field name used to store the document body.
///
/// [`decompose`] stores the global body under this key in the top-level field
/// map and stores each SCOPE-tagged item's body under the same key inside that
/// item. Because the name is reserved, `SCOPE: body` is rejected.
pub const BODY_FIELD: &str = "body";
90
/// Reserved tag name for quill specification.
///
/// NOTE(review): the block scanner in this file recognises the directive by the
/// literal uppercase key `QUILL` and does not read this constant; confirm
/// where this lowercase name is consumed before relying on it.
pub const QUILL_TAG: &str = "quill";
93
/// A parsed markdown document with frontmatter.
///
/// Holds the fields extracted from the document together with the optional
/// quill name. Instances are normally produced by [`decompose`] or
/// [`ParsedDocument::from_markdown`].
#[derive(Debug, Clone)]
pub struct ParsedDocument {
    /// Parsed fields keyed by name; the body, when present, lives under [`BODY_FIELD`].
    fields: HashMap<String, QuillValue>,
    /// Quill name taken from a `QUILL` key, if one was specified.
    quill_tag: Option<String>,
}
100
101impl ParsedDocument {
102    /// Create a new ParsedDocument with the given fields
103    pub fn new(fields: HashMap<String, QuillValue>) -> Self {
104        Self {
105            fields,
106            quill_tag: None,
107        }
108    }
109
110    /// Create a ParsedDocument from fields and optional quill tag
111    pub fn with_quill_tag(fields: HashMap<String, QuillValue>, quill_tag: Option<String>) -> Self {
112        Self { fields, quill_tag }
113    }
114
115    /// Create a ParsedDocument from markdown string
116    pub fn from_markdown(markdown: &str) -> Result<Self, crate::error::ParseError> {
117        decompose(markdown).map_err(|e| crate::error::ParseError::from(e))
118    }
119
120    /// Get the quill tag if specified (from QUILL key)
121    pub fn quill_tag(&self) -> Option<&str> {
122        self.quill_tag.as_deref()
123    }
124
125    /// Get the document body
126    pub fn body(&self) -> Option<&str> {
127        self.fields.get(BODY_FIELD).and_then(|v| v.as_str())
128    }
129
130    /// Get a specific field
131    pub fn get_field(&self, name: &str) -> Option<&QuillValue> {
132        self.fields.get(name)
133    }
134
135    /// Get all fields (including body)
136    pub fn fields(&self) -> &HashMap<String, QuillValue> {
137        &self.fields
138    }
139}
140
/// One `---`-delimited metadata block located while scanning a document.
#[derive(Debug)]
struct MetadataBlock {
    /// Byte offset of the opening "---".
    start: usize,
    /// Byte offset just past the closing delimiter (including its trailing
    /// line ending when there is one).
    end: usize,
    /// YAML between the delimiters. When a QUILL or SCOPE key was present the
    /// mapping is re-serialized without that key; otherwise this is the raw text.
    yaml_content: String,
    /// Field name from SCOPE key.
    tag: Option<String>,
    /// Quill name from QUILL key.
    quill_name: Option<String>,
}
149
/// Returns `true` when `name` matches the pattern `[a-z_][a-z0-9_]*`.
///
/// The first character must be an ASCII lowercase letter or an underscore;
/// every following character may additionally be an ASCII digit. The empty
/// string never matches.
fn is_valid_tag_name(name: &str) -> bool {
    let mut remaining = name.chars();

    match remaining.next() {
        Some(head) if head.is_ascii_lowercase() || head == '_' => {
            remaining.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
        }
        // Empty input, or a leading character outside `[a-z_]`.
        _ => false,
    }
}
171
172/// Find all metadata blocks in the document
173fn find_metadata_blocks(
174    markdown: &str,
175) -> Result<Vec<MetadataBlock>, Box<dyn std::error::Error + Send + Sync>> {
176    let mut blocks = Vec::new();
177    let mut pos = 0;
178
179    while pos < markdown.len() {
180        // Look for opening "---\n" or "---\r\n"
181        let search_str = &markdown[pos..];
182        let delimiter_result = if let Some(p) = search_str.find("---\n") {
183            Some((p, 4, "\n"))
184        } else if let Some(p) = search_str.find("---\r\n") {
185            Some((p, 5, "\r\n"))
186        } else {
187            None
188        };
189
190        if let Some((delimiter_pos, delimiter_len, _line_ending)) = delimiter_result {
191            let abs_pos = pos + delimiter_pos;
192            let content_start = abs_pos + delimiter_len; // After "---\n" or "---\r\n"
193
194            // Check if opening --- is followed by a blank line (horizontal rule, not metadata)
195            let followed_by_blank = if content_start < markdown.len() {
196                markdown[content_start..].starts_with('\n')
197                    || markdown[content_start..].starts_with("\r\n")
198            } else {
199                false
200            };
201
202            if followed_by_blank {
203                // This is a horizontal rule in the body, skip it
204                pos = abs_pos + 3; // Skip past "---"
205                continue;
206            }
207
208            // Found potential metadata block opening
209            // Look for closing "\n---\n" or "\r\n---\r\n" etc., OR "\n---" / "\r\n---" at end of document
210            let rest = &markdown[content_start..];
211
212            // First try to find delimiters with trailing newlines
213            let closing_patterns = ["\n---\n", "\r\n---\r\n", "\n---\r\n", "\r\n---\n"];
214            let closing_with_newline = closing_patterns
215                .iter()
216                .filter_map(|delim| rest.find(delim).map(|p| (p, delim.len())))
217                .min_by_key(|(p, _)| *p);
218
219            // Also check for closing at end of document (no trailing newline)
220            let closing_at_eof = ["\n---", "\r\n---"]
221                .iter()
222                .filter_map(|delim| {
223                    rest.find(delim).and_then(|p| {
224                        if p + delim.len() == rest.len() {
225                            Some((p, delim.len()))
226                        } else {
227                            None
228                        }
229                    })
230                })
231                .min_by_key(|(p, _)| *p);
232
233            let closing_result = match (closing_with_newline, closing_at_eof) {
234                (Some((p1, _l1)), Some((p2, _))) if p2 < p1 => closing_at_eof,
235                (Some(_), Some(_)) => closing_with_newline,
236                (Some(_), None) => closing_with_newline,
237                (None, Some(_)) => closing_at_eof,
238                (None, None) => None,
239            };
240
241            if let Some((closing_pos, closing_len)) = closing_result {
242                let abs_closing_pos = content_start + closing_pos;
243                let content = &markdown[content_start..abs_closing_pos];
244
245                // Check YAML size limit
246                if content.len() > crate::error::MAX_YAML_SIZE {
247                    return Err(format!(
248                        "YAML block too large: {} bytes (max: {} bytes)",
249                        content.len(),
250                        crate::error::MAX_YAML_SIZE
251                    )
252                    .into());
253                }
254
255                // Check if the block is contiguous (no blank lines in the YAML content)
256                if content.contains("\n\n") || content.contains("\r\n\r\n") {
257                    // Not a contiguous block
258                    if abs_pos == 0 {
259                        // Started at beginning but has blank lines - this is an error
260                        return Err("Frontmatter started but not closed with ---".into());
261                    }
262                    // Otherwise treat as horizontal rule in body
263                    pos = abs_pos + 3;
264                    continue;
265                }
266
267                // Parse YAML content to check for reserved keys (QUILL, SCOPE)
268                // First, try to parse as YAML
269                let (tag, quill_name, yaml_content) = if !content.is_empty() {
270                    // Try to parse the YAML to check for reserved keys
271                    match serde_yaml::from_str::<serde_yaml::Value>(content) {
272                        Ok(yaml_value) => {
273                            if let Some(mapping) = yaml_value.as_mapping() {
274                                let quill_key = serde_yaml::Value::String("QUILL".to_string());
275                                let scope_key = serde_yaml::Value::String("SCOPE".to_string());
276
277                                let has_quill = mapping.contains_key(&quill_key);
278                                let has_scope = mapping.contains_key(&scope_key);
279
280                                if has_quill && has_scope {
281                                    return Err(
282                                        "Cannot specify both QUILL and SCOPE in the same block"
283                                            .into(),
284                                    );
285                                }
286
287                                if has_quill {
288                                    // Extract quill name
289                                    let quill_value = mapping.get(&quill_key).unwrap();
290                                    let quill_name_str = quill_value
291                                        .as_str()
292                                        .ok_or_else(|| "QUILL value must be a string")?;
293
294                                    if !is_valid_tag_name(quill_name_str) {
295                                        return Err(format!(
296                                            "Invalid quill name '{}': must match pattern [a-z_][a-z0-9_]*",
297                                            quill_name_str
298                                        )
299                                        .into());
300                                    }
301
302                                    // Remove QUILL from the YAML content for processing
303                                    let mut new_mapping = mapping.clone();
304                                    new_mapping.remove(&quill_key);
305                                    let new_yaml = serde_yaml::to_string(&new_mapping)
306                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
307
308                                    (None, Some(quill_name_str.to_string()), new_yaml)
309                                } else if has_scope {
310                                    // Extract scope field name
311                                    let scope_value = mapping.get(&scope_key).unwrap();
312                                    let field_name = scope_value
313                                        .as_str()
314                                        .ok_or_else(|| "SCOPE value must be a string")?;
315
316                                    if !is_valid_tag_name(field_name) {
317                                        return Err(format!(
318                                            "Invalid field name '{}': must match pattern [a-z_][a-z0-9_]*",
319                                            field_name
320                                        )
321                                        .into());
322                                    }
323
324                                    if field_name == BODY_FIELD {
325                                        return Err(format!(
326                                            "Cannot use reserved field name '{}' as SCOPE value",
327                                            BODY_FIELD
328                                        )
329                                        .into());
330                                    }
331
332                                    // Remove SCOPE from the YAML content for processing
333                                    let mut new_mapping = mapping.clone();
334                                    new_mapping.remove(&scope_key);
335                                    let new_yaml = serde_yaml::to_string(&new_mapping)
336                                        .map_err(|e| format!("Failed to serialize YAML: {}", e))?;
337
338                                    (Some(field_name.to_string()), None, new_yaml)
339                                } else {
340                                    // No reserved keys, treat as normal YAML
341                                    (None, None, content.to_string())
342                                }
343                            } else {
344                                // Not a mapping, treat as normal YAML
345                                (None, None, content.to_string())
346                            }
347                        }
348                        Err(_) => {
349                            // If YAML parsing fails here, we'll catch it later
350                            (None, None, content.to_string())
351                        }
352                    }
353                } else {
354                    (None, None, content.to_string())
355                };
356
357                blocks.push(MetadataBlock {
358                    start: abs_pos,
359                    end: abs_closing_pos + closing_len, // After closing delimiter
360                    yaml_content,
361                    tag,
362                    quill_name,
363                });
364
365                pos = abs_closing_pos + closing_len;
366            } else if abs_pos == 0 {
367                // Frontmatter started but not closed
368                return Err("Frontmatter started but not closed with ---".into());
369            } else {
370                // Not a valid metadata block, skip this position
371                pos = abs_pos + 3;
372            }
373        } else {
374            break;
375        }
376    }
377
378    Ok(blocks)
379}
380
381/// Decompose markdown into frontmatter fields and body
382pub fn decompose(
383    markdown: &str,
384) -> Result<ParsedDocument, Box<dyn std::error::Error + Send + Sync>> {
385    // Check input size limit
386    if markdown.len() > crate::error::MAX_INPUT_SIZE {
387        return Err(format!(
388            "Input too large: {} bytes (max: {} bytes)",
389            markdown.len(),
390            crate::error::MAX_INPUT_SIZE
391        )
392        .into());
393    }
394
395    let mut fields = HashMap::new();
396
397    // Find all metadata blocks
398    let blocks = find_metadata_blocks(markdown)?;
399
400    if blocks.is_empty() {
401        // No metadata blocks, entire content is body
402        fields.insert(
403            BODY_FIELD.to_string(),
404            QuillValue::from_json(serde_json::Value::String(markdown.to_string())),
405        );
406        return Ok(ParsedDocument::new(fields));
407    }
408
409    // Track which attributes are used for tagged blocks
410    let mut tagged_attributes: HashMap<String, Vec<serde_yaml::Value>> = HashMap::new();
411    let mut has_global_frontmatter = false;
412    let mut global_frontmatter_index: Option<usize> = None;
413    let mut quill_name: Option<String> = None;
414
415    // First pass: identify global frontmatter, quill directive, and validate
416    for (idx, block) in blocks.iter().enumerate() {
417        // Check for quill directive
418        if let Some(ref name) = block.quill_name {
419            if quill_name.is_some() {
420                return Err("Multiple quill directives found: only one allowed".into());
421            }
422            quill_name = Some(name.clone());
423        }
424
425        // Check for global frontmatter (no tag and no quill directive)
426        if block.tag.is_none() && block.quill_name.is_none() {
427            if has_global_frontmatter {
428                return Err(
429                    "Multiple global frontmatter blocks found: only one untagged block allowed"
430                        .into(),
431                );
432            }
433            has_global_frontmatter = true;
434            global_frontmatter_index = Some(idx);
435        }
436    }
437
438    // Parse global frontmatter if present
439    if let Some(idx) = global_frontmatter_index {
440        let block = &blocks[idx];
441
442        // Parse YAML frontmatter
443        let yaml_fields: HashMap<String, serde_yaml::Value> = if block.yaml_content.is_empty() {
444            HashMap::new()
445        } else {
446            serde_yaml::from_str(&block.yaml_content)
447                .map_err(|e| format!("Invalid YAML frontmatter: {}", e))?
448        };
449
450        // Check that all tagged blocks don't conflict with global fields
451        // Exception: if the global field is an array, allow it (we'll merge later)
452        for other_block in &blocks {
453            if let Some(ref tag) = other_block.tag {
454                if let Some(global_value) = yaml_fields.get(tag) {
455                    // Check if the global value is an array
456                    if global_value.as_sequence().is_none() {
457                        return Err(format!(
458                            "Name collision: global field '{}' conflicts with tagged attribute",
459                            tag
460                        )
461                        .into());
462                    }
463                }
464            }
465        }
466
467        // Convert YAML values to QuillValue at boundary
468        for (key, value) in yaml_fields {
469            fields.insert(key, QuillValue::from_yaml(value)?);
470        }
471    }
472
473    // Process blocks with quill directives
474    for block in &blocks {
475        if block.quill_name.is_some() {
476            // Quill directive blocks can have YAML content (becomes part of frontmatter)
477            if !block.yaml_content.is_empty() {
478                let yaml_fields: HashMap<String, serde_yaml::Value> =
479                    serde_yaml::from_str(&block.yaml_content)
480                        .map_err(|e| format!("Invalid YAML in quill block: {}", e))?;
481
482                // Check for conflicts with existing fields
483                for key in yaml_fields.keys() {
484                    if fields.contains_key(key) {
485                        return Err(format!(
486                            "Name collision: quill block field '{}' conflicts with existing field",
487                            key
488                        )
489                        .into());
490                    }
491                }
492
493                // Convert YAML values to QuillValue at boundary
494                for (key, value) in yaml_fields {
495                    fields.insert(key, QuillValue::from_yaml(value)?);
496                }
497            }
498        }
499    }
500
501    // Parse tagged blocks
502    for (idx, block) in blocks.iter().enumerate() {
503        if let Some(ref tag_name) = block.tag {
504            // Check if this conflicts with global fields
505            // Exception: if the global field is an array, allow it (we'll merge later)
506            if let Some(existing_value) = fields.get(tag_name) {
507                if existing_value.as_array().is_none() {
508                    return Err(format!(
509                        "Name collision: tagged attribute '{}' conflicts with global field",
510                        tag_name
511                    )
512                    .into());
513                }
514            }
515
516            // Parse YAML metadata
517            let mut item_fields: HashMap<String, serde_yaml::Value> =
518                if block.yaml_content.is_empty() {
519                    HashMap::new()
520                } else {
521                    serde_yaml::from_str(&block.yaml_content).map_err(|e| {
522                        format!("Invalid YAML in tagged block '{}': {}", tag_name, e)
523                    })?
524                };
525
526            // Extract body for this tagged block
527            let body_start = block.end;
528            let body_end = if idx + 1 < blocks.len() {
529                blocks[idx + 1].start
530            } else {
531                markdown.len()
532            };
533            let body = &markdown[body_start..body_end];
534
535            // Add body to item fields
536            item_fields.insert(
537                BODY_FIELD.to_string(),
538                serde_yaml::Value::String(body.to_string()),
539            );
540
541            // Convert HashMap to serde_yaml::Value::Mapping
542            let item_value = serde_yaml::to_value(item_fields)?;
543
544            // Add to collection
545            tagged_attributes
546                .entry(tag_name.clone())
547                .or_insert_with(Vec::new)
548                .push(item_value);
549        }
550    }
551
552    // Extract global body
553    // Body starts after global frontmatter or quill block (whichever comes first)
554    // Body ends at the first scope block or EOF
555    let first_non_scope_block_idx = blocks
556        .iter()
557        .position(|b| b.tag.is_none() && b.quill_name.is_none())
558        .or_else(|| blocks.iter().position(|b| b.quill_name.is_some()));
559
560    let (body_start, body_end) = if let Some(idx) = first_non_scope_block_idx {
561        // Body starts after the first non-scope block (global frontmatter or quill)
562        let start = blocks[idx].end;
563
564        // Body ends at the first scope block after this, or EOF
565        let end = blocks
566            .iter()
567            .skip(idx + 1)
568            .find(|b| b.tag.is_some())
569            .map(|b| b.start)
570            .unwrap_or(markdown.len());
571
572        (start, end)
573    } else {
574        // No global frontmatter or quill block - body is everything before the first scope block
575        let end = blocks
576            .iter()
577            .find(|b| b.tag.is_some())
578            .map(|b| b.start)
579            .unwrap_or(0);
580
581        (0, end)
582    };
583
584    let global_body = &markdown[body_start..body_end];
585
586    fields.insert(
587        BODY_FIELD.to_string(),
588        QuillValue::from_json(serde_json::Value::String(global_body.to_string())),
589    );
590
591    // Add all tagged collections to fields (convert to QuillValue)
592    // If a field already exists and is an array, merge the new items into it
593    for (tag_name, items) in tagged_attributes {
594        if let Some(existing_value) = fields.get(&tag_name) {
595            // The existing value must be an array (checked earlier)
596            if let Some(existing_array) = existing_value.as_array() {
597                // Convert new items from YAML to JSON
598                let new_items_json: Vec<serde_json::Value> = items
599                    .into_iter()
600                    .map(|yaml_val| {
601                        serde_json::to_value(&yaml_val)
602                            .map_err(|e| format!("Failed to convert YAML to JSON: {}", e))
603                    })
604                    .collect::<Result<Vec<_>, _>>()?;
605
606                // Combine existing and new items
607                let mut merged_array = existing_array.clone();
608                merged_array.extend(new_items_json);
609
610                // Create QuillValue from merged JSON array
611                let quill_value = QuillValue::from_json(serde_json::Value::Array(merged_array));
612                fields.insert(tag_name, quill_value);
613            } else {
614                // This should not happen due to earlier validation, but handle it gracefully
615                return Err(format!(
616                    "Internal error: field '{}' exists but is not an array",
617                    tag_name
618                )
619                .into());
620            }
621        } else {
622            // No existing field, just create a new sequence
623            let quill_value = QuillValue::from_yaml(serde_yaml::Value::Sequence(items))?;
624            fields.insert(tag_name, quill_value);
625        }
626    }
627
628    let mut parsed = ParsedDocument::new(fields);
629
630    // Set quill tag if present
631    if let Some(name) = quill_name {
632        parsed.quill_tag = Some(name);
633    }
634
635    Ok(parsed)
636}
637
638#[cfg(test)]
639mod tests {
640    use super::*;
641
642    #[test]
643    fn test_no_frontmatter() {
644        let markdown = "# Hello World\n\nThis is a test.";
645        let doc = decompose(markdown).unwrap();
646
647        assert_eq!(doc.body(), Some(markdown));
648        assert_eq!(doc.fields().len(), 1);
649    }
650
651    #[test]
652    fn test_with_frontmatter() {
653        let markdown = r#"---
654title: Test Document
655author: Test Author
656---
657
658# Hello World
659
660This is the body."#;
661
662        let doc = decompose(markdown).unwrap();
663
664        assert_eq!(doc.body(), Some("\n# Hello World\n\nThis is the body."));
665        assert_eq!(
666            doc.get_field("title").unwrap().as_str().unwrap(),
667            "Test Document"
668        );
669        assert_eq!(
670            doc.get_field("author").unwrap().as_str().unwrap(),
671            "Test Author"
672        );
673        assert_eq!(doc.fields().len(), 3); // title, author, body
674    }
675
676    #[test]
677    fn test_complex_yaml_frontmatter() {
678        let markdown = r#"---
679title: Complex Document
680tags:
681  - test
682  - yaml
683metadata:
684  version: 1.0
685  nested:
686    field: value
687---
688
689Content here."#;
690
691        let doc = decompose(markdown).unwrap();
692
693        assert_eq!(doc.body(), Some("\nContent here."));
694        assert_eq!(
695            doc.get_field("title").unwrap().as_str().unwrap(),
696            "Complex Document"
697        );
698
699        let tags = doc.get_field("tags").unwrap().as_sequence().unwrap();
700        assert_eq!(tags.len(), 2);
701        assert_eq!(tags[0].as_str().unwrap(), "test");
702        assert_eq!(tags[1].as_str().unwrap(), "yaml");
703    }
704
705    #[test]
706    fn test_invalid_yaml() {
707        let markdown = r#"---
708title: [invalid yaml
709author: missing close bracket
710---
711
712Content here."#;
713
714        let result = decompose(markdown);
715        assert!(result.is_err());
716        assert!(result
717            .unwrap_err()
718            .to_string()
719            .contains("Invalid YAML frontmatter"));
720    }
721
722    #[test]
723    fn test_unclosed_frontmatter() {
724        let markdown = r#"---
725title: Test
726author: Test Author
727
728Content without closing ---"#;
729
730        let result = decompose(markdown);
731        assert!(result.is_err());
732        assert!(result.unwrap_err().to_string().contains("not closed"));
733    }
734
735    // Extended metadata tests
736
737    #[test]
738    fn test_basic_tagged_block() {
739        let markdown = r#"---
740title: Main Document
741---
742
743Main body content.
744
745---
746SCOPE: items
747name: Item 1
748---
749
750Body of item 1."#;
751
752        let doc = decompose(markdown).unwrap();
753
754        assert_eq!(doc.body(), Some("\nMain body content.\n\n"));
755        assert_eq!(
756            doc.get_field("title").unwrap().as_str().unwrap(),
757            "Main Document"
758        );
759
760        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
761        assert_eq!(items.len(), 1);
762
763        let item = items[0].as_object().unwrap();
764        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
765        assert_eq!(
766            item.get("body").unwrap().as_str().unwrap(),
767            "\nBody of item 1."
768        );
769    }
770
771    #[test]
772    fn test_multiple_tagged_blocks() {
773        let markdown = r#"---
774SCOPE: items
775name: Item 1
776tags: [a, b]
777---
778
779First item body.
780
781---
782SCOPE: items
783name: Item 2
784tags: [c, d]
785---
786
787Second item body."#;
788
789        let doc = decompose(markdown).unwrap();
790
791        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
792        assert_eq!(items.len(), 2);
793
794        let item1 = items[0].as_object().unwrap();
795        assert_eq!(item1.get("name").unwrap().as_str().unwrap(), "Item 1");
796
797        let item2 = items[1].as_object().unwrap();
798        assert_eq!(item2.get("name").unwrap().as_str().unwrap(), "Item 2");
799    }
800
801    #[test]
802    fn test_mixed_global_and_tagged() {
803        let markdown = r#"---
804title: Global
805author: John Doe
806---
807
808Global body.
809
810---
811SCOPE: sections
812title: Section 1
813---
814
815Section 1 content.
816
817---
818SCOPE: sections
819title: Section 2
820---
821
822Section 2 content."#;
823
824        let doc = decompose(markdown).unwrap();
825
826        assert_eq!(doc.get_field("title").unwrap().as_str().unwrap(), "Global");
827        assert_eq!(doc.body(), Some("\nGlobal body.\n\n"));
828
829        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
830        assert_eq!(sections.len(), 2);
831    }
832
833    #[test]
834    fn test_empty_tagged_metadata() {
835        let markdown = r#"---
836SCOPE: items
837---
838
839Body without metadata."#;
840
841        let doc = decompose(markdown).unwrap();
842
843        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
844        assert_eq!(items.len(), 1);
845
846        let item = items[0].as_object().unwrap();
847        assert_eq!(
848            item.get("body").unwrap().as_str().unwrap(),
849            "\nBody without metadata."
850        );
851    }
852
853    #[test]
854    fn test_tagged_block_without_body() {
855        let markdown = r#"---
856SCOPE: items
857name: Item
858---"#;
859
860        let doc = decompose(markdown).unwrap();
861
862        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
863        assert_eq!(items.len(), 1);
864
865        let item = items[0].as_object().unwrap();
866        assert_eq!(item.get("body").unwrap().as_str().unwrap(), "");
867    }
868
869    #[test]
870    fn test_name_collision_global_and_tagged() {
871        let markdown = r#"---
872items: "global value"
873---
874
875Body
876
877---
878SCOPE: items
879name: Item
880---
881
882Item body"#;
883
884        let result = decompose(markdown);
885        assert!(result.is_err());
886        assert!(result.unwrap_err().to_string().contains("collision"));
887    }
888
889    #[test]
890    fn test_global_array_merged_with_scope() {
891        // When global frontmatter has an array field with the same name as a SCOPE,
892        // the SCOPE items should be added to the array
893        let markdown = r#"---
894items:
895  - name: Global Item 1
896    value: 100
897  - name: Global Item 2
898    value: 200
899---
900
901Global body
902
903---
904SCOPE: items
905name: Scope Item 1
906value: 300
907---
908
909Scope item 1 body
910
911---
912SCOPE: items
913name: Scope Item 2
914value: 400
915---
916
917Scope item 2 body"#;
918
919        let doc = decompose(markdown).unwrap();
920
921        // Verify the items array has all 4 items (2 from global + 2 from SCOPE)
922        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
923        assert_eq!(items.len(), 4);
924
925        // Verify first two items (from global array)
926        let item1 = items[0].as_object().unwrap();
927        assert_eq!(
928            item1.get("name").unwrap().as_str().unwrap(),
929            "Global Item 1"
930        );
931        assert_eq!(item1.get("value").unwrap().as_i64().unwrap(), 100);
932
933        let item2 = items[1].as_object().unwrap();
934        assert_eq!(
935            item2.get("name").unwrap().as_str().unwrap(),
936            "Global Item 2"
937        );
938        assert_eq!(item2.get("value").unwrap().as_i64().unwrap(), 200);
939
940        // Verify last two items (from SCOPE blocks)
941        let item3 = items[2].as_object().unwrap();
942        assert_eq!(item3.get("name").unwrap().as_str().unwrap(), "Scope Item 1");
943        assert_eq!(item3.get("value").unwrap().as_i64().unwrap(), 300);
944        assert_eq!(
945            item3.get("body").unwrap().as_str().unwrap(),
946            "\nScope item 1 body\n\n"
947        );
948
949        let item4 = items[3].as_object().unwrap();
950        assert_eq!(item4.get("name").unwrap().as_str().unwrap(), "Scope Item 2");
951        assert_eq!(item4.get("value").unwrap().as_i64().unwrap(), 400);
952        assert_eq!(
953            item4.get("body").unwrap().as_str().unwrap(),
954            "\nScope item 2 body"
955        );
956    }
957
958    #[test]
959    fn test_empty_global_array_with_scope() {
960        // Edge case: global frontmatter has an empty array
961        let markdown = r#"---
962items: []
963---
964
965Global body
966
967---
968SCOPE: items
969name: Item 1
970---
971
972Item 1 body"#;
973
974        let doc = decompose(markdown).unwrap();
975
976        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
977        assert_eq!(items.len(), 1);
978
979        let item = items[0].as_object().unwrap();
980        assert_eq!(item.get("name").unwrap().as_str().unwrap(), "Item 1");
981    }
982
983    #[test]
984    fn test_reserved_field_name() {
985        let markdown = r#"---
986SCOPE: body
987content: Test
988---"#;
989
990        let result = decompose(markdown);
991        assert!(result.is_err());
992        assert!(result.unwrap_err().to_string().contains("reserved"));
993    }
994
995    #[test]
996    fn test_invalid_tag_syntax() {
997        let markdown = r#"---
998SCOPE: Invalid-Name
999title: Test
1000---"#;
1001
1002        let result = decompose(markdown);
1003        assert!(result.is_err());
1004        assert!(result
1005            .unwrap_err()
1006            .to_string()
1007            .contains("Invalid field name"));
1008    }
1009
1010    #[test]
1011    fn test_multiple_global_frontmatter_blocks() {
1012        let markdown = r#"---
1013title: First
1014---
1015
1016Body
1017
1018---
1019author: Second
1020---
1021
1022More body"#;
1023
1024        let result = decompose(markdown);
1025        assert!(result.is_err());
1026        assert!(result
1027            .unwrap_err()
1028            .to_string()
1029            .contains("Multiple global frontmatter"));
1030    }
1031
1032    #[test]
1033    fn test_adjacent_blocks_different_tags() {
1034        let markdown = r#"---
1035SCOPE: items
1036name: Item 1
1037---
1038
1039Item 1 body
1040
1041---
1042SCOPE: sections
1043title: Section 1
1044---
1045
1046Section 1 body"#;
1047
1048        let doc = decompose(markdown).unwrap();
1049
1050        assert!(doc.get_field("items").is_some());
1051        assert!(doc.get_field("sections").is_some());
1052
1053        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1054        assert_eq!(items.len(), 1);
1055
1056        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1057        assert_eq!(sections.len(), 1);
1058    }
1059
1060    #[test]
1061    fn test_order_preservation() {
1062        let markdown = r#"---
1063SCOPE: items
1064id: 1
1065---
1066
1067First
1068
1069---
1070SCOPE: items
1071id: 2
1072---
1073
1074Second
1075
1076---
1077SCOPE: items
1078id: 3
1079---
1080
1081Third"#;
1082
1083        let doc = decompose(markdown).unwrap();
1084
1085        let items = doc.get_field("items").unwrap().as_sequence().unwrap();
1086        assert_eq!(items.len(), 3);
1087
1088        for (i, item) in items.iter().enumerate() {
1089            let mapping = item.as_object().unwrap();
1090            let id = mapping.get("id").unwrap().as_i64().unwrap();
1091            assert_eq!(id, (i + 1) as i64);
1092        }
1093    }
1094
1095    #[test]
1096    fn test_product_catalog_integration() {
1097        let markdown = r#"---
1098title: Product Catalog
1099author: John Doe
1100date: 2024-01-01
1101---
1102
1103This is the main catalog description.
1104
1105---
1106SCOPE: products
1107name: Widget A
1108price: 19.99
1109sku: WID-001
1110---
1111
1112The **Widget A** is our most popular product.
1113
1114---
1115SCOPE: products
1116name: Gadget B
1117price: 29.99
1118sku: GAD-002
1119---
1120
1121The **Gadget B** is perfect for professionals.
1122
1123---
1124SCOPE: reviews
1125product: Widget A
1126rating: 5
1127---
1128
1129"Excellent product! Highly recommended."
1130
1131---
1132SCOPE: reviews
1133product: Gadget B
1134rating: 4
1135---
1136
1137"Very good, but a bit pricey.""#;
1138
1139        let doc = decompose(markdown).unwrap();
1140
1141        // Verify global fields
1142        assert_eq!(
1143            doc.get_field("title").unwrap().as_str().unwrap(),
1144            "Product Catalog"
1145        );
1146        assert_eq!(
1147            doc.get_field("author").unwrap().as_str().unwrap(),
1148            "John Doe"
1149        );
1150        assert_eq!(
1151            doc.get_field("date").unwrap().as_str().unwrap(),
1152            "2024-01-01"
1153        );
1154
1155        // Verify global body
1156        assert!(doc.body().unwrap().contains("main catalog description"));
1157
1158        // Verify products collection
1159        let products = doc.get_field("products").unwrap().as_sequence().unwrap();
1160        assert_eq!(products.len(), 2);
1161
1162        let product1 = products[0].as_object().unwrap();
1163        assert_eq!(product1.get("name").unwrap().as_str().unwrap(), "Widget A");
1164        assert_eq!(product1.get("price").unwrap().as_f64().unwrap(), 19.99);
1165
1166        // Verify reviews collection
1167        let reviews = doc.get_field("reviews").unwrap().as_sequence().unwrap();
1168        assert_eq!(reviews.len(), 2);
1169
1170        let review1 = reviews[0].as_object().unwrap();
1171        assert_eq!(
1172            review1.get("product").unwrap().as_str().unwrap(),
1173            "Widget A"
1174        );
1175        assert_eq!(review1.get("rating").unwrap().as_i64().unwrap(), 5);
1176
1177        // Total fields: title, author, date, body, products, reviews = 6
1178        assert_eq!(doc.fields().len(), 6);
1179    }
1180
1181    #[test]
1182    fn taro_quill_directive() {
1183        let markdown = r#"---
1184QUILL: usaf_memo
1185memo_for: [ORG/SYMBOL]
1186memo_from: [ORG/SYMBOL]
1187---
1188
1189This is the memo body."#;
1190
1191        let doc = decompose(markdown).unwrap();
1192
1193        // Verify quill tag is set
1194        assert_eq!(doc.quill_tag(), Some("usaf_memo"));
1195
1196        // Verify fields from quill block become frontmatter
1197        assert_eq!(
1198            doc.get_field("memo_for").unwrap().as_sequence().unwrap()[0]
1199                .as_str()
1200                .unwrap(),
1201            "ORG/SYMBOL"
1202        );
1203
1204        // Verify body
1205        assert_eq!(doc.body(), Some("\nThis is the memo body."));
1206    }
1207
1208    #[test]
1209    fn test_quill_with_scope_blocks() {
1210        let markdown = r#"---
1211QUILL: document
1212title: Test Document
1213---
1214
1215Main body.
1216
1217---
1218SCOPE: sections
1219name: Section 1
1220---
1221
1222Section 1 body."#;
1223
1224        let doc = decompose(markdown).unwrap();
1225
1226        // Verify quill tag
1227        assert_eq!(doc.quill_tag(), Some("document"));
1228
1229        // Verify global field from quill block
1230        assert_eq!(
1231            doc.get_field("title").unwrap().as_str().unwrap(),
1232            "Test Document"
1233        );
1234
1235        // Verify scope blocks work
1236        let sections = doc.get_field("sections").unwrap().as_sequence().unwrap();
1237        assert_eq!(sections.len(), 1);
1238
1239        // Verify body
1240        assert_eq!(doc.body(), Some("\nMain body.\n\n"));
1241    }
1242
1243    #[test]
1244    fn test_multiple_quill_directives_error() {
1245        let markdown = r#"---
1246QUILL: first
1247---
1248
1249---
1250QUILL: second
1251---"#;
1252
1253        let result = decompose(markdown);
1254        assert!(result.is_err());
1255        assert!(result
1256            .unwrap_err()
1257            .to_string()
1258            .contains("Multiple quill directives"));
1259    }
1260
1261    #[test]
1262    fn test_invalid_quill_name() {
1263        let markdown = r#"---
1264QUILL: Invalid-Name
1265---"#;
1266
1267        let result = decompose(markdown);
1268        assert!(result.is_err());
1269        assert!(result
1270            .unwrap_err()
1271            .to_string()
1272            .contains("Invalid quill name"));
1273    }
1274
1275    #[test]
1276    fn test_quill_wrong_value_type() {
1277        let markdown = r#"---
1278QUILL: 123
1279---"#;
1280
1281        let result = decompose(markdown);
1282        assert!(result.is_err());
1283        assert!(result
1284            .unwrap_err()
1285            .to_string()
1286            .contains("QUILL value must be a string"));
1287    }
1288
1289    #[test]
1290    fn test_scope_wrong_value_type() {
1291        let markdown = r#"---
1292SCOPE: 123
1293---"#;
1294
1295        let result = decompose(markdown);
1296        assert!(result.is_err());
1297        assert!(result
1298            .unwrap_err()
1299            .to_string()
1300            .contains("SCOPE value must be a string"));
1301    }
1302
1303    #[test]
1304    fn test_both_quill_and_scope_error() {
1305        let markdown = r#"---
1306QUILL: test
1307SCOPE: items
1308---"#;
1309
1310        let result = decompose(markdown);
1311        assert!(result.is_err());
1312        assert!(result
1313            .unwrap_err()
1314            .to_string()
1315            .contains("Cannot specify both QUILL and SCOPE"));
1316    }
1317}
#[cfg(test)]
mod demo_file_test {
    //! Fixture-based parsing test plus size-limit checks for [`decompose`].

    use super::*;

    /// Parses the bundled `extended_metadata_demo.md` fixture and spot-checks
    /// its global fields, its body text, and its two scoped collections.
    #[test]
    fn test_extended_metadata_demo_file() {
        let markdown = include_str!("../../quillmark-fixtures/resources/extended_metadata_demo.md");
        let doc = decompose(markdown).unwrap();

        // Verify global fields
        assert_eq!(
            doc.get_field("title").unwrap().as_str().unwrap(),
            "Extended Metadata Demo"
        );
        assert_eq!(
            doc.get_field("author").unwrap().as_str().unwrap(),
            "Quillmark Team"
        );
        // version is parsed as a number by YAML
        assert_eq!(doc.get_field("version").unwrap().as_f64().unwrap(), 1.0);

        // Verify body
        assert!(doc
            .body()
            .unwrap()
            .contains("extended YAML metadata standard"));

        // Verify features collection
        let features = doc.get_field("features").unwrap().as_sequence().unwrap();
        assert_eq!(features.len(), 3);

        // Verify use_cases collection
        let use_cases = doc.get_field("use_cases").unwrap().as_sequence().unwrap();
        assert_eq!(use_cases.len(), 2);

        // Check first feature
        let feature1 = features[0].as_object().unwrap();
        assert_eq!(
            feature1.get("name").unwrap().as_str().unwrap(),
            "Tag Directives"
        );
    }

    /// Input one byte longer than `MAX_INPUT_SIZE` is rejected with an error
    /// whose message contains "Input too large".
    #[test]
    fn test_input_size_limit() {
        // Create markdown larger than MAX_INPUT_SIZE (10 MB)
        let size = crate::error::MAX_INPUT_SIZE + 1;
        let large_markdown = "a".repeat(size);

        let result = decompose(&large_markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("Input too large"));
    }

    /// A frontmatter block whose single quoted value is longer than
    /// `MAX_YAML_SIZE` is rejected with an error whose message contains
    /// "YAML block too large".
    #[test]
    fn test_yaml_size_limit() {
        // Create YAML block larger than MAX_YAML_SIZE (1 MB): one quoted
        // `data` value that alone exceeds the limit.
        let size = crate::error::MAX_YAML_SIZE + 1;
        let markdown = format!("---\ndata: \"{}\"\n---\n\nBody", "x".repeat(size));

        let result = decompose(&markdown);
        assert!(result.is_err());

        let err_msg = result.unwrap_err().to_string();
        assert!(err_msg.contains("YAML block too large"));
    }

    /// A small document (1000 bytes of body text) parses successfully.
    #[test]
    fn test_input_within_size_limit() {
        // Create markdown far below the input size limit
        let size = 1000;
        let markdown = format!("---\ntitle: Test\n---\n\n{}", "a".repeat(size));

        let result = decompose(&markdown);
        assert!(result.is_ok());
    }

    /// A small two-field frontmatter block parses successfully.
    #[test]
    fn test_yaml_within_size_limit() {
        // Create YAML block well within the limit
        let markdown = "---\ntitle: Test\nauthor: John Doe\n---\n\nBody content";

        // `markdown` is already a `&str`, so it is passed without re-borrowing.
        let result = decompose(markdown);
        assert!(result.is_ok());
    }
}