quillmark_core/
normalize.rs

1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Fix HTML comment fences to preserve trailing text
14//! - Apply all normalizations in the correct order
15//!
16//! Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
17//!
18//! ## Functions
19//!
20//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
21//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
22//! - [`normalize_fields`] - NFC-normalize frontmatter field names (field values pass through verbatim)
23//! - [`normalize_document`] - Normalize a typed [`crate::document::Document`] in-place
24//!
25//! ## Why Normalize?
26//!
27//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
28//! control characters used for bidirectional text layout. When placed adjacent to markdown
29//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
30//!
31//! ```text
32//! **bold** or <U+202D>**(1234**
33//!             ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
34//! ```
35//!
36//! These characters commonly appear when copying text from:
37//! - Web pages with mixed LTR/RTL content
38//! - PDF documents
39//! - Word processors
40//! - Some clipboard managers
41//!
42//! ## Examples
43//!
44//! ```
45//! use quillmark_core::normalize::strip_bidi_formatting;
46//!
47//! // Input with invisible U+202D (LRO) before second **
48//! let input = "**asdf** or \u{202D}**(1234**";
49//! let cleaned = strip_bidi_formatting(input);
50//! assert_eq!(cleaned, "**asdf** or **(1234**");
51//! ```
52
53use crate::document::Card;
54use crate::value::QuillValue;
55use indexmap::IndexMap;
56use unicode_normalization::UnicodeNormalization;
57
58/// Errors that can occur during normalization
59#[derive(Debug, thiserror::Error)]
60pub enum NormalizationError {
61    /// JSON nesting depth exceeded maximum allowed
62    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
63    NestingTooDeep {
64        /// Actual depth
65        depth: usize,
66        /// Maximum allowed depth
67        max: usize,
68    },
69}
70
/// Returns `true` when `c` is one of the Unicode bidirectional formatting
/// controls that this module removes from Markdown bodies.
///
/// The recognized set is:
///
/// - U+061C ARABIC LETTER MARK (ALM)
/// - U+200E..=U+200F LRM, RLM
/// - U+202A..=U+202E LRE, RLE, PDF, LRO, RLO
/// - U+2066..=U+2069 LRI, RLI, FSI, PDI
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{061C}'
            | '\u{200E}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2066}'..='\u{2069}'
    )
}
90
91/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
92///
93/// These invisible control characters are used for bidirectional text layout but can
94/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
95///
96/// # Characters Stripped
97///
98/// - U+061C (ARABIC LETTER MARK, ALM)
99/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
100/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
101/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
102/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
103/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
104/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
105/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
106/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
107/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
108/// - U+2068 (FIRST STRONG ISOLATE, FSI)
109/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
110///
111/// # Examples
112///
113/// ```
114/// use quillmark_core::normalize::strip_bidi_formatting;
115///
116/// // Normal text is unchanged
117/// assert_eq!(strip_bidi_formatting("hello"), "hello");
118///
119/// // LRO character is stripped
120/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
121///
122/// // All bidi characters are stripped
123/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
124/// assert_eq!(strip_bidi_formatting(input), "");
125/// ```
126pub fn strip_bidi_formatting(s: &str) -> String {
127    // Early return optimization: avoid allocation if no bidi characters present
128    if !s.chars().any(is_bidi_char) {
129        return s.to_string();
130    }
131
132    s.chars().filter(|c| !is_bidi_char(*c)).collect()
133}
134
/// Fixes HTML comment closing fences to prevent content loss.
///
/// According to CommonMark, HTML block type 2 (comments) ends with the line containing `-->`.
/// Any text on the same line after `-->` therefore belongs to the HTML block and is
/// discarded by Markdown parsers that ignore HTML blocks.
///
/// This function inserts a `\n` directly after `-->` whenever the rest of that line
/// contains non-whitespace content, so the trailing text is parsed as regular Markdown.
///
/// # Matching rules
///
/// - Only a `-->` that follows a `<!--` opener is treated as a fence; a stray `-->`
///   is left untouched.
/// - The first `-->` after an opener closes it; a second `<!--` inside the comment
///   is just comment text.
/// - An opener without any closer leaves the remainder of the input unchanged.
/// - `<!--- ... --->` needs no special handling: the search for `-->` lands on the
///   last three characters of `--->`, so the extra hyphen stays inside the comment.
///   A `-` that *follows* the closer is ordinary trailing text and is moved to the
///   next line together with the rest of it.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->   \nSome text"),
///     "<!-- comment -->   \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
///
/// // A hyphen after the closer is trailing text, not part of the fence
/// assert_eq!(
///     fix_html_comment_fences("<!--- comment -->- item"),
///     "<!--- comment -->\n- item"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    // Without a closer there is nothing to repair.
    if !s.contains(CLOSER) {
        return s.to_string();
    }

    // Small headroom for the newlines we may insert.
    let mut result = String::with_capacity(s.len() + 16);
    // Portion of the input that has not been copied to `result` yet.
    let mut rest = s;

    while let Some(open_idx) = rest.find(OPENER) {
        // Search from the opener itself (not past it) so `<!-->` and `<!--->`
        // count as complete comments, as they do for CommonMark HTML blocks.
        let Some(close_idx) = rest[open_idx..].find(CLOSER) else {
            // Unclosed comment: the remainder is copied verbatim below.
            break;
        };
        let after_fence = open_idx + close_idx + CLOSER.len();

        // Copy everything up to and including the closing fence.
        let (through_fence, tail) = rest.split_at(after_fence);
        result.push_str(through_fence);

        // Whatever shares the closer's line would be swallowed by the HTML
        // block; whitespace-only remainders (including a lone `\r`) are harmless.
        let same_line = tail.find('\n').map_or(tail, |nl| &tail[..nl]);
        if !same_line.trim().is_empty() {
            result.push('\n');
        }

        rest = tail;
    }

    // Text after the last closed comment, or after an unclosed opener.
    result.push_str(rest);
    result
}
248
249/// Normalizes markdown content by applying all preprocessing steps.
250///
251/// This function applies normalizations in the correct order:
252/// 1. Strip Unicode bidirectional formatting characters
253/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
254///
255/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
256/// in [`normalize_fields`] because it needs to be applied after schema defaults
257/// and coercion.
258///
259/// # Examples
260///
261/// ```
262/// use quillmark_core::normalize::normalize_markdown;
263///
264/// // Bidi characters are stripped
265/// let input = "**bold** \u{202D}**more**";
266/// let normalized = normalize_markdown(input);
267/// assert_eq!(normalized, "**bold** **more**");
268///
269/// // HTML comment trailing text is preserved
270/// let with_comment = "<!-- comment -->Some text";
271/// let normalized = normalize_markdown(with_comment);
272/// assert_eq!(normalized, "<!-- comment -->\nSome text");
273/// ```
274pub fn normalize_markdown(markdown: &str) -> String {
275    let cleaned = normalize_line_endings(markdown);
276    let cleaned = strip_bidi_formatting(&cleaned);
277    fix_html_comment_fences(&cleaned)
278}
279
/// Rewrites every CRLF (`\r\n`) pair and every lone CR (`\r`) as a single LF (`\n`).
///
/// Within this file the helper is only reached through [`normalize_markdown`],
/// i.e. it is applied to Markdown bodies (the original note points at spec §7
/// for this rule). Text that contains no `\r` comes back as an unchanged copy.
fn normalize_line_endings(s: &str) -> String {
    // Collapse CRLF pairs first; any `\r` still present afterwards is a lone
    // CR and becomes its own line break.
    s.replace("\r\n", "\n").replace('\r', "\n")
}
305
306/// Normalizes document frontmatter fields per the Quillmark §7 spec.
307///
308/// This is an internal helper used by [`normalize_document`]. It operates on
309/// the typed `IndexMap<String, QuillValue>` frontmatter; it does **not** touch
310/// `body` or `cards` (those are normalized separately by the caller).
311///
312/// Field names at the top level are NFC-normalized (see [`normalize_field_name`]).
313/// Only **body regions** receive content normalization (bidi stripping + HTML comment
314/// fence repair). All other field values pass through verbatim.
315///
316/// # Examples
317///
318/// ```
319/// use quillmark_core::normalize::normalize_fields;
320/// use quillmark_core::QuillValue;
321/// use indexmap::IndexMap;
322///
323/// let mut fields = IndexMap::new();
324/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
325///
326/// let result = normalize_fields(fields);
327///
328/// // Title passes through verbatim
329/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
330/// ```
331pub fn normalize_fields(fields: IndexMap<String, QuillValue>) -> IndexMap<String, QuillValue> {
332    fields
333        .into_iter()
334        .map(|(key, value)| {
335            // Normalize field name to NFC form for consistent key comparison.
336            let normalized_key = normalize_field_name(&key);
337            // All top-level frontmatter fields pass through verbatim — body
338            // regions are handled separately in normalize_document.
339            (normalized_key, value)
340        })
341        .collect()
342}
343
344/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
345///
346/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
347/// are treated as identical field names, preventing subtle bugs where visually
348/// identical keys are treated as different.
349///
350/// # Examples
351///
352/// ```
353/// use quillmark_core::normalize::normalize_field_name;
354///
355/// // Composed form (single code point for é)
356/// let composed = "café";
357/// // Decomposed form (e + combining acute accent)
358/// let decomposed = "cafe\u{0301}";
359///
360/// // Both normalize to the same NFC form
361/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
362/// ```
363pub fn normalize_field_name(name: &str) -> String {
364    name.nfc().collect()
365}
366
367/// Normalizes a typed [`crate::document::Document`] by applying all field-level normalizations.
368///
369/// This is the **primary entry point** for normalizing documents after parsing.
370/// It ensures consistent processing regardless of how the document was created.
371///
372/// # Normalization Steps
373///
374/// 1. **Unicode NFC normalization** — Frontmatter field names are normalized to NFC form.
375/// 2. **Bidi stripping** — Invisible bidirectional control characters are removed from
376///    body regions (each `Card::body`). YAML field values in every
377///    `Card::frontmatter` pass through verbatim (spec §7).
378/// 3. **HTML comment fence fixing** — Trailing text after `-->` is preserved in body
379///    regions only.
380///
381/// Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
382///
383/// # Idempotency
384///
385/// This function is idempotent — calling it multiple times produces the same result.
386///
387/// # Example
388///
389/// ```no_run
390/// use quillmark_core::{Document, normalize::normalize_document};
391///
392/// let markdown = "---\nQUILL: my_quill\ntitle: Example\n---\n\nBody with <<placeholder>>";
393/// let doc = Document::from_markdown(markdown).unwrap();
394/// let normalized = normalize_document(doc).unwrap();
395/// ```
396pub fn normalize_document(
397    doc: crate::document::Document,
398) -> Result<crate::document::Document, crate::error::ParseError> {
399    use crate::document::{Document, Sentinel};
400
401    // NFC-normalize main-card field names; values pass through verbatim.
402    let normalized_main_fm_map = normalize_fields(doc.main().frontmatter().to_index_map());
403    let normalized_main_body = normalize_markdown(doc.main().body());
404    let main_sentinel = doc.main().sentinel().clone();
405    let main = Card::new_with_sentinel(
406        main_sentinel,
407        crate::document::Frontmatter::from_index_map(normalized_main_fm_map),
408        normalized_main_body,
409    );
410
411    // Normalize each composable card's body; NFC-normalize its field names;
412    // values pass through verbatim.
413    let normalized_cards: Vec<Card> = doc
414        .cards()
415        .iter()
416        .map(|card| {
417            let normalized_card_fields: IndexMap<String, QuillValue> = card
418                .frontmatter()
419                .iter()
420                .map(|(k, v)| (normalize_field_name(k), v.clone()))
421                .collect();
422            let normalized_card_body = normalize_markdown(card.body());
423            Card::new_with_sentinel(
424                Sentinel::Card(card.tag()),
425                crate::document::Frontmatter::from_index_map(normalized_card_fields),
426                normalized_card_body,
427            )
428        })
429        .collect();
430
431    Ok(Document::from_main_and_cards(
432        main,
433        normalized_cards,
434        doc.warnings().to_vec(),
435    ))
436}
437
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::Document;

    // ---- helpers -------------------------------------------------------

    /// Asserts that `fix_html_comment_fences(input)` yields `expected`.
    #[track_caller]
    fn assert_fence(input: &str, expected: &str) {
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    /// Builds a frontmatter map holding a single JSON-backed field.
    fn one_field(name: &str, value: serde_json::Value) -> IndexMap<String, QuillValue> {
        let mut fields = IndexMap::new();
        fields.insert(name.to_string(), QuillValue::from_json(value));
        fields
    }

    /// Parses `md` and normalizes the resulting document, panicking on failure.
    fn parse_and_normalize(md: &str) -> Document {
        let doc = Document::from_markdown(md).unwrap();
        normalize_document(doc).unwrap()
    }

    // ---- strip_bidi_formatting -----------------------------------------

    #[test]
    fn test_strip_bidi_no_change() {
        for text in ["hello world", "", "**bold** text"] {
            assert_eq!(strip_bidi_formatting(text), text);
        }
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D is LEFT-TO-RIGHT OVERRIDE.
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E is RIGHT-TO-LEFT OVERRIDE.
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // LRM (U+200E) and RLM (U+200F).
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // LRE (U+202A), RLE (U+202B), PDF (U+202C).
        let input = "\u{202A}text\u{202B}more\u{202C}";
        assert_eq!(strip_bidi_formatting(input), "textmore");
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // LRI (U+2066), RLI (U+2067), FSI (U+2068), PDI (U+2069).
        let input = "\u{2066}a\u{2067}b\u{2068}c\u{2069}";
        assert_eq!(strip_bidi_formatting(input), "abc");
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        // ALM (U+061C) must be removed as well.
        assert_eq!(strip_bidi_formatting("hello\u{061C}world"), "helloworld");
        assert_eq!(strip_bidi_formatting("\u{061C}**bold**"), "**bold**");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Ordinary non-ASCII text is not affected.
        for text in ["你好世界", "مرحبا", "🎉"] {
            assert_eq!(strip_bidi_formatting(text), text);
        }
    }

    // ---- normalize_markdown --------------------------------------------

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // ---- fix_html_comment_fences ---------------------------------------

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_fence("hello world", "hello world");
        assert_fence("**bold** text", "**bold** text");
        assert_fence("", "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text sharing a line with `-->` moves to the following line.
        assert_fence(
            "<!-- comment -->Same line text",
            "<!-- comment -->\nSame line text",
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // A newline already follows `-->`, so nothing changes.
        assert_fence(
            "<!-- comment -->\nNext line text",
            "<!-- comment -->\nNext line text",
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Whitespace up to the newline does not need fixing.
        assert_fence(
            "<!-- comment -->   \nSome text",
            "<!-- comment -->   \nSome text",
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment whose closing line carries text.
        assert_fence(
            "<!--\nmultiline\ncomment\n-->Trailing text",
            "<!--\nmultiline\ncomment\n-->\nTrailing text",
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment that is already followed by a newline.
        assert_fence(
            "<!--\nmultiline\n-->\n\nParagraph text",
            "<!--\nmultiline\n-->\n\nParagraph text",
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Several comments in one document are each repaired.
        assert_fence(
            "<!-- first -->Text\n\n<!-- second -->More text",
            "<!-- first -->\nText\n\n<!-- second -->\nMore text",
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // The comment ends the input; there is no trailing content.
        assert_fence(
            "Some text before <!-- comment -->",
            "Some text before <!-- comment -->",
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Nothing but a comment.
        assert_fence("<!-- comment -->", "<!-- comment -->");
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // A standalone `-->` is not a fence and must stay untouched.
        assert_fence("-->some text", "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // A second opener is plain comment text: the first `<!--` opens and
        // the first `-->` closes.
        assert_fence("<!-- <!-- -->Trailing", "<!-- <!-- -->\nTrailing");
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer with no opener.
        assert_fence("text --> more text", "text --> more text");
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Real comments are fixed, the stray closer in between is ignored:
        //   <!-- valid -->FixMe
        //   text --> Ignore
        //   <!-- valid2 -->FixMe2
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_fence(input, expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF directly after the closer counts as a line break.
        assert_fence(
            "<!-- comment -->\r\nSome text",
            "<!-- comment -->\r\nSome text",
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_single_line() {
        assert_fence(
            "<!--- comment --->Trailing text",
            "<!--- comment --->\nTrailing text",
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_multiline() {
        assert_fence(
            "<!---\ncomment\n--->Trailing text",
            "<!---\ncomment\n--->\nTrailing text",
        );
    }

    // ---- normalize_fields (frontmatter only) ---------------------------

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let result = normalize_fields(one_field("title", serde_json::json!("<<hello>>")));
        // Chevrons come back exactly as they went in.
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_preserved() {
        // Spec §7: YAML field values are NOT bidi-stripped; only body regions are.
        let result = normalize_fields(one_field("title", serde_json::json!("a\u{202D}b")));
        // The bidi character must survive in a non-body field.
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "a\u{202D}b");
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = one_field("count", serde_json::json!(42));
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // ---- normalize_document --------------------------------------------

    #[test]
    fn test_normalize_document_basic() {
        let normalized = parse_and_normalize(
            "---\nQUILL: test\ntitle: <<placeholder>>\n---\n\n<<content>> \u{202D}**bold**",
        );

        // Frontmatter value keeps its chevrons.
        assert_eq!(
            normalized
                .main()
                .frontmatter()
                .get("title")
                .unwrap()
                .as_str()
                .unwrap(),
            "<<placeholder>>"
        );

        // Body loses the bidi control but keeps its chevrons.
        assert_eq!(normalized.main().body(), "\n<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        let normalized = parse_and_normalize("---\nQUILL: custom_quill\n---\n");

        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        let normalized_once = parse_and_normalize("---\nQUILL: test\n---\n\n<<content>>");
        let normalized_twice = normalize_document(normalized_once.clone()).unwrap();

        assert_eq!(
            normalized_once.main().body(),
            normalized_twice.main().body()
        );
    }

    #[test]
    fn test_normalize_document_body_bidi_stripped() {
        let normalized = parse_and_normalize("---\nQUILL: test\n---\n\nhello\u{202D}world");
        assert_eq!(normalized.main().body(), "\nhelloworld");
    }

    #[test]
    fn test_normalize_document_yaml_field_bidi_preserved() {
        let normalized = parse_and_normalize("---\nQUILL: test\ntitle: a\u{202D}b\n---\n");
        // YAML field values keep their bidi characters.
        assert_eq!(
            normalized
                .main()
                .frontmatter()
                .get("title")
                .unwrap()
                .as_str()
                .unwrap(),
            "a\u{202D}b"
        );
    }

    #[test]
    fn test_normalize_document_card_body_bidi_stripped() {
        let md = "---\nQUILL: test\n---\n\nbody\n\n---\nCARD: note\n---\ncard\u{202D}body\n";
        let doc = Document::from_markdown(md).unwrap();
        assert_eq!(doc.cards().len(), 1, "expected 1 card");

        let normalized = normalize_document(doc).unwrap();
        assert_eq!(normalized.cards()[0].body(), "cardbody\n");
    }

    #[test]
    fn test_normalize_document_card_field_bidi_preserved() {
        let md = "---\nQUILL: test\n---\n\nbody\n\n---\nCARD: note\nname: Ali\u{202D}ce\n---\n";
        let doc = Document::from_markdown(md).unwrap();
        assert_eq!(doc.cards().len(), 1, "expected 1 card");

        let normalized = normalize_document(doc).unwrap();
        assert_eq!(
            normalized.cards()[0]
                .frontmatter()
                .get("name")
                .unwrap()
                .as_str()
                .unwrap(),
            "Ali\u{202D}ce"
        );
    }

    #[test]
    fn test_normalize_document_card_body_html_comment_repair() {
        let normalized = parse_and_normalize(
            "---\nQUILL: test\n---\n\n---\nCARD: note\n---\n<!-- comment -->Trailing text\n",
        );
        assert_eq!(
            normalized.cards()[0].body(),
            "<!-- comment -->\nTrailing text\n"
        );
    }

    #[test]
    fn test_normalize_document_toplevel_body_html_comment_repair() {
        let normalized =
            parse_and_normalize("---\nQUILL: test\n---\n\n<!-- note -->Content here");
        assert_eq!(normalized.main().body(), "\n<!-- note -->\nContent here");
    }
}
quillmark_core/normalize.rs

quillmark_core/
normalize.rs