quillmark_core/normalize.rs
1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Fix HTML comment fences to preserve trailing text
14//! - Apply all normalizations in the correct order
15//!
16//! Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
17//!
18//! ## Functions
19//!
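//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
//! - [`fix_html_comment_fences`] - Move text that follows a closing `-->` onto its own line
//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
//! - [`normalize_field_name`] - Normalize a field name to Unicode NFC
//! - [`normalize_fields`] - NFC-normalize frontmatter field names (values pass through verbatim)
//! - [`normalize_document`] - Normalize a typed [`crate::document::Document`] (consumes it and returns the normalized document)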
24//!
25//! ## Why Normalize?
26//!
27//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
28//! control characters used for bidirectional text layout. When placed adjacent to markdown
29//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
30//!
31//! ```text
32//! **bold** or <U+202D>**(1234**
33//! ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
34//! ```
35//!
36//! These characters commonly appear when copying text from:
37//! - Web pages with mixed LTR/RTL content
38//! - PDF documents
39//! - Word processors
40//! - Some clipboard managers
41//!
42//! ## Examples
43//!
44//! ```
45//! use quillmark_core::normalize::strip_bidi_formatting;
46//!
47//! // Input with invisible U+202D (LRO) before second **
48//! let input = "**asdf** or \u{202D}**(1234**";
49//! let cleaned = strip_bidi_formatting(input);
50//! assert_eq!(cleaned, "**asdf** or **(1234**");
51//! ```
52
53use crate::document::Card;
54use crate::value::QuillValue;
55use indexmap::IndexMap;
56use unicode_normalization::UnicodeNormalization;
57
58/// Errors that can occur during normalization
59#[derive(Debug, thiserror::Error)]
60pub enum NormalizationError {
61 /// JSON nesting depth exceeded maximum allowed
62 #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
63 NestingTooDeep {
64 /// Actual depth
65 depth: usize,
66 /// Maximum allowed depth
67 max: usize,
68 },
69}
70
/// Returns `true` when `c` is one of the Unicode bidirectional formatting
/// characters removed by this module.
///
/// The accepted code points are:
///
/// - U+061C ARABIC LETTER MARK (ALM)
/// - U+200E..=U+200F: LRM, RLM
/// - U+202A..=U+202E: LRE, RLE, PDF, LRO, RLO
/// - U+2066..=U+2069: LRI, RLI, FSI, PDI
#[inline]
fn is_bidi_char(c: char) -> bool {
    match c {
        // ARABIC LETTER MARK
        '\u{061C}' => true,
        // LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        '\u{200E}'..='\u{200F}' => true,
        // Embeddings, overrides and POP DIRECTIONAL FORMATTING
        '\u{202A}'..='\u{202E}' => true,
        // Isolates and POP DIRECTIONAL ISOLATE
        '\u{2066}'..='\u{2069}' => true,
        _ => false,
    }
}
90
91/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
92///
93/// These invisible control characters are used for bidirectional text layout but can
94/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
95///
96/// # Characters Stripped
97///
98/// - U+061C (ARABIC LETTER MARK, ALM)
99/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
100/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
101/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
102/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
103/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
104/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
105/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
106/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
107/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
108/// - U+2068 (FIRST STRONG ISOLATE, FSI)
109/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
110///
111/// # Examples
112///
113/// ```
114/// use quillmark_core::normalize::strip_bidi_formatting;
115///
116/// // Normal text is unchanged
117/// assert_eq!(strip_bidi_formatting("hello"), "hello");
118///
119/// // LRO character is stripped
120/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
121///
122/// // All bidi characters are stripped
123/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
124/// assert_eq!(strip_bidi_formatting(input), "");
125/// ```
126pub fn strip_bidi_formatting(s: &str) -> String {
127 // Early return optimization: avoid allocation if no bidi characters present
128 if !s.chars().any(is_bidi_char) {
129 return s.to_string();
130 }
131
132 s.chars().filter(|c| !is_bidi_char(*c)).collect()
133}
134
/// Fixes HTML comment closing fences to prevent content loss.
///
/// According to CommonMark, HTML block type 2 (comments) ends with the line containing `-->`.
/// This means any text on the same line after `-->` is included in the HTML block and would
/// be discarded by markdown parsers that ignore HTML blocks.
///
/// This function inserts a newline directly after `-->` when the rest of that line contains
/// non-whitespace content, ensuring the trailing text is parsed as regular markdown.
///
/// Only a `-->` that follows a `<!--` opener is treated as a closing fence; a stray `-->`
/// with no preceding opener is left untouched, as is an opener that is never closed.
/// The first `-->` found at or after an opener closes it, so a `--->` closer ends at its `>`
/// and nested `<!--` sequences are plain comment content.
///
/// The trailing text is always preserved verbatim. In particular, a leading `-`
/// (for example a list marker) stays with the trailing text and is never pulled
/// onto the comment line.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment --> \nSome text"),
///     "<!-- comment --> \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
///
/// // Trailing text that starts with a hyphen keeps its hyphen
/// assert_eq!(
///     fix_html_comment_fences("<!--- note -->- item"),
///     "<!--- note -->\n- item"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    // Fast path: without a closing fence there is nothing to repair.
    if !s.contains(CLOSER) {
        return s.to_string();
    }

    // A little headroom for the newlines that may be inserted.
    let mut result = String::with_capacity(s.len() + 16);
    let mut current_pos = 0;

    while let Some(open_idx) = s[current_pos..].find(OPENER) {
        let abs_open = current_pos + open_idx;

        // The search starts at the opener itself (not after it) so that the
        // degenerate forms `<!-->` and `<!--->` count as closed comments.
        // `find` returns the earliest match, so for a `--->` closer the match
        // already ends on the final `>`; no extra-hyphen adjustment is needed.
        let Some(close_idx) = s[abs_open..].find(CLOSER) else {
            // Unclosed comment: the remainder is appended verbatim below.
            break;
        };
        let after_fence = abs_open + close_idx + CLOSER.len();

        // Copy everything up to and including the closing fence.
        result.push_str(&s[current_pos..after_fence]);

        // Insert a newline only if something other than whitespace shares the
        // fence's line. An empty remainder, an immediate `\n`, or `\r\n` all
        // yield a blank `rest_of_line` and therefore no insertion.
        let rest_of_line = s[after_fence..].split('\n').next().unwrap_or("");
        if !rest_of_line.trim().is_empty() {
            result.push('\n');
        }

        current_pos = after_fence;
    }

    // Text after the last closed comment (or the whole input if none closed).
    result.push_str(&s[current_pos..]);
    result
}
249
250/// Normalizes markdown content by applying all preprocessing steps.
251///
252/// This function applies normalizations in the correct order:
253/// 1. Strip Unicode bidirectional formatting characters
254/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
255///
256/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
257/// in [`normalize_fields`] because it needs to be applied after schema defaults
258/// and coercion.
259///
260/// # Examples
261///
262/// ```
263/// use quillmark_core::normalize::normalize_markdown;
264///
265/// // Bidi characters are stripped
266/// let input = "**bold** \u{202D}**more**";
267/// let normalized = normalize_markdown(input);
268/// assert_eq!(normalized, "**bold** **more**");
269///
270/// // HTML comment trailing text is preserved
271/// let with_comment = "<!-- comment -->Some text";
272/// let normalized = normalize_markdown(with_comment);
273/// assert_eq!(normalized, "<!-- comment -->\nSome text");
274/// ```
275pub fn normalize_markdown(markdown: &str) -> String {
276 let cleaned = normalize_line_endings(markdown);
277 let cleaned = strip_bidi_formatting(&cleaned);
278 fix_html_comment_fences(&cleaned)
279}
280
/// Rewrites CRLF (`\r\n`) and lone CR (`\r`) line endings as LF (`\n`).
///
/// YAML parsing already normalizes line endings inside scalar values, but the
/// Markdown body is passed through verbatim. Authoring on Windows or pasting
/// from some clipboard sources leaves `\r` bytes in the body which some
/// backends render as visible garbage. This canonicalization is performed
/// only on the Markdown body (see §7); YAML scalars are unaffected.
fn normalize_line_endings(s: &str) -> String {
    // Collapse the two-character sequence first so that the second pass only
    // ever sees carriage returns that stood on their own.
    s.replace("\r\n", "\n").replace('\r', "\n")
}
306
307/// Normalizes document frontmatter fields per the Quillmark §7 spec.
308///
309/// This is an internal helper used by [`normalize_document`]. It operates on
310/// the typed `IndexMap<String, QuillValue>` frontmatter; it does **not** touch
311/// `body` or `cards` (those are normalized separately by the caller).
312///
313/// Field names at the top level are NFC-normalized (see [`normalize_field_name`]).
314/// Only **body regions** receive content normalization (bidi stripping + HTML comment
315/// fence repair). All other field values pass through verbatim.
316///
317/// # Examples
318///
319/// ```
320/// use quillmark_core::normalize::normalize_fields;
321/// use quillmark_core::QuillValue;
322/// use indexmap::IndexMap;
323///
324/// let mut fields = IndexMap::new();
325/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
326///
327/// let result = normalize_fields(fields);
328///
329/// // Title passes through verbatim
330/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
331/// ```
332pub fn normalize_fields(fields: IndexMap<String, QuillValue>) -> IndexMap<String, QuillValue> {
333 fields
334 .into_iter()
335 .map(|(key, value)| {
336 // Normalize field name to NFC form for consistent key comparison.
337 let normalized_key = normalize_field_name(&key);
338 // All top-level frontmatter fields pass through verbatim — body
339 // regions are handled separately in normalize_document.
340 (normalized_key, value)
341 })
342 .collect()
343}
344
345/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
346///
347/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
348/// are treated as identical field names, preventing subtle bugs where visually
349/// identical keys are treated as different.
350///
351/// # Examples
352///
353/// ```
354/// use quillmark_core::normalize::normalize_field_name;
355///
356/// // Composed form (single code point for é)
357/// let composed = "café";
358/// // Decomposed form (e + combining acute accent)
359/// let decomposed = "cafe\u{0301}";
360///
361/// // Both normalize to the same NFC form
362/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
363/// ```
364pub fn normalize_field_name(name: &str) -> String {
365 name.nfc().collect()
366}
367
368/// Normalizes a typed [`crate::document::Document`] by applying all field-level normalizations.
369///
370/// This is the **primary entry point** for normalizing documents after parsing.
371/// It ensures consistent processing regardless of how the document was created.
372///
373/// # Normalization Steps
374///
375/// 1. **Unicode NFC normalization** — Frontmatter field names are normalized to NFC form.
376/// 2. **Bidi stripping** — Invisible bidirectional control characters are removed from
377/// body regions (each `Card::body`). YAML field values in every
378/// `Card::frontmatter` pass through verbatim (spec §7).
379/// 3. **HTML comment fence fixing** — Trailing text after `-->` is preserved in body
380/// regions only.
381///
382/// Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
383///
384/// # Idempotency
385///
386/// This function is idempotent — calling it multiple times produces the same result.
387///
388/// # Example
389///
390/// ```no_run
391/// use quillmark_core::{Document, normalize::normalize_document};
392///
393/// let markdown = "---\nQUILL: my_quill\ntitle: Example\n---\n\nBody with <<placeholder>>";
394/// let doc = Document::from_markdown(markdown).unwrap();
395/// let normalized = normalize_document(doc).unwrap();
396/// ```
397pub fn normalize_document(
398 doc: crate::document::Document,
399) -> Result<crate::document::Document, crate::error::ParseError> {
400 use crate::document::{Document, Sentinel};
401
402 // NFC-normalize main-card field names; values pass through verbatim.
403 let normalized_main_fm_map = normalize_fields(doc.main().frontmatter().to_index_map());
404 let normalized_main_body = normalize_markdown(doc.main().body());
405 let main_sentinel = doc.main().sentinel().clone();
406 let main = Card::new_with_sentinel(
407 main_sentinel,
408 crate::document::Frontmatter::from_index_map(normalized_main_fm_map),
409 normalized_main_body,
410 );
411
412 // Normalize each composable card's body; NFC-normalize its field names;
413 // values pass through verbatim.
414 let normalized_cards: Vec<Card> = doc
415 .cards()
416 .iter()
417 .map(|card| {
418 let normalized_card_fields: IndexMap<String, QuillValue> = card
419 .frontmatter()
420 .iter()
421 .map(|(k, v)| (normalize_field_name(k), v.clone()))
422 .collect();
423 let normalized_card_body = normalize_markdown(card.body());
424 Card::new_with_sentinel(
425 Sentinel::Card(card.tag()),
426 crate::document::Frontmatter::from_index_map(normalized_card_fields),
427 normalized_card_body,
428 )
429 })
430 .collect();
431
432 Ok(Document::from_main_and_cards(
433 main,
434 normalized_cards,
435 doc.warnings().to_vec(),
436 ))
437}
438
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::Document;

    // ---- helpers -------------------------------------------------------

    /// Asserts that `strip_bidi_formatting` maps each input to its expected output.
    fn check_strip(cases: &[(&str, &str)]) {
        for (input, expected) in cases {
            assert_eq!(strip_bidi_formatting(input), *expected);
        }
    }

    /// Asserts that `fix_html_comment_fences(input)` equals `expected`.
    fn check_fences(input: &str, expected: &str) {
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    /// Builds a one-entry frontmatter map from a JSON value.
    fn single_field(name: &str, json: serde_json::Value) -> IndexMap<String, QuillValue> {
        let mut fields = IndexMap::new();
        fields.insert(name.to_string(), QuillValue::from_json(json));
        fields
    }

    /// Parses `markdown` and normalizes the resulting document.
    fn parse_and_normalize(markdown: &str) -> Document {
        let doc = Document::from_markdown(markdown).unwrap();
        normalize_document(doc).unwrap()
    }

    // ---- strip_bidi_formatting -------------------------------------------

    #[test]
    fn test_strip_bidi_no_change() {
        check_strip(&[
            ("hello world", "hello world"),
            ("", ""),
            ("**bold** text", "**bold** text"),
        ]);
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        check_strip(&[
            ("he\u{202D}llo", "hello"),
            ("**asdf** or \u{202D}**(1234**", "**asdf** or **(1234**"),
        ]);
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        check_strip(&[("he\u{202E}llo", "hello")]);
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        check_strip(&[("a\u{200E}b\u{200F}c", "abc")]);
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        check_strip(&[("\u{202A}text\u{202B}more\u{202C}", "textmore")]);
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        check_strip(&[("\u{2066}a\u{2067}b\u{2068}c\u{2069}", "abc")]);
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        check_strip(&[(all_bidi, "")]);
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        // U+061C ARABIC LETTER MARK (ALM) should be stripped
        check_strip(&[
            ("hello\u{061C}world", "helloworld"),
            ("\u{061C}**bold**", "**bold**"),
        ]);
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        for text in ["你好世界", "مرحبا", "🎉"].iter() {
            assert_eq!(strip_bidi_formatting(text), *text);
        }
    }

    // ---- normalize_markdown ------------------------------------------------

    #[test]
    fn test_normalize_markdown_basic() {
        let plain = normalize_markdown("hello");
        assert_eq!(plain, "hello");

        let stripped = normalize_markdown("**bold** \u{202D}**more**");
        assert_eq!(stripped, "**bold** **more**");
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        let repaired = normalize_markdown("<!-- comment -->Some text");
        assert_eq!(repaired, "<!-- comment -->\nSome text");
    }

    // ---- fix_html_comment_fences ------------------------------------------

    #[test]
    fn test_fix_html_comment_no_comment() {
        check_fences("hello world", "hello world");
        check_fences("**bold** text", "**bold** text");
        check_fences("", "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text on same line as --> should be moved to next line
        check_fences(
            "<!-- comment -->Same line text",
            "<!-- comment -->\nSame line text",
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // Already has newline after --> - no change
        check_fences(
            "<!-- comment -->\nNext line text",
            "<!-- comment -->\nNext line text",
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Only whitespace after --> until newline - no change needed
        check_fences(
            "<!-- comment --> \nSome text",
            "<!-- comment --> \nSome text",
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment with text on closing line
        check_fences(
            "<!--\nmultiline\ncomment\n-->Trailing text",
            "<!--\nmultiline\ncomment\n-->\nTrailing text",
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment with proper newline after -->
        check_fences(
            "<!--\nmultiline\n-->\n\nParagraph text",
            "<!--\nmultiline\n-->\n\nParagraph text",
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Multiple comments in the same document
        check_fences(
            "<!-- first -->Text\n\n<!-- second -->More text",
            "<!-- first -->\nText\n\n<!-- second -->\nMore text",
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // Comment at end of string - no trailing content
        check_fences(
            "Some text before <!-- comment -->",
            "Some text before <!-- comment -->",
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Just a comment with nothing after
        check_fences("<!-- comment -->", "<!-- comment -->");
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // --> that's not part of a comment (standalone)
        // Should NOT be touched by the context-aware fixer
        check_fences("-->some text", "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // Nested openers are just text inside the comment
        // <!-- <!-- -->Trailing
        // The first <!-- opens, the first --> closes.
        check_fences("<!-- <!-- -->Trailing", "<!-- <!-- -->\nTrailing");
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer without opener
        check_fences("text --> more text", "text --> more text");
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Mixed valid and invalid comments
        // <!-- valid -->FixMe
        // text --> Ignore
        // <!-- valid2 -->FixMe2
        check_fences(
            "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2",
            "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2",
        );
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF line endings
        check_fences(
            "<!-- comment -->\r\nSome text",
            "<!-- comment -->\r\nSome text",
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_single_line() {
        check_fences(
            "<!--- comment --->Trailing text",
            "<!--- comment --->\nTrailing text",
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_multiline() {
        check_fences(
            "<!---\ncomment\n--->Trailing text",
            "<!---\ncomment\n--->\nTrailing text",
        );
    }

    // ---- normalize_fields (frontmatter only) -------------------------------

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let result = normalize_fields(single_field("title", serde_json::json!("<<hello>>")));
        // Chevrons are passed through unchanged
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_preserved() {
        // Per spec §7: bidi stripping is NOT applied to YAML field values.
        // Only body regions are normalized.
        let result = normalize_fields(single_field("title", serde_json::json!("a\u{202D}b")));
        // Bidi character must be PRESERVED in non-body fields
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "a\u{202D}b");
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = single_field("count", serde_json::json!(42));
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // ---- normalize_document ------------------------------------------------

    #[test]
    fn test_normalize_document_basic() {
        let normalized = parse_and_normalize(
            "---\nQUILL: test\ntitle: <<placeholder>>\n---\n\n<<content>> \u{202D}**bold**",
        );

        // Title has chevrons preserved (only bidi stripped on body)
        assert_eq!(
            normalized
                .main()
                .frontmatter()
                .get("title")
                .unwrap()
                .as_str()
                .unwrap(),
            "<<placeholder>>"
        );

        // Body has bidi stripped, chevrons preserved
        assert_eq!(normalized.main().body(), "\n<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        let normalized = parse_and_normalize("---\nQUILL: custom_quill\n---\n");

        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        let normalized_once = parse_and_normalize("---\nQUILL: test\n---\n\n<<content>>");
        let normalized_twice = normalize_document(normalized_once.clone()).unwrap();

        assert_eq!(
            normalized_once.main().body(),
            normalized_twice.main().body()
        );
    }

    #[test]
    fn test_normalize_document_body_bidi_stripped() {
        let normalized = parse_and_normalize("---\nQUILL: test\n---\n\nhello\u{202D}world");
        assert_eq!(normalized.main().body(), "\nhelloworld");
    }

    #[test]
    fn test_normalize_document_yaml_field_bidi_preserved() {
        let normalized = parse_and_normalize("---\nQUILL: test\ntitle: a\u{202D}b\n---\n");
        // Bidi preserved in YAML fields
        assert_eq!(
            normalized
                .main()
                .frontmatter()
                .get("title")
                .unwrap()
                .as_str()
                .unwrap(),
            "a\u{202D}b"
        );
    }

    #[test]
    fn test_normalize_document_card_body_bidi_stripped() {
        let md = "---\nQUILL: test\n---\n\nbody\n\n---\nCARD: note\n---\ncard\u{202D}body\n";
        let doc = Document::from_markdown(md).unwrap();
        assert_eq!(doc.cards().len(), 1, "expected 1 card");
        let normalized = normalize_document(doc).unwrap();
        assert_eq!(normalized.cards()[0].body(), "cardbody\n");
    }

    #[test]
    fn test_normalize_document_card_field_bidi_preserved() {
        let md = "---\nQUILL: test\n---\n\nbody\n\n---\nCARD: note\nname: Ali\u{202D}ce\n---\n";
        let doc = Document::from_markdown(md).unwrap();
        assert_eq!(doc.cards().len(), 1, "expected 1 card");
        let normalized = normalize_document(doc).unwrap();
        assert_eq!(
            normalized.cards()[0]
                .frontmatter()
                .get("name")
                .unwrap()
                .as_str()
                .unwrap(),
            "Ali\u{202D}ce"
        );
    }

    #[test]
    fn test_normalize_document_card_body_html_comment_repair() {
        let normalized = parse_and_normalize(
            "---\nQUILL: test\n---\n\n---\nCARD: note\n---\n<!-- comment -->Trailing text\n",
        );
        assert_eq!(
            normalized.cards()[0].body(),
            "<!-- comment -->\nTrailing text\n"
        );
    }

    #[test]
    fn test_normalize_document_toplevel_body_html_comment_repair() {
        let normalized =
            parse_and_normalize("---\nQUILL: test\n---\n\n<!-- note -->Content here");
        assert_eq!(normalized.main().body(), "\n<!-- note -->\nContent here");
    }
}
855}