quillmark_core/normalize.rs
1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Fix HTML comment fences to preserve trailing text
14//! - Apply all normalizations in the correct order
15//!
16//! Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
17//!
18//! ## Functions
19//!
//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
//! - [`fix_html_comment_fences`] - Preserve text following `-->` on the same line
//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
//! - [`normalize_fields`] - Normalize document fields (bidi stripping)
//! - [`normalize_field_name`] - Normalize field names to Unicode NFC form
//! - [`normalize_document`] - Normalize a parsed document (primary entry point)
23//!
24//! ## Why Normalize?
25//!
26//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
27//! control characters used for bidirectional text layout. When placed adjacent to markdown
28//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
29//!
30//! ```text
31//! **bold** or <U+202D>**(1234**
32//! ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
33//! ```
34//!
35//! These characters commonly appear when copying text from:
36//! - Web pages with mixed LTR/RTL content
37//! - PDF documents
38//! - Word processors
39//! - Some clipboard managers
40//!
41//! ## Examples
42//!
43//! ```
44//! use quillmark_core::normalize::strip_bidi_formatting;
45//!
46//! // Input with invisible U+202D (LRO) before second **
47//! let input = "**asdf** or \u{202D}**(1234**";
48//! let cleaned = strip_bidi_formatting(input);
49//! assert_eq!(cleaned, "**asdf** or **(1234**");
50//! ```
51
52use crate::error::MAX_NESTING_DEPTH;
53use crate::parse::BODY_FIELD;
54use crate::value::QuillValue;
55use std::collections::HashMap;
56use unicode_normalization::UnicodeNormalization;
57
/// Errors that can occur during normalization.
///
/// The only current failure mode is excessively deep JSON nesting. Within this
/// module it is treated as non-fatal: [`normalize_json_value`] logs the error
/// and keeps the original (un-normalized) value rather than failing the render.
#[derive(Debug, thiserror::Error)]
pub enum NormalizationError {
    /// JSON nesting depth exceeded maximum allowed
    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
    NestingTooDeep {
        /// Actual depth at which traversal bailed out
        depth: usize,
        /// Maximum allowed depth (== `MAX_NESTING_DEPTH`)
        max: usize,
    },
}
70
/// Check if a character is a Unicode bidirectional formatting character
#[inline]
fn is_bidi_char(c: char) -> bool {
    match c {
        // ARABIC LETTER MARK (ALM)
        '\u{061C}' => true,
        // LEFT-TO-RIGHT MARK (LRM) / RIGHT-TO-LEFT MARK (RLM)
        '\u{200E}' | '\u{200F}' => true,
        // LRE, RLE, PDF, LRO, RLO — a contiguous run U+202A..=U+202E
        '\u{202A}'..='\u{202E}' => true,
        // LRI, RLI, FSI, PDI — a contiguous run U+2066..=U+2069
        '\u{2066}'..='\u{2069}' => true,
        _ => false,
    }
}

/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
///
/// These invisible control characters are used for bidirectional text layout but can
/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
///
/// # Characters Stripped
///
/// - U+061C (ARABIC LETTER MARK, ALM)
/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
/// - U+2068 (FIRST STRONG ISOLATE, FSI)
/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::strip_bidi_formatting;
///
/// // Normal text is unchanged
/// assert_eq!(strip_bidi_formatting("hello"), "hello");
///
/// // LRO character is stripped
/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
///
/// // All bidi characters are stripped
/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
/// assert_eq!(strip_bidi_formatting(input), "");
/// ```
pub fn strip_bidi_formatting(s: &str) -> String {
    // Fast path: most input contains no bidi characters, so avoid building
    // a fresh String unless at least one character must be removed.
    if !s.chars().any(is_bidi_char) {
        return s.to_string();
    }

    let mut cleaned = String::with_capacity(s.len());
    for c in s.chars() {
        if !is_bidi_char(c) {
            cleaned.push(c);
        }
    }
    cleaned
}
134
/// Fixes HTML comment closing fences to prevent content loss.
///
/// According to CommonMark, HTML block type 2 (comments) ends with the line containing `-->`.
/// This means any text on the same line after `-->` is included in the HTML block and would
/// be discarded by markdown parsers that ignore HTML blocks.
///
/// This function inserts a newline after `-->` when followed by non-whitespace content,
/// ensuring the trailing text is parsed as regular markdown.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment --> \nSome text"),
///     "<!-- comment --> \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    // Early return if no HTML comment closing fence present
    if !s.contains("-->") {
        return s.to_string();
    }

    // Context-aware processing: only fix `-->` if we are inside a comment started by `<!--`.
    // Both search patterns are pure ASCII, so every byte offset computed below is
    // guaranteed to fall on a valid UTF-8 character boundary — the slicing is safe
    // even for multi-byte content around the fences.
    let mut result = String::with_capacity(s.len() + 16);
    let mut current_pos = 0;

    // Find first opener
    while let Some(open_idx) = s[current_pos..].find("<!--") {
        let abs_open = current_pos + open_idx;

        // Find matching closer AFTER the opener
        if let Some(close_idx) = s[abs_open..].find("-->") {
            let abs_close = abs_open + close_idx;
            let mut after_fence = abs_close + 3;

            // Handle `<!--- ... --->` style fences by treating the extra
            // hyphen as part of the comment content, not leaked trailing text.
            // 4 == "<!--".len(); check whether opener is `<!---` (extra hyphen).
            let opener_has_extra_hyphen =
                s.get(abs_open + 4..).is_some_and(|rest| rest.starts_with('-'));
            if opener_has_extra_hyphen
                && s.get(after_fence..).is_some_and(|rest| rest.starts_with('-'))
            {
                after_fence += 1;
            }

            // Append everything up to and including the closing fence.
            // Note this slice starts at `current_pos`, so any plain text that
            // preceded the opener is carried over unchanged as well.
            result.push_str(&s[current_pos..after_fence]);

            // Check what comes after the fence
            let after_content = &s[after_fence..];

            // Determine if we need to insert a newline
            let needs_newline = if after_content.is_empty() {
                false
            } else if after_content.starts_with('\n') || after_content.starts_with("\r\n") {
                false
            } else {
                // Check if there's only whitespace until end of line.
                // A trailing '\r' from CRLF endings is removed by trim() here,
                // so CRLF-terminated lines are handled like LF-terminated ones.
                let next_newline = after_content.find('\n');
                let until_newline = match next_newline {
                    Some(pos) => &after_content[..pos],
                    None => after_content,
                };
                !until_newline.trim().is_empty()
            };

            if needs_newline {
                result.push('\n');
            }

            // Move position to after the fence (we'll process the rest in next iteration)
            current_pos = after_fence;
        } else {
            // Unclosed comment at end of string - just append the rest and break
            // The opener was found but no closer exists.
            result.push_str(&s[current_pos..]);
            current_pos = s.len();
            break;
        }
    }

    // Append remaining content (text after last closed comment, or text if no comments found)
    if current_pos < s.len() {
        result.push_str(&s[current_pos..]);
    }

    result
}
246
/// Normalizes markdown content by applying all preprocessing steps.
///
/// This function applies normalizations in the correct order:
/// 1. Strip Unicode bidirectional formatting characters
/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
///
/// Note: Double chevrons (`<<` and `>>`) are passed through unchanged — no
/// guillemet conversion is performed here or in [`normalize_fields`] (see the
/// module-level documentation).
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::normalize_markdown;
///
/// // Bidi characters are stripped
/// let input = "**bold** \u{202D}**more**";
/// let normalized = normalize_markdown(input);
/// assert_eq!(normalized, "**bold** **more**");
///
/// // HTML comment trailing text is preserved
/// let with_comment = "<!-- comment -->Some text";
/// let normalized = normalize_markdown(with_comment);
/// assert_eq!(normalized, "<!-- comment -->\nSome text");
/// ```
pub fn normalize_markdown(markdown: &str) -> String {
    // Bidi stripping runs first so the fence fixer sees clean delimiter text.
    let cleaned = strip_bidi_formatting(markdown);
    fix_html_comment_fences(&cleaned)
}
276
277/// Normalizes a string value by stripping bidi characters and fixing HTML comment fences.
278///
279/// - For body content: applies `fix_html_comment_fences` to preserve text after `-->`
280/// - For other fields: strips bidi characters only
281///
282/// Double chevrons (`<<` and `>>`) are passed through untouched without conversion to
283/// guillemets. This preserves the original delimiter syntax in the output.
284fn normalize_string(s: &str, is_body: bool) -> String {
285 // First strip bidi formatting characters
286 let cleaned = strip_bidi_formatting(s);
287
288 // Then apply content-specific normalization
289 if is_body {
290 // Fix HTML comment fences (chevrons pass through unchanged)
291 fix_html_comment_fences(&cleaned)
292 } else {
293 // Non-body fields: just return cleaned string (chevrons pass through unchanged)
294 cleaned
295 }
296}
297
298/// Recursively normalize a JSON value with depth tracking.
299///
300/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
301fn normalize_json_value_inner(
302 value: serde_json::Value,
303 is_body: bool,
304 depth: usize,
305) -> Result<serde_json::Value, NormalizationError> {
306 if depth > MAX_NESTING_DEPTH {
307 return Err(NormalizationError::NestingTooDeep {
308 depth,
309 max: MAX_NESTING_DEPTH,
310 });
311 }
312
313 match value {
314 serde_json::Value::String(s) => {
315 Ok(serde_json::Value::String(normalize_string(&s, is_body)))
316 }
317 serde_json::Value::Array(arr) => {
318 let normalized: Result<Vec<_>, _> = arr
319 .into_iter()
320 .map(|v| normalize_json_value_inner(v, false, depth + 1))
321 .collect();
322 Ok(serde_json::Value::Array(normalized?))
323 }
324 serde_json::Value::Object(map) => {
325 let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
326 .into_iter()
327 .map(|(k, v)| {
328 let is_body = k == BODY_FIELD;
329 normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
330 })
331 .collect();
332 Ok(serde_json::Value::Object(processed?))
333 }
334 // Pass through other types unchanged (numbers, booleans, null)
335 other => Ok(other),
336 }
337}
338
339/// Recursively normalize a JSON value.
340///
341/// This is a convenience wrapper that starts depth tracking at 0.
342/// Logs a warning and returns the original value if depth is exceeded.
343fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
344 match normalize_json_value_inner(value.clone(), is_body, 0) {
345 Ok(normalized) => normalized,
346 Err(e) => {
347 // Log warning but don't fail - return original value
348 eprintln!("Warning: {}", e);
349 value
350 }
351 }
352}
353
354/// Normalizes document fields by applying all preprocessing steps.
355///
356/// This function orchestrates input normalization for document fields:
357/// 1. Strips Unicode bidirectional formatting characters from all string values
358/// 2. For the body field: fixes HTML comment fences to preserve trailing text
359///
360/// Double chevrons (`<<` and `>>`) are passed through unchanged in all fields.
361///
362/// # Processing Order
363///
364/// The normalization order is important:
365/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
366/// 2. **HTML comment fence fixing** - Ensures text after `-->` is preserved
367///
368/// # Examples
369///
370/// ```
371/// use quillmark_core::normalize::normalize_fields;
372/// use quillmark_core::QuillValue;
373/// use std::collections::HashMap;
374///
375/// let mut fields = HashMap::new();
376/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
377/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
378///
379/// let result = normalize_fields(fields);
380///
381/// // Title has chevrons preserved (only bidi stripped)
382/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
383///
384/// // Body has bidi chars stripped, chevrons preserved
385/// assert_eq!(result.get("BODY").unwrap().as_str().unwrap(), "**bold** **more**");
386/// ```
387pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
388 fields
389 .into_iter()
390 .map(|(key, value)| {
391 // Normalize field name to NFC form for consistent key comparison
392 // This ensures café (composed) and café (decomposed) are treated as the same key
393 let normalized_key = normalize_field_name(&key);
394 let json = value.into_json();
395 // Treat as body if it's the BODY field (applies HTML comment fence fixes)
396 let treat_as_body = normalized_key == BODY_FIELD;
397 let processed = normalize_json_value(json, treat_as_body);
398 (normalized_key, QuillValue::from_json(processed))
399 })
400 .collect()
401}
402
403/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
404///
405/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
406/// are treated as identical field names, preventing subtle bugs where visually
407/// identical keys are treated as different.
408///
409/// # Examples
410///
411/// ```
412/// use quillmark_core::normalize::normalize_field_name;
413///
414/// // Composed form (single code point for é)
415/// let composed = "café";
416/// // Decomposed form (e + combining acute accent)
417/// let decomposed = "cafe\u{0301}";
418///
419/// // Both normalize to the same NFC form
420/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
421/// ```
422pub fn normalize_field_name(name: &str) -> String {
423 name.nfc().collect()
424}
425
/// Normalizes a parsed document by applying all field-level normalizations.
///
/// This is the **primary entry point** for normalizing documents after parsing.
/// It ensures consistent processing regardless of how the document was created.
///
/// # Normalization Steps
///
/// This function applies all normalizations in the correct order:
/// 1. **Unicode NFC normalization** - Field names are normalized to NFC form
/// 2. **Bidi stripping** - Invisible bidirectional control characters are removed
/// 3. **HTML comment fence fixing** - Trailing text after `-->` is preserved (body only)
///
/// Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
///
/// # Errors
///
/// Currently this function always returns `Ok`; the `Result` return type is
/// kept so that future normalization steps can report failures without
/// breaking the API.
///
/// # When to Use
///
/// Call this function after parsing and before rendering:
///
/// ```no_run
/// use quillmark_core::{ParsedDocument, normalize::normalize_document};
///
/// let markdown = "---\ntitle: Example\n---\n\nBody with <<placeholder>>";
/// let doc = ParsedDocument::from_markdown(markdown).unwrap();
/// let normalized = normalize_document(doc).unwrap();
/// // Use normalized document for rendering...
/// ```
///
/// # Direct API Usage
///
/// If you're constructing a `ParsedDocument` directly via [`crate::parse::ParsedDocument::new`]
/// rather than parsing from markdown, you **MUST** call this function to ensure
/// consistent normalization:
///
/// ```
/// use quillmark_core::{ParsedDocument, QuillValue, normalize::normalize_document};
/// use std::collections::HashMap;
///
/// // Direct construction (e.g., from API or database)
/// let mut fields = HashMap::new();
/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("Test")));
/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("<<content>>")));
///
/// let doc = ParsedDocument::new(fields);
/// let normalized = normalize_document(doc).expect("Failed to normalize document");
///
/// // Body has chevrons preserved
/// assert_eq!(normalized.body().unwrap(), "<<content>>");
/// ```
///
/// # Idempotency
///
/// This function is idempotent - calling it multiple times produces the same result.
/// However, for performance reasons, avoid unnecessary repeated calls.
pub fn normalize_document(
    doc: crate::parse::ParsedDocument,
) -> Result<crate::parse::ParsedDocument, crate::error::ParseError> {
    // Fields are cloned out of the document, normalized, and rebuilt into a
    // fresh document that keeps the original quill reference.
    let normalized_fields = normalize_fields(doc.fields().clone());
    Ok(crate::parse::ParsedDocument::with_quill_ref(
        normalized_fields,
        doc.quill_reference().clone(),
    ))
}
488
#[cfg(test)]
mod tests {
    use super::*;

    // Tests for strip_bidi_formatting

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        // Every character handled by is_bidi_char, in one string
        let all_bidi = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        // U+061C ARABIC LETTER MARK (ALM) should be stripped
        assert_eq!(strip_bidi_formatting("hello\u{061C}world"), "helloworld");
        assert_eq!(strip_bidi_formatting("\u{061C}**bold**"), "**bold**");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // Tests for fix_html_comment_fences

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text on same line as --> should be moved to next line
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // Already has newline after --> - no change
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Only whitespace after --> until newline - no change needed
        assert_eq!(
            fix_html_comment_fences("<!-- comment --> \nSome text"),
            "<!-- comment --> \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment with text on closing line
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment with proper newline after -->
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Multiple comments in the same document
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // Comment at end of string - no trailing content
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Just a comment with nothing after
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // --> that's not part of a comment (standalone)
        // Should NOT be touched by the context-aware fixer
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // Nested openers are just text inside the comment
        // <!-- <!-- -->Trailing
        // The first <!-- opens, the first --> closes.
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer without opener
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Mixed valid and invalid comments
        // <!-- valid -->FixMe
        // text --> Ignore
        // <!-- valid2 -->FixMe2
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF line endings
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_single_line() {
        // `<!--- --->` openers/closers: the extra hyphen belongs to the comment
        assert_eq!(
            fix_html_comment_fences("<!--- comment --->Trailing text"),
            "<!--- comment --->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_multiline() {
        assert_eq!(
            fix_html_comment_fences("<!---\ncomment\n--->Trailing text"),
            "<!---\ncomment\n--->\nTrailing text"
        );
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")),
        );

        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_fields_body_chevrons_preserved() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>>")),
        );

        let result = normalize_fields(fields);
        // Chevrons are passed through unchanged
        assert_eq!(result.get(BODY_FIELD).unwrap().as_str().unwrap(), "<<raw>>");
    }

    #[test]
    fn test_normalize_fields_body_chevrons_and_bidi() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>> \u{202D}**bold**")),
        );

        let result = normalize_fields(fields);
        // Bidi stripped, chevrons preserved
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<raw>> **bold**"
        );
    }

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("<<hello>>")),
        );

        let result = normalize_fields(fields);
        // Chevrons are passed through unchanged
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("he\u{202D}llo")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_nested_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "items".to_string(),
            QuillValue::from_json(serde_json::json!(["<<a>>", "\u{202D}b"])),
        );

        let result = normalize_fields(fields);
        let items = result.get("items").unwrap().as_array().unwrap();
        // Chevrons are preserved, bidi stripped
        assert_eq!(items[0].as_str().unwrap(), "<<a>>");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    #[test]
    fn test_normalize_fields_object_values() {
        // A BODY key nested inside another field still gets body treatment
        let mut fields = HashMap::new();
        fields.insert(
            "meta".to_string(),
            QuillValue::from_json(serde_json::json!({
                "title": "<<hello>>",
                BODY_FIELD: "<<content>>"
            })),
        );

        let result = normalize_fields(fields);
        let meta = result.get("meta").unwrap();
        let meta_obj = meta.as_object().unwrap();
        // Chevrons are preserved in all fields
        assert_eq!(
            meta_obj.get("title").unwrap().as_str().unwrap(),
            "<<hello>>"
        );
        assert_eq!(
            meta_obj.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<content>>"
        );
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = HashMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // Create a deeply nested JSON structure that exceeds MAX_NESTING_DEPTH
        let mut value = serde_json::json!("leaf");
        for _ in 0..=crate::error::MAX_NESTING_DEPTH {
            value = serde_json::json!([value]);
        }

        // The inner function should return an error
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_err());

        if let Err(NormalizationError::NestingTooDeep { depth, max }) = result {
            assert!(depth > max);
            assert_eq!(max, crate::error::MAX_NESTING_DEPTH);
        } else {
            panic!("Expected NestingTooDeep error");
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        // Create a nested structure just within the limit
        let mut value = serde_json::json!("leaf");
        for _ in 0..50 {
            value = serde_json::json!([value]);
        }

        // This should succeed
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_ok());
    }

    // Tests for normalize_document

    #[test]
    fn test_normalize_document_basic() {
        use crate::parse::ParsedDocument;

        let mut fields = std::collections::HashMap::new();
        fields.insert(
            "title".to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<placeholder>>")),
        );
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>> \u{202D}**bold**")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized = super::normalize_document(doc).unwrap();

        // Title has chevrons preserved (only bidi stripped)
        assert_eq!(
            normalized.get_field("title").unwrap().as_str().unwrap(),
            "<<placeholder>>"
        );

        // Body has bidi stripped, chevrons preserved
        assert_eq!(normalized.body().unwrap(), "<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        use crate::parse::ParsedDocument;
        use crate::version::QuillReference;
        use std::str::FromStr;

        let fields = std::collections::HashMap::new();
        let quill_ref = QuillReference::from_str("custom_quill").unwrap();
        let doc = ParsedDocument::with_quill_ref(fields, quill_ref);
        let normalized = super::normalize_document(doc).unwrap();

        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        use crate::parse::ParsedDocument;

        let mut fields = std::collections::HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>>")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized_once = super::normalize_document(doc).unwrap();
        let normalized_twice = super::normalize_document(normalized_once.clone()).unwrap();

        // Calling normalize_document twice should produce the same result
        assert_eq!(
            normalized_once.body().unwrap(),
            normalized_twice.body().unwrap()
        );
    }
}