quillmark_core/normalize.rs
1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Orchestrate guillemet preprocessing (`<<text>>` → `«text»`)
14//! - Apply all normalizations in the correct order
15//!
16//! ## Functions
17//!
18//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
19//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
20//! - [`normalize_fields`] - Normalize document fields (bidi + guillemets)
21//!
22//! ## Why Normalize?
23//!
24//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
25//! control characters used for bidirectional text layout. When placed adjacent to markdown
26//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
27//!
28//! ```text
29//! **bold** or <U+202D>**(1234**
30//! ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
31//! ```
32//!
33//! These characters commonly appear when copying text from:
34//! - Web pages with mixed LTR/RTL content
35//! - PDF documents
36//! - Word processors
37//! - Some clipboard managers
38//!
39//! ## Examples
40//!
41//! ```
42//! use quillmark_core::normalize::strip_bidi_formatting;
43//!
44//! // Input with invisible U+202D (LRO) before second **
45//! let input = "**asdf** or \u{202D}**(1234**";
46//! let cleaned = strip_bidi_formatting(input);
47//! assert_eq!(cleaned, "**asdf** or **(1234**");
48//! ```
49
50use crate::error::MAX_NESTING_DEPTH;
51use crate::guillemet::{preprocess_markdown_guillemets, strip_chevrons};
52use crate::parse::BODY_FIELD;
53use crate::value::QuillValue;
54use std::collections::HashMap;
55use unicode_normalization::UnicodeNormalization;
56
/// Errors that can occur during normalization.
///
/// Derives `thiserror::Error`, so each variant carries its own user-facing
/// message via the `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum NormalizationError {
    /// JSON nesting depth exceeded maximum allowed.
    ///
    /// Raised by the depth-tracked recursion in this module to guard against
    /// stack overflow on adversarially deep input.
    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
    NestingTooDeep {
        /// Actual depth at which the limit was detected
        depth: usize,
        /// Maximum allowed depth (`MAX_NESTING_DEPTH`)
        max: usize,
    },
}
69
/// Check if a character is a Unicode bidirectional formatting character.
///
/// The eleven bidi controls fall into three contiguous code-point ranges:
/// - U+200E..=U+200F — LRM, RLM (directional marks)
/// - U+202A..=U+202E — LRE, RLE, PDF, LRO, RLO (embeddings/overrides)
/// - U+2066..=U+2069 — LRI, RLI, FSI, PDI (isolates)
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{200E}'..='\u{200F}' | '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}'
    )
}
88
89/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
90///
91/// These invisible control characters are used for bidirectional text layout but can
92/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
93///
94/// # Characters Stripped
95///
96/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
97/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
98/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
99/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
100/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
101/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
102/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
103/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
104/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
105/// - U+2068 (FIRST STRONG ISOLATE, FSI)
106/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
107///
108/// # Examples
109///
110/// ```
111/// use quillmark_core::normalize::strip_bidi_formatting;
112///
113/// // Normal text is unchanged
114/// assert_eq!(strip_bidi_formatting("hello"), "hello");
115///
116/// // LRO character is stripped
117/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
118///
119/// // All bidi characters are stripped
120/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
121/// assert_eq!(strip_bidi_formatting(input), "");
122/// ```
123pub fn strip_bidi_formatting(s: &str) -> String {
124 // Early return optimization: avoid allocation if no bidi characters present
125 if !s.chars().any(is_bidi_char) {
126 return s.to_string();
127 }
128
129 s.chars().filter(|c| !is_bidi_char(*c)).collect()
130}
131
/// Fixes HTML comment closing fences to prevent content loss.
///
/// According to CommonMark, HTML block type 2 (comments) ends with the line containing `-->`.
/// This means any text on the same line after `-->` is included in the HTML block and would
/// be discarded by markdown parsers that ignore HTML blocks.
///
/// This function inserts a newline after `-->` when followed by non-whitespace content,
/// ensuring the trailing text is parsed as regular markdown. A `-->` without a preceding
/// `<!--` opener is left alone.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment --> \nSome text"),
///     "<!-- comment --> \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    // Cheap rejection: nothing to do when no closing fence exists at all.
    if !s.contains("-->") {
        return s.to_string();
    }

    // Context-aware scan: only a `-->` that closes a real `<!--` opener is fixed.
    let mut out = String::with_capacity(s.len() + 16);
    let mut cursor = 0;

    while let Some(rel_open) = s[cursor..].find("<!--") {
        let open_at = cursor + rel_open;

        // Locate the closer at or after the opener; an unclosed comment
        // swallows the rest of the input verbatim.
        let fence_end = match s[open_at..].find("-->") {
            Some(rel_close) => open_at + rel_close + 3,
            None => {
                out.push_str(&s[cursor..]);
                return out;
            }
        };

        // Copy everything through the closing fence, then decide whether the
        // remainder of that line carries real content.
        out.push_str(&s[cursor..fence_end]);
        cursor = fence_end;

        let rest = &s[cursor..];
        // First "line" after the fence: empty when rest is empty or starts
        // with '\n'; a lone '\r' from CRLF is removed by trim().
        let line_tail = rest.split('\n').next().unwrap_or("");
        if !line_tail.trim().is_empty() {
            out.push('\n');
        }
    }

    // Text after the last closed comment.
    out.push_str(&s[cursor..]);
    out
}
232
233/// Normalizes markdown content by applying all preprocessing steps.
234///
235/// This function applies normalizations in the correct order:
236/// 1. Strip Unicode bidirectional formatting characters
237/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
238///
239/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
240/// in [`normalize_fields`] because it needs to be applied after schema defaults
241/// and coercion.
242///
243/// # Examples
244///
245/// ```
246/// use quillmark_core::normalize::normalize_markdown;
247///
248/// // Bidi characters are stripped
249/// let input = "**bold** \u{202D}**more**";
250/// let normalized = normalize_markdown(input);
251/// assert_eq!(normalized, "**bold** **more**");
252///
253/// // HTML comment trailing text is preserved
254/// let with_comment = "<!-- comment -->Some text";
255/// let normalized = normalize_markdown(with_comment);
256/// assert_eq!(normalized, "<!-- comment -->\nSome text");
257/// ```
258pub fn normalize_markdown(markdown: &str) -> String {
259 let cleaned = strip_bidi_formatting(markdown);
260 fix_html_comment_fences(&cleaned)
261}
262
263/// Normalizes a string value by stripping bidi characters and optionally processing guillemets.
264///
265/// - For body content: applies `preprocess_markdown_guillemets` (converts `<<text>>` to `«text»`)
266/// and `fix_html_comment_fences` to preserve text after `-->`
267/// - For other fields: applies `strip_chevrons` (removes chevrons entirely)
268fn normalize_string(s: &str, is_body: bool) -> String {
269 // First strip bidi formatting characters
270 let cleaned = strip_bidi_formatting(s);
271
272 // Then apply content-specific normalization
273 if is_body {
274 // Fix HTML comment fences first, then convert guillemets
275 let fixed = fix_html_comment_fences(&cleaned);
276 preprocess_markdown_guillemets(&fixed)
277 } else {
278 strip_chevrons(&cleaned)
279 }
280}
281
282/// Recursively normalize a JSON value with depth tracking.
283///
284/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
285fn normalize_json_value_inner(
286 value: serde_json::Value,
287 is_body: bool,
288 depth: usize,
289) -> Result<serde_json::Value, NormalizationError> {
290 if depth > MAX_NESTING_DEPTH {
291 return Err(NormalizationError::NestingTooDeep {
292 depth,
293 max: MAX_NESTING_DEPTH,
294 });
295 }
296
297 match value {
298 serde_json::Value::String(s) => {
299 Ok(serde_json::Value::String(normalize_string(&s, is_body)))
300 }
301 serde_json::Value::Array(arr) => {
302 let normalized: Result<Vec<_>, _> = arr
303 .into_iter()
304 .map(|v| normalize_json_value_inner(v, false, depth + 1))
305 .collect();
306 Ok(serde_json::Value::Array(normalized?))
307 }
308 serde_json::Value::Object(map) => {
309 let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
310 .into_iter()
311 .map(|(k, v)| {
312 let is_body = k == BODY_FIELD;
313 normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
314 })
315 .collect();
316 Ok(serde_json::Value::Object(processed?))
317 }
318 // Pass through other types unchanged (numbers, booleans, null)
319 other => Ok(other),
320 }
321}
322
323/// Recursively normalize a JSON value.
324///
325/// This is a convenience wrapper that starts depth tracking at 0.
326/// Logs a warning and returns the original value if depth is exceeded.
327fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
328 match normalize_json_value_inner(value.clone(), is_body, 0) {
329 Ok(normalized) => normalized,
330 Err(e) => {
331 // Log warning but don't fail - return original value
332 eprintln!("Warning: {}", e);
333 value
334 }
335 }
336}
337
338/// Normalizes document fields by applying all preprocessing steps.
339///
340/// This function orchestrates input normalization for document fields:
341/// 1. Strips Unicode bidirectional formatting characters from all string values
342/// 2. For the body field: converts `<<text>>` to `«text»` (guillemets)
343/// 3. For other fields: strips chevrons entirely (`<<text>>` → `text`)
344///
345/// # Processing Order
346///
347/// The normalization order is important:
348/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
349/// 2. **Guillemet preprocessing** - Converts user syntax to internal markers
350///
351/// # Examples
352///
353/// ```
354/// use quillmark_core::normalize::normalize_fields;
355/// use quillmark_core::QuillValue;
356/// use std::collections::HashMap;
357///
358/// let mut fields = HashMap::new();
359/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
360/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
361///
362/// let result = normalize_fields(fields);
363///
364/// // Title has chevrons stripped
365/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
366///
367/// // Body has bidi chars stripped (guillemet would apply if there were any <<>>)
368/// assert_eq!(result.get("BODY").unwrap().as_str().unwrap(), "**bold** **more**");
369/// ```
370pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
371 fields
372 .into_iter()
373 .map(|(key, value)| {
374 // Normalize field name to NFC form for consistent key comparison
375 // This ensures café (composed) and café (decomposed) are treated as the same key
376 let normalized_key = normalize_field_name(&key);
377 let json = value.into_json();
378 let processed = normalize_json_value(json, normalized_key == BODY_FIELD);
379 (normalized_key, QuillValue::from_json(processed))
380 })
381 .collect()
382}
383
384/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
385///
386/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
387/// are treated as identical field names, preventing subtle bugs where visually
388/// identical keys are treated as different.
389///
390/// # Examples
391///
392/// ```
393/// use quillmark_core::normalize::normalize_field_name;
394///
395/// // Composed form (single code point for é)
396/// let composed = "café";
397/// // Decomposed form (e + combining acute accent)
398/// let decomposed = "cafe\u{0301}";
399///
400/// // Both normalize to the same NFC form
401/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
402/// ```
403pub fn normalize_field_name(name: &str) -> String {
404 name.nfc().collect()
405}
406
407/// Normalizes a parsed document by applying all field-level normalizations.
408///
409/// This is the **primary entry point** for normalizing documents after parsing.
410/// It ensures consistent processing regardless of how the document was created.
411///
412/// # Normalization Steps
413///
414/// This function applies all normalizations in the correct order:
415/// 1. **Unicode NFC normalization** - Field names are normalized to NFC form
416/// 2. **Bidi stripping** - Invisible bidirectional control characters are removed
417/// 3. **HTML comment fence fixing** - Trailing text after `-->` is preserved
418/// 4. **Guillemet conversion** - `<<text>>` is converted to `«text»` in BODY fields
419/// 5. **Chevron stripping** - `<<text>>` is stripped to `text` in other fields
420///
421/// # When to Use
422///
423/// Call this function after parsing and before rendering:
424///
425/// ```no_run
426/// use quillmark_core::{ParsedDocument, normalize::normalize_document};
427///
428/// let markdown = "---\ntitle: Example\n---\n\nBody with <<placeholder>>";
429/// let doc = ParsedDocument::from_markdown(markdown).unwrap();
430/// let normalized = normalize_document(doc);
431/// // Use normalized document for rendering...
432/// ```
433///
434/// # Direct API Usage
435///
436/// If you're constructing a `ParsedDocument` directly via [`crate::parse::ParsedDocument::new`]
437/// rather than parsing from markdown, you **MUST** call this function to ensure
438/// consistent normalization:
439///
440/// ```
441/// use quillmark_core::{ParsedDocument, QuillValue, normalize::normalize_document};
442/// use std::collections::HashMap;
443///
444/// // Direct construction (e.g., from API or database)
445/// let mut fields = HashMap::new();
446/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("Test")));
447/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("<<content>>")));
448///
449/// let doc = ParsedDocument::new(fields);
450/// let normalized = normalize_document(doc);
451///
452/// // Body now has guillemets converted
453/// assert_eq!(normalized.body().unwrap(), "«content»");
454/// ```
455///
456/// # Idempotency
457///
458/// This function is idempotent - calling it multiple times produces the same result.
459/// However, for performance reasons, avoid unnecessary repeated calls.
460pub fn normalize_document(doc: crate::parse::ParsedDocument) -> crate::parse::ParsedDocument {
461 let normalized_fields = normalize_fields(doc.fields().clone());
462 crate::parse::ParsedDocument::with_quill_tag(normalized_fields, doc.quill_tag().to_string())
463}
464
#[cfg(test)]
mod tests {
    use super::*;

    // Tests for strip_bidi_formatting

    // Inputs without bidi controls must come back unchanged (fast path).
    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    // All eleven bidi controls in one string reduce to empty.
    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // Tests for fix_html_comment_fences

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text on same line as --> should be moved to next line
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // Already has newline after --> - no change
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Only whitespace after --> until newline - no change needed
        assert_eq!(
            fix_html_comment_fences("<!-- comment --> \nSome text"),
            "<!-- comment --> \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment with text on closing line
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment with proper newline after -->
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Multiple comments in the same document
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // Comment at end of string - no trailing content
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Just a comment with nothing after
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // --> that's not part of a comment (standalone)
        // Should NOT be touched by the context-aware fixer
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // Nested openers are just text inside the comment
        // <!-- <!-- -->Trailing
        // The first <!-- opens, the first --> closes.
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer without opener
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Mixed valid and invalid comments
        // <!-- valid -->FixMe
        // text --> Ignore
        // <!-- valid2 -->FixMe2
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF line endings
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")),
        );

        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "**bold** **more**"
        );
    }

    // Body fields convert chevrons to guillemets (not stripped).
    #[test]
    fn test_normalize_fields_body_guillemets() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>>")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get(BODY_FIELD).unwrap().as_str().unwrap(), "«raw»");
    }

    #[test]
    fn test_normalize_fields_body_both() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>> \u{202D}**bold**")),
        );

        let result = normalize_fields(fields);
        // Bidi stripped first, then guillemets converted
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "«raw» **bold**"
        );
    }

    // Non-body fields strip chevrons entirely.
    #[test]
    fn test_normalize_fields_other_field_chevrons_stripped() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("<<hello>>")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("he\u{202D}llo")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    // Normalization recurses into arrays.
    #[test]
    fn test_normalize_fields_nested_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "items".to_string(),
            QuillValue::from_json(serde_json::json!(["<<a>>", "\u{202D}b"])),
        );

        let result = normalize_fields(fields);
        let items = result.get("items").unwrap().as_array().unwrap();
        assert_eq!(items[0].as_str().unwrap(), "a");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    // Normalization recurses into objects; a nested BODY key gets body treatment.
    #[test]
    fn test_normalize_fields_object_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "meta".to_string(),
            QuillValue::from_json(serde_json::json!({
                "title": "<<hello>>",
                BODY_FIELD: "<<content>>"
            })),
        );

        let result = normalize_fields(fields);
        let meta = result.get("meta").unwrap();
        let meta_obj = meta.as_object().unwrap();
        // Nested "BODY" key should be recognized
        assert_eq!(meta_obj.get("title").unwrap().as_str().unwrap(), "hello");
        assert_eq!(
            meta_obj.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "«content»"
        );
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = HashMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // Create a deeply nested JSON structure that exceeds MAX_NESTING_DEPTH
        let mut value = serde_json::json!("leaf");
        for _ in 0..=crate::error::MAX_NESTING_DEPTH {
            value = serde_json::json!([value]);
        }

        // The inner function should return an error
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_err());

        if let Err(NormalizationError::NestingTooDeep { depth, max }) = result {
            assert!(depth > max);
            assert_eq!(max, crate::error::MAX_NESTING_DEPTH);
        } else {
            panic!("Expected NestingTooDeep error");
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        // Create a nested structure just within the limit
        let mut value = serde_json::json!("leaf");
        for _ in 0..50 {
            value = serde_json::json!([value]);
        }

        // This should succeed
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_ok());
    }

    // Tests for normalize_document

    #[test]
    fn test_normalize_document_basic() {
        use crate::parse::ParsedDocument;

        let mut fields = std::collections::HashMap::new();
        fields.insert(
            "title".to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<placeholder>>")),
        );
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>> \u{202D}**bold**")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized = super::normalize_document(doc);

        // Title has chevrons stripped
        assert_eq!(
            normalized.get_field("title").unwrap().as_str().unwrap(),
            "placeholder"
        );

        // Body has guillemets converted and bidi stripped
        assert_eq!(normalized.body().unwrap(), "«content» **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        use crate::parse::ParsedDocument;

        let fields = std::collections::HashMap::new();
        let doc = ParsedDocument::with_quill_tag(fields, "custom_quill".to_string());
        let normalized = super::normalize_document(doc);

        assert_eq!(normalized.quill_tag(), "custom_quill");
    }

    // Running normalization twice must not change the result further.
    #[test]
    fn test_normalize_document_idempotent() {
        use crate::parse::ParsedDocument;

        let mut fields = std::collections::HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>>")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized_once = super::normalize_document(doc);
        let normalized_twice = super::normalize_document(normalized_once.clone());

        // Calling normalize_document twice should produce the same result
        assert_eq!(
            normalized_once.body().unwrap(),
            normalized_twice.body().unwrap()
        );
    }
}
896}