quillmark_core/
normalize.rs

1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Fix HTML comment fences to preserve trailing text
14//! - Apply all normalizations in the correct order
15//!
16//! Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
17//!
18//! ## Functions
19//!
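//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
//! - [`fix_html_comment_fences`] - Move text that follows `-->` onto its own line
//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
//! - [`normalize_field_name`] - Normalize a field name to Unicode NFC
//! - [`normalize_fields`] - Normalize document fields (NFC keys, bidi stripping, body comment fences)
//! - [`normalize_document`] - Normalize every field of a parsed document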
23//!
24//! ## Why Normalize?
25//!
26//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
27//! control characters used for bidirectional text layout. When placed adjacent to markdown
28//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
29//!
30//! ```text
31//! **bold** or <U+202D>**(1234**
32//!             ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
33//! ```
34//!
35//! These characters commonly appear when copying text from:
36//! - Web pages with mixed LTR/RTL content
37//! - PDF documents
38//! - Word processors
39//! - Some clipboard managers
40//!
41//! ## Examples
42//!
43//! ```
44//! use quillmark_core::normalize::strip_bidi_formatting;
45//!
46//! // Input with invisible U+202D (LRO) before second **
47//! let input = "**asdf** or \u{202D}**(1234**";
48//! let cleaned = strip_bidi_formatting(input);
49//! assert_eq!(cleaned, "**asdf** or **(1234**");
50//! ```
51
52use crate::error::MAX_NESTING_DEPTH;
53use crate::parse::BODY_FIELD;
54use crate::value::QuillValue;
55use std::collections::HashMap;
56use unicode_normalization::UnicodeNormalization;
57
58/// Errors that can occur during normalization
59#[derive(Debug, thiserror::Error)]
60pub enum NormalizationError {
61    /// JSON nesting depth exceeded maximum allowed
62    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
63    NestingTooDeep {
64        /// Actual depth
65        depth: usize,
66        /// Maximum allowed depth
67        max: usize,
68    },
69}
70
/// Reports whether `c` is a Unicode bidirectional formatting character.
///
/// The recognized code points form one single value and three contiguous runs:
/// U+061C, U+200E..=U+200F, U+202A..=U+202E and U+2066..=U+2069.
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        // ARABIC LETTER MARK (ALM)
        '\u{061C}'
        // LEFT-TO-RIGHT MARK (LRM), RIGHT-TO-LEFT MARK (RLM)
        | '\u{200E}'..='\u{200F}'
        // Embeddings and overrides: LRE, RLE, PDF, LRO, RLO
        | '\u{202A}'..='\u{202E}'
        // Isolates: LRI, RLI, FSI, PDI
        | '\u{2066}'..='\u{2069}'
    )
}
90
91/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
92///
93/// These invisible control characters are used for bidirectional text layout but can
94/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
95///
96/// # Characters Stripped
97///
98/// - U+061C (ARABIC LETTER MARK, ALM)
99/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
100/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
101/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
102/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
103/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
104/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
105/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
106/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
107/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
108/// - U+2068 (FIRST STRONG ISOLATE, FSI)
109/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
110///
111/// # Examples
112///
113/// ```
114/// use quillmark_core::normalize::strip_bidi_formatting;
115///
116/// // Normal text is unchanged
117/// assert_eq!(strip_bidi_formatting("hello"), "hello");
118///
119/// // LRO character is stripped
120/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
121///
122/// // All bidi characters are stripped
123/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
124/// assert_eq!(strip_bidi_formatting(input), "");
125/// ```
126pub fn strip_bidi_formatting(s: &str) -> String {
127    // Early return optimization: avoid allocation if no bidi characters present
128    if !s.chars().any(is_bidi_char) {
129        return s.to_string();
130    }
131
132    s.chars().filter(|c| !is_bidi_char(*c)).collect()
133}
134
/// Moves text that follows an HTML comment's closing fence onto its own line.
///
/// According to CommonMark, HTML block type 2 (comments) ends with the line containing `-->`.
/// Text that shares that line with `-->` therefore belongs to the HTML block and is lost
/// by markdown parsers that ignore HTML blocks.
///
/// For every `<!--` opener that has a `-->` closer, a `\n` is inserted directly after the
/// closer when the remainder of that line contains non-whitespace characters. The closer is
/// searched for starting at the opener itself. A `-->` with no opener before it is left
/// alone, and an opener without a closer ends processing with the rest of the input copied
/// unchanged.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->   \nSome text"),
///     "<!-- comment -->   \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    // Without any closing fence there is nothing that could need fixing.
    if !s.contains(CLOSER) {
        return s.to_owned();
    }

    let mut fixed = String::with_capacity(s.len() + 16);
    // Part of the input that has not been copied into `fixed` yet.
    let mut pending = s;

    while let Some(open_at) = pending.find(OPENER) {
        let close_at = match pending[open_at..].find(CLOSER) {
            Some(offset) => open_at + offset,
            // Unclosed comment: the leftover text is appended verbatim below.
            None => break,
        };

        // Copy everything up to and including the closing fence.
        let (through_fence, tail) = pending.split_at(close_at + CLOSER.len());
        fixed.push_str(through_fence);

        // Only the remainder of the fence's own line matters. An empty or
        // whitespace-only remainder (which includes the `\r` of a CRLF line
        // ending) needs no extra line break.
        let rest_of_line = tail.find('\n').map_or(tail, |newline_at| &tail[..newline_at]);
        if !rest_of_line.trim().is_empty() {
            fixed.push('\n');
        }

        pending = tail;
    }

    fixed.push_str(pending);
    fixed
}
235
236/// Normalizes markdown content by applying all preprocessing steps.
237///
238/// This function applies normalizations in the correct order:
239/// 1. Strip Unicode bidirectional formatting characters
240/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
241///
242/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
243/// in [`normalize_fields`] because it needs to be applied after schema defaults
244/// and coercion.
245///
246/// # Examples
247///
248/// ```
249/// use quillmark_core::normalize::normalize_markdown;
250///
251/// // Bidi characters are stripped
252/// let input = "**bold** \u{202D}**more**";
253/// let normalized = normalize_markdown(input);
254/// assert_eq!(normalized, "**bold** **more**");
255///
256/// // HTML comment trailing text is preserved
257/// let with_comment = "<!-- comment -->Some text";
258/// let normalized = normalize_markdown(with_comment);
259/// assert_eq!(normalized, "<!-- comment -->\nSome text");
260/// ```
261pub fn normalize_markdown(markdown: &str) -> String {
262    let cleaned = strip_bidi_formatting(markdown);
263    fix_html_comment_fences(&cleaned)
264}
265
266/// Normalizes a string value by stripping bidi characters and fixing HTML comment fences.
267///
268/// - For body content: applies `fix_html_comment_fences` to preserve text after `-->`
269/// - For other fields: strips bidi characters only
270///
271/// Double chevrons (`<<` and `>>`) are passed through untouched without conversion to
272/// guillemets. This preserves the original delimiter syntax in the output.
273fn normalize_string(s: &str, is_body: bool) -> String {
274    // First strip bidi formatting characters
275    let cleaned = strip_bidi_formatting(s);
276
277    // Then apply content-specific normalization
278    if is_body {
279        // Fix HTML comment fences (chevrons pass through unchanged)
280        fix_html_comment_fences(&cleaned)
281    } else {
282        // Non-body fields: just return cleaned string (chevrons pass through unchanged)
283        cleaned
284    }
285}
286
287/// Recursively normalize a JSON value with depth tracking.
288///
289/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
290fn normalize_json_value_inner(
291    value: serde_json::Value,
292    is_body: bool,
293    depth: usize,
294) -> Result<serde_json::Value, NormalizationError> {
295    if depth > MAX_NESTING_DEPTH {
296        return Err(NormalizationError::NestingTooDeep {
297            depth,
298            max: MAX_NESTING_DEPTH,
299        });
300    }
301
302    match value {
303        serde_json::Value::String(s) => {
304            Ok(serde_json::Value::String(normalize_string(&s, is_body)))
305        }
306        serde_json::Value::Array(arr) => {
307            let normalized: Result<Vec<_>, _> = arr
308                .into_iter()
309                .map(|v| normalize_json_value_inner(v, false, depth + 1))
310                .collect();
311            Ok(serde_json::Value::Array(normalized?))
312        }
313        serde_json::Value::Object(map) => {
314            let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
315                .into_iter()
316                .map(|(k, v)| {
317                    let is_body = k == BODY_FIELD;
318                    normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
319                })
320                .collect();
321            Ok(serde_json::Value::Object(processed?))
322        }
323        // Pass through other types unchanged (numbers, booleans, null)
324        other => Ok(other),
325    }
326}
327
328/// Recursively normalize a JSON value.
329///
330/// This is a convenience wrapper that starts depth tracking at 0.
331/// Logs a warning and returns the original value if depth is exceeded.
332fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
333    match normalize_json_value_inner(value.clone(), is_body, 0) {
334        Ok(normalized) => normalized,
335        Err(e) => {
336            // Log warning but don't fail - return original value
337            eprintln!("Warning: {}", e);
338            value
339        }
340    }
341}
342
343/// Normalizes document fields by applying all preprocessing steps.
344///
345/// This function orchestrates input normalization for document fields:
346/// 1. Strips Unicode bidirectional formatting characters from all string values
347/// 2. For the body field: fixes HTML comment fences to preserve trailing text
348///
349/// Double chevrons (`<<` and `>>`) are passed through unchanged in all fields.
350///
351/// # Processing Order
352///
353/// The normalization order is important:
354/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
355/// 2. **HTML comment fence fixing** - Ensures text after `-->` is preserved
356///
357/// # Examples
358///
359/// ```
360/// use quillmark_core::normalize::normalize_fields;
361/// use quillmark_core::QuillValue;
362/// use std::collections::HashMap;
363///
364/// let mut fields = HashMap::new();
365/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
366/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
367///
368/// let result = normalize_fields(fields);
369///
370/// // Title has chevrons preserved (only bidi stripped)
371/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
372///
373/// // Body has bidi chars stripped, chevrons preserved
374/// assert_eq!(result.get("BODY").unwrap().as_str().unwrap(), "**bold** **more**");
375/// ```
376pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
377    fields
378        .into_iter()
379        .map(|(key, value)| {
380            // Normalize field name to NFC form for consistent key comparison
381            // This ensures café (composed) and café (decomposed) are treated as the same key
382            let normalized_key = normalize_field_name(&key);
383            let json = value.into_json();
384            // Treat as body if it's the BODY field (applies HTML comment fence fixes)
385            let treat_as_body = normalized_key == BODY_FIELD;
386            let processed = normalize_json_value(json, treat_as_body);
387            (normalized_key, QuillValue::from_json(processed))
388        })
389        .collect()
390}
391
392/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
393///
394/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
395/// are treated as identical field names, preventing subtle bugs where visually
396/// identical keys are treated as different.
397///
398/// # Examples
399///
400/// ```
401/// use quillmark_core::normalize::normalize_field_name;
402///
403/// // Composed form (single code point for é)
404/// let composed = "café";
405/// // Decomposed form (e + combining acute accent)
406/// let decomposed = "cafe\u{0301}";
407///
408/// // Both normalize to the same NFC form
409/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
410/// ```
411pub fn normalize_field_name(name: &str) -> String {
412    name.nfc().collect()
413}
414
415/// Normalizes a parsed document by applying all field-level normalizations.
416///
417/// This is the **primary entry point** for normalizing documents after parsing.
418/// It ensures consistent processing regardless of how the document was created.
419///
420/// # Normalization Steps
421///
422/// This function applies all normalizations in the correct order:
423/// 1. **Unicode NFC normalization** - Field names are normalized to NFC form
424/// 2. **Bidi stripping** - Invisible bidirectional control characters are removed
425/// 3. **HTML comment fence fixing** - Trailing text after `-->` is preserved (body only)
426///
427/// Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
428///
429/// # When to Use
430///
431/// Call this function after parsing and before rendering:
432///
433/// ```no_run
434/// use quillmark_core::{ParsedDocument, normalize::normalize_document};
435///
436/// let markdown = "---\ntitle: Example\n---\n\nBody with <<placeholder>>";
437/// let doc = ParsedDocument::from_markdown(markdown).unwrap();
438/// let normalized = normalize_document(doc);
439/// // Use normalized document for rendering...
440/// ```
441///
442/// # Direct API Usage
443///
444/// If you're constructing a `ParsedDocument` directly via [`crate::parse::ParsedDocument::new`]
445/// rather than parsing from markdown, you **MUST** call this function to ensure
446/// consistent normalization:
447///
448/// ```
449/// use quillmark_core::{ParsedDocument, QuillValue, normalize::normalize_document};
450/// use std::collections::HashMap;
451///
452/// // Direct construction (e.g., from API or database)
453/// let mut fields = HashMap::new();
454/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("Test")));
455/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("<<content>>")));
456///
457/// let doc = ParsedDocument::new(fields);
458/// let normalized = normalize_document(doc).expect("Failed to normalize document");
459///
460/// // Body has chevrons preserved
461/// assert_eq!(normalized.body().unwrap(), "<<content>>");
462/// ```
463///
464/// # Idempotency
465///
466/// This function is idempotent - calling it multiple times produces the same result.
467/// However, for performance reasons, avoid unnecessary repeated calls.
468pub fn normalize_document(
469    doc: crate::parse::ParsedDocument,
470) -> Result<crate::parse::ParsedDocument, crate::error::ParseError> {
471    let normalized_fields = normalize_fields(doc.fields().clone());
472    Ok(crate::parse::ParsedDocument::with_quill_ref(
473        normalized_fields,
474        doc.quill_reference().clone(),
475    ))
476}
477
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a field map from `(name, json)` pairs so each test states only its data.
    fn fields_of(entries: Vec<(&str, serde_json::Value)>) -> HashMap<String, QuillValue> {
        entries
            .into_iter()
            .map(|(name, json)| (name.to_string(), QuillValue::from_json(json)))
            .collect()
    }

    /// Wraps a string leaf in `levels` nested arrays, placing the leaf at depth `levels`.
    fn nested_arrays(levels: usize) -> serde_json::Value {
        let mut value = serde_json::json!("leaf\u{202D}");
        for _ in 0..levels {
            value = serde_json::json!([value]);
        }
        value
    }

    // Tests for strip_bidi_formatting

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        // U+061C ARABIC LETTER MARK (ALM) should be stripped
        assert_eq!(strip_bidi_formatting("hello\u{061C}world"), "helloworld");
        assert_eq!(strip_bidi_formatting("\u{061C}**bold**"), "**bold**");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    #[test]
    fn test_strip_bidi_multibyte_around_stripped_char() {
        // Multi-byte text on both sides of a stripped character stays intact
        assert_eq!(strip_bidi_formatting("你好\u{200F}世界🎉"), "你好世界🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    #[test]
    fn test_normalize_markdown_bidi_inside_fence() {
        // Bidi stripping runs before fence fixing, so a fence split by an
        // invisible character is still recognized.
        assert_eq!(
            normalize_markdown("<!-- comment --\u{202D}>Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // Tests for fix_html_comment_fences

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text on same line as --> should be moved to next line
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // Already has newline after --> - no change
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Only whitespace after --> until newline - no change needed
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->   \nSome text"),
            "<!-- comment -->   \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment with text on closing line
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment with proper newline after -->
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Multiple comments in the same document
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // Comment at end of string - no trailing content
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Just a comment with nothing after
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // --> that's not part of a comment (standalone)
        // Should NOT be touched by the context-aware fixer
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // Nested openers are just text inside the comment
        // <!-- <!-- -->Trailing
        // The first <!-- opens, the first --> closes.
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer without opener
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_unclosed_opener() {
        // An opener that is never closed leaves the remaining text untouched
        assert_eq!(
            fix_html_comment_fences("--> <!-- never closed"),
            "--> <!-- never closed"
        );
        assert_eq!(
            fix_html_comment_fences("<!-- a -->x <!-- never closed"),
            "<!-- a -->\nx <!-- never closed"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Mixed valid and invalid comments
        // <!-- valid -->FixMe
        // text --> Ignore
        // <!-- valid2 -->FixMe2
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF line endings
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_crlf_trailing_whitespace() {
        // Whitespace followed by CRLF still counts as "nothing after the fence"
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->  \r\nSome text"),
            "<!-- comment -->  \r\nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_idempotent() {
        let once = fix_html_comment_fences("<!-- a -->Text\n<!-- b -->More");
        let twice = fix_html_comment_fences(&once);
        assert_eq!(once, twice);
    }

    // Tests for normalize_field_name

    #[test]
    fn test_normalize_field_name_nfc() {
        // Decomposed input is composed; already-composed input is unchanged
        assert_eq!(normalize_field_name("cafe\u{0301}"), "caf\u{00E9}");
        assert_eq!(normalize_field_name("caf\u{00E9}"), "caf\u{00E9}");
        assert_eq!(normalize_field_name("title"), "title");
        assert_eq!(normalize_field_name(""), "");
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let fields = fields_of(vec![(
            BODY_FIELD,
            serde_json::json!("**bold** \u{202D}**more**"),
        )]);

        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_fields_body_chevrons_preserved() {
        let fields = fields_of(vec![(BODY_FIELD, serde_json::json!("<<raw>>"))]);

        let result = normalize_fields(fields);
        // Chevrons are passed through unchanged
        assert_eq!(result.get(BODY_FIELD).unwrap().as_str().unwrap(), "<<raw>>");
    }

    #[test]
    fn test_normalize_fields_body_chevrons_and_bidi() {
        let fields = fields_of(vec![(
            BODY_FIELD,
            serde_json::json!("<<raw>> \u{202D}**bold**"),
        )]);

        let result = normalize_fields(fields);
        // Bidi stripped, chevrons preserved
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<raw>> **bold**"
        );
    }

    #[test]
    fn test_normalize_fields_comment_fence_fixed_in_body_only() {
        let fields = fields_of(vec![
            (BODY_FIELD, serde_json::json!("<!-- note -->Body text")),
            ("title", serde_json::json!("<!-- note -->Title text")),
        ]);

        let result = normalize_fields(fields);
        // The body gets the fence fix ...
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<!-- note -->\nBody text"
        );
        // ... other fields are left as they are
        assert_eq!(
            result.get("title").unwrap().as_str().unwrap(),
            "<!-- note -->Title text"
        );
    }

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let fields = fields_of(vec![("title", serde_json::json!("<<hello>>"))]);

        let result = normalize_fields(fields);
        // Chevrons are passed through unchanged
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let fields = fields_of(vec![("title", serde_json::json!("he\u{202D}llo"))]);

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_key_normalized_to_nfc() {
        // Key given in decomposed form must be reachable under its composed form
        let fields = fields_of(vec![("cafe\u{0301}", serde_json::json!("value"))]);

        let result = normalize_fields(fields);
        assert_eq!(result.len(), 1);
        assert_eq!(
            result.get("caf\u{00E9}").unwrap().as_str().unwrap(),
            "value"
        );
    }

    #[test]
    fn test_normalize_fields_nested_values() {
        let fields = fields_of(vec![(
            "items",
            serde_json::json!(["<<a>>", "\u{202D}b"]),
        )]);

        let result = normalize_fields(fields);
        let items = result.get("items").unwrap().as_array().unwrap();
        // Chevrons are preserved, bidi stripped
        assert_eq!(items[0].as_str().unwrap(), "<<a>>");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    #[test]
    fn test_normalize_fields_object_values() {
        let fields = fields_of(vec![(
            "meta",
            serde_json::json!({
                "title": "<<hello>>",
                BODY_FIELD: "<<content>>"
            }),
        )]);

        let result = normalize_fields(fields);
        let meta = result.get("meta").unwrap();
        let meta_obj = meta.as_object().unwrap();
        // Chevrons are preserved in all fields
        assert_eq!(
            meta_obj.get("title").unwrap().as_str().unwrap(),
            "<<hello>>"
        );
        assert_eq!(
            meta_obj.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<content>>"
        );
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let fields = fields_of(vec![
            ("count", serde_json::json!(42)),
            ("enabled", serde_json::json!(true)),
        ]);

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // One level more than allowed: the leaf sits at depth MAX_NESTING_DEPTH + 1
        let value = nested_arrays(MAX_NESTING_DEPTH + 1);

        // The inner function should return an error
        let result = normalize_json_value_inner(value, false, 0);

        match result {
            Err(NormalizationError::NestingTooDeep { depth, max }) => {
                assert!(depth > max);
                assert_eq!(max, MAX_NESTING_DEPTH);
            }
            Ok(_) => panic!("Expected NestingTooDeep error"),
        }
    }

    #[test]
    fn test_normalize_json_value_inner_at_limit() {
        // Exactly at the limit: the leaf sits at depth MAX_NESTING_DEPTH, which is still
        // accepted because only depths greater than the maximum are rejected
        let value = nested_arrays(MAX_NESTING_DEPTH);

        let result = normalize_json_value_inner(value, false, 0);
        assert!(result.is_ok());
    }

    #[test]
    fn test_normalize_json_value_too_deep_returns_original() {
        // The wrapper must hand back the untouched input (bidi character included)
        // instead of failing when the depth limit is exceeded
        let value = nested_arrays(MAX_NESTING_DEPTH + 1);

        let result = normalize_json_value(value.clone(), false);
        assert_eq!(result, value);
    }

    #[test]
    fn test_normalize_json_value_within_limit_is_normalized() {
        let result = normalize_json_value(serde_json::json!(["a\u{202D}b", 1, null]), false);
        assert_eq!(result, serde_json::json!(["ab", 1, null]));
    }

    // Tests for normalize_document

    #[test]
    fn test_normalize_document_basic() {
        use crate::parse::ParsedDocument;

        let fields = fields_of(vec![
            ("title", serde_json::json!("<<placeholder>>")),
            (BODY_FIELD, serde_json::json!("<<content>> \u{202D}**bold**")),
        ]);

        let doc = ParsedDocument::new(fields);
        let normalized = normalize_document(doc).unwrap();

        // Title has chevrons preserved (only bidi stripped)
        assert_eq!(
            normalized.get_field("title").unwrap().as_str().unwrap(),
            "<<placeholder>>"
        );

        // Body has bidi stripped, chevrons preserved
        assert_eq!(normalized.body().unwrap(), "<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        use crate::parse::ParsedDocument;
        use crate::version::QuillReference;
        use std::str::FromStr;

        let fields = HashMap::new();
        let quill_ref = QuillReference::from_str("custom_quill").unwrap();
        let doc = ParsedDocument::with_quill_ref(fields, quill_ref);
        let normalized = normalize_document(doc).unwrap();

        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        use crate::parse::ParsedDocument;

        // Input that the first pass actually changes (bidi stripped, newline inserted),
        // so a second pass re-running those changes would be noticed
        let fields = fields_of(vec![(
            BODY_FIELD,
            serde_json::json!("<!-- note -->Text \u{202D}**bold** <<content>>"),
        )]);

        let doc = ParsedDocument::new(fields);
        let normalized_once = normalize_document(doc).unwrap();
        let normalized_twice = normalize_document(normalized_once.clone()).unwrap();

        assert_eq!(
            normalized_once.body().unwrap(),
            "<!-- note -->\nText **bold** <<content>>"
        );
        // Calling normalize_document twice should produce the same result
        assert_eq!(
            normalized_once.body().unwrap(),
            normalized_twice.body().unwrap()
        );
    }
}
quillmark_core/normalize.rs

quillmark_core/
normalize.rs