quillmark_core/
normalize.rs

1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Fix HTML comment fences to preserve trailing text
14//! - Apply all normalizations in the correct order
15//!
16//! Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
17//!
18//! ## Functions
19//!
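//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
//! - [`fix_html_comment_fences`] - Move text that trails `-->` onto its own line
//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
//! - [`normalize_fields`] - Normalize document fields (NFC field names, bidi stripping, body comment fences)
//! - [`normalize_field_name`] - Normalize a field name to Unicode NFC
//! - [`normalize_document`] - Normalize every field of a parsed document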
23//!
24//! ## Why Normalize?
25//!
26//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
27//! control characters used for bidirectional text layout. When placed adjacent to markdown
28//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
29//!
30//! ```text
31//! **bold** or <U+202D>**(1234**
32//!             ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
33//! ```
34//!
35//! These characters commonly appear when copying text from:
36//! - Web pages with mixed LTR/RTL content
37//! - PDF documents
38//! - Word processors
39//! - Some clipboard managers
40//!
41//! ## Examples
42//!
43//! ```
44//! use quillmark_core::normalize::strip_bidi_formatting;
45//!
46//! // Input with invisible U+202D (LRO) before second **
47//! let input = "**asdf** or \u{202D}**(1234**";
48//! let cleaned = strip_bidi_formatting(input);
49//! assert_eq!(cleaned, "**asdf** or **(1234**");
50//! ```
51
52use crate::error::MAX_NESTING_DEPTH;
53use crate::parse::BODY_FIELD;
54use crate::value::QuillValue;
55use std::collections::HashMap;
56use unicode_normalization::UnicodeNormalization;
57
58/// Errors that can occur during normalization
59#[derive(Debug, thiserror::Error)]
60pub enum NormalizationError {
61    /// JSON nesting depth exceeded maximum allowed
62    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
63    NestingTooDeep {
64        /// Actual depth
65        depth: usize,
66        /// Maximum allowed depth
67        max: usize,
68    },
69}
70
/// Returns `true` when `c` is one of the Unicode bidirectional formatting
/// controls handled by this module.
///
/// The eleven code points form three contiguous runs, so they are matched as ranges.
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{200E}'..='\u{200F}' // LRM, RLM (directional marks)
        | '\u{202A}'..='\u{202E}' // LRE, RLE, PDF, LRO, RLO (embeddings and overrides)
        | '\u{2066}'..='\u{2069}' // LRI, RLI, FSI, PDI (isolates)
    )
}
89
90/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
91///
92/// These invisible control characters are used for bidirectional text layout but can
93/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
94///
95/// # Characters Stripped
96///
97/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
98/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
99/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
100/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
101/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
102/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
103/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
104/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
105/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
106/// - U+2068 (FIRST STRONG ISOLATE, FSI)
107/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
108///
109/// # Examples
110///
111/// ```
112/// use quillmark_core::normalize::strip_bidi_formatting;
113///
114/// // Normal text is unchanged
115/// assert_eq!(strip_bidi_formatting("hello"), "hello");
116///
117/// // LRO character is stripped
118/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
119///
120/// // All bidi characters are stripped
121/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
122/// assert_eq!(strip_bidi_formatting(input), "");
123/// ```
124pub fn strip_bidi_formatting(s: &str) -> String {
125    // Early return optimization: avoid allocation if no bidi characters present
126    if !s.chars().any(is_bidi_char) {
127        return s.to_string();
128    }
129
130    s.chars().filter(|c| !is_bidi_char(*c)).collect()
131}
132
/// Moves text that trails an HTML comment's closing fence onto its own line.
///
/// Per CommonMark, an HTML block of type 2 (a comment) ends with the line that
/// contains `-->`, so anything after `-->` on that line belongs to the HTML block
/// and is dropped by renderers that ignore HTML blocks. To keep that text, a
/// newline is inserted directly after `-->` whenever the remainder of the line
/// holds something other than whitespace.
///
/// Only a `-->` that follows a `<!--` opener is considered; a stray `-->` and an
/// opener that is never closed are left exactly as they are.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->   \nSome text"),
///     "<!-- comment -->   \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    /// Byte offset just past the `-->` that closes the first `<!--` in `text`,
    /// or `None` when there is no opener or the opener is never closed.
    fn end_of_first_comment(text: &str) -> Option<usize> {
        let opener = text.find("<!--")?;
        // The closer is searched from the opener onwards, so a `-->` that
        // appears before any opener is never treated as a fence.
        let closer = text[opener..].find("-->")?;
        Some(opener + closer + "-->".len())
    }

    let mut fixed = String::with_capacity(s.len() + 16);
    let mut remaining = s;

    while let Some(fence_end) = end_of_first_comment(remaining) {
        let (through_fence, tail) = remaining.split_at(fence_end);
        fixed.push_str(through_fence);

        // Whatever shares the line with the fence. A lone `\r` (from CRLF) or
        // other whitespace does not count as content worth preserving.
        let rest_of_line = tail.split('\n').next().unwrap_or("");
        if !rest_of_line.trim().is_empty() {
            fixed.push('\n');
        }

        remaining = tail;
    }

    // Text after the last closed comment, an unclosed comment, or the whole
    // input when it contains no comment at all.
    fixed.push_str(remaining);
    fixed
}
233
234/// Normalizes markdown content by applying all preprocessing steps.
235///
236/// This function applies normalizations in the correct order:
237/// 1. Strip Unicode bidirectional formatting characters
238/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
239///
240/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
241/// in [`normalize_fields`] because it needs to be applied after schema defaults
242/// and coercion.
243///
244/// # Examples
245///
246/// ```
247/// use quillmark_core::normalize::normalize_markdown;
248///
249/// // Bidi characters are stripped
250/// let input = "**bold** \u{202D}**more**";
251/// let normalized = normalize_markdown(input);
252/// assert_eq!(normalized, "**bold** **more**");
253///
254/// // HTML comment trailing text is preserved
255/// let with_comment = "<!-- comment -->Some text";
256/// let normalized = normalize_markdown(with_comment);
257/// assert_eq!(normalized, "<!-- comment -->\nSome text");
258/// ```
259pub fn normalize_markdown(markdown: &str) -> String {
260    let cleaned = strip_bidi_formatting(markdown);
261    fix_html_comment_fences(&cleaned)
262}
263
264/// Normalizes a string value by stripping bidi characters and fixing HTML comment fences.
265///
266/// - For body content: applies `fix_html_comment_fences` to preserve text after `-->`
267/// - For other fields: strips bidi characters only
268///
269/// Double chevrons (`<<` and `>>`) are passed through untouched without conversion to
270/// guillemets. This preserves the original delimiter syntax in the output.
271fn normalize_string(s: &str, is_body: bool) -> String {
272    // First strip bidi formatting characters
273    let cleaned = strip_bidi_formatting(s);
274
275    // Then apply content-specific normalization
276    if is_body {
277        // Fix HTML comment fences (chevrons pass through unchanged)
278        fix_html_comment_fences(&cleaned)
279    } else {
280        // Non-body fields: just return cleaned string (chevrons pass through unchanged)
281        cleaned
282    }
283}
284
285/// Recursively normalize a JSON value with depth tracking.
286///
287/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
288fn normalize_json_value_inner(
289    value: serde_json::Value,
290    is_body: bool,
291    depth: usize,
292) -> Result<serde_json::Value, NormalizationError> {
293    if depth > MAX_NESTING_DEPTH {
294        return Err(NormalizationError::NestingTooDeep {
295            depth,
296            max: MAX_NESTING_DEPTH,
297        });
298    }
299
300    match value {
301        serde_json::Value::String(s) => {
302            Ok(serde_json::Value::String(normalize_string(&s, is_body)))
303        }
304        serde_json::Value::Array(arr) => {
305            let normalized: Result<Vec<_>, _> = arr
306                .into_iter()
307                .map(|v| normalize_json_value_inner(v, false, depth + 1))
308                .collect();
309            Ok(serde_json::Value::Array(normalized?))
310        }
311        serde_json::Value::Object(map) => {
312            let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
313                .into_iter()
314                .map(|(k, v)| {
315                    let is_body = k == BODY_FIELD;
316                    normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
317                })
318                .collect();
319            Ok(serde_json::Value::Object(processed?))
320        }
321        // Pass through other types unchanged (numbers, booleans, null)
322        other => Ok(other),
323    }
324}
325
326/// Recursively normalize a JSON value.
327///
328/// This is a convenience wrapper that starts depth tracking at 0.
329/// Logs a warning and returns the original value if depth is exceeded.
330fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
331    match normalize_json_value_inner(value.clone(), is_body, 0) {
332        Ok(normalized) => normalized,
333        Err(e) => {
334            // Log warning but don't fail - return original value
335            eprintln!("Warning: {}", e);
336            value
337        }
338    }
339}
340
341/// Normalizes document fields by applying all preprocessing steps.
342///
343/// This function orchestrates input normalization for document fields:
344/// 1. Strips Unicode bidirectional formatting characters from all string values
345/// 2. For the body field: fixes HTML comment fences to preserve trailing text
346///
347/// Double chevrons (`<<` and `>>`) are passed through unchanged in all fields.
348///
349/// # Processing Order
350///
351/// The normalization order is important:
352/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
353/// 2. **HTML comment fence fixing** - Ensures text after `-->` is preserved
354///
355/// # Examples
356///
357/// ```
358/// use quillmark_core::normalize::normalize_fields;
359/// use quillmark_core::QuillValue;
360/// use std::collections::HashMap;
361///
362/// let mut fields = HashMap::new();
363/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
364/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
365///
366/// let result = normalize_fields(fields);
367///
368/// // Title has chevrons preserved (only bidi stripped)
369/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
370///
371/// // Body has bidi chars stripped, chevrons preserved
372/// assert_eq!(result.get("BODY").unwrap().as_str().unwrap(), "**bold** **more**");
373/// ```
374pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
375    fields
376        .into_iter()
377        .map(|(key, value)| {
378            // Normalize field name to NFC form for consistent key comparison
379            // This ensures café (composed) and café (decomposed) are treated as the same key
380            let normalized_key = normalize_field_name(&key);
381            let json = value.into_json();
382            // Treat as body if it's the BODY field (applies HTML comment fence fixes)
383            let treat_as_body = normalized_key == BODY_FIELD;
384            let processed = normalize_json_value(json, treat_as_body);
385            (normalized_key, QuillValue::from_json(processed))
386        })
387        .collect()
388}
389
390/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
391///
392/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
393/// are treated as identical field names, preventing subtle bugs where visually
394/// identical keys are treated as different.
395///
396/// # Examples
397///
398/// ```
399/// use quillmark_core::normalize::normalize_field_name;
400///
401/// // Composed form (single code point for é)
402/// let composed = "café";
403/// // Decomposed form (e + combining acute accent)
404/// let decomposed = "cafe\u{0301}";
405///
406/// // Both normalize to the same NFC form
407/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
408/// ```
409pub fn normalize_field_name(name: &str) -> String {
410    name.nfc().collect()
411}
412
413/// Normalizes a parsed document by applying all field-level normalizations.
414///
415/// This is the **primary entry point** for normalizing documents after parsing.
416/// It ensures consistent processing regardless of how the document was created.
417///
418/// # Normalization Steps
419///
420/// This function applies all normalizations in the correct order:
421/// 1. **Unicode NFC normalization** - Field names are normalized to NFC form
422/// 2. **Bidi stripping** - Invisible bidirectional control characters are removed
423/// 3. **HTML comment fence fixing** - Trailing text after `-->` is preserved (body only)
424///
425/// Double chevrons (`<<` and `>>`) are passed through unchanged without conversion.
426///
427/// # When to Use
428///
429/// Call this function after parsing and before rendering:
430///
431/// ```no_run
432/// use quillmark_core::{ParsedDocument, normalize::normalize_document};
433///
434/// let markdown = "---\ntitle: Example\n---\n\nBody with <<placeholder>>";
435/// let doc = ParsedDocument::from_markdown(markdown).unwrap();
436/// let normalized = normalize_document(doc);
437/// // Use normalized document for rendering...
438/// ```
439///
440/// # Direct API Usage
441///
442/// If you're constructing a `ParsedDocument` directly via [`crate::parse::ParsedDocument::new`]
443/// rather than parsing from markdown, you **MUST** call this function to ensure
444/// consistent normalization:
445///
446/// ```
447/// use quillmark_core::{ParsedDocument, QuillValue, normalize::normalize_document};
448/// use std::collections::HashMap;
449///
450/// // Direct construction (e.g., from API or database)
451/// let mut fields = HashMap::new();
452/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("Test")));
453/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("<<content>>")));
454///
455/// let doc = ParsedDocument::new(fields);
456/// let normalized = normalize_document(doc).expect("Failed to normalize document");
457///
458/// // Body has chevrons preserved
459/// assert_eq!(normalized.body().unwrap(), "<<content>>");
460/// ```
461///
462/// # Idempotency
463///
464/// This function is idempotent - calling it multiple times produces the same result.
465/// However, for performance reasons, avoid unnecessary repeated calls.
466pub fn normalize_document(
467    doc: crate::parse::ParsedDocument,
468) -> Result<crate::parse::ParsedDocument, crate::error::ParseError> {
469    let normalized_fields = normalize_fields(doc.fields().clone());
470    Ok(crate::parse::ParsedDocument::with_quill_ref(
471        normalized_fields,
472        doc.quill_reference().clone(),
473    ))
474}
475
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a field map that holds exactly one entry.
    fn single_field(name: &str, value: serde_json::Value) -> HashMap<String, QuillValue> {
        let mut fields = HashMap::new();
        fields.insert(name.to_string(), QuillValue::from_json(value));
        fields
    }

    /// Wraps the string `leaf` in `levels` nested single-element arrays, so the
    /// leaf ends up at depth `levels` when traversal starts at depth 0.
    fn nested_arrays(leaf: &str, levels: usize) -> serde_json::Value {
        (0..levels).fold(serde_json::Value::String(leaf.to_string()), |inner, _| {
            serde_json::Value::Array(vec![inner])
        })
    }

    // Tests for strip_bidi_formatting

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // Tests for fix_html_comment_fences

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text on same line as --> should be moved to next line
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // Already has newline after --> - no change
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Only whitespace after --> until newline - no change needed
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->   \nSome text"),
            "<!-- comment -->   \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment with text on closing line
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment with proper newline after -->
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Multiple comments in the same document
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // Comment at end of string - no trailing content
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Just a comment with nothing after
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // --> that's not part of a comment (standalone)
        // Should NOT be touched by the context-aware fixer
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // Nested openers are just text inside the comment
        // <!-- <!-- -->Trailing
        // The first <!-- opens, the first --> closes.
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer without opener
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Mixed valid and invalid comments
        // <!-- valid -->FixMe
        // text --> Ignore
        // <!-- valid2 -->FixMe2
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF line endings
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_crlf_trailing_whitespace() {
        // Whitespace followed by CRLF after --> still counts as "nothing trailing"
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->  \r\nSome text"),
            "<!-- comment -->  \r\nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_unclosed() {
        // An opener that is never closed leaves the input untouched,
        // even when a stray closer precedes it
        assert_eq!(
            fix_html_comment_fences("<!-- never closed\ntext"),
            "<!-- never closed\ntext"
        );
        assert_eq!(
            fix_html_comment_fences("--> before <!-- unclosed"),
            "--> before <!-- unclosed"
        );
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let fields = single_field(BODY_FIELD, serde_json::json!("**bold** \u{202D}**more**"));

        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_fields_body_chevrons_preserved() {
        let fields = single_field(BODY_FIELD, serde_json::json!("<<raw>>"));

        let result = normalize_fields(fields);
        // Chevrons are passed through unchanged
        assert_eq!(result.get(BODY_FIELD).unwrap().as_str().unwrap(), "<<raw>>");
    }

    #[test]
    fn test_normalize_fields_body_chevrons_and_bidi() {
        let fields = single_field(BODY_FIELD, serde_json::json!("<<raw>> \u{202D}**bold**"));

        let result = normalize_fields(fields);
        // Bidi stripped, chevrons preserved
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<raw>> **bold**"
        );
    }

    #[test]
    fn test_normalize_fields_body_html_comment() {
        let fields = single_field(BODY_FIELD, serde_json::json!("<!-- note -->Text"));

        let result = normalize_fields(fields);
        // The body field gets the comment fence fix
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<!-- note -->\nText"
        );
    }

    #[test]
    fn test_normalize_fields_other_field_html_comment_untouched() {
        let fields = single_field("title", serde_json::json!("<!-- note -->Text"));

        let result = normalize_fields(fields);
        // Non-body fields only get bidi stripping
        assert_eq!(
            result.get("title").unwrap().as_str().unwrap(),
            "<!-- note -->Text"
        );
    }

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let fields = single_field("title", serde_json::json!("<<hello>>"));

        let result = normalize_fields(fields);
        // Chevrons are passed through unchanged
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let fields = single_field("title", serde_json::json!("he\u{202D}llo"));

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_key_nfc() {
        // Decomposed key (e + combining acute) is stored under its composed form
        let fields = single_field("cafe\u{0301}", serde_json::json!("value"));

        let result = normalize_fields(fields);
        assert!(result.get("cafe\u{0301}").is_none());
        assert_eq!(
            result.get("caf\u{00E9}").unwrap().as_str().unwrap(),
            "value"
        );
    }

    #[test]
    fn test_normalize_fields_nested_values() {
        let fields = single_field("items", serde_json::json!(["<<a>>", "\u{202D}b"]));

        let result = normalize_fields(fields);
        let items = result.get("items").unwrap().as_array().unwrap();
        // Chevrons are preserved, bidi stripped
        assert_eq!(items[0].as_str().unwrap(), "<<a>>");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    #[test]
    fn test_normalize_fields_object_values() {
        let fields = single_field(
            "meta",
            serde_json::json!({
                "title": "<<hello>>",
                BODY_FIELD: "<<content>>"
            }),
        );

        let result = normalize_fields(fields);
        let meta = result.get("meta").unwrap();
        let meta_obj = meta.as_object().unwrap();
        // Chevrons are preserved in all fields
        assert_eq!(
            meta_obj.get("title").unwrap().as_str().unwrap(),
            "<<hello>>"
        );
        assert_eq!(
            meta_obj.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<content>>"
        );
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = single_field("count", serde_json::json!(42));
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // One level more than allowed: the leaf sits at MAX_NESTING_DEPTH + 1
        let value = nested_arrays("leaf", crate::error::MAX_NESTING_DEPTH + 1);

        // The inner function should return an error
        match super::normalize_json_value_inner(value, false, 0) {
            Err(NormalizationError::NestingTooDeep { depth, max }) => {
                assert!(depth > max);
                assert_eq!(max, crate::error::MAX_NESTING_DEPTH);
            }
            Ok(_) => panic!("Expected NestingTooDeep error"),
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        // Exactly at the limit: the leaf sits at depth MAX_NESTING_DEPTH
        let value = nested_arrays("leaf", crate::error::MAX_NESTING_DEPTH);

        // This should succeed
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_ok());
    }

    #[test]
    fn test_normalize_json_value_too_deep_returns_original() {
        // The wrapper must hand back the input untouched (bidi character included)
        // instead of failing when the nesting limit is exceeded
        let value = nested_arrays("he\u{202D}llo", crate::error::MAX_NESTING_DEPTH + 1);

        let result = super::normalize_json_value(value.clone(), false);
        assert_eq!(result, value);
    }

    // Tests for normalize_document

    #[test]
    fn test_normalize_document_basic() {
        use crate::parse::ParsedDocument;

        let mut fields = single_field("title", serde_json::json!("<<placeholder>>"));
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<content>> \u{202D}**bold**")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized = super::normalize_document(doc).unwrap();

        // Title has chevrons preserved (only bidi stripped)
        assert_eq!(
            normalized.get_field("title").unwrap().as_str().unwrap(),
            "<<placeholder>>"
        );

        // Body has bidi stripped, chevrons preserved
        assert_eq!(normalized.body().unwrap(), "<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        use crate::parse::ParsedDocument;
        use crate::version::QuillReference;
        use std::str::FromStr;

        let fields = std::collections::HashMap::new();
        let quill_ref = QuillReference::from_str("custom_quill").unwrap();
        let doc = ParsedDocument::with_quill_ref(fields, quill_ref);
        let normalized = super::normalize_document(doc).unwrap();

        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        use crate::parse::ParsedDocument;

        // Input that the first pass actually changes (bidi character and comment fence)
        let fields = single_field(
            BODY_FIELD,
            serde_json::json!("<!-- note -->Text \u{202D}**bold**"),
        );

        let doc = ParsedDocument::new(fields);
        let normalized_once = super::normalize_document(doc).unwrap();
        let normalized_twice = super::normalize_document(normalized_once.clone()).unwrap();

        assert_eq!(
            normalized_once.body().unwrap(),
            "<!-- note -->\nText **bold**"
        );

        // Calling normalize_document twice should produce the same result
        assert_eq!(
            normalized_once.body().unwrap(),
            normalized_twice.body().unwrap()
        );
    }
}
quillmark_core/normalize.rs

quillmark_core/
normalize.rs