quillmark_core/normalize.rs
1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Orchestrate guillemet preprocessing (`<<text>>` → `«text»`)
14//! - Apply all normalizations in the correct order
15//!
16//! ## Functions
17//!
18//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
19//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
20//! - [`normalize_fields`] - Normalize document fields (bidi + guillemets)
21//!
22//! ## Why Normalize?
23//!
24//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
25//! control characters used for bidirectional text layout. When placed adjacent to markdown
26//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
27//!
28//! ```text
29//! **bold** or <U+202D>**(1234**
30//! ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
31//! ```
32//!
33//! These characters commonly appear when copying text from:
34//! - Web pages with mixed LTR/RTL content
35//! - PDF documents
36//! - Word processors
37//! - Some clipboard managers
38//!
39//! ## Examples
40//!
41//! ```
42//! use quillmark_core::normalize::strip_bidi_formatting;
43//!
44//! // Input with invisible U+202D (LRO) before second **
45//! let input = "**asdf** or \u{202D}**(1234**";
46//! let cleaned = strip_bidi_formatting(input);
47//! assert_eq!(cleaned, "**asdf** or **(1234**");
48//! ```
49
50use crate::error::MAX_NESTING_DEPTH;
51use crate::guillemet::{preprocess_markdown_guillemets, strip_chevrons};
52use crate::parse::BODY_FIELD;
53use crate::value::QuillValue;
54use std::collections::HashMap;
55use unicode_normalization::UnicodeNormalization;
56
/// Errors that can occur during normalization.
///
/// Derives `thiserror::Error`, so each variant carries its own user-facing
/// message via the `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum NormalizationError {
    /// JSON nesting depth exceeded maximum allowed.
    ///
    /// Raised by the depth-tracked recursion in this module to guard against
    /// stack overflow on adversarially deep input.
    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
    NestingTooDeep {
        /// Actual depth at which the limit was detected
        depth: usize,
        /// Maximum allowed depth (`MAX_NESTING_DEPTH`)
        max: usize,
    },
}
69
/// Check if a character is a Unicode bidirectional formatting character.
///
/// The eleven bidi controls fall into three contiguous code-point ranges:
/// - U+200E..=U+200F — LRM, RLM (directional marks)
/// - U+202A..=U+202E — LRE, RLE, PDF, LRO, RLO (embeddings/overrides)
/// - U+2066..=U+2069 — LRI, RLI, FSI, PDI (isolates)
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{200E}'..='\u{200F}' | '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}'
    )
}
88
89/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
90///
91/// These invisible control characters are used for bidirectional text layout but can
92/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
93///
94/// # Characters Stripped
95///
96/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
97/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
98/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
99/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
100/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
101/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
102/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
103/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
104/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
105/// - U+2068 (FIRST STRONG ISOLATE, FSI)
106/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
107///
108/// # Examples
109///
110/// ```
111/// use quillmark_core::normalize::strip_bidi_formatting;
112///
113/// // Normal text is unchanged
114/// assert_eq!(strip_bidi_formatting("hello"), "hello");
115///
116/// // LRO character is stripped
117/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
118///
119/// // All bidi characters are stripped
120/// let input = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
121/// assert_eq!(strip_bidi_formatting(input), "");
122/// ```
123pub fn strip_bidi_formatting(s: &str) -> String {
124 // Early return optimization: avoid allocation if no bidi characters present
125 if !s.chars().any(is_bidi_char) {
126 return s.to_string();
127 }
128
129 s.chars().filter(|c| !is_bidi_char(*c)).collect()
130}
131
/// Fixes HTML comment closing fences to prevent content loss.
///
/// According to CommonMark, HTML block type 2 (comments) ends with the line containing `-->`.
/// This means any text on the same line after `-->` is included in the HTML block and would
/// be discarded by markdown parsers that ignore HTML blocks.
///
/// This function inserts a newline after `-->` when followed by non-whitespace content,
/// ensuring the trailing text is parsed as regular markdown. A `-->` without a preceding
/// `<!--` opener is left alone.
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::fix_html_comment_fences;
///
/// // Text on same line as --> is moved to next line
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->Some text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Already on separate line - no change
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment -->\nSome text"),
///     "<!-- comment -->\nSome text"
/// );
///
/// // Only whitespace after --> - no change needed
/// assert_eq!(
///     fix_html_comment_fences("<!-- comment --> \nSome text"),
///     "<!-- comment --> \nSome text"
/// );
///
/// // Multi-line comments with trailing text
/// assert_eq!(
///     fix_html_comment_fences("<!--\nmultiline\n-->Trailing text"),
///     "<!--\nmultiline\n-->\nTrailing text"
/// );
/// ```
pub fn fix_html_comment_fences(s: &str) -> String {
    // Cheap rejection: nothing to do when no closing fence exists at all.
    if !s.contains("-->") {
        return s.to_string();
    }

    // Context-aware scan: only a `-->` that closes a real `<!--` opener is fixed.
    let mut out = String::with_capacity(s.len() + 16);
    let mut cursor = 0;

    while let Some(rel_open) = s[cursor..].find("<!--") {
        let open_at = cursor + rel_open;

        // Locate the closer at or after the opener; an unclosed comment
        // swallows the rest of the input verbatim.
        let fence_end = match s[open_at..].find("-->") {
            Some(rel_close) => open_at + rel_close + 3,
            None => {
                out.push_str(&s[cursor..]);
                return out;
            }
        };

        // Copy everything through the closing fence, then decide whether the
        // remainder of that line carries real content.
        out.push_str(&s[cursor..fence_end]);
        cursor = fence_end;

        let rest = &s[cursor..];
        // First "line" after the fence: empty when rest is empty or starts
        // with '\n'; a lone '\r' from CRLF is removed by trim().
        let line_tail = rest.split('\n').next().unwrap_or("");
        if !line_tail.trim().is_empty() {
            out.push('\n');
        }
    }

    // Text after the last closed comment.
    out.push_str(&s[cursor..]);
    out
}
232
233/// Normalizes markdown content by applying all preprocessing steps.
234///
235/// This function applies normalizations in the correct order:
236/// 1. Strip Unicode bidirectional formatting characters
237/// 2. Fix HTML comment closing fences (ensure text after `-->` is preserved)
238///
239/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
240/// in [`normalize_fields`] because it needs to be applied after schema defaults
241/// and coercion.
242///
243/// # Examples
244///
245/// ```
246/// use quillmark_core::normalize::normalize_markdown;
247///
248/// // Bidi characters are stripped
249/// let input = "**bold** \u{202D}**more**";
250/// let normalized = normalize_markdown(input);
251/// assert_eq!(normalized, "**bold** **more**");
252///
253/// // HTML comment trailing text is preserved
254/// let with_comment = "<!-- comment -->Some text";
255/// let normalized = normalize_markdown(with_comment);
256/// assert_eq!(normalized, "<!-- comment -->\nSome text");
257/// ```
258pub fn normalize_markdown(markdown: &str) -> String {
259 let cleaned = strip_bidi_formatting(markdown);
260 fix_html_comment_fences(&cleaned)
261}
262
263/// Normalizes a string value by stripping bidi characters and optionally processing guillemets.
264///
265/// - For body content: applies `preprocess_markdown_guillemets` (converts `<<text>>` to `«text»`)
266/// and `fix_html_comment_fences` to preserve text after `-->`
267/// - For other fields: applies `strip_chevrons` (removes chevrons entirely)
268fn normalize_string(s: &str, is_body: bool) -> String {
269 // First strip bidi formatting characters
270 let cleaned = strip_bidi_formatting(s);
271
272 // Then apply content-specific normalization
273 if is_body {
274 // Fix HTML comment fences first, then convert guillemets
275 let fixed = fix_html_comment_fences(&cleaned);
276 preprocess_markdown_guillemets(&fixed)
277 } else {
278 strip_chevrons(&cleaned)
279 }
280}
281
282/// Recursively normalize a JSON value with depth tracking.
283///
284/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
285fn normalize_json_value_inner(
286 value: serde_json::Value,
287 is_body: bool,
288 depth: usize,
289) -> Result<serde_json::Value, NormalizationError> {
290 if depth > MAX_NESTING_DEPTH {
291 return Err(NormalizationError::NestingTooDeep {
292 depth,
293 max: MAX_NESTING_DEPTH,
294 });
295 }
296
297 match value {
298 serde_json::Value::String(s) => {
299 Ok(serde_json::Value::String(normalize_string(&s, is_body)))
300 }
301 serde_json::Value::Array(arr) => {
302 let normalized: Result<Vec<_>, _> = arr
303 .into_iter()
304 .map(|v| normalize_json_value_inner(v, false, depth + 1))
305 .collect();
306 Ok(serde_json::Value::Array(normalized?))
307 }
308 serde_json::Value::Object(map) => {
309 let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
310 .into_iter()
311 .map(|(k, v)| {
312 let is_body = k == BODY_FIELD;
313 normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
314 })
315 .collect();
316 Ok(serde_json::Value::Object(processed?))
317 }
318 // Pass through other types unchanged (numbers, booleans, null)
319 other => Ok(other),
320 }
321}
322
323/// Recursively normalize a JSON value.
324///
325/// This is a convenience wrapper that starts depth tracking at 0.
326/// Logs a warning and returns the original value if depth is exceeded.
327fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
328 match normalize_json_value_inner(value.clone(), is_body, 0) {
329 Ok(normalized) => normalized,
330 Err(e) => {
331 // Log warning but don't fail - return original value
332 eprintln!("Warning: {}", e);
333 value
334 }
335 }
336}
337
338/// Normalizes document fields by applying all preprocessing steps.
339///
340/// This function orchestrates input normalization for document fields:
341/// 1. Strips Unicode bidirectional formatting characters from all string values
342/// 2. For the body field: converts `<<text>>` to `«text»` (guillemets)
343/// 3. For other fields: strips chevrons entirely (`<<text>>` → `text`)
344///
345/// # Processing Order
346///
347/// The normalization order is important:
348/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
349/// 2. **Guillemet preprocessing** - Converts user syntax to internal markers
350///
351/// # Examples
352///
353/// ```
354/// use quillmark_core::normalize::normalize_fields;
355/// use quillmark_core::QuillValue;
356/// use std::collections::HashMap;
357///
358/// let mut fields = HashMap::new();
359/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
360/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
361///
362/// let result = normalize_fields(fields);
363///
364/// // Title has chevrons stripped
365/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
366///
367/// // Body has bidi chars stripped (guillemet would apply if there were any <<>>)
368/// assert_eq!(result.get("BODY").unwrap().as_str().unwrap(), "**bold** **more**");
369/// ```
370pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
371 fields
372 .into_iter()
373 .map(|(key, value)| {
374 // Normalize field name to NFC form for consistent key comparison
375 // This ensures café (composed) and café (decomposed) are treated as the same key
376 let normalized_key = normalize_field_name(&key);
377 let json = value.into_json();
378 let processed = normalize_json_value(json, normalized_key == BODY_FIELD);
379 (normalized_key, QuillValue::from_json(processed))
380 })
381 .collect()
382}
383
384/// Normalize field name to Unicode NFC (Canonical Decomposition, followed by Canonical Composition)
385///
386/// This ensures that equivalent Unicode strings (e.g., "café" composed vs decomposed)
387/// are treated as identical field names, preventing subtle bugs where visually
388/// identical keys are treated as different.
389///
390/// # Examples
391///
392/// ```
393/// use quillmark_core::normalize::normalize_field_name;
394///
395/// // Composed form (single code point for é)
396/// let composed = "café";
397/// // Decomposed form (e + combining acute accent)
398/// let decomposed = "cafe\u{0301}";
399///
400/// // Both normalize to the same NFC form
401/// assert_eq!(normalize_field_name(composed), normalize_field_name(decomposed));
402/// ```
403pub fn normalize_field_name(name: &str) -> String {
404 name.nfc().collect()
405}
406
407/// Normalizes a parsed document by applying all field-level normalizations.
408///
409/// This is the **primary entry point** for normalizing documents after parsing.
410/// It ensures consistent processing regardless of how the document was created.
411///
412/// # Normalization Steps
413///
414/// This function applies all normalizations in the correct order:
415/// 1. **Unicode NFC normalization** - Field names are normalized to NFC form
416/// 2. **Bidi stripping** - Invisible bidirectional control characters are removed
417/// 3. **HTML comment fence fixing** - Trailing text after `-->` is preserved
418/// 4. **Guillemet conversion** - `<<text>>` is converted to `«text»` in BODY fields
419/// 5. **Chevron stripping** - `<<text>>` is stripped to `text` in other fields
420///
421/// # When to Use
422///
423/// Call this function after parsing and before rendering:
424///
425/// ```no_run
426/// use quillmark_core::{ParsedDocument, normalize::normalize_document};
427///
428/// let markdown = "---\ntitle: Example\n---\n\nBody with <<placeholder>>";
429/// let doc = ParsedDocument::from_markdown(markdown).unwrap();
430/// let normalized = normalize_document(doc);
431/// // Use normalized document for rendering...
432/// ```
433///
434/// # Direct API Usage
435///
436/// If you're constructing a `ParsedDocument` directly via [`crate::parse::ParsedDocument::new`]
437/// rather than parsing from markdown, you **MUST** call this function to ensure
438/// consistent normalization:
439///
440/// ```
441/// use quillmark_core::{ParsedDocument, QuillValue, normalize::normalize_document};
442/// use std::collections::HashMap;
443///
444/// // Direct construction (e.g., from API or database)
445/// let mut fields = HashMap::new();
446/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("Test")));
447/// fields.insert("BODY".to_string(), QuillValue::from_json(serde_json::json!("<<content>>")));
448///
449/// let doc = ParsedDocument::new(fields);
450/// let normalized = normalize_document(doc);
451///
452/// // Body now has guillemets converted
453/// assert_eq!(normalized.body().unwrap(), "«content»");
454/// ```
455///
456/// # Idempotency
457///
458/// This function is idempotent - calling it multiple times produces the same result.
459/// However, for performance reasons, avoid unnecessary repeated calls.
460pub fn normalize_document(doc: crate::parse::ParsedDocument) -> crate::parse::ParsedDocument {
461 let normalized_fields = normalize_fields(doc.fields().clone());
462 crate::parse::ParsedDocument::with_quill_tag(normalized_fields, doc.quill_tag().to_string())
463}
464
#[cfg(test)]
mod tests {
    use super::*;

    // Tests for strip_bidi_formatting

    // Inputs without bidi controls must come back unchanged (fast path).
    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    // All eleven bidi controls in one string reduce to empty.
    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // Tests for fix_html_comment_fences

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        // Text on same line as --> should be moved to next line
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        // Already has newline after --> - no change
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        // Only whitespace after --> until newline - no change needed
        assert_eq!(
            fix_html_comment_fences("<!-- comment --> \nSome text"),
            "<!-- comment --> \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        // Multi-line comment with text on closing line
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        // Multi-line comment with proper newline after -->
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        // Multiple comments in the same document
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        // Comment at end of string - no trailing content
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        // Just a comment with nothing after
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        // --> that's not part of a comment (standalone)
        // Should NOT be touched by the context-aware fixer
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        // Nested openers are just text inside the comment
        // <!-- <!-- -->Trailing
        // The first <!-- opens, the first --> closes.
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        // Closer without opener
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        // Mixed valid and invalid comments
        // <!-- valid -->FixMe
        // text --> Ignore
        // <!-- valid2 -->FixMe2
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        // CRLF line endings
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")),
        );

        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "**bold** **more**"
        );
    }

    // Body fields convert chevrons to guillemets (not stripped).
    #[test]
    fn test_normalize_fields_body_guillemets() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>>")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get(BODY_FIELD).unwrap().as_str().unwrap(), "«raw»");
    }

    #[test]
    fn test_normalize_fields_body_both() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>> \u{202D}**bold**")),
        );

        let result = normalize_fields(fields);
        // Bidi stripped first, then guillemets converted
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "«raw» **bold**"
        );
    }

    // Non-body fields strip chevrons entirely.
    #[test]
    fn test_normalize_fields_other_field_chevrons_stripped() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("<<hello>>")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("he\u{202D}llo")),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    // Normalization recurses into arrays.
    #[test]
    fn test_normalize_fields_nested_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "items".to_string(),
            QuillValue::from_json(serde_json::json!(["<<a>>", "\u{202D}b"])),
        );

        let result = normalize_fields(fields);
        let items = result.get("items").unwrap().as_array().unwrap();
        assert_eq!(items[0].as_str().unwrap(), "a");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    // Normalization recurses into objects; a nested BODY key gets body treatment.
    #[test]
    fn test_normalize_fields_object_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "meta".to_string(),
            QuillValue::from_json(serde_json::json!({
                "title": "<<hello>>",
                BODY_FIELD: "<<content>>"
            })),
        );

        let result = normalize_fields(fields);
        let meta = result.get("meta").unwrap();
        let meta_obj = meta.as_object().unwrap();
        // Nested "BODY" key should be recognized
        assert_eq!(meta_obj.get("title").unwrap().as_str().unwrap(), "hello");
        assert_eq!(
            meta_obj.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "«content»"
        );
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = HashMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // Create a deeply nested JSON structure that exceeds MAX_NESTING_DEPTH
        let mut value = serde_json::json!("leaf");
        for _ in 0..=crate::error::MAX_NESTING_DEPTH {
            value = serde_json::json!([value]);
        }

        // The inner function should return an error
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_err());

        if let Err(NormalizationError::NestingTooDeep { depth, max }) = result {
            assert!(depth > max);
            assert_eq!(max, crate::error::MAX_NESTING_DEPTH);
        } else {
            panic!("Expected NestingTooDeep error");
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        // Create a nested structure just within the limit
        let mut value = serde_json::json!("leaf");
        for _ in 0..50 {
            value = serde_json::json!([value]);
        }

        // This should succeed
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_ok());
    }

    // Tests for normalize_document

    #[test]
    fn test_normalize_document_basic() {
        use crate::parse::ParsedDocument;

        let mut fields = std::collections::HashMap::new();
        fields.insert(
            "title".to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<placeholder>>")),
        );
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>> \u{202D}**bold**")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized = super::normalize_document(doc);

        // Title has chevrons stripped
        assert_eq!(
            normalized.get_field("title").unwrap().as_str().unwrap(),
            "placeholder"
        );

        // Body has guillemets converted and bidi stripped
        assert_eq!(normalized.body().unwrap(), "«content» **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        use crate::parse::ParsedDocument;

        let fields = std::collections::HashMap::new();
        let doc = ParsedDocument::with_quill_tag(fields, "custom_quill".to_string());
        let normalized = super::normalize_document(doc);

        assert_eq!(normalized.quill_tag(), "custom_quill");
    }

    // Running normalization twice must not change the result further.
    #[test]
    fn test_normalize_document_idempotent() {
        use crate::parse::ParsedDocument;

        let mut fields = std::collections::HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>>")),
        );

        let doc = ParsedDocument::new(fields);
        let normalized_once = super::normalize_document(doc);
        let normalized_twice = super::normalize_document(normalized_once.clone());

        // Calling normalize_document twice should produce the same result
        assert_eq!(
            normalized_once.body().unwrap(),
            normalized_twice.body().unwrap()
        );
    }
}
896}