quillmark_core/
normalize.rs

1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Orchestrate guillemet preprocessing (`<<text>>` → `«text»`)
14//! - Apply all normalizations in the correct order
15//!
16//! ## Functions
17//!
18//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
19//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
20//! - [`normalize_fields`] - Normalize document fields (bidi + guillemets)
21//!
22//! ## Why Normalize?
23//!
24//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
25//! control characters used for bidirectional text layout. When placed adjacent to markdown
26//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
27//!
28//! ```text
29//! **bold** or <U+202D>**(1234**
30//!             ^^^^^^^^ invisible LRO here prevents second ** from being recognized as bold
31//! ```
32//!
33//! These characters commonly appear when copying text from:
34//! - Web pages with mixed LTR/RTL content
35//! - PDF documents
36//! - Word processors
37//! - Some clipboard managers
38//!
39//! ## Examples
40//!
41//! ```
42//! use quillmark_core::normalize::strip_bidi_formatting;
43//!
44//! // Input with invisible U+202D (LRO) before second **
45//! let input = "**asdf** or \u{202D}**(1234**";
46//! let cleaned = strip_bidi_formatting(input);
47//! assert_eq!(cleaned, "**asdf** or **(1234**");
48//! ```
49
50use crate::guillemet::{preprocess_markdown_guillemets, strip_chevrons};
51use crate::parse::BODY_FIELD;
52use crate::value::QuillValue;
53use std::collections::HashMap;
54
55/// Maximum nesting depth for JSON value normalization to prevent stack overflow
56const MAX_NESTING_DEPTH: usize = 100;
57
58/// Errors that can occur during normalization
59#[derive(Debug, thiserror::Error)]
60pub enum NormalizationError {
61    /// JSON nesting depth exceeded maximum allowed
62    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
63    NestingTooDeep {
64        /// Actual depth
65        depth: usize,
66        /// Maximum allowed depth
67        max: usize,
68    },
69}
70
/// Returns `true` if `c` is a Unicode bidirectional formatting character.
///
/// The set matches the code points carrying the Unicode `Bidi_Control`
/// property: U+061C, U+200E..=U+200F, U+202A..=U+202E and U+2066..=U+2069.
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{061C}' // ARABIC LETTER MARK (ALM)
        | '\u{200E}'..='\u{200F}' // LRM, RLM (directional marks)
        | '\u{202A}'..='\u{202E}' // LRE, RLE, PDF, LRO, RLO (embeddings and overrides)
        | '\u{2066}'..='\u{2069}' // LRI, RLI, FSI, PDI (isolates)
    )
}

/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
///
/// These invisible control characters are used for bidirectional text layout but can
/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
/// All other characters, including right-to-left letters themselves, are preserved.
///
/// # Characters Stripped
///
/// - U+061C (ARABIC LETTER MARK, ALM)
/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
/// - U+2068 (FIRST STRONG ISOLATE, FSI)
/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::strip_bidi_formatting;
///
/// // Normal text is unchanged
/// assert_eq!(strip_bidi_formatting("hello"), "hello");
///
/// // LRO character is stripped
/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
///
/// // All bidi characters are stripped
/// let input = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
/// assert_eq!(strip_bidi_formatting(input), "");
/// ```
pub fn strip_bidi_formatting(s: &str) -> String {
    // Fast path: input without bidi controls is copied in one bulk operation
    // instead of being rebuilt character by character. (The owned return type
    // means an allocation is still required.)
    let first = match s.find(is_bidi_char) {
        Some(index) => index,
        None => return s.to_owned(),
    };

    // Everything before `first` is already known to be clean, so it is copied
    // verbatim; only the remainder needs filtering. `first` comes from
    // `str::find`, so it is always a valid char boundary. The result can never
    // be longer than the input, so one up-front reservation suffices.
    let mut cleaned = String::with_capacity(s.len());
    cleaned.push_str(&s[..first]);
    cleaned.extend(s[first..].chars().filter(|&c| !is_bidi_char(c)));
    cleaned
}
132
133/// Normalizes markdown content by applying all preprocessing steps.
134///
135/// This function applies normalizations in the correct order:
136/// 1. Strip Unicode bidirectional formatting characters
137///
138/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
139/// in [`normalize_fields`] because it needs to be applied after schema defaults
140/// and coercion.
141///
142/// # Examples
143///
144/// ```
145/// use quillmark_core::normalize::normalize_markdown;
146///
147/// // Bidi characters are stripped
148/// let input = "**bold** \u{202D}**more**";
149/// let normalized = normalize_markdown(input);
150/// assert_eq!(normalized, "**bold** **more**");
151/// ```
152pub fn normalize_markdown(markdown: &str) -> String {
153    strip_bidi_formatting(markdown)
154}
155
156/// Normalizes a string value by stripping bidi characters and optionally processing guillemets.
157///
158/// - For body content: applies `preprocess_markdown_guillemets` (converts `<<text>>` to `«text»`)
159/// - For other fields: applies `strip_chevrons` (removes chevrons entirely)
160fn normalize_string(s: &str, is_body: bool) -> String {
161    // First strip bidi formatting characters
162    let cleaned = strip_bidi_formatting(s);
163
164    // Then apply guillemet preprocessing
165    if is_body {
166        preprocess_markdown_guillemets(&cleaned)
167    } else {
168        strip_chevrons(&cleaned)
169    }
170}
171
172/// Recursively normalize a JSON value with depth tracking.
173///
174/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
175fn normalize_json_value_inner(
176    value: serde_json::Value,
177    is_body: bool,
178    depth: usize,
179) -> Result<serde_json::Value, NormalizationError> {
180    if depth > MAX_NESTING_DEPTH {
181        return Err(NormalizationError::NestingTooDeep {
182            depth,
183            max: MAX_NESTING_DEPTH,
184        });
185    }
186
187    match value {
188        serde_json::Value::String(s) => {
189            Ok(serde_json::Value::String(normalize_string(&s, is_body)))
190        }
191        serde_json::Value::Array(arr) => {
192            let normalized: Result<Vec<_>, _> = arr
193                .into_iter()
194                .map(|v| normalize_json_value_inner(v, false, depth + 1))
195                .collect();
196            Ok(serde_json::Value::Array(normalized?))
197        }
198        serde_json::Value::Object(map) => {
199            let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
200                .into_iter()
201                .map(|(k, v)| {
202                    let is_body = k == BODY_FIELD;
203                    normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
204                })
205                .collect();
206            Ok(serde_json::Value::Object(processed?))
207        }
208        // Pass through other types unchanged (numbers, booleans, null)
209        other => Ok(other),
210    }
211}
212
213/// Recursively normalize a JSON value.
214///
215/// This is a convenience wrapper that starts depth tracking at 0.
216/// Logs a warning and returns the original value if depth is exceeded.
217fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
218    match normalize_json_value_inner(value.clone(), is_body, 0) {
219        Ok(normalized) => normalized,
220        Err(e) => {
221            // Log warning but don't fail - return original value
222            eprintln!("Warning: {}", e);
223            value
224        }
225    }
226}
227
228/// Normalizes document fields by applying all preprocessing steps.
229///
230/// This function orchestrates input normalization for document fields:
231/// 1. Strips Unicode bidirectional formatting characters from all string values
232/// 2. For the body field: converts `<<text>>` to `«text»` (guillemets)
233/// 3. For other fields: strips chevrons entirely (`<<text>>` → `text`)
234///
235/// # Processing Order
236///
237/// The normalization order is important:
238/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
239/// 2. **Guillemet preprocessing** - Converts user syntax to internal markers
240///
241/// # Examples
242///
243/// ```
244/// use quillmark_core::normalize::normalize_fields;
245/// use quillmark_core::QuillValue;
246/// use std::collections::HashMap;
247///
248/// let mut fields = HashMap::new();
249/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
250/// fields.insert("body".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
251///
252/// let result = normalize_fields(fields);
253///
254/// // Title has chevrons stripped
255/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
256///
257/// // Body has bidi chars stripped (guillemet would apply if there were any <<>>)
258/// assert_eq!(result.get("body").unwrap().as_str().unwrap(), "**bold** **more**");
259/// ```
260pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
261    fields
262        .into_iter()
263        .map(|(key, value)| {
264            let json = value.into_json();
265            let processed = normalize_json_value(json, key == BODY_FIELD);
266            (key, QuillValue::from_json(processed))
267        })
268        .collect()
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `normalize_fields` on a map holding the single entry `key` → `json`
    /// and returns the normalized value stored under `key`.
    fn normalize_single(key: &str, json: serde_json::Value) -> QuillValue {
        let mut fields = HashMap::new();
        fields.insert(key.to_string(), QuillValue::from_json(json));
        normalize_fields(fields)
            .remove(key)
            .expect("normalize_fields must keep every key")
    }

    /// Wraps the string `"leaf"` in `levels` nested JSON arrays, which places
    /// the leaf at nesting depth `levels`.
    fn nested_arrays(levels: usize) -> serde_json::Value {
        let mut value = serde_json::json!("leaf");
        for _ in 0..levels {
            value = serde_json::json!([value]);
        }
        value
    }

    // Tests for strip_bidi_formatting

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let body = normalize_single("body", serde_json::json!("**bold** \u{202D}**more**"));
        assert_eq!(body.as_str().unwrap(), "**bold** **more**");
    }

    #[test]
    fn test_normalize_fields_body_guillemets() {
        let body = normalize_single("body", serde_json::json!("<<raw>>"));
        assert_eq!(body.as_str().unwrap(), "«raw»");
    }

    #[test]
    fn test_normalize_fields_body_both() {
        let body = normalize_single("body", serde_json::json!("<<raw>> \u{202D}**bold**"));
        // Bidi stripped first, then guillemets converted
        assert_eq!(body.as_str().unwrap(), "«raw» **bold**");
    }

    #[test]
    fn test_normalize_fields_other_field_chevrons_stripped() {
        let title = normalize_single("title", serde_json::json!("<<hello>>"));
        assert_eq!(title.as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let title = normalize_single("title", serde_json::json!("he\u{202D}llo"));
        assert_eq!(title.as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_nested_values() {
        let value = normalize_single("items", serde_json::json!(["<<a>>", "\u{202D}b"]));
        let items = value.as_array().unwrap();
        assert_eq!(items[0].as_str().unwrap(), "a");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    #[test]
    fn test_normalize_fields_object_values() {
        let meta = normalize_single(
            "meta",
            serde_json::json!({
                "title": "<<hello>>",
                "body": "<<content>>"
            }),
        );
        let meta_obj = meta.as_object().unwrap();
        // Nested "body" key should be recognized
        assert_eq!(meta_obj.get("title").unwrap().as_str().unwrap(), "hello");
        assert_eq!(meta_obj.get("body").unwrap().as_str().unwrap(), "«content»");
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = HashMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // One level more than allowed: the leaf sits at MAX_NESTING_DEPTH + 1.
        let value = nested_arrays(MAX_NESTING_DEPTH + 1);

        match normalize_json_value_inner(value, false, 0) {
            Err(NormalizationError::NestingTooDeep { depth, max }) => {
                assert!(depth > max);
                assert_eq!(max, MAX_NESTING_DEPTH);
            }
            Ok(_) => panic!("Expected NestingTooDeep error"),
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        // Exactly at the limit: the leaf sits at depth MAX_NESTING_DEPTH,
        // which is the deepest nesting that must still be accepted.
        let at_limit = nested_arrays(MAX_NESTING_DEPTH);
        assert!(normalize_json_value_inner(at_limit, false, 0).is_ok());

        // Comfortably below the limit as well.
        let shallow = nested_arrays(50);
        assert!(normalize_json_value_inner(shallow, false, 0).is_ok());
    }

    #[test]
    fn test_normalize_json_value_depth_exceeded_returns_original() {
        // The non-failing wrapper must hand back the input untouched when the
        // nesting limit is exceeded.
        let value = nested_arrays(MAX_NESTING_DEPTH + 1);
        let result = normalize_json_value(value.clone(), false);
        assert_eq!(result, value);
    }
}