quillmark_core/normalize.rs
1//! # Input Normalization
2//!
3//! This module provides input normalization for markdown content before parsing.
4//! Normalization ensures that invisible control characters and other artifacts
5//! that can interfere with markdown parsing are handled consistently.
6//!
7//! ## Overview
8//!
9//! Input text may contain invisible Unicode characters (especially from copy-paste)
10//! that interfere with markdown parsing. This module provides functions to:
11//!
12//! - Strip Unicode bidirectional formatting characters that break delimiter recognition
13//! - Orchestrate guillemet preprocessing (`<<text>>` → `«text»`)
14//! - Apply all normalizations in the correct order
15//!
16//! ## Functions
17//!
18//! - [`strip_bidi_formatting`] - Remove Unicode bidi control characters
19//! - [`normalize_markdown`] - Apply all markdown-specific normalizations
20//! - [`normalize_fields`] - Normalize document fields (bidi + guillemets)
21//!
22//! ## Why Normalize?
23//!
24//! Unicode bidirectional formatting characters (LRO, RLO, LRE, RLE, etc.) are invisible
25//! control characters used for bidirectional text layout. When placed adjacent to markdown
26//! delimiters like `**`, they can prevent parsers from recognizing the delimiters:
27//!
28//! ```text
29//! **bold** or <U+202D>**(1234**
//!             ^^^^^^^^ the invisible LRO here prevents the second `**` pair from being recognized as bold
31//! ```
32//!
33//! These characters commonly appear when copying text from:
34//! - Web pages with mixed LTR/RTL content
35//! - PDF documents
36//! - Word processors
37//! - Some clipboard managers
38//!
39//! ## Examples
40//!
41//! ```
42//! use quillmark_core::normalize::strip_bidi_formatting;
43//!
44//! // Input with invisible U+202D (LRO) before second **
45//! let input = "**asdf** or \u{202D}**(1234**";
46//! let cleaned = strip_bidi_formatting(input);
47//! assert_eq!(cleaned, "**asdf** or **(1234**");
48//! ```
49
50use crate::guillemet::{preprocess_markdown_guillemets, strip_chevrons};
51use crate::parse::BODY_FIELD;
52use crate::value::QuillValue;
53use std::collections::HashMap;
54
/// Maximum nesting depth for JSON value normalization to prevent stack overflow.
///
/// Depth is counted from 0 at the value passed to the recursive walk and grows by
/// one for every array element or object entry descended into. A node whose depth
/// is strictly greater than this limit aborts normalization with
/// [`NormalizationError::NestingTooDeep`].
const MAX_NESTING_DEPTH: usize = 100;
57
/// Errors that can occur during normalization.
///
/// The only failure mode today is a JSON value that nests more deeply than
/// `MAX_NESTING_DEPTH` allows.
#[derive(Debug, thiserror::Error)]
pub enum NormalizationError {
    /// JSON nesting depth exceeded the maximum allowed.
    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
    NestingTooDeep {
        /// Depth of the node at which the limit was found to be exceeded
        depth: usize,
        /// Maximum allowed depth (the value of `MAX_NESTING_DEPTH`)
        max: usize,
    },
}
70
/// Returns `true` if `c` is a Unicode bidirectional control character.
///
/// The set is the full Unicode `Bidi_Control` property: the Arabic letter mark,
/// the two implicit directional marks, the explicit embeddings/overrides with
/// their terminator, and the directional isolates with their terminator.
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{061C}' // ARABIC LETTER MARK (ALM)
            | '\u{200E}'..='\u{200F}' // LRM, RLM
            | '\u{202A}'..='\u{202E}' // LRE, RLE, PDF, LRO, RLO
            | '\u{2066}'..='\u{2069}' // LRI, RLI, FSI, PDI
    )
}

/// Strips Unicode bidirectional formatting characters that can interfere with markdown parsing.
///
/// These invisible control characters are used for bidirectional text layout but can
/// break markdown delimiter recognition when placed adjacent to `**`, `*`, `_`, etc.
/// Every other character, including visible right-to-left text, is preserved.
///
/// # Characters Stripped
///
/// - U+061C (ARABIC LETTER MARK, ALM)
/// - U+200E (LEFT-TO-RIGHT MARK, LRM)
/// - U+200F (RIGHT-TO-LEFT MARK, RLM)
/// - U+202A (LEFT-TO-RIGHT EMBEDDING, LRE)
/// - U+202B (RIGHT-TO-LEFT EMBEDDING, RLE)
/// - U+202C (POP DIRECTIONAL FORMATTING, PDF)
/// - U+202D (LEFT-TO-RIGHT OVERRIDE, LRO)
/// - U+202E (RIGHT-TO-LEFT OVERRIDE, RLO)
/// - U+2066 (LEFT-TO-RIGHT ISOLATE, LRI)
/// - U+2067 (RIGHT-TO-LEFT ISOLATE, RLI)
/// - U+2068 (FIRST STRONG ISOLATE, FSI)
/// - U+2069 (POP DIRECTIONAL ISOLATE, PDI)
///
/// # Examples
///
/// ```
/// use quillmark_core::normalize::strip_bidi_formatting;
///
/// // Normal text is unchanged
/// assert_eq!(strip_bidi_formatting("hello"), "hello");
///
/// // LRO character is stripped
/// assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
///
/// // All bidi characters are stripped
/// let input = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}";
/// assert_eq!(strip_bidi_formatting(input), "");
/// ```
pub fn strip_bidi_formatting(s: &str) -> String {
    // Fast path for the common case of clean input: a bulk copy of the bytes is
    // cheaper than decoding and re-encoding every character. The copy itself
    // still allocates because the function returns an owned `String`.
    if !s.contains(is_bidi_char) {
        return s.to_owned();
    }

    // The result can only be shorter than the input, so `s.len()` is a safe
    // upper bound that avoids regrowing the buffer while filtering.
    let mut cleaned = String::with_capacity(s.len());
    cleaned.extend(s.chars().filter(|&c| !is_bidi_char(c)));
    cleaned
}
132
133/// Normalizes markdown content by applying all preprocessing steps.
134///
135/// This function applies normalizations in the correct order:
136/// 1. Strip Unicode bidirectional formatting characters
137///
138/// Note: Guillemet preprocessing (`<<text>>` → `«text»`) is handled separately
139/// in [`normalize_fields`] because it needs to be applied after schema defaults
140/// and coercion.
141///
142/// # Examples
143///
144/// ```
145/// use quillmark_core::normalize::normalize_markdown;
146///
147/// // Bidi characters are stripped
148/// let input = "**bold** \u{202D}**more**";
149/// let normalized = normalize_markdown(input);
150/// assert_eq!(normalized, "**bold** **more**");
151/// ```
152pub fn normalize_markdown(markdown: &str) -> String {
153 strip_bidi_formatting(markdown)
154}
155
156/// Normalizes a string value by stripping bidi characters and optionally processing guillemets.
157///
158/// - For body content: applies `preprocess_markdown_guillemets` (converts `<<text>>` to `«text»`)
159/// - For other fields: applies `strip_chevrons` (removes chevrons entirely)
160fn normalize_string(s: &str, is_body: bool) -> String {
161 // First strip bidi formatting characters
162 let cleaned = strip_bidi_formatting(s);
163
164 // Then apply guillemet preprocessing
165 if is_body {
166 preprocess_markdown_guillemets(&cleaned)
167 } else {
168 strip_chevrons(&cleaned)
169 }
170}
171
172/// Recursively normalize a JSON value with depth tracking.
173///
174/// Returns an error if nesting exceeds MAX_NESTING_DEPTH to prevent stack overflow.
175fn normalize_json_value_inner(
176 value: serde_json::Value,
177 is_body: bool,
178 depth: usize,
179) -> Result<serde_json::Value, NormalizationError> {
180 if depth > MAX_NESTING_DEPTH {
181 return Err(NormalizationError::NestingTooDeep {
182 depth,
183 max: MAX_NESTING_DEPTH,
184 });
185 }
186
187 match value {
188 serde_json::Value::String(s) => {
189 Ok(serde_json::Value::String(normalize_string(&s, is_body)))
190 }
191 serde_json::Value::Array(arr) => {
192 let normalized: Result<Vec<_>, _> = arr
193 .into_iter()
194 .map(|v| normalize_json_value_inner(v, false, depth + 1))
195 .collect();
196 Ok(serde_json::Value::Array(normalized?))
197 }
198 serde_json::Value::Object(map) => {
199 let processed: Result<serde_json::Map<String, serde_json::Value>, _> = map
200 .into_iter()
201 .map(|(k, v)| {
202 let is_body = k == BODY_FIELD;
203 normalize_json_value_inner(v, is_body, depth + 1).map(|nv| (k, nv))
204 })
205 .collect();
206 Ok(serde_json::Value::Object(processed?))
207 }
208 // Pass through other types unchanged (numbers, booleans, null)
209 other => Ok(other),
210 }
211}
212
213/// Recursively normalize a JSON value.
214///
215/// This is a convenience wrapper that starts depth tracking at 0.
216/// Logs a warning and returns the original value if depth is exceeded.
217fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
218 match normalize_json_value_inner(value.clone(), is_body, 0) {
219 Ok(normalized) => normalized,
220 Err(e) => {
221 // Log warning but don't fail - return original value
222 eprintln!("Warning: {}", e);
223 value
224 }
225 }
226}
227
228/// Normalizes document fields by applying all preprocessing steps.
229///
230/// This function orchestrates input normalization for document fields:
231/// 1. Strips Unicode bidirectional formatting characters from all string values
232/// 2. For the body field: converts `<<text>>` to `«text»` (guillemets)
233/// 3. For other fields: strips chevrons entirely (`<<text>>` → `text`)
234///
235/// # Processing Order
236///
237/// The normalization order is important:
238/// 1. **Bidi stripping** - Must happen first so markdown delimiters are recognized
239/// 2. **Guillemet preprocessing** - Converts user syntax to internal markers
240///
241/// # Examples
242///
243/// ```
244/// use quillmark_core::normalize::normalize_fields;
245/// use quillmark_core::QuillValue;
246/// use std::collections::HashMap;
247///
248/// let mut fields = HashMap::new();
249/// fields.insert("title".to_string(), QuillValue::from_json(serde_json::json!("<<hello>>")));
250/// fields.insert("body".to_string(), QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")));
251///
252/// let result = normalize_fields(fields);
253///
254/// // Title has chevrons stripped
255/// assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
256///
257/// // Body has bidi chars stripped (guillemet would apply if there were any <<>>)
258/// assert_eq!(result.get("body").unwrap().as_str().unwrap(), "**bold** **more**");
259/// ```
260pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
261 fields
262 .into_iter()
263 .map(|(key, value)| {
264 let json = value.into_json();
265 let processed = normalize_json_value(json, key == BODY_FIELD);
266 (key, QuillValue::from_json(processed))
267 })
268 .collect()
269}
270
#[cfg(test)]
mod tests {
    use super::*;

    /// Normalizes a map containing exactly one field and returns that field's value.
    fn normalize_single(key: &str, value: serde_json::Value) -> QuillValue {
        let mut fields = HashMap::new();
        fields.insert(key.to_string(), QuillValue::from_json(value));
        normalize_fields(fields)
            .remove(key)
            .expect("normalize_fields must keep every input key")
    }

    /// Wraps the string `leaf` in `levels` single-element arrays, so the leaf
    /// ends up at nesting depth `levels` when the walk starts at depth 0.
    fn nested_arrays(leaf: &str, levels: usize) -> serde_json::Value {
        (0..levels).fold(serde_json::Value::String(leaf.to_string()), |inner, _| {
            serde_json::Value::Array(vec![inner])
        })
    }

    // Tests for strip_bidi_formatting

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        // U+202D (LEFT-TO-RIGHT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        // U+202E (RIGHT-TO-LEFT OVERRIDE)
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        // U+200E (LRM) and U+200F (RLM)
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        // U+202A (LRE), U+202B (RLE), U+202C (PDF)
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI)
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Non-bidi unicode should be preserved
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // Tests for normalize_markdown

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    // Tests for normalize_fields

    #[test]
    fn test_normalize_fields_body_bidi() {
        let body = normalize_single("body", serde_json::json!("**bold** \u{202D}**more**"));
        assert_eq!(body.as_str().unwrap(), "**bold** **more**");
    }

    #[test]
    fn test_normalize_fields_body_guillemets() {
        let body = normalize_single("body", serde_json::json!("<<raw>>"));
        assert_eq!(body.as_str().unwrap(), "«raw»");
    }

    #[test]
    fn test_normalize_fields_body_both() {
        let body = normalize_single("body", serde_json::json!("<<raw>> \u{202D}**bold**"));
        // Bidi stripped first, then guillemets converted
        assert_eq!(body.as_str().unwrap(), "«raw» **bold**");
    }

    #[test]
    fn test_normalize_fields_other_field_chevrons_stripped() {
        let title = normalize_single("title", serde_json::json!("<<hello>>"));
        assert_eq!(title.as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let title = normalize_single("title", serde_json::json!("he\u{202D}llo"));
        assert_eq!(title.as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_nested_values() {
        let value = normalize_single("items", serde_json::json!(["<<a>>", "\u{202D}b"]));
        let items = value.as_array().unwrap();
        assert_eq!(items[0].as_str().unwrap(), "a");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    #[test]
    fn test_normalize_fields_object_values() {
        let meta = normalize_single(
            "meta",
            serde_json::json!({
                "title": "<<hello>>",
                "body": "<<content>>"
            }),
        );
        let meta_obj = meta.as_object().unwrap();
        // Nested "body" key should be recognized
        assert_eq!(meta_obj.get("title").unwrap().as_str().unwrap(), "hello");
        assert_eq!(meta_obj.get("body").unwrap().as_str().unwrap(), "«content»");
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = HashMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );

        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // Tests for depth limiting

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // One level more than allowed: the leaf sits at depth MAX_NESTING_DEPTH + 1.
        let value = nested_arrays("leaf", MAX_NESTING_DEPTH + 1);

        match normalize_json_value_inner(value, false, 0) {
            Err(NormalizationError::NestingTooDeep { depth, max }) => {
                assert_eq!(max, MAX_NESTING_DEPTH);
                // The walk stops at the first node past the limit.
                assert_eq!(depth, MAX_NESTING_DEPTH + 1);
            }
            Ok(_) => panic!("Expected NestingTooDeep error"),
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        // Exactly at the limit: the leaf sits at depth MAX_NESTING_DEPTH, which
        // is still allowed because only depths strictly above it are rejected.
        let value = nested_arrays("<<leaf>>", MAX_NESTING_DEPTH);

        let result = normalize_json_value_inner(value, false, 0)
            .expect("nesting at the limit must be accepted");

        // The leaf must actually have been reached and normalized.
        assert_eq!(result, nested_arrays("leaf", MAX_NESTING_DEPTH));
    }

    #[test]
    fn test_normalize_json_value_falls_back_on_excessive_depth() {
        // When the limit is exceeded the wrapper returns the input untouched,
        // including strings that would otherwise have been normalized.
        let value = nested_arrays("<<leaf>>", MAX_NESTING_DEPTH + 1);

        let result = normalize_json_value(value.clone(), false);

        assert_eq!(result, value);
    }
}
502}