use crate::error::MAX_NESTING_DEPTH;
use crate::parse::BODY_FIELD;
use crate::value::QuillValue;
use std::collections::HashMap;
use unicode_normalization::UnicodeNormalization;
/// Errors that can be produced while normalizing field values.
#[derive(Debug, thiserror::Error)]
pub enum NormalizationError {
    /// A JSON value was nested more deeply than the configured maximum.
    #[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
    NestingTooDeep {
        /// Depth at which the limit was found to be exceeded.
        depth: usize,
        /// Maximum depth that is accepted.
        max: usize,
    },
}
/// Returns `true` when `c` is one of the bidirectional formatting characters
/// removed by this module:
///
/// * U+061C (Arabic letter mark)
/// * U+200E..=U+200F (left-to-right / right-to-left marks)
/// * U+202A..=U+202E (embeddings, overrides and their terminator)
/// * U+2066..=U+2069 (isolates and their terminator)
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        '\u{061C}'
            | '\u{200E}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2066}'..='\u{2069}'
    )
}
/// Returns a copy of `s` with every bidirectional formatting character
/// (as classified by `is_bidi_char`) removed.
///
/// All other characters are kept in their original order. Input that
/// contains no such character is copied unchanged.
pub fn strip_bidi_formatting(s: &str) -> String {
    match s.find(is_bidi_char) {
        // Nothing to strip: hand back a plain copy.
        None => s.to_owned(),
        Some(first_hit) => {
            // Everything before the first hit is known to be clean, so only
            // the remainder needs to be filtered character by character.
            let (clean_prefix, remainder) = s.split_at(first_hit);
            let mut stripped = String::with_capacity(s.len());
            stripped.push_str(clean_prefix);
            stripped.extend(remainder.chars().filter(|&c| !is_bidi_char(c)));
            stripped
        }
    }
}
/// Inserts a line break after the closing `-->` of an HTML comment whenever
/// non-whitespace text follows it on the same line.
///
/// Only `-->` sequences that terminate a preceding `<!--` are considered; a
/// stray `-->` without an opener is left alone, as is an opener that is never
/// closed. A closer that is already followed by a newline, by nothing, or by
/// whitespace only up to the end of the line is not modified.
pub fn fix_html_comment_fences(s: &str) -> String {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    // Without any closing fence there is nothing that could need separating.
    if !s.contains(CLOSER) {
        return s.to_owned();
    }

    let mut out = String::with_capacity(s.len() + 16);
    let mut cursor = 0;

    while let Some(open_at) = s[cursor..].find(OPENER).map(|rel| cursor + rel) {
        // The closer is searched for starting at the opener itself, so its
        // dashes may overlap with the opener's (e.g. `<!-->`).
        let fence_end = match s[open_at..].find(CLOSER) {
            Some(rel) => open_at + rel + CLOSER.len(),
            // Unterminated comment: the remainder is copied verbatim below.
            None => break,
        };

        out.push_str(&s[cursor..fence_end]);
        cursor = fence_end;

        // Whatever shares the closer's line. An empty remainder, an immediate
        // "\n" and an immediate "\r\n" all trim down to the empty string.
        let same_line = s[fence_end..].split('\n').next().unwrap_or("");
        if !same_line.trim().is_empty() {
            out.push('\n');
        }
    }

    out.push_str(&s[cursor..]);
    out
}
/// Normalizes a markdown string: bidirectional formatting characters are
/// stripped first, then HTML comment fences are repaired on the result.
pub fn normalize_markdown(markdown: &str) -> String {
    fix_html_comment_fences(&strip_bidi_formatting(markdown))
}
/// Normalizes a single string value.
///
/// Bidirectional formatting characters are always stripped. HTML comment
/// fences are repaired only when `is_body` is set.
fn normalize_string(s: &str, is_body: bool) -> String {
    let without_bidi = strip_bidi_formatting(s);
    if !is_body {
        return without_bidi;
    }
    fix_html_comment_fences(&without_bidi)
}
/// Recursively normalizes every string contained in `value`.
///
/// * `is_body` selects body treatment (see `normalize_string`) when `value`
///   itself is a string.
/// * `depth` is the nesting level of `value`; each array element or object
///   member is visited at `depth + 1`.
///
/// Array elements are never treated as body text. Object members are treated
/// as body text exactly when their key equals `BODY_FIELD`. Non-string scalars
/// are returned unchanged.
///
/// # Errors
///
/// Returns [`NormalizationError::NestingTooDeep`] as soon as a value is
/// reached whose depth exceeds `MAX_NESTING_DEPTH`.
fn normalize_json_value_inner(
    value: serde_json::Value,
    is_body: bool,
    depth: usize,
) -> Result<serde_json::Value, NormalizationError> {
    if depth > MAX_NESTING_DEPTH {
        return Err(NormalizationError::NestingTooDeep {
            depth,
            max: MAX_NESTING_DEPTH,
        });
    }

    let normalized = match value {
        serde_json::Value::String(text) => {
            serde_json::Value::String(normalize_string(&text, is_body))
        }
        serde_json::Value::Array(items) => {
            let mut out = Vec::with_capacity(items.len());
            for item in items {
                out.push(normalize_json_value_inner(item, false, depth + 1)?);
            }
            serde_json::Value::Array(out)
        }
        serde_json::Value::Object(members) => {
            let mut out = serde_json::Map::new();
            for (key, member) in members {
                // The body flag is decided per member from its own key; it is
                // not inherited from the enclosing value.
                let member_is_body = key == BODY_FIELD;
                let member = normalize_json_value_inner(member, member_is_body, depth + 1)?;
                out.insert(key, member);
            }
            serde_json::Value::Object(out)
        }
        untouched => untouched,
    };
    Ok(normalized)
}
/// Best-effort wrapper around `normalize_json_value_inner`, starting at
/// depth 0.
///
/// When normalization fails, a warning is written to stderr and the input is
/// returned unmodified. The input is cloned before the attempt so that the
/// original is still available for that fallback.
fn normalize_json_value(value: serde_json::Value, is_body: bool) -> serde_json::Value {
    normalize_json_value_inner(value.clone(), is_body, 0).unwrap_or_else(|e| {
        eprintln!("Warning: {}", e);
        value
    })
}
/// Normalizes every entry of a field map.
///
/// Each field name is passed through `normalize_field_name`, and each value is
/// converted to JSON, normalized with `normalize_json_value`, and converted
/// back. A field receives body treatment when its *normalized* name equals
/// `BODY_FIELD`.
///
/// If two input names normalize to the same string, only one of the entries
/// is present in the returned map.
pub fn normalize_fields(fields: HashMap<String, QuillValue>) -> HashMap<String, QuillValue> {
    let mut normalized = HashMap::with_capacity(fields.len());
    for (raw_name, raw_value) in fields {
        let name = normalize_field_name(&raw_name);
        let is_body = name == BODY_FIELD;
        let json = normalize_json_value(raw_value.into_json(), is_body);
        normalized.insert(name, QuillValue::from_json(json));
    }
    normalized
}
/// Returns `name` converted to Unicode Normalization Form C (NFC).
pub fn normalize_field_name(name: &str) -> String {
    name.nfc().collect::<String>()
}
/// Builds a new document whose fields are the normalized form of `doc`'s
/// fields (see `normalize_fields`) and whose quill reference is copied from
/// `doc`.
///
/// # Errors
///
/// The current implementation always returns `Ok`.
pub fn normalize_document(
    doc: crate::parse::ParsedDocument,
) -> Result<crate::parse::ParsedDocument, crate::error::ParseError> {
    let quill_ref = doc.quill_reference().clone();
    let fields = normalize_fields(doc.fields().clone());
    Ok(crate::parse::ParsedDocument::with_quill_ref(
        fields, quill_ref,
    ))
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- strip_bidi_formatting -------------------------------------------

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        assert_eq!(strip_bidi_formatting("hello\u{061C}world"), "helloworld");
        assert_eq!(strip_bidi_formatting("\u{061C}**bold**"), "**bold**");
    }

    /// Ordinary non-ASCII text (CJK, Arabic letters, emoji) must pass through
    /// untouched. The CJK and Arabic samples are written as escapes so they
    /// cannot be damaged by an encoding round-trip of this file.
    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Four CJK ideographs.
        let cjk = "\u{4F60}\u{597D}\u{4E16}\u{754C}";
        assert_eq!(strip_bidi_formatting(cjk), cjk);
        // Five Arabic letters (right-to-left script, no formatting characters).
        let arabic = "\u{0645}\u{0631}\u{062D}\u{0628}\u{0627}";
        assert_eq!(strip_bidi_formatting(arabic), arabic);
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // ---- normalize_markdown ----------------------------------------------

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    // ---- fix_html_comment_fences -----------------------------------------

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment --> \nSome text"),
            "<!-- comment --> \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    // ---- normalize_fields ------------------------------------------------

    #[test]
    fn test_normalize_fields_body_bidi() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("**bold** \u{202D}**more**")),
        );
        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_fields_body_chevrons_preserved() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>>")),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get(BODY_FIELD).unwrap().as_str().unwrap(), "<<raw>>");
    }

    #[test]
    fn test_normalize_fields_body_chevrons_and_bidi() {
        let mut fields = HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            QuillValue::from_json(serde_json::json!("<<raw>> \u{202D}**bold**")),
        );
        let result = normalize_fields(fields);
        assert_eq!(
            result.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<raw>> **bold**"
        );
    }

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("<<hello>>")),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_stripped() {
        let mut fields = HashMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("he\u{202D}llo")),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "hello");
    }

    #[test]
    fn test_normalize_fields_nested_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "items".to_string(),
            QuillValue::from_json(serde_json::json!(["<<a>>", "\u{202D}b"])),
        );
        let result = normalize_fields(fields);
        let items = result.get("items").unwrap().as_array().unwrap();
        assert_eq!(items[0].as_str().unwrap(), "<<a>>");
        assert_eq!(items[1].as_str().unwrap(), "b");
    }

    #[test]
    fn test_normalize_fields_object_values() {
        let mut fields = HashMap::new();
        fields.insert(
            "meta".to_string(),
            QuillValue::from_json(serde_json::json!({
                "title": "<<hello>>",
                BODY_FIELD: "<<content>>"
            })),
        );
        let result = normalize_fields(fields);
        let meta = result.get("meta").unwrap();
        let meta_obj = meta.as_object().unwrap();
        assert_eq!(
            meta_obj.get("title").unwrap().as_str().unwrap(),
            "<<hello>>"
        );
        assert_eq!(
            meta_obj.get(BODY_FIELD).unwrap().as_str().unwrap(),
            "<<content>>"
        );
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = HashMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    // ---- nesting depth limit ---------------------------------------------

    #[test]
    fn test_normalize_json_value_inner_depth_exceeded() {
        // Wrap the leaf in MAX_NESTING_DEPTH + 1 arrays so that it sits one
        // level beyond the limit.
        let mut value = serde_json::json!("leaf");
        for _ in 0..=crate::error::MAX_NESTING_DEPTH {
            value = serde_json::json!([value]);
        }
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_err());
        if let Err(NormalizationError::NestingTooDeep { depth, max }) = result {
            assert!(depth > max);
            assert_eq!(max, crate::error::MAX_NESTING_DEPTH);
        } else {
            panic!("Expected NestingTooDeep error");
        }
    }

    #[test]
    fn test_normalize_json_value_inner_within_limit() {
        let mut value = serde_json::json!("leaf");
        for _ in 0..50 {
            value = serde_json::json!([value]);
        }
        let result = super::normalize_json_value_inner(value, false, 0);
        assert!(result.is_ok());
    }

    // ---- normalize_document ----------------------------------------------

    #[test]
    fn test_normalize_document_basic() {
        use crate::parse::ParsedDocument;
        let mut fields = std::collections::HashMap::new();
        fields.insert(
            "title".to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<placeholder>>")),
        );
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>> \u{202D}**bold**")),
        );
        let doc = ParsedDocument::new(fields);
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(
            normalized.get_field("title").unwrap().as_str().unwrap(),
            "<<placeholder>>"
        );
        assert_eq!(normalized.body().unwrap(), "<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        use crate::parse::ParsedDocument;
        use crate::version::QuillReference;
        use std::str::FromStr;
        let fields = std::collections::HashMap::new();
        let quill_ref = QuillReference::from_str("custom_quill").unwrap();
        let doc = ParsedDocument::with_quill_ref(fields, quill_ref);
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        use crate::parse::ParsedDocument;
        let mut fields = std::collections::HashMap::new();
        fields.insert(
            BODY_FIELD.to_string(),
            crate::value::QuillValue::from_json(serde_json::json!("<<content>>")),
        );
        let doc = ParsedDocument::new(fields);
        let normalized_once = super::normalize_document(doc).unwrap();
        let normalized_twice = super::normalize_document(normalized_once.clone()).unwrap();
        assert_eq!(
            normalized_once.body().unwrap(),
            normalized_twice.body().unwrap()
        );
    }
}