use crate::document::Card;
use crate::value::QuillValue;
use indexmap::IndexMap;
use unicode_normalization::UnicodeNormalization;
/// Error type declared for normalization failures.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// error — confirm where (or whether) it is still raised.
#[derive(Debug, thiserror::Error)]
pub enum NormalizationError {
/// A JSON value was nested more deeply than permitted (wording taken from
/// the `#[error]` message below).
#[error("JSON nesting too deep: {depth} levels (max: {max} levels)")]
NestingTooDeep {
/// Nesting depth reported in the error message.
depth: usize,
/// Limit reported in the error message as the maximum.
max: usize,
},
}
/// Returns `true` when `c` is one of the twelve bidirectional formatting
/// code points recognised by this module.
#[inline]
fn is_bidi_char(c: char) -> bool {
    matches!(
        c,
        // U+061C on its own.
        '\u{061C}'
            // U+200E and U+200F.
            | '\u{200E}'..='\u{200F}'
            // U+202A through U+202E.
            | '\u{202A}'..='\u{202E}'
            // U+2066 through U+2069.
            | '\u{2066}'..='\u{2069}'
    )
}
pub fn strip_bidi_formatting(s: &str) -> String {
if !s.chars().any(is_bidi_char) {
return s.to_string();
}
s.chars().filter(|c| !is_bidi_char(*c)).collect()
}
/// Ensures that text following an HTML comment's closing fence starts on its
/// own line.
///
/// The input is scanned left to right for `<!--`. For each opener, the first
/// `-->` at or after it is taken as the closing fence. If the remainder of the
/// line after that fence contains anything other than whitespace, a `\n` is
/// inserted directly after the fence. Fences already followed by a line break
/// (or only by whitespace up to the line break, or by the end of the input)
/// are left alone.
///
/// Details preserved from the scanning rules:
/// * a `-->` with no preceding `<!--` is never touched;
/// * an opener with no closing fence ends the scan and the rest of the input
///   is copied through unchanged;
/// * when the opener is immediately followed by a third hyphen (`<!---`) and
///   the character right after the located `-->` is also a hyphen, that
///   hyphen is kept together with the fence.
///
/// NOTE(review): presumably this exists so a downstream Markdown parser does
/// not treat the trailing text as part of the comment block — confirm.
pub fn fix_html_comment_fences(s: &str) -> String {
    const OPENER: &str = "<!--";
    const CLOSER: &str = "-->";

    // Without any closing fence there is nothing to repair.
    if !s.contains(CLOSER) {
        return s.to_owned();
    }

    // Small head-room for the newlines that may be inserted.
    let mut fixed = String::with_capacity(s.len() + 16);
    // The not-yet-copied tail of the input.
    let mut pending = s;

    while let Some(open_at) = pending.find(OPENER) {
        // Search from the opener itself so that stray `-->` sequences that
        // appear before it are ignored.
        let Some(close_rel) = pending[open_at..].find(CLOSER) else {
            // Unterminated comment: the remainder is flushed after the loop.
            break;
        };
        let mut fence_end = open_at + close_rel + CLOSER.len();

        let opener_has_third_hyphen = pending[open_at + OPENER.len()..].starts_with('-');
        if opener_has_third_hyphen && pending[fence_end..].starts_with('-') {
            fence_end += 1;
        }

        let (through_fence, following) = pending.split_at(fence_end);
        fixed.push_str(through_fence);

        // Everything between the fence and the next line feed (or the end of
        // the input). An empty rest, a leading "\n" and a leading "\r\n" all
        // yield a whitespace-only slice here, so they need no special casing.
        let rest_of_line = following
            .find('\n')
            .map_or(following, |newline_at| &following[..newline_at]);
        if !rest_of_line.trim().is_empty() {
            fixed.push('\n');
        }

        pending = following;
    }

    fixed.push_str(pending);
    fixed
}
/// Applies the body-normalization passes to `markdown` and returns the
/// result.
///
/// The passes run in a fixed order:
/// 1. `normalize_line_endings`
/// 2. `strip_bidi_formatting`
/// 3. `fix_html_comment_fences`
pub fn normalize_markdown(markdown: &str) -> String {
    fix_html_comment_fences(&strip_bidi_formatting(&normalize_line_endings(markdown)))
}
/// Converts every `\r\n` pair and every lone `\r` in `s` to a single `\n`.
///
/// Existing `\n` characters that are not preceded by `\r` are kept as they
/// are, and input without any `\r` is returned as an unchanged copy.
fn normalize_line_endings(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    let mut remaining = s;

    while let Some((before, after)) = remaining.split_once('\r') {
        normalized.push_str(before);
        normalized.push('\n');
        // A line feed directly after the carriage return belongs to the same
        // line break, so it is consumed together with it.
        remaining = after.strip_prefix('\n').unwrap_or(after);
    }

    normalized.push_str(remaining);
    normalized
}
/// Rebuilds `fields` with every key passed through `normalize_field_name`.
///
/// Values are moved into the new map untouched and entries are inserted in
/// the original iteration order. If two keys become identical after
/// normalization, the later value replaces the earlier one while the entry
/// keeps the position of its first occurrence.
pub fn normalize_fields(fields: IndexMap<String, QuillValue>) -> IndexMap<String, QuillValue> {
    let mut normalized: IndexMap<String, QuillValue> = Default::default();
    for (name, value) in fields {
        normalized.insert(normalize_field_name(&name), value);
    }
    normalized
}
/// Returns `name` in Unicode Normalization Form C, as produced by
/// `UnicodeNormalization::nfc`.
pub fn normalize_field_name(name: &str) -> String {
    let mut composed = String::with_capacity(name.len());
    composed.extend(name.nfc());
    composed
}
/// Builds a normalized copy of `doc`.
///
/// For the main card and for every additional card:
/// * frontmatter keys are passed through the field-name normalization
///   (`normalize_fields` for the main card, `normalize_field_name` per key for
///   the other cards); field values are carried over unchanged;
/// * the body is passed through `normalize_markdown`.
///
/// The main card keeps a clone of its existing sentinel, the other cards are
/// rebuilt with `Sentinel::Card(card.tag())`, and the document's warnings are
/// copied into the result.
///
/// # Errors
///
/// This implementation has no failure path and always returns `Ok`; the
/// `Result` is part of the signature callers rely on.
pub fn normalize_document(
    doc: crate::document::Document,
) -> Result<crate::document::Document, crate::error::ParseError> {
    use crate::document::{Document, Sentinel};

    // Main card: same sentinel, normalized keys and body.
    let source_main = doc.main();
    let main = Card::new_with_sentinel(
        source_main.sentinel().clone(),
        crate::document::Frontmatter::from_index_map(normalize_fields(
            source_main.frontmatter().to_index_map(),
        )),
        normalize_markdown(source_main.body()),
    );

    // Remaining cards, rebuilt one by one in their original order.
    let mut normalized_cards: Vec<Card> = Vec::with_capacity(doc.cards().len());
    for card in doc.cards().iter() {
        let fields: IndexMap<String, QuillValue> = card
            .frontmatter()
            .iter()
            .map(|(name, value)| (normalize_field_name(name), value.clone()))
            .collect();

        normalized_cards.push(Card::new_with_sentinel(
            Sentinel::Card(card.tag()),
            crate::document::Frontmatter::from_index_map(fields),
            normalize_markdown(card.body()),
        ));
    }

    Ok(Document::from_main_and_cards(
        main,
        normalized_cards,
        doc.warnings().to_vec(),
    ))
}
// Unit tests for the normalization helpers: bidi stripping, HTML comment
// fence repair, field-name normalization and whole-document normalization.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::document::Document;

    // ---- strip_bidi_formatting -------------------------------------------

    #[test]
    fn test_strip_bidi_no_change() {
        assert_eq!(strip_bidi_formatting("hello world"), "hello world");
        assert_eq!(strip_bidi_formatting(""), "");
        assert_eq!(strip_bidi_formatting("**bold** text"), "**bold** text");
    }

    #[test]
    fn test_strip_bidi_lro() {
        assert_eq!(strip_bidi_formatting("he\u{202D}llo"), "hello");
        assert_eq!(
            strip_bidi_formatting("**asdf** or \u{202D}**(1234**"),
            "**asdf** or **(1234**"
        );
    }

    #[test]
    fn test_strip_bidi_rlo() {
        assert_eq!(strip_bidi_formatting("he\u{202E}llo"), "hello");
    }

    #[test]
    fn test_strip_bidi_marks() {
        assert_eq!(strip_bidi_formatting("a\u{200E}b\u{200F}c"), "abc");
    }

    #[test]
    fn test_strip_bidi_embeddings() {
        assert_eq!(
            strip_bidi_formatting("\u{202A}text\u{202B}more\u{202C}"),
            "textmore"
        );
    }

    #[test]
    fn test_strip_bidi_isolates() {
        assert_eq!(
            strip_bidi_formatting("\u{2066}a\u{2067}b\u{2068}c\u{2069}"),
            "abc"
        );
    }

    #[test]
    fn test_strip_bidi_all_chars() {
        let all_bidi = "\u{061C}\u{200E}\u{200F}\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}\u{2066}\u{2067}\u{2068}\u{2069}";
        assert_eq!(strip_bidi_formatting(all_bidi), "");
    }

    #[test]
    fn test_strip_bidi_arabic_letter_mark() {
        assert_eq!(strip_bidi_formatting("hello\u{061C}world"), "helloworld");
        assert_eq!(strip_bidi_formatting("\u{061C}**bold**"), "**bold**");
    }

    #[test]
    fn test_strip_bidi_unicode_preserved() {
        // Genuine CJK and right-to-left text must pass through untouched; only
        // the formatting code points are removed. (These literals had been
        // corrupted into Latin-1 mojibake, which no longer exercised either
        // script.)
        assert_eq!(strip_bidi_formatting("你好世界"), "你好世界");
        assert_eq!(strip_bidi_formatting("مرحبا"), "مرحبا");
        assert_eq!(strip_bidi_formatting("🎉"), "🎉");
    }

    // ---- normalize_markdown ----------------------------------------------

    #[test]
    fn test_normalize_markdown_basic() {
        assert_eq!(normalize_markdown("hello"), "hello");
        assert_eq!(
            normalize_markdown("**bold** \u{202D}**more**"),
            "**bold** **more**"
        );
    }

    #[test]
    fn test_normalize_markdown_html_comment() {
        assert_eq!(
            normalize_markdown("<!-- comment -->Some text"),
            "<!-- comment -->\nSome text"
        );
    }

    #[test]
    fn test_normalize_markdown_line_endings() {
        // CRLF pairs and lone CRs both collapse to a single LF.
        assert_eq!(normalize_markdown("a\r\nb\rc\n"), "a\nb\nc\n");
        assert_eq!(
            normalize_markdown("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\nSome text"
        );
    }

    // ---- fix_html_comment_fences -----------------------------------------

    #[test]
    fn test_fix_html_comment_no_comment() {
        assert_eq!(fix_html_comment_fences("hello world"), "hello world");
        assert_eq!(fix_html_comment_fences("**bold** text"), "**bold** text");
        assert_eq!(fix_html_comment_fences(""), "");
    }

    #[test]
    fn test_fix_html_comment_single_line_trailing_text() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->Same line text"),
            "<!-- comment -->\nSame line text"
        );
    }

    #[test]
    fn test_fix_html_comment_already_newline() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\nNext line text"),
            "<!-- comment -->\nNext line text"
        );
    }

    #[test]
    fn test_fix_html_comment_only_whitespace_after() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment --> \nSome text"),
            "<!-- comment --> \nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_trailing_text() {
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\ncomment\n-->Trailing text"),
            "<!--\nmultiline\ncomment\n-->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiline_proper() {
        assert_eq!(
            fix_html_comment_fences("<!--\nmultiline\n-->\n\nParagraph text"),
            "<!--\nmultiline\n-->\n\nParagraph text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_comments() {
        assert_eq!(
            fix_html_comment_fences("<!-- first -->Text\n\n<!-- second -->More text"),
            "<!-- first -->\nText\n\n<!-- second -->\nMore text"
        );
    }

    #[test]
    fn test_fix_html_comment_end_of_string() {
        assert_eq!(
            fix_html_comment_fences("Some text before <!-- comment -->"),
            "Some text before <!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_only_comment() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->"),
            "<!-- comment -->"
        );
    }

    #[test]
    fn test_fix_html_comment_arrow_not_comment() {
        assert_eq!(fix_html_comment_fences("-->some text"), "-->some text");
    }

    #[test]
    fn test_fix_html_comment_nested_opener() {
        assert_eq!(
            fix_html_comment_fences("<!-- <!-- -->Trailing"),
            "<!-- <!-- -->\nTrailing"
        );
    }

    #[test]
    fn test_fix_html_comment_unmatched_closer() {
        assert_eq!(
            fix_html_comment_fences("text --> more text"),
            "text --> more text"
        );
    }

    #[test]
    fn test_fix_html_comment_multiple_valid_invalid() {
        let input = "<!-- valid -->FixMe\ntext --> Ignore\n<!-- valid2 -->FixMe2";
        let expected = "<!-- valid -->\nFixMe\ntext --> Ignore\n<!-- valid2 -->\nFixMe2";
        assert_eq!(fix_html_comment_fences(input), expected);
    }

    #[test]
    fn test_fix_html_comment_crlf() {
        assert_eq!(
            fix_html_comment_fences("<!-- comment -->\r\nSome text"),
            "<!-- comment -->\r\nSome text"
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_single_line() {
        assert_eq!(
            fix_html_comment_fences("<!--- comment --->Trailing text"),
            "<!--- comment --->\nTrailing text"
        );
    }

    #[test]
    fn test_fix_html_comment_triple_hyphen_multiline() {
        assert_eq!(
            fix_html_comment_fences("<!---\ncomment\n--->Trailing text"),
            "<!---\ncomment\n--->\nTrailing text"
        );
    }

    // ---- normalize_fields ------------------------------------------------

    #[test]
    fn test_normalize_fields_other_field_chevrons_preserved() {
        let mut fields = IndexMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("<<hello>>")),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "<<hello>>");
    }

    #[test]
    fn test_normalize_fields_other_field_bidi_preserved() {
        let mut fields = IndexMap::new();
        fields.insert(
            "title".to_string(),
            QuillValue::from_json(serde_json::json!("a\u{202D}b")),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("title").unwrap().as_str().unwrap(), "a\u{202D}b");
    }

    #[test]
    fn test_normalize_fields_non_string_unchanged() {
        let mut fields = IndexMap::new();
        fields.insert(
            "count".to_string(),
            QuillValue::from_json(serde_json::json!(42)),
        );
        fields.insert(
            "enabled".to_string(),
            QuillValue::from_json(serde_json::json!(true)),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("count").unwrap().as_i64().unwrap(), 42);
        assert!(result.get("enabled").unwrap().as_bool().unwrap());
    }

    #[test]
    fn test_normalize_fields_key_nfc_composed() {
        // A decomposed key ("e" + combining acute) is stored under its
        // composed NFC form; the value itself is untouched.
        let mut fields = IndexMap::new();
        fields.insert(
            "cafe\u{0301}".to_string(),
            QuillValue::from_json(serde_json::json!("x")),
        );
        let result = normalize_fields(fields);
        assert_eq!(result.get("caf\u{00E9}").unwrap().as_str().unwrap(), "x");
        assert!(result.get("cafe\u{0301}").is_none());
    }

    // ---- normalize_document ----------------------------------------------

    #[test]
    fn test_normalize_document_basic() {
        let doc = Document::from_markdown(
            "---\nQUILL: test\ntitle: <<placeholder>>\n---\n\n<<content>> \u{202D}**bold**",
        )
        .unwrap();
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(
            normalized
                .main()
                .frontmatter()
                .get("title")
                .unwrap()
                .as_str()
                .unwrap(),
            "<<placeholder>>"
        );
        assert_eq!(normalized.main().body(), "\n<<content>> **bold**");
    }

    #[test]
    fn test_normalize_document_preserves_quill_tag() {
        let doc = Document::from_markdown("---\nQUILL: custom_quill\n---\n").unwrap();
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(normalized.quill_reference().name, "custom_quill");
    }

    #[test]
    fn test_normalize_document_idempotent() {
        let doc = Document::from_markdown("---\nQUILL: test\n---\n\n<<content>>").unwrap();
        let normalized_once = super::normalize_document(doc).unwrap();
        let normalized_twice = super::normalize_document(normalized_once.clone()).unwrap();
        assert_eq!(
            normalized_once.main().body(),
            normalized_twice.main().body()
        );
    }

    #[test]
    fn test_normalize_document_body_bidi_stripped() {
        let doc = Document::from_markdown("---\nQUILL: test\n---\n\nhello\u{202D}world").unwrap();
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(normalized.main().body(), "\nhelloworld");
    }

    #[test]
    fn test_normalize_document_yaml_field_bidi_preserved() {
        let doc = Document::from_markdown("---\nQUILL: test\ntitle: a\u{202D}b\n---\n").unwrap();
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(
            normalized
                .main()
                .frontmatter()
                .get("title")
                .unwrap()
                .as_str()
                .unwrap(),
            "a\u{202D}b"
        );
    }

    #[test]
    fn test_normalize_document_card_body_bidi_stripped() {
        let md = "---\nQUILL: test\n---\n\nbody\n\n---\nCARD: note\n---\ncard\u{202D}body\n";
        let doc = Document::from_markdown(md).unwrap();
        assert_eq!(doc.cards().len(), 1, "expected 1 card");
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(normalized.cards()[0].body(), "cardbody\n");
    }

    #[test]
    fn test_normalize_document_card_field_bidi_preserved() {
        let md = "---\nQUILL: test\n---\n\nbody\n\n---\nCARD: note\nname: Ali\u{202D}ce\n---\n";
        let doc = Document::from_markdown(md).unwrap();
        assert_eq!(doc.cards().len(), 1, "expected 1 card");
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(
            normalized.cards()[0]
                .frontmatter()
                .get("name")
                .unwrap()
                .as_str()
                .unwrap(),
            "Ali\u{202D}ce"
        );
    }

    #[test]
    fn test_normalize_document_card_body_html_comment_repair() {
        let md = "---\nQUILL: test\n---\n\n---\nCARD: note\n---\n<!-- comment -->Trailing text\n";
        let doc = Document::from_markdown(md).unwrap();
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(
            normalized.cards()[0].body(),
            "<!-- comment -->\nTrailing text\n"
        );
    }

    #[test]
    fn test_normalize_document_toplevel_body_html_comment_repair() {
        let md = "---\nQUILL: test\n---\n\n<!-- note -->Content here";
        let doc = Document::from_markdown(md).unwrap();
        let normalized = super::normalize_document(doc).unwrap();
        assert_eq!(normalized.main().body(), "\n<!-- note -->\nContent here");
    }
}