use regex::Regex;
pub fn normalize_markdown(content: &str) -> String {
let mut normalized = content.to_string();
normalized = standardize_bullets(&normalized);
normalized = normalize_spacing(&normalized);
normalized = normalize_headings(&normalized);
normalized = trim_line_whitespace(&normalized);
normalized = normalized.trim_end().to_string() + "\n";
normalized
}
fn standardize_bullets(content: &str) -> String {
let re = Regex::new(r"^(\s*)[\*\+]\s").unwrap();
content
.lines()
.map(|line| re.replace(line, "${1}- ").to_string())
.collect::<Vec<_>>()
.join("\n")
}
fn normalize_spacing(content: &str) -> String {
let re = Regex::new(r"\n{3,}").unwrap();
re.replace_all(content, "\n\n").to_string()
}
fn normalize_headings(content: &str) -> String {
let lines: Vec<&str> = content.lines().collect();
let mut result = Vec::new();
for (i, line) in lines.iter().enumerate() {
let is_heading = line.trim().starts_with('#');
if is_heading {
if i > 0 && !result.is_empty() {
let last = result.last().unwrap_or(&"");
if !last.is_empty() {
result.push("");
}
}
result.push(line);
if i < lines.len() - 1 {
let next = lines[i + 1];
if !next.trim().is_empty() && !next.trim().starts_with('#') {
result.push("");
}
}
} else {
result.push(line);
}
}
result.join("\n")
}
fn trim_line_whitespace(content: &str) -> String {
content.lines().map(|line| line.trim_end()).collect::<Vec<_>>().join("\n")
}
pub fn strip_bad_frontmatter(content: &str) -> String {
let trimmed = content.trim_start();
if !trimmed.starts_with("---\n") {
return content.to_string();
}
if let Some(end_pos) = trimmed[4..].find("\n---\n") {
let frontmatter = &trimmed[4..end_pos + 4];
let lines: Vec<&str> = frontmatter.lines().filter(|l| !l.trim().is_empty()).collect();
let valid_lines = lines.iter().filter(|line| line.contains(':')).count();
if !lines.is_empty() && valid_lines < lines.len() / 2 {
return trimmed[end_pos + 8..].to_string();
}
} else if let Some(end_pos) = trimmed[4..].find("\n---") {
let frontmatter = &trimmed[4..end_pos + 4];
let lines: Vec<&str> = frontmatter.lines().filter(|l| !l.trim().is_empty()).collect();
let valid_lines = lines.iter().filter(|line| line.contains(':')).count();
if !lines.is_empty() && valid_lines < lines.len() / 2 {
return trimmed[end_pos + 7..].to_string();
}
}
content.to_string()
}
pub fn strip_frontmatter(content: &str) -> String {
let trimmed = content.trim_start();
if !trimmed.starts_with("---\n") {
return content.to_string();
}
if let Some(end_pos) = trimmed[4..].find("\n---\n") {
return trimmed[end_pos + 8..].to_string();
} else if let Some(end_pos) = trimmed[4..].find("\n---") {
return trimmed[end_pos + 7..].to_string();
}
content.to_string()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_standardize_bullets() {
let input = "* item 1\n+ item 2\n- item 3";
let expected = "- item 1\n- item 2\n- item 3";
assert_eq!(standardize_bullets(input), expected);
}
#[test]
fn test_normalize_spacing() {
let input = "line 1\n\n\n\nline 2";
let expected = "line 1\n\nline 2";
assert_eq!(normalize_spacing(input), expected);
}
#[test]
fn test_trim_whitespace() {
let input = "line 1 \nline 2 ";
let expected = "line 1\nline 2";
assert_eq!(trim_line_whitespace(input), expected);
}
#[test]
fn test_strip_frontmatter() {
let input = "---\ntitle: Test\n---\n\n# Content";
let expected = "\n\n# Content";
assert_eq!(strip_frontmatter(input), expected);
}
#[test]
fn test_normalize_markdown_complete() {
let input = "* bullet 1\n+ bullet 2\n\n\n\n# Heading\nText immediately after";
let result = normalize_markdown(input);
assert!(result.contains("- bullet 1"));
assert!(result.contains("- bullet 2"));
assert!(!result.contains("\n\n\n"));
assert!(result.ends_with('\n'));
assert!(!result.ends_with("\n\n"));
}
#[test]
fn test_normalize_markdown_empty() {
let result = normalize_markdown("");
assert_eq!(result, "\n");
}
#[test]
fn test_normalize_markdown_single_line() {
let result = normalize_markdown("Hello world");
assert_eq!(result, "Hello world\n");
}
#[test]
fn test_normalize_headings_basic() {
let input = "Text\n# Heading\nMore text";
let result = normalize_headings(input);
assert!(result.contains("\n\n# Heading\n\nMore text"));
}
#[test]
fn test_normalize_headings_first_line() {
let input = "# First Heading\nText";
let result = normalize_headings(input);
assert!(result.starts_with("# First Heading\n\nText"));
}
#[test]
fn test_normalize_headings_multiple_levels() {
let input = "# H1\nText\n## H2\nMore text";
let result = normalize_headings(input);
assert!(result.contains("# H1\n\nText"));
assert!(result.contains("\n\n## H2\n\nMore text"));
}
#[test]
fn test_normalize_headings_consecutive() {
let input = "# H1\n## H2";
let result = normalize_headings(input);
assert!(result.contains("# H1"));
assert!(result.contains("## H2"));
}
#[test]
fn test_standardize_bullets_nested() {
let input = "* top level\n * nested\n + deeply nested";
let result = standardize_bullets(input);
assert_eq!(result, "- top level\n - nested\n - deeply nested");
}
#[test]
fn test_standardize_bullets_mixed_indentation() {
let input = " * indented\n* not indented";
let result = standardize_bullets(input);
assert_eq!(result, " - indented\n- not indented");
}
#[test]
fn test_standardize_bullets_no_bullets() {
let input = "Regular text\nNo bullets here";
let result = standardize_bullets(input);
assert_eq!(result, input);
}
#[test]
fn test_normalize_spacing_many_newlines() {
let input = "a\n\n\n\n\n\n\nb";
let result = normalize_spacing(input);
assert_eq!(result, "a\n\nb");
}
#[test]
fn test_normalize_spacing_exactly_two() {
let input = "a\n\nb";
let result = normalize_spacing(input);
assert_eq!(result, "a\n\nb");
}
#[test]
fn test_normalize_spacing_single_newline() {
let input = "a\nb";
let result = normalize_spacing(input);
assert_eq!(result, "a\nb");
}
#[test]
fn test_trim_line_whitespace_tabs() {
let input = "line 1\t\t\nline 2\t";
let result = trim_line_whitespace(input);
assert_eq!(result, "line 1\nline 2");
}
#[test]
fn test_trim_line_whitespace_mixed() {
let input = "line 1 \t \nline 2";
let result = trim_line_whitespace(input);
assert_eq!(result, "line 1\nline 2");
}
#[test]
fn test_strip_frontmatter_no_frontmatter() {
let input = "# Just content\n\nNo frontmatter here";
assert_eq!(strip_frontmatter(input), input);
}
#[test]
fn test_strip_frontmatter_incomplete() {
let input = "---\ntitle: Test\n\nNo closing marker";
assert_eq!(strip_frontmatter(input), input);
}
#[test]
fn test_strip_frontmatter_empty() {
let input = "---\n---\n\nContent";
let result = strip_frontmatter(input);
assert_eq!(result, input);
}
#[test]
fn test_strip_bad_frontmatter_malformed() {
let input = "---\nthis is not yaml\njust random text\nno colons\n---\n\nContent";
let result = strip_bad_frontmatter(input);
assert_eq!(result, "\n\nContent");
}
#[test]
fn test_strip_bad_frontmatter_valid() {
let input = "---\ntitle: Test\nauthor: Someone\n---\n\nContent";
let result = strip_bad_frontmatter(input);
assert_eq!(result, input);
}
#[test]
fn test_strip_bad_frontmatter_no_frontmatter() {
let input = "# Just content";
assert_eq!(strip_bad_frontmatter(input), input);
}
#[test]
fn test_strip_bad_frontmatter_partially_valid() {
let input = "---\ntitle: Test\nsome invalid line\nauthor: Someone\n---\n\nContent";
let result = strip_bad_frontmatter(input);
assert_eq!(result, input);
}
#[test]
fn test_normalize_markdown_complex_document() {
let input = "---\ntitle: Test\n---\n\n\n\n* bullet\n+ another\n# Heading \nText\n\n\n\n## Subheading\nMore text \n\n\n";
let result = normalize_markdown(input);
assert!(result.contains("- bullet"));
assert!(result.contains("- another"));
assert!(!result.contains("\n\n\n"));
assert!(!result.contains(" \n"));
assert!(result.ends_with('\n'));
assert!(!result.ends_with("\n\n"));
}
#[test]
fn test_normalize_markdown_preserves_code_blocks() {
let input = "```\ncode here\n```\n\nText";
let result = normalize_markdown(input);
assert!(result.contains("```"));
}
}
#[cfg(test)]
mod property_tests {
use super::*;
use proptest::prelude::*;
proptest! {
#[test]
fn normalize_is_idempotent(s in "\\PC*") {
let once = normalize_markdown(&s);
let twice = normalize_markdown(&once);
prop_assert_eq!(once, twice);
}
#[test]
fn normalize_always_ends_with_single_newline(s in "\\PC*") {
let result = normalize_markdown(&s);
prop_assert!(result.ends_with('\n'));
prop_assert!(!result.ends_with("\n\n") || result == "\n");
}
#[test]
fn normalize_never_more_than_two_newlines(s in "\\PC*") {
let result = normalize_markdown(&s);
prop_assert!(!result.contains("\n\n\n"));
}
#[test]
fn standardize_bullets_only_uses_hyphens(s in "[*+\\- ][a-z \\n]+") {
let result = standardize_bullets(&s);
let has_asterisk = result.lines().any(|l| l.trim_start().starts_with("* "));
let has_plus = result.lines().any(|l| l.trim_start().starts_with("+ "));
prop_assert!(!has_asterisk);
prop_assert!(!has_plus);
}
#[test]
fn trim_whitespace_no_trailing_spaces(s in "\\PC*") {
let result = trim_line_whitespace(&s);
for line in result.lines() {
prop_assert!(!line.ends_with(' '));
prop_assert!(!line.ends_with('\t'));
}
}
#[test]
fn strip_frontmatter_idempotent(s in "\\PC*") {
let once = strip_frontmatter(&s);
let twice = strip_frontmatter(&once);
prop_assert_eq!(once, twice);
}
#[test]
fn strip_bad_frontmatter_never_adds_content(s in "\\PC*") {
let result = strip_bad_frontmatter(&s);
prop_assert!(result.len() <= s.len());
}
}
}