use once_cell::sync::Lazy;
use regex::Regex;
pub fn clean_markdown(input: &str) -> String {
let s = normalise_line_endings(input); let s = strip_markdown_fences(&s);
let s = trim_trailing_whitespace(&s);
let s = collapse_blank_lines(&s);
let s = normalise_heading_spacing(&s);
let s = fix_broken_tables(&s);
let s = remove_mid_table_separators(&s);
let s = remove_hallucinated_images(&s);
let s = remove_invisible_chars(&s);
ensure_final_newline(&s)
}
/// Removes a code fence that wraps the *whole* document.
///
/// The input is trimmed first. A leading fence line (three backticks followed
/// by an optional info string that contains no backtick) is dropped. A trailing
/// bare fence line is dropped only when it is unmatched, i.e. when the lines
/// before it contain an even number of fence lines. A trailing fence that
/// closes a genuine code block at the end of the document is therefore kept,
/// so the document is never left with an unterminated fence.
///
/// Returns the remaining lines joined with `\n` (no trailing newline).
fn strip_markdown_fences(input: &str) -> String {
    let trimmed = input.trim();
    let lines: Vec<&str> = trimmed.lines().collect();
    let mut body: &[&str] = &lines;

    // Opening wrapper, e.g. "```" or "```markdown".
    if let Some((first, rest)) = body.split_first() {
        if is_fence_line(first) {
            body = rest;
        }
    }

    // Closing wrapper: only a bare fence, and only when nothing before it is
    // still waiting to be closed.
    if let Some((last, rest)) = body.split_last() {
        if last.trim() == "```" {
            let fences_before = rest.iter().filter(|&&line| is_fence_line(line)).count();
            if fences_before % 2 == 0 {
                body = rest;
            }
        }
    }

    body.join("\n")
}

/// Returns `true` when `line` is a backtick fence: three backticks followed by
/// an info string without any backtick (which rules out inline code such as
/// "```code```").
fn is_fence_line(line: &str) -> bool {
    line.trim()
        .strip_prefix("```")
        .is_some_and(|info| !info.contains('`'))
}
/// Converts Windows (`\r\n`) and bare carriage-return (`\r`) line endings to
/// a single `\n`.
fn normalise_line_endings(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\r' {
            out.push(ch);
            continue;
        }
        // A `\r\n` pair counts as one line break: swallow the `\n`.
        if chars.peek() == Some(&'\n') {
            chars.next();
        }
        out.push('\n');
    }

    out
}
/// Strips trailing whitespace from every line.
///
/// The text is split with `str::lines` and re-joined with `\n`, so a final
/// line terminator in the input is not carried over to the output.
fn trim_trailing_whitespace(input: &str) -> String {
    let mut out = String::with_capacity(input.len());

    for (idx, line) in input.lines().enumerate() {
        if idx > 0 {
            out.push('\n');
        }
        out.push_str(line.trim_end());
    }

    out
}
/// Matches any run of four or more consecutive `\n` characters.
static RE_BLANK_LINES: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n{4,}").unwrap());
/// Shrinks every run of four or more newlines down to exactly three, so at
/// most two consecutive blank lines remain.
fn collapse_blank_lines(input: &str) -> String {
    let collapsed = RE_BLANK_LINES.replace_all(input, "\n\n\n");
    collapsed.into_owned()
}
/// Returns the input with all trailing whitespace replaced by a single `\n`.
///
/// Empty or whitespace-only input yields just `"\n"`.
fn ensure_final_newline(input: &str) -> String {
    let body = input.trim_end();
    let mut out = String::with_capacity(body.len() + 1);
    out.push_str(body);
    out.push('\n');
    out
}
/// Ensures every ATX heading is preceded by exactly one blank line.
///
/// Any newlines already accumulated before a heading are collapsed and
/// replaced with a single blank line. A heading that has no content before it
/// is emitted without leading blank lines. Lines inside fenced code blocks
/// (delimited by lines starting with three backticks or three tildes) are
/// copied through untouched, so `# comment` lines in code are not treated as
/// headings.
///
/// Every input line is written back followed by `\n`, so non-empty output
/// always ends with a newline.
fn normalise_heading_spacing(input: &str) -> String {
    let mut result = String::with_capacity(input.len() + 64);
    let mut in_fence = false;

    for line in input.lines() {
        if !in_fence && is_atx_heading(line) {
            // Drop whatever blank lines precede the heading...
            let keep = result.trim_end_matches('\n').len();
            result.truncate(keep);
            // ...and separate it from earlier content with one blank line.
            if !result.is_empty() {
                result.push_str("\n\n");
            }
        }

        let marker = line.trim_start();
        if marker.starts_with("```") || marker.starts_with("~~~") {
            in_fence = !in_fence;
        }

        result.push_str(line);
        result.push('\n');
    }

    result
}

/// Returns `true` for an ATX heading starting at column 0: one to six `#`
/// characters followed by a space, a tab, or the end of the line.
fn is_atx_heading(line: &str) -> bool {
    let rest = line.trim_start_matches('#');
    let level = line.len() - rest.len();
    (1..=6).contains(&level) && (rest.is_empty() || rest.starts_with([' ', '\t']))
}
/// Inserts a missing header separator row into pipe tables.
///
/// A table is a run of consecutive lines accepted by `is_table_row`. When the
/// first row of such a run is not itself a separator and is directly followed
/// by another non-separator row, a `| --- | --- |` style separator is inserted
/// between them. Only the header row is considered; later data rows never get
/// a separator inserted after them.
///
/// The separator's column count is the number of `|` characters in the header
/// minus one (at least one column). Lines are re-joined with `\n`, so a final
/// line terminator in the input is not preserved.
fn fix_broken_tables(input: &str) -> String {
    let mut out = String::with_capacity(input.len() + 64);
    let mut lines = input.lines().peekable();
    let mut prev_was_table_row = false;
    let mut first_line = true;

    while let Some(line) = lines.next() {
        if !first_line {
            out.push('\n');
        }
        first_line = false;
        out.push_str(line);

        let is_row = is_table_row(line);
        // Only the first row of a table can be a header missing its separator.
        let is_header = is_row && !prev_was_table_row && !is_separator_row(line);
        if is_header {
            let next_is_data = lines
                .peek()
                .is_some_and(|&next| is_table_row(next) && !is_separator_row(next));
            if next_is_data {
                let col_count = line.matches('|').count().saturating_sub(1).max(1);
                out.push_str("\n|");
                for _ in 0..col_count {
                    out.push_str(" --- |");
                }
            }
        }
        prev_was_table_row = is_row;
    }

    out
}
/// Reports whether a line looks like a pipe-table row: after trimming it must
/// be longer than two bytes and both start and end with `|`.
fn is_table_row(line: &str) -> bool {
    let row = line.trim();
    if row.len() <= 2 {
        return false;
    }
    row.starts_with('|') && row.ends_with('|')
}
/// Reports whether a line is a table delimiter row such as `| --- | :-: |`.
///
/// After trimming, the line must start with `|`, consist solely of `|`, `-`,
/// `:` and spaces, and contain at least one `-`. The dash requirement keeps
/// rows made only of pipes and spaces (for example an empty header row
/// `|   |   |`) from being mistaken for a separator.
fn is_separator_row(line: &str) -> bool {
    let trimmed = line.trim();
    trimmed.starts_with('|')
        && trimmed.contains('-')
        && trimmed.chars().all(|c| matches!(c, '|' | '-' | ':' | ' '))
}
/// Deletes a fixed set of invisible code points from the text: U+200B
/// (zero width space), U+FEFF (byte order mark), U+00AD (soft hyphen),
/// U+200C (zero width non-joiner), U+200D (zero width joiner) and U+2060
/// (word joiner).
fn remove_invisible_chars(input: &str) -> String {
    input
        .chars()
        .filter(|&ch| {
            !matches!(
                ch,
                '\u{200B}' | '\u{FEFF}' | '\u{00AD}' | '\u{200C}' | '\u{200D}' | '\u{2060}'
            )
        })
        .collect()
}
/// Matches inline Markdown images of the form `![alt](target)`. Capture group
/// 1 is the alt text; capture group 2 is everything between the parentheses.
static RE_IMAGE: Lazy<Regex> = Lazy::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]*)\)").unwrap());
/// Decides whether an image target should be treated as a placeholder.
///
/// Returns `true` when the trimmed target is empty, does not start with
/// `http://` or `https://`, or when its host is one of the known placeholder
/// domains (or a sub-domain of one). The comparison is made against the parsed
/// host only, case-insensitively, so a fake domain name that merely appears in
/// the path, in the query, or as part of a longer host name (for example
/// `notexample.com`) does not cause a false positive.
fn is_placeholder_url(url: &str) -> bool {
    const FAKE_DOMAINS: [&str; 7] = [
        "example.com",
        "placeholder.com",
        "via.placeholder.com",
        "dummyimage.com",
        "lorempixel.com",
        "picsum.photos",
        "placehold.it",
    ];

    let u = url.trim();
    // Empty targets and anything that is not an absolute http(s) URL count as
    // placeholders.
    let Some(rest) = u
        .strip_prefix("http://")
        .or_else(|| u.strip_prefix("https://"))
    else {
        return true;
    };

    // The authority ends at the first path/query/fragment delimiter, or at
    // whitespace (which would introduce an optional Markdown image title).
    let authority = rest
        .split(|c: char| matches!(c, '/' | '?' | '#') || c.is_whitespace())
        .next()
        .unwrap_or("");
    // Drop optional `user:pass@` credentials and a `:port` suffix.
    let host_port = authority.rsplit('@').next().unwrap_or(authority);
    let host = host_port
        .split(':')
        .next()
        .unwrap_or(host_port)
        .to_ascii_lowercase();

    FAKE_DOMAINS.iter().any(|&domain| {
        host == domain
            || host
                .strip_suffix(domain)
                .is_some_and(|prefix| prefix.ends_with('.'))
    })
}
fn remove_hallucinated_images(input: &str) -> String {
RE_IMAGE
.replace_all(input, |caps: ®ex::Captures<'_>| {
let alt = caps[1].trim();
let url = &caps[2];
if is_placeholder_url(url) {
if alt.is_empty() {
String::new()
} else {
format!("*{}*", alt)
}
} else {
caps[0].to_string()
}
})
.to_string()
}
/// Drops separator rows that appear anywhere other than the second line of a
/// table.
///
/// A table is a run of consecutive lines accepted by `is_table_row`; the
/// position counter restarts whenever a non-table line is seen. Removed
/// separators still count towards the position of the rows that follow them.
/// Lines are re-joined with `\n`.
fn remove_mid_table_separators(input: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    // 1-based position of the current line within its table; 0 outside a table.
    let mut row_in_table = 0usize;

    for line in input.lines() {
        if !is_table_row(line) {
            row_in_table = 0;
            kept.push(line);
            continue;
        }

        row_in_table += 1;
        let misplaced_separator = row_in_table != 2 && is_separator_row(line);
        if !misplaced_separator {
            kept.push(line);
        }
    }

    kept.join("\n")
}
// Unit tests for the individual clean-up passes and for the full pipeline.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_strip_fences() {
        let input = "```markdown\n# Hello\nWorld\n```";
        assert_eq!(strip_markdown_fences(input), "# Hello\nWorld");
    }

    #[test]
    fn test_strip_fences_no_lang() {
        let input = "```\n# Hello\nWorld\n```";
        assert_eq!(strip_markdown_fences(input), "# Hello\nWorld");
    }

    #[test]
    fn test_no_fences_passthrough() {
        let input = "# Hello\nWorld";
        assert_eq!(strip_markdown_fences(input), "# Hello\nWorld");
    }

    #[test]
    fn test_strip_fences_crlf_local_model_output() {
        let input = "```markdown\r\n# Tax Form 1040\r\nIncome: $50,000\r\n```";
        let result = clean_markdown(input);
        assert!(
            !result.starts_with("```"),
            "CRLF fences must be stripped; got: {:?}",
            result
        );
        assert!(result.contains("Tax Form"), "Content must be preserved");
    }

    #[test]
    fn test_normalise_line_endings() {
        assert_eq!(normalise_line_endings("a\r\nb\rc"), "a\nb\nc");
    }

    #[test]
    fn test_trim_trailing_whitespace() {
        assert_eq!(
            trim_trailing_whitespace(" hello \nworld "),
            " hello\nworld"
        );
    }

    #[test]
    fn test_collapse_blank_lines() {
        let input = "a\n\n\n\n\n\nb";
        assert_eq!(collapse_blank_lines(input), "a\n\n\nb");
    }

    #[test]
    fn test_ensure_final_newline() {
        assert_eq!(ensure_final_newline("hello"), "hello\n");
        assert_eq!(ensure_final_newline("hello\n\n\n"), "hello\n");
        assert_eq!(ensure_final_newline(""), "\n");
    }

    #[test]
    fn test_heading_spacing() {
        let input = "some text\n# Heading\nmore text";
        let result = normalise_heading_spacing(input);
        assert!(result.contains("\n\n# Heading\n"));
    }

    #[test]
    fn test_fix_broken_table() {
        let input = "| A | B |\n| 1 | 2 |";
        let result = fix_broken_tables(input);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(lines.len(), 3);
        assert!(is_separator_row(lines[1]));
    }

    #[test]
    fn test_table_with_separator_unchanged() {
        let input = "| A | B |\n| --- | --- |\n| 1 | 2 |";
        let result = fix_broken_tables(input);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(lines.len(), 3);
    }

    #[test]
    fn test_remove_invisible() {
        let input = "hello\u{200B}world\u{FEFF}foo\u{00AD}bar";
        assert_eq!(remove_invisible_chars(input), "helloworldfoobar");
    }

    #[test]
    fn test_remove_hallucinated_image_placeholder_url() {
        // The image target is a local path, which counts as a placeholder.
        let input = "Some text\n![Chart Title](images/chart.png)\nMore text";
        let result = remove_hallucinated_images(input);
        assert!(
            !result.contains("!["),
            "Should remove image with local path"
        );
        assert!(
            result.contains("*Chart Title*"),
            "Should keep alt text as italic"
        );
    }

    #[test]
    fn test_remove_hallucinated_image_fake_url() {
        // `example.com` is one of the known placeholder domains.
        let input = "![Diagram](https://example.com/image.png)";
        let result = remove_hallucinated_images(input);
        assert!(!result.contains("!["));
        assert!(result.contains("*Diagram*"));
    }

    #[test]
    fn test_keep_real_image_link() {
        let input = "![Figure](https://cdn.real-site.org/figure.png)";
        let result = remove_hallucinated_images(input);
        assert!(result.contains("![Figure]"), "Should keep real image link");
    }

    #[test]
    fn test_remove_mid_table_separator() {
        let input = "| A | B |\n| --- | --- |\n| 1 | 2 |\n| --- | --- |\n| 3 | 4 |";
        let result = remove_mid_table_separators(input);
        let sep_count = result.lines().filter(|l| is_separator_row(l)).count();
        assert_eq!(sep_count, 1, "Only one separator should remain");
        assert!(result.contains("| 3 | 4 |"), "Data rows should remain");
    }

    #[test]
    fn test_keep_only_header_separator() {
        let input = "| H1 | H2 |\n| --- | --- |\n| a | b |\n| c | d |";
        let result = remove_mid_table_separators(input);
        assert_eq!(result, input, "Normal table should be unchanged");
    }

    #[test]
    fn test_clean_markdown_full_pipeline() {
        let input = "```markdown\n# Title\r\n\r\nSome text \n\n\n\n\n\n## Section\n\n| A | B |\n| 1 | 2 |\n```";
        let result = clean_markdown(input);
        assert!(result.starts_with("# Title"));
        assert!(result.ends_with('\n'));
        assert!(!result.contains("\n\n\n\n"));
    }
}