use regex::Regex;
use serde::{Deserialize, Serialize};
/// A single heading entry of [`WebContent`]: a numeric level plus the heading text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading depth (presumably 1-6 as in HTML `h1`-`h6`; confirm against the producer).
    pub level: u8,
    /// Heading text; `verify_markdown_content` looks for its normalized form in the Markdown.
    pub text: String,
}
/// Content items that `verify_markdown_content` expects to find in a Markdown document.
///
/// Any field absent from the serialized input deserializes to its empty value (`None` or an
/// empty `Vec`), which is what the derived `Default` produces for every field.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct WebContent {
    /// Document title; checked only when present.
    pub title: Option<String>,
    /// Headings; every entry's text is checked.
    pub headings: Vec<Heading>,
    /// Paragraph texts; the verifier samples the first five and the last five.
    pub paragraphs: Vec<String>,
    /// Code block contents; every entry is checked line by line.
    pub code_blocks: Vec<String>,
    /// Formula sources. Not consulted by `verify_markdown_content`.
    pub formulas: Vec<String>,
    /// Formula sources that are searched for in Markdown blockquote (`>`) lines.
    pub blockquote_formulas: Vec<String>,
    /// List item texts; the verifier checks the first ten.
    pub list_items: Vec<String>,
    /// Figure identifiers (presumably figure numbers; confirm against the producer).
    /// Not consulted by `verify_markdown_content`.
    pub figures: Vec<u32>,
}
/// Per-category report of the items `verify_markdown_content` could not find.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MissingContent {
    /// `true` when a title was supplied but not found.
    pub title: bool,
    /// Original text of every heading that was not found.
    pub headings: Vec<String>,
    /// Excerpts (at most 100 bytes, followed by `...`) of sampled paragraphs that failed.
    pub paragraphs: Vec<String>,
    /// Excerpts (at most 100 bytes, followed by `...`) of code blocks that failed.
    pub code_blocks: Vec<String>,
    /// Never filled in by `verify_markdown_content`; it does not check plain formulas.
    pub formulas: Vec<String>,
    /// Blockquote formulas that were not found, cut to at most 100 bytes.
    pub blockquote_formulas: Vec<String>,
    /// Excerpts (at most 100 bytes, followed by `...`) of list items that failed.
    pub list_items: Vec<String>,
    /// Number of expected figures that were not found (expected minus found).
    pub images: u32,
}
/// Switches that control `verify_markdown_content`.
#[derive(Debug, Clone, Default)]
pub struct VerifyOptions {
    /// Verbosity flag. Not read by `verify_markdown_content` itself.
    pub verbose: bool,
    /// Minimum number of figure images the Markdown must reference, if known.
    pub expected_figures: Option<u32>,
    /// Enables the figure check; it runs only when this is `true` and
    /// `expected_figures` is `Some`.
    pub has_local_images: bool,
}
/// Outcome of `verify_markdown_content`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerifyResult {
    /// Number of individual checks that were run.
    pub total_checks: u32,
    /// Number of those checks that passed.
    pub passed_checks: u32,
    /// `passed_checks / total_checks`, or `0.0` when no check was run.
    pub pass_rate: f64,
    /// `true` when any category in `missing` is non-empty / set.
    pub has_missing_content: bool,
    /// Details of what was not found.
    pub missing: MissingContent,
    /// `true` when nothing is missing or `pass_rate` is at least `0.85`.
    pub success: bool,
}
#[must_use]
pub fn normalize_text(text: &str) -> String {
let mut result = text.trim().to_string();
if let Ok(re) = Regex::new(r"\s+") {
result = re.replace_all(&result, " ").to_string();
}
result = result.replace('\u{00A0}', " ");
result = result.replace('\u{2018}', "'");
result = result.replace('\u{2019}', "'");
result = result.replace('\u{201C}', "\"");
result = result.replace('\u{201D}', "\"");
result = result.replace('\u{00D7}', "x");
result = result.replace('\u{2192}', "->");
result = result.replace('\u{21A6}', "->");
result = result.replace('\u{2212}', "-");
result = result.replace("$$", "");
result = result.replace('$', "");
result = result.replace("\\times", "x");
result = result.replace("\\to", "->");
if let Ok(re) = Regex::new(r"\\displaystyle\s*") {
result = re.replace_all(&result, "").to_string();
}
if let Ok(re) = Regex::new(r"\\text\{([^}]*)\}") {
result = re.replace_all(&result, "$1").to_string();
}
result = result.replace("\\\\%", "%");
result = result.replace("\\%", "%");
result = result.replace("\\subseteq", "\u{2286}");
result = result.replace("\\in", "\u{2208}");
result = result.replace("\\emptyset", "\u{2205}");
result = result.replace("^2", "\u{00B2}");
result = result.replace("^n", "\u{207F}");
if let Ok(re) = Regex::new(r"(?i)\\mathbb\{n\}_0") {
result = re.replace_all(&result, "\u{2115}\u{2080}").to_string();
}
result.to_lowercase()
}
/// Canonicalizes code text for comparison.
///
/// Trims the input, collapses every run of Unicode whitespace (including U+00A0) to a
/// single space, replaces `×` and `\times` with `x`, removes every `$`, and lowercases
/// the result.
///
/// Whitespace is collapsed with `split_whitespace` rather than a regular expression, so
/// nothing is compiled per call; this function runs once per code line during verification.
#[must_use]
pub fn normalize_code(text: &str) -> String {
    text.split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .replace('\u{00D7}', "x")
        // Removing every `$` also covers the `$$` display-math delimiter.
        .replace('$', "")
        .replace("\\times", "x")
        .to_lowercase()
}
/// Checks how much of `web_content` can be found in `markdown_text`.
///
/// Every individual check increments `total_checks`; every success increments
/// `passed_checks`. The checks are:
///
/// * **Title** (when `Some`) and every **heading**: the text normalized with
///   [`normalize_text`] must be a substring of the normalized Markdown.
/// * **Paragraphs**: the first five and the last five are sampled, each paragraph at most
///   once. A paragraph passes when at least 60% of its normalized words longer than two
///   bytes occur in the Markdown, or when its normalized text is longer than 20 bytes and
///   its first 50 bytes occur verbatim.
/// * **Code blocks**: a block passes when at least 60% of its significant lines (longer
///   than three bytes and not made only of `{}[](),;`) occur in the Markdown after
///   [`normalize_code`], or when the whole normalized block occurs. A block without
///   significant lines passes.
/// * **List items**: the first ten, matched like paragraphs with limits 15 and 40.
/// * **Blockquote formulas**: searched in Markdown lines that start with `>` and contain
///   `$`; see `blockquote_contains_formula` for the matching rule.
/// * **Figures**: only when `options.has_local_images` is set and
///   `options.expected_figures` is `Some`; the number of figure image references must reach
///   the expected count.
///
/// `web_content.formulas`, `web_content.figures` and `options.verbose` are not consulted,
/// so `missing.formulas` is always empty.
///
/// The result is successful when nothing is missing or the pass rate is at least `0.85`.
/// With no checks at all the pass rate is `0.0`.
///
/// # Panics
///
/// Panics only if one of the constant regular expressions fails to compile.
#[must_use]
// The body is one linear pass over independent content categories.
#[allow(clippy::too_many_lines)]
pub fn verify_markdown_content(
    web_content: &WebContent,
    markdown_text: &str,
    options: &VerifyOptions,
) -> VerifyResult {
    let normalized_markdown = normalize_text(markdown_text);
    let mut missing = MissingContent::default();
    let mut total_checks: u32 = 0;
    let mut passed_checks: u32 = 0;

    if let Some(title) = web_content.title.as_deref() {
        total_checks += 1;
        if normalized_markdown.contains(&normalize_text(title)) {
            passed_checks += 1;
        } else {
            missing.title = true;
        }
    }

    for heading in &web_content.headings {
        total_checks += 1;
        if normalized_markdown.contains(&normalize_text(&heading.text)) {
            passed_checks += 1;
        } else {
            missing.headings.push(heading.text.clone());
        }
    }

    // Sample the first five and the last five paragraphs. The tail never starts before
    // index 5, so with fewer than ten paragraphs none is checked (and counted) twice.
    let paragraphs = &web_content.paragraphs;
    let tail_start = paragraphs.len().saturating_sub(5).max(5);
    let sampled_paragraphs = paragraphs
        .iter()
        .take(5)
        .chain(paragraphs.iter().skip(tail_start));
    for paragraph in sampled_paragraphs {
        total_checks += 1;
        let normalized = normalize_text(paragraph);
        if fuzzy_text_match(&normalized_markdown, &normalized, 20, 50) {
            passed_checks += 1;
        } else {
            missing
                .paragraphs
                .push(format!("{}...", prefix_on_char_boundary(paragraph, 100)));
        }
    }

    let normalized_markdown_for_code = normalize_code(markdown_text);
    for code in &web_content.code_blocks {
        total_checks += 1;
        let significant_lines: Vec<&str> = code
            .lines()
            .map(str::trim)
            .filter(|line| is_significant_code_line(line))
            .collect();
        let matching_lines = significant_lines
            .iter()
            .filter(|line| normalized_markdown_for_code.contains(&normalize_code(line)))
            .count();
        // A block with no significant lines has nothing to miss.
        let enough_lines_match =
            significant_lines.is_empty() || ratio_at_least_60_percent(matching_lines, significant_lines.len());
        if enough_lines_match || normalized_markdown_for_code.contains(&normalize_code(code)) {
            passed_checks += 1;
        } else {
            missing
                .code_blocks
                .push(format!("{}...", prefix_on_char_boundary(code, 100)));
        }
    }

    for item in web_content.list_items.iter().take(10) {
        total_checks += 1;
        let normalized = normalize_text(item);
        if fuzzy_text_match(&normalized_markdown, &normalized, 15, 40) {
            passed_checks += 1;
        } else {
            missing
                .list_items
                .push(format!("{}...", prefix_on_char_boundary(item, 100)));
        }
    }

    if !web_content.blockquote_formulas.is_empty() {
        let blockquote_re =
            Regex::new(r"(?m)^>.*$").expect("constant blockquote pattern is valid");
        // Collected once for all formulas: each math blockquote line with its lowercase form.
        let math_blockquote_lines: Vec<(&str, String)> = blockquote_re
            .find_iter(markdown_text)
            .map(|m| m.as_str())
            .filter(|line| line.contains('$'))
            .map(|line| (line, line.to_lowercase()))
            .collect();

        for formula in &web_content.blockquote_formulas {
            total_checks += 1;
            if blockquote_contains_formula(&math_blockquote_lines, formula) {
                passed_checks += 1;
            } else {
                missing
                    .blockquote_formulas
                    .push(prefix_on_char_boundary(formula, 100).to_string());
            }
        }
    }

    if options.has_local_images {
        if let Some(expected) = options.expected_figures {
            total_checks += 1;
            let figure_re = Regex::new(
                r"(?i)!\[(?:\*\*)?(?:Figure|Рис\.?|Рисунок)\s*\d+[\s\S]*?\]\(images/figure-\d+\.(png|jpg)\)",
            )
            .expect("constant figure pattern is valid");
            // Saturate instead of truncating if the count ever exceeded `u32::MAX`.
            let figure_count =
                u32::try_from(figure_re.find_iter(markdown_text).count()).unwrap_or(u32::MAX);
            if figure_count >= expected {
                passed_checks += 1;
            } else {
                missing.images = expected - figure_count;
            }
        }
    }

    let pass_rate = if total_checks > 0 {
        f64::from(passed_checks) / f64::from(total_checks)
    } else {
        0.0
    };
    let has_missing_content = missing.title
        || missing.images > 0
        || !missing.headings.is_empty()
        || !missing.paragraphs.is_empty()
        || !missing.code_blocks.is_empty()
        || !missing.formulas.is_empty()
        || !missing.blockquote_formulas.is_empty()
        || !missing.list_items.is_empty();

    VerifyResult {
        total_checks,
        passed_checks,
        pass_rate,
        has_missing_content,
        success: !has_missing_content || pass_rate >= 0.85,
        missing,
    }
}

/// Returns the longest prefix of `text` that is at most `max_bytes` long and ends on a
/// UTF-8 character boundary, so slicing never panics on multi-byte characters.
fn prefix_on_char_boundary(text: &str, max_bytes: usize) -> &str {
    if text.len() <= max_bytes {
        return text;
    }
    let mut end = max_bytes;
    // Index 0 is always a boundary, so this loop terminates.
    while !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[..end]
}

/// Returns `true` when `matching / total` is at least 0.6.
// Counts of words or lines are far below 2^52, so the conversion to `f64` is exact.
#[allow(clippy::cast_precision_loss)]
fn ratio_at_least_60_percent(matching: usize, total: usize) -> bool {
    matching as f64 / total as f64 >= 0.6
}

/// Returns `true` when the normalized `needle` is approximately present in `haystack`:
/// at least 60% of its words longer than two bytes occur in `haystack`, or `needle` is
/// longer than `min_len` bytes and its first `prefix_len` bytes occur verbatim.
fn fuzzy_text_match(haystack: &str, needle: &str, min_len: usize, prefix_len: usize) -> bool {
    let words: Vec<&str> = needle.split(' ').filter(|word| word.len() > 2).collect();
    let matching_words = words.iter().filter(|word| haystack.contains(**word)).count();
    // Text without any word longer than two bytes can only pass via the prefix rule.
    let enough_words_match =
        !words.is_empty() && ratio_at_least_60_percent(matching_words, words.len());

    enough_words_match
        || (needle.len() > min_len
            && haystack.contains(prefix_on_char_boundary(needle, prefix_len)))
}

/// Returns `true` for a trimmed code line that is worth matching: longer than three bytes
/// and not consisting solely of the punctuation characters `{}[](),;`.
fn is_significant_code_line(line: &str) -> bool {
    let punctuation_only = line
        .chars()
        .all(|c| matches!(c, '{' | '}' | '[' | ']' | '(' | ')' | ',' | ';'));
    line.len() > 3 && !punctuation_only
}

/// Returns `true` when `formula` is found in one of the math blockquote lines.
///
/// Each entry of `lines` is a blockquote line containing `$` together with its lowercase
/// form. A line matches when it contains (case-insensitively) at least
/// `min(2, number of key parts)` key parts of the formula, or when it contains the
/// whitespace-collapsed formula, the formula verbatim, or (for formulas shorter than 20
/// bytes) the formula with all spaces removed. Key parts are the whitespace-separated
/// pieces longer than one byte that remain after dropping `\mathbf{`, `\textbf{`, braces
/// and backslashes.
fn blockquote_contains_formula(lines: &[(&str, String)], formula: &str) -> bool {
    let collapsed = formula.split_whitespace().collect::<Vec<_>>().join(" ");
    let cleaned = collapsed
        .replace("\\mathbf{", "")
        .replace("\\textbf{", "")
        .replace(['{', '}', '\\'], "");
    let key_parts: Vec<String> = cleaned
        .split_whitespace()
        .filter(|part| part.len() > 1)
        .map(str::to_lowercase)
        .collect();
    let required_parts = key_parts.len().min(2);
    let compact = (formula.len() < 20).then(|| formula.replace(' ', ""));

    lines.iter().any(|(line, lowercase_line)| {
        let matching_parts = key_parts
            .iter()
            .filter(|part| lowercase_line.contains(part.as_str()))
            .count();
        (!key_parts.is_empty() && matching_parts >= required_parts)
            || line.contains(collapsed.as_str())
            || line.contains(formula)
            || compact
                .as_deref()
                .is_some_and(|compact_formula| line.contains(compact_formula))
    })
}