use std::sync::OnceLock;

use regex::Regex;
use serde::{Deserialize, Serialize};
14
/// A single heading extracted from the source web page.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Heading {
    /// Heading depth; presumably 1-6 mirroring HTML `<h1>`-`<h6>` — confirm
    /// against the extractor that populates this.
    pub level: u8,
    /// Visible heading text.
    pub text: String,
}
21
/// Reference content extracted from a web page, used to verify that a
/// Markdown conversion preserved it.
///
/// Every field is `#[serde(default)]`, so partially populated JSON still
/// deserializes to a usable value.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct WebContent {
    /// Page title, when one was extracted.
    #[serde(default)]
    pub title: Option<String>,
    /// Extracted headings.
    #[serde(default)]
    pub headings: Vec<Heading>,
    /// Extracted paragraph texts.
    #[serde(default)]
    pub paragraphs: Vec<String>,
    /// Extracted code blocks (raw, possibly multi-line).
    #[serde(default)]
    pub code_blocks: Vec<String>,
    /// Extracted formulas. NOTE(review): not checked by
    /// `verify_markdown_content` in this file — confirm whether that is
    /// intentional.
    #[serde(default)]
    pub formulas: Vec<String>,
    /// Formulas that appeared inside blockquotes.
    #[serde(default)]
    pub blockquote_formulas: Vec<String>,
    /// Extracted list item texts.
    #[serde(default)]
    pub list_items: Vec<String>,
    /// Figure numbers; presumably ordinals from the page — confirm with the
    /// extractor.
    #[serde(default)]
    pub figures: Vec<u32>,
}
42
/// Reference content that verification could not find in the Markdown output.
///
/// String entries are truncated to roughly 100 bytes for reporting.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct MissingContent {
    /// True when the page title is absent from the Markdown.
    pub title: bool,
    /// Headings whose text was not found.
    pub headings: Vec<String>,
    /// Paragraphs that failed the fuzzy match.
    pub paragraphs: Vec<String>,
    /// Code blocks that failed the line-match check.
    pub code_blocks: Vec<String>,
    /// Formulas not found. NOTE(review): never populated by
    /// `verify_markdown_content` in this file, though it is consulted when
    /// computing `has_missing_content`.
    pub formulas: Vec<String>,
    /// Blockquote formulas not found in any `$`-bearing blockquote line.
    pub blockquote_formulas: Vec<String>,
    /// List items that failed the fuzzy match.
    pub list_items: Vec<String>,
    /// Number of expected figures not found as local image links.
    pub images: u32,
}
55
/// Options controlling `verify_markdown_content`.
#[derive(Debug, Clone, Default)]
pub struct VerifyOptions {
    /// Emit extra diagnostics. NOTE(review): not read in this module —
    /// presumably consumed by callers; confirm.
    pub verbose: bool,
    /// Expected number of figures; the figure check runs only when this is
    /// `Some` and `has_local_images` is true.
    pub expected_figures: Option<u32>,
    /// Whether images were downloaded locally (enables the figure check).
    pub has_local_images: bool,
}
63
/// Outcome of verifying Markdown output against extracted web content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerifyResult {
    /// Total number of checks performed.
    pub total_checks: u32,
    /// Number of checks that passed.
    pub passed_checks: u32,
    /// `passed_checks / total_checks`, or 0.0 when no checks ran.
    pub pass_rate: f64,
    /// True when any reference content was reported missing.
    pub has_missing_content: bool,
    /// Details of what was missing.
    pub missing: MissingContent,
    /// Overall verdict: nothing missing, or pass rate of at least 0.85.
    pub success: bool,
}
74
75#[must_use]
80pub fn normalize_text(text: &str) -> String {
81 let mut result = text.trim().to_string();
82
83 if let Ok(re) = Regex::new(r"\s+") {
85 result = re.replace_all(&result, " ").to_string();
86 }
87 result = result.replace('\u{00A0}', " ");
89 result = result.replace('\u{2018}', "'");
90 result = result.replace('\u{2019}', "'");
91 result = result.replace('\u{201C}', "\"");
92 result = result.replace('\u{201D}', "\"");
93 result = result.replace('\u{00D7}', "x");
94 result = result.replace('\u{2192}', "->");
95 result = result.replace('\u{21A6}', "->");
96 result = result.replace('\u{2212}', "-");
97
98 result = result.replace("$$", "");
100 result = result.replace('$', "");
101
102 result = result.replace("\\times", "x");
104 result = result.replace("\\to", "->");
105 if let Ok(re) = Regex::new(r"\\displaystyle\s*") {
106 result = re.replace_all(&result, "").to_string();
107 }
108 if let Ok(re) = Regex::new(r"\\text\{([^}]*)\}") {
109 result = re.replace_all(&result, "$1").to_string();
110 }
111 result = result.replace("\\\\%", "%");
112 result = result.replace("\\%", "%");
113 result = result.replace("\\subseteq", "\u{2286}");
114 result = result.replace("\\in", "\u{2208}");
115 result = result.replace("\\emptyset", "\u{2205}");
116 result = result.replace("^2", "\u{00B2}");
117 result = result.replace("^n", "\u{207F}");
118
119 if let Ok(re) = Regex::new(r"(?i)\\mathbb\{n\}_0") {
121 result = re.replace_all(&result, "\u{2115}\u{2080}").to_string();
122 }
123
124 result.to_lowercase()
125}
126
127#[must_use]
129pub fn normalize_code(text: &str) -> String {
130 let mut result = text.trim().to_string();
131
132 if let Ok(re) = Regex::new(r"\s+") {
133 result = re.replace_all(&result, " ").to_string();
134 }
135 result = result.replace('\u{00A0}', " ");
136 result = result.replace('\u{00D7}', "x");
137 result = result.replace("$$", "");
138 result = result.replace('$', "");
139
140 result = result.replace("\\times", "x");
141
142 result.to_lowercase()
143}
144
145#[must_use]
147#[allow(clippy::too_many_lines, clippy::cast_precision_loss)]
148pub fn verify_markdown_content(
149 web_content: &WebContent,
150 markdown_text: &str,
151 options: &VerifyOptions,
152) -> VerifyResult {
153 let normalized_markdown = normalize_text(markdown_text);
154 let mut missing = MissingContent::default();
155 let mut total_checks: u32 = 0;
156 let mut passed_checks: u32 = 0;
157
158 if let Some(ref title) = web_content.title {
160 total_checks += 1;
161 let normalized_title = normalize_text(title);
162 if normalized_markdown.contains(&normalized_title) {
163 passed_checks += 1;
164 } else {
165 missing.title = true;
166 }
167 }
168
169 for heading in &web_content.headings {
171 total_checks += 1;
172 let normalized = normalize_text(&heading.text);
173 if normalized_markdown.contains(&normalized) {
174 passed_checks += 1;
175 } else {
176 missing.headings.push(heading.text.clone());
177 }
178 }
179
180 let paragraphs = &web_content.paragraphs;
182 let first_five = paragraphs.iter().take(5);
183 let last_five = if paragraphs.len() > 5 {
184 paragraphs.iter().skip(paragraphs.len().saturating_sub(5))
185 } else {
186 paragraphs.iter().skip(paragraphs.len()) };
188 let paragraphs_to_check: Vec<&String> = first_five.chain(last_five).collect();
189
190 for paragraph in ¶graphs_to_check {
191 total_checks += 1;
192 let normalized = normalize_text(paragraph);
193 let words: Vec<&str> = normalized.split(' ').filter(|w| w.len() > 2).collect();
194 let matching_words = words
195 .iter()
196 .filter(|word| normalized_markdown.contains(**word))
197 .count();
198 let match_rate = if words.is_empty() {
199 0.0
200 } else {
201 matching_words as f64 / words.len() as f64
202 };
203
204 let substring_match = normalized.len() > 20
205 && normalized_markdown.contains(&normalized[..normalized.len().min(50)]);
206
207 if match_rate >= 0.6 || substring_match {
208 passed_checks += 1;
209 } else {
210 let truncated = if paragraph.len() > 100 {
211 format!("{}...", ¶graph[..100])
212 } else {
213 format!("{paragraph}...")
214 };
215 missing.paragraphs.push(truncated);
216 }
217 }
218
219 let normalized_markdown_for_code = normalize_code(markdown_text);
221 let punctuation_only_re = Regex::new(r"^[{}\[\](),;]+$").ok();
222 for code in &web_content.code_blocks {
223 total_checks += 1;
224 let normalized_code_full = normalize_code(code);
225
226 let lines: Vec<&str> = code
227 .lines()
228 .map(str::trim)
229 .filter(|l| {
230 l.len() > 3
231 && !punctuation_only_re
232 .as_ref()
233 .is_some_and(|re| re.is_match(l))
234 })
235 .collect();
236
237 let matching_lines = lines
238 .iter()
239 .filter(|line| {
240 let normalized_line = normalize_code(line);
241 normalized_markdown_for_code.contains(&normalized_line)
242 })
243 .count();
244
245 let match_rate = if lines.is_empty() {
246 1.0
247 } else {
248 matching_lines as f64 / lines.len() as f64
249 };
250
251 if match_rate >= 0.6 || normalized_markdown_for_code.contains(&normalized_code_full) {
252 passed_checks += 1;
253 } else {
254 let truncated = if code.len() > 100 {
255 format!("{}...", &code[..100])
256 } else {
257 format!("{code}...")
258 };
259 missing.code_blocks.push(truncated);
260 }
261 }
262
263 for item in web_content.list_items.iter().take(10) {
265 total_checks += 1;
266 let normalized = normalize_text(item);
267 let words: Vec<&str> = normalized.split(' ').filter(|w| w.len() > 2).collect();
268 let matching_words = words
269 .iter()
270 .filter(|word| normalized_markdown.contains(**word))
271 .count();
272 let match_rate = if words.is_empty() {
273 0.0
274 } else {
275 matching_words as f64 / words.len() as f64
276 };
277
278 let substring_match = normalized.len() > 15
279 && normalized_markdown.contains(&normalized[..normalized.len().min(40)]);
280
281 if match_rate >= 0.6 || substring_match {
282 passed_checks += 1;
283 } else {
284 let truncated = if item.len() > 100 {
285 format!("{}...", &item[..100])
286 } else {
287 format!("{item}...")
288 };
289 missing.list_items.push(truncated);
290 }
291 }
292
293 let blockquote_re = Regex::new(r"(?m)^>.*$").unwrap();
295 for formula in &web_content.blockquote_formulas {
296 total_checks += 1;
297 let normalized_formula = formula.split_whitespace().collect::<Vec<_>>().join(" ");
298
299 let cleaned = normalized_formula
301 .replace("\\mathbf{", "")
302 .replace("\\textbf{", "")
303 .replace(['{', '}', '\\'], "");
304 let key_parts: Vec<&str> = cleaned
305 .split_whitespace()
306 .filter(|part| part.len() > 1)
307 .collect();
308
309 let blockquote_lines: Vec<&str> = blockquote_re
311 .find_iter(markdown_text)
312 .map(|m| m.as_str())
313 .collect();
314
315 let mut found = false;
316 for line in &blockquote_lines {
317 if line.contains('$') {
318 let matching_parts = key_parts
319 .iter()
320 .filter(|part| line.to_lowercase().contains(&part.to_lowercase()))
321 .count();
322 if !key_parts.is_empty() && matching_parts >= key_parts.len().min(2) {
323 found = true;
324 break;
325 }
326 if line.contains(&normalized_formula)
327 || line.contains(formula.as_str())
328 || (formula.len() < 20 && line.contains(&formula.replace(' ', "")))
329 {
330 found = true;
331 break;
332 }
333 }
334 }
335
336 if found {
337 passed_checks += 1;
338 } else {
339 let truncated = if formula.len() > 100 {
340 formula[..100].to_string()
341 } else {
342 formula.clone()
343 };
344 missing.blockquote_formulas.push(truncated);
345 }
346 }
347
348 if options.has_local_images {
350 if let Some(expected) = options.expected_figures {
351 total_checks += 1;
352 let figure_re = Regex::new(
353 r"(?i)!\[(?:\*\*)?(?:Figure|Рис\.?|Рисунок)\s*\d+[\s\S]*?\]\(images/figure-\d+\.(png|jpg)\)",
354 )
355 .unwrap();
356 #[allow(clippy::cast_possible_truncation)]
357 let figure_count = figure_re.find_iter(markdown_text).count() as u32;
358 if figure_count >= expected {
359 passed_checks += 1;
360 } else {
361 missing.images = expected - figure_count;
362 }
363 }
364 }
365
366 let pass_rate = if total_checks > 0 {
368 f64::from(passed_checks) / f64::from(total_checks)
369 } else {
370 0.0
371 };
372 let has_missing_content = missing.title
373 || missing.images > 0
374 || !missing.headings.is_empty()
375 || !missing.paragraphs.is_empty()
376 || !missing.code_blocks.is_empty()
377 || !missing.formulas.is_empty()
378 || !missing.blockquote_formulas.is_empty()
379 || !missing.list_items.is_empty();
380
381 VerifyResult {
382 total_checks,
383 passed_checks,
384 pass_rate,
385 has_missing_content,
386 success: !has_missing_content || pass_rate >= 0.85,
387 missing,
388 }
389}