Skip to main content

web_capture/
verify.rs

//! Content verification module (R6).
//!
//! Compares captured markdown content against the original web page
//! to verify completeness and accuracy.
//!
//! Checks: title, headings, paragraphs, code blocks, formulas,
//! blockquote formulas, list items, links, and figure images.
//!
//! Based on the reference implementation from:
//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/verify.mjs>

12use regex::Regex;
13use serde::{Deserialize, Serialize};
14
15/// Heading with level and text.
16#[derive(Debug, Clone, Serialize, Deserialize)]
17pub struct Heading {
18    pub level: u8,
19    pub text: String,
20}
21
22/// Content extracted from a web page for verification.
23#[derive(Debug, Clone, Default, Serialize, Deserialize)]
24pub struct WebContent {
25    #[serde(default)]
26    pub title: Option<String>,
27    #[serde(default)]
28    pub headings: Vec<Heading>,
29    #[serde(default)]
30    pub paragraphs: Vec<String>,
31    #[serde(default)]
32    pub code_blocks: Vec<String>,
33    #[serde(default)]
34    pub formulas: Vec<String>,
35    #[serde(default)]
36    pub blockquote_formulas: Vec<String>,
37    #[serde(default)]
38    pub list_items: Vec<String>,
39    #[serde(default)]
40    pub figures: Vec<u32>,
41}
42
43/// Missing content detected during verification.
44#[derive(Debug, Clone, Default, Serialize, Deserialize)]
45pub struct MissingContent {
46    pub title: bool,
47    pub headings: Vec<String>,
48    pub paragraphs: Vec<String>,
49    pub code_blocks: Vec<String>,
50    pub formulas: Vec<String>,
51    pub blockquote_formulas: Vec<String>,
52    pub list_items: Vec<String>,
53    pub images: u32,
54}
55
/// Verification options.
#[derive(Debug, Clone, Default)]
pub struct VerifyOptions {
    /// Verbose-output flag.
    /// NOTE(review): `verify_markdown_content` does not read this field.
    pub verbose: bool,
    /// Number of figure images the markdown is expected to reference.
    /// The figure check runs only when this is `Some` and `has_local_images` is `true`.
    pub expected_figures: Option<u32>,
    /// Whether the capture stored images locally; gates the figure check.
    pub has_local_images: bool,
}
63
64/// Result of content verification.
65#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct VerifyResult {
67    pub total_checks: u32,
68    pub passed_checks: u32,
69    pub pass_rate: f64,
70    pub has_missing_content: bool,
71    pub missing: MissingContent,
72    pub success: bool,
73}
74
/// Removes every `\displaystyle` command together with the whitespace that follows it.
fn strip_displaystyle(text: &str) -> String {
    let mut pieces = text.split("\\displaystyle");
    let mut out = String::with_capacity(text.len());
    if let Some(first) = pieces.next() {
        out.push_str(first);
    }
    for piece in pieces {
        // Each later piece starts right after a removed command.
        out.push_str(piece.trim_start());
    }
    out
}

/// Replaces each `\text{...}` wrapper with the text between its braces.
///
/// The wrapper ends at the first `}` after the opening brace; an opener
/// without any closing brace is left untouched.
fn unwrap_latex_text(text: &str) -> String {
    const OPEN: &str = "\\text{";

    let mut out = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(start) = rest.find(OPEN) {
        let inner_start = start + OPEN.len();
        let Some(inner_len) = rest[inner_start..].find('}') else {
            // No closing brace anywhere after this opener, so nothing later can match either.
            break;
        };
        out.push_str(&rest[..start]);
        out.push_str(&rest[inner_start..inner_start + inner_len]);
        rest = &rest[inner_start + inner_len + 1..];
    }
    out.push_str(rest);
    out
}

/// Normalize text for comparison.
///
/// Steps, in order:
/// 1. trim and collapse every run of Unicode whitespace (including NBSP) to one space;
/// 2. map typographic quotes, `×`, arrows and the Unicode minus to ASCII;
/// 3. drop `$` delimiters and rewrite `\times` / `\to`;
/// 4. remove `\displaystyle` and unwrap `\text{...}`;
/// 5. rewrite `\%`, `\subseteq`, `\in`, `\emptyset`, `^2`, `^n` to plain symbols;
/// 6. lowercase, then turn `\mathbb{n}_0` (any original letter case) into `ℕ₀`.
///
/// Only std string operations are used, so no regex is compiled per call.
///
/// NOTE(review): the replacements are plain substring matches, so `\in` also
/// rewrites the prefix of `\int` / `\infty`, and `\to` the prefix of `\top`.
/// `verify_markdown_content` normalizes both sides of every comparison with
/// this function, so the rewrite is symmetric there.
#[must_use]
pub fn normalize_text(text: &str) -> String {
    // Applied before the structural LaTeX handling. Removing every `$` also
    // covers the `$$` display delimiter.
    const SYMBOLS: &[(&str, &str)] = &[
        ("\u{2018}", "'"),
        ("\u{2019}", "'"),
        ("\u{201C}", "\""),
        ("\u{201D}", "\""),
        ("\u{00D7}", "x"),
        ("\u{2192}", "->"),
        ("\u{21A6}", "->"),
        ("\u{2212}", "-"),
        ("$", ""),
        ("\\times", "x"),
        ("\\to", "->"),
    ];
    // Applied after `\displaystyle` / `\text{}` handling; order matters
    // (`\\%` before `\%`, `\subseteq` before `\in`).
    const LATEX: &[(&str, &str)] = &[
        ("\\\\%", "%"),
        ("\\%", "%"),
        ("\\subseteq", "\u{2286}"),
        ("\\in", "\u{2208}"),
        ("\\emptyset", "\u{2205}"),
        ("^2", "\u{00B2}"),
        ("^n", "\u{207F}"),
    ];

    // `split_whitespace` trims both ends and splits on Unicode White_Space,
    // so NBSP and friends collapse here as well.
    let mut result = text.split_whitespace().collect::<Vec<_>>().join(" ");

    for &(from, to) in SYMBOLS {
        result = result.replace(from, to);
    }

    result = unwrap_latex_text(&strip_displaystyle(&result));

    for &(from, to) in LATEX {
        result = result.replace(from, to);
    }

    // Lowercasing first makes the `\mathbb{N}_0` match case-insensitive;
    // the replacement characters have no case of their own.
    result
        .to_lowercase()
        .replace("\\mathbb{n}_0", "\u{2115}\u{2080}")
}
126
/// Normalize code for comparison (more lenient than text).
///
/// Trims the input, collapses every run of Unicode whitespace (including NBSP)
/// to a single space, maps `×` and `\times` to `x`, removes `$` delimiters and
/// lowercases the result. Unlike [`normalize_text`], quotes, arrows and other
/// LaTeX commands are left alone.
///
/// Only std string operations are used, so calling this once per code line does
/// not compile a regex each time.
#[must_use]
pub fn normalize_code(text: &str) -> String {
    text.split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
        .replace('\u{00D7}', "x")
        // Removing every `$` also covers the `$$` display delimiter.
        .replace('$', "")
        .replace("\\times", "x")
        .to_lowercase()
}
144
145/// Verify that markdown contains the expected web page content.
146#[must_use]
147#[allow(clippy::too_many_lines, clippy::cast_precision_loss)]
148pub fn verify_markdown_content(
149    web_content: &WebContent,
150    markdown_text: &str,
151    options: &VerifyOptions,
152) -> VerifyResult {
153    let normalized_markdown = normalize_text(markdown_text);
154    let mut missing = MissingContent::default();
155    let mut total_checks: u32 = 0;
156    let mut passed_checks: u32 = 0;
157
158    // Check title
159    if let Some(ref title) = web_content.title {
160        total_checks += 1;
161        let normalized_title = normalize_text(title);
162        if normalized_markdown.contains(&normalized_title) {
163            passed_checks += 1;
164        } else {
165            missing.title = true;
166        }
167    }
168
169    // Check headings
170    for heading in &web_content.headings {
171        total_checks += 1;
172        let normalized = normalize_text(&heading.text);
173        if normalized_markdown.contains(&normalized) {
174            passed_checks += 1;
175        } else {
176            missing.headings.push(heading.text.clone());
177        }
178    }
179
180    // Check paragraphs (sample first 5 and last 5)
181    let paragraphs = &web_content.paragraphs;
182    let first_five = paragraphs.iter().take(5);
183    let last_five = if paragraphs.len() > 5 {
184        paragraphs.iter().skip(paragraphs.len().saturating_sub(5))
185    } else {
186        paragraphs.iter().skip(paragraphs.len()) // empty
187    };
188    let paragraphs_to_check: Vec<&String> = first_five.chain(last_five).collect();
189
190    for paragraph in &paragraphs_to_check {
191        total_checks += 1;
192        let normalized = normalize_text(paragraph);
193        let words: Vec<&str> = normalized.split(' ').filter(|w| w.len() > 2).collect();
194        let matching_words = words
195            .iter()
196            .filter(|word| normalized_markdown.contains(**word))
197            .count();
198        let match_rate = if words.is_empty() {
199            0.0
200        } else {
201            matching_words as f64 / words.len() as f64
202        };
203
204        let substring_match = normalized.len() > 20
205            && normalized_markdown.contains(&normalized[..normalized.len().min(50)]);
206
207        if match_rate >= 0.6 || substring_match {
208            passed_checks += 1;
209        } else {
210            let truncated = if paragraph.len() > 100 {
211                format!("{}...", &paragraph[..100])
212            } else {
213                format!("{paragraph}...")
214            };
215            missing.paragraphs.push(truncated);
216        }
217    }
218
219    // Check code blocks (fuzzy matching)
220    let normalized_markdown_for_code = normalize_code(markdown_text);
221    let punctuation_only_re = Regex::new(r"^[{}\[\](),;]+$").ok();
222    for code in &web_content.code_blocks {
223        total_checks += 1;
224        let normalized_code_full = normalize_code(code);
225
226        let lines: Vec<&str> = code
227            .lines()
228            .map(str::trim)
229            .filter(|l| {
230                l.len() > 3
231                    && !punctuation_only_re
232                        .as_ref()
233                        .is_some_and(|re| re.is_match(l))
234            })
235            .collect();
236
237        let matching_lines = lines
238            .iter()
239            .filter(|line| {
240                let normalized_line = normalize_code(line);
241                normalized_markdown_for_code.contains(&normalized_line)
242            })
243            .count();
244
245        let match_rate = if lines.is_empty() {
246            1.0
247        } else {
248            matching_lines as f64 / lines.len() as f64
249        };
250
251        if match_rate >= 0.6 || normalized_markdown_for_code.contains(&normalized_code_full) {
252            passed_checks += 1;
253        } else {
254            let truncated = if code.len() > 100 {
255                format!("{}...", &code[..100])
256            } else {
257                format!("{code}...")
258            };
259            missing.code_blocks.push(truncated);
260        }
261    }
262
263    // Check list items (sample first 10)
264    for item in web_content.list_items.iter().take(10) {
265        total_checks += 1;
266        let normalized = normalize_text(item);
267        let words: Vec<&str> = normalized.split(' ').filter(|w| w.len() > 2).collect();
268        let matching_words = words
269            .iter()
270            .filter(|word| normalized_markdown.contains(**word))
271            .count();
272        let match_rate = if words.is_empty() {
273            0.0
274        } else {
275            matching_words as f64 / words.len() as f64
276        };
277
278        let substring_match = normalized.len() > 15
279            && normalized_markdown.contains(&normalized[..normalized.len().min(40)]);
280
281        if match_rate >= 0.6 || substring_match {
282            passed_checks += 1;
283        } else {
284            let truncated = if item.len() > 100 {
285                format!("{}...", &item[..100])
286            } else {
287                format!("{item}...")
288            };
289            missing.list_items.push(truncated);
290        }
291    }
292
293    // Check blockquote formulas
294    let blockquote_re = Regex::new(r"(?m)^>.*$").unwrap();
295    for formula in &web_content.blockquote_formulas {
296        total_checks += 1;
297        let normalized_formula = formula.split_whitespace().collect::<Vec<_>>().join(" ");
298
299        // Extract key parts
300        let cleaned = normalized_formula
301            .replace("\\mathbf{", "")
302            .replace("\\textbf{", "")
303            .replace(['{', '}', '\\'], "");
304        let key_parts: Vec<&str> = cleaned
305            .split_whitespace()
306            .filter(|part| part.len() > 1)
307            .collect();
308
309        // Find blockquote lines
310        let blockquote_lines: Vec<&str> = blockquote_re
311            .find_iter(markdown_text)
312            .map(|m| m.as_str())
313            .collect();
314
315        let mut found = false;
316        for line in &blockquote_lines {
317            if line.contains('$') {
318                let matching_parts = key_parts
319                    .iter()
320                    .filter(|part| line.to_lowercase().contains(&part.to_lowercase()))
321                    .count();
322                if !key_parts.is_empty() && matching_parts >= key_parts.len().min(2) {
323                    found = true;
324                    break;
325                }
326                if line.contains(&normalized_formula)
327                    || line.contains(formula.as_str())
328                    || (formula.len() < 20 && line.contains(&formula.replace(' ', "")))
329                {
330                    found = true;
331                    break;
332                }
333            }
334        }
335
336        if found {
337            passed_checks += 1;
338        } else {
339            let truncated = if formula.len() > 100 {
340                formula[..100].to_string()
341            } else {
342                formula.clone()
343            };
344            missing.blockquote_formulas.push(truncated);
345        }
346    }
347
348    // Check for figure images
349    if options.has_local_images {
350        if let Some(expected) = options.expected_figures {
351            total_checks += 1;
352            let figure_re = Regex::new(
353                r"(?i)!\[(?:\*\*)?(?:Figure|Рис\.?|Рисунок)\s*\d+[\s\S]*?\]\(images/figure-\d+\.(png|jpg)\)",
354            )
355            .unwrap();
356            #[allow(clippy::cast_possible_truncation)]
357            let figure_count = figure_re.find_iter(markdown_text).count() as u32;
358            if figure_count >= expected {
359                passed_checks += 1;
360            } else {
361                missing.images = expected - figure_count;
362            }
363        }
364    }
365
366    // Calculate results
367    let pass_rate = if total_checks > 0 {
368        f64::from(passed_checks) / f64::from(total_checks)
369    } else {
370        0.0
371    };
372    let has_missing_content = missing.title
373        || missing.images > 0
374        || !missing.headings.is_empty()
375        || !missing.paragraphs.is_empty()
376        || !missing.code_blocks.is_empty()
377        || !missing.formulas.is_empty()
378        || !missing.blockquote_formulas.is_empty()
379        || !missing.list_items.is_empty();
380
381    VerifyResult {
382        total_checks,
383        passed_checks,
384        pass_rate,
385        has_missing_content,
386        success: !has_missing_content || pass_rate >= 0.85,
387        missing,
388    }
389}