Skip to main content

web_capture/
figures.rs

1//! Figure image extraction and download module (R4).
2//!
3//! Extracts figure images from web pages and downloads them locally.
4//! Supports multi-language figure detection (English/Russian).
5//!
6//! Based on reference implementation from:
7//! <https://github.com/link-foundation/meta-theory/blob/main/scripts/download.mjs>
8
9use regex::Regex;
10use scraper::{Html, Selector};
11use serde::{Deserialize, Serialize};
12use url::Url;
13
14/// A figure extracted from HTML.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct Figure {
17    pub figure_num: u32,
18    pub src: String,
19    pub alt: String,
20    pub caption: String,
21    pub sequential_index: u32,
22}
23
24/// Result of downloading a single figure.
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct FigureDownloadResult {
27    pub figure_num: u32,
28    pub filename: String,
29    #[serde(skip)]
30    pub buffer: Option<Vec<u8>>,
31    pub caption: String,
32    pub original_url: String,
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub error: Option<String>,
35}
36
37/// Extract figure elements from HTML content.
38///
39/// Finds `<figure>` elements containing `<img>` tags and extracts
40/// their source URLs, alt text, and captions with multi-language
41/// figure number detection.
42#[must_use]
43pub fn extract_figures(html: &str, base_url: &str) -> Vec<Figure> {
44    let document = Html::parse_document(html);
45    let mut figures = Vec::new();
46    let mut sequential_index: u32 = 0;
47
48    let Ok(figure_sel) = Selector::parse("figure") else {
49        return figures;
50    };
51    let img_sel = Selector::parse("img").unwrap();
52    let caption_sel = Selector::parse("figcaption").unwrap();
53
54    let figure_num_re = Regex::new(r"(?i)(?:Figure|Рис\.?|Рисунок)\s*(\d+)").unwrap();
55
56    for figure_el in document.select(&figure_sel) {
57        let Some(img) = figure_el.select(&img_sel).next() else {
58            continue;
59        };
60
61        let src = match img.value().attr("src") {
62            Some(s) if !s.starts_with("data:") && !s.contains(".svg") => s,
63            _ => continue,
64        };
65
66        sequential_index += 1;
67
68        let caption_text = figure_el
69            .select(&caption_sel)
70            .next()
71            .map(|el| el.text().collect::<String>().trim().to_string())
72            .unwrap_or_default();
73
74        let figure_num = figure_num_re
75            .captures(&caption_text)
76            .and_then(|cap| cap[1].parse::<u32>().ok())
77            .unwrap_or(sequential_index);
78
79        let resolved_src = Url::parse(base_url)
80            .ok()
81            .and_then(|base| base.join(src).ok())
82            .map_or_else(|| src.to_string(), |u| u.to_string());
83
84        let alt = img.value().attr("alt").unwrap_or("").to_string();
85
86        figures.push(Figure {
87            figure_num,
88            src: resolved_src,
89            alt,
90            caption: caption_text,
91            sequential_index,
92        });
93    }
94
95    figures
96}
97
98/// Download figure images.
99///
100/// Downloads each figure's image and returns results with the buffer
101/// or error information.
102pub async fn download_figures(figures: &[Figure]) -> Vec<FigureDownloadResult> {
103    let mut results = Vec::new();
104    let client = reqwest::Client::new();
105
106    for figure in figures {
107        let ext = if figure.src.contains(".jpeg") || figure.src.contains(".jpg") {
108            "jpg"
109        } else {
110            "png"
111        };
112        let filename = format!("figure-{}.{ext}", figure.figure_num);
113
114        let mut last_error = None;
115        let mut buffer = None;
116
117        for attempt in 0..3 {
118            match client.get(&figure.src).send().await {
119                Ok(resp) if resp.status().is_success() => match resp.bytes().await {
120                    Ok(bytes) => {
121                        buffer = Some(bytes.to_vec());
122                        break;
123                    }
124                    Err(e) => last_error = Some(e.to_string()),
125                },
126                Ok(resp) => last_error = Some(format!("HTTP {}", resp.status())),
127                Err(e) => last_error = Some(e.to_string()),
128            }
129            if attempt < 2 {
130                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
131            }
132        }
133
134        let has_buffer = buffer.is_some();
135        results.push(FigureDownloadResult {
136            figure_num: figure.figure_num,
137            filename,
138            buffer,
139            caption: figure.caption.clone(),
140            original_url: figure.src.clone(),
141            error: if has_buffer { None } else { last_error },
142        });
143    }
144
145    results
146}