1use regex::Regex;
10use scraper::{Html, Selector};
11use serde::{Deserialize, Serialize};
12use url::Url;
13
#[derive(Debug, Clone, Serialize, Deserialize)]
/// An image found inside an HTML `<figure>` element, as produced by
/// [`extract_figures`].
pub struct Figure {
    /// Number parsed from the caption text (e.g. the `3` in "Figure 3").
    /// Falls back to [`Figure::sequential_index`] when the caption contains
    /// no parsable number.
    pub figure_num: u32,
    /// Image URL. Resolved against the page's base URL when that succeeds,
    /// otherwise the `src` attribute text left unresolved.
    pub src: String,
    /// Value of the image's `alt` attribute; empty when the attribute is absent.
    pub alt: String,
    /// Trimmed text of the figure's first `<figcaption>`; empty when there is none.
    pub caption: String,
    /// 1-based position of this figure among the figures that were kept
    /// (skipped figures do not consume an index).
    pub sequential_index: u32,
}
23
#[derive(Debug, Clone, Serialize, Deserialize)]
/// Outcome of trying to download one [`Figure`], as produced by
/// [`download_figures`].
pub struct FigureDownloadResult {
    /// Copied from [`Figure::figure_num`].
    pub figure_num: u32,
    /// Suggested file name of the form `figure-<figure_num>.<ext>`, where
    /// `<ext>` is `jpg` or `png`.
    pub filename: String,
    /// Downloaded bytes; `None` when every attempt failed. Excluded from
    /// serde (de)serialisation, so the payload never ends up in serialised output.
    #[serde(skip)]
    pub buffer: Option<Vec<u8>>,
    /// Copied from [`Figure::caption`].
    pub caption: String,
    /// The URL that was requested (copied from [`Figure::src`]).
    pub original_url: String,
    /// Message of the last failed attempt; `None` on success, in which case
    /// the field is omitted from serialised output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
36
37#[must_use]
43pub fn extract_figures(html: &str, base_url: &str) -> Vec<Figure> {
44 let document = Html::parse_document(html);
45 let mut figures = Vec::new();
46 let mut sequential_index: u32 = 0;
47
48 let Ok(figure_sel) = Selector::parse("figure") else {
49 return figures;
50 };
51 let img_sel = Selector::parse("img").unwrap();
52 let caption_sel = Selector::parse("figcaption").unwrap();
53
54 let figure_num_re = Regex::new(r"(?i)(?:Figure|Рис\.?|Рисунок)\s*(\d+)").unwrap();
55
56 for figure_el in document.select(&figure_sel) {
57 let Some(img) = figure_el.select(&img_sel).next() else {
58 continue;
59 };
60
61 let src = match img.value().attr("src") {
62 Some(s) if !s.starts_with("data:") && !s.contains(".svg") => s,
63 _ => continue,
64 };
65
66 sequential_index += 1;
67
68 let caption_text = figure_el
69 .select(&caption_sel)
70 .next()
71 .map(|el| el.text().collect::<String>().trim().to_string())
72 .unwrap_or_default();
73
74 let figure_num = figure_num_re
75 .captures(&caption_text)
76 .and_then(|cap| cap[1].parse::<u32>().ok())
77 .unwrap_or(sequential_index);
78
79 let resolved_src = Url::parse(base_url)
80 .ok()
81 .and_then(|base| base.join(src).ok())
82 .map_or_else(|| src.to_string(), |u| u.to_string());
83
84 let alt = img.value().attr("alt").unwrap_or("").to_string();
85
86 figures.push(Figure {
87 figure_num,
88 src: resolved_src,
89 alt,
90 caption: caption_text,
91 sequential_index,
92 });
93 }
94
95 figures
96}
97
98pub async fn download_figures(figures: &[Figure]) -> Vec<FigureDownloadResult> {
103 let mut results = Vec::new();
104 let client = reqwest::Client::new();
105
106 for figure in figures {
107 let ext = if figure.src.contains(".jpeg") || figure.src.contains(".jpg") {
108 "jpg"
109 } else {
110 "png"
111 };
112 let filename = format!("figure-{}.{ext}", figure.figure_num);
113
114 let mut last_error = None;
115 let mut buffer = None;
116
117 for attempt in 0..3 {
118 match client.get(&figure.src).send().await {
119 Ok(resp) if resp.status().is_success() => match resp.bytes().await {
120 Ok(bytes) => {
121 buffer = Some(bytes.to_vec());
122 break;
123 }
124 Err(e) => last_error = Some(e.to_string()),
125 },
126 Ok(resp) => last_error = Some(format!("HTTP {}", resp.status())),
127 Err(e) => last_error = Some(e.to_string()),
128 }
129 if attempt < 2 {
130 tokio::time::sleep(std::time::Duration::from_secs(1)).await;
131 }
132 }
133
134 let has_buffer = buffer.is_some();
135 results.push(FigureDownloadResult {
136 figure_num: figure.figure_num,
137 filename,
138 buffer,
139 caption: figure.caption.clone(),
140 original_url: figure.src.clone(),
141 error: if has_buffer { None } else { last_error },
142 });
143 }
144
145 results
146}