blockless_sdk/bless_crawl/
html_transform.rs1use kuchikiki::{parse_html, traits::TendrilSink};
2use serde::{Deserialize, Serialize};
3use url::Url;
4
/// CSS selectors for page furniture that `transform_html` detaches when
/// `only_main_content` is requested.
///
/// A matched element is spared when a selector from `FORCE_INCLUDE_MAIN_TAGS`
/// finds something inside it. The groups below follow what the selector names
/// suggest; the order of entries is the order in which they are processed.
const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
    // Structural elements.
    "header", "footer", "nav", "aside",
    // Header-like containers.
    ".header", ".top", ".navbar", "#header",
    // Footer-like containers.
    ".footer", ".bottom", "#footer",
    // Sidebars.
    ".sidebar", ".side", ".aside", "#sidebar",
    // Modals, popups and overlays.
    ".modal", ".popup", "#modal", ".overlay",
    // Advertising.
    ".ad", ".ads", ".advert", "#ad",
    // Language pickers.
    ".lang-selector", ".language", "#language-selector",
    // Social-media blocks.
    ".social", ".social-media", ".social-links", "#social",
    // Menus and breadcrumbs.
    ".menu", ".navigation", "#nav", ".breadcrumbs", "#breadcrumbs",
    // Share controls.
    ".share", "#share",
    // Widgets.
    ".widget", "#widget",
    // Cookie notices.
    ".cookie", "#cookie",
];
48
/// CSS selectors that protect an element from `only_main_content` pruning.
///
/// When `transform_html` is about to detach an element matched by
/// `EXCLUDE_NON_MAIN_TAGS`, it first runs each of these selectors against that
/// element; a single hit keeps the element in the document.
const FORCE_INCLUDE_MAIN_TAGS: [&str; 13] = [
    // Explicit main-content id.
    "#main",
    // `swoogo-*` layout and content classes.
    ".swoogo-cols",
    ".swoogo-text",
    ".swoogo-table-div",
    ".swoogo-space",
    ".swoogo-alert",
    ".swoogo-sponsors",
    ".swoogo-title",
    ".swoogo-tabs",
    ".swoogo-logo",
    ".swoogo-image",
    ".swoogo-button",
    ".swoogo-agenda",
];
65
66#[derive(Debug, Clone, Deserialize, Serialize)]
67pub struct TransformHtmlOptions {
68 pub html: String,
69 pub url: String,
70 pub include_tags: Vec<String>,
71 pub exclude_tags: Vec<String>,
72 pub only_main_content: bool,
73}
74
/// One candidate image taken from an `<img>` element's `srcset` (or its `src`
/// fallback), used to pick the largest available source.
#[derive(Debug)]
struct ImageSource {
    /// Image URL exactly as written in the attribute.
    url: String,
    /// Numeric part of the candidate's descriptor (the `2` in `2x`, the `480` in `480w`).
    size: i32,
    /// `true` when the descriptor ended in `x`.
    is_x: bool,
}
81
/// Failure modes reported by the HTML transformation.
#[derive(Debug)]
pub enum HtmlTransformError {
    /// The HTML input could not be parsed.
    ParseError,
    /// The base URL could not be parsed.
    UrlParseError,
    /// A CSS selector query against the document failed.
    SelectError,
}
88
89impl std::fmt::Display for HtmlTransformError {
90 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
91 match self {
92 HtmlTransformError::ParseError => write!(f, "Failed to parse HTML"),
93 HtmlTransformError::UrlParseError => write!(f, "Failed to parse URL"),
94 HtmlTransformError::SelectError => write!(f, "Failed to select HTML elements"),
95 }
96 }
97}
98
99impl std::error::Error for HtmlTransformError {}
100
101pub fn transform_html(opts: TransformHtmlOptions) -> Result<String, HtmlTransformError> {
103 let mut document = parse_html().one(opts.html);
104
105 if !opts.include_tags.is_empty() {
107 let new_document = parse_html().one("<div></div>");
108 let root = new_document
109 .select_first("div")
110 .map_err(|_| HtmlTransformError::SelectError)?;
111
112 for tag_selector in opts.include_tags.iter() {
113 let matching_nodes: Vec<_> = document
114 .select(tag_selector)
115 .map_err(|_| HtmlTransformError::SelectError)?
116 .collect();
117 for tag in matching_nodes {
118 root.as_node().append(tag.as_node().clone());
119 }
120 }
121
122 document = new_document;
123 }
124
125 let unwanted_selectors = ["head", "meta", "noscript", "style", "script"];
127 for selector in &unwanted_selectors {
128 while let Ok(element) = document.select_first(selector) {
129 element.as_node().detach();
130 }
131 }
132
133 for tag_selector in opts.exclude_tags.iter() {
135 while let Ok(element) = document.select_first(tag_selector) {
136 element.as_node().detach();
137 }
138 }
139
140 if opts.only_main_content {
142 for selector in EXCLUDE_NON_MAIN_TAGS.iter() {
143 let elements: Vec<_> = document
144 .select(selector)
145 .map_err(|_| HtmlTransformError::SelectError)?
146 .collect();
147 for element in elements {
148 let should_keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|force_selector| {
150 element
151 .as_node()
152 .select(force_selector)
153 .map(|mut iter| iter.next().is_some())
154 .unwrap_or(false)
155 });
156
157 if !should_keep {
158 element.as_node().detach();
159 }
160 }
161 }
162 }
163
164 let srcset_images: Vec<_> = document
166 .select("img[srcset]")
167 .map_err(|_| HtmlTransformError::SelectError)?
168 .collect();
169
170 for img in srcset_images {
171 let srcset = img.attributes.borrow().get("srcset").map(|s| s.to_string());
172 if let Some(srcset) = srcset {
173 let mut sizes: Vec<ImageSource> = srcset
174 .split(',')
175 .filter_map(|entry| {
176 let tokens: Vec<&str> = entry.trim().split(' ').collect();
177 if tokens.is_empty() {
178 return None;
179 }
180
181 let size_token = if tokens.len() > 1 && !tokens[1].is_empty() {
182 tokens[1]
183 } else {
184 "1x"
185 };
186
187 if let Ok(parsed_size) = size_token[..size_token.len() - 1].parse() {
188 Some(ImageSource {
189 url: tokens[0].to_string(),
190 size: parsed_size,
191 is_x: size_token.ends_with('x'),
192 })
193 } else {
194 None
195 }
196 })
197 .collect();
198
199 if sizes.iter().all(|s| s.is_x) {
201 let src = img.attributes.borrow().get("src").map(|s| s.to_string());
202 if let Some(src) = src {
203 sizes.push(ImageSource {
204 url: src,
205 size: 1,
206 is_x: true,
207 });
208 }
209 }
210
211 sizes.sort_by(|a, b| b.size.cmp(&a.size));
213 if let Some(biggest) = sizes.first() {
214 img.attributes
215 .borrow_mut()
216 .insert("src", biggest.url.clone());
217 }
218 }
219 }
220
221 let base_url = Url::parse(&opts.url).map_err(|_| HtmlTransformError::UrlParseError)?;
223
224 let src_images: Vec<_> = document
226 .select("img[src]")
227 .map_err(|_| HtmlTransformError::SelectError)?
228 .collect();
229 for img in src_images {
230 let old_src = img.attributes.borrow().get("src").map(|s| s.to_string());
231 if let Some(old_src) = old_src {
232 if let Ok(new_url) = base_url.join(&old_src) {
233 img.attributes
234 .borrow_mut()
235 .insert("src", new_url.to_string());
236 }
237 }
238 }
239
240 let href_anchors: Vec<_> = document
242 .select("a[href]")
243 .map_err(|_| HtmlTransformError::SelectError)?
244 .collect();
245 for anchor in href_anchors {
246 let old_href = anchor
247 .attributes
248 .borrow()
249 .get("href")
250 .map(|s| s.to_string());
251 if let Some(old_href) = old_href {
252 if let Ok(new_url) = base_url.join(&old_href) {
253 anchor
254 .attributes
255 .borrow_mut()
256 .insert("href", new_url.to_string());
257 }
258 }
259 }
260
261 Ok(document.to_string())
262}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the options from borrowed pieces, runs `transform_html`, and
    /// unwraps the result so each test only states its input and expectation.
    fn transform(
        html: &str,
        url: &str,
        include: &[&str],
        exclude: &[&str],
        only_main_content: bool,
    ) -> String {
        let owned = |selectors: &[&str]| -> Vec<String> {
            selectors.iter().map(|s| s.to_string()).collect()
        };

        transform_html(TransformHtmlOptions {
            html: html.to_string(),
            url: url.to_string(),
            include_tags: owned(include),
            exclude_tags: owned(exclude),
            only_main_content,
        })
        .unwrap()
    }

    /// `head` and `script` are always stripped.
    #[test]
    fn test_transform_html_removes_unwanted_elements() {
        let result = transform(
            "<html><head><title>Test</title></head><body><p>Content</p><script>alert('test')</script></body></html>",
            "https://example.com",
            &[],
            &[],
            false,
        );
        assert_eq!(result, "<html><body><p>Content</p></body></html>");
    }

    /// Only elements matching `include_tags` survive, wrapped in a new `<div>`.
    #[test]
    fn test_transform_html_include_tags() {
        let result = transform(
            "<html><body><div class=\"content\">Keep this</div><div class=\"sidebar\">Remove this</div></body></html>",
            "https://example.com",
            &[".content"],
            &[],
            false,
        );
        assert_eq!(
            result,
            "<html><body><div><div class=\"content\">Keep this</div></div></body></html>"
        );
    }

    /// Elements matching `exclude_tags` are removed.
    #[test]
    fn test_transform_html_exclude_tags() {
        let result = transform(
            "<html><body><div class=\"content\">Keep this</div><div class=\"ad\">Remove this</div></body></html>",
            "https://example.com",
            &[],
            &[".ad"],
            false,
        );
        assert_eq!(
            result,
            "<html><body><div class=\"content\">Keep this</div></body></html>"
        );
    }

    /// Root-relative `src` / `href` values are resolved against the base URL.
    #[test]
    fn test_transform_html_relative_urls() {
        let result = transform(
            r#"<html><body><img src="/image.jpg"><a href="/page">Link</a></body></html>"#,
            "https://example.com/subdir/",
            &[],
            &[],
            false,
        );
        assert_eq!(
            result,
            r#"<html><body><img src="https://example.com/image.jpg"><a href="https://example.com/page">Link</a></body></html>"#
        );
    }

    /// `only_main_content` drops `header` and `footer` but keeps `main`.
    #[test]
    fn test_transform_html_only_main_content() {
        let result = transform(
            "<html><body><header>Header</header><main><p>Main content</p></main><footer>Footer</footer></body></html>",
            "https://example.com",
            &[],
            &[],
            true,
        );
        assert_eq!(
            result,
            "<html><body><main><p>Main content</p></main></body></html>"
        );
    }

    /// The largest `srcset` candidate replaces `src`, which is then made absolute.
    #[test]
    fn test_transform_html_srcset_processing() {
        let result = transform(
            r#"<html><body><img srcset="/small.jpg 1x, /large.jpg 2x" src="/default.jpg"></body></html>"#,
            "https://example.com",
            &[],
            &[],
            false,
        );
        assert_eq!(
            result,
            r#"<html><body><img srcset="/small.jpg 1x, /large.jpg 2x" src="https://example.com/large.jpg"></body></html>"#
        );
    }

    /// An excluded container holding a force-included element is kept; a plain
    /// excluded container is removed.
    #[test]
    fn test_transform_html_force_include_tags() {
        let result = transform(
            r#"<html><body><div class="widget"><div id="main"><p>Important content</p></div></div><div class="sidebar">Sidebar</div></body></html>"#,
            "https://example.com",
            &[],
            &[],
            true,
        );
        assert_eq!(
            result,
            r#"<html><body><div class="widget"><div id="main"><p>Important content</p></div></div></body></html>"#
        );
    }
}