1use scraper::Html;
11use url::Url;
12
13use crate::types::{
14 ExtractedMedia, MediaConfig, MediaResult, MediaType,
15 ImageMedia, VideoMedia, AudioMedia, DocumentMedia, EmbeddedMedia,
16};
17use crate::{images, videos, audio, documents, embedded};
18
19#[derive(Debug, Clone)]
25#[derive(Default)]
26pub struct MediaExtractor {
27 config: MediaConfig,
28 base_url: Option<Url>,
29}
30
31
32impl MediaExtractor {
33 pub fn new() -> Self {
35 Self::default()
36 }
37
38 pub fn with_config(config: MediaConfig) -> Self {
40 Self {
41 config,
42 base_url: None,
43 }
44 }
45
46 pub fn with_base_url(mut self, url: &str) -> Self {
48 self.base_url = Url::parse(url).ok();
49 self
50 }
51
52 pub fn with_base(mut self, url: Url) -> Self {
54 self.base_url = Some(url);
55 self
56 }
57
58 pub fn config(&self) -> &MediaConfig {
60 &self.config
61 }
62
63 pub fn base_url(&self) -> Option<&Url> {
65 self.base_url.as_ref()
66 }
67
68 pub fn extract_all(&self, html: &str) -> MediaResult<ExtractedMedia> {
74 let document = Html::parse_document(html);
75 self.extract_from_document(&document)
76 }
77
78 pub fn extract_from_document(&self, document: &Html) -> MediaResult<ExtractedMedia> {
80 let mut result = ExtractedMedia::default();
81
82 if self.config.extract_images {
84 let mut images = images::extract_images(document, self.base_url.as_ref());
85
86 if self.config.filter_placeholders {
88 images = images::filter_placeholders(images);
89 }
90 result.images = images;
91 }
92
93 if self.config.extract_videos {
94 result.videos = videos::extract_videos(document, self.base_url.as_ref());
95 }
96
97 if self.config.extract_audio {
98 result.audio = audio::extract_audio(document, self.base_url.as_ref());
99 }
100
101 if self.config.extract_documents {
102 result.documents = documents::extract_documents(document, self.base_url.as_ref());
103 }
104
105 if self.config.extract_embeds {
106 result.embeds = embedded::extract_embeds(document, self.base_url.as_ref());
107 }
108
109 Ok(result)
110 }
111
112 pub fn extract_images(&self, html: &str) -> MediaResult<Vec<ImageMedia>> {
118 let document = Html::parse_document(html);
119 Ok(images::extract_images(&document, self.base_url.as_ref()))
120 }
121
122 pub fn extract_videos(&self, html: &str) -> MediaResult<Vec<VideoMedia>> {
124 let document = Html::parse_document(html);
125 Ok(videos::extract_videos(&document, self.base_url.as_ref()))
126 }
127
128 pub fn extract_audio(&self, html: &str) -> MediaResult<Vec<AudioMedia>> {
130 let document = Html::parse_document(html);
131 Ok(audio::extract_audio(&document, self.base_url.as_ref()))
132 }
133
134 pub fn extract_documents(&self, html: &str) -> MediaResult<Vec<DocumentMedia>> {
136 let document = Html::parse_document(html);
137 Ok(documents::extract_documents(&document, self.base_url.as_ref()))
138 }
139
140 pub fn extract_embeds(&self, html: &str) -> MediaResult<Vec<EmbeddedMedia>> {
142 let document = Html::parse_document(html);
143 Ok(embedded::extract_embeds(&document, self.base_url.as_ref()))
144 }
145
146 pub fn get_all_urls(&self, html: &str) -> Vec<String> {
152 let extracted = self.extract_all(html).unwrap_or_default();
153 extracted.all_urls()
154 }
155
156 pub fn get_urls_by_type(&self, html: &str, media_type: MediaType) -> Vec<String> {
158 match media_type {
159 MediaType::Image => images::get_image_urls(html, self.base_url.as_ref().map(|u| u.as_str())),
160 MediaType::Video => videos::get_video_urls(html, self.base_url.as_ref().map(|u| u.as_str())),
161 MediaType::Audio => audio::get_audio_urls(html, self.base_url.as_ref().map(|u| u.as_str())),
162 MediaType::Document => documents::get_document_urls(html, self.base_url.as_ref().map(|u| u.as_str())),
163 MediaType::Embedded => embedded::get_embed_urls(html, self.base_url.as_ref().map(|u| u.as_str())),
164 MediaType::Other => Vec::new(),
165 }
166 }
167
168 pub fn has_media(&self, html: &str) -> bool {
174 let document = Html::parse_document(html);
175 images::has_images(&document) ||
176 videos::has_videos(&document) ||
177 audio::has_audio(&document) ||
178 documents::has_documents(&document) ||
179 embedded::has_embeds(&document)
180 }
181
182 pub fn has_media_type(&self, html: &str, media_type: MediaType) -> bool {
184 let document = Html::parse_document(html);
185 match media_type {
186 MediaType::Image => images::has_images(&document),
187 MediaType::Video => videos::has_videos(&document),
188 MediaType::Audio => audio::has_audio(&document),
189 MediaType::Document => documents::has_documents(&document),
190 MediaType::Embedded => embedded::has_embeds(&document),
191 MediaType::Other => false,
192 }
193 }
194
195 pub fn count_media(&self, html: &str) -> MediaCounts {
201 let extracted = self.extract_all(html).unwrap_or_default();
202 MediaCounts {
203 images: extracted.images.len(),
204 videos: extracted.videos.len(),
205 audio: extracted.audio.len(),
206 documents: extracted.documents.len(),
207 embeds: extracted.embeds.len(),
208 total: extracted.total_count(),
209 }
210 }
211}
212
213#[derive(Debug, Clone, Default)]
219pub struct MediaCounts {
220 pub images: usize,
221 pub videos: usize,
222 pub audio: usize,
223 pub documents: usize,
224 pub embeds: usize,
225 pub total: usize,
226}
227
228impl MediaCounts {
229 pub fn has_any(&self) -> bool {
231 self.total > 0
232 }
233
234 pub fn has_type(&self, media_type: MediaType) -> bool {
236 match media_type {
237 MediaType::Image => self.images > 0,
238 MediaType::Video => self.videos > 0,
239 MediaType::Audio => self.audio > 0,
240 MediaType::Document => self.documents > 0,
241 MediaType::Embedded => self.embeds > 0,
242 MediaType::Other => false,
243 }
244 }
245}
246
247#[derive(Debug, Clone, Default)]
253pub struct MediaExtractorBuilder {
254 config: MediaConfig,
255 base_url: Option<String>,
256}
257
258impl MediaExtractorBuilder {
259 pub fn new() -> Self {
260 Self::default()
261 }
262
263 pub fn extract_images(mut self, enabled: bool) -> Self {
264 self.config.extract_images = enabled;
265 self
266 }
267
268 pub fn extract_videos(mut self, enabled: bool) -> Self {
269 self.config.extract_videos = enabled;
270 self
271 }
272
273 pub fn extract_audio(mut self, enabled: bool) -> Self {
274 self.config.extract_audio = enabled;
275 self
276 }
277
278 pub fn extract_documents(mut self, enabled: bool) -> Self {
279 self.config.extract_documents = enabled;
280 self
281 }
282
283 pub fn extract_embeds(mut self, enabled: bool) -> Self {
284 self.config.extract_embeds = enabled;
285 self
286 }
287
288 pub fn filter_placeholders(mut self, enabled: bool) -> Self {
289 self.config.filter_placeholders = enabled;
290 self
291 }
292
293 pub fn include_data_urls(mut self, enabled: bool) -> Self {
294 self.config.include_data_urls = enabled;
295 self
296 }
297
298 pub fn min_image_size(mut self, width: u32, height: u32) -> Self {
299 self.config.min_image_width = Some(width);
300 self.config.min_image_height = Some(height);
301 self
302 }
303
304 pub fn base_url(mut self, url: &str) -> Self {
305 self.base_url = Some(url.to_string());
306 self
307 }
308
309 pub fn build(self) -> MediaExtractor {
310 let mut extractor = MediaExtractor::with_config(self.config);
311 if let Some(url) = self.base_url {
312 extractor = extractor.with_base_url(&url);
313 }
314 extractor
315 }
316}
317
318pub fn extract_media(html: &str, base_url: Option<&str>) -> MediaResult<ExtractedMedia> {
324 let mut extractor = MediaExtractor::new();
325 if let Some(url) = base_url {
326 extractor = extractor.with_base_url(url);
327 }
328 extractor.extract_all(html)
329}
330
331pub fn has_any_media(html: &str) -> bool {
333 MediaExtractor::new().has_media(html)
334}
335
336pub fn count_all_media(html: &str) -> MediaCounts {
338 MediaExtractor::new().count_media(html)
339}
340
341pub fn get_all_media_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
343 let mut extractor = MediaExtractor::new();
344 if let Some(url) = base_url {
345 extractor = extractor.with_base_url(url);
346 }
347 extractor.get_all_urls(html)
348}
349
#[cfg(test)]
mod tests {
    use super::*;

    /// Page containing one item of each media category.
    const TEST_HTML: &str = r#"
        <html>
        <body>
            <img src="https://example.com/image.jpg" alt="Test">
            <video src="https://example.com/video.mp4"></video>
            <audio src="https://example.com/audio.mp3"></audio>
            <a href="https://example.com/doc.pdf">PDF</a>
            <iframe src="https://www.google.com/maps/embed"></iframe>
        </body>
        </html>
    "#;

    /// Markup that contains no media at all.
    const NO_MEDIA_HTML: &str = "<div>No media</div>";

    #[test]
    fn test_extract_all() {
        let media = MediaExtractor::new().extract_all(TEST_HTML).unwrap();

        assert!(!media.images.is_empty());
        assert!(!media.videos.is_empty());
        assert!(!media.audio.is_empty());
        assert!(!media.documents.is_empty());
        assert!(!media.embeds.is_empty());
    }

    #[test]
    fn test_extract_with_base_url() {
        let found = MediaExtractor::new()
            .with_base_url("https://example.com")
            .extract_images(r#"<img src="/images/test.jpg">"#)
            .unwrap();

        assert_eq!(found.len(), 1);
        assert_eq!(
            found[0].absolute_url,
            Some("https://example.com/images/test.jpg".to_string())
        );
    }

    #[test]
    fn test_config_disable_types() {
        // Only images stay enabled; every other category is switched off.
        let images_only = MediaConfig {
            extract_images: true,
            extract_videos: false,
            extract_audio: false,
            extract_documents: false,
            extract_embeds: false,
            ..Default::default()
        };

        let media = MediaExtractor::with_config(images_only)
            .extract_all(TEST_HTML)
            .unwrap();

        assert!(!media.images.is_empty());
        assert!(media.videos.is_empty());
        assert!(media.audio.is_empty());
    }

    #[test]
    fn test_has_media() {
        let extractor = MediaExtractor::new();

        assert!(extractor.has_media(TEST_HTML));
        assert!(!extractor.has_media(NO_MEDIA_HTML));
    }

    #[test]
    fn test_has_media_type() {
        let extractor = MediaExtractor::new();

        assert!(extractor.has_media_type(TEST_HTML, MediaType::Image));
        assert!(extractor.has_media_type(TEST_HTML, MediaType::Video));
        assert!(!extractor.has_media_type(NO_MEDIA_HTML, MediaType::Image));
    }

    #[test]
    fn test_count_media() {
        let tally = MediaExtractor::new().count_media(TEST_HTML);

        assert!(tally.has_any());
        assert!(tally.images > 0);
        assert!(tally.total > 0);
    }

    #[test]
    fn test_get_all_urls() {
        let all = MediaExtractor::new().get_all_urls(TEST_HTML);

        assert!(!all.is_empty());
        assert!(all.iter().any(|url| url.contains("image.jpg")));
        assert!(all.iter().any(|url| url.contains("video.mp4")));
    }

    #[test]
    fn test_get_urls_by_type() {
        let image_urls = MediaExtractor::new().get_urls_by_type(TEST_HTML, MediaType::Image);

        assert!(!image_urls.is_empty());
        assert!(image_urls.iter().all(|url| url.contains("image")));
    }

    #[test]
    fn test_builder() {
        let built = MediaExtractorBuilder::new()
            .extract_images(true)
            .extract_videos(false)
            .filter_placeholders(true)
            .base_url("https://example.com")
            .build();

        let cfg = built.config();
        assert!(cfg.extract_images);
        assert!(!cfg.extract_videos);
        assert!(built.base_url().is_some());
    }

    #[test]
    fn test_convenience_functions() {
        assert!(has_any_media(TEST_HTML));
        assert!(count_all_media(TEST_HTML).has_any());
        assert!(!get_all_media_urls(TEST_HTML, None).is_empty());
    }

    #[test]
    fn test_media_counts() {
        let tally = MediaCounts {
            images: 5,
            videos: 2,
            audio: 1,
            documents: 3,
            embeds: 2,
            total: 13,
        };

        assert!(tally.has_any());
        assert!(tally.has_type(MediaType::Image));
        assert!(tally.has_type(MediaType::Video));
        assert!(!tally.has_type(MediaType::Other));
    }

    #[test]
    fn test_extract_result_methods() {
        let media = MediaExtractor::new().extract_all(TEST_HTML).unwrap();

        assert!(media.total_count() > 0);
        assert!(!media.all_urls().is_empty());
        assert!(!media.is_empty());
    }
}