1use lazy_static::lazy_static;
9use regex::Regex;
10use scraper::{Html, Selector, ElementRef};
11use std::collections::HashSet;
12use url::Url;
13
14use crate::types::{
15 EmbeddedMedia, EmbedPlatform, MediaResult,
16};
17
18lazy_static! {
23 static ref GOOGLE_MAPS: Regex = Regex::new(
25 r"google\.com/maps|maps\.google\."
26 ).unwrap();
27
28 static ref TWITTER: Regex = Regex::new(
30 r"twitter\.com|x\.com|platform\.twitter"
31 ).unwrap();
32
33 static ref INSTAGRAM: Regex = Regex::new(
35 r"instagram\.com"
36 ).unwrap();
37
38 static ref FACEBOOK: Regex = Regex::new(
40 r"facebook\.com|fb\.com"
41 ).unwrap();
42
43 static ref LINKEDIN: Regex = Regex::new(
45 r"linkedin\.com"
46 ).unwrap();
47
48 static ref PINTEREST: Regex = Regex::new(
50 r"pinterest\.com"
51 ).unwrap();
52
53 static ref TIKTOK: Regex = Regex::new(
55 r"tiktok\.com"
56 ).unwrap();
57
58 static ref REDDIT: Regex = Regex::new(
60 r"reddit\.com|redd\.it"
61 ).unwrap();
62
63 static ref CODEPEN: Regex = Regex::new(
65 r"codepen\.io"
66 ).unwrap();
67
68 static ref JSFIDDLE: Regex = Regex::new(
70 r"jsfiddle\.net"
71 ).unwrap();
72
73 static ref CODESANDBOX: Regex = Regex::new(
75 r"codesandbox\.io"
76 ).unwrap();
77
78 static ref GIPHY: Regex = Regex::new(
80 r"giphy\.com"
81 ).unwrap();
82
83 static ref SLIDESHARE: Regex = Regex::new(
85 r"slideshare\.net"
86 ).unwrap();
87
88 static ref TYPEFORM: Regex = Regex::new(
90 r"typeform\.com"
91 ).unwrap();
92
93 static ref CALENDLY: Regex = Regex::new(
95 r"calendly\.com"
96 ).unwrap();
97
98 static ref STRIPE: Regex = Regex::new(
100 r"stripe\.com"
101 ).unwrap();
102
103 static ref PAYPAL: Regex = Regex::new(
105 r"paypal\.com"
106 ).unwrap();
107}
108
109pub fn extract_embeds(document: &Html, base_url: Option<&Url>) -> Vec<EmbeddedMedia> {
115 let mut embeds = Vec::new();
116 let mut seen_urls: HashSet<String> = HashSet::new();
117
118 if let Ok(sel) = Selector::parse("iframe[src]") {
120 for el in document.select(&sel) {
121 if let Some(embed) = extract_iframe(&el, base_url) {
122 let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
123 if seen_urls.insert(key) {
124 embeds.push(embed);
125 }
126 }
127 }
128 }
129
130 if let Ok(sel) = Selector::parse("object[data]") {
132 for el in document.select(&sel) {
133 if let Some(embed) = extract_object(&el, base_url) {
134 let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
135 if seen_urls.insert(key) {
136 embeds.push(embed);
137 }
138 }
139 }
140 }
141
142 if let Ok(sel) = Selector::parse("embed[src]") {
144 for el in document.select(&sel) {
145 if let Some(embed) = extract_embed_tag(&el, base_url) {
146 let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
147 if seen_urls.insert(key) {
148 embeds.push(embed);
149 }
150 }
151 }
152 }
153
154 extract_social_embeds(document, base_url, &mut embeds, &mut seen_urls);
156
157 embeds
158}
159
160fn extract_iframe(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
162 let src = el.value().attr("src")?;
163
164 if src.is_empty() || src.starts_with("javascript:") || src.starts_with("about:") {
166 return None;
167 }
168
169 let absolute_url = resolve_url(src, base_url);
170 let platform = detect_embed_platform(src);
171
172 if is_video_platform(&platform) {
174 return None;
175 }
176
177 let width = el.value().attr("width")
179 .and_then(parse_dimension);
180 let height = el.value().attr("height")
181 .and_then(parse_dimension);
182
183 Some(EmbeddedMedia {
184 url: src.to_string(),
185 absolute_url,
186 platform,
187 title: el.value().attr("title").map(|s| s.to_string()),
188 width,
189 height,
190 allow: el.value().attr("allow").map(|s| s.to_string()),
191 sandbox: el.value().attr("sandbox").map(|s| s.to_string()),
192 loading: el.value().attr("loading").map(|s| s.to_string()),
193 frameborder: el.value().attr("frameborder").map(|s| s.to_string()),
194 })
195}
196
197fn extract_object(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
199 let data = el.value().attr("data")?;
200
201 if data.to_lowercase().contains(".pdf") {
203 return None;
204 }
205
206 let absolute_url = resolve_url(data, base_url);
207 let platform = detect_embed_platform(data);
208
209 let width = el.value().attr("width")
210 .and_then(parse_dimension);
211 let height = el.value().attr("height")
212 .and_then(parse_dimension);
213
214 Some(EmbeddedMedia {
215 url: data.to_string(),
216 absolute_url,
217 platform,
218 title: el.value().attr("title").map(|s| s.to_string()),
219 width,
220 height,
221 ..Default::default()
222 })
223}
224
225fn extract_embed_tag(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
227 let src = el.value().attr("src")?;
228
229 if src.to_lowercase().contains(".pdf") {
231 return None;
232 }
233
234 let absolute_url = resolve_url(src, base_url);
235 let platform = detect_embed_platform(src);
236
237 if is_video_platform(&platform) {
238 return None;
239 }
240
241 let width = el.value().attr("width")
242 .and_then(parse_dimension);
243 let height = el.value().attr("height")
244 .and_then(parse_dimension);
245
246 Some(EmbeddedMedia {
247 url: src.to_string(),
248 absolute_url,
249 platform,
250 title: None,
251 width,
252 height,
253 ..Default::default()
254 })
255}
256
257fn extract_social_embeds(
259 document: &Html,
260 _base_url: Option<&Url>,
261 embeds: &mut Vec<EmbeddedMedia>,
262 seen_urls: &mut HashSet<String>,
263) {
264 if let Ok(sel) = Selector::parse("blockquote.twitter-tweet") {
266 for el in document.select(&sel) {
267 if let Ok(link_sel) = Selector::parse("a") {
268 for link in el.select(&link_sel) {
269 if let Some(href) = link.value().attr("href") {
270 if TWITTER.is_match(href) && seen_urls.insert(href.to_string()) {
271 embeds.push(EmbeddedMedia {
272 url: href.to_string(),
273 absolute_url: Some(href.to_string()),
274 platform: EmbedPlatform::Twitter,
275 ..Default::default()
276 });
277 break;
278 }
279 }
280 }
281 }
282 }
283 }
284
285 if let Ok(sel) = Selector::parse("blockquote.instagram-media") {
287 for el in document.select(&sel) {
288 if let Some(permalink) = el.value().attr("data-instgrm-permalink") {
289 if seen_urls.insert(permalink.to_string()) {
290 embeds.push(EmbeddedMedia {
291 url: permalink.to_string(),
292 absolute_url: Some(permalink.to_string()),
293 platform: EmbedPlatform::Instagram,
294 ..Default::default()
295 });
296 }
297 }
298 }
299 }
300
301 if let Ok(sel) = Selector::parse("div.fb-post, div.fb-video") {
303 for el in document.select(&sel) {
304 if let Some(href) = el.value().attr("data-href") {
305 if seen_urls.insert(href.to_string()) {
306 embeds.push(EmbeddedMedia {
307 url: href.to_string(),
308 absolute_url: Some(href.to_string()),
309 platform: EmbedPlatform::Facebook,
310 ..Default::default()
311 });
312 }
313 }
314 }
315 }
316
317 if let Ok(sel) = Selector::parse("blockquote.reddit-embed-bq") {
319 for el in document.select(&sel) {
320 if let Ok(link_sel) = Selector::parse("a") {
321 for link in el.select(&link_sel) {
322 if let Some(href) = link.value().attr("href") {
323 if REDDIT.is_match(href) && seen_urls.insert(href.to_string()) {
324 embeds.push(EmbeddedMedia {
325 url: href.to_string(),
326 absolute_url: Some(href.to_string()),
327 platform: EmbedPlatform::Reddit,
328 ..Default::default()
329 });
330 break;
331 }
332 }
333 }
334 }
335 }
336 }
337}
338
339pub fn detect_embed_platform(url: &str) -> EmbedPlatform {
341 if GOOGLE_MAPS.is_match(url) { return EmbedPlatform::GoogleMaps; }
342 if TWITTER.is_match(url) { return EmbedPlatform::Twitter; }
343 if INSTAGRAM.is_match(url) { return EmbedPlatform::Instagram; }
344 if FACEBOOK.is_match(url) { return EmbedPlatform::Facebook; }
345 if LINKEDIN.is_match(url) { return EmbedPlatform::LinkedIn; }
346 if PINTEREST.is_match(url) { return EmbedPlatform::Pinterest; }
347 if TIKTOK.is_match(url) { return EmbedPlatform::TikTok; }
348 if REDDIT.is_match(url) { return EmbedPlatform::Reddit; }
349 if CODEPEN.is_match(url) { return EmbedPlatform::CodePen; }
350 if JSFIDDLE.is_match(url) { return EmbedPlatform::JsFiddle; }
351 if CODESANDBOX.is_match(url) { return EmbedPlatform::CodeSandbox; }
352 if GIPHY.is_match(url) { return EmbedPlatform::Giphy; }
353 if SLIDESHARE.is_match(url) { return EmbedPlatform::SlideShare; }
354 if TYPEFORM.is_match(url) { return EmbedPlatform::Typeform; }
355 if CALENDLY.is_match(url) { return EmbedPlatform::Calendly; }
356 if STRIPE.is_match(url) { return EmbedPlatform::Stripe; }
357 if PAYPAL.is_match(url) { return EmbedPlatform::PayPal; }
358
359 EmbedPlatform::Other
360}
361
362fn is_video_platform(platform: &EmbedPlatform) -> bool {
364 matches!(platform,
365 EmbedPlatform::YouTube |
366 EmbedPlatform::Vimeo |
367 EmbedPlatform::Dailymotion |
368 EmbedPlatform::Twitch |
369 EmbedPlatform::Wistia |
370 EmbedPlatform::Spotify |
371 EmbedPlatform::SoundCloud |
372 EmbedPlatform::ApplePodcasts
373 )
374}
375
/// Parses an HTML `width`/`height` attribute value into a whole number.
///
/// Surrounding whitespace is ignored, then the leading run of ASCII digits is parsed and
/// anything after it (`px`, `PX`, `%`, a fractional part, stray text) is discarded, in the
/// spirit of the HTML rules for parsing dimension values. A single leading `+` is
/// tolerated. Percentages are returned as their bare number (`"100%"` -> `100`).
///
/// Returns `None` when the value does not start with a digit or does not fit in a `u32`.
fn parse_dimension(s: &str) -> Option<u32> {
    let value = s.trim();
    let value = value.strip_prefix('+').unwrap_or(value);

    // ASCII digits are one byte each, so the count is also a valid slice boundary.
    let digit_count = value.bytes().take_while(u8::is_ascii_digit).count();

    // An empty digit run fails to parse and therefore yields `None`.
    value[..digit_count].parse().ok()
}
384
385fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
387 if href.starts_with("http://") || href.starts_with("https://") {
388 return Some(href.to_string());
389 }
390
391 if href.starts_with("//") {
392 return Some(format!("https:{}", href));
393 }
394
395 base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
396}
397
398pub fn extract_embeds_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<EmbeddedMedia>> {
404 let document = Html::parse_document(html);
405 let base = base_url.and_then(|u| Url::parse(u).ok());
406 Ok(extract_embeds(&document, base.as_ref()))
407}
408
409pub fn get_embed_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
411 extract_embeds_from_html(html, base_url)
412 .unwrap_or_default()
413 .into_iter()
414 .filter_map(|e| e.absolute_url)
415 .collect()
416}
417
418pub fn has_embeds(document: &Html) -> bool {
420 if let Ok(sel) = Selector::parse("iframe[src], object[data], embed[src]") {
421 document.select(&sel).next().is_some()
422 } else {
423 false
424 }
425}
426
427pub fn filter_by_platform(embeds: &[EmbeddedMedia], platform: EmbedPlatform) -> Vec<&EmbeddedMedia> {
429 embeds.iter()
430 .filter(|e| e.platform == platform)
431 .collect()
432}
433
434pub fn get_maps(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
436 filter_by_platform(embeds, EmbedPlatform::GoogleMaps)
437}
438
439pub fn get_social_embeds(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
441 embeds.iter()
442 .filter(|e| matches!(e.platform,
443 EmbedPlatform::Twitter |
444 EmbedPlatform::Instagram |
445 EmbedPlatform::Facebook |
446 EmbedPlatform::LinkedIn |
447 EmbedPlatform::Pinterest |
448 EmbedPlatform::TikTok |
449 EmbedPlatform::Reddit
450 ))
451 .collect()
452}
453
454pub fn get_code_embeds(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
456 embeds.iter()
457 .filter(|e| matches!(e.platform,
458 EmbedPlatform::CodePen |
459 EmbedPlatform::JsFiddle |
460 EmbedPlatform::CodeSandbox
461 ))
462 .collect()
463}
464
465pub fn count_by_platform(embeds: &[EmbeddedMedia]) -> std::collections::HashMap<EmbedPlatform, usize> {
467 let mut counts = std::collections::HashMap::new();
468 for embed in embeds {
469 *counts.entry(embed.platform).or_insert(0) += 1;
470 }
471 counts
472}
473
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `markup` as a full document and extracts its embeds without a base URL.
    fn embeds_in(markup: &str) -> Vec<EmbeddedMedia> {
        extract_embeds(&Html::parse_document(markup), None)
    }

    /// Builds an otherwise-default embed tagged with `platform`.
    fn embed_on(platform: EmbedPlatform) -> EmbeddedMedia {
        EmbeddedMedia {
            platform,
            ..Default::default()
        }
    }

    #[test]
    fn test_extract_google_maps_iframe() {
        let found = embeds_in(
            r#"<iframe src="https://www.google.com/maps/embed?pb=..." width="600" height="450"></iframe>"#,
        );

        assert_eq!(found.len(), 1);
        let map = &found[0];
        assert_eq!(map.platform, EmbedPlatform::GoogleMaps);
        assert_eq!(map.width, Some(600));
        assert_eq!(map.height, Some(450));
    }

    #[test]
    fn test_extract_codepen_embed() {
        let found = embeds_in(
            r#"<iframe src="https://codepen.io/user/embed/pen" title="CodePen"></iframe>"#,
        );

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::CodePen);
        assert_eq!(found[0].title, Some("CodePen".to_string()));
    }

    #[test]
    fn test_detect_platform() {
        let cases = [
            ("https://www.google.com/maps/embed", EmbedPlatform::GoogleMaps),
            ("https://twitter.com/user/status/123", EmbedPlatform::Twitter),
            ("https://www.instagram.com/p/abc", EmbedPlatform::Instagram),
            ("https://codepen.io/user/pen/abc", EmbedPlatform::CodePen),
            ("https://example.com/widget", EmbedPlatform::Other),
        ];

        for (url, expected) in cases.iter() {
            assert_eq!(detect_embed_platform(url), *expected);
        }
    }

    #[test]
    fn test_extract_typeform() {
        let found = embeds_in(r#"<iframe src="https://form.typeform.com/to/abc123"></iframe>"#);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::Typeform);
    }

    #[test]
    fn test_extract_calendly() {
        let found = embeds_in(r#"<iframe src="https://calendly.com/user/meeting"></iframe>"#);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::Calendly);
    }

    #[test]
    fn test_skip_empty_src() {
        let found =
            embeds_in(r#"<iframe src=""></iframe><iframe src="javascript:void(0)"></iframe>"#);

        assert!(found.is_empty());
    }

    #[test]
    fn test_has_embeds() {
        let with_embed = Html::parse_document(r#"<iframe src="https://example.com"></iframe>"#);
        let without_embed = Html::parse_document(r#"<div>No embed</div>"#);

        assert!(has_embeds(&with_embed));
        assert!(!has_embeds(&without_embed));
    }

    #[test]
    fn test_parse_dimension() {
        assert_eq!(parse_dimension("600"), Some(600));
        assert_eq!(parse_dimension("600px"), Some(600));
        assert_eq!(parse_dimension("100%"), Some(100));
        assert_eq!(parse_dimension("invalid"), None);
    }

    #[test]
    fn test_get_social_embeds() {
        let mixed = vec![
            embed_on(EmbedPlatform::Twitter),
            embed_on(EmbedPlatform::GoogleMaps),
            embed_on(EmbedPlatform::Instagram),
        ];

        assert_eq!(get_social_embeds(&mixed).len(), 2);
    }

    #[test]
    fn test_get_code_embeds() {
        let mixed = vec![
            embed_on(EmbedPlatform::CodePen),
            embed_on(EmbedPlatform::JsFiddle),
            embed_on(EmbedPlatform::Twitter),
        ];

        assert_eq!(get_code_embeds(&mixed).len(), 2);
    }

    #[test]
    fn test_twitter_blockquote() {
        let found = embeds_in(
            r#"<blockquote class="twitter-tweet"><a href="https://twitter.com/user/status/123">Tweet</a></blockquote>"#,
        );

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::Twitter);
    }

    #[test]
    fn test_count_by_platform() {
        let mixed = vec![
            embed_on(EmbedPlatform::Twitter),
            embed_on(EmbedPlatform::Twitter),
            embed_on(EmbedPlatform::CodePen),
        ];

        let tally = count_by_platform(&mixed);
        assert_eq!(tally.get(&EmbedPlatform::Twitter), Some(&2));
        assert_eq!(tally.get(&EmbedPlatform::CodePen), Some(&1));
    }
}