halldyll_media/
embedded.rs

1//! Embedded content extraction for halldyll-media
2//!
3//! Extracts embedded content from HTML with support for:
4//! - iframes (maps, social, widgets)
5//! - Objects and embeds
6//! - Platform detection (Google Maps, Twitter, Instagram, etc.)
7
8use lazy_static::lazy_static;
9use regex::Regex;
10use scraper::{Html, Selector, ElementRef};
11use std::collections::HashSet;
12use url::Url;
13
14use crate::types::{
15    EmbeddedMedia, EmbedPlatform, MediaResult,
16};
17
18// ============================================================================
19// REGEX PATTERNS
20// ============================================================================
21
22lazy_static! {
23    /// Google Maps URL pattern
24    static ref GOOGLE_MAPS: Regex = Regex::new(
25        r"google\.com/maps|maps\.google\."
26    ).unwrap();
27    
28    /// Twitter/X embed pattern
29    static ref TWITTER: Regex = Regex::new(
30        r"twitter\.com|x\.com|platform\.twitter"
31    ).unwrap();
32    
33    /// Instagram embed pattern
34    static ref INSTAGRAM: Regex = Regex::new(
35        r"instagram\.com"
36    ).unwrap();
37    
38    /// Facebook embed pattern
39    static ref FACEBOOK: Regex = Regex::new(
40        r"facebook\.com|fb\.com"
41    ).unwrap();
42    
43    /// LinkedIn embed pattern
44    static ref LINKEDIN: Regex = Regex::new(
45        r"linkedin\.com"
46    ).unwrap();
47    
48    /// Pinterest embed pattern
49    static ref PINTEREST: Regex = Regex::new(
50        r"pinterest\.com"
51    ).unwrap();
52    
53    /// TikTok embed pattern
54    static ref TIKTOK: Regex = Regex::new(
55        r"tiktok\.com"
56    ).unwrap();
57    
58    /// Reddit embed pattern
59    static ref REDDIT: Regex = Regex::new(
60        r"reddit\.com|redd\.it"
61    ).unwrap();
62    
63    /// CodePen embed pattern
64    static ref CODEPEN: Regex = Regex::new(
65        r"codepen\.io"
66    ).unwrap();
67    
68    /// JSFiddle embed pattern
69    static ref JSFIDDLE: Regex = Regex::new(
70        r"jsfiddle\.net"
71    ).unwrap();
72    
73    /// CodeSandbox embed pattern
74    static ref CODESANDBOX: Regex = Regex::new(
75        r"codesandbox\.io"
76    ).unwrap();
77    
78    /// Giphy embed pattern
79    static ref GIPHY: Regex = Regex::new(
80        r"giphy\.com"
81    ).unwrap();
82    
83    /// SlideShare embed pattern
84    static ref SLIDESHARE: Regex = Regex::new(
85        r"slideshare\.net"
86    ).unwrap();
87    
88    /// Typeform embed pattern
89    static ref TYPEFORM: Regex = Regex::new(
90        r"typeform\.com"
91    ).unwrap();
92    
93    /// Calendly embed pattern
94    static ref CALENDLY: Regex = Regex::new(
95        r"calendly\.com"
96    ).unwrap();
97    
98    /// Stripe embed pattern
99    static ref STRIPE: Regex = Regex::new(
100        r"stripe\.com"
101    ).unwrap();
102    
103    /// PayPal embed pattern
104    static ref PAYPAL: Regex = Regex::new(
105        r"paypal\.com"
106    ).unwrap();
107}
108
109// ============================================================================
110// EXTRACTION FUNCTIONS
111// ============================================================================
112
113/// Extract all embedded content from HTML document
114pub fn extract_embeds(document: &Html, base_url: Option<&Url>) -> Vec<EmbeddedMedia> {
115    let mut embeds = Vec::new();
116    let mut seen_urls: HashSet<String> = HashSet::new();
117    
118    // Extract from iframes
119    if let Ok(sel) = Selector::parse("iframe[src]") {
120        for el in document.select(&sel) {
121            if let Some(embed) = extract_iframe(&el, base_url) {
122                let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
123                if seen_urls.insert(key) {
124                    embeds.push(embed);
125                }
126            }
127        }
128    }
129    
130    // Extract from object elements
131    if let Ok(sel) = Selector::parse("object[data]") {
132        for el in document.select(&sel) {
133            if let Some(embed) = extract_object(&el, base_url) {
134                let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
135                if seen_urls.insert(key) {
136                    embeds.push(embed);
137                }
138            }
139        }
140    }
141    
142    // Extract from embed elements
143    if let Ok(sel) = Selector::parse("embed[src]") {
144        for el in document.select(&sel) {
145            if let Some(embed) = extract_embed_tag(&el, base_url) {
146                let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
147                if seen_urls.insert(key) {
148                    embeds.push(embed);
149                }
150            }
151        }
152    }
153    
154    // Extract social embeds (blockquote/div with data attributes)
155    extract_social_embeds(document, base_url, &mut embeds, &mut seen_urls);
156    
157    embeds
158}
159
160/// Extract iframe element
161fn extract_iframe(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
162    let src = el.value().attr("src")?;
163    
164    // Skip empty or javascript URLs
165    if src.is_empty() || src.starts_with("javascript:") || src.starts_with("about:") {
166        return None;
167    }
168    
169    let absolute_url = resolve_url(src, base_url);
170    let platform = detect_embed_platform(src);
171    
172    // Skip video platforms (handled by videos.rs)
173    if is_video_platform(&platform) {
174        return None;
175    }
176    
177    // Parse dimensions
178    let width = el.value().attr("width")
179        .and_then(parse_dimension);
180    let height = el.value().attr("height")
181        .and_then(parse_dimension);
182    
183    Some(EmbeddedMedia {
184        url: src.to_string(),
185        absolute_url,
186        platform,
187        title: el.value().attr("title").map(|s| s.to_string()),
188        width,
189        height,
190        allow: el.value().attr("allow").map(|s| s.to_string()),
191        sandbox: el.value().attr("sandbox").map(|s| s.to_string()),
192        loading: el.value().attr("loading").map(|s| s.to_string()),
193        frameborder: el.value().attr("frameborder").map(|s| s.to_string()),
194    })
195}
196
197/// Extract object element
198fn extract_object(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
199    let data = el.value().attr("data")?;
200    
201    // Skip PDFs (handled by documents.rs)
202    if data.to_lowercase().contains(".pdf") {
203        return None;
204    }
205    
206    let absolute_url = resolve_url(data, base_url);
207    let platform = detect_embed_platform(data);
208    
209    let width = el.value().attr("width")
210        .and_then(parse_dimension);
211    let height = el.value().attr("height")
212        .and_then(parse_dimension);
213    
214    Some(EmbeddedMedia {
215        url: data.to_string(),
216        absolute_url,
217        platform,
218        title: el.value().attr("title").map(|s| s.to_string()),
219        width,
220        height,
221        ..Default::default()
222    })
223}
224
225/// Extract embed element
226fn extract_embed_tag(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
227    let src = el.value().attr("src")?;
228    
229    // Skip PDFs and videos
230    if src.to_lowercase().contains(".pdf") {
231        return None;
232    }
233    
234    let absolute_url = resolve_url(src, base_url);
235    let platform = detect_embed_platform(src);
236    
237    if is_video_platform(&platform) {
238        return None;
239    }
240    
241    let width = el.value().attr("width")
242        .and_then(parse_dimension);
243    let height = el.value().attr("height")
244        .and_then(parse_dimension);
245    
246    Some(EmbeddedMedia {
247        url: src.to_string(),
248        absolute_url,
249        platform,
250        title: None,
251        width,
252        height,
253        ..Default::default()
254    })
255}
256
257/// Extract social embeds (Twitter, Instagram, etc.)
258fn extract_social_embeds(
259    document: &Html,
260    _base_url: Option<&Url>,
261    embeds: &mut Vec<EmbeddedMedia>,
262    seen_urls: &mut HashSet<String>,
263) {
264    // Twitter embeds
265    if let Ok(sel) = Selector::parse("blockquote.twitter-tweet") {
266        for el in document.select(&sel) {
267            if let Ok(link_sel) = Selector::parse("a") {
268                for link in el.select(&link_sel) {
269                    if let Some(href) = link.value().attr("href") {
270                        if TWITTER.is_match(href) && seen_urls.insert(href.to_string()) {
271                            embeds.push(EmbeddedMedia {
272                                url: href.to_string(),
273                                absolute_url: Some(href.to_string()),
274                                platform: EmbedPlatform::Twitter,
275                                ..Default::default()
276                            });
277                            break;
278                        }
279                    }
280                }
281            }
282        }
283    }
284    
285    // Instagram embeds
286    if let Ok(sel) = Selector::parse("blockquote.instagram-media") {
287        for el in document.select(&sel) {
288            if let Some(permalink) = el.value().attr("data-instgrm-permalink") {
289                if seen_urls.insert(permalink.to_string()) {
290                    embeds.push(EmbeddedMedia {
291                        url: permalink.to_string(),
292                        absolute_url: Some(permalink.to_string()),
293                        platform: EmbedPlatform::Instagram,
294                        ..Default::default()
295                    });
296                }
297            }
298        }
299    }
300    
301    // Facebook embeds
302    if let Ok(sel) = Selector::parse("div.fb-post, div.fb-video") {
303        for el in document.select(&sel) {
304            if let Some(href) = el.value().attr("data-href") {
305                if seen_urls.insert(href.to_string()) {
306                    embeds.push(EmbeddedMedia {
307                        url: href.to_string(),
308                        absolute_url: Some(href.to_string()),
309                        platform: EmbedPlatform::Facebook,
310                        ..Default::default()
311                    });
312                }
313            }
314        }
315    }
316    
317    // Reddit embeds
318    if let Ok(sel) = Selector::parse("blockquote.reddit-embed-bq") {
319        for el in document.select(&sel) {
320            if let Ok(link_sel) = Selector::parse("a") {
321                for link in el.select(&link_sel) {
322                    if let Some(href) = link.value().attr("href") {
323                        if REDDIT.is_match(href) && seen_urls.insert(href.to_string()) {
324                            embeds.push(EmbeddedMedia {
325                                url: href.to_string(),
326                                absolute_url: Some(href.to_string()),
327                                platform: EmbedPlatform::Reddit,
328                                ..Default::default()
329                            });
330                            break;
331                        }
332                    }
333                }
334            }
335        }
336    }
337}
338
339/// Detect embed platform from URL
340pub fn detect_embed_platform(url: &str) -> EmbedPlatform {
341    if GOOGLE_MAPS.is_match(url) { return EmbedPlatform::GoogleMaps; }
342    if TWITTER.is_match(url) { return EmbedPlatform::Twitter; }
343    if INSTAGRAM.is_match(url) { return EmbedPlatform::Instagram; }
344    if FACEBOOK.is_match(url) { return EmbedPlatform::Facebook; }
345    if LINKEDIN.is_match(url) { return EmbedPlatform::LinkedIn; }
346    if PINTEREST.is_match(url) { return EmbedPlatform::Pinterest; }
347    if TIKTOK.is_match(url) { return EmbedPlatform::TikTok; }
348    if REDDIT.is_match(url) { return EmbedPlatform::Reddit; }
349    if CODEPEN.is_match(url) { return EmbedPlatform::CodePen; }
350    if JSFIDDLE.is_match(url) { return EmbedPlatform::JsFiddle; }
351    if CODESANDBOX.is_match(url) { return EmbedPlatform::CodeSandbox; }
352    if GIPHY.is_match(url) { return EmbedPlatform::Giphy; }
353    if SLIDESHARE.is_match(url) { return EmbedPlatform::SlideShare; }
354    if TYPEFORM.is_match(url) { return EmbedPlatform::Typeform; }
355    if CALENDLY.is_match(url) { return EmbedPlatform::Calendly; }
356    if STRIPE.is_match(url) { return EmbedPlatform::Stripe; }
357    if PAYPAL.is_match(url) { return EmbedPlatform::PayPal; }
358    
359    EmbedPlatform::Other
360}
361
362/// Check if platform is a video platform (handled elsewhere)
363fn is_video_platform(platform: &EmbedPlatform) -> bool {
364    matches!(platform, 
365        EmbedPlatform::YouTube | 
366        EmbedPlatform::Vimeo | 
367        EmbedPlatform::Dailymotion |
368        EmbedPlatform::Twitch |
369        EmbedPlatform::Wistia |
370        EmbedPlatform::Spotify |
371        EmbedPlatform::SoundCloud |
372        EmbedPlatform::ApplePodcasts
373    )
374}
375
/// Parse an HTML `width`/`height` attribute value into a whole number.
///
/// Surrounding whitespace and one optional leading `+` are ignored, then the
/// leading run of ASCII digits is parsed. Whatever follows the digits is
/// discarded, so `600px`, `600PX`, `600 px`, `100%` and `600.5` all yield
/// their integer part. A percentage therefore comes back as its bare number,
/// not as a pixel size.
///
/// Returns `None` when the value does not start with a digit or when the
/// number does not fit in a `u32`.
fn parse_dimension(s: &str) -> Option<u32> {
    let value = s.trim();
    let value = value.strip_prefix('+').unwrap_or(value);

    // End of the leading digit run; the whole string if it is all digits.
    let digits_end = value
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(value.len());

    // An empty digit run fails to parse, which covers "", "px", "invalid", …
    value.get(..digits_end)?.parse().ok()
}
384
385/// Resolve relative URL
386fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
387    if href.starts_with("http://") || href.starts_with("https://") {
388        return Some(href.to_string());
389    }
390    
391    if href.starts_with("//") {
392        return Some(format!("https:{}", href));
393    }
394    
395    base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
396}
397
398// ============================================================================
399// CONVENIENCE FUNCTIONS
400// ============================================================================
401
402/// Extract embeds from HTML string
403pub fn extract_embeds_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<EmbeddedMedia>> {
404    let document = Html::parse_document(html);
405    let base = base_url.and_then(|u| Url::parse(u).ok());
406    Ok(extract_embeds(&document, base.as_ref()))
407}
408
409/// Get all embed URLs
410pub fn get_embed_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
411    extract_embeds_from_html(html, base_url)
412        .unwrap_or_default()
413        .into_iter()
414        .filter_map(|e| e.absolute_url)
415        .collect()
416}
417
418/// Check if HTML has embeds
419pub fn has_embeds(document: &Html) -> bool {
420    if let Ok(sel) = Selector::parse("iframe[src], object[data], embed[src]") {
421        document.select(&sel).next().is_some()
422    } else {
423        false
424    }
425}
426
427/// Filter embeds by platform
428pub fn filter_by_platform(embeds: &[EmbeddedMedia], platform: EmbedPlatform) -> Vec<&EmbeddedMedia> {
429    embeds.iter()
430        .filter(|e| e.platform == platform)
431        .collect()
432}
433
434/// Get Google Maps embeds
435pub fn get_maps(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
436    filter_by_platform(embeds, EmbedPlatform::GoogleMaps)
437}
438
439/// Get social embeds
440pub fn get_social_embeds(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
441    embeds.iter()
442        .filter(|e| matches!(e.platform,
443            EmbedPlatform::Twitter |
444            EmbedPlatform::Instagram |
445            EmbedPlatform::Facebook |
446            EmbedPlatform::LinkedIn |
447            EmbedPlatform::Pinterest |
448            EmbedPlatform::TikTok |
449            EmbedPlatform::Reddit
450        ))
451        .collect()
452}
453
454/// Get code embeds (CodePen, JSFiddle, etc.)
455pub fn get_code_embeds(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
456    embeds.iter()
457        .filter(|e| matches!(e.platform,
458            EmbedPlatform::CodePen |
459            EmbedPlatform::JsFiddle |
460            EmbedPlatform::CodeSandbox
461        ))
462        .collect()
463}
464
465/// Count embeds by platform
466pub fn count_by_platform(embeds: &[EmbeddedMedia]) -> std::collections::HashMap<EmbedPlatform, usize> {
467    let mut counts = std::collections::HashMap::new();
468    for embed in embeds {
469        *counts.entry(embed.platform).or_insert(0) += 1;
470    }
471    counts
472}
473
474// ============================================================================
475// TESTS
476// ============================================================================
477
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `markup` as a complete HTML document.
    fn parse_html(markup: &str) -> Html {
        Html::parse_document(markup)
    }

    /// Run the extractor over `markup` without a base URL.
    fn embeds_in(markup: &str) -> Vec<EmbeddedMedia> {
        extract_embeds(&parse_html(markup), None)
    }

    /// Build placeholder embeds that differ only in their platform.
    fn with_platforms(platforms: &[EmbedPlatform]) -> Vec<EmbeddedMedia> {
        platforms
            .iter()
            .map(|&platform| EmbeddedMedia { platform, ..Default::default() })
            .collect()
    }

    #[test]
    fn test_extract_google_maps_iframe() {
        let found = embeds_in(
            r#"<iframe src="https://www.google.com/maps/embed?pb=..." width="600" height="450"></iframe>"#,
        );

        assert_eq!(found.len(), 1);
        let map = &found[0];
        assert_eq!(map.platform, EmbedPlatform::GoogleMaps);
        assert_eq!(map.width, Some(600));
        assert_eq!(map.height, Some(450));
    }

    #[test]
    fn test_extract_codepen_embed() {
        let found = embeds_in(
            r#"<iframe src="https://codepen.io/user/embed/pen" title="CodePen"></iframe>"#,
        );

        assert_eq!(found.len(), 1);
        let pen = &found[0];
        assert_eq!(pen.platform, EmbedPlatform::CodePen);
        assert_eq!(pen.title.as_deref(), Some("CodePen"));
    }

    #[test]
    fn test_detect_platform() {
        let cases = [
            ("https://www.google.com/maps/embed", EmbedPlatform::GoogleMaps),
            ("https://twitter.com/user/status/123", EmbedPlatform::Twitter),
            ("https://www.instagram.com/p/abc", EmbedPlatform::Instagram),
            ("https://codepen.io/user/pen/abc", EmbedPlatform::CodePen),
            ("https://example.com/widget", EmbedPlatform::Other),
        ];

        for (url, expected) in cases.iter() {
            assert_eq!(detect_embed_platform(url), *expected, "url: {}", url);
        }
    }

    #[test]
    fn test_extract_typeform() {
        let found = embeds_in(r#"<iframe src="https://form.typeform.com/to/abc123"></iframe>"#);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::Typeform);
    }

    #[test]
    fn test_extract_calendly() {
        let found = embeds_in(r#"<iframe src="https://calendly.com/user/meeting"></iframe>"#);

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::Calendly);
    }

    #[test]
    fn test_skip_empty_src() {
        let found = embeds_in(
            r#"<iframe src=""></iframe><iframe src="javascript:void(0)"></iframe>"#,
        );

        assert!(found.is_empty());
    }

    #[test]
    fn test_has_embeds() {
        let page_with_iframe = parse_html(r#"<iframe src="https://example.com"></iframe>"#);
        let page_without = parse_html(r#"<div>No embed</div>"#);

        assert!(has_embeds(&page_with_iframe));
        assert!(!has_embeds(&page_without));
    }

    #[test]
    fn test_parse_dimension() {
        let cases = [
            ("600", Some(600)),
            ("600px", Some(600)),
            ("100%", Some(100)),
            ("invalid", None),
        ];

        for (raw, expected) in cases.iter() {
            assert_eq!(parse_dimension(raw), *expected, "input: {:?}", raw);
        }
    }

    #[test]
    fn test_get_social_embeds() {
        let mixed = with_platforms(&[
            EmbedPlatform::Twitter,
            EmbedPlatform::GoogleMaps,
            EmbedPlatform::Instagram,
        ]);

        assert_eq!(get_social_embeds(&mixed).len(), 2);
    }

    #[test]
    fn test_get_code_embeds() {
        let mixed = with_platforms(&[
            EmbedPlatform::CodePen,
            EmbedPlatform::JsFiddle,
            EmbedPlatform::Twitter,
        ]);

        assert_eq!(get_code_embeds(&mixed).len(), 2);
    }

    #[test]
    fn test_twitter_blockquote() {
        let found = embeds_in(
            r#"<blockquote class="twitter-tweet"><a href="https://twitter.com/user/status/123">Tweet</a></blockquote>"#,
        );

        assert_eq!(found.len(), 1);
        assert_eq!(found[0].platform, EmbedPlatform::Twitter);
    }

    #[test]
    fn test_count_by_platform() {
        let mixed = with_platforms(&[
            EmbedPlatform::Twitter,
            EmbedPlatform::Twitter,
            EmbedPlatform::CodePen,
        ]);

        let tally = count_by_platform(&mixed);
        assert_eq!(tally.get(&EmbedPlatform::Twitter).copied(), Some(2));
        assert_eq!(tally.get(&EmbedPlatform::CodePen).copied(), Some(1));
    }
}