halldyll_core/parse/
audios.rs

1//! Audios - Audio extraction
2
3use scraper::{Html, Selector};
4use url::Url;
5
6use crate::types::assets::{AudioAsset, AudioType};
7
/// Audio extractor.
///
/// A stateless unit type: it holds no configuration, so every instance is
/// interchangeable and copying it is free.
#[derive(Debug, Clone, Copy, Default)]
pub struct AudioExtractor;
16
17impl AudioExtractor {
18    /// New extractor
19    pub fn new() -> Self {
20        Self
21    }
22
23    /// Extract all audios
24    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<AudioAsset> {
25        let document = Html::parse_document(html);
26        let mut audios = Vec::new();
27
28        // <audio><source> tags
29        audios.extend(self.extract_audio_tags(&document, base_url));
30
31        // Deduplicate
32        audios.dedup_by(|a, b| a.url == b.url);
33
34        audios
35    }
36
37    /// Extract <audio> tags
38    fn extract_audio_tags(&self, document: &Html, base_url: &Url) -> Vec<AudioAsset> {
39        let audio_selector = Selector::parse("audio").unwrap();
40        let source_selector = Selector::parse("source").unwrap();
41        let mut audios = Vec::new();
42
43        for audio in document.select(&audio_selector) {
44            let attrs = audio.value();
45
46            // Direct src attribute
47            if let Some(src) = attrs.attr("src") {
48                if let Ok(url) = base_url.join(src) {
49                    let audio_type = self.detect_audio_type(&url);
50                    audios.push(AudioAsset {
51                        url,
52                        audio_type,
53                        duration: None,
54                        title: None,
55                    });
56                }
57            }
58
59            // Child sources
60            for source in audio.select(&source_selector) {
61                if let Some(src) = source.value().attr("src") {
62                    if let Ok(url) = base_url.join(src) {
63                        let audio_type = source.value().attr("type")
64                            .map(|t| self.audio_type_from_mime(t))
65                            .unwrap_or_else(|| self.detect_audio_type(&url));
66                        
67                        audios.push(AudioAsset {
68                            url,
69                            audio_type,
70                            duration: None,
71                            title: None,
72                        });
73                    }
74                }
75            }
76        }
77
78        audios
79    }
80
81    /// Detect audio type from URL
82    fn detect_audio_type(&self, url: &Url) -> AudioType {
83        let path = url.path().to_lowercase();
84        
85        if path.ends_with(".mp3") {
86            AudioType::Mp3
87        } else if path.ends_with(".wav") {
88            AudioType::Wav
89        } else if path.ends_with(".ogg") || path.ends_with(".oga") {
90            AudioType::Ogg
91        } else if path.ends_with(".aac") || path.ends_with(".m4a") {
92            AudioType::Aac
93        } else if path.ends_with(".flac") {
94            AudioType::Flac
95        } else {
96            AudioType::Unknown
97        }
98    }
99
100    /// Type d'audio depuis le MIME type
101    fn audio_type_from_mime(&self, mime: &str) -> AudioType {
102        let mime = mime.to_lowercase();
103        
104        if mime.contains("mp3") || mime.contains("mpeg") {
105            AudioType::Mp3
106        } else if mime.contains("wav") {
107            AudioType::Wav
108        } else if mime.contains("ogg") {
109            AudioType::Ogg
110        } else if mime.contains("aac") || mime.contains("mp4") {
111            AudioType::Aac
112        } else if mime.contains("flac") {
113            AudioType::Flac
114        } else {
115            AudioType::Unknown
116        }
117    }
118}
119
120/// Extract podcast RSS links from a page
121pub fn extract_podcast_links(html: &str, base_url: &Url) -> Vec<Url> {
122    let document = Html::parse_document(html);
123    let selector = Selector::parse(r#"link[type="application/rss+xml"], link[type="application/atom+xml"]"#).unwrap();
124    
125    document
126        .select(&selector)
127        .filter_map(|el| {
128            let href = el.value().attr("href")?;
129            let title = el.value().attr("title").unwrap_or("");
130            
131            // Check if it's probably a podcast
132            if title.to_lowercase().contains("podcast") 
133                || title.to_lowercase().contains("audio")
134                || href.to_lowercase().contains("podcast")
135            {
136                base_url.join(href).ok()
137            } else {
138                None
139            }
140        })
141        .collect()
142}