halldyll_core/parse/
audios.rs1use scraper::{Html, Selector};
4use url::Url;
5
6use crate::types::assets::{AudioAsset, AudioType};
7
/// Stateless extractor that collects audio assets referenced by an HTML page.
///
/// The type carries no data, so it is free to copy; every standard trait it
/// needs (including `Default`) is derived instead of being written by hand.
#[derive(Debug, Clone, Copy, Default)]
pub struct AudioExtractor;
16
17impl AudioExtractor {
18 pub fn new() -> Self {
20 Self
21 }
22
23 pub fn extract(&self, html: &str, base_url: &Url) -> Vec<AudioAsset> {
25 let document = Html::parse_document(html);
26 let mut audios = Vec::new();
27
28 audios.extend(self.extract_audio_tags(&document, base_url));
30
31 audios.dedup_by(|a, b| a.url == b.url);
33
34 audios
35 }
36
37 fn extract_audio_tags(&self, document: &Html, base_url: &Url) -> Vec<AudioAsset> {
39 let audio_selector = Selector::parse("audio").unwrap();
40 let source_selector = Selector::parse("source").unwrap();
41 let mut audios = Vec::new();
42
43 for audio in document.select(&audio_selector) {
44 let attrs = audio.value();
45
46 if let Some(src) = attrs.attr("src") {
48 if let Ok(url) = base_url.join(src) {
49 let audio_type = self.detect_audio_type(&url);
50 audios.push(AudioAsset {
51 url,
52 audio_type,
53 duration: None,
54 title: None,
55 });
56 }
57 }
58
59 for source in audio.select(&source_selector) {
61 if let Some(src) = source.value().attr("src") {
62 if let Ok(url) = base_url.join(src) {
63 let audio_type = source.value().attr("type")
64 .map(|t| self.audio_type_from_mime(t))
65 .unwrap_or_else(|| self.detect_audio_type(&url));
66
67 audios.push(AudioAsset {
68 url,
69 audio_type,
70 duration: None,
71 title: None,
72 });
73 }
74 }
75 }
76 }
77
78 audios
79 }
80
81 fn detect_audio_type(&self, url: &Url) -> AudioType {
83 let path = url.path().to_lowercase();
84
85 if path.ends_with(".mp3") {
86 AudioType::Mp3
87 } else if path.ends_with(".wav") {
88 AudioType::Wav
89 } else if path.ends_with(".ogg") || path.ends_with(".oga") {
90 AudioType::Ogg
91 } else if path.ends_with(".aac") || path.ends_with(".m4a") {
92 AudioType::Aac
93 } else if path.ends_with(".flac") {
94 AudioType::Flac
95 } else {
96 AudioType::Unknown
97 }
98 }
99
100 fn audio_type_from_mime(&self, mime: &str) -> AudioType {
102 let mime = mime.to_lowercase();
103
104 if mime.contains("mp3") || mime.contains("mpeg") {
105 AudioType::Mp3
106 } else if mime.contains("wav") {
107 AudioType::Wav
108 } else if mime.contains("ogg") {
109 AudioType::Ogg
110 } else if mime.contains("aac") || mime.contains("mp4") {
111 AudioType::Aac
112 } else if mime.contains("flac") {
113 AudioType::Flac
114 } else {
115 AudioType::Unknown
116 }
117 }
118}
119
120pub fn extract_podcast_links(html: &str, base_url: &Url) -> Vec<Url> {
122 let document = Html::parse_document(html);
123 let selector = Selector::parse(r#"link[type="application/rss+xml"], link[type="application/atom+xml"]"#).unwrap();
124
125 document
126 .select(&selector)
127 .filter_map(|el| {
128 let href = el.value().attr("href")?;
129 let title = el.value().attr("title").unwrap_or("");
130
131 if title.to_lowercase().contains("podcast")
133 || title.to_lowercase().contains("audio")
134 || href.to_lowercase().contains("podcast")
135 {
136 base_url.join(href).ok()
137 } else {
138 None
139 }
140 })
141 .collect()
142}