use scraper::{Html, Selector};
use url::Url;
use crate::types::assets::{AudioAsset, AudioType};
/// Stateless extractor for audio assets referenced by an HTML document.
///
/// The type carries no configuration, so every instance behaves the same;
/// `AudioExtractor::default()` yields the unit value.
#[derive(Debug, Clone, Default)]
pub struct AudioExtractor;
impl AudioExtractor {
    /// Creates a new extractor.
    pub fn new() -> Self {
        Self
    }

    /// Extracts the audio assets referenced by `<audio>` elements in `html`.
    ///
    /// Relative URLs are resolved against `base_url`. Each URL appears at most
    /// once in the result; when the same URL is referenced several times, the
    /// first occurrence in document order is kept.
    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<AudioAsset> {
        let document = Html::parse_document(html);
        let mut audios = self.extract_audio_tags(&document, base_url);

        // `Vec::dedup_by` only collapses *adjacent* duplicates, so a URL that
        // repeats with other entries in between would survive. Track every URL
        // already emitted instead; `retain` preserves the original order.
        let mut seen = std::collections::HashSet::new();
        audios.retain(|asset| seen.insert(asset.url.clone()));
        audios
    }

    /// Collects assets from each `<audio>` element's own `src` attribute and
    /// from the `src` attributes of its descendant `<source>` elements.
    ///
    /// For `<source>` elements the `type` attribute decides the audio type when
    /// it names a recognised format; otherwise the URL's file extension is used.
    /// Blank or unresolvable `src` values are skipped.
    fn extract_audio_tags(&self, document: &Html, base_url: &Url) -> Vec<AudioAsset> {
        let audio_selector = Selector::parse("audio").expect("`audio` is a valid CSS selector");
        let source_selector = Selector::parse("source").expect("`source` is a valid CSS selector");
        let mut audios = Vec::new();

        for audio in document.select(&audio_selector) {
            let direct_src = audio
                .value()
                .attr("src")
                .and_then(|src| Self::resolve_src(base_url, src));
            if let Some(url) = direct_src {
                let audio_type = self.detect_audio_type(&url);
                audios.push(Self::new_asset(url, audio_type));
            }

            for source in audio.select(&source_selector) {
                let attrs = source.value();
                let nested_src = attrs
                    .attr("src")
                    .and_then(|src| Self::resolve_src(base_url, src));
                if let Some(url) = nested_src {
                    let audio_type = attrs
                        .attr("type")
                        .map(|mime| self.audio_type_from_mime(mime))
                        // An empty or unrecognised `type` carries no information,
                        // so let the file extension have a say instead.
                        .filter(|kind| !matches!(kind, AudioType::Unknown))
                        .unwrap_or_else(|| self.detect_audio_type(&url));
                    audios.push(Self::new_asset(url, audio_type));
                }
            }
        }

        audios
    }

    /// Resolves a `src` attribute value against `base_url`.
    ///
    /// Returns `None` for blank values, which would otherwise resolve to the
    /// page's own URL, and for values that cannot be joined onto `base_url`.
    fn resolve_src(base_url: &Url, src: &str) -> Option<Url> {
        let src = src.trim();
        if src.is_empty() {
            return None;
        }
        base_url.join(src).ok()
    }

    /// Builds an asset with only the URL and type populated; duration and
    /// title are left as `None`.
    fn new_asset(url: Url, audio_type: AudioType) -> AudioAsset {
        AudioAsset {
            url,
            audio_type,
            duration: None,
            title: None,
        }
    }

    /// Guesses the audio type from the file extension of the URL's path.
    ///
    /// The comparison is case-insensitive; query string and fragment are
    /// ignored because only `Url::path` is inspected. Returns
    /// `AudioType::Unknown` when the path has no recognised extension.
    fn detect_audio_type(&self, url: &Url) -> AudioType {
        let path = url.path().to_ascii_lowercase();
        // Text after the last `.` in the path; `None` when the path has no dot.
        let extension = path.rsplit_once('.').map(|(_, ext)| ext);
        match extension {
            Some("mp3") => AudioType::Mp3,
            Some("wav") => AudioType::Wav,
            Some("ogg") | Some("oga") => AudioType::Ogg,
            Some("aac") | Some("m4a") => AudioType::Aac,
            Some("flac") => AudioType::Flac,
            _ => AudioType::Unknown,
        }
    }

    /// Maps a MIME type string (e.g. the `type` attribute of `<source>`) to an
    /// audio type using case-insensitive substring matching.
    ///
    /// Returns `AudioType::Unknown` when no known format name is present.
    fn audio_type_from_mime(&self, mime: &str) -> AudioType {
        let mime = mime.to_ascii_lowercase();
        if mime.contains("mp3") || mime.contains("mpeg") {
            AudioType::Mp3
        } else if mime.contains("wav") {
            AudioType::Wav
        } else if mime.contains("ogg") {
            AudioType::Ogg
        } else if mime.contains("aac") || mime.contains("mp4") || mime.contains("m4a") {
            // `m4a` keeps this in step with the `.m4a` extension handled in
            // `detect_audio_type` (e.g. `audio/x-m4a`).
            AudioType::Aac
        } else if mime.contains("flac") {
            AudioType::Flac
        } else {
            AudioType::Unknown
        }
    }
}
/// Finds feed URLs in `html` that appear to point at podcast feeds.
///
/// Only `<link>` elements whose `type` is `application/rss+xml` or
/// `application/atom+xml` are considered. A link is kept when its `title`
/// contains "podcast" or "audio", or its `href` contains "podcast"
/// (all compared case-insensitively). Kept links are resolved against
/// `base_url`; links without an `href` or whose `href` cannot be resolved
/// are skipped.
pub fn extract_podcast_links(html: &str, base_url: &Url) -> Vec<Url> {
    let page = Html::parse_document(html);
    let feed_links = Selector::parse(r#"link[type="application/rss+xml"], link[type="application/atom+xml"]"#).unwrap();

    let mut feeds = Vec::new();
    for link in page.select(&feed_links) {
        let element = link.value();
        let href = match element.attr("href") {
            Some(value) => value,
            None => continue,
        };

        // A missing title is treated as empty, so only the href can match.
        let title = element.attr("title").unwrap_or("").to_lowercase();
        let looks_like_podcast = title.contains("podcast")
            || title.contains("audio")
            || href.to_lowercase().contains("podcast");
        if !looks_like_podcast {
            continue;
        }

        if let Ok(resolved) = base_url.join(href) {
            feeds.push(resolved);
        }
    }
    feeds
}