use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
use url::Url;
use crate::types::{
AudioMedia, AudioPlatform, AudioSource, MediaResult,
};
// Lazily compiled URL patterns. Neither static is referenced by the code
// visible in this part of the file; presumably they are used elsewhere.
// Both `unwrap()` calls can only panic if the literal pattern itself is invalid.
lazy_static! {
// Matches `open.spotify.com/<kind>/<id>` where <kind> is track, album,
// playlist or episode; capture group 1 is the alphanumeric ID.
// The kind must directly follow the host, so `/embed/track/...` URLs do
// not match this pattern.
static ref SPOTIFY_ID: Regex = Regex::new(
r"open\.spotify\.com/(?:track|album|playlist|episode)/([a-zA-Z0-9]+)"
).unwrap();
// Matches `soundcloud.com/<first-segment>/<rest>`; capture group 1 is
// everything after the host up to (not including) the first `?`.
static ref SOUNDCLOUD_URL: Regex = Regex::new(
r"soundcloud\.com/([^/]+/[^?]+)"
).unwrap();
}
pub fn extract_audio(document: &Html, base_url: Option<&Url>) -> Vec<AudioMedia> {
let mut audio_items = Vec::new();
let mut seen_urls: HashSet<String> = HashSet::new();
if let Ok(sel) = Selector::parse("audio") {
for el in document.select(&sel) {
if let Some(audio) = extract_audio_element(&el, base_url) {
let key = audio.absolute_url.as_ref().unwrap_or(&audio.src).clone();
if seen_urls.insert(key) {
audio_items.push(audio);
}
}
}
}
if let Ok(sel) = Selector::parse("iframe[src]") {
for el in document.select(&sel) {
if let Some(src) = el.value().attr("src") {
if is_audio_embed(src) {
if let Some(audio) = extract_embedded_audio(&el, base_url) {
let key = audio.absolute_url.as_ref().unwrap_or(&audio.src).clone();
if seen_urls.insert(key) {
audio_items.push(audio);
}
}
}
}
}
}
if let Ok(sel) = Selector::parse("a[href]") {
for el in document.select(&sel) {
if let Some(href) = el.value().attr("href") {
if is_audio_file(href) {
if let Some(audio) = create_audio_from_link(&el, base_url) {
let key = audio.absolute_url.as_ref().unwrap_or(&audio.src).clone();
if seen_urls.insert(key) {
audio_items.push(audio);
}
}
}
}
}
}
audio_items
}
/// Builds an `AudioMedia` (platform `Html5`) from an `<audio>` element.
///
/// The primary URL is the element's own `src` attribute, or, when that is
/// absent, the `src` of its first descendant `<source>` that is selected.
/// Returns `None` when neither yields a URL.
///
/// The MIME type comes from the element's `type` attribute, falling back to
/// `guess_audio_mime` on the chosen URL; the title comes from `title`, then
/// `aria-label`. Boolean flags reflect mere presence of the attribute.
fn extract_audio_element(el: &ElementRef, base_url: Option<&Url>) -> Option<AudioMedia> {
    let node = el.value();

    let src = match node.attr("src") {
        Some(direct) => direct,
        None => {
            let source_sel = Selector::parse("source").ok()?;
            let first_source = el.select(&source_sel).next();
            let nested_src = first_source.and_then(|child| child.value().attr("src"));
            nested_src?
        }
    };

    let mime_type = node
        .attr("type")
        .map(str::to_string)
        .or_else(|| guess_audio_mime(src));
    let title = node
        .attr("title")
        .or_else(|| node.attr("aria-label"))
        .map(str::to_string);

    Some(AudioMedia {
        src: src.to_string(),
        absolute_url: resolve_url(src, base_url),
        platform: AudioPlatform::Html5,
        autoplay: node.attr("autoplay").is_some(),
        loop_audio: node.attr("loop").is_some(),
        muted: node.attr("muted").is_some(),
        controls: node.attr("controls").is_some(),
        mime_type,
        sources: extract_audio_sources(el, base_url),
        title,
        ..Default::default()
    })
}
/// Lists the `<source>` descendants of `audio` that carry a `src` attribute.
///
/// Each entry's `src` is the URL resolved against `base_url`, or the raw
/// attribute value when resolution yields nothing; `mime_type` is the
/// `<source>` element's `type` attribute, if present. Returns an empty
/// vector when the selector cannot be built.
fn extract_audio_sources(audio: &ElementRef, base_url: Option<&Url>) -> Vec<AudioSource> {
    let source_sel = match Selector::parse("source") {
        Ok(parsed) => parsed,
        Err(_) => return Vec::new(),
    };

    let sources: Vec<AudioSource> = audio
        .select(&source_sel)
        .filter_map(|child| {
            let node = child.value();
            node.attr("src").map(|raw| AudioSource {
                src: resolve_url(raw, base_url).unwrap_or_else(|| raw.to_string()),
                mime_type: node.attr("type").map(str::to_string),
            })
        })
        .collect();
    sources
}
/// Builds an `AudioMedia` from an embedding element (an `<iframe>` in the
/// caller visible here) using its `src` attribute.
///
/// The platform is whatever `AudioPlatform::from_url` reports for the raw
/// `src`; the same raw value is stored as `embed_url`. The title is the
/// element's `title` attribute, if any. Returns `None` when `src` is missing.
fn extract_embedded_audio(el: &ElementRef, base_url: Option<&Url>) -> Option<AudioMedia> {
    let frame = el.value();
    let embed_src = frame.attr("src")?;
    let platform = AudioPlatform::from_url(embed_src);

    Some(AudioMedia {
        src: embed_src.to_string(),
        absolute_url: resolve_url(embed_src, base_url),
        platform,
        embed_url: Some(embed_src.to_string()),
        title: frame.attr("title").map(str::to_string),
        ..Default::default()
    })
}
/// Builds an `AudioMedia` (platform `Html5`) from a link element's `href`.
///
/// The title is the link's trimmed text content. A link without any text
/// (for example one that only wraps an image) gets `title: None` rather than
/// an empty string, so callers can tell "no title" apart from a real one.
/// The MIME type is guessed from the `href` via `guess_audio_mime`.
///
/// Returns `None` when the element has no `href` attribute.
fn create_audio_from_link(el: &ElementRef, base_url: Option<&Url>) -> Option<AudioMedia> {
    let href = el.value().attr("href")?;

    let link_text = el.text().collect::<String>();
    let trimmed = link_text.trim();
    // Avoid the `Some("")` sentinel: absent text means there is no title.
    let title = if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    };

    Some(AudioMedia {
        src: href.to_string(),
        absolute_url: resolve_url(href, base_url),
        platform: AudioPlatform::Html5,
        title,
        mime_type: guess_audio_mime(href),
        ..Default::default()
    })
}
/// Reports whether `url` mentions one of the known audio-hosting domains.
///
/// The comparison is case-insensitive and is a plain substring search over
/// the whole URL string, so the host name may appear anywhere in it.
fn is_audio_embed(url: &str) -> bool {
    const AUDIO_HOSTS: [&str; 9] = [
        "open.spotify.com",
        "soundcloud.com",
        "w.soundcloud.com",
        "podcasts.apple.com",
        "anchor.fm",
        "podbean.com",
        "buzzsprout.com",
        "spreaker.com",
        "castbox.fm",
    ];

    let haystack = url.to_lowercase();
    for host in AUDIO_HOSTS.iter() {
        if haystack.contains(*host) {
            return true;
        }
    }
    false
}
/// Reports whether `url` looks like a direct link to an audio file.
///
/// The check is case-insensitive and succeeds when a known audio extension
/// ends either the whole URL or its path portion (the part before the first
/// `?` or `#`). Looking at the path portion means links such as
/// `song.mp3?download=1` or `talk.ogg#t=30` are recognised; looking at the
/// whole URL as well keeps accepting links like `dl.php?file=song.mp3`.
fn is_audio_file(url: &str) -> bool {
    const AUDIO_EXTENSIONS: [&str; 9] = [
        ".mp3", ".wav", ".ogg", ".oga", ".flac", ".aac", ".m4a", ".opus", ".wma",
    ];

    let lowered = url.to_lowercase();
    // `split` always yields at least one piece, so the fallback is never used.
    let path = lowered
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");

    AUDIO_EXTENSIONS
        .iter()
        .any(|ext| lowered.ends_with(*ext) || path.ends_with(*ext))
}
/// Guesses an audio MIME type from the file extension found in `url`.
///
/// Matching is case-insensitive. The extension that ends the URL's path
/// portion (the part before the first `?` or `#`) is preferred, so an
/// extension-like fragment elsewhere in the URL (for example a host such as
/// `cdn.mp3.example.com`) cannot override the real file extension. When the
/// path does not end in a known extension, the whole URL is scanned for a
/// known extension as a lenient fallback.
///
/// Returns `None` when no known audio extension is found.
fn guess_audio_mime(url: &str) -> Option<String> {
    // Order matters only for the substring fallback: the first hit wins.
    const MIME_BY_EXTENSION: [(&str, &str); 9] = [
        (".mp3", "audio/mpeg"),
        (".wav", "audio/wav"),
        (".ogg", "audio/ogg"),
        (".oga", "audio/ogg"),
        (".flac", "audio/flac"),
        (".aac", "audio/aac"),
        (".m4a", "audio/mp4"),
        (".opus", "audio/opus"),
        // Conventional (unregistered) type for Windows Media Audio files.
        (".wma", "audio/x-ms-wma"),
    ];

    let lowered = url.to_lowercase();
    // `split` always yields at least one piece, so the fallback is never used.
    let path = lowered
        .split(|c: char| c == '?' || c == '#')
        .next()
        .unwrap_or("");

    MIME_BY_EXTENSION
        .iter()
        .find(|(ext, _)| path.ends_with(*ext))
        .or_else(|| {
            MIME_BY_EXTENSION
                .iter()
                .find(|(ext, _)| lowered.contains(*ext))
        })
        .map(|(_, mime)| mime.to_string())
}
/// Turns `href` into an absolute URL string where possible.
///
/// * `http://` / `https://` inputs are returned unchanged.
/// * Protocol-relative inputs (`//host/...`) are prefixed with `https:`.
/// * Anything else is joined onto `base_url`; the result is `None` when no
///   base is supplied or the join fails.
fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
    let already_absolute = ["http://", "https://"]
        .iter()
        .any(|scheme| href.starts_with(*scheme));

    if already_absolute {
        Some(href.to_string())
    } else if href.starts_with("//") {
        Some(format!("https:{}", href))
    } else {
        let base = base_url?;
        base.join(href).ok().map(|joined| joined.to_string())
    }
}
/// Parses `html` as a full document and extracts its audio items.
///
/// `base_url`, when given and parseable, is used to resolve relative URLs;
/// an unparseable base is treated the same as no base. As written, this
/// function always returns `Ok`.
pub fn extract_audio_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<AudioMedia>> {
    let parsed_base = match base_url {
        Some(raw) => Url::parse(raw).ok(),
        None => None,
    };
    let document = Html::parse_document(html);
    let found = extract_audio(&document, parsed_base.as_ref());
    Ok(found)
}
/// Returns the absolute URLs of all audio items found in `html`.
///
/// Items whose URL could not be made absolute are skipped, and an extraction
/// error yields an empty list.
pub fn get_audio_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
    let items = extract_audio_from_html(html, base_url).unwrap_or_default();

    let mut urls = Vec::new();
    for item in items {
        if let Some(absolute) = item.absolute_url {
            urls.push(absolute);
        }
    }
    urls
}
/// Quick check for audio content in `document`.
///
/// True when the document contains an `<audio>` element or an `<iframe>`
/// whose `src` contains `spotify` or `soundcloud`. Returns `false` if the
/// selector cannot be built.
pub fn has_audio(document: &Html) -> bool {
    match Selector::parse("audio, iframe[src*='spotify'], iframe[src*='soundcloud']") {
        Ok(audio_sel) => {
            let mut hits = document.select(&audio_sel);
            hits.next().is_some()
        }
        Err(_) => false,
    }
}
/// Builds the Spotify embed URL for a track by appending `track_id` to the
/// embed prefix. The ID is inserted verbatim, without validation or escaping.
pub fn spotify_embed_url(track_id: &str) -> String {
    const EMBED_PREFIX: &str = "https://open.spotify.com/embed/track/";

    let mut embed = String::with_capacity(EMBED_PREFIX.len() + track_id.len());
    embed.push_str(EMBED_PREFIX);
    embed.push_str(track_id);
    embed
}
/// Builds a SoundCloud player URL for `url`.
///
/// The target is passed through `urlencoding::encode` and placed in the
/// player's `url` query parameter; `auto_play` is set to `false`.
pub fn soundcloud_embed_url(url: &str) -> String {
    let encoded_target = urlencoding::encode(url);
    format!(
        "https://w.soundcloud.com/player/?url={}&auto_play=false",
        encoded_target
    )
}
// Unit tests for the audio extraction helpers in this module.
#[cfg(test)]
mod tests {
use super::*;
// Parses an HTML snippet as a full document for use in the tests below.
fn parse_html(html: &str) -> Html {
Html::parse_document(html)
}
// An <audio> element with its own `src` plus a nested <source> yields one
// Html5 item with `controls` set and a non-empty `sources` list.
#[test]
fn test_extract_html5_audio() {
let html = r#"
<audio src="/audio/podcast.mp3" controls>
<source src="/audio/podcast.ogg" type="audio/ogg">
</audio>
"#;
let doc = parse_html(html);
let base = Url::parse("https://example.com").unwrap();
let audio = extract_audio(&doc, Some(&base));
assert_eq!(audio.len(), 1);
assert_eq!(audio[0].platform, AudioPlatform::Html5);
assert!(audio[0].controls);
assert!(!audio[0].sources.is_empty());
}
// A Spotify embed iframe is picked up and classified as Spotify.
#[test]
fn test_extract_spotify_embed() {
let html = r#"
<iframe src="https://open.spotify.com/embed/track/4iV5W9uYEdYUVa79Axb7Rh"
width="300" height="380" title="Spotify Track">
</iframe>
"#;
let doc = parse_html(html);
let audio = extract_audio(&doc, None);
assert_eq!(audio.len(), 1);
assert_eq!(audio[0].platform, AudioPlatform::Spotify);
}
// A SoundCloud player iframe is picked up and classified as SoundCloud.
#[test]
fn test_extract_soundcloud_embed() {
let html = r#"
<iframe src="https://w.soundcloud.com/player/?url=https://soundcloud.com/artist/track">
</iframe>
"#;
let doc = parse_html(html);
let audio = extract_audio(&doc, None);
assert_eq!(audio.len(), 1);
assert_eq!(audio[0].platform, AudioPlatform::SoundCloud);
}
// A plain link to an .mp3 file becomes an item titled with the link text.
#[test]
fn test_extract_audio_link() {
let html = r#"<a href="/downloads/song.mp3">Download Song</a>"#;
let doc = parse_html(html);
let base = Url::parse("https://example.com").unwrap();
let audio = extract_audio(&doc, Some(&base));
assert_eq!(audio.len(), 1);
assert_eq!(audio[0].title, Some("Download Song".to_string()));
}
// Boolean attributes on <audio> are reflected in the extracted flags.
// No base URL is supplied, so the relative `src` is not made absolute.
#[test]
fn test_audio_attributes() {
let html = r#"<audio src="test.mp3" autoplay loop muted></audio>"#;
let doc = parse_html(html);
let audio = extract_audio(&doc, None);
assert!(audio[0].autoplay);
assert!(audio[0].loop_audio);
assert!(audio[0].muted);
}
// An <audio> without its own `src` still yields an item, and every nested
// <source> is listed in `sources`.
#[test]
fn test_audio_sources() {
let html = r#"
<audio>
<source src="audio.mp3" type="audio/mpeg">
<source src="audio.ogg" type="audio/ogg">
</audio>
"#;
let doc = parse_html(html);
let audio = extract_audio(&doc, None);
assert_eq!(audio[0].sources.len(), 2);
}
// `has_audio` detects <audio> elements and Spotify iframes, and reports
// false for a document containing neither.
#[test]
fn test_has_audio() {
let with_audio = "<audio src='test.mp3'></audio>";
let with_spotify = "<iframe src='https://open.spotify.com/embed/track/abc'></iframe>";
let without = "<p>No audio</p>";
assert!(has_audio(&parse_html(with_audio)));
assert!(has_audio(&parse_html(with_spotify)));
assert!(!has_audio(&parse_html(without)));
}
// Known audio extensions are accepted; an .html path is rejected.
#[test]
fn test_is_audio_file() {
assert!(is_audio_file("/audio/song.mp3"));
assert!(is_audio_file("/audio/track.wav"));
assert!(is_audio_file("/audio/podcast.ogg"));
assert!(!is_audio_file("/page.html"));
}
// Spot-checks the extension-to-MIME mapping for four common formats.
#[test]
fn test_guess_audio_mime() {
assert_eq!(guess_audio_mime("song.mp3"), Some("audio/mpeg".to_string()));
assert_eq!(guess_audio_mime("track.wav"), Some("audio/wav".to_string()));
assert_eq!(guess_audio_mime("audio.ogg"), Some("audio/ogg".to_string()));
assert_eq!(guess_audio_mime("audio.flac"), Some("audio/flac".to_string()));
}
// The embed URL is the fixed Spotify embed prefix followed by the track ID.
#[test]
fn test_spotify_embed_url() {
let url = spotify_embed_url("4iV5W9uYEdYUVa79Axb7Rh");
assert_eq!(url, "https://open.spotify.com/embed/track/4iV5W9uYEdYUVa79Axb7Rh");
}
}