halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Audio extraction: assets referenced by `<audio>`/`<source>` tags and podcast feed links.

use scraper::{Html, Selector};
use url::Url;

use crate::types::assets::{AudioAsset, AudioType};

/// Stateless extractor for audio assets found in an HTML page.
///
/// The type carries no data, so it is cheap to copy and its `Default`
/// value is identical to the one produced by `AudioExtractor::new`.
#[derive(Debug, Clone, Copy, Default)]
pub struct AudioExtractor;

impl AudioExtractor {
    /// Creates a new extractor.
    pub fn new() -> Self {
        Self
    }

    /// Extracts every audio asset referenced by `<audio>` elements in `html`.
    ///
    /// Relative URLs are resolved against `base_url`. Each URL appears at most
    /// once in the result; when a URL is referenced several times, the first
    /// asset found for it is the one that is kept.
    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<AudioAsset> {
        let document = Html::parse_document(html);

        // <audio> and nested <source> tags
        let mut audios = self.extract_audio_tags(&document, base_url);

        // `Vec::dedup_by` only collapses *adjacent* equal elements, so the same
        // URL referenced from two separate places would survive. Track the
        // URLs already seen to remove duplicates regardless of position.
        let mut seen = std::collections::HashSet::with_capacity(audios.len());
        audios.retain(|audio| seen.insert(audio.url.clone()));

        audios
    }

    /// Collects assets from `<audio src>` attributes and from the `<source>`
    /// elements found inside each `<audio>` element.
    ///
    /// Entries whose `src` is missing, blank, or cannot be resolved against
    /// `base_url` are skipped.
    fn extract_audio_tags(&self, document: &Html, base_url: &Url) -> Vec<AudioAsset> {
        let audio_selector = Selector::parse("audio").expect("`audio` is a valid CSS selector");
        let source_selector = Selector::parse("source").expect("`source` is a valid CSS selector");
        let mut audios = Vec::new();

        for audio in document.select(&audio_selector) {
            // Direct `src` attribute on the <audio> element itself.
            if let Some(url) = Self::resolve_src(audio.value().attr("src"), base_url) {
                let audio_type = self.detect_audio_type(&url);
                audios.push(AudioAsset {
                    url,
                    audio_type,
                    duration: None,
                    title: None,
                });
            }

            // <source> elements nested inside the <audio> element.
            for source in audio.select(&source_selector) {
                let attrs = source.value();

                if let Some(url) = Self::resolve_src(attrs.attr("src"), base_url) {
                    // Prefer the declared MIME type, but fall back to the file
                    // extension when the attribute is absent or unrecognised.
                    let audio_type = attrs
                        .attr("type")
                        .map(|mime| self.audio_type_from_mime(mime))
                        .filter(|kind| !matches!(kind, AudioType::Unknown))
                        .unwrap_or_else(|| self.detect_audio_type(&url));

                    audios.push(AudioAsset {
                        url,
                        audio_type,
                        duration: None,
                        title: None,
                    });
                }
            }
        }

        audios
    }

    /// Resolves an optional `src` attribute value against `base_url`.
    ///
    /// Returns `None` when the attribute is missing, blank, or not joinable.
    /// A blank value is rejected explicitly because joining an empty reference
    /// yields the page URL itself, which is not an audio asset.
    fn resolve_src(src: Option<&str>, base_url: &Url) -> Option<Url> {
        let src = src?.trim();
        if src.is_empty() {
            return None;
        }
        base_url.join(src).ok()
    }

    /// Detects the audio type from the file extension of the URL path
    /// (case-insensitive). Returns `AudioType::Unknown` for anything else.
    fn detect_audio_type(&self, url: &Url) -> AudioType {
        let path = url.path().to_ascii_lowercase();

        // Everything after the last `.` in the path; a value containing `/`
        // simply matches none of the arms below.
        match path.rsplit_once('.').map(|(_, extension)| extension) {
            Some("mp3") => AudioType::Mp3,
            Some("wav") => AudioType::Wav,
            Some("ogg") | Some("oga") => AudioType::Ogg,
            Some("aac") | Some("m4a") => AudioType::Aac,
            Some("flac") => AudioType::Flac,
            _ => AudioType::Unknown,
        }
    }

    /// Maps a MIME type string (such as the `type` attribute of a `<source>`
    /// element) to an audio type, using case-insensitive substring matching.
    /// Returns `AudioType::Unknown` when no known marker is present.
    fn audio_type_from_mime(&self, mime: &str) -> AudioType {
        let mime = mime.to_ascii_lowercase();

        if mime.contains("mp3") || mime.contains("mpeg") {
            AudioType::Mp3
        } else if mime.contains("wav") {
            AudioType::Wav
        } else if mime.contains("ogg") {
            AudioType::Ogg
        } else if mime.contains("aac") || mime.contains("mp4") || mime.contains("m4a") {
            AudioType::Aac
        } else if mime.contains("flac") {
            AudioType::Flac
        } else {
            AudioType::Unknown
        }
    }
}

/// Finds feed links in a page that look like podcast feeds.
///
/// Only `<link>` elements typed `application/rss+xml` or `application/atom+xml`
/// and carrying an `href` are considered. A link is kept when its `title`
/// contains "podcast" or "audio", or when its `href` contains "podcast"
/// (all compared case-insensitively). Kept links are resolved against
/// `base_url`; links that cannot be resolved are dropped.
pub fn extract_podcast_links(html: &str, base_url: &Url) -> Vec<Url> {
    let page = Html::parse_document(html);
    let feed_links = Selector::parse(r#"link[type="application/rss+xml"], link[type="application/atom+xml"]"#).unwrap();

    let mut feeds = Vec::new();
    for link in page.select(&feed_links) {
        let attrs = link.value();

        let href = match attrs.attr("href") {
            Some(value) => value,
            None => continue,
        };

        // A missing title is treated as empty, so only the href can match.
        let title_lower = attrs.attr("title").unwrap_or("").to_lowercase();
        let looks_like_podcast = title_lower.contains("podcast")
            || title_lower.contains("audio")
            || href.to_lowercase().contains("podcast");
        if !looks_like_podcast {
            continue;
        }

        if let Ok(feed_url) = base_url.join(href) {
            feeds.push(feed_url);
        }
    }

    feeds
}