// halldyll-media 0.1.0
//
// Media extraction (images, videos, links) for halldyll scraper
// Documentation
//! # halldyll-media
//!
//! Comprehensive media extraction library for web scraping.
//!
//! This crate provides functionality to extract and process various types of media
//! from HTML documents:
//!
//! - **Images**: With srcset, lazy loading, and placeholder detection
//! - **Videos**: HTML5 video and embedded platforms (YouTube, Vimeo, etc.)
//! - **Audio**: HTML5 audio and streaming platforms (Spotify, SoundCloud, etc.)
//! - **Documents**: PDF, Word, Excel, PowerPoint, and other document formats
//! - **Embedded**: iframes, social embeds, maps, and widgets
//! - **Links**: Internal, external, download, and mailto links, plus emails and phone numbers
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use halldyll_media::{MediaExtractor, MediaType};
//!
//! let html = r#"
//!     <img src="image.jpg" alt="Photo">
//!     <video src="video.mp4"></video>
//! "#;
//!
//! let extractor = MediaExtractor::new()
//!     .with_base_url("https://example.com");
//!
//! let media = extractor.extract_all(html).unwrap();
//! println!("Found {} images", media.images.len());
//! println!("Found {} videos", media.videos.len());
//! ```
//!
//! ## Modules
//!
//! - [`types`]: Core types, errors, and configuration
//! - [`images`]: Image extraction with responsive image support
//! - [`videos`]: Video extraction with embedded platform detection
//! - [`audio`]: Audio extraction with streaming platform support
//! - [`documents`]: Document extraction (PDF, Office, etc.)
//! - [`embedded`]: Embedded content extraction (iframes, social, widgets)
//! - [`links`]: Link extraction (internal/external links, emails, phone numbers)
//! - [`downloader`]: Async media downloading with hashing
//! - [`extractor`]: Main [`MediaExtractor`] API

// ============================================================================
// MODULE DECLARATIONS
// ============================================================================

pub mod types;
pub mod images;
pub mod videos;
pub mod audio;
pub mod documents;
pub mod embedded;
pub mod links;
pub mod downloader;
pub mod extractor;

// ============================================================================
// PUBLIC RE-EXPORTS
// ============================================================================

// Core types
pub use types::{
    // Errors
    MediaError,
    MediaResult,
    
    // Media types
    MediaType,
    ImageFormat,
    ImageLoading,
    
    // Image types
    ImageMedia,
    SrcsetEntry,
    
    // Video types
    VideoMedia,
    VideoSource,
    VideoTrack,
    TrackKind,
    VideoPlatform,
    
    // Audio types
    AudioMedia,
    AudioSource,
    AudioPlatform,
    
    // Document types
    DocumentMedia,
    DocumentType,
    
    // Embedded types
    EmbeddedMedia,
    EmbedPlatform,
    EmbedType,
    
    // Link types
    LinkMedia,
    LinkType,
    
    // Configuration
    MediaConfig,
    DownloadConfig,
    DownloadResult,
    
    // Collection
    ExtractedMedia,
};

// Image extraction
pub use images::{
    extract_images,
    get_image_urls,
    has_images,
    filter_placeholders,
    get_best_image_url,
};

// Video extraction
pub use videos::{
    extract_videos,
    get_video_urls,
    has_videos,
    youtube_thumbnail,
    youtube_embed_url,
};

// Audio extraction
pub use audio::{
    extract_audio,
    get_audio_urls,
    has_audio,
    spotify_embed_url,
};

// Document extraction
pub use documents::{
    extract_documents,
    get_document_urls,
    has_documents,
    get_pdfs,
    get_office_docs,
};

// Embedded extraction
pub use embedded::{
    extract_embeds,
    get_embed_urls,
    has_embeds,
    detect_embed_platform,
    filter_by_platform,
    get_maps,
    get_social_embeds,
    get_code_embeds,
};

// Link extraction
pub use links::{
    extract_links,
    get_link_urls,
    has_links,
    get_internal_links,
    get_external_links,
    get_download_links,
    get_nofollow_links,
    get_mailto_links,
    extract_emails,
    extract_phones,
    get_unique_domains,
};

// Downloader
pub use downloader::{
    MediaDownloader,
    download_bytes,
    download_with_hash,
    download_to_base64,
    save_to_file,
    compute_sha256,
    detect_media_type,
    url_to_filename,
    is_downloadable,
};

// Main extractor API
pub use extractor::{
    MediaExtractor,
    MediaExtractorBuilder,
    MediaCounts,
    extract_media,
    has_any_media,
    count_all_media,
    get_all_media_urls,
};

// ============================================================================
// TESTS
// ============================================================================

#[cfg(test)]
mod tests {
    //! Smoke tests for the crate-root API: each test reaches items only through
    //! the `pub use` re-exports pulled in by `use super::*`.

    use super::*;

    /// The re-exported enums can be named directly from the crate root.
    #[test]
    fn test_reexports() {
        // Building the values is enough; a missing re-export fails to compile.
        let _ = (
            MediaType::Image,
            ImageFormat::Jpeg,
            VideoPlatform::YouTube,
            AudioPlatform::Spotify,
            DocumentType::Pdf,
            EmbedPlatform::GoogleMaps,
        );
    }

    /// An extractor built with `new()` reports image extraction as enabled.
    #[test]
    fn test_extractor_creation() {
        let images_enabled = MediaExtractor::new().config().extract_images;
        assert!(images_enabled);
    }

    /// Flags set through the builder show up in the built extractor's config.
    #[test]
    fn test_builder_pattern() {
        let built = MediaExtractorBuilder::new()
            .extract_images(true)
            .extract_videos(false)
            .build();

        let images_on = built.config().extract_images;
        let videos_on = built.config().extract_videos;
        assert!(images_on);
        assert!(!videos_on);
    }

    /// The one-shot `extract_media` helper finds the single `<img>` tag.
    #[test]
    fn test_quick_extraction() {
        let page = r#"<img src="https://example.com/test.jpg" alt="Test">"#;
        let image_count = extract_media(page, None).unwrap().images.len();

        assert_eq!(image_count, 1);
    }

    /// `has_any_media` is true for markup with an image and false for a bare `<div>`.
    #[test]
    fn test_has_media_check() {
        let cases = [
            (r#"<img src="test.jpg">"#, true),
            (r#"<div>No media</div>"#, false),
        ];

        for (markup, expected) in cases {
            assert_eq!(has_any_media(markup), expected);
        }
    }

    /// `count_all_media` tallies images and videos separately.
    #[test]
    fn test_count_media() {
        let page = r#"
            <img src="a.jpg">
            <img src="b.jpg">
            <video src="v.mp4"></video>
        "#;

        let tally = count_all_media(page);
        assert_eq!((tally.images, tally.videos), (2, 1));
        assert!(tally.has_any());
    }

    /// `get_all_media_urls` returns at least one URL for a page with an image.
    #[test]
    fn test_url_collection() {
        let page = r#"<img src="https://example.com/test.jpg">"#;

        assert!(!get_all_media_urls(page, None).is_empty());
    }
}