halldyll_media/
lib.rs

1//! # halldyll-media
2//!
3//! Comprehensive media extraction library for web scraping.
4//!
5//! This crate provides functionality to extract and process various types of media
6//! from HTML documents:
7//!
//! - **Images**: With srcset, lazy loading, and placeholder detection
//! - **Videos**: HTML5 video and embedded platforms (YouTube, Vimeo, etc.)
//! - **Audio**: HTML5 audio and streaming platforms (Spotify, SoundCloud, etc.)
//! - **Documents**: PDF, Word, Excel, PowerPoint, and other document formats
//! - **Embedded**: iframes, social embeds, maps, and widgets
//! - **Links**: Internal, external, download, nofollow, and mailto links, plus
//!   email, phone, and domain helpers
13//!
14//! ## Quick Start
15//!
16//! ```rust,no_run
17//! use halldyll_media::{MediaExtractor, MediaType};
18//!
19//! let html = r#"
20//!     <img src="image.jpg" alt="Photo">
21//!     <video src="video.mp4"></video>
22//! "#;
23//!
24//! let extractor = MediaExtractor::new()
25//!     .with_base_url("https://example.com");
26//!
27//! let media = extractor.extract_all(html).unwrap();
28//! println!("Found {} images", media.images.len());
29//! println!("Found {} videos", media.videos.len());
30//! ```
31//!
32//! ## Modules
33//!
//! - [`types`]: Core types, errors, and configuration
//! - [`images`]: Image extraction with responsive image support
//! - [`videos`]: Video extraction with embedded platform detection
//! - [`audio`]: Audio extraction with streaming platform support
//! - [`documents`]: Document extraction (PDF, Office, etc.)
//! - [`embedded`]: Embedded content extraction (iframes, social, widgets)
//! - [`links`]: Link extraction (internal, external, download, mailto) with
//!   email and phone helpers
//! - [`downloader`]: Async media downloading with hashing
//! - [`extractor`]: Main [`MediaExtractor`] API
42
43// ============================================================================
44// MODULE DECLARATIONS
45// ============================================================================
46
47pub mod types;
48pub mod images;
49pub mod videos;
50pub mod audio;
51pub mod documents;
52pub mod embedded;
53pub mod links;
54pub mod downloader;
55pub mod extractor;
56
57// ============================================================================
58// PUBLIC RE-EXPORTS
59// ============================================================================
60
61// Core types
62pub use types::{
63    // Errors
64    MediaError,
65    MediaResult,
66    
67    // Media types
68    MediaType,
69    ImageFormat,
70    ImageLoading,
71    
72    // Image types
73    ImageMedia,
74    SrcsetEntry,
75    
76    // Video types
77    VideoMedia,
78    VideoSource,
79    VideoTrack,
80    TrackKind,
81    VideoPlatform,
82    
83    // Audio types
84    AudioMedia,
85    AudioSource,
86    AudioPlatform,
87    
88    // Document types
89    DocumentMedia,
90    DocumentType,
91    
92    // Embedded types
93    EmbeddedMedia,
94    EmbedPlatform,
95    EmbedType,
96    
97    // Link types
98    LinkMedia,
99    LinkType,
100    
101    // Configuration
102    MediaConfig,
103    DownloadConfig,
104    DownloadResult,
105    
106    // Collection
107    ExtractedMedia,
108};
109
110// Image extraction
111pub use images::{
112    extract_images,
113    get_image_urls,
114    has_images,
115    filter_placeholders,
116    get_best_image_url,
117};
118
119// Video extraction
120pub use videos::{
121    extract_videos,
122    get_video_urls,
123    has_videos,
124    youtube_thumbnail,
125    youtube_embed_url,
126};
127
128// Audio extraction
129pub use audio::{
130    extract_audio,
131    get_audio_urls,
132    has_audio,
133    spotify_embed_url,
134};
135
136// Document extraction
137pub use documents::{
138    extract_documents,
139    get_document_urls,
140    has_documents,
141    get_pdfs,
142    get_office_docs,
143};
144
145// Embedded extraction
146pub use embedded::{
147    extract_embeds,
148    get_embed_urls,
149    has_embeds,
150    detect_embed_platform,
151    filter_by_platform,
152    get_maps,
153    get_social_embeds,
154    get_code_embeds,
155};
156
157// Link extraction
158pub use links::{
159    extract_links,
160    get_link_urls,
161    has_links,
162    get_internal_links,
163    get_external_links,
164    get_download_links,
165    get_nofollow_links,
166    get_mailto_links,
167    extract_emails,
168    extract_phones,
169    get_unique_domains,
170};
171
172// Downloader
173pub use downloader::{
174    MediaDownloader,
175    download_bytes,
176    download_with_hash,
177    download_to_base64,
178    save_to_file,
179    compute_sha256,
180    detect_media_type,
181    url_to_filename,
182    is_downloadable,
183};
184
185// Main extractor API
186pub use extractor::{
187    MediaExtractor,
188    MediaExtractorBuilder,
189    MediaCounts,
190    extract_media,
191    has_any_media,
192    count_all_media,
193    get_all_media_urls,
194};
195
196// ============================================================================
197// TESTS
198// ============================================================================
199
#[cfg(test)]
mod tests {
    //! Smoke tests for the crate root: they exercise the public surface only
    //! through the re-exports declared in this file.

    use super::*;

    /// The principal enums must be nameable straight from the crate root.
    #[test]
    fn test_reexports() {
        let _ = (
            MediaType::Image,
            ImageFormat::Jpeg,
            VideoPlatform::YouTube,
            AudioPlatform::Spotify,
            DocumentType::Pdf,
            EmbedPlatform::GoogleMaps,
        );
    }

    /// A freshly constructed extractor has image extraction switched on.
    #[test]
    fn test_extractor_creation() {
        assert!(MediaExtractor::new().config().extract_images);
    }

    /// Flags set on the builder are reflected in the built extractor's config.
    #[test]
    fn test_builder_pattern() {
        let built = MediaExtractorBuilder::new()
            .extract_images(true)
            .extract_videos(false)
            .build();
        let settings = built.config();

        assert!(settings.extract_images);
        assert!(!settings.extract_videos);
    }

    /// The one-shot `extract_media` helper finds a single `<img>` tag.
    #[test]
    fn test_quick_extraction() {
        let page = r#"<img src="https://example.com/test.jpg" alt="Test">"#;
        let found = extract_media(page, None).unwrap();

        assert_eq!(found.images.len(), 1);
    }

    /// `has_any_media` separates markup with media from markup without it.
    #[test]
    fn test_has_media_check() {
        let media_page = r#"<img src="test.jpg">"#;
        let plain_page = r#"<div>No media</div>"#;

        assert!(has_any_media(media_page));
        assert!(!has_any_media(plain_page));
    }

    /// `count_all_media` reports per-kind totals for a small mixed page.
    #[test]
    fn test_count_media() {
        let page = r#"
            <img src="a.jpg">
            <img src="b.jpg">
            <video src="v.mp4"></video>
        "#;

        let totals = count_all_media(page);
        assert_eq!(totals.images, 2);
        assert_eq!(totals.videos, 1);
        assert!(totals.has_any());
    }

    /// `get_all_media_urls` yields at least one URL for a page with an image.
    #[test]
    fn test_url_collection() {
        let page = r#"<img src="https://example.com/test.jpg">"#;
        let collected = get_all_media_urls(page, None);

        assert!(!collected.is_empty());
    }
}