halldyll_media/
types.rs

1//! Core types for halldyll-media
2//!
3//! This module contains all type definitions for media extraction.
4
5use bytes::Bytes;
6use serde::{Deserialize, Serialize};
7use thiserror::Error;
8
9// ============================================================================
10// ERRORS
11// ============================================================================
12
13/// Media extraction error types
14#[derive(Error, Debug)]
15pub enum MediaError {
16    #[error("Failed to download media: {0}")]
17    Download(String),
18
19    #[error("Network error: {0}")]
20    Network(String),
21
22    #[error("HTTP error {0}: {1}")]
23    Http(u16, String),
24
25    #[error("Invalid URL: {0}")]
26    InvalidUrl(String),
27
28    #[error("Unsupported media type: {0}")]
29    UnsupportedType(String),
30
31    #[error("File too large: {0} bytes (max: {1})")]
32    FileTooLarge(u64, u64),
33
34    #[error("Timeout downloading: {0}")]
35    Timeout(String),
36
37    #[error("IO error: {0}")]
38    Io(String),
39
40    #[error("Parse error: {0}")]
41    Parse(String),
42}
43
44/// Result type for media operations
45pub type MediaResult<T> = Result<T, MediaError>;
46
47// ============================================================================
48// MEDIA TYPES
49// ============================================================================
50
51/// Media type enumeration
52#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
53#[serde(rename_all = "lowercase")]
54pub enum MediaType {
55    Image,
56    Video,
57    Audio,
58    Document,
59    Embedded,
60    Other,
61}
62
63impl MediaType {
64    /// Get media type from file extension
65    pub fn from_extension(ext: &str) -> Self {
66        let ext = ext.to_lowercase();
67        match ext.as_str() {
68            // Images
69            "jpg" | "jpeg" | "png" | "gif" | "webp" | "svg" | "ico" | "bmp" 
70            | "avif" | "heic" | "heif" | "tiff" | "tif" => MediaType::Image,
71            
72            // Videos
73            "mp4" | "webm" | "ogg" | "ogv" | "avi" | "mov" | "mkv" | "m4v" 
74            | "wmv" | "flv" | "3gp" => MediaType::Video,
75            
76            // Audio
77            "mp3" | "wav" | "oga" | "flac" | "aac" | "m4a" | "wma" 
78            | "opus" | "aiff" => MediaType::Audio,
79            
80            // Documents
81            "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" 
82            | "txt" | "rtf" | "odt" | "ods" | "odp" | "csv" | "epub" => MediaType::Document,
83            
84            _ => MediaType::Other,
85        }
86    }
87
88    /// Get media type from MIME type
89    pub fn from_mime(mime: &str) -> Self {
90        let mime_lower = mime.to_lowercase();
91        if mime_lower.starts_with("image/") {
92            MediaType::Image
93        } else if mime_lower.starts_with("video/") {
94            MediaType::Video
95        } else if mime_lower.starts_with("audio/") {
96            MediaType::Audio
97        } else if mime_lower.starts_with("application/pdf")
98            || mime_lower.contains("document")
99            || mime_lower.contains("spreadsheet")
100            || mime_lower.contains("presentation")
101        {
102            MediaType::Document
103        } else {
104            MediaType::Other
105        }
106    }
107}
108
109impl std::fmt::Display for MediaType {
110    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
111        match self {
112            MediaType::Image => write!(f, "image"),
113            MediaType::Video => write!(f, "video"),
114            MediaType::Audio => write!(f, "audio"),
115            MediaType::Document => write!(f, "document"),
116            MediaType::Embedded => write!(f, "embedded"),
117            MediaType::Other => write!(f, "other"),
118        }
119    }
120}
121
122// ============================================================================
123// IMAGE TYPES
124// ============================================================================
125
126/// Image format
127#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
128#[serde(rename_all = "lowercase")]
129pub enum ImageFormat {
130    Jpeg,
131    Png,
132    Gif,
133    WebP,
134    Svg,
135    Avif,
136    Heic,
137    Ico,
138    Bmp,
139    Tiff,
140    Unknown,
141}
142
143impl ImageFormat {
144    pub fn from_extension(ext: &str) -> Self {
145        match ext.to_lowercase().as_str() {
146            "jpg" | "jpeg" => ImageFormat::Jpeg,
147            "png" => ImageFormat::Png,
148            "gif" => ImageFormat::Gif,
149            "webp" => ImageFormat::WebP,
150            "svg" => ImageFormat::Svg,
151            "avif" => ImageFormat::Avif,
152            "heic" | "heif" => ImageFormat::Heic,
153            "ico" => ImageFormat::Ico,
154            "bmp" => ImageFormat::Bmp,
155            "tiff" | "tif" => ImageFormat::Tiff,
156            _ => ImageFormat::Unknown,
157        }
158    }
159
160    pub fn from_mime(mime: &str) -> Self {
161        match mime.to_lowercase().as_str() {
162            "image/jpeg" => ImageFormat::Jpeg,
163            "image/png" => ImageFormat::Png,
164            "image/gif" => ImageFormat::Gif,
165            "image/webp" => ImageFormat::WebP,
166            "image/svg+xml" => ImageFormat::Svg,
167            "image/avif" => ImageFormat::Avif,
168            "image/heic" | "image/heif" => ImageFormat::Heic,
169            "image/x-icon" | "image/vnd.microsoft.icon" => ImageFormat::Ico,
170            "image/bmp" => ImageFormat::Bmp,
171            "image/tiff" => ImageFormat::Tiff,
172            _ => ImageFormat::Unknown,
173        }
174    }
175
176    pub fn mime_type(&self) -> &'static str {
177        match self {
178            ImageFormat::Jpeg => "image/jpeg",
179            ImageFormat::Png => "image/png",
180            ImageFormat::Gif => "image/gif",
181            ImageFormat::WebP => "image/webp",
182            ImageFormat::Svg => "image/svg+xml",
183            ImageFormat::Avif => "image/avif",
184            ImageFormat::Heic => "image/heic",
185            ImageFormat::Ico => "image/x-icon",
186            ImageFormat::Bmp => "image/bmp",
187            ImageFormat::Tiff => "image/tiff",
188            ImageFormat::Unknown => "application/octet-stream",
189        }
190    }
191}
192
193/// Image loading strategy
194#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
195#[serde(rename_all = "lowercase")]
196pub enum ImageLoading {
197    #[default]
198    Eager,
199    Lazy,
200}
201
202/// Srcset entry for responsive images
203#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
204pub struct SrcsetEntry {
205    /// Image URL
206    pub url: String,
207    /// Width descriptor (e.g., 800w)
208    pub width: Option<u32>,
209    /// Pixel density descriptor (e.g., 2x)
210    pub density: Option<f32>,
211}
212
213/// Extracted image
214#[derive(Debug, Clone, Serialize, Deserialize)]
215pub struct ImageMedia {
216    /// Original src attribute
217    pub src: String,
218    /// Resolved absolute URL
219    pub absolute_url: Option<String>,
220    /// Alt text
221    pub alt: Option<String>,
222    /// Title attribute
223    pub title: Option<String>,
224    /// Width in pixels
225    pub width: Option<u32>,
226    /// Height in pixels
227    pub height: Option<u32>,
228    /// Image format
229    pub format: ImageFormat,
230    /// MIME type
231    pub mime_type: Option<String>,
232    /// Loading strategy
233    pub loading: ImageLoading,
234    /// Is decorative (empty alt)
235    pub is_decorative: bool,
236    /// Srcset entries for responsive images
237    pub srcset: Vec<SrcsetEntry>,
238    /// Sizes attribute
239    pub sizes: Option<String>,
240    /// Data-src for lazy loading
241    pub data_src: Option<String>,
242    /// Is placeholder/low quality
243    pub is_placeholder: bool,
244    /// File size in bytes (after download)
245    pub size_bytes: Option<usize>,
246    /// Content hash (SHA256)
247    pub content_hash: Option<String>,
248    /// CSS classes
249    pub classes: Vec<String>,
250    /// ID attribute
251    pub id: Option<String>,
252}
253
254impl Default for ImageMedia {
255    fn default() -> Self {
256        Self {
257            src: String::new(),
258            absolute_url: None,
259            alt: None,
260            title: None,
261            width: None,
262            height: None,
263            format: ImageFormat::Unknown,
264            mime_type: None,
265            loading: ImageLoading::Eager,
266            is_decorative: false,
267            srcset: Vec::new(),
268            sizes: None,
269            data_src: None,
270            is_placeholder: false,
271            size_bytes: None,
272            content_hash: None,
273            classes: Vec::new(),
274            id: None,
275        }
276    }
277}
278
279// ============================================================================
280// VIDEO TYPES
281// ============================================================================
282
283/// Video platform for embedded videos
284#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
285#[serde(rename_all = "lowercase")]
286pub enum VideoPlatform {
287    YouTube,
288    Vimeo,
289    Dailymotion,
290    Twitch,
291    Facebook,
292    Twitter,
293    TikTok,
294    Wistia,
295    Brightcove,
296    JWPlayer,
297    VideoJs,
298    Html5,
299    Other,
300}
301
302impl VideoPlatform {
303    /// Detect platform from URL
304    pub fn from_url(url: &str) -> Self {
305        let url_lower = url.to_lowercase();
306        if url_lower.contains("youtube.com") || url_lower.contains("youtu.be") {
307            VideoPlatform::YouTube
308        } else if url_lower.contains("vimeo.com") {
309            VideoPlatform::Vimeo
310        } else if url_lower.contains("dailymotion.com") || url_lower.contains("dai.ly") {
311            VideoPlatform::Dailymotion
312        } else if url_lower.contains("twitch.tv") {
313            VideoPlatform::Twitch
314        } else if url_lower.contains("facebook.com") || url_lower.contains("fb.watch") {
315            VideoPlatform::Facebook
316        } else if url_lower.contains("twitter.com") || url_lower.contains("x.com") {
317            VideoPlatform::Twitter
318        } else if url_lower.contains("tiktok.com") {
319            VideoPlatform::TikTok
320        } else if url_lower.contains("wistia.com") || url_lower.contains("wistia.net") {
321            VideoPlatform::Wistia
322        } else if url_lower.contains("brightcove") {
323            VideoPlatform::Brightcove
324        } else if url_lower.contains("jwplayer") || url_lower.contains("jwplatform") {
325            VideoPlatform::JWPlayer
326        } else {
327            VideoPlatform::Other
328        }
329    }
330}
331
332/// Extracted video
333#[derive(Debug, Clone, Serialize, Deserialize)]
334pub struct VideoMedia {
335    /// Video source URL
336    pub src: String,
337    /// Resolved absolute URL
338    pub absolute_url: Option<String>,
339    /// Video platform
340    pub platform: VideoPlatform,
341    /// Video ID (platform-specific)
342    pub video_id: Option<String>,
343    /// Poster/thumbnail URL
344    pub poster: Option<String>,
345    /// Width in pixels
346    pub width: Option<u32>,
347    /// Height in pixels
348    pub height: Option<u32>,
349    /// Duration in seconds
350    pub duration: Option<f64>,
351    /// MIME type
352    pub mime_type: Option<String>,
353    /// Title
354    pub title: Option<String>,
355    /// Alternative sources
356    pub sources: Vec<VideoSource>,
357    /// Tracks (subtitles, captions)
358    pub tracks: Vec<VideoTrack>,
359    /// Autoplay enabled
360    pub autoplay: bool,
361    /// Loop enabled
362    pub loop_video: bool,
363    /// Muted
364    pub muted: bool,
365    /// Controls visible
366    pub controls: bool,
367    /// Playsinline (mobile)
368    pub playsinline: bool,
369    /// Embed URL (for iframes)
370    pub embed_url: Option<String>,
371    /// File size in bytes
372    pub size_bytes: Option<usize>,
373}
374
375impl Default for VideoMedia {
376    fn default() -> Self {
377        Self {
378            src: String::new(),
379            absolute_url: None,
380            platform: VideoPlatform::Html5,
381            video_id: None,
382            poster: None,
383            width: None,
384            height: None,
385            duration: None,
386            mime_type: None,
387            title: None,
388            sources: Vec::new(),
389            tracks: Vec::new(),
390            autoplay: false,
391            loop_video: false,
392            muted: false,
393            controls: true,
394            playsinline: false,
395            embed_url: None,
396            size_bytes: None,
397        }
398    }
399}
400
401/// Video source alternative
402#[derive(Debug, Clone, Serialize, Deserialize)]
403pub struct VideoSource {
404    pub src: String,
405    pub mime_type: Option<String>,
406    pub quality: Option<String>,
407}
408
409/// Video track (subtitles, captions)
410#[derive(Debug, Clone, Serialize, Deserialize)]
411pub struct VideoTrack {
412    pub src: String,
413    pub kind: TrackKind,
414    pub label: Option<String>,
415    pub srclang: Option<String>,
416    pub is_default: bool,
417}
418
419/// Track kind
420#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
421#[serde(rename_all = "lowercase")]
422#[derive(Default)]
423pub enum TrackKind {
424    #[default]
425    Subtitles,
426    Captions,
427    Descriptions,
428    Chapters,
429    Metadata,
430}
431
432
433// ============================================================================
434// AUDIO TYPES
435// ============================================================================
436
437/// Audio platform
438#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
439#[serde(rename_all = "lowercase")]
440pub enum AudioPlatform {
441    Spotify,
442    SoundCloud,
443    ApplePodcasts,
444    Anchor,
445    Podbean,
446    Buzzsprout,
447    Html5,
448    Other,
449}
450
451impl AudioPlatform {
452    pub fn from_url(url: &str) -> Self {
453        let url_lower = url.to_lowercase();
454        if url_lower.contains("spotify.com") || url_lower.contains("open.spotify") {
455            AudioPlatform::Spotify
456        } else if url_lower.contains("soundcloud.com") {
457            AudioPlatform::SoundCloud
458        } else if url_lower.contains("podcasts.apple.com") {
459            AudioPlatform::ApplePodcasts
460        } else if url_lower.contains("anchor.fm") {
461            AudioPlatform::Anchor
462        } else if url_lower.contains("podbean.com") {
463            AudioPlatform::Podbean
464        } else if url_lower.contains("buzzsprout.com") {
465            AudioPlatform::Buzzsprout
466        } else {
467            AudioPlatform::Other
468        }
469    }
470}
471
472/// Extracted audio
473#[derive(Debug, Clone, Serialize, Deserialize)]
474pub struct AudioMedia {
475    /// Audio source URL
476    pub src: String,
477    /// Resolved absolute URL
478    pub absolute_url: Option<String>,
479    /// Audio platform
480    pub platform: AudioPlatform,
481    /// Title
482    pub title: Option<String>,
483    /// Artist/Author
484    pub artist: Option<String>,
485    /// Album
486    pub album: Option<String>,
487    /// Duration in seconds
488    pub duration: Option<f64>,
489    /// MIME type
490    pub mime_type: Option<String>,
491    /// Alternative sources
492    pub sources: Vec<AudioSource>,
493    /// Autoplay
494    pub autoplay: bool,
495    /// Loop
496    pub loop_audio: bool,
497    /// Muted
498    pub muted: bool,
499    /// Controls visible
500    pub controls: bool,
501    /// Embed URL
502    pub embed_url: Option<String>,
503    /// File size in bytes
504    pub size_bytes: Option<usize>,
505}
506
507impl Default for AudioMedia {
508    fn default() -> Self {
509        Self {
510            src: String::new(),
511            absolute_url: None,
512            platform: AudioPlatform::Html5,
513            title: None,
514            artist: None,
515            album: None,
516            duration: None,
517            mime_type: None,
518            sources: Vec::new(),
519            autoplay: false,
520            loop_audio: false,
521            muted: false,
522            controls: true,
523            embed_url: None,
524            size_bytes: None,
525        }
526    }
527}
528
529/// Audio source alternative
530#[derive(Debug, Clone, Serialize, Deserialize)]
531pub struct AudioSource {
532    pub src: String,
533    pub mime_type: Option<String>,
534}
535
536// ============================================================================
537// DOCUMENT TYPES
538// ============================================================================
539
540/// Document type
541#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
542#[serde(rename_all = "lowercase")]
543pub enum DocumentType {
544    Pdf,
545    Word,
546    Excel,
547    PowerPoint,
548    Text,
549    Csv,
550    Epub,
551    Other,
552}
553
554impl DocumentType {
555    pub fn from_extension(ext: &str) -> Self {
556        match ext.to_lowercase().as_str() {
557            "pdf" => DocumentType::Pdf,
558            "doc" | "docx" | "odt" | "rtf" => DocumentType::Word,
559            "xls" | "xlsx" | "ods" => DocumentType::Excel,
560            "ppt" | "pptx" | "odp" => DocumentType::PowerPoint,
561            "txt" => DocumentType::Text,
562            "csv" => DocumentType::Csv,
563            "epub" => DocumentType::Epub,
564            _ => DocumentType::Other,
565        }
566    }
567}
568
569/// Extracted document
570#[derive(Debug, Clone, Serialize, Deserialize)]
571pub struct DocumentMedia {
572    /// Document URL
573    pub url: String,
574    /// Resolved absolute URL
575    pub absolute_url: Option<String>,
576    /// Document type
577    pub doc_type: DocumentType,
578    /// Filename
579    pub filename: Option<String>,
580    /// Title (from link text or title attribute)
581    pub title: Option<String>,
582    /// MIME type
583    pub mime_type: Option<String>,
584    /// File size in bytes
585    pub size_bytes: Option<usize>,
586    /// Page count (if known)
587    pub page_count: Option<u32>,
588}
589
590impl Default for DocumentMedia {
591    fn default() -> Self {
592        Self {
593            url: String::new(),
594            absolute_url: None,
595            doc_type: DocumentType::Other,
596            filename: None,
597            title: None,
598            mime_type: None,
599            size_bytes: None,
600            page_count: None,
601        }
602    }
603}
604
605// ============================================================================
606// EMBEDDED CONTENT TYPES
607// ============================================================================
608
609/// Embedded content type
610#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
611#[serde(rename_all = "lowercase")]
612pub enum EmbedType {
613    Iframe,
614    Object,
615    Embed,
616    Script,
617}
618
619/// Embedded content platform
620#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
621#[serde(rename_all = "lowercase")]
622pub enum EmbedPlatform {
623    YouTube,
624    Vimeo,
625    Dailymotion,
626    Twitch,
627    Wistia,
628    Twitter,
629    Instagram,
630    Facebook,
631    LinkedIn,
632    Pinterest,
633    TikTok,
634    Reddit,
635    Spotify,
636    SoundCloud,
637    ApplePodcasts,
638    GoogleMaps,
639    GoogleDocs,
640    CodePen,
641    JsFiddle,
642    CodeSandbox,
643    Gist,
644    SlideShare,
645    Giphy,
646    Typeform,
647    Calendly,
648    Stripe,
649    PayPal,
650    Scribd,
651    Other,
652}
653
654impl EmbedPlatform {
655    pub fn from_url(url: &str) -> Self {
656        let url_lower = url.to_lowercase();
657        if url_lower.contains("youtube.com") || url_lower.contains("youtube-nocookie.com") {
658            EmbedPlatform::YouTube
659        } else if url_lower.contains("player.vimeo.com") || url_lower.contains("vimeo.com") {
660            EmbedPlatform::Vimeo
661        } else if url_lower.contains("dailymotion.com") {
662            EmbedPlatform::Dailymotion
663        } else if url_lower.contains("twitch.tv") {
664            EmbedPlatform::Twitch
665        } else if url_lower.contains("wistia.com") || url_lower.contains("wistia.net") {
666            EmbedPlatform::Wistia
667        } else if url_lower.contains("platform.twitter.com") || url_lower.contains("twitter.com/") || url_lower.contains("x.com") {
668            EmbedPlatform::Twitter
669        } else if url_lower.contains("instagram.com") {
670            EmbedPlatform::Instagram
671        } else if url_lower.contains("facebook.com") || url_lower.contains("fb.com") {
672            EmbedPlatform::Facebook
673        } else if url_lower.contains("linkedin.com") {
674            EmbedPlatform::LinkedIn
675        } else if url_lower.contains("pinterest.com") {
676            EmbedPlatform::Pinterest
677        } else if url_lower.contains("tiktok.com") {
678            EmbedPlatform::TikTok
679        } else if url_lower.contains("reddit.com") || url_lower.contains("redd.it") {
680            EmbedPlatform::Reddit
681        } else if url_lower.contains("open.spotify.com") || url_lower.contains("spotify.com") {
682            EmbedPlatform::Spotify
683        } else if url_lower.contains("soundcloud.com") {
684            EmbedPlatform::SoundCloud
685        } else if url_lower.contains("podcasts.apple.com") {
686            EmbedPlatform::ApplePodcasts
687        } else if url_lower.contains("google.com/maps") || url_lower.contains("maps.google") {
688            EmbedPlatform::GoogleMaps
689        } else if url_lower.contains("docs.google.com") {
690            EmbedPlatform::GoogleDocs
691        } else if url_lower.contains("codepen.io") {
692            EmbedPlatform::CodePen
693        } else if url_lower.contains("jsfiddle.net") {
694            EmbedPlatform::JsFiddle
695        } else if url_lower.contains("codesandbox.io") {
696            EmbedPlatform::CodeSandbox
697        } else if url_lower.contains("gist.github.com") {
698            EmbedPlatform::Gist
699        } else if url_lower.contains("slideshare.net") {
700            EmbedPlatform::SlideShare
701        } else if url_lower.contains("giphy.com") {
702            EmbedPlatform::Giphy
703        } else if url_lower.contains("typeform.com") {
704            EmbedPlatform::Typeform
705        } else if url_lower.contains("calendly.com") {
706            EmbedPlatform::Calendly
707        } else if url_lower.contains("stripe.com") {
708            EmbedPlatform::Stripe
709        } else if url_lower.contains("paypal.com") {
710            EmbedPlatform::PayPal
711        } else if url_lower.contains("scribd.com") {
712            EmbedPlatform::Scribd
713        } else {
714            EmbedPlatform::Other
715        }
716    }
717}
718
719/// Extracted embedded content
720#[derive(Debug, Clone, Serialize, Deserialize)]
721pub struct EmbeddedMedia {
722    /// Source URL
723    pub url: String,
724    /// Resolved absolute URL
725    pub absolute_url: Option<String>,
726    /// Platform
727    pub platform: EmbedPlatform,
728    /// Title
729    pub title: Option<String>,
730    /// Width
731    pub width: Option<u32>,
732    /// Height
733    pub height: Option<u32>,
734    /// Allow attributes (for iframes)
735    pub allow: Option<String>,
736    /// Sandbox attributes
737    pub sandbox: Option<String>,
738    /// Loading strategy
739    pub loading: Option<String>,
740    /// Frameborder
741    pub frameborder: Option<String>,
742}
743
744impl Default for EmbeddedMedia {
745    fn default() -> Self {
746        Self {
747            url: String::new(),
748            absolute_url: None,
749            platform: EmbedPlatform::Other,
750            title: None,
751            width: None,
752            height: None,
753            allow: None,
754            sandbox: None,
755            loading: None,
756            frameborder: None,
757        }
758    }
759}
760
761// ============================================================================
762// LINK TYPES
763// ============================================================================
764
765/// Link type
766#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
767#[serde(rename_all = "lowercase")]
768pub enum LinkType {
769    #[default]
770    Internal,
771    External,
772    Mailto,
773    Tel,
774    Download,
775    Anchor,
776}
777
778/// Extracted link
779#[derive(Debug, Clone, Serialize, Deserialize)]
780pub struct LinkMedia {
781    /// Original href
782    pub href: String,
783    /// Resolved absolute URL
784    pub absolute_url: Option<String>,
785    /// Link text
786    pub text: String,
787    /// Title attribute
788    pub title: Option<String>,
789    /// Rel attribute values
790    pub rel: Vec<String>,
791    /// Link type
792    pub link_type: LinkType,
793    /// Is nofollow
794    pub is_nofollow: bool,
795    /// Is sponsored
796    pub is_sponsored: bool,
797    /// Is UGC (user generated content)
798    pub is_ugc: bool,
799    /// Target attribute
800    pub target: Option<String>,
801    /// Download attribute (filename)
802    pub download: Option<String>,
803    /// Hreflang
804    pub hreflang: Option<String>,
805    /// Detected media type (for downloads)
806    pub media_type: Option<MediaType>,
807}
808
809impl Default for LinkMedia {
810    fn default() -> Self {
811        Self {
812            href: String::new(),
813            absolute_url: None,
814            text: String::new(),
815            title: None,
816            rel: Vec::new(),
817            link_type: LinkType::Internal,
818            is_nofollow: false,
819            is_sponsored: false,
820            is_ugc: false,
821            target: None,
822            download: None,
823            hreflang: None,
824            media_type: None,
825        }
826    }
827}
828
829// ============================================================================
830// CONFIGURATION
831// ============================================================================
832
833/// Configuration for media extraction
834#[derive(Debug, Clone, Serialize, Deserialize)]
835pub struct MediaConfig {
836    /// Extract images
837    pub extract_images: bool,
838    /// Extract videos
839    pub extract_videos: bool,
840    /// Extract audio
841    pub extract_audio: bool,
842    /// Extract documents
843    pub extract_documents: bool,
844    /// Extract embedded content
845    pub extract_embeds: bool,
846    /// Extract links
847    pub extract_links: bool,
848    /// Include data URLs
849    pub include_data_urls: bool,
850    /// Filter placeholder images
851    pub filter_placeholders: bool,
852    /// Minimum image width
853    pub min_image_width: Option<u32>,
854    /// Minimum image height
855    pub min_image_height: Option<u32>,
856    /// Download settings
857    pub download: DownloadConfig,
858}
859
860impl Default for MediaConfig {
861    fn default() -> Self {
862        Self {
863            extract_images: true,
864            extract_videos: true,
865            extract_audio: true,
866            extract_documents: true,
867            extract_embeds: true,
868            extract_links: true,
869            include_data_urls: false,
870            filter_placeholders: true,
871            min_image_width: None,
872            min_image_height: None,
873            download: DownloadConfig::default(),
874        }
875    }
876}
877
878impl MediaConfig {
879    /// Create minimal config (only images and links)
880    pub fn minimal() -> Self {
881        Self {
882            extract_images: true,
883            extract_videos: false,
884            extract_audio: false,
885            extract_documents: false,
886            extract_embeds: false,
887            extract_links: true,
888            ..Default::default()
889        }
890    }
891
892    /// Create full config (extract everything)
893    pub fn full() -> Self {
894        Self {
895            extract_images: true,
896            extract_videos: true,
897            extract_audio: true,
898            extract_documents: true,
899            extract_embeds: true,
900            extract_links: true,
901            include_data_urls: true,
902            ..Default::default()
903        }
904    }
905}
906
907/// Download configuration
908#[derive(Debug, Clone, Serialize, Deserialize)]
909pub struct DownloadConfig {
910    /// Maximum file size in bytes
911    pub max_file_size: Option<u64>,
912    /// Maximum concurrent downloads
913    pub max_concurrent: usize,
914    /// Timeout in seconds
915    pub timeout_secs: u64,
916    /// Encode to base64
917    pub encode_base64: bool,
918    /// Maximum retries
919    pub max_retries: u32,
920    /// Retry delay in milliseconds
921    pub retry_delay_ms: u64,
922    /// User agent for downloads
923    pub user_agent: String,
924}
925
926impl Default for DownloadConfig {
927    fn default() -> Self {
928        Self {
929            max_file_size: Some(50 * 1024 * 1024), // 50MB
930            max_concurrent: 10,
931            timeout_secs: 30,
932            encode_base64: false,
933            max_retries: 2,
934            retry_delay_ms: 1000,
935            user_agent: "halldyll-media/1.0".to_string(),
936        }
937    }
938}
939
940/// Download result
941#[derive(Debug, Clone)]
942pub struct DownloadResult {
943    /// Original URL
944    pub url: String,
945    /// Downloaded bytes
946    pub bytes: Bytes,
947    /// Content type
948    pub content_type: Option<String>,
949    /// File size
950    pub size: u64,
951    /// SHA256 hash
952    pub hash: String,
953    /// Media type
954    pub media_type: MediaType,
955    /// Base64 encoded (if configured)
956    pub base64: Option<String>,
957}
958
959// ============================================================================
960// EXTRACTED MEDIA COLLECTION
961// ============================================================================
962
963/// All extracted media from a page
964#[derive(Debug, Clone, Default, Serialize, Deserialize)]
965pub struct ExtractedMedia {
966    /// Images
967    pub images: Vec<ImageMedia>,
968    /// Videos
969    pub videos: Vec<VideoMedia>,
970    /// Audio
971    pub audio: Vec<AudioMedia>,
972    /// Documents
973    pub documents: Vec<DocumentMedia>,
974    /// Embedded content
975    pub embeds: Vec<EmbeddedMedia>,
976    /// Links
977    pub links: Vec<LinkMedia>,
978}
979
980impl ExtractedMedia {
981    pub fn new() -> Self {
982        Self::default()
983    }
984
985    /// Get total media count
986    pub fn total_count(&self) -> usize {
987        self.images.len() 
988            + self.videos.len() 
989            + self.audio.len() 
990            + self.documents.len()
991            + self.embeds.len()
992            + self.links.len()
993    }
994
995    /// Check if empty
996    pub fn is_empty(&self) -> bool {
997        self.total_count() == 0
998    }
999
1000    /// Check if has any media
1001    pub fn has_media(&self) -> bool {
1002        !self.is_empty()
1003    }
1004
1005    /// Get all media URLs
1006    pub fn all_urls(&self) -> Vec<String> {
1007        let mut urls = Vec::new();
1008        
1009        for img in &self.images {
1010            if let Some(url) = &img.absolute_url {
1011                urls.push(url.clone());
1012            }
1013        }
1014        
1015        for vid in &self.videos {
1016            if let Some(url) = &vid.absolute_url {
1017                urls.push(url.clone());
1018            }
1019        }
1020        
1021        for aud in &self.audio {
1022            if let Some(url) = &aud.absolute_url {
1023                urls.push(url.clone());
1024            }
1025        }
1026        
1027        for doc in &self.documents {
1028            if let Some(url) = &doc.absolute_url {
1029                urls.push(url.clone());
1030            }
1031        }
1032        
1033        for emb in &self.embeds {
1034            if let Some(url) = &emb.absolute_url {
1035                urls.push(url.clone());
1036            }
1037        }
1038        
1039        urls
1040    }
1041
1042    /// Get all image URLs
1043    pub fn image_urls(&self) -> Vec<&str> {
1044        self.images.iter()
1045            .filter_map(|i| i.absolute_url.as_deref())
1046            .collect()
1047    }
1048
1049    /// Get all video URLs
1050    pub fn video_urls(&self) -> Vec<&str> {
1051        self.videos.iter()
1052            .filter_map(|v| v.absolute_url.as_deref())
1053            .collect()
1054    }
1055}
1056
1057// ============================================================================
1058// TESTS
1059// ============================================================================
1060
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension lookup ignores case and maps unknown extensions to `Other`.
    #[test]
    fn test_media_type_from_extension() {
        let cases = [
            ("jpg", MediaType::Image),
            ("PNG", MediaType::Image),
            ("mp4", MediaType::Video),
            ("mp3", MediaType::Audio),
            ("pdf", MediaType::Document),
            ("xyz", MediaType::Other),
        ];
        for (ext, expected) in cases {
            assert_eq!(MediaType::from_extension(ext), expected);
        }
    }

    /// MIME strings map onto the matching media category.
    #[test]
    fn test_media_type_from_mime() {
        let cases = [
            ("image/jpeg", MediaType::Image),
            ("video/mp4", MediaType::Video),
            ("audio/mpeg", MediaType::Audio),
            ("application/pdf", MediaType::Document),
        ];
        for (mime, expected) in cases {
            assert_eq!(MediaType::from_mime(mime), expected);
        }
    }

    /// Image formats resolve from extension and MIME, and report their MIME type.
    #[test]
    fn test_image_format() {
        let from_jpg = ImageFormat::from_extension("jpg");
        let from_webp = ImageFormat::from_extension("webp");
        let from_png_mime = ImageFormat::from_mime("image/png");

        assert_eq!(from_jpg, ImageFormat::Jpeg);
        assert_eq!(from_webp, ImageFormat::WebP);
        assert_eq!(from_png_mime, ImageFormat::Png);
        assert_eq!(ImageFormat::Png.mime_type(), "image/png");
    }

    /// Known video hosts are recognised; a plain file URL is `Other`.
    #[test]
    fn test_video_platform_detection() {
        let cases = [
            ("https://youtube.com/watch?v=abc", VideoPlatform::YouTube),
            ("https://vimeo.com/123", VideoPlatform::Vimeo),
            ("https://twitch.tv/channel", VideoPlatform::Twitch),
            ("https://example.com/video.mp4", VideoPlatform::Other),
        ];
        for (url, expected) in cases {
            assert_eq!(VideoPlatform::from_url(url), expected);
        }
    }

    /// Spotify and SoundCloud URLs are recognised as audio platforms.
    #[test]
    fn test_audio_platform_detection() {
        let cases = [
            ("https://open.spotify.com/track/abc", AudioPlatform::Spotify),
            ("https://soundcloud.com/artist/track", AudioPlatform::SoundCloud),
        ];
        for (url, expected) in cases {
            assert_eq!(AudioPlatform::from_url(url), expected);
        }
    }

    /// Embed URLs resolve to their hosting platform.
    #[test]
    fn test_embed_platform_detection() {
        let cases = [
            ("https://www.youtube.com/embed/abc", EmbedPlatform::YouTube),
            ("https://player.vimeo.com/video/123", EmbedPlatform::Vimeo),
            ("https://codepen.io/user/pen/abc", EmbedPlatform::CodePen),
        ];
        for (url, expected) in cases {
            assert_eq!(EmbedPlatform::from_url(url), expected);
        }
    }

    /// Document extensions map onto the matching document type.
    #[test]
    fn test_document_type() {
        let cases = [
            ("pdf", DocumentType::Pdf),
            ("docx", DocumentType::Word),
            ("xlsx", DocumentType::Excel),
        ];
        for (ext, expected) in cases {
            assert_eq!(DocumentType::from_extension(ext), expected);
        }
    }

    /// A new collection is empty; pushing one image makes it non-empty.
    #[test]
    fn test_extracted_media() {
        let mut collection = ExtractedMedia::new();
        assert_eq!(collection.total_count(), 0);
        assert!(!collection.has_media());

        collection.images.push(ImageMedia::default());
        assert_eq!(collection.total_count(), 1);
        assert!(collection.has_media());
    }

    /// Default config extracts images and allows downloads; minimal skips videos.
    #[test]
    fn test_media_config() {
        let default_cfg = MediaConfig::default();
        assert!(default_cfg.extract_images);
        assert!(default_cfg.download.max_concurrent > 0);

        let minimal_cfg = MediaConfig::minimal();
        assert!(minimal_cfg.extract_images);
        assert!(!minimal_cfg.extract_videos);
    }
}
halldyll_media/types.rs

halldyll_media/
types.rs