halldyll_core/parse/
router.rs

1//! Router - Routing by Content-Type
2
3
/// Detected content type of a document, either declared by a `Content-Type`
/// header or sniffed from the leading bytes of the body.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentType {
    /// HTML document
    Html,
    /// XHTML document
    Xhtml,
    /// XML document
    Xml,
    /// JSON document
    Json,
    /// PDF document
    Pdf,
    /// Plain text
    PlainText,
    /// Image with subtype (e.g., "png", "jpeg")
    Image(String),
    /// Video with subtype (e.g., "mp4", "webm")
    Video(String),
    /// Audio with subtype (e.g., "mp3", "ogg")
    Audio(String),
    /// Binary data
    Binary,
    /// Unknown content type
    Unknown,
}

impl ContentType {
    /// Parse from a `Content-Type` header value.
    ///
    /// Everything after the first `;` (parameters such as `charset=utf-8`) is
    /// ignored, surrounding whitespace is trimmed and the media type is
    /// lowercased before matching. For `image/*`, `video/*` and `audio/*` the
    /// lowercased subtype is carried in the returned variant.
    ///
    /// Media types using the `+xml` / `+json` structured-syntax suffix
    /// (e.g. `application/rss+xml`, `application/ld+json`) are classified as
    /// [`ContentType::Xml`] / [`ContentType::Json`].
    ///
    /// Returns [`ContentType::Unknown`] when the header is absent or the media
    /// type is not recognised.
    pub fn from_header(content_type: Option<&str>) -> Self {
        let ct = match content_type {
            Some(raw) => raw.split(';').next().unwrap_or("").trim().to_lowercase(),
            None => return ContentType::Unknown,
        };

        // Exact media types first.
        match ct.as_str() {
            "text/html" => return ContentType::Html,
            "application/xhtml+xml" => return ContentType::Xhtml,
            "application/xml" | "text/xml" => return ContentType::Xml,
            "application/json" | "text/json" => return ContentType::Json,
            "application/pdf" => return ContentType::Pdf,
            "text/plain" => return ContentType::PlainText,
            _ => {}
        }

        // Media families: keep the subtype so callers can tell formats apart.
        if let Some(subtype) = ct.strip_prefix("image/") {
            return ContentType::Image(subtype.to_string());
        }
        if let Some(subtype) = ct.strip_prefix("video/") {
            return ContentType::Video(subtype.to_string());
        }
        if let Some(subtype) = ct.strip_prefix("audio/") {
            return ContentType::Audio(subtype.to_string());
        }
        if ct.starts_with("application/octet-stream") {
            return ContentType::Binary;
        }

        // Structured-syntax suffixes. Checked last so that `image/svg+xml`
        // stays an image and `application/xhtml+xml` stays XHTML.
        if ct.ends_with("+xml") {
            return ContentType::Xml;
        }
        if ct.ends_with("+json") {
            return ContentType::Json;
        }

        ContentType::Unknown
    }

    /// Detect from content (magic bytes, then text heuristics).
    ///
    /// Binary signatures (PDF, PNG, JPEG, GIF, WebP) are checked first. If none
    /// match, at most the first 1024 bytes are decoded lossily as UTF-8 and
    /// inspected: an HTML doctype or `<html` tag anywhere in that window yields
    /// [`ContentType::Html`]; otherwise a leading `<?xml` yields
    /// [`ContentType::Xml`] and a leading `{` or `[` yields
    /// [`ContentType::Json`]. A UTF-8 byte-order mark and leading whitespace
    /// are skipped before the XML/JSON checks.
    ///
    /// Returns [`ContentType::Unknown`] when nothing matches, including for
    /// empty input.
    pub fn detect_from_content(content: &[u8]) -> Self {
        /// Number of leading bytes inspected by the text heuristics.
        const SNIFF_WINDOW: usize = 1024;

        // No minimum-length guard: `starts_with` and `get` are length-safe, and
        // short bodies such as `{}` or `[]` are still worth classifying.

        // PDF
        if content.starts_with(b"%PDF") {
            return ContentType::Pdf;
        }

        // Images
        if content.starts_with(b"\x89PNG") {
            return ContentType::Image("png".to_string());
        }
        if content.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return ContentType::Image("jpeg".to_string());
        }
        if content.starts_with(b"GIF87a") || content.starts_with(b"GIF89a") {
            return ContentType::Image("gif".to_string());
        }
        // RIFF container: bytes 4..8 hold the chunk size, bytes 8..12 the form
        // type. `get` returns `None` for buffers shorter than 12 bytes.
        if content.starts_with(b"RIFF") && content.get(8..12) == Some(&b"WEBP"[..]) {
            return ContentType::Image("webp".to_string());
        }

        // HTML (heuristic)
        let window = &content[..content.len().min(SNIFF_WINDOW)];
        let head = String::from_utf8_lossy(window);
        // The needles are pure ASCII, so ASCII-only case folding is sufficient.
        let head_lower = head.to_ascii_lowercase();
        if head_lower.contains("<!doctype html") || head_lower.contains("<html") {
            return ContentType::Html;
        }

        // `trim_start` does not treat U+FEFF as whitespace, so drop a leading
        // byte-order mark explicitly before looking at the first character.
        let text = head.trim_start_matches('\u{feff}').trim_start();

        // XML
        if text.starts_with("<?xml") {
            return ContentType::Xml;
        }

        // JSON
        if text.starts_with('{') || text.starts_with('[') {
            return ContentType::Json;
        }

        ContentType::Unknown
    }

    /// Is it parseable as HTML? True for [`ContentType::Html`] and
    /// [`ContentType::Xhtml`].
    pub fn is_html(&self) -> bool {
        matches!(self, ContentType::Html | ContentType::Xhtml)
    }

    /// Is it text? True for HTML, XHTML, XML, JSON and plain text.
    pub fn is_text(&self) -> bool {
        matches!(
            self,
            ContentType::Html
                | ContentType::Xhtml
                | ContentType::Xml
                | ContentType::Json
                | ContentType::PlainText
        )
    }
}
123
124/// Content router
125pub struct ContentRouter;
126
127impl ContentRouter {
128    /// Determine the content type
129    pub fn detect(content_type_header: Option<&str>, content: &[u8]) -> ContentType {
130        // Priority to header
131        let from_header = ContentType::from_header(content_type_header);
132        if from_header != ContentType::Unknown {
133            return from_header;
134        }
135
136        // Otherwise detect from content
137        ContentType::detect_from_content(content)
138    }
139
140    /// Is the content extractable (HTML/XML/text)?
141    pub fn is_extractable(content_type: &ContentType) -> bool {
142        matches!(
143            content_type,
144            ContentType::Html | ContentType::Xhtml | ContentType::Xml | ContentType::PlainText
145        )
146    }
147}