// halldyll-core 0.1.0
//
// Core scraping engine for Halldyll - high-performance async web scraper for AI agents
// Documentation
//! Router - Routing by Content-Type


/// Detected content type
///
/// Values are produced either from a `Content-Type` header value
/// (`ContentType::from_header`) or by inspecting the first bytes of a body
/// (`ContentType::detect_from_content`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ContentType {
    /// HTML document (e.g. header `text/html`, or a body whose beginning
    /// contains `<!doctype html` / `<html`)
    Html,
    /// XHTML document (header `application/xhtml+xml`)
    Xhtml,
    /// XML document (e.g. header `application/xml` / `text/xml`, or a body
    /// starting with `<?xml`)
    Xml,
    /// JSON document (e.g. header `application/json` / `text/json`, or a body
    /// starting with `{` or `[`)
    Json,
    /// PDF document (header `application/pdf`, or a body starting with `%PDF`)
    Pdf,
    /// Plain text (header `text/plain`)
    PlainText,
    /// Image with subtype (e.g., "png", "jpeg")
    ///
    /// From a header, the subtype is the lower-cased text following `image/`;
    /// from magic bytes it is one of "png", "jpeg", "gif" or "webp".
    Image(String),
    /// Video with subtype (e.g., "mp4", "webm")
    ///
    /// The subtype is the lower-cased text following `video/` in the header.
    Video(String),
    /// Audio with subtype (e.g., "mp3", "ogg")
    ///
    /// The subtype is the lower-cased text following `audio/` in the header.
    Audio(String),
    /// Binary data (header `application/octet-stream`)
    Binary,
    /// Unknown content type: the header was missing or not recognized, or no
    /// byte signature / text heuristic matched
    Unknown,
}

impl ContentType {
    /// Parse from a Content-Type header
    ///
    /// Parameters after the first `;` (such as `charset=utf-8`) are dropped,
    /// and the remaining media type is trimmed and lower-cased before matching.
    ///
    /// In addition to the exact media types matched below:
    /// - any `image/*`, `video/*` or `audio/*` type is returned together with
    ///   its subtype;
    /// - media types carrying a `+json` or `+xml` structured-syntax suffix
    ///   (e.g. `application/ld+json`, `application/rss+xml`) are classified as
    ///   [`ContentType::Json`] and [`ContentType::Xml`] respectively.
    ///
    /// Returns [`ContentType::Unknown`] when the header is absent or the media
    /// type is not recognized.
    pub fn from_header(content_type: Option<&str>) -> Self {
        let ct = match content_type {
            // Keep only the media type itself; `; charset=...` and friends are ignored.
            Some(raw) => raw.split(';').next().unwrap_or("").trim().to_lowercase(),
            None => return ContentType::Unknown,
        };

        match ct.as_str() {
            "text/html" => ContentType::Html,
            "application/xhtml+xml" => ContentType::Xhtml,
            "application/xml" | "text/xml" => ContentType::Xml,
            "application/json" | "text/json" => ContentType::Json,
            "application/pdf" => ContentType::Pdf,
            "text/plain" => ContentType::PlainText,
            other => {
                // Media families are checked before the structured-syntax
                // suffixes so that e.g. `image/svg+xml` stays an image.
                if let Some(subtype) = other.strip_prefix("image/") {
                    ContentType::Image(subtype.to_string())
                } else if let Some(subtype) = other.strip_prefix("video/") {
                    ContentType::Video(subtype.to_string())
                } else if let Some(subtype) = other.strip_prefix("audio/") {
                    ContentType::Audio(subtype.to_string())
                } else if other.starts_with("application/octet-stream") {
                    ContentType::Binary
                } else if other.ends_with("+json") {
                    ContentType::Json
                } else if other.ends_with("+xml") {
                    ContentType::Xml
                } else {
                    ContentType::Unknown
                }
            }
        }
    }

    /// Detect from content (magic bytes)
    ///
    /// Checks, in order:
    /// 1. binary signatures: PDF (`%PDF`), PNG, JPEG, GIF (`GIF87a`/`GIF89a`)
    ///    and WebP (`RIFF....WEBP`);
    /// 2. an HTML heuristic: `<!doctype html` or `<html` (case-insensitive)
    ///    anywhere in the first 1024 bytes, decoded lossily as UTF-8;
    /// 3. an XML declaration (`<?xml`) at the start of the text;
    /// 4. a JSON-looking start (`{` or `[`).
    ///
    /// For steps 3 and 4, leading whitespace and a UTF-8 byte-order mark are
    /// skipped. Every check is length-safe, so short bodies such as `{}` are
    /// classified as well.
    ///
    /// Returns [`ContentType::Unknown`] when nothing matches (including for
    /// empty input).
    pub fn detect_from_content(content: &[u8]) -> Self {
        /// Number of leading bytes examined by the text heuristics.
        const SNIFF_WINDOW: usize = 1024;
        /// Byte-order mark as it appears after UTF-8 decoding.
        const BOM: char = '\u{feff}';

        // PDF
        if content.starts_with(b"%PDF") {
            return ContentType::Pdf;
        }

        // Images
        if content.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
            return ContentType::Image("png".to_string());
        }
        if content.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return ContentType::Image("jpeg".to_string());
        }
        if content.starts_with(b"GIF87a") || content.starts_with(b"GIF89a") {
            return ContentType::Image("gif".to_string());
        }
        // RIFF container: bytes 4..8 hold the chunk size, bytes 8..12 the form
        // type. `get` yields `None` for buffers shorter than 12 bytes.
        if content.starts_with(b"RIFF") && content.get(8..12) == Some(&b"WEBP"[..]) {
            return ContentType::Image("webp".to_string());
        }

        // Text heuristics only look at a bounded prefix of the body.
        let window = &content[..content.len().min(SNIFF_WINDOW)];
        let head = String::from_utf8_lossy(window);

        // HTML (heuristic)
        let head_lower = head.to_lowercase();
        if head_lower.contains("<!doctype html") || head_lower.contains("<html") {
            return ContentType::Html;
        }

        // `char::is_whitespace` does not cover U+FEFF, so the BOM has to be
        // stripped explicitly or BOM-prefixed XML/JSON would go undetected.
        let body = head.trim_start_matches(|c: char| c == BOM || c.is_whitespace());

        // XML
        if body.starts_with("<?xml") {
            return ContentType::Xml;
        }

        // JSON
        if body.starts_with('{') || body.starts_with('[') {
            return ContentType::Json;
        }

        ContentType::Unknown
    }

    /// Is it parseable as HTML?
    ///
    /// True for [`ContentType::Html`] and [`ContentType::Xhtml`].
    pub fn is_html(&self) -> bool {
        matches!(self, ContentType::Html | ContentType::Xhtml)
    }

    /// Is it text?
    ///
    /// True for the HTML, XHTML, XML, JSON and plain-text variants; false for
    /// every media, binary or unknown variant.
    pub fn is_text(&self) -> bool {
        matches!(
            self,
            ContentType::Html
                | ContentType::Xhtml
                | ContentType::Xml
                | ContentType::Json
                | ContentType::PlainText
        )
    }
}

/// Content router
pub struct ContentRouter;

impl ContentRouter {
    /// Determine the content type
    pub fn detect(content_type_header: Option<&str>, content: &[u8]) -> ContentType {
        // Priority to header
        let from_header = ContentType::from_header(content_type_header);
        if from_header != ContentType::Unknown {
            return from_header;
        }

        // Otherwise detect from content
        ContentType::detect_from_content(content)
    }

    /// Is the content extractable (HTML/XML/text)?
    pub fn is_extractable(content_type: &ContentType) -> bool {
        matches!(
            content_type,
            ContentType::Html | ContentType::Xhtml | ContentType::Xml | ContentType::PlainText
        )
    }
}