halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Response - HTTP response handling

use bytes::Bytes;
use reqwest::header::HeaderMap;
use reqwest::StatusCode;
use std::collections::HashMap;
use url::Url;

use crate::types::provenance::{RedirectHop, RequestTimings};

/// HTTP response together with metadata about how it was obtained.
///
/// All fields are public. [`FetchResponse::new`] fills in the URL, status,
/// headers, and body, and gives the remaining fields neutral defaults
/// (empty redirect chain, default timings, no compressed size).
#[derive(Debug)]
pub struct FetchResponse {
    /// Final URL after redirects
    pub final_url: Url,
    /// HTTP status code of the response
    pub status: StatusCode,
    /// Response headers as received
    pub headers: HeaderMap,
    /// Response body (raw bytes, not decoded to text)
    pub body: Bytes,
    /// Redirect chain; empty when built via `new` until a caller fills it in
    pub redirect_chain: Vec<RedirectHop>,
    /// Request timings; `RequestTimings::default()` when built via `new`
    pub timings: RequestTimings,
    /// Size before decompression (presumably in bytes — confirm with the
    /// code that sets it); `None` when built via `new`
    pub compressed_size: Option<u64>,
    /// Is this a 304 Not Modified response?
    ///
    /// `new` derives this from `status`, but because the field is public it
    /// can be changed independently afterwards.
    pub not_modified: bool,
}

impl FetchResponse {
    /// Creates a new response from the final URL, status, headers, and body.
    ///
    /// The remaining fields get neutral defaults: an empty redirect chain,
    /// `RequestTimings::default()`, and no compressed size. `not_modified`
    /// is set to `true` only when `status` is 304 Not Modified.
    pub fn new(final_url: Url, status: StatusCode, headers: HeaderMap, body: Bytes) -> Self {
        let not_modified = status == StatusCode::NOT_MODIFIED;
        Self {
            final_url,
            status,
            headers,
            body,
            redirect_chain: Vec::new(),
            timings: RequestTimings::default(),
            compressed_size: None,
            not_modified,
        }
    }

    /// Returns the first value of header `name` as text.
    ///
    /// Yields `None` when the header is absent or when its value cannot be
    /// converted with `HeaderValue::to_str`. If the header occurs several
    /// times, only the first value is considered.
    fn header_str(&self, name: &str) -> Option<&str> {
        self.headers.get(name).and_then(|value| value.to_str().ok())
    }

    /// Response Content-Type, without parameters.
    ///
    /// Everything from the first `;` onwards is dropped; the remainder is
    /// trimmed and lowercased (e.g. `Text/HTML; charset=UTF-8` -> `text/html`).
    /// Returns `None` when the header is missing or not readable as text.
    pub fn content_type(&self) -> Option<String> {
        let raw = self.header_str("content-type")?;
        let media_type = raw.split_once(';').map_or(raw, |(head, _params)| head);
        Some(media_type.trim().to_lowercase())
    }

    /// Response charset, taken from the `charset` parameter of Content-Type.
    ///
    /// The parameter name is matched case-insensitively and whitespace around
    /// `=` is tolerated. The value is trimmed, stripped of surrounding double
    /// quotes (parameter values may be quoted strings, e.g.
    /// `charset="utf-8"`), and lowercased.
    ///
    /// Returns `None` when the header is missing or unreadable, when it has
    /// no `charset` parameter, or when the parameter value is empty.
    pub fn charset(&self) -> Option<String> {
        self.header_str("content-type")?
            .split(';')
            // Segments without '=' (such as the media type itself) are skipped.
            .filter_map(|segment| segment.split_once('='))
            .find(|(key, _)| key.trim().eq_ignore_ascii_case("charset"))
            .map(|(_, value)| {
                // `to_str` only succeeds for ASCII values, so ASCII
                // lowercasing covers every character that can appear here.
                value.trim().trim_matches('"').trim().to_ascii_lowercase()
            })
            // An empty label carries no information; report it as absent.
            .filter(|label| !label.is_empty())
    }

    /// Response ETag, verbatim (quotes and any `W/` prefix are kept).
    pub fn etag(&self) -> Option<String> {
        self.header_str("etag").map(str::to_owned)
    }

    /// Response Last-Modified, as the raw header text (not parsed as a date).
    pub fn last_modified(&self) -> Option<String> {
        self.header_str("last-modified").map(str::to_owned)
    }

    /// Response Cache-Control, as the raw header text (directives not parsed).
    pub fn cache_control(&self) -> Option<String> {
        self.header_str("cache-control").map(str::to_owned)
    }

    /// X-Robots-Tag header, as the raw header text.
    ///
    /// Only the first X-Robots-Tag value is returned if the header is
    /// repeated.
    pub fn x_robots_tag(&self) -> Option<String> {
        self.header_str("x-robots-tag").map(str::to_owned)
    }

    /// Headers as HashMap.
    ///
    /// Values that cannot be converted with `HeaderValue::to_str` are
    /// omitted. Because the map holds one value per name, a header that is
    /// repeated in the response keeps only a single value here.
    pub fn headers_map(&self) -> HashMap<String, String> {
        self.headers
            .iter()
            .filter_map(|(name, value)| {
                let text = value.to_str().ok()?;
                Some((name.to_string(), text.to_string()))
            })
            .collect()
    }

    /// Body size: the length of `body` in bytes.
    pub fn body_size(&self) -> u64 {
        self.body.len() as u64
    }

    /// Is this a success (2xx)?
    pub fn is_success(&self) -> bool {
        self.status.is_success()
    }

    /// Is this a client error (4xx)?
    pub fn is_client_error(&self) -> bool {
        self.status.is_client_error()
    }

    /// Is this a server error (5xx)?
    pub fn is_server_error(&self) -> bool {
        self.status.is_server_error()
    }

    /// Is this a rate limit (429)?
    pub fn is_rate_limited(&self) -> bool {
        self.status == StatusCode::TOO_MANY_REQUESTS
    }

    /// Is this forbidden (403)?
    pub fn is_forbidden(&self) -> bool {
        self.status == StatusCode::FORBIDDEN
    }

    /// Body as UTF-8 string.
    ///
    /// The body bytes are copied into a new `String`. The declared charset is
    /// not consulted: the bytes are always interpreted as UTF-8.
    ///
    /// # Errors
    ///
    /// Returns `FromUtf8Error` when the body is not valid UTF-8.
    pub fn text(&self) -> Result<String, std::string::FromUtf8Error> {
        String::from_utf8(self.body.to_vec())
    }
}