//! Single-page scrape operation.

use tl::ParserOptions;
use url::Url;

use crate::assets;
use crate::browser_detect;
use crate::error::CrawlError;
use crate::html::{
    detect_charset, detect_nofollow, detect_noindex, extract_page_data, is_binary_content_type, is_binary_url,
    is_html_content, is_pdf_content,
};
use crate::http::{build_client, http_fetch};
use crate::normalize::robots_url;
use crate::robots::{is_path_allowed, parse_robots_txt};
use crate::types::{CrawlConfig, ScrapeResult};

/// Build a `ScrapeResult` from a Tower [`CrawlResponse`](crate::tower::CrawlResponse).
///
/// Runs the extraction pipeline (metadata, links, images, feeds, JSON-LD, assets)
/// on the response body returned by the Tower service stack.
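///
/// # Errors
///
/// Returns a [`CrawlError`] if `url` is not a valid URL, if the HTTP client
/// cannot be built, or if the response body fails to parse as HTML.
///
/// # Example
///
/// A minimal usage sketch (not compiled as a doctest): it assumes `CrawlConfig`
/// implements `Default` and that `resp` is a [`CrawlResponse`](crate::tower::CrawlResponse)
/// already produced by the Tower service stack.
///
/// ```ignore
/// let config = CrawlConfig::default(); // assumption: a Default impl exists
/// // `resp`: crate::tower::CrawlResponse returned by the Tower service stack
/// let scrape = scrape_from_crawl_response("https://example.com/", &resp, &config).await?;
/// println!("status = {}, charset = {:?}", scrape.status_code, scrape.detected_charset);
/// ```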
pub(crate) async fn scrape_from_crawl_response(
    url: &str,
    resp: &crate::tower::CrawlResponse,
    config: &CrawlConfig,
) -> Result<ScrapeResult, CrawlError> {
    let parsed_url = Url::parse(url).map_err(|e| CrawlError::Other(format!("invalid URL: {e}")))?;
    let client = build_client(config)?;
    let auth_header_sent = config.auth.is_some();

    // Check robots.txt; if it cannot be fetched, crawling stays allowed and no crawl delay is applied.
    let mut is_allowed = true;
    let mut crawl_delay = None;
    if config.respect_robots_txt {
        let robots = robots_url(&parsed_url);
        if let Ok(robots_resp) = http_fetch(&robots, config, &std::collections::HashMap::new(), &client).await {
            let ua = config.user_agent.as_deref().unwrap_or("*");
            let rules = parse_robots_txt(&robots_resp.body, ua);
            is_allowed = is_path_allowed(parsed_url.path(), &rules);
            crawl_delay = rules.crawl_delay;
        }
    }

    let response_meta = crate::http::extract_response_meta_from_hashmap(&resp.headers);

    let status_code = resp.status;
    let content_type = resp.content_type.clone();
    let mut body = resp.body.clone();

    let detected_charset = detect_charset(&content_type, &body);

    // Re-decode from raw bytes when a non-UTF-8 charset is detected.
    // `resp.body` was produced via `String::from_utf8_lossy`, which garbles
    // Shift_JIS, EUC-JP, and other non-UTF-8 encodings. Re-decode from the
    // preserved raw bytes using encoding_rs to produce correct text.
    if let Some(ref charset) = detected_charset {
        let charset_lower = charset.to_ascii_lowercase();
        if charset_lower != "utf-8"
            && charset_lower != "us-ascii"
            && let Some(encoding) = encoding_rs::Encoding::for_label(charset.as_bytes())
        {
            let (decoded, _, had_errors) = encoding.decode(&resp.body_bytes);
            if !had_errors {
                body = decoded.into_owned();
            }
        }
    }
    let is_pdf = is_pdf_content(&content_type, &body);

    // Handle max body size
    let mut body_size = body.len();
    if let Some(max_size) = config.max_body_size
        && body.len() > max_size
    {
        // Truncate on a char boundary; `String::truncate` panics mid-codepoint.
        let mut cut = max_size;
        while cut > 0 && !body.is_char_boundary(cut) {
            cut -= 1;
        }
        body.truncate(cut);
        body_size = cut;
    }

    // Check for X-Robots-Tag
    let x_robots_tag = resp.headers.get("x-robots-tag").and_then(|v| v.first().cloned());

    let mut noindex_detected = false;
    let mut nofollow_detected = false;

    if let Some(ref xrt) = x_robots_tag {
        let lower = xrt.to_lowercase();
        if lower.contains("noindex") {
            noindex_detected = true;
        }
        if lower.contains("nofollow") {
            nofollow_detected = true;
        }
    }

    // Flag binary content types, binary-looking URLs, and PDF responses as skipped content.
    let was_skipped = is_binary_content_type(&content_type) || is_binary_url(parsed_url.as_str()) || is_pdf;

    let is_html = is_html_content(&content_type, &body);

    let (extraction, asset_refs) = {
        let doc = tl::parse(&body, ParserOptions::default())
            .map_err(|e| CrawlError::Other(format!("HTML parse error: {e:?}")))?;

        if !noindex_detected {
            noindex_detected = detect_noindex(&doc);
        }
        if !nofollow_detected {
            nofollow_detected = detect_nofollow(&doc);
        }

        let extraction = extract_page_data(&doc, &body, &parsed_url, is_html, true);

        let asset_refs = if config.download_assets && is_html {
            assets::discover_assets(&doc, &parsed_url)
        } else {
            Vec::new()
        };

        (extraction, asset_refs)
    };

    // Heuristic hint that the page may require JavaScript rendering (see `browser_detect`);
    // only computed for HTML responses.
    let word_count = extraction.metadata.word_count.unwrap_or(0);
    let js_render_hint = is_html && browser_detect::detect_js_render_needed(&body, word_count);

    let downloaded_assets = if !asset_refs.is_empty() {
        assets::download_assets(asset_refs, config, &client).await
    } else {
        Vec::new()
    };

    // Merge CrawlConfig.remove_tags into ContentConfig.exclude_selectors for h2m.
    // This replaces the old apply_remove_tags() pre-processing step — h2m now
    // handles all element exclusion during its DOM walk.
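    // Hypothetical example: with remove_tags = ["nav", "footer"], those selectors are
    // appended to content.exclude_selectors, so the matching elements are dropped during
    // the Markdown conversion rather than in a separate pre-pass.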
    let content_config = if config.remove_tags.is_empty() {
        std::borrow::Cow::Borrowed(&config.content)
    } else {
        let mut merged = config.content.clone();
        merged.exclude_selectors.extend(config.remove_tags.iter().cloned());
        std::borrow::Cow::Owned(merged)
    };
    let markdown = crate::markdown::convert_to_markdown(&body, &content_config).await;

    Ok(ScrapeResult {
        status_code,
        content_type,
        html: body,
        body_size,
        metadata: extraction.metadata,
        links: extraction.links,
        images: extraction.images,
        feeds: extraction.feeds,
        json_ld: extraction.json_ld,
        is_allowed,
        crawl_delay,
        noindex_detected,
        nofollow_detected,
        x_robots_tag,
        is_pdf,
        was_skipped,
        detected_charset,
        auth_header_sent,
        response_meta: Some(response_meta),
        assets: downloaded_assets,
        js_render_hint,
        browser_used: false,
        markdown,
        extracted_data: None,
        extraction_meta: None,
        screenshot: None,
        downloaded_document: None,
    })
}