qrawl 0.6.0

Rust toolkit to crawl web data for AI agents
Documentation
use scraper::Html;
use serde_json::Value;

use crate::selectors::{
    BODY_SELECTOR, HTML_LANG_SELECTOR, JSONLD_SELECTOR, META_SELECTOR, TITLE_SELECTOR,
};
use crate::types::{Jsonld, Metadata};

/// Parses `html` and returns the serialized HTML of the first element matched
/// by `BODY_SELECTOR`.
///
/// When the parsed document yields no match, the input string is returned
/// unchanged as an owned `String`.
pub(super) fn scrape_body_content(html: &str) -> String {
    let document = Html::parse_document(html);
    let first_match = document.select(&BODY_SELECTOR).next();

    first_match.map_or_else(|| html.to_string(), |element| element.html())
}

/// Parses `html` into a document and extracts its JSON-LD entries.
///
/// Convenience wrapper around [`scrape_jsonld_from_doc`] for callers that only
/// hold the raw markup.
pub(super) fn scrape_jsonld_scripts(html: &str) -> Jsonld {
    scrape_jsonld_from_doc(&Html::parse_document(html))
}

pub(super) fn scrape_jsonld_from_doc(document: &Html) -> Jsonld {
    document
        .select(&JSONLD_SELECTOR)
        .filter_map(|el| {
            let raw = el.text().collect::<String>();
            serde_json::from_str(&raw).ok()
        })
        .flat_map(flatten_jsonld)
        .collect()
}

/// Flattens a parsed JSON-LD value into a flat list of JSON objects.
///
/// * Arrays are flattened recursively, element by element.
/// * An object whose `@graph` member is an array or a single object is
///   replaced by the (recursively flattened) contents of that graph; the
///   wrapper object itself is discarded.
/// * Any other object is returned as-is, including one whose `@graph` member
///   has an unexpected type (string, number, ...), so no data is dropped.
/// * Scalars (`null`, booleans, numbers, strings) produce no output.
fn flatten_jsonld(value: Value) -> Vec<Value> {
    match value {
        Value::Array(items) => items.into_iter().flat_map(flatten_jsonld).collect(),
        Value::Object(mut obj) => {
            // Peek before removing: removing first and matching afterwards
            // would strip a non-array `@graph` from the object and lose it.
            let has_graph_nodes = matches!(
                obj.get("@graph"),
                Some(Value::Array(_) | Value::Object(_))
            );
            if has_graph_nodes {
                if let Some(graph) = obj.remove("@graph") {
                    // Recursion covers both the array form and the
                    // single-node form, plus any nested `@graph`.
                    return flatten_jsonld(graph);
                }
            }
            vec![Value::Object(obj)]
        }
        _ => Vec::new(),
    }
}

/// Parses `html` into a document and extracts its metadata tags.
///
/// Convenience wrapper around [`scrape_metadata_from_doc`] for callers that
/// only hold the raw markup.
pub(super) fn scrape_metadata_tags(html: &str) -> Metadata {
    scrape_metadata_from_doc(&Html::parse_document(html))
}

/// Collects `(key, value)` metadata pairs from an already-parsed document.
///
/// Pairs are emitted in this order:
///
/// 1. `("title", text)` — trimmed text of the first `TITLE_SELECTOR` match,
///    only when non-empty after trimming.
/// 2. One pair per `META_SELECTOR` match that has a `content` attribute and a
///    `name` (preferred) or `property` attribute. Entries whose content is
///    blank are skipped; the content is otherwise stored untrimmed.
/// 3. `("lang", value)` — the `lang` attribute of the first
///    `HTML_LANG_SELECTOR` match, stored verbatim when the attribute exists.
pub(super) fn scrape_metadata_from_doc(document: &Html) -> Metadata {
    let mut tags = Vec::new();

    let title_text = document
        .select(&TITLE_SELECTOR)
        .next()
        .map(|node| node.text().collect::<String>());
    if let Some(raw_title) = title_text {
        let title = raw_title.trim();
        if !title.is_empty() {
            tags.push(("title".to_string(), title.to_string()));
        }
    }

    tags.extend(document.select(&META_SELECTOR).filter_map(|node| {
        let element = node.value();
        // `name` wins over `property` when both are present.
        let key = element
            .attr("name")
            .or_else(|| element.attr("property"))?;
        let content = element.attr("content")?;
        if content.trim().is_empty() {
            return None;
        }
        Some((key.to_string(), content.to_string()))
    }));

    let lang_attr = document
        .select(&HTML_LANG_SELECTOR)
        .next()
        .and_then(|node| node.value().attr("lang"));
    if let Some(lang) = lang_attr {
        tags.push(("lang".to_string(), lang.to_string()));
    }

    tags
}