spider 2.51.39

A web crawler and scraper, building blocks for data curation workloads.
Documentation
/// Base CSS selector used to pick out anchors that are likely to lead to crawlable web pages.
///
/// Matches `a[href]` elements that are not marked `aria-hidden="true"` and whose `href` does
/// not end in one of the listed asset, media, archive, document or data file extensions.
/// The `:not([href$="…"])` clauses are concatenated at compile time into a single compound
/// selector. Links ending in `.xml` are excluded here; see `BASE_CSS_SELECTORS_WITH_XML`
/// for the variant that keeps them.
pub(crate) const BASE_CSS_SELECTORS: &str = concat!(
    "a[href]",
    ":not([aria-hidden=\"true\"])",
    ":not([href$=\".jpg\"])",
    ":not([href$=\".jpeg\"])",
    ":not([href$=\".png\"])",
    ":not([href$=\".gif\"])",
    ":not([href$=\".svg\"])",
    ":not([href$=\".webp\"])",
    ":not([href$=\".mp4\"])",
    ":not([href$=\".avi\"])",
    ":not([href$=\".mov\"])",
    ":not([href$=\".wmv\"])",
    ":not([href$=\".flv\"])",
    ":not([href$=\".mp3\"])",
    ":not([href$=\".wav\"])",
    ":not([href$=\".wma\"])",
    ":not([href$=\".wpl\"])",
    ":not([href$=\".mpa\"])",
    ":not([href$=\".ogg\"])",
    ":not([href$=\".woff\"])",
    ":not([href$=\".woff2\"])",
    ":not([href$=\".ttf\"])",
    ":not([href$=\".otf\"])",
    ":not([href$=\".swf\"])",
    ":not([href$=\".xap\"])",
    ":not([href$=\".ico\"])",
    ":not([href$=\".eot\"])",
    ":not([href$=\".bmp\"])",
    ":not([href$=\".psd\"])",
    ":not([href$=\".tiff\"])",
    ":not([href$=\".tif\"])",
    ":not([href$=\".heic\"])",
    ":not([href$=\".heif\"])",
    ":not([href$=\".mkv\"])",
    ":not([href$=\".webm\"])",
    ":not([href$=\".m4v\"])",
    ":not([href$=\".aac\"])",
    ":not([href$=\".flac\"])",
    ":not([href$=\".m4a\"])",
    ":not([href$=\".aiff\"])",
    ":not([href$=\".pdf\"])",
    ":not([href$=\".rtf\"])",
    ":not([href$=\".eps\"])",
    ":not([href$=\".yaml\"])",
    ":not([href$=\".yml\"])",
    ":not([href$=\".xml\"])",
    ":not([href$=\".css\"])",
    ":not([href$=\".js\"])",
    ":not([href$=\".txt\"])",
    ":not([href$=\".tar\"])",
    ":not([href$=\".doc\"])",
    ":not([href$=\".docx\"])",
    ":not([href$=\".zip\"])",
    ":not([href$=\".deb\"])",
    ":not([href$=\".pkg\"])",
    ":not([href$=\".tar.gz\"])",
    ":not([href$=\".rpm\"])",
    ":not([href$=\".z\"])",
    ":not([href$=\".7z\"])",
    ":not([href$=\".arj\"])",
    ":not([href$=\".rar\"])",
    ":not([href$=\".bin\"])",
    ":not([href$=\".msi\"])",
    ":not([href$=\".sh\"])",
    ":not([href$=\".bat\"])",
    ":not([href$=\".dmg\"])",
    ":not([href$=\".iso\"])",
    ":not([href$=\".toast\"])",
    ":not([href$=\".vcd\"])",
    ":not([href$=\".csv\"])",
    ":not([href$=\".log\"])",
    ":not([href$=\".sql\"])",
    ":not([href$=\".db\"])",
    ":not([href$=\".exe\"])",
    ":not([href$=\".rss\"])",
    ":not([href$=\".key\"])",
    ":not([href$=\".odp\"])",
    ":not([href$=\".pps\"])",
    // Legacy PowerPoint files. The `.ptt` entry below looks like a historical typo for
    // `.ppt`; it is kept so nothing that was previously excluded becomes included.
    ":not([href$=\".ppt\"])",
    ":not([href$=\".ptt\"])",
    ":not([href$=\".pptx\"])",
    ":not([href$=\".dump\"])",
);

// Cache slot for the parsed form of BASE_CSS_SELECTORS; it starts empty and is
// filled on first use by `compiled_selector()`. Lock-free after first init
// (single atomic Acquire load on the fast path). Avoids re-parsing the 80+
// :not() CSS selector on every page.
static COMPILED_BASE_SELECTOR: std::sync::OnceLock<lol_html::Selector> = std::sync::OnceLock::new();

// Cache slot for the parsed form of BASE_CSS_SELECTORS_WITH_XML; it starts empty
// and is filled on first use by `compiled_xml_selector()`.
static COMPILED_BASE_XML_SELECTOR: std::sync::OnceLock<lol_html::Selector> =
    std::sync::OnceLock::new();

// Cache slot for the parsed `base` selector (matches the HTML <base> element);
// it starts empty and is filled on first use by `compiled_base_element_selector()`.
static COMPILED_BASE_ELEMENT_SELECTOR: std::sync::OnceLock<lol_html::Selector> =
    std::sync::OnceLock::new();

/// Get the pre-compiled link extraction selector (non-XML).
///
/// The selector is parsed from `BASE_CSS_SELECTORS` on the first call and stored in
/// `COMPILED_BASE_SELECTOR`; every later call returns the same cached reference.
///
/// # Panics
///
/// Panics on first use if `BASE_CSS_SELECTORS` does not parse as a selector. The input is a
/// compile-time constant, so a failure here is a bug in that constant, not a runtime
/// condition.
#[inline]
pub(crate) fn compiled_selector() -> &'static lol_html::Selector {
    COMPILED_BASE_SELECTOR.get_or_init(|| {
        BASE_CSS_SELECTORS
            .parse()
            .expect("BASE_CSS_SELECTORS is a constant and must be a valid CSS selector")
    })
}

/// Get the pre-compiled link extraction selector (XML variant).
///
/// The selector is parsed from `BASE_CSS_SELECTORS_WITH_XML` on the first call and stored in
/// `COMPILED_BASE_XML_SELECTOR`; every later call returns the same cached reference.
///
/// # Panics
///
/// Panics on first use if `BASE_CSS_SELECTORS_WITH_XML` does not parse as a selector. The
/// input is a compile-time constant, so a failure here is a bug in that constant, not a
/// runtime condition.
#[inline]
pub(crate) fn compiled_xml_selector() -> &'static lol_html::Selector {
    COMPILED_BASE_XML_SELECTOR.get_or_init(|| {
        BASE_CSS_SELECTORS_WITH_XML
            .parse()
            .expect("BASE_CSS_SELECTORS_WITH_XML is a constant and must be a valid CSS selector")
    })
}

/// Get the pre-compiled `<base>` element selector.
///
/// The literal selector `base` is parsed on the first call and stored in
/// `COMPILED_BASE_ELEMENT_SELECTOR`; every later call returns the same cached reference.
///
/// # Panics
///
/// Panics on first use if the literal `"base"` does not parse as a selector, which would
/// indicate a bug rather than a runtime condition.
#[inline]
pub(crate) fn compiled_base_element_selector() -> &'static lol_html::Selector {
    COMPILED_BASE_ELEMENT_SELECTOR.get_or_init(|| {
        "base"
            .parse()
            .expect("the literal `base` must be a valid CSS selector")
    })
}

/// Base CSS selector used to pick out anchors that are likely to lead to crawlable web pages,
/// including links to `.xml` files.
///
/// Matches `a[href]` elements that are not marked `aria-hidden="true"` and whose `href` does
/// not end in one of the listed asset, media, archive, document or data file extensions.
/// Unlike `BASE_CSS_SELECTORS`, this list has no `.xml` exclusion, so XML links are kept.
/// This separate constant may be removed in the future if XML links are always included.
pub(crate) const BASE_CSS_SELECTORS_WITH_XML: &str = concat!(
    "a[href]",
    ":not([aria-hidden=\"true\"])",
    ":not([href$=\".jpg\"])",
    ":not([href$=\".jpeg\"])",
    ":not([href$=\".png\"])",
    ":not([href$=\".gif\"])",
    ":not([href$=\".svg\"])",
    ":not([href$=\".webp\"])",
    ":not([href$=\".mp4\"])",
    ":not([href$=\".avi\"])",
    ":not([href$=\".mov\"])",
    ":not([href$=\".wmv\"])",
    ":not([href$=\".flv\"])",
    ":not([href$=\".mp3\"])",
    ":not([href$=\".wav\"])",
    ":not([href$=\".wma\"])",
    ":not([href$=\".wpl\"])",
    ":not([href$=\".mpa\"])",
    ":not([href$=\".ogg\"])",
    ":not([href$=\".woff\"])",
    ":not([href$=\".woff2\"])",
    ":not([href$=\".ttf\"])",
    ":not([href$=\".otf\"])",
    ":not([href$=\".swf\"])",
    ":not([href$=\".xap\"])",
    ":not([href$=\".ico\"])",
    ":not([href$=\".eot\"])",
    ":not([href$=\".bmp\"])",
    ":not([href$=\".psd\"])",
    ":not([href$=\".tiff\"])",
    ":not([href$=\".tif\"])",
    ":not([href$=\".heic\"])",
    ":not([href$=\".heif\"])",
    ":not([href$=\".mkv\"])",
    ":not([href$=\".webm\"])",
    ":not([href$=\".m4v\"])",
    ":not([href$=\".aac\"])",
    ":not([href$=\".flac\"])",
    ":not([href$=\".m4a\"])",
    ":not([href$=\".aiff\"])",
    ":not([href$=\".pdf\"])",
    ":not([href$=\".rtf\"])",
    ":not([href$=\".eps\"])",
    ":not([href$=\".yaml\"])",
    ":not([href$=\".yml\"])",
    ":not([href$=\".css\"])",
    ":not([href$=\".js\"])",
    ":not([href$=\".txt\"])",
    ":not([href$=\".tar\"])",
    ":not([href$=\".doc\"])",
    ":not([href$=\".docx\"])",
    ":not([href$=\".zip\"])",
    ":not([href$=\".deb\"])",
    ":not([href$=\".pkg\"])",
    ":not([href$=\".tar.gz\"])",
    ":not([href$=\".rpm\"])",
    ":not([href$=\".z\"])",
    ":not([href$=\".7z\"])",
    ":not([href$=\".arj\"])",
    ":not([href$=\".rar\"])",
    ":not([href$=\".bin\"])",
    ":not([href$=\".msi\"])",
    ":not([href$=\".sh\"])",
    ":not([href$=\".bat\"])",
    ":not([href$=\".dmg\"])",
    ":not([href$=\".iso\"])",
    ":not([href$=\".toast\"])",
    ":not([href$=\".vcd\"])",
    ":not([href$=\".csv\"])",
    ":not([href$=\".log\"])",
    ":not([href$=\".sql\"])",
    ":not([href$=\".db\"])",
    ":not([href$=\".exe\"])",
    ":not([href$=\".rss\"])",
    ":not([href$=\".key\"])",
    ":not([href$=\".odp\"])",
    ":not([href$=\".pps\"])",
    // Legacy PowerPoint files. The `.ptt` entry below looks like a historical typo for
    // `.ppt`; it is kept so nothing that was previously excluded becomes included.
    ":not([href$=\".ppt\"])",
    ":not([href$=\".ptt\"])",
    ":not([href$=\".pptx\"])",
    ":not([href$=\".dump\"])",
);