spider 2.48.13

A web crawler and scraper, building blocks for data curation workloads.
/// Base css selector to use for getting valid web pages.
pub(crate) const BASE_CSS_SELECTORS: &str = concat!(
    "a[href]",
    ":not([href$=\".jpg\"])",
    ":not([href$=\".jpeg\"])",
    ":not([href$=\".png\"])",
    ":not([href$=\".gif\"])",
    ":not([href$=\".svg\"])",
    ":not([href$=\".webp\"])",
    ":not([href$=\".mp4\"])",
    ":not([href$=\".avi\"])",
    ":not([href$=\".mov\"])",
    ":not([href$=\".wmv\"])",
    ":not([href$=\".flv\"])",
    ":not([href$=\".mp3\"])",
    ":not([href$=\".wav\"])",
    ":not([href$=\".wma\"])",
    ":not([href$=\".wpl\"])",
    ":not([href$=\".mpa\"])",
    ":not([href$=\".ogg\"])",
    ":not([href$=\".woff\"])",
    ":not([href$=\".woff2\"])",
    ":not([href$=\".ttf\"])",
    ":not([href$=\".otf\"])",
    ":not([href$=\".swf\"])",
    ":not([href$=\".xap\"])",
    ":not([href$=\".ico\"])",
    ":not([href$=\".eot\"])",
    ":not([href$=\".bmp\"])",
    ":not([href$=\".psd\"])",
    ":not([href$=\".tiff\"])",
    ":not([href$=\".tif\"])",
    ":not([href$=\".heic\"])",
    ":not([href$=\".heif\"])",
    ":not([href$=\".mkv\"])",
    ":not([href$=\".webm\"])",
    ":not([href$=\".m4v\"])",
    ":not([href$=\".aac\"])",
    ":not([href$=\".flac\"])",
    ":not([href$=\".m4a\"])",
    ":not([href$=\".aiff\"])",
    ":not([href$=\".pdf\"])",
    ":not([href$=\".rtf\"])",
    ":not([href$=\".eps\"])",
    ":not([href$=\".yaml\"])",
    ":not([href$=\".yml\"])",
    ":not([href$=\".xml\"])",
    ":not([href$=\".css\"])",
    ":not([href$=\".js\"])",
    ":not([href$=\".txt\"])",
    ":not([href$=\".tar\"])",
    ":not([href$=\".doc\"])",
    ":not([href$=\".docx\"])",
    ":not([href$=\".zip\"])",
    ":not([href$=\".deb\"])",
    ":not([href$=\".pkg\"])",
    ":not([href$=\".tar.gz\"])",
    ":not([href$=\".rpm\"])",
    ":not([href$=\".z\"])",
    ":not([href$=\".7z\"])",
    ":not([href$=\".arj\"])",
    ":not([href$=\".rar\"])",
    ":not([href$=\".bin\"])",
    ":not([href$=\".msi\"])",
    ":not([href$=\".sh\"])",
    ":not([href$=\".bat\"])",
    ":not([href$=\".dmg\"])",
    ":not([href$=\".iso\"])",
    ":not([href$=\".toast\"])",
    ":not([href$=\".vcd\"])",
    ":not([href$=\".csv\"])",
    ":not([href$=\".log\"])",
    ":not([href$=\".sql\"])",
    ":not([href$=\".db\"])",
    ":not([href$=\".exe\"])",
    ":not([href$=\".rss\"])",
    ":not([href$=\".key\"])",
    ":not([href$=\".odp\"])",
    ":not([href$=\".pps\"])",
    ":not([href$=\".ptt\"])",
    ":not([href$=\".pptx\"])",
    ":not([href$=\".dump\"])",
);

/// Base css selector to use for getting valid web pages including xml files. We may remove this for general xml including links always.
pub(crate) const BASE_CSS_SELECTORS_WITH_XML: &str = concat!(
    "a[href]",
    ":not([href$=\".jpg\"])",
    ":not([href$=\".jpeg\"])",
    ":not([href$=\".png\"])",
    ":not([href$=\".gif\"])",
    ":not([href$=\".svg\"])",
    ":not([href$=\".webp\"])",
    ":not([href$=\".mp4\"])",
    ":not([href$=\".avi\"])",
    ":not([href$=\".mov\"])",
    ":not([href$=\".wmv\"])",
    ":not([href$=\".flv\"])",
    ":not([href$=\".mp3\"])",
    ":not([href$=\".wav\"])",
    ":not([href$=\".wma\"])",
    ":not([href$=\".wpl\"])",
    ":not([href$=\".mpa\"])",
    ":not([href$=\".ogg\"])",
    ":not([href$=\".woff\"])",
    ":not([href$=\".woff2\"])",
    ":not([href$=\".ttf\"])",
    ":not([href$=\".otf\"])",
    ":not([href$=\".swf\"])",
    ":not([href$=\".xap\"])",
    ":not([href$=\".ico\"])",
    ":not([href$=\".eot\"])",
    ":not([href$=\".bmp\"])",
    ":not([href$=\".psd\"])",
    ":not([href$=\".tiff\"])",
    ":not([href$=\".tif\"])",
    ":not([href$=\".heic\"])",
    ":not([href$=\".heif\"])",
    ":not([href$=\".mkv\"])",
    ":not([href$=\".webm\"])",
    ":not([href$=\".m4v\"])",
    ":not([href$=\".aac\"])",
    ":not([href$=\".flac\"])",
    ":not([href$=\".m4a\"])",
    ":not([href$=\".aiff\"])",
    ":not([href$=\".pdf\"])",
    ":not([href$=\".rtf\"])",
    ":not([href$=\".eps\"])",
    ":not([href$=\".yaml\"])",
    ":not([href$=\".yml\"])",
    ":not([href$=\".css\"])",
    ":not([href$=\".js\"])",
    ":not([href$=\".txt\"])",
    ":not([href$=\".tar\"])",
    ":not([href$=\".doc\"])",
    ":not([href$=\".docx\"])",
    ":not([href$=\".zip\"])",
    ":not([href$=\".deb\"])",
    ":not([href$=\".pkg\"])",
    ":not([href$=\".tar.gz\"])",
    ":not([href$=\".rpm\"])",
    ":not([href$=\".z\"])",
    ":not([href$=\".7z\"])",
    ":not([href$=\".arj\"])",
    ":not([href$=\".rar\"])",
    ":not([href$=\".bin\"])",
    ":not([href$=\".msi\"])",
    ":not([href$=\".sh\"])",
    ":not([href$=\".bat\"])",
    ":not([href$=\".dmg\"])",
    ":not([href$=\".iso\"])",
    ":not([href$=\".toast\"])",
    ":not([href$=\".vcd\"])",
    ":not([href$=\".csv\"])",
    ":not([href$=\".log\"])",
    ":not([href$=\".sql\"])",
    ":not([href$=\".db\"])",
    ":not([href$=\".exe\"])",
    ":not([href$=\".rss\"])",
    ":not([href$=\".key\"])",
    ":not([href$=\".odp\"])",
    ":not([href$=\".pps\"])",
    ":not([href$=\".ptt\"])",
    ":not([href$=\".pptx\"])",
    ":not([href$=\".dump\"])",
);