#![warn(missing_docs)]
//! Website crawling library that rapidly crawls all pages to
//! gather links via isolated contexts.
//!
//! Spider is a multi-threaded crawler that can be configured
//! to scrape web pages. It has the ability to gather
//! tens of thousands of pages within seconds.
//!
//! # How to use Spider
//!
//! There are a couple of ways to use Spider:
//!
//! - **Concurrent** is the fastest way to start crawling a web page and
//! typically the most efficient.
//! - [`crawl`] is used to crawl concurrently.
//! - **Sequential** lets you crawl the web pages one after another respecting delay sequences.
//! - [`crawl_sync`] is used to crawl in sync.
//! - **Scrape** crawls the page and holds onto the raw HTML string for parsing.
//! - [`scrape`] is used to gather the HTML.
//!
//! [`crawl`]: website/struct.Website.html#method.crawl
//! [`crawl_sync`]: website/struct.Website.html#method.crawl_sync
//! [`scrape`]: website/struct.Website.html#method.scrape
//!
//! # Basic usage
//!
//! First, you will need to add `spider` to your `Cargo.toml`.
//!
//! Next, simply add the website url in the struct of website and crawl,
//! you can also crawl sequentially.
pub extern crate compact_str;
pub extern crate hashbrown;
extern crate log;
pub extern crate reqwest;
pub extern crate tokio;
#[cfg(feature = "ua_generator")]
extern crate ua_generator;
pub extern crate url;
#[macro_use]
extern crate string_concat;
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate fast_html5ever;
#[macro_use]
extern crate matches;
// Performance: use jemalloc as the global allocator on supported targets.
// Large crawls produce heavy, highly concurrent allocation patterns that
// jemalloc handles better than the default system allocator. Disabled on
// Windows, Android, and musl targets where tikv-jemallocator is unsupported,
// and opt-in via the `jemalloc` feature.
#[cfg(all(
not(windows),
not(target_os = "android"),
not(target_env = "musl"),
feature = "jemalloc"
))]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
/// Configuration structure for `Website`.
pub mod configuration;
/// Internal packages customized.
pub mod packages;
/// A page scraped.
pub mod page;
/// Application utils.
pub mod utils;
/// A website to crawl.
pub mod website;
#[cfg(feature = "regex")]
/// Blacklist checking for urls using regex patterns.
pub mod black_list {
use compact_str::CompactString;
use regex::Regex;
/// Check whether `link` matches any regex pattern in `blacklist_url`.
///
/// A pattern that fails to compile is treated as non-matching instead of
/// panicking — blacklist entries are user-supplied input, so an invalid
/// pattern must not bring down the crawl.
pub fn contains(blacklist_url: &Vec<CompactString>, link: &CompactString) -> bool {
    blacklist_url
        .iter()
        .any(|pattern| Regex::new(pattern).map_or(false, |re| re.is_match(link)))
}
}
#[cfg(not(feature = "regex"))]
/// Blacklist checking for urls using exact matches.
pub mod black_list {
use compact_str::CompactString;
/// Check whether `link` is an exact member of `blacklist_url`.
pub fn contains(blacklist_url: &Vec<CompactString>, link: &CompactString) -> bool {
    blacklist_url.iter().any(|blocked| blocked == link)
}
}