spider/lib.rs
#![warn(missing_docs)]
#![allow(clippy::perf)]
//! Website crawling library that rapidly crawls all pages to
//! gather links via isolated contexts.
//!
//! Spider is a multi-threaded crawler that can be configured
//! to scrape web pages. It can gather
//! millions of pages within seconds.
//!
//! # How to use Spider
//!
//! There are a couple of ways to use Spider:
//!
//! - [`crawl`]: start concurrently crawling a site. Can be used to send each page (including URL
//!   and HTML) to a subscriber for processing, or just to gather links.
//!
//! - [`scrape`]: like `crawl`, but saves the raw HTML strings to parse after scraping is complete (see the scrape example below).
//!
//! [`crawl`]: website/struct.Website.html#method.crawl
//! [`scrape`]: website/struct.Website.html#method.scrape
//!
//! # Examples
//!
//! A simple crawl to index a website:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!
//!     website.crawl().await;
//!
//!     let links = website.get_links();
//!
//!     for link in links {
//!         println!("- {:?}", link.as_ref());
//!     }
//! }
//! ```
//!
//! Subscribe to crawl events:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//! use tokio::io::AsyncWriteExt;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!     let mut rx2 = website.subscribe(16).unwrap();
//!
//!     tokio::spawn(async move {
//!         let mut stdout = tokio::io::stdout();
//!
//!         while let Ok(res) = rx2.recv().await {
//!             let _ = stdout
//!                 .write_all(format!("- {}\n", res.get_url()).as_bytes())
//!                 .await;
//!         }
//!     });
//!
//!     website.crawl().await;
//! }
//! ```
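//!
//! Scrape a site and read the raw HTML after scraping completes (a minimal
//! sketch; it assumes the `get_pages` accessor and `Page::get_html` are
//! available in this build):
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!
//!     website.scrape().await;
//!
//!     if let Some(pages) = website.get_pages() {
//!         for page in pages.iter() {
//!             println!("- {} ({} bytes)", page.get_url(), page.get_html().len());
//!         }
//!     }
//! }
//! ```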
//!
//! ## Feature flags
//!
//! - `ua_generator`: Enables auto generating a random real User-Agent.
//! - `disk`: Enables SQLite hybrid disk storage to balance memory usage, without TLS.
//! - `disk_native_tls`: Enables SQLite hybrid disk storage to balance memory usage, with native TLS.
//! - `disk_aws`: Enables SQLite hybrid disk storage to balance memory usage, with aws_tls.
//! - `balance`: Enables balancing CPU and memory usage to scale more efficiently.
//! - `regex`: Enables blacklisting paths with regex.
//! - `firewall`: Enables the spider_firewall crate to prevent crawling known bad websites.
//! - `decentralized`: Enables decentralized processing of IO; requires starting [spider_worker](https://docs.rs/crate/spider_worker/latest) before crawls.
//! - `sync`: Enables subscribing to Page changes for asynchronous data processing.
//! - `control`: Enables the ability to pause, start, and shutdown crawls on demand.
//! - `full_resources`: Enables gathering all content related to the domain, such as CSS and JS.
//! - `serde`: Enables serde serialization support.
//! - `socks`: Enables SOCKS5 proxy support.
//! - `glob`: Enables [url glob](https://everything.curl.dev/cmdline/globbing) support.
//! - `fs`: Enables storing resources to disk for parsing (may greatly increase performance at the cost of temporary storage). Enabled by default.
//! - `sitemap`: Includes sitemap pages in results.
//! - `time`: Enables duration tracking per page.
//! - `cache`: Enables caching HTTP requests to disk.
//! - `cache_mem`: Enables caching HTTP requests in memory.
//! - `cache_chrome_hybrid`: Enables hybrid request caching between chrome and HTTP.
//! - `cache_openai`: Enables caching OpenAI requests. This can drastically reduce costs when developing AI workflows.
//! - `chrome`: Enables chrome headless rendering; use the env var `CHROME_URL` to connect remotely.
//! - `chrome_headed`: Enables chrome headful rendering.
//! - `chrome_cpu`: Disables GPU usage for the chrome browser.
//! - `chrome_stealth`: Enables stealth mode to make it harder to be detected as a bot.
//! - `chrome_store_page`: Stores the page object to perform other actions, like taking screenshots conditionally.
//! - `chrome_screenshot`: Enables storing a screenshot of each page on crawl. Screenshots default to the ./storage/ directory; use the env var `SCREENSHOT_DIRECTORY` to adjust the directory.
//! - `chrome_intercept`: Allows intercepting network requests to speed up processing.
//! - `chrome_headless_new`: Uses headless=new to launch the chrome instance.
//! - `cookies`: Enables storing and setting cookies to use for requests.
//! - `real_browser`: Enables the ability to bypass protected pages.
//! - `cron`: Enables the ability to start cron jobs for the website.
//! - `openai`: Enables OpenAI to generate dynamic browser executable scripts. Make sure to set the env var `OPENAI_API_KEY`.
//! - `smart`: Enables smart mode. This runs requests as HTTP until JavaScript rendering is needed, avoiding extra network requests by re-using the content.
//! - `encoding`: Enables handling content with different encodings, like Shift_JIS.
//! - `spoof`: Spoofs HTTP headers for requests.
//! - `headers`: Enables the extraction of header information on each retrieved page. Adds a `headers` field to the page struct.
//! - `decentralized_headers`: Enables the extraction of suppressed header information during decentralized IO processing. This is needed if `headers` is set in both [spider](https://docs.rs/spider/latest/spider/) and [spider_worker](https://docs.rs/crate/spider_worker/latest).
//!
//! Additional learning resources include:
//!
//! - [Spider Repository Examples](https://github.com/spider-rs/spider/tree/main/examples)

// Use the jemalloc memory backend for performance reasons on dedicated work and large crawls.
#[cfg(all(
    not(windows),
    not(target_os = "android"),
    not(target_env = "musl"),
    feature = "jemalloc"
))]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

pub extern crate bytes;
pub extern crate case_insensitive_string;
pub extern crate hashbrown;
extern crate log;
pub extern crate percent_encoding;
pub extern crate quick_xml;
pub extern crate reqwest;
pub extern crate smallvec;
pub extern crate tokio;
pub extern crate tokio_stream;
pub extern crate url;

#[cfg(feature = "cron")]
pub extern crate async_job;
#[cfg(feature = "openai")]
pub extern crate async_openai;
pub extern crate auto_encoder;
#[cfg(feature = "flexbuffers")]
pub extern crate flexbuffers;
#[cfg(feature = "cache_request")]
pub extern crate http_cache_reqwest;
#[cfg(feature = "cache_openai")]
pub extern crate moka;
#[cfg(feature = "cache_request")]
pub extern crate reqwest_middleware;
#[cfg(feature = "serde")]
pub extern crate serde;
#[cfg(feature = "ua_generator")]
pub extern crate ua_generator;
#[macro_use]
pub extern crate string_concat;
pub extern crate strum;
#[macro_use]
pub extern crate lazy_static;

/// Configuration structure for `Website`.
pub mod configuration;
/// Optional features to use.
pub mod features;
/// Internal packages customized.
pub mod packages;
/// A page scraped.
pub mod page;
/// Application utils.
pub mod utils;
/// A website to crawl.
pub mod website;

pub use case_insensitive_string::compact_str;
pub use case_insensitive_string::CaseInsensitiveString;

#[cfg(feature = "chrome")]
pub use chromiumoxide;

#[cfg(feature = "regex")]
/// Black list checking if a url exists, using Regex.
pub mod black_list {
    use crate::compact_str::CompactString;
    /// Check if the link exists in the blacklist, with regex.
    pub fn contains(blacklist_url: &regex::RegexSet, link: &CompactString) -> bool {
        blacklist_url.is_match(link)
    }
}

#[cfg(not(feature = "regex"))]
/// Black list checking if a url exists.
pub mod black_list {
    use crate::compact_str::CompactString;
    /// Check if the link exists in the blacklist.
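    ///
    /// A minimal usage sketch (the URLs are illustrative only; assumes a build
    /// without the `regex` feature, where the blacklist is a plain slice):
    ///
    /// ```
    /// use spider::black_list;
    /// use spider::compact_str::CompactString;
    ///
    /// let blocked = [CompactString::from("https://spider.cloud/admin")];
    ///
    /// assert!(black_list::contains(&blocked, &CompactString::from("https://spider.cloud/admin")));
    /// assert!(!black_list::contains(&blocked, &CompactString::from("https://spider.cloud/blog")));
    /// ```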
    pub fn contains(blacklist_url: &[CompactString], link: &CompactString) -> bool {
        blacklist_url.contains(link)
    }
}

/// The asynchronous Client to make requests with.
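///
/// A minimal sketch of using the alias directly (assumes a build without the
/// `cache_request` feature, where this is a plain `reqwest::Client`; the
/// user agent value is illustrative only):
///
/// ```no_run
/// # async fn demo() -> Result<(), spider::reqwest::Error> {
/// let client: spider::Client = spider::reqwest::Client::builder()
///     .user_agent("my-crawler/0.1")
///     .build()?;
///
/// let status = client.get("https://spider.cloud").send().await?.status();
/// println!("status: {}", status);
/// # Ok(())
/// # }
/// ```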
#[cfg(not(feature = "cache_request"))]
pub type Client = reqwest::Client;
#[cfg(not(feature = "cache_request"))]
/// The asynchronous Client Builder.
pub type ClientBuilder = reqwest::ClientBuilder;

/// The asynchronous Client to make requests with, backed by an HTTP cache.
#[cfg(feature = "cache_request")]
pub type Client = reqwest_middleware::ClientWithMiddleware;
#[cfg(feature = "cache_request")]
/// The asynchronous Client Builder.
pub type ClientBuilder = reqwest_middleware::ClientBuilder;

/// The selectors type. The values are held to make sure the relative domain can be crawled upon base redirects.
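///
/// A minimal construction sketch (the values are illustrative only; the
/// comments mirror the tuple positions below):
///
/// ```
/// use spider::RelativeSelectors;
/// use spider::compact_str::CompactString;
/// use spider::smallvec::SmallVec;
///
/// let mut selectors: RelativeSelectors = (
///     // base domain
///     CompactString::new("spider.cloud"),
///     SmallVec::new(),
///     // redirected domain (empty until a base redirect occurs)
///     CompactString::new(""),
/// );
///
/// selectors.1.push(CompactString::new("spider.cloud"));
/// ```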
pub type RelativeSelectors = (
    // base domain
    compact_str::CompactString,
    smallvec::SmallVec<[compact_str::CompactString; 2]>,
    // redirected domain
    compact_str::CompactString,
);