spider/lib.rs
#![warn(missing_docs)]
#![allow(clippy::perf)]
//! Website crawling library that rapidly crawls all pages to
//! gather links via isolated contexts.
//!
//! Spider is a multi-threaded crawler that can be configured
//! to scrape web pages. It has the ability to gather
//! millions of pages within seconds.
//!
//! # How to use Spider
//!
//! There are a couple of ways to use Spider:
//!
//! - [`crawl`]: start concurrently crawling a site. Can be used to send each page (including URL
//!   and HTML) to a subscriber for processing, or just to gather links.
//!
//! - [`scrape`]: like `crawl`, but saves the raw HTML strings to parse after scraping is complete.
//!
//! [`crawl`]: website/struct.Website.html#method.crawl
//! [`scrape`]: website/struct.Website.html#method.scrape
//!
//! # Examples
//!
//! A simple crawl to index a website:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!
//!     website.crawl().await;
//!
//!     let links = website.get_links();
//!
//!     for link in links {
//!         println!("- {:?}", link.as_ref());
//!     }
//! }
//! ```
//!
//! Subscribe to crawl events:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//! use tokio::io::AsyncWriteExt;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!     let mut rx2 = website.subscribe(16).unwrap();
//!
//!     tokio::spawn(async move {
//!         let mut stdout = tokio::io::stdout();
//!
//!         while let Ok(res) = rx2.recv().await {
//!             let _ = stdout
//!                 .write_all(format!("- {}\n", res.get_url()).as_bytes())
//!                 .await;
//!         }
//!     });
//!
//!     website.crawl().await;
//! }
//! ```
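//!
//! Scrape the raw HTML and process the pages after the crawl completes. This is
//! a minimal sketch that assumes the `get_pages` accessor for the stored pages:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!
//!     // Like `crawl`, but keeps the raw HTML for each page.
//!     website.scrape().await;
//!
//!     if let Some(pages) = website.get_pages() {
//!         for page in pages.iter() {
//!             println!("- {} ({} bytes)", page.get_url(), page.get_html().len());
//!         }
//!     }
//! }
//! ```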
//!
//! ## Feature flags
//!
//! - `ua_generator`: Enables auto-generating a random, real User-Agent.
//! - `disk`: Enables SQLite hybrid disk storage to balance memory usage, without TLS.
//! - `disk_native_tls`: Enables SQLite hybrid disk storage to balance memory usage, with native TLS.
//! - `disk_aws`: Enables SQLite hybrid disk storage to balance memory usage, with aws_tls.
//! - `balance`: Enables balancing the CPU and memory to scale more efficiently.
//! - `regex`: Enables blacklisting paths with regex.
//! - `firewall`: Enables the spider_firewall crate to prevent crawling known bad websites.
//! - `decentralized`: Enables decentralized processing of IO; requires starting the [spider_worker](https://docs.rs/crate/spider_worker/latest) before crawls.
//! - `sync`: Enables subscribing to `Page` data for async processing.
//! - `control`: Enables the ability to pause, start, and shutdown crawls on demand.
//! - `full_resources`: Enables gathering all content that relates to the domain, like CSS, JS, etc.
//! - `serde`: Enables serde serialization support.
//! - `socks`: Enables socks5 proxy support.
//! - `glob`: Enables [url glob](https://everything.curl.dev/cmdline/globbing) support.
//! - `fs`: Enables storing resources to disk for parsing (may greatly increase performance at the cost of temp storage). Enabled by default.
//! - `sitemap`: Include sitemap pages in results.
//! - `time`: Enables duration tracking per page.
//! - `cache`: Enables caching HTTP requests to disk.
//! - `cache_mem`: Enables caching HTTP requests in memory.
//! - `cache_chrome_hybrid`: Enables hybrid request caching between Chrome and HTTP.
//! - `cache_openai`: Enables caching the OpenAI request. This can drastically save costs when developing AI workflows.
//! - `chrome`: Enables headless Chrome rendering; use the env var `CHROME_URL` to connect remotely.
//! - `chrome_headed`: Enables headful Chrome rendering.
//! - `chrome_cpu`: Disables GPU usage for the Chrome browser.
//! - `chrome_stealth`: Enables stealth mode to make it harder to be detected as a bot.
//! - `chrome_store_page`: Stores the page object to perform other actions, like taking screenshots conditionally.
//! - `chrome_screenshot`: Enables storing a screenshot of each page on crawl. Screenshots default to the ./storage/ directory; use the env variable `SCREENSHOT_DIRECTORY` to adjust the directory.
//! - `chrome_intercept`: Allows intercepting network requests to speed up processing.
//! - `chrome_headless_new`: Uses headless=new to launch the Chrome instance.
//! - `cookies`: Enables storing and setting cookies for requests.
//! - `real_browser`: Enables the ability to bypass protected pages.
//! - `cron`: Enables the ability to start cron jobs for the website.
//! - `openai`: Enables OpenAI to generate dynamic browser executable scripts. Make sure to set the env var `OPENAI_API_KEY`.
//! - `smart`: Enables smart mode. This runs requests as HTTP until JavaScript rendering is needed, avoiding multiple network requests by re-using the content.
//! - `encoding`: Enables handling content with different encodings, like Shift_JIS.
//! - `spoof`: Spoofs HTTP headers for requests.
//! - `headers`: Enables the extraction of header information on each retrieved page. Adds a `headers` field to the page struct.
//! - `decentralized_headers`: Enables the extraction of suppressed header information during the decentralized processing of IO. This is needed if `headers` is set in both [spider](https://docs.rs/spider/latest/spider/) and [spider_worker](https://docs.rs/crate/spider_worker/latest).
//!
//! Additional learning resources include:
//!
//! - [Spider Repository Examples](https://github.com/spider-rs/spider/tree/main/examples)
pub extern crate bytes;
pub extern crate case_insensitive_string;
pub extern crate hashbrown;
extern crate log;
pub extern crate percent_encoding;
pub extern crate quick_xml;
pub extern crate reqwest;
pub extern crate smallvec;
pub extern crate spider_fingerprint;
pub extern crate tokio;
pub extern crate tokio_stream;
pub extern crate url;

#[cfg(feature = "cron")]
pub extern crate async_job;
#[cfg(feature = "openai")]
pub extern crate async_openai;
pub extern crate auto_encoder;
#[cfg(feature = "flexbuffers")]
pub extern crate flexbuffers;
#[cfg(feature = "cache_request")]
pub extern crate http_cache_reqwest;
#[cfg(feature = "cache_openai")]
pub extern crate moka;
#[cfg(feature = "cache_request")]
pub extern crate reqwest_middleware;
#[cfg(feature = "serde")]
pub extern crate serde;
#[cfg(feature = "ua_generator")]
pub extern crate ua_generator;
#[macro_use]
pub extern crate string_concat;
pub extern crate strum;
#[macro_use]
pub extern crate lazy_static;
#[cfg(feature = "firewall")]
pub extern crate spider_firewall;

/// Client interface.
pub mod client;
/// Configuration structure for `Website`.
pub mod configuration;
/// Optional features to use.
pub mod features;
/// Internal packages customized.
pub mod packages;
/// A page scraped.
pub mod page;
/// Application utils.
pub mod utils;
/// A website to crawl.
pub mod website;

pub use case_insensitive_string::compact_str;
pub use case_insensitive_string::CaseInsensitiveString;
pub use client::{Client, ClientBuilder};

#[cfg(feature = "chrome")]
pub use chromiumoxide;

#[cfg(feature = "regex")]
/// Blacklist checking if a URL exists, using regex.
pub mod black_list {
    use crate::compact_str::CompactString;
    /// Check if the link exists in the blacklist using regex.
    pub fn contains(blacklist_url: &regex::RegexSet, link: &CompactString) -> bool {
        blacklist_url.is_match(link)
    }
}

#[cfg(not(feature = "regex"))]
/// Blacklist checking if a URL exists.
pub mod black_list {
    use crate::compact_str::CompactString;
    /// Check if the link exists in the blacklist.
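    ///
    /// A minimal usage sketch for the non-regex build, assuming an exact-match blacklist:
    ///
    /// ```no_run
    /// use spider::compact_str::CompactString;
    ///
    /// let blacklist = vec![CompactString::from("https://example.com/admin")];
    /// let link = CompactString::from("https://example.com/admin");
    ///
    /// assert!(spider::black_list::contains(&blacklist, &link));
    /// ```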
    pub fn contains(blacklist_url: &[CompactString], link: &CompactString) -> bool {
        blacklist_url.contains(link)
    }
}

/// The selectors type. The values are held so the relative domain can still be crawled after base redirects.
pub type RelativeSelectors = (
    // base domain
    compact_str::CompactString,
    smallvec::SmallVec<[compact_str::CompactString; 2]>,
    // redirected domain
    compact_str::CompactString,
);