#![cfg_attr(docsrs, feature(doc_cfg))]
#![warn(missing_docs)]
#![allow(clippy::perf)]
#![allow(clippy::borrowed_box)]
#![allow(clippy::collapsible_match)]
#![allow(clippy::empty_docs)]
#![allow(clippy::empty_line_after_doc_comments)]
#![allow(clippy::explicit_counter_loop)]
#![allow(clippy::field_reassign_with_default)]
#![allow(clippy::if_same_then_else)]
#![allow(clippy::just_underscores_and_digits)]
#![allow(clippy::let_underscore_future)]
#![allow(clippy::manual_clamp)]
#![allow(clippy::manual_strip)]
#![allow(clippy::manual_unwrap_or_default)]
#![allow(clippy::map_identity)]
#![allow(clippy::match_like_matches_macro)]
#![allow(clippy::match_single_binding)]
#![allow(clippy::mixed_attributes_style)]
#![allow(clippy::non_minimal_cfg)]
#![allow(clippy::nonminimal_bool)]
#![allow(clippy::ptr_arg)]
#![allow(clippy::redundant_pattern_matching)]
#![allow(clippy::should_implement_trait)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::type_complexity)]
#![allow(clippy::vec_box)]
//! Website crawling library that rapidly crawls all pages to
//! gather links via isolated contexts.
//!
//! Spider is a multi-threaded crawler that can be configured
//! to scrape web pages. It can gather millions of pages
//! within seconds.
//!
//! # How to use Spider
//!
//! There are a couple of ways to use Spider:
//!
//! - [`crawl`]: start concurrently crawling a site. Can be used to send each page (including the
//!   URL and HTML) to a subscriber for processing, or just to gather links.
//!
//! - [`scrape`]: like `crawl`, but retains the raw HTML of each page so it can be parsed after the
//!   scrape completes (see the scrape example below).
//!
//! [`crawl`]: website/struct.Website.html#method.crawl
//! [`scrape`]: website/struct.Website.html#method.scrape
//!
//! # Examples
//!
//! A simple crawl to index a website:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!
//!     website.crawl().await;
//!
//!     let links = website.get_links();
//!
//!     for link in links {
//!         println!("- {:?}", link.as_ref());
//!     }
//! }
//! ```
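//!
//! A scrape that retains the raw HTML for parsing afterwards. A minimal sketch, assuming the
//! `get_pages` and `get_html` accessors on `Website` and `Page`:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!
//!     // Like `crawl`, but the raw HTML of each page is kept for later parsing.
//!     website.scrape().await;
//!
//!     if let Some(pages) = website.get_pages() {
//!         for page in pages.iter() {
//!             println!("{} - {} bytes", page.get_url(), page.get_html().len());
//!         }
//!     }
//! }
//! ```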
//!
//! Subscribe to crawl events:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud");
//!     let mut rx2 = website.subscribe(16);
//!
//!     tokio::spawn(async move {
//!         while let Ok(res) = rx2.recv().await {
//!             println!("- {}", res.get_url());
//!         }
//!     });
//!
//!     website.crawl().await;
//! }
//! ```
//!
//! ## Spider Cloud Integration
//!
//! Use [Spider Cloud](https://spider.cloud) for anti-bot bypass, proxy rotation, and high-throughput
//! data collection. Enable the `spider_cloud` feature and set your API key.
//! Set `return_format` to `"markdown"` for clean LLM-ready output:
//!
//! ```ignore
//! use spider::configuration::{SpiderCloudConfig, SpiderCloudMode, SpiderCloudReturnFormat};
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let config = SpiderCloudConfig::new("YOUR_API_KEY")
//!         .with_mode(SpiderCloudMode::Smart)
//!         .with_return_format(SpiderCloudReturnFormat::Markdown);
//!
//!     let mut website: Website = Website::new("https://example.com")
//!         .with_limit(10)
//!         .with_spider_cloud_config(config)
//!         .build()
//!         .unwrap();
//!
//!     let mut rx = website.subscribe(16);
//!
//!     tokio::spawn(async move {
//!         while let Ok(page) = rx.recv().await {
//!             let url = page.get_url();
//!             let markdown = page.get_content();
//!             let status = page.status_code;
//!
//!             println!("[{status}] {url}\n---\n{markdown}\n");
//!         }
//!     });
//!
//!     website.crawl().await;
//!     website.unsubscribe();
//! }
//! ```
//!
//! ## Chrome Rendering
//!
//! Enable the `chrome` feature to render JavaScript-heavy pages. Use the env var
//! `CHROME_URL` to connect to a remote instance:
//!
//! ```no_run
//! use spider::tokio;
//! use spider::website::Website;
//!
//! #[tokio::main]
//! async fn main() {
//!     let mut website: Website = Website::new("https://spider.cloud")
//!         .with_limit(10)
//!         .with_chrome_intercept(Default::default())
//!         .build()
//!         .unwrap();
//!
//!     let mut rx = website.subscribe(16);
//!
//!     tokio::spawn(async move {
//!         while let Ok(page) = rx.recv().await {
//!             println!("{} - {}", page.get_url(), page.get_html_bytes_u8().len());
//!         }
//!     });
//!
//!     website.crawl().await;
//! }
//! ```
//!
//! ## Feature flags
//!
//! Flags are enabled through Cargo features (see the snippet after the core list below).
//!
//! ### Core
//!
//! - `ua_generator`: Enables auto-generating a random, real User-Agent.
//! - `regex`: Enables blacklisting paths with regex.
//! - `glob`: Enables [url glob](https://everything.curl.dev/cmdline/globbing) support.
//! - `fs`: Enables storing resources to disk for parsing (may greatly increase performance at the cost of temp storage). Enabled by default.
//! - `sitemap`: Include sitemap pages in results.
//! - `time`: Enables duration tracking per page.
//! - `encoding`: Enables handling content in different encodings, such as Shift_JIS.
//! - `serde`: Enables serde serialization support.
//! - `sync`: Enables subscribing to crawl events for async `Page` data processing.
//! - `control`: Enables the ability to pause, start, and shut down crawls on demand.
//! - `full_resources`: Enables gathering all content related to the domain, such as CSS and JS.
//! - `cookies`: Enables storing and setting cookies for requests.
//! - `spoof`: Enables spoofing HTTP headers for requests.
//! - `headers`: Enables the extraction of header information on each retrieved page. Adds a `headers` field to the page struct.
//! - `balance`: Enables balancing the CPU and memory to scale more efficiently.
//! - `cron`: Enables the ability to start cron jobs for the website.
//! - `tracing`: Enables tokio tracing support for diagnostics.
//! - `cowboy`: Enables full concurrency mode with no throttle.
//! - `llm_json`: Enables LLM-friendly JSON parsing.
//! - `page_error_status_details`: Enables storing detailed error status information on pages.
//! - `extra_information`: Enables extra page metadata collection.
//! - `cmd`: Enables tokio process support.
//! - `io_uring`: Enables Linux io_uring support for async I/O (default on Linux).
//! - `simd`: Enables SIMD-accelerated JSON parsing.
//! - `inline-more`: More aggressive function inlining for performance (may increase compile times).
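//!
//! A minimal sketch of opting into a few of the flags above in your `Cargo.toml` (the version
//! number is illustrative; check crates.io for the latest release):
//!
//! ```toml
//! [dependencies]
//! spider = { version = "2", features = ["regex", "sitemap", "control"] }
//! ```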
//!
//! ### Storage
//!
//! - `disk`: Enables SQLite hybrid disk storage to balance memory usage, with no TLS.
//! - `disk_native_tls`: Enables SQLite hybrid disk storage to balance memory usage, with native TLS.
//! - `disk_aws`: Enables SQLite hybrid disk storage to balance memory usage, with AWS TLS.
//!
//! ### Caching
//!
//! - `cache`: Enables caching HTTP requests to disk.
//! - `cache_mem`: Enables caching HTTP requests in memory.
//! - `cache_openai`: Enables caching OpenAI requests. This can drastically reduce costs when developing AI workflows.
//! - `cache_gemini`: Enables caching Gemini AI requests.
//! - `cache_chrome_hybrid`: Enables hybrid Chrome + HTTP caching to disk.
//! - `cache_chrome_hybrid_mem`: Enables hybrid Chrome + HTTP caching in memory.
//!
//! ### Chrome / Browser
//!
//! - `chrome`: Enables Chrome headless rendering; use the env var `CHROME_URL` to connect remotely.
//! - `chrome_headed`: Enables Chrome headful rendering.
//! - `chrome_cpu`: Disables GPU usage for the Chrome browser.
//! - `chrome_stealth`: Enables stealth mode to make it harder to be detected as a bot.
//! - `chrome_store_page`: Stores the page object to perform other actions, like taking screenshots conditionally.
//! - `chrome_screenshot`: Enables storing a screenshot of each page on crawl. Screenshots default to the `./storage/` directory. Use the env variable `SCREENSHOT_DIRECTORY` to adjust the directory.
//! - `chrome_intercept`: Allows intercepting network requests to speed up processing.
//! - `chrome_headless_new`: Uses `headless=new` to launch the Chrome instance.
//! - `chrome_simd`: Enables SIMD optimizations for Chrome message parsing.
//! - `chrome_tls_connection`: Enables TLS connection support for Chrome.
//! - `chrome_serde_stacker`: Enables serde stacker for deeply nested Chrome messages.
//! - `chrome_remote_cache`: Enables remote Chrome caching in memory.
//! - `chrome_remote_cache_disk`: Enables remote Chrome caching to disk.
//! - `chrome_remote_cache_mem`: Enables remote Chrome caching in memory only.
//! - `adblock`: Enables adblock support for Chrome to block ads during rendering.
//! - `real_browser`: Enables the ability to bypass protected pages.
//! - `smart`: Enables smart mode. This runs requests over plain HTTP until JavaScript rendering is needed, re-using the content to avoid extra network requests.
//!
//! ### WebDriver
//!
//! - `webdriver`: Enables WebDriver support via [thirtyfour](https://docs.rs/thirtyfour). Use with chromedriver, geckodriver, or Selenium.
//! - `webdriver_headed`: Enables WebDriver headful mode.
//! - `webdriver_stealth`: Enables stealth mode for WebDriver.
//! - `webdriver_chrome`: WebDriver with the Chrome browser.
//! - `webdriver_firefox`: WebDriver with the Firefox browser.
//! - `webdriver_edge`: WebDriver with the Edge browser.
//! - `webdriver_screenshot`: Enables screenshots via WebDriver.
//!
//! ### AI / LLM
//!
//! - `openai`: Enables OpenAI to generate dynamic browser executable scripts. Set the env var `OPENAI_API_KEY`.
//! - `gemini`: Enables Gemini AI to generate dynamic browser executable scripts. Set the env var `GEMINI_API_KEY`.
//!
//! ### Spider Cloud
//!
//! - `spider_cloud`: Enables [Spider Cloud](https://spider.cloud) integration for anti-bot bypass, proxy rotation, and API-based crawling.
//!
//! ### Agent
//!
//! - `agent`: Enables the [spider_agent](https://docs.rs/spider_agent) multimodal autonomous agent.
//! - `agent_openai`: Agent with the OpenAI provider.
//! - `agent_chrome`: Agent with a Chrome browser context.
//! - `agent_webdriver`: Agent with a WebDriver context.
//! - `agent_skills`: Agent with a dynamic skill system for web automation challenges.
//! - `agent_skills_s3`: Agent skills with S3 storage.
//! - `agent_fs`: Agent with filesystem support for temp storage.
//! - `agent_search_serper`: Agent with [Serper](https://serper.dev) search integration.
//! - `agent_search_brave`: Agent with [Brave Search](https://brave.com/search/api/) integration.
//! - `agent_search_bing`: Agent with [Bing Search](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) integration.
//! - `agent_search_tavily`: Agent with [Tavily](https://tavily.com) search integration.
//! - `agent_full`: Full agent with all features enabled.
//!
//! ### Search
//!
//! - `search`: Enables the search provider base.
//! - `search_serper`: Enables [Serper](https://serper.dev) search integration.
//! - `search_brave`: Enables [Brave Search](https://brave.com/search/api/) integration.
//! - `search_bing`: Enables [Bing Search](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) integration.
//! - `search_tavily`: Enables [Tavily](https://tavily.com) search integration.
//!
//! ### Networking
//!
//! - `socks`: Enables SOCKS5 proxy support.
//! - `wreq`: Enables the [wreq](https://docs.rs/wreq) HTTP client alternative with built-in impersonation.
//!
//! ### Distributed
//!
//! - `decentralized`: Enables decentralized processing of IO; requires starting [spider_worker](https://docs.rs/crate/spider_worker/latest) before crawls.
//! - `decentralized_headers`: Enables the extraction of suppressed header information during decentralized processing of IO. This is needed if `headers` is enabled in both [spider](https://docs.rs/spider/latest/spider/) and [spider_worker](https://docs.rs/crate/spider_worker/latest).
//! - `firewall`: Enables the spider_firewall crate to prevent crawling bad websites.
//!
//! Additional learning resources include:
//!
//! - [Spider Repository Examples](https://github.com/spider-rs/spider/tree/main/examples)
//! - [Spider Cloud](https://spider.cloud)
pub extern crate bytes;
pub extern crate case_insensitive_string;
pub extern crate hashbrown;
extern crate log;
pub extern crate percent_encoding;
pub extern crate quick_xml;
pub extern crate reqwest;
pub extern crate smallvec;
pub extern crate spider_fingerprint;
pub extern crate tokio;
pub extern crate tokio_stream;
pub extern crate url;

#[cfg(feature = "cron")]
pub extern crate async_job;
#[cfg(feature = "openai")]
pub extern crate async_openai;
pub extern crate auto_encoder;
#[cfg(feature = "flexbuffers")]
pub extern crate flexbuffers;
#[cfg(feature = "gemini")]
pub extern crate gemini_rust;
#[cfg(feature = "cache_request")]
pub extern crate http_cache_reqwest;
#[cfg(feature = "cache_openai")]
pub extern crate moka;
#[cfg(feature = "cache_request")]
pub extern crate reqwest_middleware;
#[cfg(feature = "serde")]
pub extern crate serde;
#[cfg(feature = "ua_generator")]
pub extern crate ua_generator;
#[macro_use]
pub extern crate string_concat;
pub extern crate strum;
#[macro_use]
pub extern crate lazy_static;
#[cfg(feature = "agent")]
pub extern crate spider_agent;
#[cfg(feature = "firewall")]
pub extern crate spider_firewall;

/// Re-export agent types from the spider_agent crate.
#[cfg(feature = "agent")]
pub mod agent {
    //! Agent module re-exports from the spider_agent crate.
    //!
    //! This provides convenient access to the multimodal agent functionality.
    pub use spider_agent::{
        Agent,
        AgentBuilder,
        AgentConfig,
        AgentError,
        AgentMemory,
        AgentResult,
        // Custom tool types
        AuthConfig,
        CustomTool,
        CustomToolRegistry,
        CustomToolResult,
        FetchResult,
        HtmlCleaningMode,
        HttpMethod,
        LimitType,
        Message,
        RetryConfig,
        SpiderCloudToolConfig,
        UsageLimits,
        UsageSnapshot,
        UsageStats,
    };

    #[cfg(feature = "agent_openai")]
    pub use spider_agent::OpenAIProvider;

    #[cfg(feature = "agent_chrome")]
    pub use spider_agent::BrowserContext;

    #[cfg(feature = "agent_webdriver")]
    pub use spider_agent::WebDriverContext;

    #[cfg(feature = "agent_fs")]
    pub use spider_agent::{TempFile, TempStorage};

    #[cfg(any(
        feature = "agent_search_serper",
        feature = "agent_search_brave",
        feature = "agent_search_bing",
        feature = "agent_search_tavily"
    ))]
    pub use spider_agent::{
        ResearchOptions, ResearchResult, SearchOptions, SearchProvider, SearchResult,
        SearchResults, TimeRange,
    };

    #[cfg(feature = "agent_search_serper")]
    pub use spider_agent::SerperProvider;

    #[cfg(feature = "agent_search_brave")]
    pub use spider_agent::BraveProvider;

    #[cfg(feature = "agent_search_bing")]
    pub use spider_agent::BingProvider;

    #[cfg(feature = "agent_search_tavily")]
    pub use spider_agent::TavilyProvider;
}

/// Client interface.
pub mod client;
/// Configuration structure for `Website`.
pub mod configuration;
/// Optional features to use.
pub mod features;
/// Customized internal packages.
pub mod packages;
/// A scraped page.
pub mod page;
/// Per-request proxy routing strategy.
pub mod proxy_strategy;
/// Configurable retry strategy for advanced retry logic.
pub mod retry_strategy;
/// Trait abstractions for core types.
pub mod traits;
/// Application utils.
pub mod utils;
/// A website to crawl.
pub mod website;

pub use case_insensitive_string::compact_str;
pub use case_insensitive_string::CaseInsensitiveString;
pub use client::{Client, ClientBuilder};
pub use traits::Crawler;
pub use traits::PageData;

#[cfg(feature = "chrome")]
pub use chromiumoxide;

#[cfg(feature = "search")]
pub use features::search;
#[cfg(feature = "search")]
pub use features::search_providers;

#[cfg(feature = "regex")]
/// Blacklist checking whether a URL exists, using regex.
pub mod black_list {
    use crate::compact_str::CompactString;
    /// Check if the link exists in the blacklist using regex.
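    ///
    /// A minimal usage sketch (the patterns and URL are hypothetical):
    ///
    /// ```ignore
    /// use regex::RegexSet;
    /// use spider::compact_str::CompactString;
    ///
    /// // Block anything under /admin and any PDF file.
    /// let blacklist = RegexSet::new([r"/admin", r"\.pdf$"]).unwrap();
    /// let link = CompactString::new("https://example.com/admin/login");
    /// assert!(spider::black_list::contains(&blacklist, &link));
    /// ```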
    pub fn contains(blacklist_url: &regex::RegexSet, link: &CompactString) -> bool {
        blacklist_url.is_match(link)
    }
}

#[cfg(not(feature = "regex"))]
/// Blacklist checking whether a URL exists.
pub mod black_list {
    use crate::compact_str::CompactString;
    /// Check if the link exists in the blacklist.
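    ///
    /// A minimal usage sketch (the entry and URL are hypothetical); without the
    /// `regex` feature, entries are matched exactly:
    ///
    /// ```ignore
    /// use spider::compact_str::CompactString;
    ///
    /// let blacklist = vec![CompactString::new("https://example.com/admin")];
    /// let link = CompactString::new("https://example.com/admin");
    /// assert!(spider::black_list::contains(&blacklist, &link));
    /// ```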
    pub fn contains(blacklist_url: &[CompactString], link: &CompactString) -> bool {
        blacklist_url.contains(link)
    }
}

/// The selectors type. The values are held so the relative domain can still be crawled after base redirects.
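///
/// A minimal construction sketch (the domain is hypothetical):
///
/// ```ignore
/// use spider::{smallvec::smallvec, RelativeSelectors};
///
/// let selectors: RelativeSelectors = (
///     "example.com".into(),  // base domain
///     smallvec![],           // extra selectors for the base domain
///     Default::default(),    // redirected domain, empty until a redirect occurs
/// );
/// ```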
pub type RelativeSelectors = (
    // base domain
    compact_str::CompactString,
    smallvec::SmallVec<[compact_str::CompactString; 2]>,
    // redirected domain
    compact_str::CompactString,
);