pub struct CrawlerBuilder<S: Spider, D = ReqwestClientDownloader>
where
    D: Downloader,
{ /* private fields */ }

Implementations
impl<S: Spider, D: Downloader> CrawlerBuilder<S, D>

pub fn new(spider: S) -> Self
where
    D: Default,
Creates a new CrawlerBuilder for the given spider; the D: Default bound lets the builder construct the downloader automatically when one is not supplied via downloader.
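Because ReqwestClientDownloader is the default downloader type parameter, it usually only needs to be pinned once at construction. A minimal sketch, assuming a hypothetical MySpider type that implements Spider (imports elided):

    // Pin the default downloader through a type annotation...
    let builder: CrawlerBuilder<MySpider> = CrawlerBuilder::new(MySpider);

    // ...or name it explicitly with a turbofish, as the repository example does:
    let builder = CrawlerBuilder::<_, ReqwestClientDownloader>::new(MySpider);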
Examples found in repository: examples/quotes_scraper.rs (line 89)
83  async fn main() -> Result<(), SpiderError> {
84      tracing_subscriber::fmt()
85          .with_env_filter("info,spider_lib=debug")
86          .without_time()
87          .init();
88
89      let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(QuotesSpider)
90          .add_pipeline(DeduplicationPipeline::new(&["text"]))
91          .add_pipeline(CsvExporterPipeline::<QuoteItem>::new("output/quotes.csv")?)
92          .add_middleware(HttpCacheMiddleware::builder().build()?)
93          .add_middleware(
94              RateLimitMiddleware::builder()
95                  .use_token_bucket_limiter(5)
96                  .build(),
97          )
98          .add_middleware(RetryMiddleware::default().max_retries(2))
99          .add_middleware(UserAgentMiddleware::builder().build()?)
100         .add_middleware(RobotsTxtMiddleware::new())
101         .add_middleware(
102             RefererMiddleware::new()
103                 .same_origin_only(true)
104                 .max_chain_length(100)
105                 .include_fragment(false),
106         )
107         .with_checkpoint_path("output/quotes.bin")
108         .with_checkpoint_interval(Duration::from_secs(15))
109         .max_concurrent_downloads(5)
110         .max_parser_workers(2)
111         .max_concurrent_pipelines(2)
112         .build()
113         .await?;
114
115     crawler.start_crawl().await?;
116
117     Ok(())
118 }

pub fn max_concurrent_downloads(self, limit: usize) -> Self
Sets the maximum number of concurrent downloads.
Examples found in repository: examples/quotes_scraper.rs (line 109); see the full listing under new above, and the combined sketch under max_concurrent_pipelines below.

pub fn max_parser_workers(self, limit: usize) -> Self
Sets the maximum number of concurrent parser workers.
Examples found in repository: examples/quotes_scraper.rs (line 110); see the full listing under new above, and the combined sketch under max_concurrent_pipelines below.

pub fn max_concurrent_pipelines(self, limit: usize) -> Self
Sets the maximum number of concurrent pipelines.
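The three limits above bound different stages of the crawl: concurrent downloads, concurrent response parsing, and concurrent item pipeline runs. A tuning sketch reusing the repository example's values, with a placeholder MySpider (imports elided):

    let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(MySpider)
        .max_concurrent_downloads(5)   // at most 5 in-flight downloads
        .max_parser_workers(2)         // at most 2 parser workers at once
        .max_concurrent_pipelines(2)   // at most 2 concurrent pipeline runs
        .build()
        .await?;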
Examples found in repository: examples/quotes_scraper.rs (line 111); see the full listing under new above.

pub fn downloader(self, downloader: D) -> Self
Sets a custom downloader for the crawler.
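No repository example exercises this method. A sketch assuming a hypothetical MyDownloader type that implements Downloader (and Default, which new requires), plus a placeholder MySpider:

    // MyDownloader and MySpider are placeholders for your own types.
    let downloader = MyDownloader::default();
    let crawler = CrawlerBuilder::<_, MyDownloader>::new(MySpider)
        .downloader(downloader) // presumably replaces the instance new() default-constructed
        .build()
        .await?;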
pub fn add_middleware<M>(self, middleware: M) -> Self
Adds a middleware to the crawler.
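A trimmed middleware stack from the repository example, with a placeholder MySpider (imports elided); this page does not document the order middlewares run in, so treat the ordering below as the example's convention rather than a requirement:

    let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(MySpider)
        .add_middleware(RetryMiddleware::default().max_retries(2)) // retry failures up to twice
        .add_middleware(UserAgentMiddleware::builder().build()?)   // builder-configured User-Agent
        .add_middleware(RobotsTxtMiddleware::new())                // robots.txt handling
        .build()
        .await?;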
Examples found in repository: examples/quotes_scraper.rs (line 92); see the full listing under new above.

pub fn add_pipeline<P>(self, pipeline: P) -> Self
Adds an item pipeline to the crawler.
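The repository example registers DeduplicationPipeline before CsvExporterPipeline; assuming pipelines process items in registration order (not confirmed by this page), duplicates are dropped before they reach the exporter. A sketch with a placeholder MySpider (imports elided):

    let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(MySpider)
        // Deduplicate items on their "text" field first...
        .add_pipeline(DeduplicationPipeline::new(&["text"]))
        // ...then export the surviving items to CSV.
        .add_pipeline(CsvExporterPipeline::<QuoteItem>::new("output/quotes.csv")?)
        .build()
        .await?;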
Examples found in repository: examples/quotes_scraper.rs (line 90); see the full listing under new above.

pub fn with_checkpoint_path<P: AsRef<Path>>(self, path: P) -> Self
Enables checkpointing and sets the path for the checkpoint file.
Examples found in repository: examples/quotes_scraper.rs (line 107); see the full listing under new above.

pub fn with_checkpoint_interval(self, interval: Duration) -> Self
Sets the interval for periodic checkpointing.
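The two checkpoint settings are used together in the repository example: with_checkpoint_path turns checkpointing on, and with_checkpoint_interval controls how often state is written. A sketch with a placeholder MySpider (imports elided; Duration::from_secs as in the example):

    let crawler = CrawlerBuilder::<_, ReqwestClientDownloader>::new(MySpider)
        .with_checkpoint_path("output/checkpoint.bin")     // enables checkpointing to this file
        .with_checkpoint_interval(Duration::from_secs(15)) // persist state every 15 seconds
        .build()
        .await?;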
Examples found in repository: examples/quotes_scraper.rs (line 108); see the full listing under new above.

pub async fn build(self) -> Result<Crawler<S, D::Client>, SpiderError>
Builds the Crawler instance.
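build consumes the builder, and since it is async it must be awaited; on success it yields a Crawler parameterized by the downloader's client type (D::Client). As in the repository example, the crawl is then started on the returned crawler:

    let crawler = builder.build().await?; // `builder` is a configured CrawlerBuilder from the sketches above
    crawler.start_crawl().await?;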
Examples found in repository: examples/quotes_scraper.rs (line 112); see the full listing under new above.

Trait Implementations
Auto Trait Implementations

impl<S, D> Freeze for CrawlerBuilder<S, D>
impl<S, D> !RefUnwindSafe for CrawlerBuilder<S, D>
impl<S, D> Send for CrawlerBuilder<S, D>
impl<S, D> Sync for CrawlerBuilder<S, D>
impl<S, D> Unpin for CrawlerBuilder<S, D>
impl<S, D> !UnwindSafe for CrawlerBuilder<S, D>
Blanket Implementations

impl<T> BorrowMut<T> for T
where
    T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.