pub struct Crawl<'a> { /* private fields */ }
High-level crawler lifecycle wrapper. Holds a borrow of the Client
and caches the most recent status and downloaded artifacts.
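A minimal end-to-end sketch. It assumes a Client, a CrawlerConfig, and WaitOptions have already been built elsewhere (their constructors are not documented on this page), and omits use statements since they depend on the crate's module layout.

async fn run_crawl(
    client: &Client,
    config: CrawlerConfig,
    wait_opts: WaitOptions,
) -> Result<(), ScrapflyError> {
    // Wrap the config; nothing is scheduled yet.
    let mut crawl = Crawl::new(client, config);

    // Schedule the job. The UUID stays empty until this succeeds.
    crawl.start().await?;
    println!("crawler job uuid: {}", crawl.uuid());

    // Block until the job reaches a terminal state.
    crawl.wait(wait_opts).await?;
    Ok(())
}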
Implementations
impl<'a> Crawl<'a>
pub fn new(client: &'a Client, config: CrawlerConfig) -> Self
Wrap a CrawlerConfig without starting the job.
pub fn uuid(&self) -> &str
Job UUID (empty string before Crawl::start).
pub async fn start(&mut self) -> Result<(), ScrapflyError>
Schedule the crawler job. Returns CrawlerAlreadyStarted if the job was already started.
pub async fn status(
    &mut self,
    refresh: bool,
) -> Result<&CrawlerStatus, ScrapflyError>
Fetch the crawler status; when refresh is false, a previously cached copy is reused.
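A polling sketch. The Debug print assumes CrawlerStatus derives Debug, which is not confirmed on this page.

async fn show_status(crawl: &mut Crawl<'_>) -> Result<(), ScrapflyError> {
    // refresh = true forces a fresh fetch; false may reuse the cached copy.
    let status = crawl.status(true).await?;
    println!("{status:?}"); // assumes CrawlerStatus: Debug
    Ok(())
}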
pub async fn wait(&mut self, opts: WaitOptions) -> Result<(), ScrapflyError>
Poll status until the job reaches a terminal state.
pub async fn cancel(&self) -> Result<(), ScrapflyError>
Cancel the running crawl. No-op if already finished server-side.
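A small sketch of cancelling a previously started job; per the note above, it is safe even if the server has already finished the crawl.

async fn stop_crawl(crawl: &Crawl<'_>) -> Result<(), ScrapflyError> {
    // No-op server-side if the job already reached a terminal state.
    crawl.cancel().await
}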
pub async fn urls(
    &self,
    status_filter: Option<&str>,
    page: u32,
    per_page: u32,
) -> Result<CrawlerUrls, ScrapflyError>
List the crawl's URLs page by page, optionally filtered by status.
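A paging sketch; whether pages start at 0 or 1 and which status_filter values are accepted is not documented here, so both are assumptions.

async fn list_first_page(crawl: &Crawl<'_>) -> Result<CrawlerUrls, ScrapflyError> {
    // No status filter; assumed 1-based first page, 100 entries per page.
    crawl.urls(None, 1, 100).await
}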
pub async fn read(
    &self,
    target_url: &str,
    format: CrawlerContentFormat,
) -> Result<Option<CrawlContent>, ScrapflyError>
Read a single URL’s content and wrap it in a CrawlContent. Returns
Ok(None) when the URL isn’t part of the crawl.
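A sketch that handles the Ok(None) case explicitly; the CrawlerContentFormat value is passed in by the caller because its variants are not listed on this page.

async fn read_one(
    crawl: &Crawl<'_>,
    target_url: &str,
    format: CrawlerContentFormat,
) -> Result<(), ScrapflyError> {
    match crawl.read(target_url, format).await? {
        Some(_content) => {
            // The URL was crawled; inspect the returned CrawlContent here.
        }
        None => {
            // The URL was not part of the crawl.
            eprintln!("{target_url} not found in crawl results");
        }
    }
    Ok(())
}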
pub async fn read_string(
    &self,
    target_url: &str,
    format: CrawlerContentFormat,
) -> Result<String, ScrapflyError>
Read the raw content string (empty string when URL not in crawl).
pub async fn read_batch(
    &self,
    urls: &[String],
    formats: &[CrawlerContentFormat],
) -> Result<BTreeMap<String, BTreeMap<String, String>>, ScrapflyError>
Batch read up to 100 URLs.
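A batch-read sketch. The nested map is assumed to be keyed by URL and then by format name, based on the signature; that layout is not confirmed on this page.

use std::collections::BTreeMap;

async fn read_many(
    crawl: &Crawl<'_>,
    urls: &[String],
    formats: &[CrawlerContentFormat],
) -> Result<(), ScrapflyError> {
    // At most 100 URLs per call.
    let batch: BTreeMap<String, BTreeMap<String, String>> =
        crawl.read_batch(urls, formats).await?;
    for (url, per_format) in &batch {
        println!("{url}: {} format(s) returned", per_format.len());
    }
    Ok(())
}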
pub async fn contents(
    &self,
    format: CrawlerContentFormat,
    limit: Option<u32>,
    offset: Option<u32>,
) -> Result<CrawlerContents, ScrapflyError>
Fetch the crawled contents in bulk as JSON, with optional limit and offset.
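A sketch fetching a bounded slice of the bulk contents; the exact limit and offset semantics are assumed here.

async fn fetch_contents(
    crawl: &Crawl<'_>,
    format: CrawlerContentFormat,
) -> Result<CrawlerContents, ScrapflyError> {
    // Assumed: first 50 documents starting at offset 0.
    crawl.contents(format, Some(50), Some(0)).await
}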
pub async fn warc(&mut self) -> Result<&CrawlerArtifact, ScrapflyError>
Download and cache the WARC artifact.
pub async fn har(&mut self) -> Result<&CrawlerArtifact, ScrapflyError>
Download and cache the HAR artifact.
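A sketch downloading both archive formats; each call caches its artifact on the Crawl, so repeated calls are expected to reuse the cached copy.

async fn download_archives(crawl: &mut Crawl<'_>) -> Result<(), ScrapflyError> {
    // Download (or reuse the cached) WARC and HAR artifacts.
    crawl.warc().await?;
    crawl.har().await?;
    Ok(())
}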