Skip to main content

Crawl

Struct Crawl 

Source
pub struct Crawl<'a> { /* private fields */ }
Expand description

High-level crawler lifecycle wrapper. Holds a borrow of the Client and caches the last status and any downloaded artifacts.

Implementations§

Source§

impl<'a> Crawl<'a>

Source

pub fn new(client: &'a Client, config: CrawlerConfig) -> Self

Wrap a CrawlerConfig without starting the job.

Source

pub fn uuid(&self) -> &str

Job UUID (empty string before Crawl::start).

Source

pub fn started(&self) -> bool

Whether start() has been called successfully.

Source

pub async fn start(&mut self) -> Result<(), ScrapflyError>

Schedule the crawler job. Returns a CrawlerAlreadyStarted error if called again after the job has already been started.

Source

pub async fn status( &mut self, refresh: bool, ) -> Result<&CrawlerStatus, ScrapflyError>

Fetch the status, optionally using the cached copy.

Source

pub async fn wait(&mut self, opts: WaitOptions) -> Result<(), ScrapflyError>

Poll status until the job reaches a terminal state.

Source

pub async fn cancel(&self) -> Result<(), ScrapflyError>

Cancel the running crawl. No-op if already finished server-side.

Source

pub async fn urls( &self, status_filter: Option<&str>, page: u32, per_page: u32, ) -> Result<CrawlerUrls, ScrapflyError>

Paginated URL listing.

Source

pub async fn read( &self, target_url: &str, format: CrawlerContentFormat, ) -> Result<Option<CrawlContent>, ScrapflyError>

Read a single URL’s content and wrap it in a CrawlContent. Returns Ok(None) when the URL isn’t part of the crawl.

Source

pub async fn read_string( &self, target_url: &str, format: CrawlerContentFormat, ) -> Result<String, ScrapflyError>

Read the raw content string (an empty string when the URL isn’t part of the crawl).

Source

pub async fn read_batch( &self, urls: &[String], formats: &[CrawlerContentFormat], ) -> Result<BTreeMap<String, BTreeMap<String, String>>, ScrapflyError>

Batch read up to 100 URLs.

Source

pub async fn contents( &self, format: CrawlerContentFormat, limit: Option<u32>, offset: Option<u32>, ) -> Result<CrawlerContents, ScrapflyError>

Bulk JSON contents.

Source

pub async fn warc(&mut self) -> Result<&CrawlerArtifact, ScrapflyError>

Download and cache the WARC artifact.

Source

pub async fn har(&mut self) -> Result<&CrawlerArtifact, ScrapflyError>

Download and cache the HAR artifact.

Auto Trait Implementations§

§

impl<'a> !Freeze for Crawl<'a>

§

impl<'a> !RefUnwindSafe for Crawl<'a>

§

impl<'a> Send for Crawl<'a>

§

impl<'a> Sync for Crawl<'a>

§

impl<'a> Unpin for Crawl<'a>

§

impl<'a> UnsafeUnpin for Crawl<'a>

§

impl<'a> !UnwindSafe for Crawl<'a>

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> PolicyExt for T
where T: ?Sized,

Source§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow only if self and other return Action::Follow. Read more
Source§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow if either self or other returns Action::Follow. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

impl<T> ErasedDestructor for T
where T: 'static,

Source§

impl<T> MaybeSendSync for T