Struct BlessCrawl

Source

pub struct BlessCrawl { /* private fields */ }

Expand description

BlessCrawl client for distributed web scraping operations.

Implementations§

Source §

impl BlessCrawl

Source

pub const DEFAULT_TIMEOUT_MS: u32 = 15_000u32

Default timeout in milliseconds (15 seconds)

Source

pub const DEFAULT_WAIT_TIME_MS: u32 = 3_000u32

Default wait time in milliseconds (3 seconds)

Source

pub const MAX_TIMEOUT_MS: u32 = 120_000u32

Maximum timeout in milliseconds (2 minutes)

Source

pub const MAX_WAIT_TIME_MS: u32 = 20_000u32

Maximum wait time in milliseconds (20 seconds)

Source

pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2_097_152usize

Maximum result buffer size for scrape operations, in bytes (2 MiB)

Source

pub const MAX_MAP_BUFFER_SIZE: usize = 1_048_576usize

Maximum result buffer size for map operations, in bytes (1 MiB)

Source

pub const MAX_CRAWL_BUFFER_SIZE: usize = 8_388_608usize

Maximum result buffer size for crawl operations, in bytes (8 MiB)

Source

pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind>

Creates a new BlessCrawl instance with the given configuration.

Source

pub fn get_config(&self) -> &ScrapeOptions

Returns a reference to the current configuration.

Source

pub fn handle(&self) -> u32

Source

pub fn scrape( &self, url: &str, options: Option<ScrapeOptions>, ) -> Result<Response<ScrapeData>, WebScrapeErrorKind>

Scrapes webpage content and returns it as markdown with metadata.

Examples found in repository ?

examples/web-scrape.rs (line 27)

19 fn example_scraping() {
20     println!("--- Example 1: Basic Web Scraping ---");
21
22     let url = "https://example.com";
23     println!("scraping: {}...", url);
24
25     // First scrape with default config
26     let response = BlessCrawl::default()
27         .scrape(url, None)
28         .expect("Failed to scrape");
29     println!("response with default config: {:?}", response);
30     println!();
31     println!(
32         "---------- markdown ----------\n{}\n------------------------------",
33         response.data.content
34     );
35 }

Source

pub fn map( &self, url: &str, options: Option<MapOptions>, ) -> Result<Response<MapData>, WebScrapeErrorKind>

Extracts all links from a webpage, categorized by type.

Examples found in repository ?

examples/web-scrape.rs (line 49)

37 fn example_mapping() {
38     println!("--- Example 2: Link Mapping/Discovery ---");
39
40     let url = "https://example.com";
41     println!("Mapping links from: {}", url);
42
43     let options = MapOptions::new()
44         .with_link_types(vec!["internal".to_string(), "external".to_string()])
45         .with_base_url(url.to_string())
46         .with_filter_extensions(vec![".html".to_string(), ".htm".to_string()]);
47
48     let response = BlessCrawl::default()
49         .map(url, Some(options))
50         .expect("Failed to map");
51     println!("response: {:?}", response);
52     println!();
53     println!(
54         "------------ links ------------\n{:?}\n------------------------------",
55         response.data.links
56     );
57     println!();
58     println!(
59         "------------ total links ------------\n{}\n------------------------------",
60         response.data.total_links
61     );
62 }

Source

pub fn crawl( &self, url: &str, options: Option<CrawlOptions>, ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind>

Recursively crawls a website with configurable depth and filtering.

Examples found in repository ?

examples/web-scrape.rs (line 80)

64 fn example_crawling() {
65     println!("--- Example 3: Recursive Website Crawling ---");
66
67     let url = "https://example.com";
68     println!("Crawling website: {}", url);
69
70     let options = CrawlOptions::new()
71         .with_max_depth(2)
72         .with_limit(10)
73         .with_include_paths(vec!["/".to_string()])
74         .with_exclude_paths(vec!["/admin/".to_string(), "/api/".to_string()])
75         .with_follow_external(false)
76         .with_delay_between_requests(1000)
77         .with_parallel_requests(3);
78
79     let response = BlessCrawl::default()
80         .crawl(url, Some(options))
81         .expect("Failed to crawl");
82     println!("response: {:?}", response);
83     println!();
84     println!(
85         "------------ pages ------------\n{:?}\n------------------------------",
86         response.data.pages
87     );
88     println!();
89     println!(
90         "------------ total pages ------------\n{}\n------------------------------",
91         response.data.total_pages
92     );
93 }