blockless_sdk/bless_crawl/mod.rs

//! # BlessCrawl - Distributed Web Scraping SDK
//!
//! Provides distributed web scraping across the BLESS network's browser nodes.
//!
//! ## Features
//!
//! - **scrape()**: Extract content from a URL as markdown
//! - **map()**: Discover and extract all links from a webpage
//! - **crawl()**: Recursively crawl websites with depth controls
//!
//! ## Limits
//!
//! - Timeout: 15s default, 120s max
//! - Wait time: 3s default, 20s max
//! - Result buffer sizes: 2MB (scrape), 1MB (map), 8MB (crawl)
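//!
//! ## Example
//!
//! An illustrative sketch of the intended usage (marked `ignore` because the
//! host functions are only available inside the BLESS runtime, and the exact
//! crate import path is an assumption):
//!
//! ```ignore
//! use blockless_sdk::bless_crawl::{BlessCrawl, ScrapeOptions};
//!
//! let client = BlessCrawl::with_config(ScrapeOptions::default())?;
//! let response = client.scrape("https://example.com", None)?;
//! println!("{}", response.data.content);
//! ```
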
mod html_to_markdown;
mod html_transform;

use html_to_markdown::parse_markdown;
pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions};
use std::collections::HashMap;

type Handle = u32;
type ExitCode = u8;

#[cfg(not(feature = "mock-ffi"))]
#[link(wasm_import_module = "bless_crawl")]
extern "C" {
    /// Scrape webpage content and return as markdown
    #[allow(clippy::too_many_arguments)]
    fn scrape(
        h: *mut Handle,
        url_ptr: *const u8,
        url_len: usize,
        options_ptr: *const u8,
        options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode;

    /// Close and cleanup a web scraper instance
    fn close(h: Handle) -> ExitCode;
}

/// Mock stand-ins for the host FFI, compiled when the `mock-ffi` feature is
/// enabled. Both functions simply report failure (exit code 1) and never write
/// into the result buffer.
#[cfg(feature = "mock-ffi")]
#[allow(unused_variables)]
mod mock_ffi {
    use super::{ExitCode, Handle};

    #[allow(clippy::too_many_arguments)]
    pub unsafe fn scrape(
        h: *mut Handle,
        _url_ptr: *const u8,
        _url_len: usize,
        _options_ptr: *const u8,
        _options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode {
        1
    }

    pub unsafe fn close(_h: Handle) -> ExitCode {
        1
    }
}

#[cfg(feature = "mock-ffi")]
use mock_ffi::*;

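/// Options for a single scrape request; see the builder methods on
/// `ScrapeOptions` below. An illustrative builder chain (the values are
/// arbitrary examples, not recommended defaults):
///
/// ```ignore
/// let opts = ScrapeOptions::new()
///     .with_format(Format::Markdown)
///     .with_viewport(1280, 800)
///     .with_user_agent("my-crawler/0.1".to_string());
/// ```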
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct ScrapeOptions {
    pub timeout: u32,
    pub wait_time: u32,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_tags: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub exclude_tags: Option<Vec<String>>,
    pub only_main_content: bool,
    pub format: Format,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub viewport: Option<Viewport>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub user_agent: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub headers: Option<HashMap<String, String>>,
}

impl Default for ScrapeOptions {
    fn default() -> Self {
        Self {
            timeout: BlessCrawl::DEFAULT_TIMEOUT_MS,
            wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS,
            include_tags: None,
            exclude_tags: None,
            only_main_content: false,
            format: Format::Markdown,
            viewport: None,
            user_agent: None,
            headers: None,
        }
    }
}

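/// Output format for scraped content. `Format` also implements `FromStr`, so a
/// format can be parsed from its lowercase name; a small illustrative check:
///
/// ```ignore
/// let fmt: Format = "markdown".parse().unwrap();
/// assert_eq!(fmt, Format::Markdown);
/// ```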
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum Format {
    #[default]
    #[serde(rename = "markdown")]
    Markdown,
    #[serde(rename = "html")]
    Html,
    #[serde(rename = "json")]
    Json,
}

impl std::str::FromStr for Format {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "markdown" => Ok(Format::Markdown),
            "html" => Ok(Format::Html),
            "json" => Ok(Format::Json),
            _ => Err(()),
        }
    }
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct Viewport {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub width: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub height: Option<u32>,
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct MapOptions {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub link_types: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub filter_extensions: Option<Vec<String>>,
}

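/// Options for recursive crawls. An illustrative builder chain using the
/// setters defined below (note that `crawl()` is currently a stub and returns
/// an empty page list):
///
/// ```ignore
/// let opts = CrawlOptions::new()
///     .with_limit(50)
///     .with_max_depth(2)
///     .with_parallel_requests(4);
/// ```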
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct CrawlOptions {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_depth: Option<u8>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub exclude_paths: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_paths: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub follow_external: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub delay_between_requests: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub parallel_requests: Option<u32>,
}

#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct PageMetadata {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    pub url: String,
    pub status_code: u16,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub keywords: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub robots: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub author: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub creator: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub publisher: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_image: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_site_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_type: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_image: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_card: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_site: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_creator: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub favicon: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub viewport: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub referrer: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_type: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scrape_id: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub proxy_used: Option<String>,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapeData {
    pub success: bool,
    pub timestamp: u64,
    pub format: Format,
    pub content: String,
    pub metadata: PageMetadata,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Response<T> {
    pub success: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    pub data: T,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LinkInfo {
    pub url: String,
    // TODO: use enum instead of string
    pub link_type: String, // "internal", "external", "anchor"
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MapData {
    pub url: String,
    pub links: Vec<LinkInfo>,
    pub total_links: usize,
    pub timestamp: u64,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlError {
    pub url: String,
    pub error: String,
    pub depth: u32,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlData<T> {
    pub root_url: String,
    pub pages: Vec<T>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub link_map: Option<MapData>,
    pub depth_reached: u8,
    pub total_pages: usize,
    pub errors: Vec<CrawlError>,
}

impl ScrapeOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_include_tags(mut self, tags: Vec<String>) -> Self {
        self.include_tags = Some(tags);
        self
    }

    pub fn with_exclude_tags(mut self, tags: Vec<String>) -> Self {
        self.exclude_tags = Some(tags);
        self
    }

    pub fn with_format(mut self, format: Format) -> Self {
        self.format = format;
        self
    }

    pub fn with_viewport(mut self, width: u32, height: u32) -> Self {
        self.viewport = Some(Viewport {
            width: Some(width),
            height: Some(height),
        });
        self
    }

    pub fn with_user_agent(mut self, user_agent: String) -> Self {
        self.user_agent = Some(user_agent);
        self
    }

    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
        self.headers = Some(headers);
        self
    }
}

impl MapOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_link_types(mut self, link_types: Vec<String>) -> Self {
        self.link_types = Some(link_types);
        self
    }

    pub fn with_base_url(mut self, base_url: String) -> Self {
        self.base_url = Some(base_url);
        self
    }

    pub fn with_filter_extensions(mut self, extensions: Vec<String>) -> Self {
        self.filter_extensions = Some(extensions);
        self
    }
}

impl CrawlOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_limit(mut self, limit: u32) -> Self {
        self.limit = Some(limit);
        self
    }

    pub fn with_max_depth(mut self, max_depth: u8) -> Self {
        self.max_depth = Some(max_depth);
        self
    }

    pub fn with_exclude_paths(mut self, paths: Vec<String>) -> Self {
        self.exclude_paths = Some(paths);
        self
    }

    pub fn with_include_paths(mut self, paths: Vec<String>) -> Self {
        self.include_paths = Some(paths);
        self
    }

    pub fn with_follow_external(mut self, follow: bool) -> Self {
        self.follow_external = Some(follow);
        self
    }

    pub fn with_delay_between_requests(mut self, delay: u32) -> Self {
        self.delay_between_requests = Some(delay);
        self
    }

    pub fn with_parallel_requests(mut self, parallel: u32) -> Self {
        self.parallel_requests = Some(parallel);
        self
    }
}

/// BlessCrawl client for distributed web scraping operations.
#[derive(Debug, Clone, Default)]
pub struct BlessCrawl {
    inner: Handle,
    config: ScrapeOptions,
}

impl BlessCrawl {
    /// Default timeout in milliseconds (15 seconds)
    pub const DEFAULT_TIMEOUT_MS: u32 = 15000;
    /// Default wait time in milliseconds (3 seconds)
    pub const DEFAULT_WAIT_TIME_MS: u32 = 3000;

    /// Maximum timeout in milliseconds (2 minutes)
    pub const MAX_TIMEOUT_MS: u32 = 120000;
    /// Maximum wait time in milliseconds (20 seconds)
    pub const MAX_WAIT_TIME_MS: u32 = 20000;

    /// Maximum scrape result buffer size in bytes (2MB)
    pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024;

    /// Maximum map result buffer size in bytes (1MB)
    pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024;

    /// Maximum crawl result buffer size in bytes (8MB)
    pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024;

    /// Creates a new BlessCrawl instance with the given configuration.
    pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind> {
        let instance = Self { inner: 0, config };
        instance.validate_config(&instance.config)?;
        Ok(instance)
    }

    fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> {
        if config.timeout > Self::MAX_TIMEOUT_MS {
            return Err(WebScrapeErrorKind::InvalidTimeout);
        }
        if config.wait_time > Self::MAX_WAIT_TIME_MS {
            return Err(WebScrapeErrorKind::InvalidWaitTime);
        }
        Ok(())
    }

    /// Returns a reference to the current configuration.
    pub fn get_config(&self) -> &ScrapeOptions {
        &self.config
    }

    /// Returns the raw host handle backing this instance (0 until the host
    /// initializes it).
    pub fn handle(&self) -> Handle {
        self.inner
    }

    /// Scrapes a webpage and returns its content in the requested format,
    /// together with page metadata.
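    ///
    /// An illustrative call (marked `ignore`: the scrape host function is only
    /// available inside the BLESS runtime):
    ///
    /// ```ignore
    /// let client = BlessCrawl::default();
    /// let options = ScrapeOptions::new().with_format(Format::Html);
    /// match client.scrape("https://example.com", Some(options)) {
    ///     Ok(resp) => println!("{}", resp.data.content),
    ///     Err(e) => eprintln!("scrape failed: {}", e),
    /// }
    /// ```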
    pub fn scrape(
        &self,
        url: &str,
        options: Option<ScrapeOptions>,
    ) -> Result<Response<ScrapeData>, WebScrapeErrorKind> {
        // Use provided options or fall back to instance config
        let config = if let Some(opts) = options {
            self.validate_config(&opts)?;
            opts
        } else {
            self.config.clone()
        };

        // ScrapeOptions only contains string-keyed maps, so serialization cannot fail.
        let options_json =
            serde_json::to_vec(&config).expect("serializing ScrapeOptions should not fail");

        let mut handle = self.inner;
        let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE];
        let mut bytes_written: usize = 0;

        let code = unsafe {
            scrape(
                &mut handle,
                url.as_ptr(),
                url.len(),
                options_json.as_ptr(),
                options_json.len(),
                result_buf.as_mut_ptr(),
                result_buf.len(),
                &mut bytes_written,
            )
        };

        if code != 0 {
            return Err(code.into());
        }
        if bytes_written == 0 {
            return Err(WebScrapeErrorKind::EmptyResponse);
        }
        if bytes_written > result_buf.len() {
            return Err(WebScrapeErrorKind::MemoryError);
        }

        let result_bytes = &result_buf[..bytes_written];

        // deserialize the host's JSON response
        let mut scrape_response = serde_json::from_slice::<Response<ScrapeData>>(result_bytes)
            .map_err(|e| {
                eprintln!("error: {:?}", e);
                WebScrapeErrorKind::ParseError
            })?;

        if let Some(error) = scrape_response.error {
            return Err(WebScrapeErrorKind::RuntimeError(error));
        }

        // post-process the returned HTML (tag filtering, main-content extraction)
        scrape_response.data.content = transform_html(TransformHtmlOptions {
            html: scrape_response.data.content,
            url: scrape_response.data.metadata.url.clone(),
            include_tags: config.include_tags.unwrap_or_default(),
            exclude_tags: config.exclude_tags.unwrap_or_default(),
            only_main_content: config.only_main_content,
        })
        .map_err(|e| {
            eprintln!("error: {:?}", e);
            WebScrapeErrorKind::TransformError
        })?;

        // if the requested format is markdown, convert the transformed HTML to markdown
        match config.format {
            Format::Markdown => {
                scrape_response.data.content = parse_markdown(&scrape_response.data.content);
            }
            Format::Html => (), // transformed HTML is returned as-is
            Format::Json => unimplemented!(), // JSON output is not supported yet
        }

        // return the post-processed response
        Ok(scrape_response)
    }

    /// Extracts all links from a webpage, categorized by type.
    ///
    /// Currently a stub that returns an empty link list (see the TODO below).
    pub fn map(
        &self,
        url: &str,
        options: Option<MapOptions>,
    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
        let _map_options = options.unwrap_or_default();

        // let scrape_response = self.scrape(url, None)?;
        // TODO: implement map by post-processing the scrape response or using fetch

        Ok(Response {
            success: true,
            error: None,
            data: MapData {
                url: url.to_string(),
                links: vec![],
                total_links: 0,
                timestamp: 0,
            },
        })
    }

    /// Recursively crawls a website with configurable depth and filtering.
    ///
    /// Currently a stub that returns an empty page list (see the TODO below).
    pub fn crawl(
        &self,
        url: &str,
        options: Option<CrawlOptions>,
    ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
        let _crawl_options = options.unwrap_or_default();

        // TODO: implement crawl by post-processing the scrape response or using fetch

        Ok(Response {
            success: true,
            error: None,
            data: CrawlData {
                root_url: url.to_string(),
                pages: vec![],
                link_map: None,
                depth_reached: 0,
                total_pages: 0,
                errors: vec![],
            },
        })
    }
}

impl Drop for BlessCrawl {
    fn drop(&mut self) {
        // if the handle is 0, it means the instance was never initialized on the host
        if self.inner == 0 {
            return;
        }
        let code = unsafe { close(self.inner) };
        if code != 0 {
            eprintln!("Error closing web scraper: {}", code);
        }
    }
}

#[derive(Debug)]
pub enum WebScrapeErrorKind {
    InvalidUrl,
    Timeout,
    NetworkError,
    RenderingError,
    MemoryError,
    DepthExceeded,
    RateLimited,
    TransformError,
    Utf8Error,
    ParseError,
    ScrapeFailed,
    MapFailed,
    CrawlFailed,
    EmptyResponse,
    InvalidTimeout,
    InvalidWaitTime,
    RuntimeError(String),
}

impl From<u8> for WebScrapeErrorKind {
    fn from(code: u8) -> Self {
        match code {
            1 => WebScrapeErrorKind::InvalidUrl,
            2 => WebScrapeErrorKind::Timeout,
            3 => WebScrapeErrorKind::NetworkError,
            4 => WebScrapeErrorKind::RenderingError,
            5 => WebScrapeErrorKind::MemoryError,
            6 => WebScrapeErrorKind::DepthExceeded,
            7 => WebScrapeErrorKind::RateLimited,
            8 => WebScrapeErrorKind::TransformError,
            9 => WebScrapeErrorKind::InvalidTimeout,
            10 => WebScrapeErrorKind::InvalidWaitTime,
            _ => WebScrapeErrorKind::RuntimeError(format!("Unknown error code: {}", code)),
        }
    }
}

impl std::fmt::Display for WebScrapeErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"),
            WebScrapeErrorKind::Timeout => write!(f, "Request timeout"),
            WebScrapeErrorKind::NetworkError => write!(f, "Network error"),
            WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"),
            WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"),
            WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"),
            WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"),
            WebScrapeErrorKind::TransformError => write!(f, "Transform error"),
            WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"),
            WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"),
            WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"),
            WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"),
            WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"),
            WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"),
            WebScrapeErrorKind::InvalidTimeout => {
                write!(f, "Timeout exceeds maximum allowed (120s)")
            }
            WebScrapeErrorKind::InvalidWaitTime => {
                write!(f, "Wait time exceeds maximum allowed (20s)")
            }
            WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error),
        }
    }
}

impl std::error::Error for WebScrapeErrorKind {}