blockless_sdk/bless_crawl/mod.rs

//! # BlessCrawl - Distributed Web Scraping SDK
//!
//! Provides distributed web scraping across the BLESS network's browser nodes.
//!
//! ## Features
//!
//! - **scrape()**: Extract content from a URL as markdown
//! - **map()**: Discover and extract all links from a webpage
//! - **crawl()**: Recursively crawl websites with depth controls
//!
//! ## Limits
//!
//! - Timeout: 15s default, 120s max
//! - Wait time: 3s default, 20s max
//! - Buffer sizes: 2MB (scrape), 1MB (map), 8MB (crawl)
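//!
//! ## Example
//!
//! A minimal usage sketch, assuming the module is exposed as
//! `blockless_sdk::bless_crawl` and that the example URL is reachable from a
//! browser node:
//!
//! ```ignore
//! use blockless_sdk::bless_crawl::BlessCrawl;
//!
//! // Scrape with the default configuration (15s timeout, 3s wait, markdown output).
//! let scraper = BlessCrawl::default();
//! let response = scraper.scrape("https://example.com", None).expect("scrape failed");
//! println!("{}", response.data.content);
//! ```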

mod html_to_markdown;
mod html_transform;

use html_to_markdown::parse_markdown;
pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions};
use std::collections::HashMap;

type Handle = u32;
type ExitCode = u8;

#[cfg(not(feature = "mock-ffi"))]
#[link(wasm_import_module = "bless_crawl")]
extern "C" {
    /// Scrape webpage content and return as markdown
    #[allow(clippy::too_many_arguments)]
    fn scrape(
        h: *mut Handle,
        url_ptr: *const u8,
        url_len: usize,
        options_ptr: *const u8,
        options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode;

    /// Close and cleanup a web scraper instance
    fn close(h: Handle) -> ExitCode;
}

#[cfg(feature = "mock-ffi")]
#[allow(unused_variables)]
mod mock_ffi {
    use super::{ExitCode, Handle};

    #[allow(clippy::too_many_arguments)]
    pub unsafe fn scrape(
        h: *mut Handle,
        _url_ptr: *const u8,
        _url_len: usize,
        _options_ptr: *const u8,
        _options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode {
        1
    }

    pub unsafe fn close(_h: Handle) -> ExitCode {
        1
    }
}

#[cfg(feature = "mock-ffi")]
use mock_ffi::*;

#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct ScrapeOptions {
    /// Page-load timeout in milliseconds (default 15000, max 120000).
    pub timeout: u32,
    /// Extra wait time in milliseconds (default 3000, max 20000).
    pub wait_time: u32,
    pub include_tags: Option<Vec<String>>,
    pub exclude_tags: Option<Vec<String>>,
    pub only_main_content: bool,
    pub format: Format,
    pub viewport: Option<Viewport>,
    pub user_agent: Option<String>,
    pub headers: Option<HashMap<String, String>>,
}

impl Default for ScrapeOptions {
    fn default() -> Self {
        Self {
            timeout: BlessCrawl::DEFAULT_TIMEOUT_MS,
            wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS,
            include_tags: None,
            exclude_tags: None,
            only_main_content: false,
            format: Format::Markdown,
            viewport: None,
            user_agent: None,
            headers: None,
        }
    }
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum Format {
    #[default]
    #[serde(rename = "markdown")]
    Markdown,
    #[serde(rename = "html")]
    Html,
    #[serde(rename = "json")]
    Json,
}

impl std::str::FromStr for Format {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "markdown" => Ok(Format::Markdown),
            "html" => Ok(Format::Html),
            "json" => Ok(Format::Json),
            _ => Err(()),
        }
    }
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct Viewport {
    pub width: Option<u32>,
    pub height: Option<u32>,
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct MapOptions {
    pub link_types: Option<Vec<String>>,
    pub base_url: Option<String>,
    pub filter_extensions: Option<Vec<String>>,
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct CrawlOptions {
    pub limit: Option<u32>,
    pub max_depth: Option<u8>,
    pub exclude_paths: Option<Vec<String>>,
    pub include_paths: Option<Vec<String>>,
    pub follow_external: Option<bool>,
    pub delay_between_requests: Option<u32>,
    pub parallel_requests: Option<u32>,
}

#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct PageMetadata {
    pub title: Option<String>,
    pub description: Option<String>,
    pub url: String,
    pub status_code: u16,
    pub language: Option<String>,
    pub keywords: Option<String>,
    pub robots: Option<String>,
    pub author: Option<String>,
    pub creator: Option<String>,
    pub publisher: Option<String>,
    pub og_title: Option<String>,
    pub og_description: Option<String>,
    pub og_image: Option<String>,
    pub og_url: Option<String>,
    pub og_site_name: Option<String>,
    pub og_type: Option<String>,
    pub twitter_title: Option<String>,
    pub twitter_description: Option<String>,
    pub twitter_image: Option<String>,
    pub twitter_card: Option<String>,
    pub twitter_site: Option<String>,
    pub twitter_creator: Option<String>,
    pub favicon: Option<String>,
    pub viewport: Option<String>,
    pub referrer: Option<String>,
    pub content_type: Option<String>,
    pub scrape_id: Option<String>,
    pub source_url: Option<String>,
    pub proxy_used: Option<String>,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapeData {
    pub success: bool,
    pub timestamp: u64,
    pub format: Format,
    pub content: String,
    pub metadata: PageMetadata,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Response<T> {
    pub success: bool,
    pub error: Option<String>,
    pub data: T,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LinkInfo {
    pub url: String,
    // TODO: use enum instead of string
    pub link_type: String, // "internal", "external", "anchor"
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MapData {
    pub url: String,
    pub links: Vec<LinkInfo>,
    pub total_links: usize,
    pub timestamp: u64,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlError {
    pub url: String,
    pub error: String,
    pub depth: u32,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlData<T> {
    pub root_url: String,
    pub pages: Vec<T>,
    pub link_map: Option<MapData>,
    pub depth_reached: u8,
    pub total_pages: usize,
    pub errors: Vec<CrawlError>,
}

impl ScrapeOptions {
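    /// Creates options with the default configuration; the `with_*` helpers
    /// below can be chained to customize it. A minimal sketch (the tag names
    /// and viewport size are illustrative, and the import path assumes the
    /// module is exposed as `blockless_sdk::bless_crawl`):
    ///
    /// ```ignore
    /// use blockless_sdk::bless_crawl::{Format, ScrapeOptions};
    ///
    /// let options = ScrapeOptions::new()
    ///     .with_format(Format::Markdown)
    ///     .with_include_tags(vec!["article".to_string(), "main".to_string()])
    ///     .with_exclude_tags(vec!["nav".to_string(), "footer".to_string()])
    ///     .with_viewport(1280, 800);
    /// ```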
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_include_tags(mut self, tags: Vec<String>) -> Self {
        self.include_tags = Some(tags);
        self
    }

    pub fn with_exclude_tags(mut self, tags: Vec<String>) -> Self {
        self.exclude_tags = Some(tags);
        self
    }

    pub fn with_format(mut self, format: Format) -> Self {
        self.format = format;
        self
    }

    pub fn with_viewport(mut self, width: u32, height: u32) -> Self {
        self.viewport = Some(Viewport {
            width: Some(width),
            height: Some(height),
        });
        self
    }

    pub fn with_user_agent(mut self, user_agent: String) -> Self {
        self.user_agent = Some(user_agent);
        self
    }

    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
        self.headers = Some(headers);
        self
    }
}

impl MapOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_link_types(mut self, link_types: Vec<String>) -> Self {
        self.link_types = Some(link_types);
        self
    }

    pub fn with_base_url(mut self, base_url: String) -> Self {
        self.base_url = Some(base_url);
        self
    }

    pub fn with_filter_extensions(mut self, extensions: Vec<String>) -> Self {
        self.filter_extensions = Some(extensions);
        self
    }
}

impl CrawlOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_limit(mut self, limit: u32) -> Self {
        self.limit = Some(limit);
        self
    }

    pub fn with_max_depth(mut self, max_depth: u8) -> Self {
        self.max_depth = Some(max_depth);
        self
    }

    pub fn with_exclude_paths(mut self, paths: Vec<String>) -> Self {
        self.exclude_paths = Some(paths);
        self
    }

    pub fn with_include_paths(mut self, paths: Vec<String>) -> Self {
        self.include_paths = Some(paths);
        self
    }

    pub fn with_follow_external(mut self, follow: bool) -> Self {
        self.follow_external = Some(follow);
        self
    }

    pub fn with_delay_between_requests(mut self, delay: u32) -> Self {
        self.delay_between_requests = Some(delay);
        self
    }

    pub fn with_parallel_requests(mut self, parallel: u32) -> Self {
        self.parallel_requests = Some(parallel);
        self
    }
}

/// BlessCrawl client for distributed web scraping operations.
#[derive(Debug, Clone, Default)]
pub struct BlessCrawl {
    inner: Handle,
    config: ScrapeOptions,
}

impl BlessCrawl {
    /// Default timeout in milliseconds (15 seconds)
    pub const DEFAULT_TIMEOUT_MS: u32 = 15000;
    /// Default wait time in milliseconds (3 seconds)
    pub const DEFAULT_WAIT_TIME_MS: u32 = 3000;

    /// Maximum timeout in milliseconds (2 minutes)
    pub const MAX_TIMEOUT_MS: u32 = 120000;
    /// Maximum wait time in milliseconds (20 seconds)
    pub const MAX_WAIT_TIME_MS: u32 = 20000;

    /// Maximum scrape result buffer size in bytes (2MB)
    pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024;

    /// Maximum map result buffer size in bytes (1MB)
    pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024;

    /// Maximum crawl result buffer size in bytes (8MB)
    pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024;

    /// Creates a new BlessCrawl instance with the given configuration.
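    ///
    /// Returns an error if the configured timeout or wait time exceeds the
    /// allowed maximums. A minimal sketch, assuming the module is exposed as
    /// `blockless_sdk::bless_crawl`:
    ///
    /// ```ignore
    /// use blockless_sdk::bless_crawl::{BlessCrawl, ScrapeOptions};
    ///
    /// let mut config = ScrapeOptions::default();
    /// config.timeout = 30_000;  // 30s, within the 120s maximum
    /// config.wait_time = 5_000; // 5s, within the 20s maximum
    /// let scraper = BlessCrawl::with_config(config).expect("config within limits");
    /// ```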
    pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind> {
        let instance = Self { inner: 0, config };
        instance.validate_config(&instance.config)?;
        Ok(instance)
    }

    fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> {
        if config.timeout > Self::MAX_TIMEOUT_MS {
            return Err(WebScrapeErrorKind::InvalidTimeout);
        }
        if config.wait_time > Self::MAX_WAIT_TIME_MS {
            return Err(WebScrapeErrorKind::InvalidWaitTime);
        }
        Ok(())
    }

    /// Returns a reference to the current configuration.
    pub fn get_config(&self) -> &ScrapeOptions {
        &self.config
    }

    /// Returns the underlying host handle (0 until initialized by the host).
    pub fn handle(&self) -> Handle {
        self.inner
    }

    /// Scrapes webpage content and returns it as markdown with metadata.
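    ///
    /// Per-call `options` override the instance configuration. A minimal
    /// sketch (the URL and tag names are illustrative, and the import path
    /// assumes the module is exposed as `blockless_sdk::bless_crawl`):
    ///
    /// ```ignore
    /// use blockless_sdk::bless_crawl::{BlessCrawl, ScrapeOptions};
    ///
    /// let scraper = BlessCrawl::default();
    /// let response = scraper
    ///     .scrape(
    ///         "https://example.com",
    ///         Some(ScrapeOptions::new().with_exclude_tags(vec!["script".to_string()])),
    ///     )
    ///     .expect("scrape failed");
    ///
    /// println!("title: {:?}", response.data.metadata.title);
    /// println!("{}", response.data.content);
    /// ```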
    pub fn scrape(
        &self,
        url: &str,
        options: Option<ScrapeOptions>,
    ) -> Result<Response<ScrapeData>, WebScrapeErrorKind> {
        // Use provided options or fall back to instance config
        let config = if let Some(opts) = options {
            self.validate_config(&opts)?;
            opts
        } else {
            self.config.clone()
        };

        let options_json = serde_json::to_vec(&config).unwrap();

        let mut handle = self.inner;
        let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE];
        let mut bytes_written: usize = 0;

        let code = unsafe {
            scrape(
                &mut handle,
                url.as_ptr(),
                url.len(),
                options_json.as_ptr(),
                options_json.len(),
                result_buf.as_mut_ptr(),
                result_buf.len(),
                &mut bytes_written,
            )
        };

        if code != 0 {
            return Err(code.into());
        }
        if bytes_written == 0 {
            return Err(WebScrapeErrorKind::EmptyResponse);
        }
        if bytes_written > result_buf.len() {
            return Err(WebScrapeErrorKind::MemoryError);
        }

        let result_bytes = &result_buf[..bytes_written];

        // deserialize the result to host ScrapeResponse
        let mut scrape_response = serde_json::from_slice::<Response<ScrapeData>>(result_bytes)
            .map_err(|e| {
                eprintln!("error: {:?}", e);
                WebScrapeErrorKind::ParseError
            })?;

        if let Some(error) = scrape_response.error {
            return Err(WebScrapeErrorKind::RuntimeError(error));
        }

        // post-process html
        scrape_response.data.content = transform_html(TransformHtmlOptions {
            html: scrape_response.data.content,
            url: scrape_response.data.metadata.url.clone(),
            include_tags: config.include_tags.unwrap_or_default(),
            exclude_tags: config.exclude_tags.unwrap_or_default(),
            only_main_content: config.only_main_content,
        })
        .map_err(|e| {
            eprintln!("error: {:?}", e);
            WebScrapeErrorKind::TransformError
        })?;

        // if the format is markdown, convert the transformed html to markdown
        match config.format {
            Format::Markdown => {
                scrape_response.data.content = parse_markdown(&scrape_response.data.content);
            }
            Format::Html => (), // no need to do anything
            Format::Json => unimplemented!(),
        }

        Ok(scrape_response)
    }

    /// Extracts all links from a webpage, categorized by type.
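    ///
    /// Note: link extraction is not implemented yet; the call currently
    /// succeeds with an empty link list. A minimal sketch of the intended
    /// usage (the URL is illustrative, and the import path assumes the module
    /// is exposed as `blockless_sdk::bless_crawl`):
    ///
    /// ```ignore
    /// use blockless_sdk::bless_crawl::{BlessCrawl, MapOptions};
    ///
    /// let scraper = BlessCrawl::default();
    /// let options = MapOptions::new().with_link_types(vec!["internal".to_string()]);
    /// let response = scraper.map("https://example.com", Some(options)).expect("map failed");
    /// for link in &response.data.links {
    ///     println!("{} ({})", link.url, link.link_type);
    /// }
    /// ```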
    pub fn map(
        &self,
        url: &str,
        options: Option<MapOptions>,
    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
        let _map_options = options.unwrap_or_default();

        // let scrape_response = self.scrape(url, None)?;
        // TODO: implement map by post-processing the scrape response or using fetch

        Ok(Response {
            success: true,
            error: None,
            data: MapData {
                url: url.to_string(),
                links: vec![],
                total_links: 0,
                timestamp: 0,
            },
        })
    }

    /// Recursively crawls a website with configurable depth and filtering.
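    ///
    /// Note: crawling is not implemented yet; the call currently succeeds with
    /// an empty page list. A minimal sketch of the intended usage (the URL and
    /// limits are illustrative, and the import path assumes the module is
    /// exposed as `blockless_sdk::bless_crawl`):
    ///
    /// ```ignore
    /// use blockless_sdk::bless_crawl::{BlessCrawl, CrawlOptions};
    ///
    /// let scraper = BlessCrawl::default();
    /// let options = CrawlOptions::new().with_limit(50).with_max_depth(2);
    /// let response = scraper.crawl("https://example.com", Some(options)).expect("crawl failed");
    /// println!("crawled {} pages", response.data.total_pages);
    /// ```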
    pub fn crawl(
        &self,
        url: &str,
        options: Option<CrawlOptions>,
    ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
        let _crawl_options = options.unwrap_or_default();

        // TODO: implement crawl by post-processing the scrape response or using fetch

        Ok(Response {
            success: true,
            error: None,
            data: CrawlData {
                root_url: url.to_string(),
                pages: vec![],
                link_map: None,
                depth_reached: 0,
                total_pages: 0,
                errors: vec![],
            },
        })
    }
}

impl Drop for BlessCrawl {
    fn drop(&mut self) {
        // if the handle is 0, it means the instance was never initialized on the host
        if self.inner == 0 {
            return;
        }
        let code = unsafe { close(self.inner) };
        if code != 0 {
            eprintln!("Error closing web scraper: {}", code);
        }
    }
}

#[derive(Debug)]
pub enum WebScrapeErrorKind {
    InvalidUrl,
    Timeout,
    NetworkError,
    RenderingError,
    MemoryError,
    DepthExceeded,
    RateLimited,
    TransformError,
    Utf8Error,
    ParseError,
    ScrapeFailed,
    MapFailed,
    CrawlFailed,
    EmptyResponse,
    InvalidTimeout,
    InvalidWaitTime,
    RuntimeError(String),
}

impl From<u8> for WebScrapeErrorKind {
    fn from(code: u8) -> Self {
        match code {
            1 => WebScrapeErrorKind::InvalidUrl,
            2 => WebScrapeErrorKind::Timeout,
            3 => WebScrapeErrorKind::NetworkError,
            4 => WebScrapeErrorKind::RenderingError,
            5 => WebScrapeErrorKind::MemoryError,
            6 => WebScrapeErrorKind::DepthExceeded,
            7 => WebScrapeErrorKind::RateLimited,
            8 => WebScrapeErrorKind::TransformError,
            9 => WebScrapeErrorKind::InvalidTimeout,
            10 => WebScrapeErrorKind::InvalidWaitTime,
            _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")),
        }
    }
}

impl std::fmt::Display for WebScrapeErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"),
            WebScrapeErrorKind::Timeout => write!(f, "Request timeout"),
            WebScrapeErrorKind::NetworkError => write!(f, "Network error"),
            WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"),
            WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"),
            WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"),
            WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"),
            WebScrapeErrorKind::TransformError => write!(f, "Transform error"),
            WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"),
            WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"),
            WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"),
            WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"),
            WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"),
            WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"),
            WebScrapeErrorKind::InvalidTimeout => {
                write!(f, "Timeout exceeds maximum allowed (120s)")
            }
            WebScrapeErrorKind::InvalidWaitTime => {
                write!(f, "Wait time exceeds maximum allowed (20s)")
            }
            WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error),
        }
    }
}

impl std::error::Error for WebScrapeErrorKind {}