mod html_to_markdown;
mod html_transform;

use html_to_markdown::parse_markdown;
pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions};
use std::collections::HashMap;

type Handle = u32;
type ExitCode = u8;

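// Host FFI surface. As used below, `scrape` writes a JSON-encoded
// `Response<ScrapeData>` into the caller-provided buffer, records the payload
// size in `bytes_written`, and returns exit code 0 on success (non-zero codes
// map to `WebScrapeErrorKind`).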
#[cfg(not(feature = "mock-ffi"))]
#[link(wasm_import_module = "bless_crawl")]
extern "C" {
    #[allow(clippy::too_many_arguments)]
    fn scrape(
        h: *mut Handle,
        url_ptr: *const u8,
        url_len: usize,
        options_ptr: *const u8,
        options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode;

    fn close(h: Handle) -> ExitCode;
}

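// Stub implementations used when the `mock-ffi` feature is enabled, so the
// crate links without a real `bless_crawl` host module. Both calls return
// exit code 1, which callers observe as an error.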
#[cfg(feature = "mock-ffi")]
#[allow(unused_variables)]
mod mock_ffi {
    use super::{ExitCode, Handle};

    #[allow(clippy::too_many_arguments)]
    pub unsafe fn scrape(
        h: *mut Handle,
        _url_ptr: *const u8,
        _url_len: usize,
        _options_ptr: *const u8,
        _options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode {
        1
    }

    pub unsafe fn close(_h: Handle) -> ExitCode {
        1
    }
}

#[cfg(feature = "mock-ffi")]
use mock_ffi::*;

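/// Options for a single scrape request. `timeout` and `wait_time` are in
/// milliseconds and are validated against [`BlessCrawl::MAX_TIMEOUT_MS`] and
/// [`BlessCrawl::MAX_WAIT_TIME_MS`].
///
/// Illustrative builder usage (a sketch; not compiled as a doctest):
/// ```ignore
/// let opts = ScrapeOptions::new()
///     .with_format(Format::Html)
///     .with_viewport(1280, 800)
///     .with_user_agent("my-agent/0.1".to_string());
/// ```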
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct ScrapeOptions {
    pub timeout: u32,
    pub wait_time: u32,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_tags: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub exclude_tags: Option<Vec<String>>,
    pub only_main_content: bool,
    pub format: Format,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub viewport: Option<Viewport>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub user_agent: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub headers: Option<HashMap<String, String>>,
}

impl Default for ScrapeOptions {
    fn default() -> Self {
        Self {
            timeout: BlessCrawl::DEFAULT_TIMEOUT_MS,
            wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS,
            include_tags: None,
            exclude_tags: None,
            only_main_content: false,
            format: Format::Markdown,
            viewport: None,
            user_agent: None,
            headers: None,
        }
    }
}

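/// Output format of scraped content. `Json` is accepted when parsing options
/// but is not yet implemented by [`BlessCrawl::scrape`].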
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum Format {
    #[default]
    #[serde(rename = "markdown")]
    Markdown,
    #[serde(rename = "html")]
    Html,
    #[serde(rename = "json")]
    Json,
}

impl std::str::FromStr for Format {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "markdown" => Ok(Format::Markdown),
            "html" => Ok(Format::Html),
            "json" => Ok(Format::Json),
            _ => Err(()),
        }
    }
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct Viewport {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub width: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub height: Option<u32>,
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct MapOptions {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub link_types: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub filter_extensions: Option<Vec<String>>,
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct CrawlOptions {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_depth: Option<u8>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub exclude_paths: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_paths: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub follow_external: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub delay_between_requests: Option<u32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub parallel_requests: Option<u32>,
}

#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct PageMetadata {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    pub url: String,
    pub status_code: u16,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub keywords: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub robots: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub author: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub creator: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub publisher: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_image: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_site_name: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub og_type: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_title: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_description: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_image: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_card: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_site: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub twitter_creator: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub favicon: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub viewport: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub referrer: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content_type: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scrape_id: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub proxy_used: Option<String>,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapeData {
    pub success: bool,
    pub timestamp: u64,
    pub format: Format,
    pub content: String,
    pub metadata: PageMetadata,
}

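/// Envelope shared by scrape, map, and crawl results; `error` carries the
/// host-reported message when an operation fails.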
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Response<T> {
    pub success: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    pub data: T,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LinkInfo {
    pub url: String,
    pub link_type: String,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MapData {
    pub url: String,
    pub links: Vec<LinkInfo>,
    pub total_links: usize,
    pub timestamp: u64,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlError {
    pub url: String,
    pub error: String,
    pub depth: u32,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlData<T> {
    pub root_url: String,
    pub pages: Vec<T>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub link_map: Option<MapData>,
    pub depth_reached: u8,
    pub total_pages: usize,
    pub errors: Vec<CrawlError>,
}

impl ScrapeOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_include_tags(mut self, tags: Vec<String>) -> Self {
        self.include_tags = Some(tags);
        self
    }

    pub fn with_exclude_tags(mut self, tags: Vec<String>) -> Self {
        self.exclude_tags = Some(tags);
        self
    }

    pub fn with_format(mut self, format: Format) -> Self {
        self.format = format;
        self
    }

    pub fn with_viewport(mut self, width: u32, height: u32) -> Self {
        self.viewport = Some(Viewport {
            width: Some(width),
            height: Some(height),
        });
        self
    }

    pub fn with_user_agent(mut self, user_agent: String) -> Self {
        self.user_agent = Some(user_agent);
        self
    }

    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
        self.headers = Some(headers);
        self
    }
}

impl MapOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_link_types(mut self, link_types: Vec<String>) -> Self {
        self.link_types = Some(link_types);
        self
    }

    pub fn with_base_url(mut self, base_url: String) -> Self {
        self.base_url = Some(base_url);
        self
    }

    pub fn with_filter_extensions(mut self, extensions: Vec<String>) -> Self {
        self.filter_extensions = Some(extensions);
        self
    }
}

impl CrawlOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_limit(mut self, limit: u32) -> Self {
        self.limit = Some(limit);
        self
    }

    pub fn with_max_depth(mut self, max_depth: u8) -> Self {
        self.max_depth = Some(max_depth);
        self
    }

    pub fn with_exclude_paths(mut self, paths: Vec<String>) -> Self {
        self.exclude_paths = Some(paths);
        self
    }

    pub fn with_include_paths(mut self, paths: Vec<String>) -> Self {
        self.include_paths = Some(paths);
        self
    }

    pub fn with_follow_external(mut self, follow: bool) -> Self {
        self.follow_external = Some(follow);
        self
    }

    pub fn with_delay_between_requests(mut self, delay: u32) -> Self {
        self.delay_between_requests = Some(delay);
        self
    }

    pub fn with_parallel_requests(mut self, parallel: u32) -> Self {
        self.parallel_requests = Some(parallel);
        self
    }
}

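/// Client for the `bless_crawl` host module. It holds the host handle and a
/// default [`ScrapeOptions`] applied when a call passes no per-request options.
///
/// Illustrative end-to-end usage (a sketch; assumes a host providing the
/// `bless_crawl` imports, so it is not compiled as a doctest):
/// ```ignore
/// let client = BlessCrawl::with_config(ScrapeOptions::default())?;
/// let page = client.scrape("https://example.com", None)?;
/// println!("{}", page.data.content);
/// ```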
#[derive(Debug, Clone, Default)]
pub struct BlessCrawl {
    inner: Handle,
    config: ScrapeOptions,
}

impl BlessCrawl {
    pub const DEFAULT_TIMEOUT_MS: u32 = 15000;
    pub const DEFAULT_WAIT_TIME_MS: u32 = 3000;

    pub const MAX_TIMEOUT_MS: u32 = 120000;
    pub const MAX_WAIT_TIME_MS: u32 = 20000;

    pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024;

    pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024;

    pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024;

    pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind> {
        let instance = Self { inner: 0, config };
        instance.validate_config(&instance.config)?;
        Ok(instance)
    }

    fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> {
        if config.timeout > Self::MAX_TIMEOUT_MS {
            return Err(WebScrapeErrorKind::InvalidTimeout);
        }
        if config.wait_time > Self::MAX_WAIT_TIME_MS {
            return Err(WebScrapeErrorKind::InvalidWaitTime);
        }
        Ok(())
    }

    pub fn get_config(&self) -> &ScrapeOptions {
        &self.config
    }

    pub fn handle(&self) -> Handle {
        self.inner
    }

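    /// Scrapes `url` and returns the content in the configured format. The
    /// flow: serialize the effective options to JSON, call the host `scrape`
    /// import, parse the returned `Response<ScrapeData>`, run the HTML
    /// transform, then convert to Markdown when requested.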
    pub fn scrape(
        &self,
        url: &str,
        options: Option<ScrapeOptions>,
    ) -> Result<Response<ScrapeData>, WebScrapeErrorKind> {
        let config = if let Some(opts) = options {
            self.validate_config(&opts)?;
            opts
        } else {
            self.config.clone()
        };

        let options_json = serde_json::to_vec(&config).unwrap();

        let mut handle = self.inner;
        let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE];
        let mut bytes_written: usize = 0;

        let code = unsafe {
            scrape(
                &mut handle,
                url.as_ptr(),
                url.len(),
                options_json.as_ptr(),
                options_json.len(),
                result_buf.as_mut_ptr(),
                result_buf.len(),
                &mut bytes_written,
            )
        };

        if code != 0 {
            return Err(code.into());
        }
        if bytes_written == 0 {
            return Err(WebScrapeErrorKind::EmptyResponse);
        }
        if bytes_written > result_buf.len() {
            return Err(WebScrapeErrorKind::MemoryError);
        }

        let result_bytes = &result_buf[..bytes_written];

        let mut scrape_response = serde_json::from_slice::<Response<ScrapeData>>(result_bytes)
            .map_err(|e| {
                eprintln!("failed to parse scrape response: {e:?}");
                WebScrapeErrorKind::ParseError
            })?;

        if let Some(error) = scrape_response.error {
            return Err(WebScrapeErrorKind::RuntimeError(error));
        }

        scrape_response.data.content = transform_html(TransformHtmlOptions {
            html: scrape_response.data.content,
            url: scrape_response.data.metadata.url.clone(),
            include_tags: config.include_tags.unwrap_or_default(),
            exclude_tags: config.exclude_tags.unwrap_or_default(),
            only_main_content: config.only_main_content,
        })
        .map_err(|e| {
            eprintln!("failed to transform scraped HTML: {e:?}");
            WebScrapeErrorKind::TransformError
        })?;

        match config.format {
            Format::Markdown => {
                scrape_response.data.content = parse_markdown(&scrape_response.data.content);
            }
            // The transform above already produced HTML; nothing further to do.
            Format::Html => (),
            // JSON output is not supported yet.
            Format::Json => unimplemented!(),
        }

        Ok(scrape_response)
    }

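    /// Maps the links on `url`. Currently a stub: `options` is ignored and an
    /// empty link set is returned.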
    pub fn map(
        &self,
        url: &str,
        options: Option<MapOptions>,
    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
        let _map_options = options.unwrap_or_default();

        Ok(Response {
            success: true,
            error: None,
            data: MapData {
                url: url.to_string(),
                links: vec![],
                total_links: 0,
                timestamp: 0,
            },
        })
    }

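    /// Crawls from `url`. Currently a stub: `options` is ignored and an empty
    /// crawl result is returned.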
    pub fn crawl(
        &self,
        url: &str,
        options: Option<CrawlOptions>,
    ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
        let _crawl_options = options.unwrap_or_default();

        Ok(Response {
            success: true,
            error: None,
            data: CrawlData {
                root_url: url.to_string(),
                pages: vec![],
                link_map: None,
                depth_reached: 0,
                total_pages: 0,
                errors: vec![],
            },
        })
    }
}

impl Drop for BlessCrawl {
    fn drop(&mut self) {
        // A zero handle means no host session was ever opened.
        if self.inner == 0 {
            return;
        }
        let code = unsafe { close(self.inner) };
        if code != 0 {
            eprintln!("Error closing web scraper: {}", code);
        }
    }
}

#[derive(Debug)]
pub enum WebScrapeErrorKind {
    InvalidUrl,
    Timeout,
    NetworkError,
    RenderingError,
    MemoryError,
    DepthExceeded,
    RateLimited,
    TransformError,
    Utf8Error,
    ParseError,
    ScrapeFailed,
    MapFailed,
    CrawlFailed,
    EmptyResponse,
    InvalidTimeout,
    InvalidWaitTime,
    RuntimeError(String),
}

impl From<u8> for WebScrapeErrorKind {
    fn from(code: u8) -> Self {
        match code {
            1 => WebScrapeErrorKind::InvalidUrl,
            2 => WebScrapeErrorKind::Timeout,
            3 => WebScrapeErrorKind::NetworkError,
            4 => WebScrapeErrorKind::RenderingError,
            5 => WebScrapeErrorKind::MemoryError,
            6 => WebScrapeErrorKind::DepthExceeded,
            7 => WebScrapeErrorKind::RateLimited,
            8 => WebScrapeErrorKind::TransformError,
            9 => WebScrapeErrorKind::InvalidTimeout,
            10 => WebScrapeErrorKind::InvalidWaitTime,
            _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")),
        }
    }
}

impl std::fmt::Display for WebScrapeErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"),
            WebScrapeErrorKind::Timeout => write!(f, "Request timeout"),
            WebScrapeErrorKind::NetworkError => write!(f, "Network error"),
            WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"),
            WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"),
            WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"),
            WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"),
            WebScrapeErrorKind::TransformError => write!(f, "Transform error"),
            WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"),
            WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"),
            WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"),
            WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"),
            WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"),
            WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"),
            WebScrapeErrorKind::InvalidTimeout => {
                write!(f, "Timeout exceeds maximum allowed (120s)")
            }
            WebScrapeErrorKind::InvalidWaitTime => {
                write!(f, "Wait time exceeds maximum allowed (20s)")
            }
            WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error),
        }
    }
}

impl std::error::Error for WebScrapeErrorKind {}
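
// A minimal sanity-check sketch (an addition, not part of the host contract).
// It is gated on the `mock-ffi` feature so the tests link without a real
// `bless_crawl` host module.
#[cfg(all(test, feature = "mock-ffi"))]
mod tests {
    use super::*;

    #[test]
    fn rejects_out_of_range_timeouts() {
        // A timeout above MAX_TIMEOUT_MS must be rejected at construction time.
        let config = ScrapeOptions {
            timeout: BlessCrawl::MAX_TIMEOUT_MS + 1,
            ..Default::default()
        };
        assert!(BlessCrawl::with_config(config).is_err());

        // Likewise for a wait_time above MAX_WAIT_TIME_MS.
        let config = ScrapeOptions {
            wait_time: BlessCrawl::MAX_WAIT_TIME_MS + 1,
            ..Default::default()
        };
        assert!(BlessCrawl::with_config(config).is_err());
    }

    #[test]
    fn parses_known_format_names() {
        assert_eq!("markdown".parse::<Format>(), Ok(Format::Markdown));
        assert_eq!("html".parse::<Format>(), Ok(Format::Html));
        assert!("yaml".parse::<Format>().is_err());
    }
}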