mod html_to_markdown;
mod html_transform;

use html_to_markdown::parse_markdown;
pub use html_transform::{transform_html, HtmlTransformError, TransformHtmlOptions};
use std::collections::HashMap;

type Handle = u32;
type ExitCode = u8;

#[cfg(not(feature = "mock-ffi"))]
#[link(wasm_import_module = "bless_crawl")]
extern "C" {
    #[allow(clippy::too_many_arguments)]
    fn scrape(
        h: *mut Handle,
        url_ptr: *const u8,
        url_len: usize,
        options_ptr: *const u8,
        options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode;

    fn close(h: Handle) -> ExitCode;
}

#[cfg(feature = "mock-ffi")]
#[allow(unused_variables)]
mod mock_ffi {
    use super::{ExitCode, Handle};

    // Stand-ins for the host imports so the crate builds without the
    // `bless_crawl` WASM module; both always report failure (exit code 1).
    #[allow(clippy::too_many_arguments)]
    pub unsafe fn scrape(
        h: *mut Handle,
        _url_ptr: *const u8,
        _url_len: usize,
        _options_ptr: *const u8,
        _options_len: usize,
        result_ptr: *mut u8,
        result_len: usize,
        bytes_written: *mut usize,
    ) -> ExitCode {
        1
    }

    pub unsafe fn close(_h: Handle) -> ExitCode {
        1
    }
}

#[cfg(feature = "mock-ffi")]
use mock_ffi::*;

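/// Per-request configuration shared by `BlessCrawl` and `scrape`.
///
/// Illustrative sketch using the builder methods defined further down; the
/// calling context is assumed:
///
/// ```ignore
/// let opts = ScrapeOptions::new()
///     .with_format(Format::Html)
///     .with_include_tags(vec!["article".to_string(), "main".to_string()])
///     .with_user_agent("my-crawler/0.1".to_string());
/// ```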
#[derive(Debug, Clone, PartialEq, serde::Serialize)]
pub struct ScrapeOptions {
    pub timeout: u32,
    pub wait_time: u32,
    pub include_tags: Option<Vec<String>>,
    pub exclude_tags: Option<Vec<String>>,
    pub only_main_content: bool,
    pub format: Format,
    pub viewport: Option<Viewport>,
    pub user_agent: Option<String>,
    pub headers: Option<HashMap<String, String>>,
}

impl Default for ScrapeOptions {
    fn default() -> Self {
        Self {
            timeout: BlessCrawl::DEFAULT_TIMEOUT_MS,
            wait_time: BlessCrawl::DEFAULT_WAIT_TIME_MS,
            include_tags: None,
            exclude_tags: None,
            only_main_content: false,
            format: Format::Markdown,
            viewport: None,
            user_agent: None,
            headers: None,
        }
    }
}

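/// Output format for scraped content; serialized with lowercase names.
///
/// A minimal parsing sketch (relies on the `FromStr` impl below):
///
/// ```ignore
/// assert_eq!("html".parse::<Format>(), Ok(Format::Html));
/// assert!("pdf".parse::<Format>().is_err());
/// ```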
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum Format {
    #[default]
    #[serde(rename = "markdown")]
    Markdown,
    #[serde(rename = "html")]
    Html,
    #[serde(rename = "json")]
    Json,
}

impl std::str::FromStr for Format {
    type Err = ();
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "markdown" => Ok(Format::Markdown),
            "html" => Ok(Format::Html),
            "json" => Ok(Format::Json),
            _ => Err(()),
        }
    }
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct Viewport {
    pub width: Option<u32>,
    pub height: Option<u32>,
}

#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct MapOptions {
    pub link_types: Option<Vec<String>>,
    pub base_url: Option<String>,
    pub filter_extensions: Option<Vec<String>>,
}

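/// Options for multi-page crawls started from a root URL.
///
/// Illustrative sketch using the builder methods defined below; the values
/// are placeholders, not recommendations:
///
/// ```ignore
/// let opts = CrawlOptions::new()
///     .with_limit(50)
///     .with_max_depth(2)
///     .with_delay_between_requests(500)
///     .with_exclude_paths(vec!["/admin".to_string()]);
/// ```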
#[derive(Debug, Clone, Default, PartialEq, serde::Serialize)]
pub struct CrawlOptions {
    pub limit: Option<u32>,
    pub max_depth: Option<u8>,
    pub exclude_paths: Option<Vec<String>>,
    pub include_paths: Option<Vec<String>>,
    pub follow_external: Option<bool>,
    pub delay_between_requests: Option<u32>,
    pub parallel_requests: Option<u32>,
}

#[derive(Debug, Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct PageMetadata {
    pub title: Option<String>,
    pub description: Option<String>,
    pub url: String,
    pub status_code: u16,
    pub language: Option<String>,
    pub keywords: Option<String>,
    pub robots: Option<String>,
    pub author: Option<String>,
    pub creator: Option<String>,
    pub publisher: Option<String>,
    pub og_title: Option<String>,
    pub og_description: Option<String>,
    pub og_image: Option<String>,
    pub og_url: Option<String>,
    pub og_site_name: Option<String>,
    pub og_type: Option<String>,
    pub twitter_title: Option<String>,
    pub twitter_description: Option<String>,
    pub twitter_image: Option<String>,
    pub twitter_card: Option<String>,
    pub twitter_site: Option<String>,
    pub twitter_creator: Option<String>,
    pub favicon: Option<String>,
    pub viewport: Option<String>,
    pub referrer: Option<String>,
    pub content_type: Option<String>,
    pub scrape_id: Option<String>,
    pub source_url: Option<String>,
    pub proxy_used: Option<String>,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ScrapeData {
    pub success: bool,
    pub timestamp: u64,
    pub format: Format,
    pub content: String,
    pub metadata: PageMetadata,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Response<T> {
    pub success: bool,
    pub error: Option<String>,
    pub data: T,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct LinkInfo {
    pub url: String,
    pub link_type: String,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct MapData {
    pub url: String,
    pub links: Vec<LinkInfo>,
    pub total_links: usize,
    pub timestamp: u64,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlError {
    pub url: String,
    pub error: String,
    pub depth: u32,
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct CrawlData<T> {
    pub root_url: String,
    pub pages: Vec<T>,
    pub link_map: Option<MapData>,
    pub depth_reached: u8,
    pub total_pages: usize,
    pub errors: Vec<CrawlError>,
}

impl ScrapeOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_include_tags(mut self, tags: Vec<String>) -> Self {
        self.include_tags = Some(tags);
        self
    }

    pub fn with_exclude_tags(mut self, tags: Vec<String>) -> Self {
        self.exclude_tags = Some(tags);
        self
    }

    pub fn with_format(mut self, format: Format) -> Self {
        self.format = format;
        self
    }

    pub fn with_viewport(mut self, width: u32, height: u32) -> Self {
        self.viewport = Some(Viewport {
            width: Some(width),
            height: Some(height),
        });
        self
    }

    pub fn with_user_agent(mut self, user_agent: String) -> Self {
        self.user_agent = Some(user_agent);
        self
    }

    pub fn with_headers(mut self, headers: HashMap<String, String>) -> Self {
        self.headers = Some(headers);
        self
    }
}

impl MapOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_link_types(mut self, link_types: Vec<String>) -> Self {
        self.link_types = Some(link_types);
        self
    }

    pub fn with_base_url(mut self, base_url: String) -> Self {
        self.base_url = Some(base_url);
        self
    }

    pub fn with_filter_extensions(mut self, extensions: Vec<String>) -> Self {
        self.filter_extensions = Some(extensions);
        self
    }
}

impl CrawlOptions {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_limit(mut self, limit: u32) -> Self {
        self.limit = Some(limit);
        self
    }

    pub fn with_max_depth(mut self, max_depth: u8) -> Self {
        self.max_depth = Some(max_depth);
        self
    }

    pub fn with_exclude_paths(mut self, paths: Vec<String>) -> Self {
        self.exclude_paths = Some(paths);
        self
    }

    pub fn with_include_paths(mut self, paths: Vec<String>) -> Self {
        self.include_paths = Some(paths);
        self
    }

    pub fn with_follow_external(mut self, follow: bool) -> Self {
        self.follow_external = Some(follow);
        self
    }

    pub fn with_delay_between_requests(mut self, delay: u32) -> Self {
        self.delay_between_requests = Some(delay);
        self
    }

    pub fn with_parallel_requests(mut self, parallel: u32) -> Self {
        self.parallel_requests = Some(parallel);
        self
    }
}

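/// Handle-backed client for the `bless_crawl` host module.
///
/// A minimal usage sketch; it assumes the program runs inside a runtime that
/// provides the `bless_crawl` imports, and the URL is a placeholder:
///
/// ```ignore
/// let crawler = BlessCrawl::with_config(ScrapeOptions::default())?;
/// let response = crawler.scrape("https://example.com", None)?;
/// println!("{}", response.data.content);
/// ```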
#[derive(Debug, Clone, Default)]
pub struct BlessCrawl {
    inner: Handle,
    config: ScrapeOptions,
}

impl BlessCrawl {
    /// Default scrape timeout, in milliseconds.
    pub const DEFAULT_TIMEOUT_MS: u32 = 15000;
    /// Default wait time, in milliseconds.
    pub const DEFAULT_WAIT_TIME_MS: u32 = 3000;

    /// Maximum timeout accepted by `validate_config`, in milliseconds.
    pub const MAX_TIMEOUT_MS: u32 = 120000;
    /// Maximum wait time accepted by `validate_config`, in milliseconds.
    pub const MAX_WAIT_TIME_MS: u32 = 20000;

    /// Result buffer size for `scrape`.
    pub const MAX_SCRAPE_BUFFER_SIZE: usize = 2 * 1024 * 1024;

    /// Result buffer size for `map`.
    pub const MAX_MAP_BUFFER_SIZE: usize = 1024 * 1024;

    /// Result buffer size for `crawl`.
    pub const MAX_CRAWL_BUFFER_SIZE: usize = 8 * 1024 * 1024;

    pub fn with_config(config: ScrapeOptions) -> Result<Self, WebScrapeErrorKind> {
        let instance = Self { inner: 0, config };
        instance.validate_config(&instance.config)?;
        Ok(instance)
    }

    fn validate_config(&self, config: &ScrapeOptions) -> Result<(), WebScrapeErrorKind> {
        if config.timeout > Self::MAX_TIMEOUT_MS {
            return Err(WebScrapeErrorKind::InvalidTimeout);
        }
        if config.wait_time > Self::MAX_WAIT_TIME_MS {
            return Err(WebScrapeErrorKind::InvalidWaitTime);
        }
        Ok(())
    }

    pub fn get_config(&self) -> &ScrapeOptions {
        &self.config
    }

    pub fn handle(&self) -> Handle {
        self.inner
    }

    pub fn scrape(
        &self,
        url: &str,
        options: Option<ScrapeOptions>,
    ) -> Result<Response<ScrapeData>, WebScrapeErrorKind> {
        // Per-call options override the instance config after validation.
        let config = if let Some(opts) = options {
            self.validate_config(&opts)?;
            opts
        } else {
            self.config.clone()
        };

        let options_json = serde_json::to_vec(&config).unwrap();

        let mut handle = self.inner;
        let mut result_buf = vec![0u8; Self::MAX_SCRAPE_BUFFER_SIZE];
        let mut bytes_written: usize = 0;

        // The host writes a JSON-encoded `Response<ScrapeData>` into `result_buf`
        // and reports how many bytes were written.
        let code = unsafe {
            scrape(
                &mut handle,
                url.as_ptr(),
                url.len(),
                options_json.as_ptr(),
                options_json.len(),
                result_buf.as_mut_ptr(),
                result_buf.len(),
                &mut bytes_written,
            )
        };

        if code != 0 {
            return Err(code.into());
        }
        if bytes_written == 0 {
            return Err(WebScrapeErrorKind::EmptyResponse);
        }
        if bytes_written > result_buf.len() {
            return Err(WebScrapeErrorKind::MemoryError);
        }

        let result_bytes = &result_buf[..bytes_written];

        let mut scrape_response = serde_json::from_slice::<Response<ScrapeData>>(result_bytes)
            .map_err(|e| {
                eprintln!("error: {:?}", e);
                WebScrapeErrorKind::ParseError
            })?;

        if let Some(error) = scrape_response.error {
            return Err(WebScrapeErrorKind::RuntimeError(error));
        }

        // Clean up the raw HTML (tag filtering, main-content extraction) before
        // converting it to the requested output format.
        scrape_response.data.content = transform_html(TransformHtmlOptions {
            html: scrape_response.data.content,
            url: scrape_response.data.metadata.url.clone(),
            include_tags: config.include_tags.unwrap_or_default(),
            exclude_tags: config.exclude_tags.unwrap_or_default(),
            only_main_content: config.only_main_content,
        })
        .map_err(|e| {
            eprintln!("error: {:?}", e);
            WebScrapeErrorKind::TransformError
        })?;

        match config.format {
            Format::Markdown => {
                scrape_response.data.content = parse_markdown(&scrape_response.data.content);
            }
            Format::Html => (),
            Format::Json => unimplemented!(),
        }

        Ok(scrape_response)
    }

    pub fn map(
        &self,
        url: &str,
        options: Option<MapOptions>,
    ) -> Result<Response<MapData>, WebScrapeErrorKind> {
        let _map_options = options.unwrap_or_default();

        // Placeholder: not yet wired to the host; returns an empty link map.
        Ok(Response {
            success: true,
            error: None,
            data: MapData {
                url: url.to_string(),
                links: vec![],
                total_links: 0,
                timestamp: 0,
            },
        })
    }

    pub fn crawl(
        &self,
        url: &str,
        options: Option<CrawlOptions>,
    ) -> Result<Response<CrawlData<ScrapeData>>, WebScrapeErrorKind> {
        let _crawl_options = options.unwrap_or_default();

        // Placeholder: not yet wired to the host; returns an empty crawl result.
        Ok(Response {
            success: true,
            error: None,
            data: CrawlData {
                root_url: url.to_string(),
                pages: vec![],
                link_map: None,
                depth_reached: 0,
                total_pages: 0,
                errors: vec![],
            },
        })
    }
}

impl Drop for BlessCrawl {
    fn drop(&mut self) {
        if self.inner == 0 {
            return;
        }
        let code = unsafe { close(self.inner) };
        if code != 0 {
            eprintln!("Error closing web scraper: {}", code);
        }
    }
}

#[derive(Debug)]
pub enum WebScrapeErrorKind {
    InvalidUrl,
    Timeout,
    NetworkError,
    RenderingError,
    MemoryError,
    DepthExceeded,
    RateLimited,
    TransformError,
    Utf8Error,
    ParseError,
    ScrapeFailed,
    MapFailed,
    CrawlFailed,
    EmptyResponse,
    InvalidTimeout,
    InvalidWaitTime,
    RuntimeError(String),
}

impl From<u8> for WebScrapeErrorKind {
    fn from(code: u8) -> Self {
        match code {
            1 => WebScrapeErrorKind::InvalidUrl,
            2 => WebScrapeErrorKind::Timeout,
            3 => WebScrapeErrorKind::NetworkError,
            4 => WebScrapeErrorKind::RenderingError,
            5 => WebScrapeErrorKind::MemoryError,
            6 => WebScrapeErrorKind::DepthExceeded,
            7 => WebScrapeErrorKind::RateLimited,
            8 => WebScrapeErrorKind::TransformError,
            9 => WebScrapeErrorKind::InvalidTimeout,
            10 => WebScrapeErrorKind::InvalidWaitTime,
            _ => WebScrapeErrorKind::RuntimeError(String::from("Unknown error")),
        }
    }
}

impl std::fmt::Display for WebScrapeErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            WebScrapeErrorKind::InvalidUrl => write!(f, "Invalid URL provided"),
            WebScrapeErrorKind::Timeout => write!(f, "Request timeout"),
            WebScrapeErrorKind::NetworkError => write!(f, "Network error"),
            WebScrapeErrorKind::RenderingError => write!(f, "Page rendering error"),
            WebScrapeErrorKind::MemoryError => write!(f, "Memory allocation error"),
            WebScrapeErrorKind::DepthExceeded => write!(f, "Maximum crawl depth exceeded"),
            WebScrapeErrorKind::RateLimited => write!(f, "Rate limited"),
            WebScrapeErrorKind::TransformError => write!(f, "Transform error"),
            WebScrapeErrorKind::Utf8Error => write!(f, "UTF-8 conversion error"),
            WebScrapeErrorKind::ParseError => write!(f, "JSON parse error"),
            WebScrapeErrorKind::ScrapeFailed => write!(f, "Scrape operation failed"),
            WebScrapeErrorKind::MapFailed => write!(f, "Map operation failed"),
            WebScrapeErrorKind::CrawlFailed => write!(f, "Crawl operation failed"),
            WebScrapeErrorKind::EmptyResponse => write!(f, "Empty response from host"),
            WebScrapeErrorKind::InvalidTimeout => {
                write!(f, "Timeout exceeds maximum allowed (120s)")
            }
            WebScrapeErrorKind::InvalidWaitTime => {
                write!(f, "Wait time exceeds maximum allowed (20s)")
            }
            WebScrapeErrorKind::RuntimeError(error) => write!(f, "Runtime error: {}", error),
        }
    }
}

impl std::error::Error for WebScrapeErrorKind {}
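
// A small test sketch covering the host-independent pieces (config validation
// and `Format` parsing). It does not exercise the FFI path and is intended to
// run with the `mock-ffi` feature enabled so no host module is required.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rejects_timeout_above_maximum() {
        let config = ScrapeOptions {
            timeout: BlessCrawl::MAX_TIMEOUT_MS + 1,
            ..Default::default()
        };
        assert!(BlessCrawl::with_config(config).is_err());
    }

    #[test]
    fn rejects_wait_time_above_maximum() {
        let config = ScrapeOptions {
            wait_time: BlessCrawl::MAX_WAIT_TIME_MS + 1,
            ..Default::default()
        };
        assert!(BlessCrawl::with_config(config).is_err());
    }

    #[test]
    fn parses_known_formats() {
        assert_eq!("markdown".parse::<Format>(), Ok(Format::Markdown));
        assert_eq!("html".parse::<Format>(), Ok(Format::Html));
        assert!("pdf".parse::<Format>().is_err());
    }
}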