1use std::time::Duration;
4
5use crate::error::Error;
6
/// A fetched page: the raw HTML plus everything derived from it by the
/// engine (text, title, optional layout/JS/accessibility artifacts).
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Serialized HTML of the document (empty for the PDF fast path in `fetch`).
    pub html: String,
    /// Plain-text rendering of the page supplied by the engine.
    pub inner_text: String,
    /// `<title>` text; `None` when the document has no non-empty title.
    pub title: Option<String>,
    /// Layout information as a JSON string, when the engine produced it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of an executed JavaScript expression (JavaScript fetch mode only).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console output captured while the page loaded.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree dump, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes; kept private and excluded from
    // serialization — exposed read-only via `Page::screenshot_png()`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
impl Page {
    /// Renders the page as Markdown with an empty base URL (relative links
    /// are not resolved).
    ///
    /// # Errors
    /// Propagates extraction failures from [`crate::extract::extract_text`].
    pub fn markdown(&self) -> crate::error::Result<String> {
        self.markdown_with_url("")
    }

    /// Renders the page as Markdown, passing `url` to the extractor as the
    /// page's base URL.
    ///
    /// # Errors
    /// Propagates extraction failures from [`crate::extract::extract_text`].
    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_text(&input)?)
    }

    /// Extracts the page as JSON with an empty base URL.
    ///
    /// # Errors
    /// Propagates extraction failures from [`crate::extract::extract_json`].
    pub fn extract_json(&self) -> crate::error::Result<String> {
        self.extract_json_with_url("")
    }

    /// Extracts the page as JSON, passing `url` to the extractor as the
    /// page's base URL.
    ///
    /// # Errors
    /// Propagates extraction failures from [`crate::extract::extract_json`].
    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_json(&input)?)
    }

    /// PNG-encoded screenshot bytes, if a screenshot was captured.
    #[must_use]
    pub fn screenshot_png(&self) -> Option<&[u8]> {
        self.screenshot_png.as_deref()
    }

    /// Builds a [`Page`] from the engine's raw output: derives `title` from
    /// the HTML's `<title>` element (empty text maps to `None`), PNG-encodes
    /// the screenshot, and maps console levels one-to-one.
    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
        // Parse the HTML solely to pull out the <title> text.
        let title = {
            let doc = dom_query::Document::from(page.html.as_str());
            let t = doc.select("title").text().to_string();
            if t.is_empty() { None } else { Some(t) }
        };
        // Encode the raw image as PNG in memory. Encoding failure silently
        // drops the screenshot (best-effort via `.ok()?`).
        let screenshot_png = page.screenshot.and_then(|img| {
            let mut buf = std::io::Cursor::new(Vec::new());
            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
            Some(buf.into_inner())
        });
        Self {
            html: page.html,
            inner_text: page.inner_text.unwrap_or_default(),
            title,
            layout_json: page.layout_json,
            js_result: page.js_result,
            // Translate bridge console levels to the public enum 1:1.
            console_messages: page
                .console_messages
                .into_iter()
                .map(|m| ConsoleMessage {
                    level: match m.level {
                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
                    },
                    message: m.message,
                })
                .collect(),
            screenshot_png,
            accessibility_tree: page.accessibility_tree,
        }
    }
}
102
/// A single console entry captured while the page loaded.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity of the entry (log/debug/info/warn/error/trace).
    pub level: ConsoleLevel,
    /// The message text as emitted by the page.
    pub message: String,
}
112
/// Console severity levels; serialized in lowercase (e.g. `"warn"`).
/// Mirrors the engine bridge's console levels one-to-one.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    Log,
    Debug,
    Info,
    Warn,
    Error,
    Trace,
}
131
/// What a [`fetch`] call should produce; selected by the
/// [`FetchOptions`] constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch page content (HTML / inner text); the default.
    #[default]
    Content,
    /// Capture a screenshot instead of content.
    Screenshot {
        // When true, presumably captures the whole page rather than just the
        // viewport — interpretation happens in the engine bridge; confirm there.
        full_page: bool,
    },
    /// Load the page, then evaluate the contained JavaScript expression.
    JavaScript(String),
}
141
/// Options for a single [`fetch`] call. Build with [`FetchOptions::new`],
/// [`FetchOptions::screenshot`], or [`FetchOptions::javascript`], then
/// adjust via the builder methods.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    // Target URL; validated in fetch(), not at construction.
    pub(crate) url: String,
    // Page-load timeout (defaults to 30 seconds).
    pub(crate) timeout: Duration,
    // Extra wait after load before producing output (defaults to zero).
    pub(crate) settle: Duration,
    // What to produce: content, screenshot, or a JavaScript result.
    pub(crate) mode: FetchMode,
}
156
157impl FetchOptions {
158 pub fn new(url: &str) -> Self {
160 Self {
161 url: url.into(),
162 timeout: Duration::from_secs(30),
163 settle: Duration::ZERO,
164 mode: FetchMode::Content,
165 }
166 }
167
168 pub fn screenshot(url: &str, full_page: bool) -> Self {
170 Self {
171 mode: FetchMode::Screenshot { full_page },
172 ..Self::new(url)
173 }
174 }
175
176 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
178 Self {
179 mode: FetchMode::JavaScript(expression.into()),
180 ..Self::new(url)
181 }
182 }
183
184 pub fn timeout(mut self, timeout: Duration) -> Self {
186 self.timeout = timeout;
187 self
188 }
189
190 pub fn settle(mut self, settle: Duration) -> Self {
192 self.settle = settle;
193 self
194 }
195}
196
/// Fetches a page according to `opts` and returns the resulting [`Page`].
///
/// Content-mode fetches first probe the URL for a PDF; PDF bodies are
/// text-extracted directly and returned with an empty `html`.
///
/// # Errors
/// - [`Error::InvalidUrl`] when the URL fails validation.
/// - [`Error::Timeout`] when the engine reports a timed-out load.
/// - [`Error::Engine`] for any other engine failure.
#[allow(clippy::needless_pass_by_value)] // options are consumed by design
pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
    crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
        url: opts.url.clone(),
        reason: e.to_string(),
    })?;

    // PDF short-circuit, content mode only. `.max(1)` keeps a sub-second
    // timeout from truncating to zero seconds.
    if matches!(opts.mode, FetchMode::Content)
        && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
    {
        let text = crate::extract::extract_pdf(&bytes);
        return Ok(Page {
            html: String::new(),
            inner_text: text,
            ..Page::default()
        });
    }

    // Lower the public options into the engine bridge's representation.
    let bridge_opts = crate::bridge::FetchOptions {
        url: &opts.url,
        timeout_secs: opts.timeout.as_secs().max(1),
        // Saturate instead of panicking if the settle duration overflows u64 ms.
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        mode: match opts.mode {
            FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
            FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
            FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
                expression: expr.clone(),
            },
        },
    };

    let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
        // NOTE(review): timeout classification relies on substring-matching the
        // engine's error message — fragile if the engine ever rewords it.
        let msg = format!("{e:#}");
        if msg.contains("timed out") {
            Error::Timeout {
                url: opts.url.clone(),
                timeout: opts.timeout,
            }
        } else {
            Error::Engine(msg)
        }
    })?;

    Ok(Page::from_servo(servo_page))
}
246
/// Options for a crawl. Build with [`CrawlOptions::new`], then adjust via
/// the builder methods; defaults are documented on [`CrawlOptions::new`].
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    // Seed URL; validated when the crawl starts.
    pub(crate) url: String,
    // Maximum number of pages to crawl (default 50).
    pub(crate) limit: usize,
    // Maximum link depth from the seed (default 3).
    pub(crate) max_depth: usize,
    // Per-page load timeout (default 30 seconds).
    pub(crate) timeout: Duration,
    // Extra wait after each page load (default zero).
    pub(crate) settle: Duration,
    // Glob patterns a URL must match to be crawled; empty = no filter.
    pub(crate) include: Vec<String>,
    // Glob patterns that exclude URLs from the crawl; empty = no filter.
    pub(crate) exclude: Vec<String>,
    // Optional selector restricting content extraction — presumably a CSS
    // selector; confirm against the crawl engine.
    pub(crate) selector: Option<String>,
    // When true, per-page content is produced as JSON (default false).
    pub(crate) json: bool,
}
261
262impl CrawlOptions {
263 pub fn new(url: &str) -> Self {
265 Self {
266 url: url.into(),
267 limit: 50,
268 max_depth: 3,
269 timeout: Duration::from_secs(30),
270 settle: Duration::ZERO,
271 include: Vec::new(),
272 exclude: Vec::new(),
273 selector: None,
274 json: false,
275 }
276 }
277
278 pub fn limit(mut self, n: usize) -> Self {
280 self.limit = n;
281 self
282 }
283
284 pub fn max_depth(mut self, n: usize) -> Self {
286 self.max_depth = n;
287 self
288 }
289
290 pub fn timeout(mut self, timeout: Duration) -> Self {
292 self.timeout = timeout;
293 self
294 }
295
296 pub fn settle(mut self, settle: Duration) -> Self {
298 self.settle = settle;
299 self
300 }
301
302 pub fn include(mut self, patterns: &[&str]) -> Self {
304 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
305 self
306 }
307
308 pub fn exclude(mut self, patterns: &[&str]) -> Self {
310 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
311 self
312 }
313
314 pub fn json(mut self, json: bool) -> Self {
316 self.json = json;
317 self
318 }
319
320 pub fn selector(mut self, selector: impl Into<String>) -> Self {
322 self.selector = Some(selector.into());
323 self
324 }
325}
326
/// Outcome of crawling a single page.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct CrawlResult {
    /// The page's URL.
    pub url: String,
    /// Link depth at which the page was reached (presumably seed = 0 —
    /// confirm against the crawl engine).
    pub depth: usize,
    /// Whether the page was fetched successfully.
    pub status: CrawlStatus,
    /// Page title, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Extracted content, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Error description when `status` is [`CrawlStatus::Error`].
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Number of links discovered on the page.
    pub links_found: usize,
}
349
impl CrawlResult {
    /// Converts the crawl engine's internal per-page result into the public
    /// type, cloning the owned fields and mapping the status one-to-one.
    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
        Self {
            url: r.url.clone(),
            depth: r.depth,
            status: match r.status {
                crate::crawl::CrawlStatus::Ok => CrawlStatus::Ok,
                crate::crawl::CrawlStatus::Error => CrawlStatus::Error,
            },
            title: r.title.clone(),
            content: r.content.clone(),
            error: r.error.clone(),
            links_found: r.links_found,
        }
    }
}
366
/// Per-page crawl outcome; serialized in lowercase (`"ok"` / `"error"`).
/// Mirrors the crawl engine's internal status one-to-one.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    Ok,
    Error,
}
377
/// Runs the crawl described by `opts`, invoking `on_page` once per crawled
/// page result.
///
/// # Errors
/// Propagates option-validation errors ([`Error::InvalidUrl`],
/// [`Error::Engine`] for bad glob patterns) and wraps crawl-engine failures
/// as [`Error::Engine`].
#[allow(clippy::needless_pass_by_value)] // options are consumed by design
pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
    let internal_opts = build_crawl_options(&opts)?;
    // Drive the async crawl to completion on the crate's runtime, converting
    // each internal page result to the public type before the callback.
    crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
        on_page(&CrawlResult::from_internal(r));
    }))
    .map_err(|e| Error::Engine(e.to_string()))?;
    Ok(())
}
388
389#[allow(clippy::needless_pass_by_value)]
391pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
392 let mut results = Vec::new();
393 crawl_each(opts, |r| results.push(r.clone()))?;
394 Ok(results)
395}
396
397pub fn markdown(url: &str) -> crate::error::Result<String> {
399 fetch(FetchOptions::new(url))?.markdown_with_url(url)
400}
401
402pub fn extract_json(url: &str) -> crate::error::Result<String> {
404 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
405}
406
407pub fn text(url: &str) -> crate::error::Result<String> {
409 Ok(fetch(FetchOptions::new(url))?.inner_text)
410}
411
412pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
414 crate::net::validate_url(url).map_err(|e| Error::InvalidUrl {
415 url: url.into(),
416 reason: e.to_string(),
417 })
418}
419
/// Validates the public [`CrawlOptions`] and lowers them into the crawl
/// engine's internal option type.
///
/// # Errors
/// Returns [`Error::InvalidUrl`] when the seed URL fails validation and
/// [`Error::Engine`] when an include/exclude glob set fails to build.
fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
    let seed = crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
        url: opts.url.clone(),
        reason: e.to_string(),
    })?;
    // An empty pattern list means "no filter", not "match nothing".
    let include = if opts.include.is_empty() {
        None
    } else {
        Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
    };
    let exclude = if opts.exclude.is_empty() {
        None
    } else {
        Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
    };
    Ok(crate::crawl::CrawlOptions {
        seed,
        limit: opts.limit,
        max_depth: opts.max_depth,
        // Clamp to at least one second; saturate the settle delay if its
        // millisecond count overflows u64.
        timeout_secs: opts.timeout.as_secs().max(1),
        settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
        include,
        exclude,
        selector: opts.selector.clone(),
        json: opts.json,
    })
}
447
#[cfg(test)]
mod tests {
    use super::*;

    // FetchOptions::new must produce the documented defaults.
    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    // The screenshot constructor selects Screenshot mode and keeps full_page.
    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    // The javascript constructor stores the expression verbatim.
    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    // Builder methods override the defaults.
    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // CrawlOptions::new must produce the documented defaults.
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    // Builder methods override the defaults and store pattern lists.
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // Markdown conversion keeps the page's visible text.
    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    // extract_json must emit parseable JSON.
    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    // A default Page has no screenshot.
    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // URL validation rejects syntactically invalid input with InvalidUrl.
    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    // Loopback addresses are rejected by the URL policy (SSRF guard).
    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    // Non-http(s) schemes such as file:// are rejected.
    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}