1use std::time::Duration;
4
5use crate::error::Error;
6
/// A fetched web page together with the artifacts produced while rendering it.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Serialized HTML of the rendered document.
    pub html: String,
    /// Visible text content of the page (empty when the engine provided none).
    pub inner_text: String,
    /// Text of the `<title>` element; `None` when absent or empty.
    pub title: Option<String>,
    /// Layout information as JSON, when the engine produced it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of an evaluated JavaScript expression, if one was requested.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console output captured while the page loaded.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree dump, when the engine produced one.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes; exposed read-only via `screenshot_png()`.
    // Skipped during serialization because it is a binary payload.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
32impl Page {
33 pub fn markdown(&self) -> crate::error::Result<String> {
35 self.markdown_with_url("")
36 }
37
38 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40 let input = crate::extract::ExtractInput::new(&self.html, url)
41 .with_layout_json(self.layout_json.as_deref())
42 .with_inner_text(Some(&self.inner_text));
43 Ok(crate::extract::extract_text(&input)?)
44 }
45
46 pub fn extract_json(&self) -> crate::error::Result<String> {
48 self.extract_json_with_url("")
49 }
50
51 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53 let input = crate::extract::ExtractInput::new(&self.html, url)
54 .with_layout_json(self.layout_json.as_deref())
55 .with_inner_text(Some(&self.inner_text));
56 Ok(crate::extract::extract_json(&input)?)
57 }
58
59 #[must_use]
61 pub fn screenshot_png(&self) -> Option<&[u8]> {
62 self.screenshot_png.as_deref()
63 }
64
65 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
66 let title = {
67 let doc = dom_query::Document::from(page.html.as_str());
68 let t = doc.select("title").text().to_string();
69 if t.is_empty() { None } else { Some(t) }
70 };
71 let screenshot_png = page.screenshot.and_then(|img| {
72 let mut buf = std::io::Cursor::new(Vec::new());
73 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
74 Some(buf.into_inner())
75 });
76 Self {
77 html: page.html,
78 inner_text: page.inner_text.unwrap_or_default(),
79 title,
80 layout_json: page.layout_json,
81 js_result: page.js_result,
82 console_messages: page
83 .console_messages
84 .into_iter()
85 .map(|m| ConsoleMessage {
86 level: match m.level {
87 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
88 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
89 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
90 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
91 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
92 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
93 },
94 message: m.message,
95 })
96 .collect(),
97 screenshot_png,
98 accessibility_tree: page.accessibility_tree,
99 }
100 }
101}
102
/// A single console entry captured while a page loaded.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity of the message.
    pub level: ConsoleLevel,
    /// The logged text.
    pub message: String,
}
112
/// Severity levels for captured console messages.
///
/// Serialized in lowercase (e.g. `"warn"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    Log,
    Debug,
    Info,
    Warn,
    Error,
    Trace,
}
131
/// What a fetch should produce beyond the rendered page.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Plain content fetch (the default).
    #[default]
    Content,
    /// Capture a screenshot; `full_page` is forwarded to the engine.
    Screenshot {
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression after load.
    JavaScript(String),
}
141
/// Options for a single [`fetch`] call.
///
/// Build with [`FetchOptions::new`], [`FetchOptions::screenshot`], or
/// [`FetchOptions::javascript`], then chain the setters.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    // Target URL; validated by `fetch`.
    pub(crate) url: String,
    // Overall fetch timeout (defaults to 30 seconds).
    pub(crate) timeout: Duration,
    // Post-load settle delay (defaults to zero).
    pub(crate) settle: Duration,
    // What the fetch should produce.
    pub(crate) mode: FetchMode,
}
156
157impl FetchOptions {
158 pub fn new(url: &str) -> Self {
160 Self {
161 url: url.into(),
162 timeout: Duration::from_secs(30),
163 settle: Duration::ZERO,
164 mode: FetchMode::Content,
165 }
166 }
167
168 pub fn screenshot(url: &str, full_page: bool) -> Self {
170 Self {
171 mode: FetchMode::Screenshot { full_page },
172 ..Self::new(url)
173 }
174 }
175
176 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
178 Self {
179 mode: FetchMode::JavaScript(expression.into()),
180 ..Self::new(url)
181 }
182 }
183
184 pub fn timeout(mut self, timeout: Duration) -> Self {
186 self.timeout = timeout;
187 self
188 }
189
190 pub fn settle(mut self, settle: Duration) -> Self {
192 self.settle = settle;
193 self
194 }
195}
196
197#[allow(clippy::needless_pass_by_value)]
202pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
203 ensure_crypto_provider();
204
205 crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
206 url: opts.url.clone(),
207 reason: e.to_string(),
208 })?;
209
210 if matches!(opts.mode, FetchMode::Content)
211 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
212 {
213 let text = crate::extract::extract_pdf(&bytes);
214 return Ok(Page {
215 html: String::new(),
216 inner_text: text,
217 ..Page::default()
218 });
219 }
220
221 let bridge_opts = crate::bridge::FetchOptions {
222 url: &opts.url,
223 timeout_secs: opts.timeout.as_secs().max(1),
224 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
225 mode: match opts.mode {
226 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
227 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
228 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
229 expression: expr.clone(),
230 },
231 },
232 };
233
234 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
235 let msg = format!("{e:#}");
236 if msg.contains("timed out") {
237 Error::Timeout {
238 url: opts.url.clone(),
239 timeout: opts.timeout,
240 }
241 } else {
242 Error::Engine(msg)
243 }
244 })?;
245
246 Ok(Page::from_servo(servo_page))
247}
248
/// Options for a [`crawl`] or [`crawl_each`] call.
///
/// Build with [`CrawlOptions::new`], then chain the setters.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    // Seed URL; validated before the crawl starts.
    pub(crate) url: String,
    // Maximum number of pages to visit (defaults to 50).
    pub(crate) limit: usize,
    // Maximum link depth from the seed (defaults to 3).
    pub(crate) max_depth: usize,
    // Per-page fetch timeout (defaults to 30 seconds).
    pub(crate) timeout: Duration,
    // Post-load settle delay applied to each page (defaults to zero).
    pub(crate) settle: Duration,
    // Glob patterns of URLs to include; empty means no filter.
    pub(crate) include: Vec<String>,
    // Glob patterns of URLs to exclude; empty means no filter.
    pub(crate) exclude: Vec<String>,
    // Optional CSS selector restricting extraction.
    pub(crate) selector: Option<String>,
    // Emit structured JSON instead of text content when true.
    pub(crate) json: bool,
}
263
264impl CrawlOptions {
265 pub fn new(url: &str) -> Self {
267 Self {
268 url: url.into(),
269 limit: 50,
270 max_depth: 3,
271 timeout: Duration::from_secs(30),
272 settle: Duration::ZERO,
273 include: Vec::new(),
274 exclude: Vec::new(),
275 selector: None,
276 json: false,
277 }
278 }
279
280 pub fn limit(mut self, n: usize) -> Self {
282 self.limit = n;
283 self
284 }
285
286 pub fn max_depth(mut self, n: usize) -> Self {
288 self.max_depth = n;
289 self
290 }
291
292 pub fn timeout(mut self, timeout: Duration) -> Self {
294 self.timeout = timeout;
295 self
296 }
297
298 pub fn settle(mut self, settle: Duration) -> Self {
300 self.settle = settle;
301 self
302 }
303
304 pub fn include(mut self, patterns: &[&str]) -> Self {
306 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
307 self
308 }
309
310 pub fn exclude(mut self, patterns: &[&str]) -> Self {
312 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
313 self
314 }
315
316 pub fn json(mut self, json: bool) -> Self {
318 self.json = json;
319 self
320 }
321
322 pub fn selector(mut self, selector: impl Into<String>) -> Self {
324 self.selector = Some(selector.into());
325 self
326 }
327}
328
/// Outcome of visiting a single page during a crawl.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL that was visited.
    pub url: String,
    /// Link depth of this page relative to the seed URL.
    pub depth: usize,
    /// Whether the page was processed successfully.
    pub status: CrawlStatus,
    /// Page title, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Extracted content, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Error description when `status` indicates failure.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Number of links discovered on this page.
    pub links_found: usize,
}
351
352impl CrawlResult {
353 fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
354 Self {
355 url: r.url.clone(),
356 depth: r.depth,
357 status: match r.status {
358 crate::crawl::CrawlStatus::Ok => CrawlStatus::Ok,
359 crate::crawl::CrawlStatus::Error => CrawlStatus::Error,
360 },
361 title: r.title.clone(),
362 content: r.content.clone(),
363 error: r.error.clone(),
364 links_found: r.links_found,
365 }
366 }
367}
368
/// Whether a crawled page was processed successfully.
///
/// Serialized in lowercase (`"ok"` / `"error"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    Ok,
    Error,
}
379
380#[allow(clippy::needless_pass_by_value)]
382pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
383 let internal_opts = build_crawl_options(&opts)?;
384 crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
385 on_page(&CrawlResult::from_internal(r));
386 }))
387 .map_err(|e| Error::Engine(e.to_string()))?;
388 Ok(())
389}
390
391#[allow(clippy::needless_pass_by_value)]
393pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
394 let mut results = Vec::new();
395 crawl_each(opts, |r| results.push(r.clone()))?;
396 Ok(results)
397}
398
399pub fn markdown(url: &str) -> crate::error::Result<String> {
401 fetch(FetchOptions::new(url))?.markdown_with_url(url)
402}
403
404pub fn extract_json(url: &str) -> crate::error::Result<String> {
406 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
407}
408
409pub fn text(url: &str) -> crate::error::Result<String> {
411 Ok(fetch(FetchOptions::new(url))?.inner_text)
412}
413
414pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
416 crate::net::validate_url(url).map_err(|e| Error::InvalidUrl {
417 url: url.into(),
418 reason: e.to_string(),
419 })
420}
421
/// Install the process-wide rustls crypto provider (best effort).
///
/// The result is deliberately ignored: `install_default` fails when a
/// provider was already installed, which is expected on repeated calls.
fn ensure_crypto_provider() {
    let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
}
425
426fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
427 let seed = crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
428 url: opts.url.clone(),
429 reason: e.to_string(),
430 })?;
431 let include = if opts.include.is_empty() {
432 None
433 } else {
434 Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
435 };
436 let exclude = if opts.exclude.is_empty() {
437 None
438 } else {
439 Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
440 };
441 Ok(crate::crawl::CrawlOptions {
442 seed,
443 limit: opts.limit,
444 max_depth: opts.max_depth,
445 timeout_secs: opts.timeout.as_secs().max(1),
446 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
447 include,
448 exclude,
449 selector: opts.selector.clone(),
450 json: opts.json,
451 })
452}
453
#[cfg(test)]
mod tests {
    use super::*;

    // --- FetchOptions builder defaults and constructors ---

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // --- CrawlOptions builder defaults and chaining ---

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    // --- Page conversion helpers (no network; operate on in-memory HTML) ---

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    // --- URL validation rejections (fetch fails fast before any network I/O) ---

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}