use std::time::Duration;
use crate::error::Error;
/// A fetched and rendered web page, produced by [`fetch`].
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// Serialized HTML of the rendered document.
    pub html: String,
    /// Rendered text content of the page.
    pub inner_text: String,
    /// Document `<title>` text; `None` when the page has no title.
    pub title: Option<String>,
    /// Layout data as a JSON string, when the engine captured it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// Result of JavaScript evaluation, when the fetch ran an expression.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console output emitted by the page.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree as a string, when captured.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    // PNG-encoded screenshot bytes; private, exposed via `screenshot_png()`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
impl Page {
    /// Converts the page to Markdown with no base URL; relative links are
    /// left unresolved. Use [`Page::markdown_with_url`] to resolve them.
    ///
    /// # Errors
    /// Returns an error if text extraction fails.
    pub fn markdown(&self) -> crate::error::Result<String> {
        self.markdown_with_url("")
    }
    /// Converts the page to Markdown, resolving relative links against `url`.
    ///
    /// # Errors
    /// Returns an error if text extraction fails.
    pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_text(&input)?)
    }
    /// Extracts structured content as a JSON string with no base URL.
    ///
    /// # Errors
    /// Returns an error if extraction fails.
    pub fn extract_json(&self) -> crate::error::Result<String> {
        self.extract_json_with_url("")
    }
    /// Extracts structured content as a JSON string, resolving relative links
    /// against `url`.
    ///
    /// # Errors
    /// Returns an error if extraction fails.
    pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
        let input = crate::extract::ExtractInput::new(&self.html, url)
            .with_layout_json(self.layout_json.as_deref())
            .with_inner_text(Some(&self.inner_text));
        Ok(crate::extract::extract_json(&input)?)
    }
    /// Returns the PNG-encoded screenshot bytes, if one was captured.
    #[must_use]
    pub fn screenshot_png(&self) -> Option<&[u8]> {
        self.screenshot_png.as_deref()
    }
    /// Builds the public `Page` from the engine's internal page result.
    pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
        let title = {
            let doc = dom_query::Document::from(page.html.as_str());
            let t = doc.select("title").text().to_string();
            // Fix: trim markup whitespace so `<title>\n  Foo\n</title>`
            // yields `Some("Foo")`, and a whitespace-only title counts as
            // absent instead of `Some("\n  ")`.
            let t = t.trim();
            if t.is_empty() { None } else { Some(t.to_string()) }
        };
        // Encode the raw screenshot as PNG; an encoding failure drops the
        // image rather than failing the whole fetch.
        let screenshot_png = page.screenshot.and_then(|img| {
            let mut buf = std::io::Cursor::new(Vec::new());
            img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
            Some(buf.into_inner())
        });
        Self {
            html: page.html,
            inner_text: page.inner_text.unwrap_or_default(),
            title,
            layout_json: page.layout_json,
            js_result: page.js_result,
            // Map bridge console levels onto the public enum variant-for-variant.
            console_messages: page
                .console_messages
                .into_iter()
                .map(|m| ConsoleMessage {
                    level: match m.level {
                        crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
                        crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
                        crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
                        crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
                        crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
                        crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
                    },
                    message: m.message,
                })
                .collect(),
            screenshot_png,
            accessibility_tree: page.accessibility_tree,
        }
    }
}
/// A single console entry captured from the page.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Severity of the message.
    pub level: ConsoleLevel,
    /// The message text.
    pub message: String,
}
/// Severity levels for console messages; serialized in lowercase
/// (e.g. `"warn"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    Log,
    Debug,
    Info,
    Warn,
    Error,
    Trace,
}
/// Internal fetch mode selected by the `FetchOptions` constructors.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Fetch page content only (the default).
    #[default]
    Content,
    /// Capture a screenshot; `full_page` is forwarded to the bridge's
    /// screenshot mode.
    Screenshot {
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression after the page loads.
    JavaScript(String),
}
/// Options for a single-page [`fetch`].
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    // Target URL; validated inside `fetch()`.
    pub(crate) url: String,
    // Overall fetch timeout (default 30 s).
    pub(crate) timeout: Duration,
    // Extra wait after load for the page to settle (default zero).
    pub(crate) settle: Duration,
    // What to do once the page is loaded.
    pub(crate) mode: FetchMode,
}
impl FetchOptions {
    /// Creates options for a plain content fetch of `url`, with a 30-second
    /// timeout and no settle delay.
    ///
    /// Accepts anything convertible into a `String` (`&str`, `String`, …),
    /// consistent with the `expression` parameter of [`FetchOptions::javascript`].
    pub fn new(url: impl Into<String>) -> Self {
        Self {
            url: url.into(),
            timeout: Duration::from_secs(30),
            settle: Duration::ZERO,
            mode: FetchMode::Content,
        }
    }
    /// Creates options for a screenshot fetch of `url`; `full_page` requests
    /// capturing the whole page rather than only the default area.
    pub fn screenshot(url: impl Into<String>, full_page: bool) -> Self {
        Self {
            mode: FetchMode::Screenshot { full_page },
            ..Self::new(url)
        }
    }
    /// Creates options that evaluate the JavaScript `expression` on `url`
    /// after the page loads.
    pub fn javascript(url: impl Into<String>, expression: impl Into<String>) -> Self {
        Self {
            mode: FetchMode::JavaScript(expression.into()),
            ..Self::new(url)
        }
    }
    /// Sets the overall fetch timeout (default: 30 seconds).
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }
    /// Sets how long to wait after load for the page to settle (default: none).
    pub fn settle(mut self, settle: Duration) -> Self {
        self.settle = settle;
        self
    }
}
#[allow(clippy::needless_pass_by_value)]
pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
ensure_crypto_provider();
crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
url: opts.url.clone(),
reason: e.to_string(),
})?;
if matches!(opts.mode, FetchMode::Content)
&& let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
{
let text = crate::extract::extract_pdf(&bytes);
return Ok(Page {
html: String::new(),
inner_text: text,
..Page::default()
});
}
let bridge_opts = crate::bridge::FetchOptions {
url: &opts.url,
timeout_secs: opts.timeout.as_secs().max(1),
settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
mode: match opts.mode {
FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
expression: expr.clone(),
},
},
};
let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
let msg = format!("{e:#}");
if msg.contains("timed out") {
Error::Timeout {
url: opts.url.clone(),
timeout: opts.timeout,
}
} else {
Error::Engine(msg)
}
})?;
Ok(Page::from_servo(servo_page))
}
/// Options for a site crawl via [`crawl`] or [`crawl_each`].
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    // Seed URL; validated in `build_crawl_options`.
    pub(crate) url: String,
    // Maximum number of pages to fetch (default 50).
    pub(crate) limit: usize,
    // Maximum link depth from the seed (default 3).
    pub(crate) max_depth: usize,
    // Per-page fetch timeout (default 30 s).
    pub(crate) timeout: Duration,
    // Extra settle delay per page (default zero).
    pub(crate) settle: Duration,
    // Glob patterns a URL must match to be crawled (empty = no filter).
    pub(crate) include: Vec<String>,
    // Glob patterns that exclude URLs from the crawl.
    pub(crate) exclude: Vec<String>,
    // Optional CSS selector forwarded to the crawler.
    pub(crate) selector: Option<String>,
    // Whether page content is extracted as JSON instead of text.
    pub(crate) json: bool,
}
impl CrawlOptions {
    /// Creates crawl options seeded at `url` with defaults: up to 50 pages,
    /// depth 3, a 30-second per-page timeout, no settle delay, and no
    /// include/exclude filters.
    pub fn new(url: &str) -> Self {
        Self {
            url: url.to_owned(),
            limit: 50,
            max_depth: 3,
            timeout: Duration::from_secs(30),
            settle: Duration::ZERO,
            include: Vec::new(),
            exclude: Vec::new(),
            selector: None,
            json: false,
        }
    }
    /// Caps the total number of pages fetched.
    pub fn limit(mut self, n: usize) -> Self {
        self.limit = n;
        self
    }
    /// Caps the link depth explored from the seed URL.
    pub fn max_depth(mut self, n: usize) -> Self {
        self.max_depth = n;
        self
    }
    /// Sets the per-page fetch timeout.
    pub fn timeout(mut self, timeout: Duration) -> Self {
        self.timeout = timeout;
        self
    }
    /// Sets the per-page settle delay after load.
    pub fn settle(mut self, settle: Duration) -> Self {
        self.settle = settle;
        self
    }
    /// Replaces the include filter with the given glob patterns.
    pub fn include(mut self, patterns: &[&str]) -> Self {
        self.include = patterns.iter().copied().map(str::to_owned).collect();
        self
    }
    /// Replaces the exclude filter with the given glob patterns.
    pub fn exclude(mut self, patterns: &[&str]) -> Self {
        self.exclude = patterns.iter().copied().map(str::to_owned).collect();
        self
    }
    /// Selects JSON extraction for page content instead of text.
    pub fn json(mut self, json: bool) -> Self {
        self.json = json;
        self
    }
    /// Sets a CSS selector forwarded to the crawler.
    pub fn selector(mut self, selector: impl Into<String>) -> Self {
        self.selector = Some(selector.into());
        self
    }
}
/// The outcome of crawling a single page.
#[derive(Debug, Clone, serde::Serialize)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL of the crawled page.
    pub url: String,
    /// Link depth at which the page was reached (seed = 0, presumably —
    /// defined by the internal crawler).
    pub depth: usize,
    /// Whether the page was fetched successfully.
    pub status: CrawlStatus,
    /// Page title, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub title: Option<String>,
    /// Extracted page content, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Error message for failed pages.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
    /// Number of links discovered on the page.
    pub links_found: usize,
}
impl CrawlResult {
    /// Converts an internal crawler page result into the public type.
    fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
        // Map the internal status onto the public enum first, then clone the
        // borrowed fields into the owned result.
        let status = match r.status {
            crate::crawl::CrawlStatus::Ok => CrawlStatus::Ok,
            crate::crawl::CrawlStatus::Error => CrawlStatus::Error,
        };
        Self {
            url: r.url.clone(),
            depth: r.depth,
            status,
            title: r.title.clone(),
            content: r.content.clone(),
            error: r.error.clone(),
            links_found: r.links_found,
        }
    }
}
/// Per-page crawl outcome; serialized in lowercase (`"ok"` / `"error"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum CrawlStatus {
    Ok,
    Error,
}
#[allow(clippy::needless_pass_by_value)]
pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
let internal_opts = build_crawl_options(&opts)?;
crate::runtime::block_on(crate::crawl::run(internal_opts, |r| {
on_page(&CrawlResult::from_internal(r));
}))
.map_err(|e| Error::Engine(e.to_string()))?;
Ok(())
}
/// Crawls starting at `opts.url` and collects every page result into a `Vec`.
///
/// # Errors
/// Propagates any error from [`crawl_each`].
#[allow(clippy::needless_pass_by_value)]
pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
    let mut collected = Vec::new();
    crawl_each(opts, |r| collected.push(r.clone()))?;
    Ok(collected)
}
/// Fetches `url` and converts the page to Markdown, resolving relative links
/// against `url`.
///
/// # Errors
/// Propagates fetch and extraction errors.
pub fn markdown(url: &str) -> crate::error::Result<String> {
    let page = fetch(FetchOptions::new(url))?;
    page.markdown_with_url(url)
}
/// Fetches `url` and extracts structured content as a JSON string, resolving
/// relative links against `url`.
///
/// # Errors
/// Propagates fetch and extraction errors.
pub fn extract_json(url: &str) -> crate::error::Result<String> {
    let page = fetch(FetchOptions::new(url))?;
    page.extract_json_with_url(url)
}
/// Fetches `url` and returns the page's rendered text content.
///
/// # Errors
/// Propagates fetch errors.
pub fn text(url: &str) -> crate::error::Result<String> {
    let page = fetch(FetchOptions::new(url))?;
    Ok(page.inner_text)
}
pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
crate::net::validate_url(url).map_err(|e| Error::InvalidUrl {
url: url.into(),
reason: e.to_string(),
})
}
/// Installs the aws-lc-rs rustls crypto provider as the process default.
/// A second installation attempt returns `Err`, which is deliberately
/// ignored: it just means a provider is already registered.
fn ensure_crypto_provider() {
    drop(rustls::crypto::aws_lc_rs::default_provider().install_default());
}
fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
let seed = crate::net::validate_url(&opts.url).map_err(|e| Error::InvalidUrl {
url: opts.url.clone(),
reason: e.to_string(),
})?;
let include = if opts.include.is_empty() {
None
} else {
Some(crate::crawl::build_globset(&opts.include).map_err(|e| Error::Engine(e.to_string()))?)
};
let exclude = if opts.exclude.is_empty() {
None
} else {
Some(crate::crawl::build_globset(&opts.exclude).map_err(|e| Error::Engine(e.to_string()))?)
};
Ok(crate::crawl::CrawlOptions {
seed,
limit: opts.limit,
max_depth: opts.max_depth,
timeout_secs: opts.timeout.as_secs().max(1),
settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
include,
exclude,
selector: opts.selector.clone(),
json: opts.json,
})
}
#[cfg(test)]
mod tests {
    use super::*;

    // These tests are network-free: they exercise option builders, local page
    // extraction, and URL validation. The `fetch_rejects_*` cases rely on
    // validation failing before any network activity happens — presumably
    // `crate::net::validate_url` rejects these inputs without I/O; confirm if
    // that contract changes.

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }
    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }
    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }
    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }
    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }
    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }
    // Extraction tests build a `Page` by hand — no engine involved.
    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }
    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }
    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }
    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }
    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }
    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}