1use std::time::Duration;
4
5use crate::error::Error;
6use crate::net::sanitize_user_agent;
7
8#[derive(Debug, Clone, Default, serde::Serialize)]
10#[non_exhaustive]
11pub struct Page {
12 pub html: String,
14 pub inner_text: String,
16 pub title: Option<String>,
18 #[serde(skip_serializing_if = "Option::is_none")]
20 pub layout_json: Option<String>,
21 #[serde(skip_serializing_if = "Option::is_none")]
23 pub js_result: Option<String>,
24 pub console_messages: Vec<ConsoleMessage>,
26 #[serde(skip_serializing_if = "Option::is_none")]
28 pub accessibility_tree: Option<String>,
29 #[serde(skip)]
30 screenshot_png: Option<Vec<u8>>,
31}
32
33impl Page {
34 pub fn markdown(&self) -> crate::error::Result<String> {
36 self.markdown_with_url("")
37 }
38
39 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
41 let input = crate::extract::ExtractInput::new(&self.html, url)
42 .with_layout_json(self.layout_json.as_deref())
43 .with_inner_text(Some(&self.inner_text));
44 Ok(crate::extract::extract_text(&input)?)
45 }
46
47 pub fn extract_json(&self) -> crate::error::Result<String> {
49 self.extract_json_with_url("")
50 }
51
52 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
54 let input = crate::extract::ExtractInput::new(&self.html, url)
55 .with_layout_json(self.layout_json.as_deref())
56 .with_inner_text(Some(&self.inner_text));
57 Ok(crate::extract::extract_json(&input)?)
58 }
59
60 pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
62 let input = crate::extract::ExtractInput::new(&self.html, url)
63 .with_layout_json(self.layout_json.as_deref())
64 .with_inner_text(Some(&self.inner_text))
65 .with_selector(Some(selector));
66 Ok(crate::extract::extract_text(&input)?)
67 }
68
69 pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
71 let input = crate::extract::ExtractInput::new(&self.html, url)
72 .with_layout_json(self.layout_json.as_deref())
73 .with_inner_text(Some(&self.inner_text))
74 .with_selector(Some(selector));
75 Ok(crate::extract::extract_json(&input)?)
76 }
77
78 #[must_use]
80 pub fn screenshot_png(&self) -> Option<&[u8]> {
81 self.screenshot_png.as_deref()
82 }
83
84 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
85 let title = {
86 let doc = dom_query::Document::from(page.html.as_str());
87 let t = doc.select("title").text().to_string();
88 if t.is_empty() { None } else { Some(t) }
89 };
90 let screenshot_png = page.screenshot.and_then(|img| {
91 let mut buf = std::io::Cursor::new(Vec::new());
92 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
93 Some(buf.into_inner())
94 });
95 Self {
96 html: page.html,
97 inner_text: page.inner_text.unwrap_or_default(),
98 title,
99 layout_json: page.layout_json,
100 js_result: page.js_result,
101 console_messages: page
102 .console_messages
103 .into_iter()
104 .map(|m| ConsoleMessage {
105 level: match m.level {
106 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
107 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
108 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
109 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
110 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
111 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
112 },
113 message: m.message,
114 })
115 .collect(),
116 screenshot_png,
117 accessibility_tree: page.accessibility_tree,
118 }
119 }
120}
121
122#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
124#[non_exhaustive]
125pub struct ConsoleMessage {
126 pub level: ConsoleLevel,
128 pub message: String,
130}
131
132#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
134#[serde(rename_all = "lowercase")]
135#[non_exhaustive]
136pub enum ConsoleLevel {
137 Log,
139 Debug,
141 Info,
143 Warn,
145 Error,
147 Trace,
149}
150
151impl ConsoleLevel {
152 #[must_use]
154 pub fn as_str(&self) -> &'static str {
155 match self {
156 Self::Log => "log",
157 Self::Debug => "debug",
158 Self::Info => "info",
159 Self::Warn => "warn",
160 Self::Error => "error",
161 Self::Trace => "trace",
162 }
163 }
164}
165
166impl std::fmt::Display for ConsoleLevel {
167 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168 f.pad(self.as_str())
169 }
170}
171
/// What a fetch should produce. Crate-internal; callers pick a mode through
/// the [`FetchOptions`] constructors.
#[derive(Clone, Debug, Default)]
pub(crate) enum FetchMode {
    /// Fetch the page content. This is the default mode.
    #[default]
    Content,
    /// Capture a screenshot; `full_page` is forwarded to the rendering bridge.
    Screenshot { full_page: bool },
    /// Evaluate the contained JavaScript expression on the page.
    JavaScript(String),
}
181
182#[must_use = "options do nothing until passed to fetch()"]
184#[derive(Debug, Clone)]
185pub struct FetchOptions {
186 pub(crate) url: String,
187 pub(crate) timeout: Duration,
188 pub(crate) settle: Duration,
189 pub(crate) mode: FetchMode,
190 pub(crate) user_agent: Option<String>,
191}
192
193impl FetchOptions {
194 pub fn new(url: &str) -> Self {
196 Self {
197 url: url.into(),
198 timeout: Duration::from_secs(30),
199 settle: Duration::ZERO,
200 mode: FetchMode::Content,
201 user_agent: None,
202 }
203 }
204
205 pub fn screenshot(url: &str, full_page: bool) -> Self {
207 Self {
208 mode: FetchMode::Screenshot { full_page },
209 ..Self::new(url)
210 }
211 }
212
213 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
215 Self {
216 mode: FetchMode::JavaScript(expression.into()),
217 ..Self::new(url)
218 }
219 }
220
221 pub fn timeout(mut self, timeout: Duration) -> Self {
223 self.timeout = timeout;
224 self
225 }
226
227 pub fn settle(mut self, settle: Duration) -> Self {
229 self.settle = settle;
230 self
231 }
232
233 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
235 self.user_agent = Some(sanitize_user_agent(ua.into()));
236 self
237 }
238}
239
240#[allow(clippy::needless_pass_by_value)]
242pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
243 crate::net::ensure_crypto_provider();
244
245 crate::net::validate_url(&opts.url)?;
246
247 if matches!(opts.mode, FetchMode::Content)
248 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
249 {
250 let text = crate::extract::extract_pdf(&bytes);
251 return Ok(Page {
252 html: String::new(),
253 inner_text: text,
254 ..Page::default()
255 });
256 }
257
258 let bridge_opts = crate::bridge::FetchOptions {
259 url: &opts.url,
260 timeout_secs: opts.timeout.as_secs().max(1),
261 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
262 user_agent: opts.user_agent.as_deref(),
263 mode: match opts.mode {
264 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
265 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
266 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
267 expression: expr.clone(),
268 },
269 },
270 };
271
272 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
273 let msg = format!("{e:#}");
274 if msg.contains("timed out") {
275 Error::Timeout {
276 url: opts.url.clone(),
277 timeout: opts.timeout,
278 }
279 } else {
280 Error::Engine(msg)
281 }
282 })?;
283
284 Ok(Page::from_servo(servo_page))
285}
286
287pub fn markdown(url: &str) -> crate::error::Result<String> {
289 fetch(FetchOptions::new(url))?.markdown_with_url(url)
290}
291
292pub fn extract_json(url: &str) -> crate::error::Result<String> {
294 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
295}
296
297pub fn text(url: &str) -> crate::error::Result<String> {
299 Ok(fetch(FetchOptions::new(url))?.inner_text)
300}
301
#[cfg(test)]
mod tests {
    use super::*;

    /// URL used by the option-builder tests.
    const EXAMPLE_URL: &str = "https://example.com";

    /// Page that carries only `html`; every other field keeps its default.
    fn page_from_html(html: &str) -> Page {
        Page {
            html: html.into(),
            ..Page::default()
        }
    }

    /// User agent stored by the builder after `ua` has been applied.
    fn stored_user_agent(ua: &str) -> Option<String> {
        FetchOptions::new(EXAMPLE_URL).user_agent(ua).user_agent
    }

    // --- FetchOptions construction -------------------------------------

    #[test]
    fn fetch_options_defaults() {
        let defaults = FetchOptions::new(EXAMPLE_URL);
        assert_eq!(defaults.url, "https://example.com");
        assert_eq!(defaults.timeout, Duration::from_secs(30));
        assert_eq!(defaults.settle, Duration::ZERO);
        assert!(matches!(defaults.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let FetchOptions { mode, .. } = FetchOptions::screenshot(EXAMPLE_URL, true);
        assert!(matches!(mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let FetchOptions { mode, .. } = FetchOptions::javascript(EXAMPLE_URL, "document.title");
        assert!(matches!(mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let tuned = FetchOptions::new(EXAMPLE_URL)
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(tuned.timeout, Duration::from_secs(60));
        assert_eq!(tuned.settle, Duration::from_millis(500));
    }

    // --- User-agent handling -------------------------------------------

    #[test]
    fn fetch_user_agent_set() {
        assert_eq!(stored_user_agent("MyBot/1.0").as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        assert!(FetchOptions::new(EXAMPLE_URL).user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        assert_eq!(
            stored_user_agent("Bot\r\nX-Evil: yes").as_deref(),
            Some("Bot X-Evil: yes")
        );
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        assert_eq!(stored_user_agent("Bot\0/1.0").as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        assert_eq!(stored_user_agent("").as_deref(), Some(""));
    }

    // --- Page extraction -----------------------------------------------

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            inner_text: "hello world".into(),
            ..page_from_html("<html><head><title>Test</title></head><body><p>hello world</p></body></html>")
        };
        let rendered = page.markdown().unwrap();
        assert!(rendered.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            inner_text: "content".into(),
            ..page_from_html("<html><head><title>Test</title></head><body><p>content</p></body></html>")
        };
        let raw = page.extract_json().unwrap();
        // Parsing is the assertion: the output must be well-formed JSON.
        let _: serde_json::Value = serde_json::from_str(&raw).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        assert!(Page::default().screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = page_from_html("<html><body><article>keep</article><aside>drop</aside></body></html>");
        let scoped = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(scoped.contains("keep"));
        assert!(!scoped.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = page_from_html("<html><body><article>scoped</article></body></html>");
        let raw = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let doc: serde_json::Value = serde_json::from_str(&raw).expect("valid JSON");
        assert_eq!(doc["url"].as_str(), Some("https://example.com/page"));
        assert!(doc["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = page_from_html("<html><body><article>x</article></body></html>");
        let scoped = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(scoped.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = page_from_html("<html><body><p>x</p></body></html>");
        let failure = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(failure.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = page_from_html("<html><body><p>x</p></body></html>");
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // --- URL validation in fetch ---------------------------------------

    #[test]
    fn fetch_rejects_invalid_url() {
        let outcome = fetch(FetchOptions::new("not a url"));
        assert!(outcome.is_err());
        let failure = outcome.unwrap_err();
        assert!(matches!(failure, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let outcome = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(outcome.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let outcome = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(outcome.is_err());
    }
}