1use std::time::Duration;
4
5use crate::error::Error;
6use crate::net::sanitize_user_agent;
7
8#[derive(Debug, Clone, Default, serde::Serialize)]
10#[non_exhaustive]
11pub struct Page {
12 pub html: String,
14 pub inner_text: String,
16 pub title: Option<String>,
18 #[serde(skip_serializing_if = "Option::is_none")]
20 pub layout_json: Option<String>,
21 #[serde(skip_serializing_if = "Option::is_none")]
23 pub js_result: Option<String>,
24 pub console_messages: Vec<ConsoleMessage>,
26 #[serde(skip_serializing_if = "Option::is_none")]
28 pub accessibility_tree: Option<String>,
29 #[serde(skip_serializing_if = "Option::is_none")]
31 pub extracted: Option<serde_json::Value>,
32 #[serde(skip)]
33 screenshot_png: Option<Vec<u8>>,
34}
35
36impl Page {
37 pub fn markdown(&self) -> crate::error::Result<String> {
39 self.markdown_with_url("")
40 }
41
42 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
44 let input = crate::extract::ExtractInput::new(&self.html, url)
45 .with_layout_json(self.layout_json.as_deref())
46 .with_inner_text(Some(&self.inner_text));
47 Ok(crate::extract::extract_text(&input)?)
48 }
49
50 pub fn extract_json(&self) -> crate::error::Result<String> {
52 self.extract_json_with_url("")
53 }
54
55 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
57 let input = crate::extract::ExtractInput::new(&self.html, url)
58 .with_layout_json(self.layout_json.as_deref())
59 .with_inner_text(Some(&self.inner_text));
60 Ok(crate::extract::extract_json(&input)?)
61 }
62
63 pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
65 let input = crate::extract::ExtractInput::new(&self.html, url)
66 .with_layout_json(self.layout_json.as_deref())
67 .with_inner_text(Some(&self.inner_text))
68 .with_selector(Some(selector));
69 Ok(crate::extract::extract_text(&input)?)
70 }
71
72 pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
74 let input = crate::extract::ExtractInput::new(&self.html, url)
75 .with_layout_json(self.layout_json.as_deref())
76 .with_inner_text(Some(&self.inner_text))
77 .with_selector(Some(selector));
78 Ok(crate::extract::extract_json(&input)?)
79 }
80
81 #[must_use]
83 pub fn screenshot_png(&self) -> Option<&[u8]> {
84 self.screenshot_png.as_deref()
85 }
86
87 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
88 let title = {
89 let doc = dom_query::Document::from(page.html.as_str());
90 let t = doc.select("title").text().to_string();
91 if t.is_empty() { None } else { Some(t) }
92 };
93 let screenshot_png = page.screenshot.and_then(|img| {
94 let mut buf = std::io::Cursor::new(Vec::new());
95 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
96 Some(buf.into_inner())
97 });
98 Self {
99 html: page.html,
100 inner_text: page.inner_text.unwrap_or_default(),
101 title,
102 layout_json: page.layout_json,
103 js_result: page.js_result,
104 console_messages: page
105 .console_messages
106 .into_iter()
107 .map(|m| ConsoleMessage {
108 level: match m.level {
109 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
110 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
111 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
112 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
113 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
114 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
115 },
116 message: m.message,
117 })
118 .collect(),
119 screenshot_png,
120 accessibility_tree: page.accessibility_tree,
121 extracted: None,
122 }
123 }
124}
125
126#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
128#[non_exhaustive]
129pub struct ConsoleMessage {
130 pub level: ConsoleLevel,
132 pub message: String,
134}
135
136#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
138#[serde(rename_all = "lowercase")]
139#[non_exhaustive]
140pub enum ConsoleLevel {
141 Log,
143 Debug,
145 Info,
147 Warn,
149 Error,
151 Trace,
153}
154
155impl ConsoleLevel {
156 #[must_use]
158 pub fn as_str(&self) -> &'static str {
159 match self {
160 Self::Log => "log",
161 Self::Debug => "debug",
162 Self::Info => "info",
163 Self::Warn => "warn",
164 Self::Error => "error",
165 Self::Trace => "trace",
166 }
167 }
168}
169
170impl std::fmt::Display for ConsoleLevel {
171 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
172 f.pad(self.as_str())
173 }
174}
175
/// What a fetch should produce; chosen through the `FetchOptions`
/// constructors.
#[derive(Clone, Debug, Default)]
pub(crate) enum FetchMode {
    /// Load the page content. This is the default mode.
    #[default]
    Content,
    /// Capture a screenshot of the page.
    Screenshot {
        /// Forwarded unchanged to the engine's screenshot mode.
        // NOTE(review): presumably `true` captures the whole page rather than
        // just the viewport — confirm against the bridge.
        full_page: bool,
    },
    /// Evaluate the contained JavaScript expression in the page.
    JavaScript(String),
}
185
186#[must_use = "options do nothing until passed to fetch()"]
188#[derive(Debug, Clone)]
189pub struct FetchOptions {
190 pub(crate) url: String,
191 pub(crate) timeout: Duration,
192 pub(crate) settle: Duration,
193 pub(crate) mode: FetchMode,
194 pub(crate) user_agent: Option<String>,
195 pub(crate) extract_schema: Option<crate::schema::ExtractSchema>,
196}
197
198impl FetchOptions {
199 pub fn new(url: &str) -> Self {
201 Self {
202 url: url.into(),
203 timeout: Duration::from_secs(30),
204 settle: Duration::ZERO,
205 mode: FetchMode::Content,
206 user_agent: None,
207 extract_schema: None,
208 }
209 }
210
211 pub fn screenshot(url: &str, full_page: bool) -> Self {
213 Self {
214 mode: FetchMode::Screenshot { full_page },
215 ..Self::new(url)
216 }
217 }
218
219 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
221 Self {
222 mode: FetchMode::JavaScript(expression.into()),
223 ..Self::new(url)
224 }
225 }
226
227 pub fn timeout(mut self, timeout: Duration) -> Self {
229 self.timeout = timeout;
230 self
231 }
232
233 pub fn settle(mut self, settle: Duration) -> Self {
235 self.settle = settle;
236 self
237 }
238
239 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
241 self.user_agent = Some(sanitize_user_agent(ua.into()));
242 self
243 }
244
245 pub fn schema(mut self, schema: crate::schema::ExtractSchema) -> Self {
247 self.extract_schema = Some(schema);
248 self
249 }
250}
251
252#[allow(clippy::needless_pass_by_value)]
254pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
255 crate::net::ensure_crypto_provider();
256
257 crate::net::validate_url(&opts.url)?;
258
259 if matches!(opts.mode, FetchMode::Content)
260 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
261 {
262 let text = crate::extract::extract_pdf(&bytes);
263 return Ok(Page {
264 html: String::new(),
265 inner_text: text,
266 ..Page::default()
267 });
268 }
269
270 let bridge_opts = crate::bridge::FetchOptions {
271 url: &opts.url,
272 timeout_secs: opts.timeout.as_secs().max(1),
273 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
274 user_agent: opts.user_agent.as_deref(),
275 mode: match opts.mode {
276 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
277 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
278 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
279 expression: expr.clone(),
280 },
281 },
282 };
283
284 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
285 let msg = format!("{e:#}");
286 if msg.contains("timed out") {
287 Error::Timeout {
288 url: opts.url.clone(),
289 timeout: opts.timeout,
290 }
291 } else {
292 Error::Engine(msg)
293 }
294 })?;
295
296 let mut page = Page::from_servo(servo_page);
297 if let Some(schema) = opts.extract_schema.as_ref() {
298 page.extracted = Some(schema.extract_from(&page.html));
299 }
300 Ok(page)
301}
302
303pub fn markdown(url: &str) -> crate::error::Result<String> {
305 fetch(FetchOptions::new(url))?.markdown_with_url(url)
306}
307
308pub fn extract_json(url: &str) -> crate::error::Result<String> {
310 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
311}
312
313pub fn text(url: &str) -> crate::error::Result<String> {
315 Ok(fetch(FetchOptions::new(url))?.inner_text)
316}
317
#[cfg(test)]
mod tests {
    use super::*;

    /// URL shared by the option-builder tests.
    const EXAMPLE: &str = "https://example.com";

    /// Builds a page holding only the given markup and inner text; every other
    /// field keeps its default.
    fn page_with(html: &str, inner_text: &str) -> Page {
        Page {
            html: html.into(),
            inner_text: inner_text.into(),
            ..Page::default()
        }
    }

    /// Builds a page from markup alone (empty inner text, as with
    /// `Page::default()`).
    fn page_from_html(html: &str) -> Page {
        page_with(html, "")
    }

    /// Runs `raw` through the `user_agent` builder and returns what was stored.
    fn stored_user_agent(raw: &str) -> Option<String> {
        FetchOptions::new(EXAMPLE).user_agent(raw).user_agent
    }

    // --- FetchOptions construction -----------------------------------------

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new(EXAMPLE);
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let mode = FetchOptions::screenshot(EXAMPLE, true).mode;
        assert!(matches!(mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let mode = FetchOptions::javascript(EXAMPLE, "document.title").mode;
        assert!(matches!(mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let timeout = Duration::from_secs(60);
        let settle = Duration::from_millis(500);
        let opts = FetchOptions::new(EXAMPLE).timeout(timeout).settle(settle);
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    // --- User-Agent handling -----------------------------------------------

    #[test]
    fn fetch_user_agent_set() {
        assert_eq!(stored_user_agent("MyBot/1.0").as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        assert!(FetchOptions::new(EXAMPLE).user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        assert_eq!(
            stored_user_agent("Bot\r\nX-Evil: yes").as_deref(),
            Some("Bot X-Evil: yes")
        );
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        assert_eq!(stored_user_agent("Bot\0/1.0").as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        assert_eq!(stored_user_agent("").as_deref(), Some(""));
    }

    // --- Page extraction ---------------------------------------------------

    #[test]
    fn page_markdown_from_html() {
        let page = page_with(
            "<html><head><title>Test</title></head><body><p>hello world</p></body></html>",
            "hello world",
        );
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = page_with(
            "<html><head><title>Test</title></head><body><p>content</p></body></html>",
            "content",
        );
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        assert!(Page::default().screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = page_from_html("<html><body><article>keep</article><aside>drop</aside></body></html>");
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = page_from_html("<html><body><article>scoped</article></body></html>");
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = page_from_html("<html><body><article>x</article></body></html>");
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = page_from_html("<html><body><p>x</p></body></html>");
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = page_from_html("<html><body><p>x</p></body></html>");
        assert!(page.markdown_with_selector("", "").is_err());
    }

    // --- URL validation in fetch() -----------------------------------------

    #[test]
    fn fetch_rejects_invalid_url() {
        let outcome = fetch(FetchOptions::new("not a url"));
        assert!(matches!(outcome, Err(Error::InvalidUrl { .. })));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        assert!(fetch(FetchOptions::new("http://127.0.0.1/")).is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        assert!(fetch(FetchOptions::new("file:///etc/passwd")).is_err());
    }
}