1use std::time::Duration;
4
5use crate::error::Error;
6
/// A loaded page together with the artifacts captured for it.
#[derive(Debug, Clone, Default, serde::Serialize)]
#[non_exhaustive]
pub struct Page {
    /// HTML of the page as handed over by the engine bridge.
    pub html: String,
    /// Text of the page as reported by the engine; empty when none was reported.
    pub inner_text: String,
    /// Title text read from the `<title>` markup in `html`; `None` when that text is empty.
    pub title: Option<String>,
    /// Layout data as a JSON string, passed to the extractors when present.
    /// Left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub layout_json: Option<String>,
    /// JavaScript result passed through from the engine — presumably the value of the
    /// expression given to `FetchOptions::javascript`; confirm against the bridge.
    /// Left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub js_result: Option<String>,
    /// Console messages reported by the engine for this page.
    pub console_messages: Vec<ConsoleMessage>,
    /// Accessibility tree as a string, when the engine supplied one.
    /// Left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accessibility_tree: Option<String>,
    /// PNG-encoded screenshot bytes; never serialized, read through `Page::screenshot_png`.
    #[serde(skip)]
    screenshot_png: Option<Vec<u8>>,
}
31
32impl Page {
33 pub fn markdown(&self) -> crate::error::Result<String> {
35 self.markdown_with_url("")
36 }
37
38 pub fn markdown_with_url(&self, url: &str) -> crate::error::Result<String> {
40 let input = crate::extract::ExtractInput::new(&self.html, url)
41 .with_layout_json(self.layout_json.as_deref())
42 .with_inner_text(Some(&self.inner_text));
43 Ok(crate::extract::extract_text(&input)?)
44 }
45
46 pub fn extract_json(&self) -> crate::error::Result<String> {
48 self.extract_json_with_url("")
49 }
50
51 pub fn extract_json_with_url(&self, url: &str) -> crate::error::Result<String> {
53 let input = crate::extract::ExtractInput::new(&self.html, url)
54 .with_layout_json(self.layout_json.as_deref())
55 .with_inner_text(Some(&self.inner_text));
56 Ok(crate::extract::extract_json(&input)?)
57 }
58
59 pub fn markdown_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
61 let input = crate::extract::ExtractInput::new(&self.html, url)
62 .with_layout_json(self.layout_json.as_deref())
63 .with_inner_text(Some(&self.inner_text))
64 .with_selector(Some(selector));
65 Ok(crate::extract::extract_text(&input)?)
66 }
67
68 pub fn extract_json_with_selector(&self, url: &str, selector: &str) -> crate::error::Result<String> {
70 let input = crate::extract::ExtractInput::new(&self.html, url)
71 .with_layout_json(self.layout_json.as_deref())
72 .with_inner_text(Some(&self.inner_text))
73 .with_selector(Some(selector));
74 Ok(crate::extract::extract_json(&input)?)
75 }
76
77 #[must_use]
79 pub fn screenshot_png(&self) -> Option<&[u8]> {
80 self.screenshot_png.as_deref()
81 }
82
83 pub(crate) fn from_servo(page: crate::bridge::ServoPage) -> Self {
84 let title = {
85 let doc = dom_query::Document::from(page.html.as_str());
86 let t = doc.select("title").text().to_string();
87 if t.is_empty() { None } else { Some(t) }
88 };
89 let screenshot_png = page.screenshot.and_then(|img| {
90 let mut buf = std::io::Cursor::new(Vec::new());
91 img.write_to(&mut buf, image::ImageFormat::Png).ok()?;
92 Some(buf.into_inner())
93 });
94 Self {
95 html: page.html,
96 inner_text: page.inner_text.unwrap_or_default(),
97 title,
98 layout_json: page.layout_json,
99 js_result: page.js_result,
100 console_messages: page
101 .console_messages
102 .into_iter()
103 .map(|m| ConsoleMessage {
104 level: match m.level {
105 crate::bridge::ConsoleLevel::Log => ConsoleLevel::Log,
106 crate::bridge::ConsoleLevel::Debug => ConsoleLevel::Debug,
107 crate::bridge::ConsoleLevel::Info => ConsoleLevel::Info,
108 crate::bridge::ConsoleLevel::Warn => ConsoleLevel::Warn,
109 crate::bridge::ConsoleLevel::Error => ConsoleLevel::Error,
110 crate::bridge::ConsoleLevel::Trace => ConsoleLevel::Trace,
111 },
112 message: m.message,
113 })
114 .collect(),
115 screenshot_png,
116 accessibility_tree: page.accessibility_tree,
117 }
118 }
119}
120
/// One console message reported by the engine while a page was loaded.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[non_exhaustive]
pub struct ConsoleMessage {
    /// Level the message was logged at.
    pub level: ConsoleLevel,
    /// Message text as reported by the engine.
    pub message: String,
}
130
/// Level of a [`ConsoleMessage`].
///
/// Serialized as the lowercase variant name (`"log"`, `"debug"`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum ConsoleLevel {
    /// Serialized and displayed as `"log"`.
    Log,
    /// Serialized and displayed as `"debug"`.
    Debug,
    /// Serialized and displayed as `"info"`.
    Info,
    /// Serialized and displayed as `"warn"`.
    Warn,
    /// Serialized and displayed as `"error"`.
    Error,
    /// Serialized and displayed as `"trace"`.
    Trace,
}
149
150impl ConsoleLevel {
151 #[must_use]
153 pub fn as_str(&self) -> &'static str {
154 match self {
155 Self::Log => "log",
156 Self::Debug => "debug",
157 Self::Info => "info",
158 Self::Warn => "warn",
159 Self::Error => "error",
160 Self::Trace => "trace",
161 }
162 }
163}
164
165impl std::fmt::Display for ConsoleLevel {
166 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167 f.pad(self.as_str())
168 }
169}
170
/// What a fetch should do with the page; translated to the bridge's own mode by `fetch`.
#[derive(Debug, Clone, Default)]
pub(crate) enum FetchMode {
    /// Plain content fetch; the default mode.
    #[default]
    Content,
    /// Capture a screenshot of the page.
    Screenshot {
        /// Forwarded unchanged to the engine bridge — presumably selects the whole page
        /// rather than only the viewport; confirm against the bridge.
        full_page: bool,
    },
    /// Execute the contained JavaScript expression in the page.
    JavaScript(String),
}
180
/// Options for [`fetch`], built with the constructors and chained setters on this type.
#[must_use = "options do nothing until passed to fetch()"]
#[derive(Debug, Clone)]
pub struct FetchOptions {
    /// URL to load; validated by `fetch` before anything is requested.
    pub(crate) url: String,
    /// Overall timeout; `fetch` hands it to the engine in whole seconds, at least one.
    pub(crate) timeout: Duration,
    /// Settle time handed to the engine in milliseconds — presumably an extra wait after
    /// load before the page is captured; confirm against the bridge.
    pub(crate) settle: Duration,
    /// What to do with the page (content, screenshot or JavaScript).
    pub(crate) mode: FetchMode,
    /// User-agent override, already sanitized; `None` leaves the choice to the engine.
    pub(crate) user_agent: Option<String>,
}
196
197impl FetchOptions {
198 pub fn new(url: &str) -> Self {
200 Self {
201 url: url.into(),
202 timeout: Duration::from_secs(30),
203 settle: Duration::ZERO,
204 mode: FetchMode::Content,
205 user_agent: None,
206 }
207 }
208
209 pub fn screenshot(url: &str, full_page: bool) -> Self {
211 Self {
212 mode: FetchMode::Screenshot { full_page },
213 ..Self::new(url)
214 }
215 }
216
217 pub fn javascript(url: &str, expression: impl Into<String>) -> Self {
219 Self {
220 mode: FetchMode::JavaScript(expression.into()),
221 ..Self::new(url)
222 }
223 }
224
225 pub fn timeout(mut self, timeout: Duration) -> Self {
227 self.timeout = timeout;
228 self
229 }
230
231 pub fn settle(mut self, settle: Duration) -> Self {
233 self.settle = settle;
234 self
235 }
236
237 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
239 self.user_agent = Some(sanitize_user_agent(ua.into()));
240 self
241 }
242}
243
244#[allow(clippy::needless_pass_by_value)]
249pub fn fetch(opts: FetchOptions) -> crate::error::Result<Page> {
250 ensure_crypto_provider();
251
252 crate::net::validate_url(&opts.url, crate::bridge::engine_policy()).map_err(|e| map_url_error(&opts.url, e))?;
253
254 if matches!(opts.mode, FetchMode::Content)
255 && let Some(bytes) = crate::pdf::probe(&opts.url, opts.timeout.as_secs().max(1))
256 {
257 let text = crate::extract::extract_pdf(&bytes);
258 return Ok(Page {
259 html: String::new(),
260 inner_text: text,
261 ..Page::default()
262 });
263 }
264
265 let bridge_opts = crate::bridge::FetchOptions {
266 url: &opts.url,
267 timeout_secs: opts.timeout.as_secs().max(1),
268 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
269 user_agent: opts.user_agent.as_deref(),
270 mode: match opts.mode {
271 FetchMode::Content => crate::bridge::FetchMode::Content { include_a11y: false },
272 FetchMode::Screenshot { full_page } => crate::bridge::FetchMode::Screenshot { full_page },
273 FetchMode::JavaScript(ref expr) => crate::bridge::FetchMode::ExecuteJs {
274 expression: expr.clone(),
275 },
276 },
277 };
278
279 let servo_page = crate::bridge::fetch_page(bridge_opts).map_err(|e| {
280 let msg = format!("{e:#}");
281 if msg.contains("timed out") {
282 Error::Timeout {
283 url: opts.url.clone(),
284 timeout: opts.timeout,
285 }
286 } else {
287 Error::Engine(msg)
288 }
289 })?;
290
291 Ok(Page::from_servo(servo_page))
292}
293
/// Options for [`crawl`] and [`crawl_each`], built with chained setter calls.
#[must_use = "options do nothing until passed to crawl() or crawl_each()"]
#[derive(Debug, Clone)]
pub struct CrawlOptions {
    /// Seed URL; validated before the crawl starts.
    pub(crate) url: String,
    /// Limit handed to the crawler (default 50) — presumably the maximum number of pages; confirm in `crate::crawl`.
    pub(crate) limit: usize,
    /// Depth limit handed to the crawler (default 3) — presumably link distance from the seed; confirm in `crate::crawl`.
    pub(crate) max_depth: usize,
    /// Timeout, handed to the crawler in whole seconds (at least one).
    pub(crate) timeout: Duration,
    /// Settle time, handed to the crawler in milliseconds.
    pub(crate) settle: Duration,
    /// Glob patterns compiled into the include set; an empty list means no include set.
    pub(crate) include: Vec<String>,
    /// Glob patterns compiled into the exclude set; an empty list means no exclude set.
    pub(crate) exclude: Vec<String>,
    /// Optional selector forwarded to the crawler.
    pub(crate) selector: Option<String>,
    /// JSON flag forwarded to the crawler — presumably switches page content to JSON output; confirm in `crate::crawl`.
    pub(crate) json: bool,
    /// User-agent override, already sanitized.
    pub(crate) user_agent: Option<String>,
}
309
310impl CrawlOptions {
311 pub fn new(url: &str) -> Self {
313 Self {
314 url: url.into(),
315 limit: 50,
316 max_depth: 3,
317 timeout: Duration::from_secs(30),
318 settle: Duration::ZERO,
319 include: Vec::new(),
320 exclude: Vec::new(),
321 selector: None,
322 json: false,
323 user_agent: None,
324 }
325 }
326
327 pub fn limit(mut self, n: usize) -> Self {
329 self.limit = n;
330 self
331 }
332
333 pub fn max_depth(mut self, n: usize) -> Self {
335 self.max_depth = n;
336 self
337 }
338
339 pub fn timeout(mut self, timeout: Duration) -> Self {
341 self.timeout = timeout;
342 self
343 }
344
345 pub fn settle(mut self, settle: Duration) -> Self {
347 self.settle = settle;
348 self
349 }
350
351 pub fn include(mut self, patterns: &[&str]) -> Self {
353 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
354 self
355 }
356
357 pub fn exclude(mut self, patterns: &[&str]) -> Self {
359 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
360 self
361 }
362
363 pub fn json(mut self, json: bool) -> Self {
365 self.json = json;
366 self
367 }
368
369 pub fn selector(mut self, selector: impl Into<String>) -> Self {
371 self.selector = Some(selector.into());
372 self
373 }
374
375 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
377 self.user_agent = Some(sanitize_user_agent(ua.into()));
378 self
379 }
380}
381
/// Outcome of crawling one URL.
///
/// Serialized as a flat map with a `"status"` entry of `"ok"` or `"error"`.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CrawlResult {
    /// URL this result belongs to.
    pub url: String,
    /// Depth reported by the crawler for this URL.
    pub depth: usize,
    /// The crawled page, or the error reported for this URL.
    pub outcome: Result<CrawlPage, CrawlError>,
}
393
/// Data of a successfully crawled page.
#[derive(Debug, Clone)]
pub struct CrawlPage {
    /// Page title, when the crawler reported one.
    pub title: Option<String>,
    /// Page content; empty when the crawler reported none.
    pub content: String,
    /// Link count reported by the crawler for this page.
    pub links_found: usize,
}
404
/// Error recorded for a URL that could not be crawled.
#[derive(Debug, Clone)]
pub struct CrawlError {
    /// Error text; empty when the crawler supplied none. This is also the `Display` output.
    pub message: String,
}
411
412impl std::fmt::Display for CrawlError {
413 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
414 f.write_str(&self.message)
415 }
416}
417
// Marker impl so `CrawlError` can be used wherever a `std::error::Error` is expected;
// it carries no source error.
impl std::error::Error for CrawlError {}
419
420impl serde::Serialize for CrawlResult {
421 fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
422 use serde::ser::SerializeMap;
423 match &self.outcome {
424 Ok(page) => {
425 let mut map = serializer.serialize_map(None)?;
426 map.serialize_entry("url", &self.url)?;
427 map.serialize_entry("depth", &self.depth)?;
428 map.serialize_entry("status", "ok")?;
429 if let Some(t) = &page.title {
430 map.serialize_entry("title", t)?;
431 }
432 map.serialize_entry("content", &page.content)?;
433 map.serialize_entry("links_found", &page.links_found)?;
434 map.end()
435 }
436 Err(e) => {
437 let mut map = serializer.serialize_map(None)?;
438 map.serialize_entry("url", &self.url)?;
439 map.serialize_entry("depth", &self.depth)?;
440 map.serialize_entry("status", "error")?;
441 map.serialize_entry("error", &e.message)?;
442 map.end()
443 }
444 }
445 }
446}
447
448impl CrawlResult {
449 fn from_internal(r: &crate::crawl::CrawlPageResult) -> Self {
450 let outcome = match r.status {
451 crate::crawl::CrawlStatus::Ok => Ok(CrawlPage {
452 title: r.title.clone(),
453 content: r.content.clone().unwrap_or_default(),
454 links_found: r.links_found,
455 }),
456 crate::crawl::CrawlStatus::Error => Err(CrawlError {
457 message: r.error.clone().unwrap_or_default(),
458 }),
459 };
460 Self {
461 url: r.url.clone(),
462 depth: r.depth,
463 outcome,
464 }
465 }
466}
467
468#[allow(clippy::needless_pass_by_value)]
470pub fn crawl_each(opts: CrawlOptions, mut on_page: impl FnMut(&CrawlResult)) -> crate::error::Result<()> {
471 ensure_crypto_provider();
472 let internal_opts = build_crawl_options(&opts)?;
473 crate::runtime::block_on(async {
474 let robots = tokio::task::spawn_blocking({
475 let seed = internal_opts.seed.clone();
476 let user_agent = internal_opts.user_agent.clone();
477 let timeout = Duration::from_secs(internal_opts.timeout_secs);
478 move || crate::robots::RobotsRules::fetch(&seed, user_agent.as_deref(), timeout)
479 })
480 .await
481 .unwrap_or(crate::robots::RobotsPolicy::Unreachable);
482 crate::crawl::run(internal_opts, robots, &crate::bridge::ServoFetcher, |r| {
483 on_page(&CrawlResult::from_internal(r));
484 })
485 .await
486 })
487 .map_err(|e| Error::Engine(e.to_string()))?;
488 Ok(())
489}
490
491#[allow(clippy::needless_pass_by_value)]
493pub fn crawl(opts: CrawlOptions) -> crate::error::Result<Vec<CrawlResult>> {
494 let mut results = Vec::new();
495 crawl_each(opts, |r| results.push(r.clone()))?;
496 Ok(results)
497}
498
499#[must_use = "options do nothing until passed to map()"]
501#[derive(Debug, Clone)]
502pub struct MapOptions {
503 url: String,
504 limit: usize,
505 include: Vec<String>,
506 exclude: Vec<String>,
507 user_agent: Option<String>,
508 timeout: u64,
509 no_fallback: bool,
510}
511
512impl MapOptions {
513 pub fn new(url: impl Into<String>) -> Self {
515 Self {
516 url: url.into(),
517 limit: 5000,
518 include: Vec::new(),
519 exclude: Vec::new(),
520 user_agent: None,
521 timeout: 30,
522 no_fallback: false,
523 }
524 }
525
526 pub fn limit(mut self, n: usize) -> Self {
528 self.limit = n;
529 self
530 }
531
532 pub fn include(mut self, patterns: &[&str]) -> Self {
534 self.include = patterns.iter().map(|s| (*s).to_string()).collect();
535 self
536 }
537
538 pub fn exclude(mut self, patterns: &[&str]) -> Self {
540 self.exclude = patterns.iter().map(|s| (*s).to_string()).collect();
541 self
542 }
543
544 pub fn user_agent(mut self, ua: impl Into<String>) -> Self {
546 self.user_agent = Some(ua.into());
547 self
548 }
549
550 pub fn timeout(mut self, secs: u64) -> Self {
552 self.timeout = secs;
553 self
554 }
555
556 pub fn no_fallback(mut self, yes: bool) -> Self {
558 self.no_fallback = yes;
559 self
560 }
561}
562
/// One URL reported by [`map`].
#[derive(Debug, Clone, serde::Serialize)]
pub struct MappedUrl {
    /// The discovered URL.
    pub url: String,
    /// Last-modified value reported alongside the URL — presumably a sitemap `<lastmod>`;
    /// confirm in `crate::map`. Left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lastmod: Option<String>,
}
572
573#[allow(clippy::needless_pass_by_value)]
575pub fn map(opts: MapOptions) -> crate::error::Result<Vec<MappedUrl>> {
576 ensure_crypto_provider();
577 let seed = url::Url::parse(&opts.url).map_err(|e| Error::InvalidUrl {
578 url: opts.url.clone(),
579 reason: e.to_string(),
580 })?;
581 crate::net::validate_url(seed.as_str(), crate::bridge::engine_policy()).map_err(|e| map_url_error(&opts.url, e))?;
582
583 let include = if opts.include.is_empty() {
584 None
585 } else {
586 Some(crate::scope::build_globset(&opts.include)?)
587 };
588 let exclude = if opts.exclude.is_empty() {
589 None
590 } else {
591 Some(crate::scope::build_globset(&opts.exclude)?)
592 };
593
594 let internal = crate::map::MapConfig {
595 seed,
596 limit: opts.limit,
597 include,
598 exclude,
599 user_agent: opts.user_agent,
600 timeout: Duration::from_secs(opts.timeout),
601 no_fallback: opts.no_fallback,
602 };
603
604 let mut results = Vec::new();
605 crate::runtime::block_on(crate::map::run(&internal, |entry| {
606 results.push(MappedUrl {
607 url: entry.url.clone(),
608 lastmod: entry.lastmod.clone(),
609 });
610 }))
611 .map_err(|e| Error::Engine(e.to_string()))?;
612 Ok(results)
613}
614
615pub fn markdown(url: &str) -> crate::error::Result<String> {
617 fetch(FetchOptions::new(url))?.markdown_with_url(url)
618}
619
620pub fn extract_json(url: &str) -> crate::error::Result<String> {
622 fetch(FetchOptions::new(url))?.extract_json_with_url(url)
623}
624
625pub fn text(url: &str) -> crate::error::Result<String> {
627 Ok(fetch(FetchOptions::new(url))?.inner_text)
628}
629
630pub fn init(policy: crate::net::NetworkPolicy) {
632 crate::bridge::set_engine_policy(policy);
633}
634
635pub fn validate_url(url: &str) -> crate::error::Result<url::Url> {
638 crate::net::validate_url(url, crate::bridge::engine_policy()).map_err(|e| map_url_error(url, e))
639}
640
641fn ensure_crypto_provider() {
642 let _ = rustls::crypto::aws_lc_rs::default_provider().install_default();
643}
644
/// Makes a caller-supplied user-agent string safe to place in a request header.
///
/// Every ASCII control character except horizontal tab is replaced by a single space. That
/// covers CR, LF and NUL (header injection / truncation) as well as the remaining controls
/// such as VT, FF, ESC and DEL, none of which are valid in an HTTP field value. Each
/// offending character gets its own space, so `"\r\n"` becomes two spaces. Strings without
/// such characters are returned as-is, without reallocating.
pub(crate) fn sanitize_user_agent(ua: String) -> String {
    // Tab is the one control character permitted inside a field value, so it is kept.
    let forbidden = |c: char| c.is_ascii_control() && c != '\t';
    if ua.contains(forbidden) {
        ua.replace(forbidden, " ")
    } else {
        ua
    }
}
653
654fn map_url_error(url: &str, e: crate::net::UrlError) -> Error {
655 match e {
656 crate::net::UrlError::PrivateAddress(host) => Error::AddressNotAllowed(host),
657 crate::net::UrlError::Invalid(reason) => Error::InvalidUrl {
658 url: url.into(),
659 reason,
660 },
661 }
662}
663
664fn build_crawl_options(opts: &CrawlOptions) -> crate::error::Result<crate::crawl::CrawlOptions> {
665 let seed =
666 crate::net::validate_url(&opts.url, crate::bridge::engine_policy()).map_err(|e| map_url_error(&opts.url, e))?;
667 let include = if opts.include.is_empty() {
668 None
669 } else {
670 Some(crate::scope::build_globset(&opts.include)?)
671 };
672 let exclude = if opts.exclude.is_empty() {
673 None
674 } else {
675 Some(crate::scope::build_globset(&opts.exclude)?)
676 };
677 Ok(crate::crawl::CrawlOptions {
678 seed,
679 limit: opts.limit,
680 max_depth: opts.max_depth,
681 timeout_secs: opts.timeout.as_secs().max(1),
682 settle_ms: u64::try_from(opts.settle.as_millis()).unwrap_or(u64::MAX),
683 include,
684 exclude,
685 selector: opts.selector.clone(),
686 json: opts.json,
687 user_agent: opts.user_agent.clone(),
688 })
689}
690
// Unit tests for the option builders, the `Page` extraction helpers and URL rejection.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn fetch_options_defaults() {
        let opts = FetchOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert_eq!(opts.settle, Duration::ZERO);
        assert!(matches!(opts.mode, FetchMode::Content));
    }

    #[test]
    fn fetch_options_screenshot() {
        let opts = FetchOptions::screenshot("https://example.com", true);
        assert!(matches!(opts.mode, FetchMode::Screenshot { full_page: true }));
    }

    #[test]
    fn fetch_options_javascript() {
        let opts = FetchOptions::javascript("https://example.com", "document.title");
        assert!(matches!(opts.mode, FetchMode::JavaScript(ref e) if e == "document.title"));
    }

    #[test]
    fn fetch_options_chaining() {
        let opts = FetchOptions::new("https://example.com")
            .timeout(Duration::from_secs(60))
            .settle(Duration::from_millis(500));
        assert_eq!(opts.timeout, Duration::from_secs(60));
        assert_eq!(opts.settle, Duration::from_millis(500));
    }

    #[test]
    fn crawl_options_defaults() {
        let opts = CrawlOptions::new("https://example.com");
        assert_eq!(opts.url, "https://example.com");
        assert_eq!(opts.limit, 50);
        assert_eq!(opts.max_depth, 3);
        assert_eq!(opts.timeout, Duration::from_secs(30));
        assert!(opts.include.is_empty());
        assert!(opts.exclude.is_empty());
    }

    #[test]
    fn crawl_options_chaining() {
        let opts = CrawlOptions::new("https://example.com")
            .limit(100)
            .max_depth(5)
            .timeout(Duration::from_secs(60))
            .include(&["/docs/**"])
            .exclude(&["/docs/archive/**"]);
        assert_eq!(opts.limit, 100);
        assert_eq!(opts.max_depth, 5);
        assert_eq!(opts.include, vec!["/docs/**"]);
        assert_eq!(opts.exclude, vec!["/docs/archive/**"]);
    }

    #[test]
    fn fetch_user_agent_set() {
        let opts = FetchOptions::new("https://example.com").user_agent("MyBot/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("MyBot/1.0"));
    }

    #[test]
    fn fetch_user_agent_default_is_none() {
        let opts = FetchOptions::new("https://example.com");
        assert!(opts.user_agent.is_none());
    }

    #[test]
    fn fetch_user_agent_sanitizes_crlf() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\r\nX-Evil: yes");
        // `\r` and `\n` are replaced one by one, so the pair turns into two spaces.
        assert_eq!(opts.user_agent.as_deref(), Some("Bot  X-Evil: yes"));
    }

    #[test]
    fn fetch_user_agent_sanitizes_null() {
        let opts = FetchOptions::new("https://example.com").user_agent("Bot\0/1.0");
        assert_eq!(opts.user_agent.as_deref(), Some("Bot /1.0"));
    }

    #[test]
    fn fetch_user_agent_empty_string() {
        let opts = FetchOptions::new("https://example.com").user_agent("");
        assert_eq!(opts.user_agent.as_deref(), Some(""));
    }

    #[test]
    fn crawl_user_agent_sanitizes_crlf() {
        let opts = CrawlOptions::new("https://example.com").user_agent("Crawler\r\n/2.0");
        // `\r` and `\n` are replaced one by one, so the pair turns into two spaces.
        assert_eq!(opts.user_agent.as_deref(), Some("Crawler  /2.0"));
    }

    #[test]
    fn page_markdown_from_html() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>hello world</p></body></html>".into(),
            inner_text: "hello world".into(),
            ..Page::default()
        };
        let md = page.markdown().unwrap();
        assert!(md.contains("hello world"));
    }

    #[test]
    fn page_extract_json_produces_valid_json() {
        let page = Page {
            html: "<html><head><title>Test</title></head><body><p>content</p></body></html>".into(),
            inner_text: "content".into(),
            ..Page::default()
        };
        let json = page.extract_json().unwrap();
        let _: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
    }

    #[test]
    fn page_screenshot_png_none_by_default() {
        let page = Page::default();
        assert!(page.screenshot_png().is_none());
    }

    #[test]
    fn page_markdown_with_selector_scopes_to_subtree() {
        let page = Page {
            html: "<html><body><article>keep</article><aside>drop</aside></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("https://example.com", "article").unwrap();
        assert!(md.contains("keep"));
        assert!(!md.contains("drop"));
    }

    #[test]
    fn page_extract_json_with_selector_includes_url() {
        let page = Page {
            html: "<html><body><article>scoped</article></body></html>".into(),
            ..Page::default()
        };
        let json = page
            .extract_json_with_selector("https://example.com/page", "article")
            .unwrap();
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("valid JSON");
        assert_eq!(parsed["url"].as_str(), Some("https://example.com/page"));
        assert!(parsed["text_content"].as_str().unwrap().contains("scoped"));
    }

    #[test]
    fn page_markdown_with_selector_no_match_returns_empty() {
        let page = Page {
            html: "<html><body><article>x</article></body></html>".into(),
            ..Page::default()
        };
        let md = page.markdown_with_selector("", ".nonexistent").unwrap();
        assert!(md.is_empty());
    }

    #[test]
    fn page_markdown_with_invalid_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        let err = page.markdown_with_selector("", "###invalid[[[").unwrap_err();
        assert!(err.to_string().contains("invalid CSS selector"));
    }

    #[test]
    fn page_markdown_with_empty_selector_returns_error() {
        let page = Page {
            html: "<html><body><p>x</p></body></html>".into(),
            ..Page::default()
        };
        assert!(page.markdown_with_selector("", "").is_err());
    }

    #[test]
    fn fetch_rejects_invalid_url() {
        let result = fetch(FetchOptions::new("not a url"));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, Error::InvalidUrl { .. }));
    }

    #[test]
    fn fetch_rejects_private_ip() {
        let result = fetch(FetchOptions::new("http://127.0.0.1/"));
        assert!(result.is_err());
    }

    #[test]
    fn fetch_rejects_file_scheme() {
        let result = fetch(FetchOptions::new("file:///etc/passwd"));
        assert!(result.is_err());
    }
}