1use crate::{Request, SpiderError};
14use psl::{List, Psl};
15use scraper::Selector;
16use std::fs;
17use std::path::Path;
18use url::Url;
19
20mod bloom_filter;
21pub use bloom_filter::BloomFilter;
22
23pub fn normalize_origin(request: &Request) -> String {
25 let url = &request.url;
26 let scheme = url.scheme();
27 let host = url.host_str().unwrap_or("");
28 let port = url.port_or_known_default().unwrap_or(0);
29
30 format!("{scheme}://{host}:{port}")
31}
32pub fn is_same_site(a: &Url, b: &Url) -> bool {
34 a.host_str().and_then(|h| List.domain(h.as_bytes()))
35 == b.host_str().and_then(|h| List.domain(h.as_bytes()))
36}
37
38pub fn validate_output_dir(file_path: impl AsRef<Path>) -> Result<(), SpiderError> {
40 let Some(parent_dir) = file_path.as_ref().parent() else {
41 return Ok(());
42 };
43
44 if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() {
45 fs::create_dir_all(parent_dir)?;
46 }
47
48 Ok(())
49}
50
51pub fn create_dir(dir_path: impl AsRef<Path>) -> Result<(), SpiderError> {
53 fs::create_dir_all(dir_path)?;
54 Ok(())
55}
56
/// Conversion of a value into a parsed [`Selector`].
pub trait ToSelector {
    /// Converts `self` into a [`Selector`].
    ///
    /// # Errors
    ///
    /// Returns a [`SpiderError`] when the conversion fails.
    fn to_selector(&self) -> Result<Selector, SpiderError>;
}
61
62impl ToSelector for &str {
63 fn to_selector(&self) -> Result<Selector, SpiderError> {
64 Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
65 }
66}
67
68impl ToSelector for String {
69 fn to_selector(&self) -> Result<Selector, SpiderError> {
70 Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
71 }
72}