Skip to main content

spider_lib/
utils.rs

1use crate::{Request, SpiderError};
2use psl::{List, Psl};
3use scraper::Selector;
4use std::fs;
5use std::path::Path;
6use url::Url;
7
8/// Normalizes the origin of a request's URL.
9pub fn normalize_origin(request: &Request) -> String {
10    let url = &request.url;
11    let scheme = url.scheme();
12    let host = url.host_str().unwrap_or("");
13    let port = url.port_or_known_default().unwrap_or(0);
14
15    format!("{scheme}://{host}:{port}")
16}
17/// Checks if two URLs belong to the same site.
18pub fn is_same_site(a: &Url, b: &Url) -> bool {
19    a.host_str().and_then(|h| List.domain(h.as_bytes()))
20        == b.host_str().and_then(|h| List.domain(h.as_bytes()))
21}
22
23/// Validates that the parent directory of a given file path exists, creating it if necessary.
24pub fn validate_output_dir(file_path: impl AsRef<Path>) -> Result<(), SpiderError> {
25    let Some(parent_dir) = file_path.as_ref().parent() else {
26        return Ok(());
27    };
28
29    if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() {
30        fs::create_dir_all(parent_dir)?;
31    }
32
33    Ok(())
34}
35
36/// Creates a directory and all of its parent components if they are missing.
37pub fn create_dir(dir_path: impl AsRef<Path>) -> Result<(), SpiderError> {
38    fs::create_dir_all(dir_path)?;
39    Ok(())
40}
41
42pub trait ToSelector {
43    /// Parses a string slice into a `scraper::Selector`, returning a `SpiderError` on failure.
44    fn to_selector(&self) -> Result<Selector, SpiderError>;
45}
46
47impl ToSelector for &str {
48    fn to_selector(&self) -> Result<Selector, SpiderError> {
49        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
50    }
51}
52
53impl ToSelector for String {
54    fn to_selector(&self) -> Result<Selector, SpiderError> {
55        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
56    }
57}