Skip to main content

spider_util/
util.rs

1//! Small utility helpers shared across the workspace.
2
3use psl::{List, Psl};
4use scraper::Selector;
5use std::fs;
6use std::path::Path;
7use url::Url;
8
9use crate::error::SpiderError;
10use crate::request::Request;
11
12/// Checks if two URLs belong to the same site.
13pub fn is_same_site(a: &Url, b: &Url) -> bool {
14    a.host_str().and_then(|h| List.domain(h.as_bytes()))
15        == b.host_str().and_then(|h| List.domain(h.as_bytes()))
16}
17
18/// Normalizes the origin of a request's URL.
19pub fn normalize_origin(request: &Request) -> String {
20    let url = &request.url;
21    let scheme = url.scheme();
22    let host = url.host_str().unwrap_or("");
23    let port = url.port_or_known_default().unwrap_or(0);
24
25    format!("{scheme}://{host}:{port}")
26}
27
28/// Validates that the parent directory of a given file path exists, creating it if necessary.
29///
30/// # Errors
31///
32/// Returns an error if the parent directory cannot be created.
33pub fn validate_output_dir(file_path: impl AsRef<Path>) -> Result<(), SpiderError> {
34    let Some(parent_dir) = file_path.as_ref().parent() else {
35        return Ok(());
36    };
37
38    if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() {
39        fs::create_dir_all(parent_dir)?;
40    }
41
42    Ok(())
43}
44
45/// Creates a directory and all of its parent components if they are missing.
46///
47/// # Errors
48///
49/// Returns an error if the directory cannot be created.
50pub fn create_dir(dir_path: impl AsRef<Path>) -> Result<(), SpiderError> {
51    fs::create_dir_all(dir_path)?;
52    Ok(())
53}
54
55/// Converts a string selector expression into a parsed [`Selector`].
56pub trait ToSelector {
57    /// Parses a string slice into a `scraper::Selector`, returning a `SpiderError` on failure.
58    ///
59    /// # Errors
60    ///
61    /// Returns [`SpiderError::HtmlParseError`] when selector parsing fails.
62    fn to_selector(&self) -> Result<Selector, SpiderError>;
63}
64
65impl ToSelector for &str {
66    fn to_selector(&self) -> Result<Selector, SpiderError> {
67        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
68    }
69}
70
71impl ToSelector for String {
72    fn to_selector(&self) -> Result<Selector, SpiderError> {
73        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
74    }
75}