Skip to main content

spider_util/
util.rs

1//! Utility functions for the `spider-lib` framework.
2//!
3//! This module provides utility functions that are used across different
4//! components of the framework.
5
6use psl::{List, Psl};
7use scraper::Selector;
8use std::fs;
9use std::path::Path;
10use url::Url;
11
12use crate::error::SpiderError;
13use crate::request::Request;
14
15/// Checks if two URLs belong to the same site.
16pub fn is_same_site(a: &Url, b: &Url) -> bool {
17    a.host_str().and_then(|h| List.domain(h.as_bytes()))
18        == b.host_str().and_then(|h| List.domain(h.as_bytes()))
19}
20
21/// Normalizes the origin of a request's URL.
22pub fn normalize_origin(request: &Request) -> String {
23    let url = &request.url;
24    let scheme = url.scheme();
25    let host = url.host_str().unwrap_or("");
26    let port = url.port_or_known_default().unwrap_or(0);
27
28    format!("{scheme}://{host}:{port}")
29}
30
31/// Validates that the parent directory of a given file path exists, creating it if necessary.
32///
33/// # Errors
34///
35/// Returns an error if the parent directory cannot be created.
36pub fn validate_output_dir(file_path: impl AsRef<Path>) -> Result<(), SpiderError> {
37    let Some(parent_dir) = file_path.as_ref().parent() else {
38        return Ok(());
39    };
40
41    if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() {
42        fs::create_dir_all(parent_dir)?;
43    }
44
45    Ok(())
46}
47
48/// Creates a directory and all of its parent components if they are missing.
49///
50/// # Errors
51///
52/// Returns an error if the directory cannot be created.
53pub fn create_dir(dir_path: impl AsRef<Path>) -> Result<(), SpiderError> {
54    fs::create_dir_all(dir_path)?;
55    Ok(())
56}
57
58/// Converts a string selector expression into a parsed [`Selector`].
59pub trait ToSelector {
60    /// Parses a string slice into a `scraper::Selector`, returning a `SpiderError` on failure.
61    ///
62    /// # Errors
63    ///
64    /// Returns [`SpiderError::HtmlParseError`] when selector parsing fails.
65    fn to_selector(&self) -> Result<Selector, SpiderError>;
66}
67
68impl ToSelector for &str {
69    fn to_selector(&self) -> Result<Selector, SpiderError> {
70        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
71    }
72}
73
74impl ToSelector for String {
75    fn to_selector(&self) -> Result<Selector, SpiderError> {
76        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
77    }
78}