Skip to main content

spider_lib/
utils.rs

1//! General utility functions and helper traits for the `spider-lib` framework.
2//!
3//! This module provides a collection of miscellaneous functions and extensions
4//! that are used across different components of the `spider-lib`. These utilities
5//! aim to simplify common tasks such as URL manipulation, file system operations,
6//! and HTML selector parsing.
7//!
8//! Key functionalities include:
9//! - Normalizing URL origins and checking same-site policies.
10//! - Ensuring the existence of directories for output files.
11//! - Conveniently converting strings into `scraper::Selector` instances.
12
13use crate::{Request, SpiderError};
14use psl::{List, Psl};
15use scraper::Selector;
16use std::fs;
17use std::path::Path;
18use url::Url;
19
20/// Normalizes the origin of a request's URL.
21pub fn normalize_origin(request: &Request) -> String {
22    let url = &request.url;
23    let scheme = url.scheme();
24    let host = url.host_str().unwrap_or("");
25    let port = url.port_or_known_default().unwrap_or(0);
26
27    format!("{scheme}://{host}:{port}")
28}
29/// Checks if two URLs belong to the same site.
30pub fn is_same_site(a: &Url, b: &Url) -> bool {
31    a.host_str().and_then(|h| List.domain(h.as_bytes()))
32        == b.host_str().and_then(|h| List.domain(h.as_bytes()))
33}
34
35/// Validates that the parent directory of a given file path exists, creating it if necessary.
36pub fn validate_output_dir(file_path: impl AsRef<Path>) -> Result<(), SpiderError> {
37    let Some(parent_dir) = file_path.as_ref().parent() else {
38        return Ok(());
39    };
40
41    if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() {
42        fs::create_dir_all(parent_dir)?;
43    }
44
45    Ok(())
46}
47
48/// Creates a directory and all of its parent components if they are missing.
49pub fn create_dir(dir_path: impl AsRef<Path>) -> Result<(), SpiderError> {
50    fs::create_dir_all(dir_path)?;
51    Ok(())
52}
53
/// Extension trait for turning string-like values into a `scraper::Selector`.
///
/// Implemented in this module for `&str` and `String`, so a selector can be
/// built by calling `.to_selector()` directly on the selector text.
pub trait ToSelector {
    /// Parses a string slice into a `scraper::Selector`, returning a `SpiderError` on failure.
    ///
    /// # Errors
    ///
    /// Returns a `SpiderError` when the text cannot be parsed as a selector.
    fn to_selector(&self) -> Result<Selector, SpiderError>;
}
58
59impl ToSelector for &str {
60    fn to_selector(&self) -> Result<Selector, SpiderError> {
61        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
62    }
63}
64
65impl ToSelector for String {
66    fn to_selector(&self) -> Result<Selector, SpiderError> {
67        Selector::parse(self).map_err(|e| SpiderError::HtmlParseError(e.to_string()))
68    }
69}