// essence/utils/mod.rs

1pub mod dns_cache;
2pub mod etld;
3pub mod retry;
4pub mod robots;
5pub mod robots_enhanced;
6pub mod ssrf_protection;
7pub mod url_rewrites;
8pub mod user_agents;
9
10use url::Url;
11
12/// Normalize URL to prevent duplicates from trailing slashes, fragments, etc.
13///
14/// This function:
15/// - Removes ALL trailing slashes (including root path)
16/// - Removes fragments (#anchors)
17/// - Lowercases the scheme and host
18/// - Preserves query parameters
19///
20/// # Examples
21///
22/// ```
23/// use essence::utils::normalize_url_string;
24///
25/// assert_eq!(
26///     normalize_url_string("https://example.com/").unwrap(),
27///     "https://example.com"
28/// );
29/// assert_eq!(
30///     normalize_url_string("https://example.com/page/").unwrap(),
31///     "https://example.com/page"
32/// );
33/// assert_eq!(
34///     normalize_url_string("https://example.com/page#section").unwrap(),
35///     "https://example.com/page"
36/// );
37/// ```
38pub fn normalize_url_string(url_str: &str) -> Result<String, String> {
39    let mut url = Url::parse(url_str).map_err(|e| format!("Invalid URL: {}", e))?;
40
41    // Remove fragment (#anchors)
42    url.set_fragment(None);
43
44    // Remove trailing slash from path (for non-root paths)
45    let path = url.path().to_string();
46    if path.len() > 1 && path.ends_with('/') {
47        url.set_path(&path[..path.len() - 1]);
48    }
49
50    // Serialize and remove trailing slash even for root path
51    // This ensures "https://example.com" and "https://example.com/" are the same
52    let mut normalized = url.to_string();
53    if normalized.ends_with('/') && !normalized.ends_with("://") {
54        normalized.pop();
55    }
56
57    Ok(normalized)
58}
59
60/// Parse and normalize URL (legacy function for backward compatibility)
61pub fn normalize_url(url_str: &str) -> Result<Url, String> {
62    let normalized = normalize_url_string(url_str)?;
63    Url::parse(&normalized).map_err(|e| format!("Invalid URL: {}", e))
64}
65
66/// Extract domain from URL
67pub fn extract_domain(url: &Url) -> Option<String> {
68    url.host_str().map(|s| s.to_string())
69}
70
71/// Check if URL is valid for scraping
72pub fn is_valid_scrape_url(url: &Url) -> bool {
73    matches!(url.scheme(), "http" | "https")
74}
75
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each `(input, expected)` pair normalizes to `expected`.
    fn assert_normalizes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize_url_string(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_normalize_url_trailing_slash() {
        assert_normalizes(&[
            // Root path: trailing slash is removed for consistency
            ("https://example.com/", "https://example.com"),
            ("https://example.com", "https://example.com"),
            // Non-root paths: trailing slash is removed
            ("https://example.com/page/", "https://example.com/page"),
            ("https://example.com/blog/post/", "https://example.com/blog/post"),
        ]);
    }

    #[test]
    fn test_normalize_url_fragment() {
        assert_normalizes(&[
            ("https://example.com#section", "https://example.com"),
            ("https://example.com/page#section", "https://example.com/page"),
            ("https://example.com/page/#section", "https://example.com/page"),
        ]);
    }

    #[test]
    fn test_normalize_url_query_params() {
        // Query parameters must survive normalization
        assert_normalizes(&[
            (
                "https://example.com/page?key=value",
                "https://example.com/page?key=value",
            ),
            (
                "https://example.com/page/?key=value",
                "https://example.com/page?key=value",
            ),
        ]);
    }

    #[test]
    fn test_normalize_url_scheme_and_host() {
        // Scheme and host are lowercased; the path keeps its case
        assert_normalizes(&[("HTTPS://EXAMPLE.COM/PAGE", "https://example.com/PAGE")]);
    }

    #[test]
    fn test_normalize_url_deduplication_case() {
        // The critical deduplication case from quotes.toscrape.com
        let without_slash = normalize_url_string("https://quotes.toscrape.com").unwrap();
        let with_slash = normalize_url_string("https://quotes.toscrape.com/").unwrap();

        assert_eq!(
            without_slash, with_slash,
            "URLs with and without trailing slash should normalize to the same value"
        );
    }

    #[test]
    fn test_normalize_url_invalid() {
        for input in ["not a url", ""].iter() {
            assert!(normalize_url_string(input).is_err());
        }
    }
}
// essence/utils/mod.rs