1pub mod dns_cache;
2pub mod etld;
3pub mod retry;
4pub mod robots;
5pub mod robots_enhanced;
6pub mod ssrf_protection;
7pub mod url_rewrites;
8pub mod user_agents;
9
10use url::Url;
11
12pub fn normalize_url_string(url_str: &str) -> Result<String, String> {
39 let mut url = Url::parse(url_str).map_err(|e| format!("Invalid URL: {}", e))?;
40
41 url.set_fragment(None);
43
44 let path = url.path().to_string();
46 if path.len() > 1 && path.ends_with('/') {
47 url.set_path(&path[..path.len() - 1]);
48 }
49
50 let mut normalized = url.to_string();
53 if normalized.ends_with('/') && !normalized.ends_with("://") {
54 normalized.pop();
55 }
56
57 Ok(normalized)
58}
59
60pub fn normalize_url(url_str: &str) -> Result<Url, String> {
62 let normalized = normalize_url_string(url_str)?;
63 Url::parse(&normalized).map_err(|e| format!("Invalid URL: {}", e))
64}
65
66pub fn extract_domain(url: &Url) -> Option<String> {
68 url.host_str().map(|s| s.to_string())
69}
70
71pub fn is_valid_scrape_url(url: &Url) -> bool {
73 matches!(url.scheme(), "http" | "https")
74}
75
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that each `(input, expected)` pair normalizes to `expected`.
    fn assert_normalizes(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize_url_string(input).unwrap(), expected);
        }
    }

    /// Trailing slashes are removed from both root and nested paths.
    #[test]
    fn test_normalize_url_trailing_slash() {
        assert_normalizes(&[
            ("https://example.com/", "https://example.com"),
            ("https://example.com", "https://example.com"),
            ("https://example.com/page/", "https://example.com/page"),
            ("https://example.com/blog/post/", "https://example.com/blog/post"),
        ]);
    }

    /// Fragments are dropped, with or without a trailing slash before them.
    #[test]
    fn test_normalize_url_fragment() {
        assert_normalizes(&[
            ("https://example.com#section", "https://example.com"),
            ("https://example.com/page#section", "https://example.com/page"),
            ("https://example.com/page/#section", "https://example.com/page"),
        ]);
    }

    /// Query parameters survive; only the path's trailing slash is removed.
    #[test]
    fn test_normalize_url_query_params() {
        assert_normalizes(&[
            (
                "https://example.com/page?key=value",
                "https://example.com/page?key=value",
            ),
            (
                "https://example.com/page/?key=value",
                "https://example.com/page?key=value",
            ),
        ]);
    }

    /// Scheme and host are lowercased while the path keeps its case.
    #[test]
    fn test_normalize_url_scheme_and_host() {
        assert_normalizes(&[("HTTPS://EXAMPLE.COM/PAGE", "https://example.com/PAGE")]);
    }

    /// The same site with and without a trailing slash must deduplicate.
    #[test]
    fn test_normalize_url_deduplication_case() {
        let without_slash = normalize_url_string("https://quotes.toscrape.com").unwrap();
        let with_slash = normalize_url_string("https://quotes.toscrape.com/").unwrap();

        assert_eq!(
            without_slash, with_slash,
            "URLs with and without trailing slash should normalize to the same value"
        );
    }

    /// Unparseable inputs are reported as errors rather than panicking.
    #[test]
    fn test_normalize_url_invalid() {
        for input in ["not a url", ""] {
            assert!(normalize_url_string(input).is_err());
        }
    }
}