1use std::collections::HashSet;
9use url::Url;
10
11pub fn generate_url_permutations(url: &str) -> Vec<String> {
36 let mut perms = HashSet::new();
37
38 let Ok(parsed) = Url::parse(url) else {
39 return vec![url.to_string()];
40 };
41
42 for scheme in ["http", "https"] {
44 for www in [true, false] {
45 for trailing_slash in [true, false] {
46 for index_file in [None, Some("index.html"), Some("index.php")] {
47 let mut perm_url = parsed.clone();
48
49 if perm_url.set_scheme(scheme).is_err() {
51 continue;
52 }
53
54 if let Some(host) = perm_url.host_str() {
56 let new_host = if www && !host.starts_with("www.") {
57 format!("www.{}", host)
58 } else if !www && host.starts_with("www.") {
59 host.strip_prefix("www.").unwrap_or(host).to_string()
60 } else {
61 host.to_string()
62 };
63
64 if perm_url.set_host(Some(&new_host)).is_err() {
65 continue;
66 }
67 }
68
69 let mut path = perm_url.path().to_string();
71
72 if let Some(index) = index_file {
74 if !path.ends_with(index) {
75 if path.ends_with('/') {
76 path = format!("{}{}", path, index);
77 } else {
78 path = format!("{}/{}", path, index);
79 }
80 }
81 } else {
82 if path.ends_with("/index.html") {
84 path = path.strip_suffix("/index.html").unwrap_or(&path).to_string();
85 } else if path.ends_with("/index.php") {
86 path = path.strip_suffix("/index.php").unwrap_or(&path).to_string();
87 }
88 }
89
90 if trailing_slash {
92 if !path.ends_with('/') && !path.is_empty() {
93 path = format!("{}/", path);
94 }
95 } else if path.ends_with('/') && path != "/" {
96 path = path.strip_suffix('/').unwrap_or(&path).to_string();
97 }
98
99 if path.is_empty() {
101 path = "/".to_string();
102 }
103
104 perm_url.set_path(&path);
105 perms.insert(perm_url.to_string());
106 }
107 }
108 }
109 }
110
111 perms.into_iter().collect()
112}
113
114pub fn normalize_url(url: &str) -> String {
151 let Ok(mut parsed) = Url::parse(url) else {
152 return url.to_string();
153 };
154
155 if parsed.set_scheme("https").is_err() {
157 return url.to_string();
158 }
159
160 let host_str = parsed.host_str().map(|s| s.to_string());
162 if let Some(host) = host_str {
163 if host.starts_with("www.") {
164 if let Some(without_www) = host.strip_prefix("www.") {
165 if parsed.set_host(Some(without_www)).is_err() {
166 return url.to_string();
167 }
168 }
169 }
170 }
171
172 let mut path = parsed.path().to_string();
174
175 while path.len() > 1 && path.ends_with('/') {
177 path = path.strip_suffix('/').unwrap_or(&path).to_string();
178 }
179
180 if path.ends_with("/index.html") {
182 path = path.strip_suffix("/index.html").unwrap_or(&path).to_string();
183 } else if path.ends_with("/index.php") {
184 path = path.strip_suffix("/index.php").unwrap_or(&path).to_string();
185 } else if path == "index.html" || path == "index.php" {
186 path = "/".to_string();
188 }
189
190 if path.is_empty() {
192 path = "/".to_string();
193 }
194
195 parsed.set_path(&path);
196
197 let query_pairs: Vec<(String, String)> = parsed.query_pairs()
199 .map(|(k, v)| (k.to_string(), v.to_string()))
200 .collect();
201 if !query_pairs.is_empty() {
202 let mut sorted_pairs = query_pairs;
203 sorted_pairs.sort_by(|a, b| a.0.cmp(&b.0));
204
205 parsed.query_pairs_mut().clear();
206 for (key, value) in sorted_pairs {
207 parsed.query_pairs_mut().append_pair(&key, &value);
208 }
209 }
210
211 parsed.set_fragment(None);
213
214 parsed.to_string()
215}
216
/// Unit tests for `normalize_url` and `generate_url_permutations`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_url_removes_www() {
        assert_eq!(
            normalize_url("https://www.example.com/page"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://www.subdomain.example.com/page"),
            "https://subdomain.example.com/page"
        );
    }

    #[test]
    fn test_normalize_url_prefers_https() {
        assert_eq!(
            normalize_url("http://example.com/page"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("http://www.example.com/page"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_normalize_url_removes_trailing_slash() {
        assert_eq!(
            normalize_url("https://example.com/page/"),
            "https://example.com/page"
        );

        // The root path keeps its single slash.
        assert_eq!(
            normalize_url("https://example.com/"),
            "https://example.com/"
        );

        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_url_removes_index_files() {
        assert_eq!(
            normalize_url("https://example.com/page/index.html"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://example.com/page/index.php"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://example.com/index.html"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_url_sorts_query_params() {
        assert_eq!(
            normalize_url("https://example.com/page?z=1&a=2"),
            "https://example.com/page?a=2&z=1"
        );

        assert_eq!(
            normalize_url("https://example.com/page?c=3&b=2&a=1"),
            "https://example.com/page?a=1&b=2&c=3"
        );
    }

    #[test]
    fn test_normalize_url_removes_fragment() {
        assert_eq!(
            normalize_url("https://example.com/page#section"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://example.com/page?key=value#section"),
            "https://example.com/page?key=value"
        );
    }

    #[test]
    fn test_generate_permutations_count() {
        let perms = generate_url_permutations("https://example.com/page");
        assert!(
            perms.len() >= 8 && perms.len() <= 32,
            "Expected 8-32 permutations, got {}",
            perms.len()
        );
    }

    #[test]
    fn test_generate_permutations_includes_variants() {
        let perms = generate_url_permutations("https://example.com/page");

        assert!(
            perms.contains(&"http://example.com/page".to_string()),
            "Should include http variant"
        );
        assert!(
            perms.contains(&"https://www.example.com/page".to_string()),
            "Should include www variant"
        );
        assert!(
            perms.contains(&"https://example.com/page/".to_string()),
            "Should include trailing slash variant"
        );
    }

    #[test]
    fn test_normalization_idempotent() {
        let url = "https://example.com/page";
        assert_eq!(
            normalize_url(&normalize_url(url)),
            normalize_url(url),
            "Normalization should be idempotent"
        );

        let complex_url = "http://www.example.com/page/?z=1&a=2#section";
        assert_eq!(
            normalize_url(&normalize_url(complex_url)),
            normalize_url(complex_url),
            "Complex URL normalization should be idempotent"
        );
    }

    #[test]
    fn test_all_permutations_normalize_to_same() {
        let perms = generate_url_permutations("https://example.com/page");
        let normalized: Vec<_> = perms.iter().map(|p| normalize_url(p)).collect();

        let unique: HashSet<_> = normalized.iter().collect();
        // Dump the full mapping before failing so the odd one out is easy to spot.
        if unique.len() > 1 {
            eprintln!("Unique normalized URLs: {:?}", unique);
            for perm in &perms {
                eprintln!(" {} -> {}", perm, normalize_url(perm));
            }
        }

        let first = &normalized[0];
        assert!(
            normalized.iter().all(|n| n == first),
            "All permutations should normalize to the same URL. Got: {:?}",
            unique
        );
    }

    #[test]
    fn test_normalize_url_with_port() {
        assert_eq!(
            normalize_url("http://example.com:8080/page"),
            "https://example.com:8080/page"
        );

        assert_eq!(
            normalize_url("http://www.example.com:8080/page/"),
            "https://example.com:8080/page"
        );
    }

    #[test]
    fn test_normalize_url_with_userinfo() {
        let url_with_user = "http://user:pass@example.com/page";
        let normalized = normalize_url(url_with_user);

        assert!(normalized.contains("user:pass@"));
        assert!(normalized.starts_with("https://"));
    }

    #[test]
    fn test_normalize_invalid_url() {
        let invalid = "not a valid url";
        assert_eq!(
            normalize_url(invalid),
            invalid,
            "Invalid URLs should be returned as-is"
        );
    }

    #[test]
    fn test_generate_permutations_invalid_url() {
        let invalid = "not a valid url";
        let perms = generate_url_permutations(invalid);
        assert_eq!(perms.len(), 1, "Invalid URLs should return single element");
        assert_eq!(perms[0], invalid, "Invalid URLs should be returned as-is");
    }

    #[test]
    fn test_normalize_url_mixed_case() {
        // Scheme and host are lower-cased by the parser; the path keeps its case.
        assert_eq!(
            normalize_url("HTTP://WWW.EXAMPLE.COM/Page"),
            "https://example.com/Page"
        );

        let normalized = normalize_url("HTTPS://EXAMPLE.COM/MyPage");
        assert!(normalized.starts_with("https://example.com/"));
        assert!(normalized.contains("/MyPage"));
    }

    #[test]
    fn test_normalize_url_non_ascii() {
        let url = "https://example.com/café";
        let normalized = normalize_url(url);
        assert!(
            normalized.contains("caf"),
            "Should handle non-ASCII characters"
        );
    }

    #[test]
    fn test_normalize_url_empty_path() {
        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_complex_query_params() {
        let url = "https://example.com/page?name=John&age=30&city=Boston";
        let normalized = normalize_url(url);

        assert!(normalized.contains("age=30"));
        assert!(normalized.contains("city="));
        assert!(normalized.contains("name="));

        let age_pos = normalized.find("age=").unwrap();
        let city_pos = normalized.find("city=").unwrap();
        let name_pos = normalized.find("name=").unwrap();
        assert!(age_pos < city_pos, "age should come before city");
        assert!(city_pos < name_pos, "city should come before name");
    }

    #[test]
    fn test_normalize_url_preserves_subdomain() {
        assert_eq!(
            normalize_url("https://blog.example.com/page"),
            "https://blog.example.com/page"
        );

        assert_eq!(
            normalize_url("https://www.blog.example.com/page"),
            "https://blog.example.com/page"
        );
    }

    #[test]
    fn test_normalize_multiple_trailing_slashes() {
        assert_eq!(
            normalize_url("https://example.com/page///"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_permutations_with_query_params() {
        let url = "https://example.com/page?key=value";
        let perms = generate_url_permutations(url);

        assert!(perms.iter().any(|p| p.contains("key=value")));
        assert!(perms.len() >= 8);
    }

    // A wall-clock bound is not a reliable pass/fail signal in the default
    // test run (debug builds, busy CI machines), so this check is opt-in.
    #[test]
    #[ignore = "wall-clock timing depends on build profile and machine load; run with --ignored"]
    fn test_normalize_performance() {
        use std::time::Instant;

        let test_urls = vec![
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/page/index.html#section",
            "https://subdomain.example.com/path/to/page/",
        ];

        let iterations = 1000;
        let start = Instant::now();

        for _ in 0..iterations {
            for url in &test_urls {
                let _ = normalize_url(url);
            }
        }

        let elapsed = start.elapsed();
        let avg_per_url = elapsed / (iterations * test_urls.len() as u32);

        assert!(
            avg_per_url.as_micros() < 50,
            "Normalization took {}μs, expected <50μs",
            avg_per_url.as_micros()
        );
    }

    #[test]
    fn test_normalize_url_special_paths() {
        assert_eq!(
            normalize_url("https://example.com/path/with-dashes"),
            "https://example.com/path/with-dashes"
        );

        assert_eq!(
            normalize_url("https://example.com/path_with_underscores"),
            "https://example.com/path_with_underscores"
        );

        assert_eq!(
            normalize_url("https://example.com/path.with.dots"),
            "https://example.com/path.with.dots"
        );
    }

    #[test]
    fn test_normalize_url_removes_default_ports() {
        // An explicit default port is dropped when the URL is parsed, so the
        // exact output can be pinned instead of accepting either spelling.
        assert_eq!(
            normalize_url("https://example.com:443/page"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("http://example.com:80/page"),
            "https://example.com/page"
        );
    }
}
/// Print-only walkthroughs of the two public helpers. These tests make no
/// assertions; run them with `--nocapture` to read the output.
#[cfg(test)]
mod demo {
    use crate::crawler::url_normalization::{generate_url_permutations, normalize_url};

    /// Prints each sample URL next to its normalized form.
    #[test]
    fn demo_normalization() {
        println!("\n=== URL Normalization Demo ===\n");

        let samples = [
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/index.html#section",
            "https://example.com/page/index.php/",
        ];

        for input in samples {
            println!(" {} \n → {}\n", input, normalize_url(input));
        }
    }

    /// Prints the first ten permutations of one URL, then the set of distinct
    /// normalized forms that the full permutation list collapses to.
    #[test]
    fn demo_permutations() {
        println!("\n=== URL Permutations Demo ===\n");

        let base = "https://example.com/page";
        let variants = generate_url_permutations(base);
        let total = variants.len();

        println!("Base URL: {}", base);
        println!("Generated {} permutations:\n", total);

        for (position, variant) in variants.iter().take(10).enumerate() {
            println!(" {}. {}", position + 1, variant);
        }

        // Only mention a remainder when something was actually cut off.
        if total > 10 {
            println!(" ... and {} more", total - 10);
        }

        let canonical = variants
            .iter()
            .map(|variant| normalize_url(variant))
            .collect::<std::collections::HashSet<_>>();

        println!(
            "\nAll {} permutations normalize to {} unique URL(s):",
            total,
            canonical.len()
        );
        for form in canonical {
            println!(" → {}", form);
        }
    }
}