essence/crawler/
url_normalization.rs

//! URL Normalization and Permutation Generation for Crawl Deduplication
//!
//! This module provides comprehensive URL normalization to prevent duplicate scraping
//! of the same URL with different permutations (www/non-www, http/https, trailing slash, etc.).
//!
//! Expected impact: 5-10% crawl efficiency improvement by reducing duplicate requests.
7
8use std::collections::HashSet;
9use url::Url;
10
11/// Generate all URL permutations for deduplication (returns ~16 variations)
12///
13/// This function generates common URL variations that should be treated as duplicates:
14/// - http vs https
15/// - www vs non-www
16/// - trailing slash vs no trailing slash
17/// - index.html, index.php removal
18///
19/// # Arguments
20/// * `url` - The base URL to generate permutations for
21///
22/// # Returns
23/// A vector of all permutation strings. Invalid URLs return a single-element vector.
24///
25/// # Examples
26///
27/// ```
28/// use essence::crawler::url_normalization::generate_url_permutations;
29///
30/// let perms = generate_url_permutations("https://example.com/page");
31/// assert!(perms.len() >= 8);
32/// assert!(perms.contains(&"http://example.com/page".to_string()));
33/// assert!(perms.contains(&"https://www.example.com/page".to_string()));
34/// ```
35pub fn generate_url_permutations(url: &str) -> Vec<String> {
36    let mut perms = HashSet::new();
37
38    let Ok(parsed) = Url::parse(url) else {
39        return vec![url.to_string()];
40    };
41
42    // Base variations
43    for scheme in ["http", "https"] {
44        for www in [true, false] {
45            for trailing_slash in [true, false] {
46                for index_file in [None, Some("index.html"), Some("index.php")] {
47                    let mut perm_url = parsed.clone();
48                    
49                    // Set scheme
50                    if perm_url.set_scheme(scheme).is_err() {
51                        continue;
52                    }
53
54                    // Add/remove www
55                    if let Some(host) = perm_url.host_str() {
56                        let new_host = if www && !host.starts_with("www.") {
57                            format!("www.{}", host)
58                        } else if !www && host.starts_with("www.") {
59                            host.strip_prefix("www.").unwrap_or(host).to_string()
60                        } else {
61                            host.to_string()
62                        };
63                        
64                        if perm_url.set_host(Some(&new_host)).is_err() {
65                            continue;
66                        }
67                    }
68
69                    // Get the current path
70                    let mut path = perm_url.path().to_string();
71                    
72                    // Add/remove index files
73                    if let Some(index) = index_file {
74                        if !path.ends_with(index) {
75                            if path.ends_with('/') {
76                                path = format!("{}{}", path, index);
77                            } else {
78                                path = format!("{}/{}", path, index);
79                            }
80                        }
81                    } else {
82                        // Remove index files if present
83                        if path.ends_with("/index.html") {
84                            path = path.strip_suffix("/index.html").unwrap_or(&path).to_string();
85                        } else if path.ends_with("/index.php") {
86                            path = path.strip_suffix("/index.php").unwrap_or(&path).to_string();
87                        }
88                    }
89                    
90                    // Add/remove trailing slash
91                    if trailing_slash {
92                        if !path.ends_with('/') && !path.is_empty() {
93                            path = format!("{}/", path);
94                        }
95                    } else if path.ends_with('/') && path != "/" {
96                        path = path.strip_suffix('/').unwrap_or(&path).to_string();
97                    }
98                    
99                    // Ensure path is not empty
100                    if path.is_empty() {
101                        path = "/".to_string();
102                    }
103
104                    perm_url.set_path(&path);
105                    perms.insert(perm_url.to_string());
106                }
107            }
108        }
109    }
110
111    perms.into_iter().collect()
112}
113
114/// Normalize URL to canonical form for deduplication
115///
116/// Canonical form rules:
117/// 1. Always HTTPS (prefer secure)
118/// 2. Remove www. prefix
119/// 3. Remove trailing slash (except for root /)
120/// 4. Remove index.html/index.php
121/// 5. Sort query parameters alphabetically
122/// 6. Remove fragment (#)
123///
124/// # Arguments
125/// * `url` - The URL to normalize
126///
127/// # Returns
128/// The normalized canonical URL string. Returns original string if parsing fails.
129///
130/// # Examples
131///
132/// ```
133/// use essence::crawler::url_normalization::normalize_url;
134///
135/// assert_eq!(
136///     normalize_url("http://www.example.com/page/"),
137///     "https://example.com/page"
138/// );
139///
140/// assert_eq!(
141///     normalize_url("https://example.com/page/index.html"),
142///     "https://example.com/page"
143/// );
144///
145/// assert_eq!(
146///     normalize_url("https://example.com/page?z=1&a=2"),
147///     "https://example.com/page?a=2&z=1"
148/// );
149/// ```
150pub fn normalize_url(url: &str) -> String {
151    let Ok(mut parsed) = Url::parse(url) else {
152        return url.to_string();
153    };
154
155    // 1. Always HTTPS (prefer secure)
156    if parsed.set_scheme("https").is_err() {
157        return url.to_string();
158    }
159
160    // 2. Remove www. prefix
161    let host_str = parsed.host_str().map(|s| s.to_string());
162    if let Some(host) = host_str {
163        if host.starts_with("www.") {
164            if let Some(without_www) = host.strip_prefix("www.") {
165                if parsed.set_host(Some(without_www)).is_err() {
166                    return url.to_string();
167                }
168            }
169        }
170    }
171
172    // 3. Get path and normalize
173    let mut path = parsed.path().to_string();
174
175    // 4. First remove all trailing slashes (except for root)
176    while path.len() > 1 && path.ends_with('/') {
177        path = path.strip_suffix('/').unwrap_or(&path).to_string();
178    }
179
180    // 5. Then remove index.html/index.php (after trailing slashes are gone)
181    if path.ends_with("/index.html") {
182        path = path.strip_suffix("/index.html").unwrap_or(&path).to_string();
183    } else if path.ends_with("/index.php") {
184        path = path.strip_suffix("/index.php").unwrap_or(&path).to_string();
185    } else if path == "index.html" || path == "index.php" {
186        // Special case: root index files
187        path = "/".to_string();
188    }
189
190    // Ensure path is not empty
191    if path.is_empty() {
192        path = "/".to_string();
193    }
194
195    parsed.set_path(&path);
196
197    // 6. Sort query parameters alphabetically
198    let query_pairs: Vec<(String, String)> = parsed.query_pairs()
199        .map(|(k, v)| (k.to_string(), v.to_string()))
200        .collect();
201    if !query_pairs.is_empty() {
202        let mut sorted_pairs = query_pairs;
203        sorted_pairs.sort_by(|a, b| a.0.cmp(&b.0));
204
205        parsed.query_pairs_mut().clear();
206        for (key, value) in sorted_pairs {
207            parsed.query_pairs_mut().append_pair(&key, &value);
208        }
209    }
210
211    // 7. Remove fragment
212    parsed.set_fragment(None);
213
214    parsed.to_string()
215}
216
/// Unit tests for `normalize_url` and `generate_url_permutations`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_url_removes_www() {
        assert_eq!(
            normalize_url("https://www.example.com/page"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://www.subdomain.example.com/page"),
            "https://subdomain.example.com/page"
        );
    }

    #[test]
    fn test_normalize_url_prefers_https() {
        assert_eq!(
            normalize_url("http://example.com/page"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("http://www.example.com/page"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_normalize_url_removes_trailing_slash() {
        assert_eq!(
            normalize_url("https://example.com/page/"),
            "https://example.com/page"
        );

        // But keep for root
        assert_eq!(
            normalize_url("https://example.com/"),
            "https://example.com/"
        );

        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_url_removes_index_files() {
        assert_eq!(
            normalize_url("https://example.com/page/index.html"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://example.com/page/index.php"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://example.com/index.html"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_url_sorts_query_params() {
        assert_eq!(
            normalize_url("https://example.com/page?z=1&a=2"),
            "https://example.com/page?a=2&z=1"
        );

        assert_eq!(
            normalize_url("https://example.com/page?c=3&b=2&a=1"),
            "https://example.com/page?a=1&b=2&c=3"
        );
    }

    #[test]
    fn test_normalize_url_removes_fragment() {
        assert_eq!(
            normalize_url("https://example.com/page#section"),
            "https://example.com/page"
        );

        assert_eq!(
            normalize_url("https://example.com/page?key=value#section"),
            "https://example.com/page?key=value"
        );
    }

    #[test]
    fn test_generate_permutations_count() {
        let perms = generate_url_permutations("https://example.com/page");
        // 2 schemes × 2 www × 2 trailing slash × 3 index files = 24 potential combinations,
        // some of which may be deduplicated.
        assert!(
            perms.len() >= 8 && perms.len() <= 24,
            "Expected 8-24 permutations, got {}",
            perms.len()
        );
    }

    #[test]
    fn test_generate_permutations_includes_variants() {
        let perms = generate_url_permutations("https://example.com/page");

        // Should include various combinations
        assert!(
            perms.contains(&"http://example.com/page".to_string()),
            "Should include http variant"
        );
        assert!(
            perms.contains(&"https://www.example.com/page".to_string()),
            "Should include www variant"
        );
        assert!(
            perms.contains(&"https://example.com/page/".to_string()),
            "Should include trailing slash variant"
        );
    }

    #[test]
    fn test_normalization_idempotent() {
        let url = "https://example.com/page";
        assert_eq!(
            normalize_url(&normalize_url(url)),
            normalize_url(url),
            "Normalization should be idempotent"
        );

        let complex_url = "http://www.example.com/page/?z=1&a=2#section";
        assert_eq!(
            normalize_url(&normalize_url(complex_url)),
            normalize_url(complex_url),
            "Complex URL normalization should be idempotent"
        );
    }

    #[test]
    fn test_all_permutations_normalize_to_same() {
        let perms = generate_url_permutations("https://example.com/page");
        assert!(!perms.is_empty(), "Expected at least one permutation");

        let unique: HashSet<String> = perms.iter().map(|p| normalize_url(p)).collect();

        // On failure, show which permutation maps to which canonical form.
        if unique.len() > 1 {
            eprintln!("Unique normalized URLs: {:?}", unique);
            for perm in &perms {
                eprintln!("  {} -> {}", perm, normalize_url(perm));
            }
        }

        // All permutations should normalize to the same canonical form
        assert_eq!(
            unique.len(),
            1,
            "All permutations should normalize to the same URL. Got: {:?}",
            unique
        );
    }

    #[test]
    fn test_normalize_url_with_port() {
        assert_eq!(
            normalize_url("http://example.com:8080/page"),
            "https://example.com:8080/page"
        );

        assert_eq!(
            normalize_url("http://www.example.com:8080/page/"),
            "https://example.com:8080/page"
        );
    }

    #[test]
    fn test_normalize_url_with_userinfo() {
        // URL with userinfo (rare but valid): userinfo is preserved while the
        // scheme is still upgraded.
        assert_eq!(
            normalize_url("http://user:pass@example.com/page"),
            "https://user:pass@example.com/page"
        );
    }

    #[test]
    fn test_normalize_invalid_url() {
        let invalid = "not a valid url";
        assert_eq!(
            normalize_url(invalid),
            invalid,
            "Invalid URLs should be returned as-is"
        );
    }

    #[test]
    fn test_generate_permutations_invalid_url() {
        let invalid = "not a valid url";
        let perms = generate_url_permutations(invalid);
        assert_eq!(perms.len(), 1, "Invalid URLs should return single element");
        assert_eq!(perms[0], invalid, "Invalid URLs should be returned as-is");
    }

    #[test]
    fn test_normalize_url_mixed_case() {
        assert_eq!(
            normalize_url("HTTP://WWW.EXAMPLE.COM/Page"),
            "https://example.com/Page"
        );

        // Host should be lowercase, path should preserve case
        assert_eq!(
            normalize_url("HTTPS://EXAMPLE.COM/MyPage"),
            "https://example.com/MyPage"
        );
    }

    #[test]
    fn test_normalize_url_non_ascii() {
        // Non-ASCII characters in the path are percent-encoded as UTF-8 by the url crate.
        assert_eq!(
            normalize_url("https://example.com/café"),
            "https://example.com/caf%C3%A9",
            "Should handle non-ASCII characters"
        );
    }

    #[test]
    fn test_normalize_url_empty_path() {
        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_complex_query_params() {
        // Several parameters: all are preserved and reordered by key
        // (age < city < name alphabetically).
        assert_eq!(
            normalize_url("https://example.com/page?name=John&age=30&city=Boston"),
            "https://example.com/page?age=30&city=Boston&name=John"
        );
    }

    #[test]
    fn test_normalize_url_preserves_subdomain() {
        assert_eq!(
            normalize_url("https://blog.example.com/page"),
            "https://blog.example.com/page"
        );

        assert_eq!(
            normalize_url("https://www.blog.example.com/page"),
            "https://blog.example.com/page"
        );
    }

    #[test]
    fn test_normalize_multiple_trailing_slashes() {
        // Edge case: multiple trailing slashes
        assert_eq!(
            normalize_url("https://example.com/page///"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_permutations_with_query_params() {
        let url = "https://example.com/page?key=value";
        let perms = generate_url_permutations(url);

        // Should generate permutations while preserving query params
        assert!(perms.iter().any(|p| p.contains("key=value")));
        assert!(perms.len() >= 8);
    }

    #[test]
    fn test_normalize_performance() {
        // Smoke check that normalization stays cheap. The design target is <10μs per
        // URL; the asserted budget is 50μs to leave headroom for debug builds and
        // noisy CI machines.
        use std::time::Instant;

        let test_urls = [
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/page/index.html#section",
            "https://subdomain.example.com/path/to/page/",
        ];

        let iterations: u32 = 1000;
        let start = Instant::now();

        for _ in 0..iterations {
            for url in &test_urls {
                let _ = normalize_url(url);
            }
        }

        let elapsed = start.elapsed();
        // The URL list has four entries, so the cast cannot truncate.
        let total_calls = iterations * test_urls.len() as u32;
        let avg_per_url = elapsed / total_calls;

        assert!(
            avg_per_url.as_micros() < 50,
            "Normalization took {}μs, expected <50μs",
            avg_per_url.as_micros()
        );
    }

    #[test]
    fn test_normalize_url_special_paths() {
        // Test with special path characters
        assert_eq!(
            normalize_url("https://example.com/path/with-dashes"),
            "https://example.com/path/with-dashes"
        );

        assert_eq!(
            normalize_url("https://example.com/path_with_underscores"),
            "https://example.com/path_with_underscores"
        );

        assert_eq!(
            normalize_url("https://example.com/path.with.dots"),
            "https://example.com/path.with.dots"
        );
    }

    #[test]
    fn test_normalize_url_removes_default_ports() {
        // Port 443 is the default for HTTPS and is dropped by the url crate on parse.
        // The previous assertion also accepted the port being kept, so it could not fail.
        assert_eq!(
            normalize_url("https://example.com:443/page"),
            "https://example.com/page"
        );
    }
}
/// Printed walkthroughs of the public API; run with `cargo test -- --nocapture`
/// to see the output.
#[cfg(test)]
mod demo {
    // `super` keeps these imports valid wherever this module is mounted in the crate.
    use super::{generate_url_permutations, normalize_url};

    /// Prints a handful of URLs next to their canonical form.
    #[test]
    fn demo_normalization() {
        println!("\n=== URL Normalization Demo ===\n");

        let test_cases = [
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/index.html#section",
            "https://example.com/page/index.php/",
        ];

        for url in test_cases {
            let normalized = normalize_url(url);
            println!("  {} \n    → {}\n", url, normalized);
        }
    }

    /// Prints the permutations of one URL and checks that they share a canonical form.
    #[test]
    fn demo_permutations() {
        println!("\n=== URL Permutations Demo ===\n");

        let url = "https://example.com/page";
        let perms = generate_url_permutations(url);

        println!("Base URL: {}", url);
        println!("Generated {} permutations:\n", perms.len());

        for (i, perm) in perms.iter().enumerate().take(10) {
            println!("  {}. {}", i + 1, perm);
        }

        if perms.len() > 10 {
            println!("  ... and {} more", perms.len() - 10);
        }

        // Show they all normalize to same
        let normalized: std::collections::HashSet<_> =
            perms.iter().map(|p| normalize_url(p)).collect();

        println!(
            "\nAll {} permutations normalize to {} unique URL(s):",
            perms.len(),
            normalized.len()
        );
        for norm in &normalized {
            println!("  → {}", norm);
        }

        // Enforce the claim printed above instead of only displaying it.
        assert_eq!(
            normalized.len(),
            1,
            "All permutations should normalize to the same URL. Got: {:?}",
            normalized
        );
    }
}
essence/crawler/url_normalization.rs

essence/crawler/
url_normalization.rs