essence-engine 0.2.0

//! URL Normalization and Permutation Generation for Crawl Deduplication
//!
//! This module provides comprehensive URL normalization to prevent duplicate scraping
//! of the same URL with different permutations (www/non-www, http/https, trailing slash, etc.).
//!
//! Expected impact: 5-10% crawl efficiency improvement by reducing duplicate requests.

use std::collections::HashSet;
use url::Url;

/// Generate all URL permutations for deduplication (returns ~16 variations)
///
/// This function generates common URL variations that should be treated as duplicates:
/// - http vs https
/// - www vs non-www
/// - trailing slash vs no trailing slash
/// - index.html, index.php removal
///
/// # Arguments
/// * `url` - The base URL to generate permutations for
///
/// # Returns
/// A vector of all permutation strings. Invalid URLs return a single-element vector.
///
/// # Examples
///
/// ```
/// use essence::crawler::url_normalization::generate_url_permutations;
///
/// let perms = generate_url_permutations("https://example.com/page");
/// assert!(perms.len() >= 8);
/// assert!(perms.contains(&"http://example.com/page".to_string()));
/// assert!(perms.contains(&"https://www.example.com/page".to_string()));
/// ```
pub fn generate_url_permutations(url: &str) -> Vec<String> {
    let mut perms = HashSet::new();

    let Ok(parsed) = Url::parse(url) else {
        return vec![url.to_string()];
    };

    // Base variations
    for scheme in ["http", "https"] {
        for www in [true, false] {
            for trailing_slash in [true, false] {
                for index_file in [None, Some("index.html"), Some("index.php")] {
                    let mut perm_url = parsed.clone();
                    
                    // Set scheme
                    if perm_url.set_scheme(scheme).is_err() {
                        continue;
                    }

                    // Add/remove www
                    if let Some(host) = perm_url.host_str() {
                        let new_host = if www && !host.starts_with("www.") {
                            format!("www.{}", host)
                        } else if !www && host.starts_with("www.") {
                            host.strip_prefix("www.").unwrap_or(host).to_string()
                        } else {
                            host.to_string()
                        };
                        
                        if perm_url.set_host(Some(&new_host)).is_err() {
                            continue;
                        }
                    }

                    // Get the current path
                    let mut path = perm_url.path().to_string();
                    
                    // Add/remove index files
                    if let Some(index) = index_file {
                        if !path.ends_with(index) {
                            if path.ends_with('/') {
                                path = format!("{}{}", path, index);
                            } else {
                                path = format!("{}/{}", path, index);
                            }
                        }
                    } else {
                        // Remove index files if present
                        if path.ends_with("/index.html") {
                            path = path.strip_suffix("/index.html").unwrap_or(&path).to_string();
                        } else if path.ends_with("/index.php") {
                            path = path.strip_suffix("/index.php").unwrap_or(&path).to_string();
                        }
                    }
                    
                    // Add/remove trailing slash
                    if trailing_slash {
                        if !path.ends_with('/') && !path.is_empty() {
                            path = format!("{}/", path);
                        }
                    } else if path.ends_with('/') && path != "/" {
                        path = path.strip_suffix('/').unwrap_or(&path).to_string();
                    }
                    
                    // Ensure path is not empty
                    if path.is_empty() {
                        path = "/".to_string();
                    }

                    perm_url.set_path(&path);
                    perms.insert(perm_url.to_string());
                }
            }
        }
    }

    perms.into_iter().collect()
}

/// Normalize URL to canonical form for deduplication
///
/// Canonical form rules:
/// 1. Always HTTPS (prefer secure)
/// 2. Remove www. prefix
/// 3. Remove trailing slash (except for root /)
/// 4. Remove index.html/index.php
/// 5. Sort query parameters alphabetically
/// 6. Remove fragment (#)
///
/// # Arguments
/// * `url` - The URL to normalize
///
/// # Returns
/// The normalized canonical URL string. Returns original string if parsing fails.
///
/// # Examples
///
/// ```
/// use essence::crawler::url_normalization::normalize_url;
///
/// assert_eq!(
///     normalize_url("http://www.example.com/page/"),
///     "https://example.com/page"
/// );
///
/// assert_eq!(
///     normalize_url("https://example.com/page/index.html"),
///     "https://example.com/page"
/// );
///
/// assert_eq!(
///     normalize_url("https://example.com/page?z=1&a=2"),
///     "https://example.com/page?a=2&z=1"
/// );
/// ```
pub fn normalize_url(url: &str) -> String {
    let Ok(mut parsed) = Url::parse(url) else {
        return url.to_string();
    };

    // 1. Always HTTPS (prefer secure)
    if parsed.set_scheme("https").is_err() {
        return url.to_string();
    }

    // 2. Remove www. prefix
    let host_str = parsed.host_str().map(|s| s.to_string());
    if let Some(host) = host_str {
        if host.starts_with("www.") {
            if let Some(without_www) = host.strip_prefix("www.") {
                if parsed.set_host(Some(without_www)).is_err() {
                    return url.to_string();
                }
            }
        }
    }

    // 3. Get path and normalize
    let mut path = parsed.path().to_string();

    // 4. First remove all trailing slashes (except for root)
    while path.len() > 1 && path.ends_with('/') {
        path = path.strip_suffix('/').unwrap_or(&path).to_string();
    }

    // 5. Then remove index.html/index.php (after trailing slashes are gone)
    if path.ends_with("/index.html") {
        path = path.strip_suffix("/index.html").unwrap_or(&path).to_string();
    } else if path.ends_with("/index.php") {
        path = path.strip_suffix("/index.php").unwrap_or(&path).to_string();
    } else if path == "index.html" || path == "index.php" {
        // Special case: root index files
        path = "/".to_string();
    }

    // Ensure path is not empty
    if path.is_empty() {
        path = "/".to_string();
    }

    parsed.set_path(&path);

    // 6. Sort query parameters alphabetically
    let query_pairs: Vec<(String, String)> = parsed.query_pairs()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
    if !query_pairs.is_empty() {
        let mut sorted_pairs = query_pairs;
        sorted_pairs.sort_by(|a, b| a.0.cmp(&b.0));

        parsed.query_pairs_mut().clear();
        for (key, value) in sorted_pairs {
            parsed.query_pairs_mut().append_pair(&key, &value);
        }
    }

    // 7. Remove fragment
    parsed.set_fragment(None);

    parsed.to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_url_removes_www() {
        assert_eq!(
            normalize_url("https://www.example.com/page"),
            "https://example.com/page"
        );
        
        assert_eq!(
            normalize_url("https://www.subdomain.example.com/page"),
            "https://subdomain.example.com/page"
        );
    }

    #[test]
    fn test_normalize_url_prefers_https() {
        assert_eq!(
            normalize_url("http://example.com/page"),
            "https://example.com/page"
        );
        
        assert_eq!(
            normalize_url("http://www.example.com/page"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_normalize_url_removes_trailing_slash() {
        assert_eq!(
            normalize_url("https://example.com/page/"),
            "https://example.com/page"
        );
        
        // But keep for root
        assert_eq!(
            normalize_url("https://example.com/"),
            "https://example.com/"
        );
        
        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_url_removes_index_files() {
        assert_eq!(
            normalize_url("https://example.com/page/index.html"),
            "https://example.com/page"
        );
        
        assert_eq!(
            normalize_url("https://example.com/page/index.php"),
            "https://example.com/page"
        );
        
        assert_eq!(
            normalize_url("https://example.com/index.html"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_url_sorts_query_params() {
        assert_eq!(
            normalize_url("https://example.com/page?z=1&a=2"),
            "https://example.com/page?a=2&z=1"
        );
        
        assert_eq!(
            normalize_url("https://example.com/page?c=3&b=2&a=1"),
            "https://example.com/page?a=1&b=2&c=3"
        );
    }

    #[test]
    fn test_normalize_url_removes_fragment() {
        assert_eq!(
            normalize_url("https://example.com/page#section"),
            "https://example.com/page"
        );
        
        assert_eq!(
            normalize_url("https://example.com/page?key=value#section"),
            "https://example.com/page?key=value"
        );
    }

    #[test]
    fn test_generate_permutations_count() {
        let perms = generate_url_permutations("https://example.com/page");
        // Should generate permutations (at least 8)
        // 2 schemes × 2 www × 2 trailing slash × 3 index files = 24 potential combinations
        // Some may be deduplicated
        assert!(perms.len() >= 8 && perms.len() <= 32, "Expected 8-32 permutations, got {}", perms.len());
    }

    #[test]
    fn test_generate_permutations_includes_variants() {
        let perms = generate_url_permutations("https://example.com/page");
        
        // Should include various combinations
        assert!(perms.contains(&"http://example.com/page".to_string()), 
                "Should include http variant");
        assert!(perms.contains(&"https://www.example.com/page".to_string()),
                "Should include www variant");
        assert!(perms.contains(&"https://example.com/page/".to_string()),
                "Should include trailing slash variant");
    }

    #[test]
    fn test_normalization_idempotent() {
        let url = "https://example.com/page";
        assert_eq!(
            normalize_url(&normalize_url(url)), 
            normalize_url(url),
            "Normalization should be idempotent"
        );
        
        let complex_url = "http://www.example.com/page/?z=1&a=2#section";
        assert_eq!(
            normalize_url(&normalize_url(complex_url)),
            normalize_url(complex_url),
            "Complex URL normalization should be idempotent"
        );
    }

    #[test]
    fn test_all_permutations_normalize_to_same() {
        let perms = generate_url_permutations("https://example.com/page");
        let normalized: Vec<_> = perms.iter().map(|p| normalize_url(p)).collect();

        // Debug: print unique normalized URLs
        let unique: HashSet<_> = normalized.iter().collect();
        if unique.len() > 1 {
            eprintln!("Unique normalized URLs: {:?}", unique);
            for perm in &perms {
                eprintln!("  {} -> {}", perm, normalize_url(perm));
            }
        }

        // All permutations should normalize to the same canonical form
        let first = &normalized[0];
        assert!(
            normalized.iter().all(|n| n == first),
            "All permutations should normalize to the same URL. Got: {:?}",
            unique
        );
    }

    #[test]
    fn test_normalize_url_with_port() {
        assert_eq!(
            normalize_url("http://example.com:8080/page"),
            "https://example.com:8080/page"
        );
        
        assert_eq!(
            normalize_url("http://www.example.com:8080/page/"),
            "https://example.com:8080/page"
        );
    }

    #[test]
    fn test_normalize_url_with_userinfo() {
        // URL with userinfo (rare but valid)
        let url_with_user = "http://user:pass@example.com/page";
        let normalized = normalize_url(url_with_user);
        
        // Should preserve userinfo but normalize other parts
        assert!(normalized.contains("user:pass@"));
        assert!(normalized.starts_with("https://"));
    }

    #[test]
    fn test_normalize_invalid_url() {
        let invalid = "not a valid url";
        assert_eq!(normalize_url(invalid), invalid, "Invalid URLs should be returned as-is");
    }

    #[test]
    fn test_generate_permutations_invalid_url() {
        let invalid = "not a valid url";
        let perms = generate_url_permutations(invalid);
        assert_eq!(perms.len(), 1, "Invalid URLs should return single element");
        assert_eq!(perms[0], invalid, "Invalid URLs should be returned as-is");
    }

    #[test]
    fn test_normalize_url_mixed_case() {
        assert_eq!(
            normalize_url("HTTP://WWW.EXAMPLE.COM/Page"),
            "https://example.com/Page"
        );
        
        // Host should be lowercase, path should preserve case
        let normalized = normalize_url("HTTPS://EXAMPLE.COM/MyPage");
        assert!(normalized.starts_with("https://example.com/"));
        assert!(normalized.contains("/MyPage"));
    }

    #[test]
    fn test_normalize_url_non_ascii() {
        // Test with international domain names
        let url = "https://example.com/café";
        let normalized = normalize_url(url);
        assert!(normalized.contains("caf"), "Should handle non-ASCII characters");
    }

    #[test]
    fn test_normalize_url_empty_path() {
        assert_eq!(
            normalize_url("https://example.com"),
            "https://example.com/"
        );
    }

    #[test]
    fn test_normalize_complex_query_params() {
        // Test with URL-encoded query parameters
        let url = "https://example.com/page?name=John&age=30&city=Boston";
        let normalized = normalize_url(url);

        // Should preserve parameters and sort
        assert!(normalized.contains("age=30"));
        assert!(normalized.contains("city="));
        assert!(normalized.contains("name="));

        // Verify sorted order (age < city < name alphabetically)
        let age_pos = normalized.find("age=").unwrap();
        let city_pos = normalized.find("city=").unwrap();
        let name_pos = normalized.find("name=").unwrap();
        assert!(age_pos < city_pos, "age should come before city");
        assert!(city_pos < name_pos, "city should come before name");
    }

    #[test]
    fn test_normalize_url_preserves_subdomain() {
        assert_eq!(
            normalize_url("https://blog.example.com/page"),
            "https://blog.example.com/page"
        );
        
        assert_eq!(
            normalize_url("https://www.blog.example.com/page"),
            "https://blog.example.com/page"
        );
    }

    #[test]
    fn test_normalize_multiple_trailing_slashes() {
        // Edge case: multiple trailing slashes
        assert_eq!(
            normalize_url("https://example.com/page///"),
            "https://example.com/page"
        );
    }

    #[test]
    fn test_permutations_with_query_params() {
        let url = "https://example.com/page?key=value";
        let perms = generate_url_permutations(url);

        // Should generate permutations while preserving query params
        assert!(perms.iter().any(|p| p.contains("key=value")));
        assert!(perms.len() >= 8);
    }

    #[test]
    fn test_normalize_performance() {
        // Verify normalization is fast (<10μs per URL)
        use std::time::Instant;

        let test_urls = vec![
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/page/index.html#section",
            "https://subdomain.example.com/path/to/page/",
        ];

        let iterations = 1000;
        let start = Instant::now();

        for _ in 0..iterations {
            for url in &test_urls {
                let _ = normalize_url(url);
            }
        }

        let elapsed = start.elapsed();
        let avg_per_url = elapsed / (iterations * test_urls.len() as u32);

        // Should be well under 10μs per normalization
        assert!(
            avg_per_url.as_micros() < 50,
            "Normalization took {}μs, expected <50μs",
            avg_per_url.as_micros()
        );
    }

    #[test]
    fn test_normalize_url_special_paths() {
        // Test with special path characters
        assert_eq!(
            normalize_url("https://example.com/path/with-dashes"),
            "https://example.com/path/with-dashes"
        );
        
        assert_eq!(
            normalize_url("https://example.com/path_with_underscores"),
            "https://example.com/path_with_underscores"
        );
        
        assert_eq!(
            normalize_url("https://example.com/path.with.dots"),
            "https://example.com/path.with.dots"
        );
    }

    #[test]
    fn test_normalize_url_removes_default_ports() {
        // The url crate automatically removes default ports
        let url = "https://example.com:443/page";
        let normalized = normalize_url(url);
        // Port 443 is default for HTTPS, should be removed by url crate
        assert!(!normalized.contains(":443") || normalized == "https://example.com:443/page");
    }
}
#[cfg(test)]
mod demo {
    use crate::crawler::url_normalization::{normalize_url, generate_url_permutations};
    
    #[test]
    fn demo_normalization() {
        println!("\n=== URL Normalization Demo ===\n");
        
        let test_cases = vec![
            "http://www.example.com/page/",
            "https://example.com/page?z=1&a=2",
            "http://www.example.com/index.html#section",
            "https://example.com/page/index.php/",
        ];
        
        for url in test_cases {
            let normalized = normalize_url(url);
            println!("  {} \n    → {}\n", url, normalized);
        }
    }
    
    #[test]
    fn demo_permutations() {
        println!("\n=== URL Permutations Demo ===\n");
        
        let url = "https://example.com/page";
        let perms = generate_url_permutations(url);
        
        println!("Base URL: {}", url);
        println!("Generated {} permutations:\n", perms.len());
        
        for (i, perm) in perms.iter().enumerate().take(10) {
            println!("  {}. {}", i + 1, perm);
        }
        
        if perms.len() > 10 {
            println!("  ... and {} more", perms.len() - 10);
        }
        
        // Show they all normalize to same
        let normalized: std::collections::HashSet<_> = perms.iter()
            .map(|p| normalize_url(p))
            .collect();
        
        println!("\nAll {} permutations normalize to {} unique URL(s):", 
                 perms.len(), normalized.len());
        for norm in normalized {
            println!("  → {}", norm);
        }
    }
}