// eggsearch 0.1.2

use url::Url;

/// Query-parameter keys that `normalize` drops from a URL.
///
/// Each key produced by `Url::query_pairs()` is compared against this list
/// with an exact, case-sensitive string match; any pair whose key appears
/// here is discarded and every other pair is kept.
const TRACKING_PARAMS: &[&str] = &[
    "utm_source",
    "utm_medium",
    "utm_campaign",
    "utm_term",
    "utm_content",
    "fbclid",
    "gclid",
    "msclkid",
    "yclid",
    "ref",
    "source",
];

/// File names that `strip_index_file` looks for at the end of a path.
///
/// Entries are tried in the order listed, and the first one that matches
/// wins.
const INDEX_FILES: &[&str] = &["index.html", "index.htm", "index.php"];

/// Reduces a URL to a canonical string so that equivalent links compare equal.
///
/// Steps, in order:
/// 1. Parse `raw` with [`Url::parse`]; return `None` if parsing fails.
/// 2. Drop the fragment.
/// 3. Drop every query pair whose key is listed in `TRACKING_PARAMS`, sort the
///    remaining pairs by key (stable, so pairs sharing a key keep their
///    relative order) and re-serialise them. If nothing remains the query is
///    removed entirely.
/// 4. Pass the path through `strip_locale_prefix` and `strip_index_file`, then
///    trim trailing slashes unless the path is just `/`.
/// 5. Lower-case the *entire* resulting string (path and query included).
///
/// # Returns
///
/// `Some(normalized)` on success, `None` when `raw` is not a parseable URL.
pub fn normalize(raw: &str) -> Option<String> {
    let mut url = Url::parse(raw).ok()?;

    url.set_fragment(None);

    let mut kept: Vec<(String, String)> = url
        .query_pairs()
        .filter(|(k, _)| !TRACKING_PARAMS.contains(&k.as_ref()))
        .map(|(k, v)| (k.into_owned(), v.into_owned()))
        .collect();

    if kept.is_empty() {
        // `clear()` alone would leave an empty query and thus a dangling `?`.
        url.set_query(None);
    } else {
        kept.sort_by(|a, b| a.0.cmp(&b.0));
        // `query_pairs()` yields percent-decoded text, so the pairs must be
        // re-encoded when written back. Joining them with a raw `k=v` would let
        // a decoded `&` or `=` inside a key/value split or merge parameters.
        url.query_pairs_mut().clear().extend_pairs(&kept);
    }

    let path = url.path().to_string();
    let path = strip_locale_prefix(&path);
    let path = strip_index_file(path);

    // Keep a lone "/" intact; otherwise drop every trailing slash.
    let path = if path.len() > 1 && path.ends_with('/') {
        path.trim_end_matches('/').to_string()
    } else {
        path.to_string()
    };

    url.set_path(&path);

    Some(url.to_string().to_lowercase())
}

/// Removes a leading locale segment (as judged by `is_locale_segment`) from
/// `path`.
///
/// The path must start with `/` and its first segment must be followed by
/// another `/`; otherwise `path` is returned unchanged. When the first segment
/// is a locale, the returned slice starts at the `/` that follows it.
fn strip_locale_prefix(path: &str) -> &str {
    path.strip_prefix('/')
        // Split into (first segment, everything from the next '/' onwards).
        .and_then(|after_slash| {
            after_slash
                .find('/')
                .map(|cut| after_slash.split_at(cut))
        })
        .filter(|(first, _)| is_locale_segment(first))
        .map_or(path, |(_, tail)| tail)
}

/// Reports whether `s` has the shape of a locale tag.
///
/// Two shapes are accepted, judged byte-wise:
/// * two ASCII letters (`en`), or
/// * two ASCII letters, a `-` or `_`, then two ASCII letters (`en-US`, `en_us`).
///
/// Only the shape is checked; the letters are not validated against any list
/// of real language or region codes.
fn is_locale_segment(s: &str) -> bool {
    match s.as_bytes() {
        [lang_a, lang_b] => lang_a.is_ascii_alphabetic() && lang_b.is_ascii_alphabetic(),
        [lang_a, lang_b, sep, region_a, region_b] => {
            let separator_ok = *sep == b'-' || *sep == b'_';
            separator_ok
                && [lang_a, lang_b, region_a, region_b]
                    .iter()
                    .all(|byte| byte.is_ascii_alphabetic())
        }
        _ => false,
    }
}

/// Removes a trailing index file name (one of `INDEX_FILES`) from `path`.
///
/// The name is only stripped when it forms the whole final path segment, i.e.
/// what precedes it is empty or ends with `/`. This prevents unrelated files
/// such as `/docs/myindex.html` or `/reindex.php` from being truncated.
///
/// Returns the directory part including its trailing `/` (for example
/// `/page/index.html` becomes `/page/`), or `path` unchanged when no index
/// file is present.
fn strip_index_file(path: &str) -> &str {
    INDEX_FILES
        .iter()
        .find_map(|index| {
            path.strip_suffix(*index)
                // Require a segment boundary right before the file name.
                .filter(|dir| dir.is_empty() || dir.ends_with('/'))
        })
        .unwrap_or(path)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Normalizes `raw`, panicking if `normalize` rejects it.
    #[track_caller]
    fn norm(raw: &str) -> String {
        normalize(raw).unwrap()
    }

    /// Asserts that both inputs normalize to the same string.
    #[track_caller]
    fn assert_same(left: &str, right: &str) {
        assert_eq!(norm(left), norm(right));
    }

    // --- query string and fragment handling ---

    #[test]
    fn test_removes_tracking_params() {
        let n = norm("https://example.com/page?utm_source=google&q=rust");
        assert!(!n.contains("utm_source"));
        assert!(n.contains("q=rust"));
    }

    #[test]
    fn test_removes_fragment() {
        assert_same("https://example.com/page#section", "https://example.com/page");
    }

    #[test]
    fn test_removes_trailing_slash() {
        assert_same("https://example.com/page/", "https://example.com/page");
    }

    #[test]
    fn test_root_slash_preserved() {
        let n = norm("https://example.com/");
        assert!(n.ends_with('/') || n == "https://example.com");
    }

    #[test]
    fn test_sorts_query_params() {
        assert_same("https://example.com/?z=1&a=2", "https://example.com/?a=2&z=1");
    }

    #[test]
    fn test_lowercases_scheme_and_host() {
        assert_same("HTTPS://Example.COM/page", "https://example.com/page");
    }

    #[test]
    fn test_returns_none_for_invalid_url() {
        assert!(normalize("not a url").is_none());
    }

    // --- locale prefix handling ---

    #[test]
    fn test_strips_locale_language_only() {
        assert_same("https://example.com/en/docs", "https://example.com/docs");
    }

    #[test]
    fn test_strips_locale_language_region_hyphen() {
        assert_same("https://rust-lang.org/en-US/", "https://rust-lang.org/");
    }

    #[test]
    fn test_strips_locale_language_region_underscore() {
        assert_same("https://example.com/en_US/page", "https://example.com/page");
    }

    #[test]
    fn test_does_not_strip_bare_short_segment() {
        let n = norm("https://example.com/go");
        assert!(n.contains("/go"));
    }

    // --- index file handling ---

    #[test]
    fn test_strips_index_html() {
        assert_same("https://example.com/page/index.html", "https://example.com/page");
    }

    #[test]
    fn test_strips_index_htm() {
        assert_same("https://example.com/page/index.htm", "https://example.com/page");
    }

    #[test]
    fn test_strips_index_php() {
        assert_same("https://example.com/page/index.php", "https://example.com/page");
    }

    #[test]
    fn test_combined_locale_and_index() {
        assert_same(
            "https://example.com/en-US/page/index.html",
            "https://example.com/page",
        );
    }
}