// perf-sentinel-core 0.8.13
//
// Core library for perf-sentinel: polyglot performance anti-pattern detector
// Documentation
//! HTTP URL normalizer.
//!
//! Replaces numeric path segments with `{id}`, UUID segments with `{uuid}`,
//! strips query parameters, and prepends the HTTP method.

/// Report whether `s` has the textual UUID shape: 36 bytes, dashes at byte
/// offsets 8, 13, 18 and 23, and ASCII hex digits (either case) everywhere else.
///
/// Implemented as a plain byte scan rather than a regex.
fn is_uuid(s: &str) -> bool {
    let bytes = s.as_bytes();
    if bytes.len() != 36 {
        return false;
    }
    for (pos, byte) in bytes.iter().enumerate() {
        let acceptable = match pos {
            // Group separators of the 8-4-4-4-12 layout.
            8 | 13 | 18 | 23 => *byte == b'-',
            _ => byte.is_ascii_hexdigit(),
        };
        if !acceptable {
            return false;
        }
    }
    true
}

/// Result of HTTP URL normalization.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HttpNormalized {
    /// Method followed by a space and the normalized path, e.g.
    /// `GET /api/orders/{id}/submit`.
    pub template: String,
    /// Values removed from the URL: the raw `&`-separated query pairs first,
    /// then the numeric / UUID path segments in the order they appear.
    pub params: Vec<String>,
}

/// Report whether `seg` is non-empty and made up solely of ASCII digits (`0`-`9`).
fn is_numeric(seg: &str) -> bool {
    if seg.is_empty() {
        return false;
    }
    seg.as_bytes().iter().all(u8::is_ascii_digit)
}

/// Return how many bytes of `s` are equal to `target`.
fn bytecount(s: &str, target: u8) -> usize {
    s.as_bytes()
        .iter()
        .fold(0, |seen, &byte| seen + usize::from(byte == target))
}

/// Normalize an HTTP request target into a method-prefixed route template.
///
/// An `http://` / `https://` origin is removed, everything after the first `?`
/// is split on `&` into raw pairs, and numeric / UUID path segments are replaced
/// with `{id}` / `{uuid}`. The returned `params` contain the query pairs first
/// (at most 100 of them), followed by the extracted path values in path order.
#[must_use]
pub fn normalize_http(method: &str, target: &str) -> HttpNormalized {
    let without_origin = strip_origin(target);

    // Everything after the first `?` is query text and never reaches the template.
    let (path, query) = without_origin
        .split_once('?')
        .map_or((without_origin, None), |(p, q)| (p, Some(q)));

    // `params` is a `Vec<String>`, so every pair is copied out of the input.
    // The Vec is sized once from the `&` count, and both the capacity and the
    // number of pairs taken are capped at 100 to bound allocation per call.
    let mut params = query.map_or_else(Vec::new, |q| {
        let expected = (bytecount(q, b'&') + 1).min(100);
        let mut pairs = Vec::with_capacity(expected);
        pairs.extend(q.split('&').take(100).map(String::from));
        pairs
    });

    // Path placeholders are appended after the query pairs.
    let route = normalize_path_segments(path, &mut params);

    HttpNormalized {
        template: format!("{method} {route}"),
        params,
    }
}

/// Rewrite `path` segment by segment: UUID segments become `{uuid}`, all-digit
/// segments become `{id}`, and every other segment is copied unchanged.
///
/// Each replaced segment's original text is appended to `params`. An empty path
/// and a bare `/` both yield `/`.
fn normalize_path_segments(path: &str, params: &mut Vec<String>) -> String {
    if matches!(path, "" | "/") {
        return String::from("/");
    }

    // A little headroom, since placeholders can be longer than the values they replace.
    let mut out = String::with_capacity(path.len() + 8);
    for (position, segment) in path.split('/').enumerate() {
        if position != 0 {
            out.push('/');
        }
        // UUID is checked before numeric, matching the order of the placeholders' priority.
        let placeholder = if is_uuid(segment) {
            Some("{uuid}")
        } else if is_numeric(segment) {
            Some("{id}")
        } else {
            None
        };
        match placeholder {
            Some(token) => {
                params.push(segment.to_owned());
                out.push_str(token);
            }
            // Also covers empty segments (leading, trailing or doubled slashes):
            // appending an empty string leaves only the separators in place.
            None => out.push_str(segment),
        }
    }
    out
}

/// Strip an `http://` or `https://` scheme and the authority from a URL,
/// returning the remaining path (+ query).
///
/// * Targets without one of those two prefixes are returned unchanged.
/// * The authority ends at the first `/`, `?` or `#` (RFC 3986 §3.2), not only at
///   `/`. A URL such as `http://host?next=/a/42` therefore yields `?next=/a/42`
///   (empty path plus query) instead of mistaking the slash inside the query for
///   the start of the path.
/// * When nothing, or only a fragment, follows the authority the result is `/`.
fn strip_origin(target: &str) -> &str {
    let after_scheme = target
        .strip_prefix("http://")
        .or_else(|| target.strip_prefix("https://"));
    let rest = match after_scheme {
        Some(rest) => rest,
        None => return target,
    };

    match rest.find(|c: char| matches!(c, '/' | '?' | '#')) {
        // Path or query follows the authority: hand back everything from there on.
        // `idx` comes from `find` on an ASCII delimiter, so it is a valid boundary.
        Some(idx) if rest.as_bytes()[idx] != b'#' => &rest[idx..],
        // Bare origin, or origin followed directly by a fragment: no path at all.
        _ => "/",
    }
}

// Unit tests for the normalizer. Most go through `normalize_http`; `is_uuid`
// is additionally exercised directly.
#[cfg(test)]
mod tests {
    use super::*;

    // -- Placeholder substitution and origin stripping --

    #[test]
    fn simple_path_with_numeric_id() {
        let r = normalize_http("GET", "/api/orders/42/submit");
        assert_eq!(r.template, "GET /api/orders/{id}/submit");
        assert_eq!(r.params, vec!["42"]);
    }

    #[test]
    fn uuid_segment() {
        let r = normalize_http("GET", "/api/users/a1b2c3d4-e5f6-7890-abcd-ef1234567890");
        assert_eq!(r.template, "GET /api/users/{uuid}");
        assert_eq!(r.params, vec!["a1b2c3d4-e5f6-7890-abcd-ef1234567890"]);
    }

    #[test]
    fn full_url_strips_origin() {
        // "user-123" mixes letters and digits, so it is neither numeric nor a UUID.
        let r = normalize_http("GET", "http://user-svc:5000/api/users/user-123");
        assert_eq!(r.template, "GET /api/users/user-123");
    }

    #[test]
    fn query_params_stripped() {
        // The query is removed from the template but kept as raw pairs in `params`.
        let r = normalize_http("GET", "/api/users?page=2&size=10");
        assert_eq!(r.template, "GET /api/users");
        assert_eq!(r.params, vec!["page=2", "size=10"]);
    }

    #[test]
    fn full_url_with_query() {
        // Query pairs are collected before the path is walked, so they precede
        // the extracted path value in `params`.
        let r = normalize_http("POST", "https://svc.internal/api/items/99?expand=true");
        assert_eq!(r.template, "POST /api/items/{id}");
        assert_eq!(r.params, vec!["expand=true", "99"]);
    }

    #[test]
    fn multiple_numeric_segments() {
        let r = normalize_http("DELETE", "/api/orders/42/items/7");
        assert_eq!(r.template, "DELETE /api/orders/{id}/items/{id}");
        assert_eq!(r.params, vec!["42", "7"]);
    }

    #[test]
    fn root_path() {
        let r = normalize_http("GET", "/");
        assert_eq!(r.template, "GET /");
        assert!(r.params.is_empty());
    }

    #[test]
    fn no_numeric_or_uuid_segments() {
        let r = normalize_http("GET", "/api/health");
        assert_eq!(r.template, "GET /api/health");
        assert!(r.params.is_empty());
    }

    #[test]
    fn port_in_url_not_treated_as_id() {
        // The port is part of the authority, which is removed before any
        // segment is classified.
        let r = normalize_http("GET", "http://localhost:8080/api/items");
        assert_eq!(r.template, "GET /api/items");
    }

    #[test]
    fn url_without_path_returns_root() {
        let r = normalize_http("GET", "http://example.com");
        assert_eq!(r.template, "GET /");
        assert!(r.params.is_empty());
    }

    #[test]
    fn https_url_without_path() {
        let r = normalize_http("GET", "https://example.com");
        assert_eq!(r.template, "GET /");
    }

    #[test]
    fn non_uuid_36_char_segment_not_replaced() {
        // 36 chars but not a valid UUID format
        let r = normalize_http("GET", "/api/users/abcdefghijklmnopqrstuvwxyz1234567890");
        assert_eq!(
            r.template,
            "GET /api/users/abcdefghijklmnopqrstuvwxyz1234567890"
        );
        assert!(r.params.is_empty());
    }

    #[test]
    fn empty_path() {
        // An empty target is normalized to the root path.
        let r = normalize_http("GET", "");
        assert_eq!(r.template, "GET /");
    }

    #[test]
    fn trailing_slash() {
        // The trailing slash is kept in the template.
        let r = normalize_http("GET", "/api/users/");
        assert_eq!(r.template, "GET /api/users/");
        assert!(r.params.is_empty());
    }

    #[test]
    fn single_numeric_segment() {
        let r = normalize_http("GET", "/42");
        assert_eq!(r.template, "GET /{id}");
        assert_eq!(r.params, vec!["42"]);
    }

    #[test]
    fn mixed_uuid_and_numeric() {
        // Path values are recorded in the order they appear in the path.
        let r = normalize_http(
            "PUT",
            "/api/org/a1b2c3d4-e5f6-7890-abcd-ef1234567890/user/99",
        );
        assert_eq!(r.template, "PUT /api/org/{uuid}/user/{id}");
        assert_eq!(r.params, vec!["a1b2c3d4-e5f6-7890-abcd-ef1234567890", "99"]);
    }

    // -- Direct checks of the UUID matcher --

    #[test]
    fn is_uuid_valid() {
        assert!(is_uuid("a1b2c3d4-e5f6-7890-abcd-ef1234567890"));
        assert!(is_uuid("00000000-0000-0000-0000-000000000000"));
        assert!(is_uuid("AAAABBBB-CCCC-DDDD-EEEE-FFFFFFFFFFFF"));
    }

    #[test]
    fn is_uuid_invalid() {
        assert!(!is_uuid("not-a-uuid-at-all"));
        assert!(!is_uuid("")); // too short
        assert!(!is_uuid("a1b2c3d4-e5f6-7890-abcd-ef123456789")); // 35 chars
        assert!(!is_uuid("a1b2c3d4-e5f6-7890-abcd-ef12345678901")); // 37 chars
        assert!(!is_uuid("a1b2c3d4xe5f6-7890-abcd-ef1234567890")); // wrong dash pos
        assert!(!is_uuid("g1b2c3d4-e5f6-7890-abcd-ef1234567890")); // 'g' not hex
    }

    #[test]
    fn uppercase_uuid_detected() {
        let r = normalize_http("GET", "/api/item/A1B2C3D4-E5F6-7890-ABCD-EF1234567890");
        assert_eq!(r.template, "GET /api/item/{uuid}");
    }

    // -- Fragment handling --

    #[test]
    fn fragment_not_stripped_from_path() {
        // Fragments are rare in server-side URLs; the segment "42#section" is not
        // purely numeric so it passes through as-is (fragment is not separated)
        let r = normalize_http("GET", "/api/users/42#section");
        assert_eq!(r.template, "GET /api/users/42#section");
    }

    // -- Malformed/edge-case query params --

    #[test]
    fn trailing_question_mark_only() {
        // An empty query string still splits into one (empty) pair.
        let r = normalize_http("GET", "/api/users?");
        assert_eq!(r.template, "GET /api/users");
        assert_eq!(r.params, vec![""]);
    }

    #[test]
    fn empty_query_param_values() {
        let r = normalize_http("GET", "/api/users?id=&name=");
        assert_eq!(r.template, "GET /api/users");
        assert_eq!(r.params, vec!["id=", "name="]);
    }

    #[test]
    fn double_ampersand_in_query() {
        // Empty pairs between consecutive `&` are kept rather than filtered out.
        let r = normalize_http("GET", "/api/users?a=1&&b=2");
        assert_eq!(r.template, "GET /api/users");
        assert_eq!(r.params, vec!["a=1", "", "b=2"]);
    }

    // -- Double slashes --

    #[test]
    fn double_slash_in_path_preserved() {
        let r = normalize_http("GET", "/api//users/42");
        assert_eq!(r.template, "GET /api//users/{id}");
    }

    // -- URL-encoded segments (pass through as-is) --

    #[test]
    fn url_encoded_numeric_not_detected() {
        // %34%32 = "42" but URL-encoded, not decoded before detection
        let r = normalize_http("GET", "/api/users/%34%32");
        assert_eq!(r.template, "GET /api/users/%34%32");
        assert!(r.params.is_empty());
    }

    // -- Query params capped at 100 --

    #[test]
    fn query_params_capped_at_100() {
        // 200 pairs go in; only the first 100 are retained.
        let params: Vec<String> = (0..200).map(|i| format!("p{i}={i}")).collect();
        let url = format!("/api/test?{}", params.join("&"));
        let r = normalize_http("GET", &url);
        assert_eq!(r.params.len(), 100);
    }
}