parlov-analysis 0.7.0

//! Input-reflection detection for body differentials.
//!
//! When the only difference between baseline and probe response bodies is the URL
//! identifier echoed back (e.g. error pages including the requested path), the apparent
//! `BodyDiff` is illusory — it does not reveal anything about the underlying resource state.
//! [`detect_input_reflection`] returns `Some(evidence)` when this is the case.
//!
//! Detection is path-anchored: we search for `<anchor><id>`, where `anchor` is the trailing
//! path segment of the URL prefix preceding the identifier (e.g. `Users/` for
//! `http://host/api/Users/`). This avoids false positives when bare short IDs (`1`, `2`)
//! collide with arbitrary body content like version numbers or counts.

/// Sentinel string substituted for every anchored-ID occurrence in a body before the two
/// normalized bodies are compared for equality.
///
/// The value is `ID` wrapped in NUL characters, which makes an accidental collision with
/// ordinary URL/path/JSON text unlikely. A body that itself contains this exact sequence
/// would still collide.
const ID_PLACEHOLDER: &str = "\u{0}ID\u{0}";

/// Size, in bytes, of the window at the end of the URL prefix that is scanned backwards for
/// a `/` when choosing the anchor preceding the ID.
///
/// Long enough to disambiguate short numeric IDs from coincidental body content,
/// short enough not to over-fit if the URL prefix differs in inconsequential ways.
///
/// The window excludes a trailing `/` on the prefix, and the anchor start is moved back to a
/// UTF-8 character boundary, so the returned anchor can be a few bytes longer than this value.
const ANCHOR_MAX_BYTES: usize = 16;

/// Decides whether a body differential is nothing more than the request URL identifier
/// being echoed back, returning a human-readable evidence string if so.
///
/// The two URLs are reduced to their differing identifiers plus a shared path anchor, and
/// the bodies are searched for the anchored form (`<anchor><id>`) rather than the bare ID,
/// so short IDs do not match unrelated digits in the body.
///
/// Returns `None` when any of the following holds:
/// - no usable identifier pair can be extracted from the URLs;
/// - either body is not valid UTF-8;
/// - the baseline body has no anchored occurrence, or the two bodies have a different
///   number of occurrences;
/// - the body length difference is not exactly `occurrences * (len(probe_id) - len(baseline_id))`;
/// - the bodies still differ after every anchored occurrence is replaced by a sentinel.
pub(super) fn detect_input_reflection(
    baseline_url: &str,
    probe_url: &str,
    baseline_body: &[u8],
    probe_body: &[u8],
) -> Option<String> {
    let ids = extract_anchored_ids(baseline_url, probe_url)?;
    let baseline_text = std::str::from_utf8(baseline_body).ok()?;
    let probe_text = std::str::from_utf8(probe_body).ok()?;

    // Both sides must echo the anchored ID the same, nonzero, number of times.
    let echoes = baseline_text.matches(&ids.baseline_needle).count();
    if echoes == 0 || probe_text.matches(&ids.probe_needle).count() != echoes {
        return None;
    }

    // Cheap arithmetic pre-check: the size change must be fully accounted for by the echoes.
    let per_echo = signed_len_delta(ids.baseline_id.len(), ids.probe_id.len());
    let observed = signed_len_delta(baseline_body.len(), probe_body.len());
    let explained = i64::try_from(echoes)
        .ok()
        .and_then(|times| per_echo.checked_mul(times))?;
    if observed != explained {
        return None;
    }

    // Authoritative check: with the echoes masked out, nothing else may differ.
    let masked_baseline = baseline_text.replace(&ids.baseline_needle, ID_PLACEHOLDER);
    let masked_probe = probe_text.replace(&ids.probe_needle, ID_PLACEHOLDER);
    let byte_gap = baseline_body.len().abs_diff(probe_body.len());

    (masked_baseline == masked_probe).then(|| {
        format!(
            "differential is request URL ID echo (baseline_id={}, probe_id={}, \
             total {byte_gap}b explained by ID substitution)",
            ids.baseline_id, ids.probe_id,
        )
    })
}

/// Result of comparing a baseline URL with a probe URL: the differing identifier on each
/// side plus the anchored search strings built from them.
///
/// The bare IDs borrow from the input URLs and are kept for evidence formatting and length
/// math; the needles are owned strings of the form `<anchor><id>`.
struct ExtractedIds<'a> {
    /// Portion of the baseline URL that differs from the probe URL.
    baseline_id: &'a str,
    /// Portion of the probe URL that differs from the baseline URL.
    probe_id: &'a str,
    /// Shared anchor followed by `baseline_id`; searched for in the baseline body.
    baseline_needle: String,
    /// Shared anchor followed by `probe_id`; searched for in the probe body.
    probe_needle: String,
}

/// Isolates the part of each URL that differs and builds the anchored needles for it.
///
/// The identifier on each side is what remains after stripping the longest common prefix
/// and then the longest common suffix of the two URLs. The anchor is taken from the end of
/// the common prefix and prepended to each identifier to form the needle.
///
/// Returns `None` when either identifier comes out empty. This covers identical URLs and
/// also URLs where one side's differing part is a byte-prefix of the other's
/// (e.g. `.../1` vs `.../12`).
fn extract_anchored_ids<'a>(baseline_url: &'a str, probe_url: &'a str) -> Option<ExtractedIds<'a>> {
    let shared_head = common_prefix_len(baseline_url, probe_url);
    let (head, baseline_tail) = baseline_url.split_at(shared_head);
    let probe_tail = &probe_url[shared_head..];

    // The suffix is measured on the remainders only, so it can never overlap the prefix.
    let shared_tail = common_suffix_len(baseline_tail, probe_tail);
    let baseline_id = &baseline_tail[..baseline_tail.len() - shared_tail];
    let probe_id = &probe_tail[..probe_tail.len() - shared_tail];

    if baseline_id.is_empty() || probe_id.is_empty() {
        return None;
    }

    let anchor = anchor_suffix(head);
    Some(ExtractedIds {
        baseline_needle: [anchor, baseline_id].concat(),
        probe_needle: [anchor, probe_id].concat(),
        baseline_id,
        probe_id,
    })
}

/// Picks the tail of `prefix` that serves as the anchor placed in front of the ID.
///
/// A trailing `/` is set aside, then the last `ANCHOR_MAX_BYTES` bytes before it are
/// scanned backwards for a `/`. The anchor starts just after that `/`, or at the start of
/// the scanned window when none is found, and runs to the end of `prefix` (including the
/// trailing `/`). The start is moved back to a UTF-8 character boundary if needed.
///
/// For `http://host/api/Users/` this yields `Users/` — discriminating enough to avoid
/// bare-digit collisions, but free of the scheme/host noise that won't appear in rendered
/// error messages. An empty `prefix` is returned unchanged.
fn anchor_suffix(prefix: &str) -> &str {
    let raw = prefix.as_bytes();

    // Leave a trailing `/` out of the scan so the search finds the delimiter before it.
    let scan_end = if prefix.ends_with('/') {
        raw.len() - 1
    } else {
        raw.len()
    };
    let window_start = scan_end.saturating_sub(ANCHOR_MAX_BYTES);

    let candidate = raw[window_start..scan_end]
        .iter()
        .rposition(|&byte| byte == b'/')
        .map_or(window_start, |offset| window_start + offset + 1);

    // Index 0 is always a boundary, so the search cannot come up empty.
    let start = (0..=candidate)
        .rev()
        .find(|&idx| prefix.is_char_boundary(idx))
        .unwrap_or(0);
    &prefix[start..]
}

/// Signed difference `probe_len - baseline_len` as an `i64`.
///
/// Each length is converted to `i64` first; a length that does not fit is clamped to
/// `i64::MAX`. Both operands are therefore non-negative and the subtraction cannot overflow.
fn signed_len_delta(baseline_len: usize, probe_len: usize) -> i64 {
    let clamp = |len: usize| i64::try_from(len).unwrap_or(i64::MAX);
    clamp(probe_len) - clamp(baseline_len)
}

/// Byte length of the longest prefix shared by `a` and `b`, shortened if necessary so that
/// it ends on a UTF-8 character boundary of `a`.
fn common_prefix_len(a: &str, b: &str) -> usize {
    // Index of the first differing byte, or the shorter length when one is a prefix of the other.
    let raw = a
        .bytes()
        .zip(b.bytes())
        .position(|(lhs, rhs)| lhs != rhs)
        .unwrap_or_else(|| a.len().min(b.len()));

    // Walk back to the nearest boundary; index 0 always qualifies.
    (0..=raw)
        .rev()
        .find(|&idx| a.is_char_boundary(idx))
        .unwrap_or(0)
}

/// Byte length of the longest suffix shared by `a` and `b`, shortened if necessary so that
/// it starts on a UTF-8 character boundary of `a`.
fn common_suffix_len(a: &str, b: &str) -> usize {
    // Count of matching bytes from the end, or the shorter length when one is a suffix of the other.
    let raw = a
        .bytes()
        .rev()
        .zip(b.bytes().rev())
        .position(|(lhs, rhs)| lhs != rhs)
        .unwrap_or_else(|| a.len().min(b.len()));

    // Shrink until the suffix starts on a boundary; a zero-length suffix always qualifies.
    (0..=raw)
        .rev()
        .find(|&len| a.is_char_boundary(a.len() - len))
        .unwrap_or(0)
}