parlov-analysis 0.7.0

Analysis engine trait and signal detection for parlov.
Documentation
//! Tests for the body signal extractor.
//!
//! Includes regression tests for the small-body preview path and coverage of
//! the truncation/non-UTF-8 fallbacks introduced to keep evidence consistent
//! across body sizes.

use super::*;
use crate::signals::tests::{fake_exchange_with_body, single_diff_set_with_bodies};
use parlov_core::{DifferentialSet, SignalKind};

/// Byte-identical baseline and probe bodies must not yield any signal.
#[test]
fn identical_bodies_produce_no_signal() {
    let diff_set = single_diff_set_with_bodies(403, 403, b"denied", b"denied");
    assert!(extract(&diff_set).is_empty());
}

/// Bodies of different lengths yield exactly one `BodyDiff` signal whose
/// evidence reports the length of each side.
#[test]
fn different_bodies_produce_body_diff_signal() {
    let baseline_body = b"access denied";
    let probe_body = b"not found for this resource";
    let diff_set = single_diff_set_with_bodies(403, 403, baseline_body, probe_body);

    let found = extract(&diff_set);
    assert_eq!(found.len(), 1);

    let signal = &found[0];
    assert_eq!(signal.kind, SignalKind::BodyDiff);
    // Checked in order: the length header, then each side's byte count.
    for fragment in ["body length:", "13 (baseline)", "27 (probe)"] {
        assert!(signal.evidence.contains(fragment));
    }
}

/// Equal-length bodies with different content are reported with a
/// "same length" note in the evidence.
#[test]
fn same_length_different_content_noted() {
    let diff_set = single_diff_set_with_bodies(403, 403, b"abc", b"xyz");
    let found = extract(&diff_set);
    assert_eq!(found.len(), 1);

    let evidence = &found[0].evidence;
    assert!(evidence.contains("same length: 3 bytes"));
}

/// Valid UTF-8 bodies are previewed in the evidence under per-side labels.
#[test]
fn utf8_bodies_include_content_preview() {
    let baseline_json = br#"{"error":"access denied"}"#;
    let probe_json = br#"{"error":"not found"}"#;
    let diff_set = single_diff_set_with_bodies(403, 404, baseline_json, probe_json);

    let found = extract(&diff_set);
    assert_eq!(found.len(), 1);

    let evidence = &found[0].evidence;
    // Labels first, then the distinguishing content of each side.
    for fragment in ["baseline:", "probe:", "access denied", "not found"] {
        assert!(evidence.contains(fragment));
    }
}

/// Identical bodies served with different `Content-Type` headers still yield
/// one `BodyDiff` signal, and the evidence names both media types.
#[test]
fn different_content_type_produces_signal() {
    // Builds a 403 exchange with a fixed body and the given Content-Type.
    let exchange_with_content_type = |content_type: &'static str| {
        let mut exchange = fake_exchange_with_body(403, b"error");
        exchange.response.headers.insert(
            http::header::CONTENT_TYPE,
            http::HeaderValue::from_static(content_type),
        );
        exchange
    };

    let baseline_exchange = exchange_with_content_type("application/json");
    let probe_exchange = exchange_with_content_type("text/html");
    let diff_set = DifferentialSet {
        baseline: vec![baseline_exchange],
        probe: vec![probe_exchange],
        canonical: None,
        technique: crate::signals::tests::status_code_diff_technique(),
    };

    let found = extract(&diff_set);
    assert_eq!(found.len(), 1);

    let signal = &found[0];
    assert_eq!(signal.kind, SignalKind::BodyDiff);
    for fragment in ["content-type", "application/json", "text/html"] {
        assert!(signal.evidence.contains(fragment));
    }
}

/// A differential set with no exchanges on either side yields no signals.
#[test]
fn empty_exchanges_produce_no_signals() {
    let empty_set = crate::signals::tests::diff_set_with_statuses(&[], &[]);
    let found = extract(&empty_set);
    assert!(found.is_empty());
}

/// An empty baseline body against a non-empty probe body is a `BodyDiff`.
#[test]
fn one_empty_one_nonempty_produces_signal() {
    let diff_set = single_diff_set_with_bodies(403, 403, b"", b"error body");
    let found = extract(&diff_set);
    assert_eq!(found.len(), 1);

    let signal = &found[0];
    assert_eq!(signal.kind, SignalKind::BodyDiff);
}

// --- Gap 1 regression: large bodies must still emit a (truncated) preview ---

/// Regression: bodies far larger than the preview cap must still surface a
/// content preview for both sides instead of dropping it.
#[test]
fn large_bodies_emit_truncated_preview() {
    let baseline = vec![b'a'; 3000];
    let probe = vec![b'b'; 3000];
    let ds = single_diff_set_with_bodies(403, 403, &baseline, &probe);
    let signals = extract(&ds);
    assert_eq!(signals.len(), 1);
    let ev = &signals[0].evidence;
    assert!(ev.contains("baseline:"), "missing baseline preview: {ev}");
    assert!(ev.contains("probe:"), "missing probe preview: {ev}");

    // A lone 'a' or 'b' already occurs inside the "baseline:" / "probe:"
    // labels, so checking for a single character would pass even with no
    // preview at all. Look for a run that can only come from the body itself.
    let baseline_run = "a".repeat(16);
    let probe_run = "b".repeat(16);
    assert!(
        ev.contains(baseline_run.as_str()),
        "baseline content not in preview: {ev}",
    );
    assert!(
        ev.contains(probe_run.as_str()),
        "probe content not in preview: {ev}",
    );
}

/// Over-cap bodies carry a truncation marker stating the full body length,
/// once for the baseline preview and once for the probe preview.
#[test]
fn large_bodies_truncation_marker_shows_total_length() {
    let marker = "(truncated, total 3000b)";

    let baseline_body = vec![b'a'; 3000];
    let probe_body = vec![b'b'; 3000];
    let diff_set = single_diff_set_with_bodies(403, 403, &baseline_body, &probe_body);

    let found = extract(&diff_set);
    let ev = &found[0].evidence;
    assert!(
        ev.contains(marker),
        "expected truncation marker with total length: {ev}",
    );

    let marker_count = ev.matches(marker).count();
    assert_eq!(marker_count, 2, "expected marker on both sides: {ev}");
}

/// Truncating an over-cap UTF-8 body must not split a multi-byte codepoint.
///
/// The body is built so that the 500-byte preview cap falls in the middle of
/// a 4-byte character; a naive byte slice at the cap would either panic or
/// produce a U+FFFD replacement character.
#[test]
fn truncation_respects_utf8_boundary() {
    // One ASCII byte followed by 4-byte emoji: char boundaries sit at offsets
    // 0 and 1 + 4k, so offset 500 is strictly inside a codepoint. (The earlier
    // "abc😀" pattern put offset 500 exactly on a boundary: 71 * 7 + 3 = 500,
    // which left the boundary handling untested.)
    let body = format!("a{}", "😀".repeat(200));
    assert!(body.len() > 500);
    assert!(
        !body.is_char_boundary(500),
        "fixture must place the preview cap mid-codepoint",
    );
    let baseline = body.as_bytes().to_vec();
    let mut probe = baseline.clone();
    probe[0] = b'X'; // force inequality without breaking UTF-8 (byte 0 is ASCII)

    let ds = single_diff_set_with_bodies(403, 403, &baseline, &probe);
    let signals = extract(&ds);
    assert_eq!(signals.len(), 1);
    let ev = &signals[0].evidence;
    assert!(ev.contains("baseline:"));
    assert!(ev.contains("probe:"));
    // Confirm the truncation path was actually taken for this body.
    assert!(
        ev.contains("truncated"),
        "expected truncation marker for over-cap body: {ev}",
    );
    // Evidence string itself must be valid UTF-8 (String guarantees this; we
    // also assert no replacement char snuck in mid-codepoint).
    assert!(
        !ev.contains('\u{FFFD}'),
        "evidence contains replacement char, truncation broke a codepoint: {ev}",
    );
}

/// A baseline body that is not valid UTF-8 is summarised by byte count while
/// the textual probe body is still previewed verbatim.
#[test]
fn non_utf8_baseline_emits_byte_count() {
    let binary_baseline: [u8; 3] = [0xFF, 0xFE, 0xFD];
    let text_probe = b"hello";
    let diff_set = single_diff_set_with_bodies(403, 403, &binary_baseline, text_probe);

    let found = extract(&diff_set);
    let ev = &found[0].evidence;
    assert!(
        ev.contains("<3 bytes, non-text>"),
        "expected non-text marker for baseline: {ev}",
    );
    assert!(ev.contains("probe: hello"), "expected probe preview: {ev}");
}

/// A probe body that is not valid UTF-8 is summarised by byte count while the
/// textual baseline body is still previewed verbatim.
#[test]
fn non_utf8_probe_emits_byte_count() {
    let text_baseline = b"hello";
    let binary_probe: [u8; 4] = [0xFF, 0xFE, 0xFD, 0xFC];
    let diff_set = single_diff_set_with_bodies(403, 403, text_baseline, &binary_probe);

    let found = extract(&diff_set);
    let ev = &found[0].evidence;
    assert!(
        ev.contains("baseline: hello"),
        "expected baseline preview: {ev}",
    );
    assert!(
        ev.contains("<4 bytes, non-text>"),
        "expected non-text marker for probe: {ev}",
    );
}

/// When neither body is valid UTF-8, each side gets its own byte-count marker.
#[test]
fn non_utf8_both_sides_emit_byte_counts() {
    let binary_baseline: [u8; 2] = [0xFF, 0xFE];
    let binary_probe: [u8; 3] = [0xFC, 0xFB, 0xFA];
    let diff_set = single_diff_set_with_bodies(403, 403, &binary_baseline, &binary_probe);

    let found = extract(&diff_set);
    let evidence = &found[0].evidence;
    // Baseline marker (2 bytes) first, then probe marker (3 bytes).
    for marker in ["<2 bytes, non-text>", "<3 bytes, non-text>"] {
        assert!(evidence.contains(marker));
    }
}

/// Regression: short ASCII bodies are previewed verbatim and never carry a
/// truncation marker.
#[test]
fn small_body_unchanged_behavior() {
    let diff_set = single_diff_set_with_bodies(403, 403, b"foo", b"bar");
    let found = extract(&diff_set);
    let ev = &found[0].evidence;

    assert!(ev.contains("baseline: foo"), "small baseline raw: {ev}");
    assert!(ev.contains("probe: bar"), "small probe raw: {ev}");

    let has_marker = ev.contains("truncated");
    assert!(!has_marker, "no marker for small bodies: {ev}");
}

/// Equal-length bodies that differ in content still get a per-side preview.
#[test]
fn equal_length_different_content_includes_preview() {
    let diff_set = single_diff_set_with_bodies(403, 403, b"abc", b"xyz");
    let found = extract(&diff_set);

    let evidence = &found[0].evidence;
    for preview in ["baseline: abc", "probe: xyz"] {
        assert!(evidence.contains(preview));
    }
}

/// A body whose length equals the preview cap is shown without a truncation
/// marker: the cap is inclusive.
#[test]
fn truncate_at_exact_max_preview_len_no_marker() {
    // 500 bytes sits exactly at the cap, not beyond it.
    let body_len = 500;
    let baseline_body = vec![b'a'; body_len];
    let probe_body = vec![b'b'; body_len];
    let diff_set = single_diff_set_with_bodies(403, 403, &baseline_body, &probe_body);

    let found = extract(&diff_set);
    let ev = &found[0].evidence;

    let has_marker = ev.contains("truncated");
    assert!(
        !has_marker,
        "no truncation marker when body fits exactly: {ev}",
    );
    for label in ["baseline:", "probe:"] {
        assert!(ev.contains(label));
    }
}