Skip to main content

mockforge_registry_server/ai/
contract_diff.rs

1//! AI-assisted contract drift scoring (#348).
2//!
3//! The structural `ContractExecutor` in `mockforge-test-runner` already
4//! computes which declared spec endpoints have traffic vs. which are
5//! orphaned, and which traffic endpoints aren't declared at all. That's
6//! cheap and free.
7//!
8//! What this module adds is a *second pass* that takes a small sample
9//! of recent captured exchanges per declared endpoint and asks an LLM
10//! to score how the actual request/response shape compares to what the
11//! spec promises. Findings come back tagged
12//! `breaking | non_breaking | cosmetic` plus a confidence score.
13//!
14//! ## Why this is opt-in
15//!
16//! - Costs real LLM tokens (BYOK or platform credits, depending on the
17//!   org's setup).
18//! - Sampling captures means full request/response bodies leave the
19//!   registry's network, so users with strict data-handling rules
20//!   should be able to keep it disabled.
21//!
22//! Both concerns are addressed by an `ai_drift_enabled: bool` flag on the
23//! `test_suite.config` blob; the runner only fires the scoring callback
24//! when the flag is set to `true`.
25//!
26//! ## What lives here vs. what lives in the handler
27//!
28//! Everything in this module is pure (no DB, no HTTP) so the prompt
29//! template, sample-shaping, and LLM-response parsing are unit-testable
30//! without fixtures. The HTTP handler at
31//! `handlers::internal_contract_diff` owns the orchestration: pulling
32//! sample captures, calling `run_completion_for_org`, and recording
33//! usage.
34
35use serde::{Deserialize, Serialize};
36
37/// Severity buckets returned by the AI scorer. Matches the existing
38/// `severity` field on `diff_finding` events so the UI's
39/// severity-grouped renderer renders both structural and AI findings
40/// without code changes.
41#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
42#[serde(rename_all = "snake_case")]
43pub enum DriftSeverity {
44    /// Behaviour observed in traffic that the spec explicitly forbids
45    /// (missing required field, wrong type, status code outside the
46    /// declared set, removed-but-still-served endpoint).
47    Breaking,
48    /// Behaviour the spec doesn't strictly forbid but that real
49    /// consumers would notice (added optional field in a response,
50    /// looser validation than declared).
51    NonBreaking,
52    /// Stylistic drift only (case differences, whitespace, ordering of
53    /// optional fields). Not actionable, kept so the model can flag
54    /// without distorting severity counts.
55    Cosmetic,
56}
57
58impl DriftSeverity {
59    /// Wire string used in `diff_finding` event payloads.
60    pub fn as_str(&self) -> &'static str {
61        match self {
62            DriftSeverity::Breaking => "breaking",
63            DriftSeverity::NonBreaking => "non_breaking",
64            DriftSeverity::Cosmetic => "cosmetic",
65        }
66    }
67}
68
69/// One AI-scored drift finding for a single endpoint.
70#[derive(Debug, Clone, Serialize, Deserialize)]
71pub struct AiFinding {
72    /// `breaking | non_breaking | cosmetic`.
73    pub severity: DriftSeverity,
74    /// `"POST /api/checkout"` — same shape the structural pass uses.
75    pub endpoint: String,
76    /// Short, human-readable description of the drift.
77    pub description: String,
78    /// Model's self-reported confidence in `[0.0, 1.0]`. Out-of-range
79    /// values are clamped on parse.
80    pub confidence: f64,
81    /// Free-text rationale the UI can show on click-through. Empty
82    /// string when the model didn't supply one.
83    #[serde(default)]
84    pub rationale: String,
85}
86
87/// One sampled exchange that the scorer can reason about. Bodies are
88/// the verbatim request/response strings from `runtime_captures` —
89/// already filtered to JSON-shaped payloads upstream.
90#[derive(Debug, Clone, Serialize)]
91pub struct SampledExchange {
92    pub method: String,
93    pub path: String,
94    pub status_code: Option<i32>,
95    /// Truncated to a safe length before being included in the prompt
96    /// (see [`MAX_BODY_CHARS`]). The truncation is stable so the model
97    /// gets a deterministic view across re-runs.
98    pub request_body: Option<String>,
99    pub response_body: Option<String>,
100}
101
/// Maximum number of characters (Unicode scalar values, not bytes) of a
/// single body that gets embedded in the prompt.
///
/// Sizing rationale: about 6K chars × ~10 exchanges ≈ 60K chars of body
/// data, plus ~10K of spec excerpt and prompt ≈ 70K in total, which is
/// meant to leave a comfortable margin under a 128k context even for
/// Haiku-class models.
///
/// NOTE(review): `build_prompt` applies this cap to the request body and
/// the response body separately, so an exchange carrying both can
/// contribute up to twice this amount — confirm the sampling side
/// budgets for that.
pub const MAX_BODY_CHARS: usize = 6_000;

/// Marker appended to truncated bodies so the model can tell it did not
/// see the whole payload. Its length is fixed, which lets
/// [`truncate_body`] reserve room for it inside the character budget.
const TRUNCATION_MARKER: &str = "\n… (truncated)";

/// Shorten `body` to at most `max_chars` characters, always cutting on a
/// UTF-8 character boundary.
///
/// * A body of `max_chars` characters or fewer is returned unchanged.
/// * A longer body is cut so that the kept prefix plus the
///   `… (truncated)` marker together occupy exactly `max_chars`
///   characters.
/// * If `max_chars` is smaller than the marker itself, the result is the
///   bare marker. In that one case the output can exceed `max_chars`
///   (and even the input length); this is accepted because the model
///   still gets the "truncated" signal.
///
/// The scan is bounded by `max_chars` rather than by the body length, so
/// very large captured bodies are not walked end to end.
pub fn truncate_body(body: &str, max_chars: usize) -> String {
    // A string never has more chars than bytes, so a short-enough byte
    // length proves the body fits without decoding any UTF-8.
    if body.len() <= max_chars {
        return body.to_string();
    }
    // No char at index `max_chars` means the body has at most
    // `max_chars` chars (multi-byte content that still fits).
    if body.chars().nth(max_chars).is_none() {
        return body.to_string();
    }

    // Reserve room for the marker; saturating_sub collapses a budget
    // smaller than the marker to "keep nothing".
    let keep = max_chars.saturating_sub(TRUNCATION_MARKER.chars().count());
    // Byte offset of the first char that is NOT kept; always a valid
    // char boundary because it comes from `char_indices`.
    let cut = body
        .char_indices()
        .nth(keep)
        .map_or(body.len(), |(offset, _)| offset);

    let mut out = String::with_capacity(cut + TRUNCATION_MARKER.len());
    out.push_str(&body[..cut]);
    out.push_str(TRUNCATION_MARKER);
    out
}
133
134/// Build the system + user prompts the LLM sees. Pure function so the
135/// shape can be locked down in tests.
136///
137/// The system prompt teaches the model:
138/// 1. The severity vocabulary it must emit (breaking / non_breaking /
139///    cosmetic).
140/// 2. The strict JSON-array output format.
141/// 3. How to weigh sample size when reporting confidence.
142///
143/// The user prompt carries the spec excerpt followed by the
144/// per-endpoint sample exchanges, formatted as a fenced JSON block so
145/// the model can scan them mechanically.
146pub fn build_prompt(spec_excerpt: &str, exchanges: &[SampledExchange]) -> (String, String) {
147    let system = "You are a contract-drift reviewer for OpenAPI APIs. Compare actual \
148HTTP exchanges against the declared spec and report only meaningful drift.\n\n\
149You MUST emit a single JSON array (no prose, no markdown fences) where each \
150element has these exact keys: severity (\"breaking\" | \"non_breaking\" | \
151\"cosmetic\"), endpoint (\"METHOD /path\"), description (one short sentence), \
152confidence (0.0 to 1.0), rationale (one or two sentences explaining what \
153you observed).\n\n\
154Severity rules:\n\
155- breaking: required field missing, wrong type, undeclared status code, \
156endpoint serving traffic that the spec forbids.\n\
157- non_breaking: extra optional field present, loosened validation, \
158behaviour the spec doesn't forbid.\n\
159- cosmetic: case-only differences, ordering, whitespace, trailing slashes.\n\n\
160Confidence rules: If you saw fewer than 2 exchanges for an endpoint, cap \
161confidence at 0.6. If exchanges all show the same drift, use 0.85+. If \
162exchanges disagree with each other, drop to 0.5.\n\n\
163If you find no drift at all, emit an empty array `[]`."
164        .to_string();
165
166    let mut user = String::new();
167    user.push_str("# Declared OpenAPI spec (excerpt)\n\n```yaml\n");
168    user.push_str(spec_excerpt);
169    user.push_str("\n```\n\n# Sampled exchanges\n\n");
170
171    if exchanges.is_empty() {
172        user.push_str("(no exchanges sampled — declared endpoints had no recent traffic)\n");
173    } else {
174        for ex in exchanges {
175            user.push_str(&format!(
176                "## {} {}\nstatus: {}\n",
177                ex.method,
178                ex.path,
179                ex.status_code.map(|s| s.to_string()).unwrap_or_else(|| "?".to_string()),
180            ));
181            if let Some(body) = &ex.request_body {
182                user.push_str("\n### request body\n```\n");
183                user.push_str(&truncate_body(body, MAX_BODY_CHARS));
184                user.push_str("\n```\n");
185            }
186            if let Some(body) = &ex.response_body {
187                user.push_str("\n### response body\n```\n");
188                user.push_str(&truncate_body(body, MAX_BODY_CHARS));
189                user.push_str("\n```\n");
190            }
191            user.push('\n');
192        }
193    }
194
195    user.push_str(
196        "\n# Task\n\nEmit the JSON array of findings. No other output. \
197If nothing drifted, emit `[]`.",
198    );
199
200    (system, user)
201}
202
203/// Parse the model's response into structured findings. Tolerates:
204/// - Plain JSON arrays.
205/// - JSON arrays inside ```json``` fences (handled by
206///   [`crate::handlers::ai_studio::extract_json_payload`] upstream).
207/// - Single-object responses where the model forgot the array wrapper
208///   (we wrap it for them).
209/// - Out-of-range confidence values (clamped to `[0.0, 1.0]`).
210/// - Unknown severity strings (entry skipped with a warning).
211///
212/// Returns an empty vec when nothing usable was found — the executor
213/// emits a degraded-but-fine "AI scoring returned no findings" log
214/// rather than failing the whole run.
215pub fn parse_findings(json: &serde_json::Value) -> Vec<AiFinding> {
216    let raw_array: Vec<serde_json::Value> = match json {
217        serde_json::Value::Array(items) => items.clone(),
218        // Single-object response — wrap it.
219        obj @ serde_json::Value::Object(_) => vec![obj.clone()],
220        _ => return Vec::new(),
221    };
222
223    let mut out = Vec::with_capacity(raw_array.len());
224    for item in raw_array {
225        let severity_str = item.get("severity").and_then(|v| v.as_str()).unwrap_or("");
226        let severity = match severity_str {
227            "breaking" => DriftSeverity::Breaking,
228            "non_breaking" => DriftSeverity::NonBreaking,
229            "cosmetic" => DriftSeverity::Cosmetic,
230            _ => continue, // unknown severity — drop the entry rather than guess
231        };
232        let endpoint = item
233            .get("endpoint")
234            .and_then(|v| v.as_str())
235            .map(str::trim)
236            .filter(|s| !s.is_empty())
237            .map(String::from);
238        let Some(endpoint) = endpoint else { continue };
239        let description = item
240            .get("description")
241            .and_then(|v| v.as_str())
242            .map(str::trim)
243            .unwrap_or("")
244            .to_string();
245        let raw_conf = item.get("confidence").and_then(|v| v.as_f64()).unwrap_or(0.5);
246        let confidence = raw_conf.clamp(0.0, 1.0);
247        let rationale = item
248            .get("rationale")
249            .and_then(|v| v.as_str())
250            .map(str::trim)
251            .unwrap_or("")
252            .to_string();
253
254        out.push(AiFinding {
255            severity,
256            endpoint,
257            description,
258            confidence,
259            rationale,
260        });
261    }
262
263    out
264}
265
/// Unit tests for body truncation, prompt construction, and LLM-response
/// parsing. Everything here is pure, so no fixtures are needed.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an exchange with status 200, no request body, and `body` as
    /// the response body.
    fn ex(method: &str, path: &str, body: &str) -> SampledExchange {
        SampledExchange {
            method: method.to_string(),
            path: path.to_string(),
            status_code: Some(200),
            request_body: None,
            response_body: Some(body.to_string()),
        }
    }

    #[test]
    fn truncate_short_body_unchanged() {
        let s = "abc";
        assert_eq!(truncate_body(s, 100), "abc");
    }

    #[test]
    fn truncate_appends_marker() {
        let s = "x".repeat(MAX_BODY_CHARS + 10);
        let out = truncate_body(&s, MAX_BODY_CHARS);
        assert!(out.ends_with("… (truncated)"));
        // Kept prefix + marker together fill exactly the char budget, so
        // the result is strictly shorter than the over-long input.
        assert_eq!(out.chars().count(), MAX_BODY_CHARS);
        assert!(out.chars().count() < s.chars().count());
    }

    #[test]
    fn truncate_handles_multibyte_at_boundary() {
        // Mix of 2-byte (é, ö) and 4-byte (🎉) characters, so a cut made
        // on a byte offset instead of a char offset would split a
        // codepoint.
        let s = "héllo wörld 🎉".repeat(2_000);
        let out = truncate_body(&s, 100);
        // The budget is counted in chars and includes the marker.
        assert_eq!(out.chars().count(), 100);
        let kept = out
            .strip_suffix("\n… (truncated)")
            .expect("truncated output must end with the marker");
        // The kept part is an intact prefix of the input.
        assert!(s.starts_with(kept));
    }

    #[test]
    fn truncate_tiny_budget_collapses_to_marker() {
        // A budget smaller than the marker keeps nothing of the body.
        assert_eq!(truncate_body("abcdef", 3), "\n… (truncated)");
    }

    #[test]
    fn build_prompt_with_no_exchanges() {
        let (system, user) = build_prompt("paths: {}", &[]);
        assert!(system.contains("contract-drift"));
        assert!(user.contains("(no exchanges sampled"));
        assert!(user.contains("paths: {}"));
    }

    #[test]
    fn build_prompt_includes_method_path_status() {
        let (_system, user) =
            build_prompt("openapi: 3.0.0", &[ex("POST", "/api/checkout", r#"{"item":"x"}"#)]);
        assert!(user.contains("POST /api/checkout"));
        assert!(user.contains("status: 200"));
        assert!(user.contains(r#"{"item":"x"}"#));
    }

    #[test]
    fn parse_findings_well_formed_array() {
        let json = serde_json::json!([
            {
                "severity": "breaking",
                "endpoint": "POST /api/checkout",
                "description": "Required field missing",
                "confidence": 0.9,
                "rationale": "All 3 sampled responses omitted `created_at`."
            },
            {
                "severity": "cosmetic",
                "endpoint": "GET /api/users",
                "description": "Trailing slash in path",
                "confidence": 0.4,
                "rationale": ""
            }
        ]);
        let findings = parse_findings(&json);
        assert_eq!(findings.len(), 2);
        assert_eq!(findings[0].severity, DriftSeverity::Breaking);
        assert_eq!(findings[0].endpoint, "POST /api/checkout");
        assert!((findings[0].confidence - 0.9).abs() < 0.001);
        assert_eq!(findings[1].severity, DriftSeverity::Cosmetic);
        assert!(findings[1].rationale.is_empty());
    }

    #[test]
    fn parse_findings_clamps_confidence_out_of_range() {
        let json = serde_json::json!([
            { "severity": "breaking", "endpoint": "GET /a", "description": "x", "confidence": 1.7 },
            { "severity": "breaking", "endpoint": "GET /b", "description": "y", "confidence": -0.5 }
        ]);
        let findings = parse_findings(&json);
        assert_eq!(findings.len(), 2);
        assert!((findings[0].confidence - 1.0).abs() < 0.001);
        assert!(findings[1].confidence.abs() < 0.001);
    }

    #[test]
    fn parse_findings_drops_unknown_severity() {
        let json = serde_json::json!([
            { "severity": "blocker", "endpoint": "GET /a", "description": "x", "confidence": 0.5 },
            { "severity": "breaking", "endpoint": "GET /b", "description": "y", "confidence": 0.5 }
        ]);
        let findings = parse_findings(&json);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].endpoint, "GET /b");
    }

    #[test]
    fn parse_findings_drops_missing_endpoint() {
        let json = serde_json::json!([
            { "severity": "breaking", "description": "x", "confidence": 0.5 }
        ]);
        assert!(parse_findings(&json).is_empty());
    }

    #[test]
    fn parse_findings_wraps_single_object() {
        let json = serde_json::json!({
            "severity": "non_breaking",
            "endpoint": "GET /a",
            "description": "Extra optional field",
            "confidence": 0.7
        });
        let findings = parse_findings(&json);
        assert_eq!(findings.len(), 1);
        assert_eq!(findings[0].severity, DriftSeverity::NonBreaking);
    }

    #[test]
    fn parse_findings_empty_array() {
        let json = serde_json::json!([]);
        assert!(parse_findings(&json).is_empty());
    }

    #[test]
    fn parse_findings_non_array_non_object_returns_empty() {
        assert!(parse_findings(&serde_json::json!("nope")).is_empty());
        assert!(parse_findings(&serde_json::json!(42)).is_empty());
        assert!(parse_findings(&serde_json::json!(null)).is_empty());
    }

    #[test]
    fn drift_severity_wire_strings() {
        assert_eq!(DriftSeverity::Breaking.as_str(), "breaking");
        assert_eq!(DriftSeverity::NonBreaking.as_str(), "non_breaking");
        assert_eq!(DriftSeverity::Cosmetic.as_str(), "cosmetic");
    }
}