Skip to main content

agi4_adapters/
swe_bench.rs

1//! SWE-bench Verified pass@5 adapter for autonomous agency conjunct.
2//!
//! Ingests a pass@k success rate (k >= 5) from the SWE-bench Verified benchmark.
//! Rejects data reported at k < 5, including pass@1-only data. Returns a single
//! Fraction value recorded under the measurement ID `pass@{k}-rate`.
6
7use crate::{ModelId, Source};
8use agi4_core::evidence::{
9    BoundedFraction, Evidence, MeasurementId, Provenance, SourceId, SourceValue,
10};
11use agi4_core::sources::autonomous_agency;
12use serde::{Deserialize, Serialize};
13use std::error::Error;
14use std::fmt;
15use url::Url;
16
17/// SWE-bench Verified benchmark data: pass@k success rate.
18#[derive(Debug, Clone, Deserialize, Serialize)]
19#[serde(deny_unknown_fields)]
20pub struct SweBenchRaw {
21    /// The value of k in pass@k. Must be >= 5.
22    pub pass_at_k: u32,
23    /// Success rate as a fraction (0.0 to 1.0).
24    pub success_rate: f64,
25}
26
/// Error type for SWE-bench adapter operations.
///
/// Every variant carries a human-readable message. The type is comparable
/// (`PartialEq`/`Eq`) so callers and tests can assert on an exact error value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SweBenchError {
    /// Parsing failed: either the JSON payload could not be deserialized or
    /// the endpoint URL was not a valid URL.
    ParseError(String),
    /// Value validation failed (e.g., pass@k with k < 5, out-of-bounds rate).
    ValidationError(String),
}

impl fmt::Display for SweBenchError {
    /// Writes the message prefixed with the failure category.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::ParseError(msg) => write!(f, "SWE-bench parse error: {}", msg),
            Self::ValidationError(msg) => write!(f, "SWE-bench validation error: {}", msg),
        }
    }
}

// No underlying cause is tracked, so the default `source()` (None) is kept.
impl Error for SweBenchError {}
46
47/// SWE-bench Verified pass@5 adapter for the success rate measurement.
48pub struct SweBenchAdapter {
49    /// Canonical SWE-bench endpoint.
50    endpoint: Url,
51}
52
53impl SweBenchAdapter {
54    /// Create a new SWE-bench adapter with the canonical endpoint.
55    pub fn new() -> Result<Self, SweBenchError> {
56        let endpoint = Url::parse("https://swe-bench.github.io/api/results")
57            .map_err(|e| SweBenchError::ParseError(format!("invalid endpoint URL: {}", e)))?;
58        Ok(Self { endpoint })
59    }
60
61    /// Create a SWE-bench adapter with a custom endpoint (for testing).
62    pub fn with_endpoint(endpoint: Url) -> Self {
63        Self { endpoint }
64    }
65}
66
67impl Default for SweBenchAdapter {
68    fn default() -> Self {
69        Self::new().expect("default SWE-bench endpoint should be valid")
70    }
71}
72
73impl Source for SweBenchAdapter {
74    type Raw = SweBenchRaw;
75    type Error = SweBenchError;
76
77    fn id(&self) -> SourceId {
78        SourceId::new(autonomous_agency::SWE_BENCH_VERIFIED)
79    }
80
81    fn endpoint(&self) -> &Url {
82        &self.endpoint
83    }
84
85    fn parse(&self, raw: &str) -> Result<Self::Raw, Self::Error> {
86        serde_json::from_str::<SweBenchRaw>(raw)
87            .map_err(|e| SweBenchError::ParseError(format!("failed to deserialize JSON: {}", e)))
88    }
89
90    fn to_evidence(&self, raw: Self::Raw, _model: &ModelId) -> Result<Vec<Evidence>, Self::Error> {
91        // Reject pass@1-only data. Per SPEC §2.4, we require pass@k where k >= 5.
92        if raw.pass_at_k < 5 {
93            return Err(SweBenchError::ValidationError(format!(
94                "pass@{} not acceptable; SPEC requires pass@k where k >= 5",
95                raw.pass_at_k
96            )));
97        }
98
99        // Validate and construct BoundedFraction
100        let success_rate = BoundedFraction::new(raw.success_rate).map_err(|e| {
101            SweBenchError::ValidationError(format!("invalid success rate value: {}", e))
102        })?;
103
104        let evidence = Evidence {
105            source: self.id(),
106            measurement: MeasurementId::new(format!("pass@{}-rate", raw.pass_at_k)),
107            value: SourceValue::Fraction(success_rate),
108            reliability_percentile: 80, // Per SPEC §2.4
109            provenance: Provenance {
110                source_url: self.endpoint.clone(),
111                fetch_timestamp: chrono::Utc::now(),
112                source_version: Some("swe-bench-verified-v1".to_string()),
113                raw_value: raw.success_rate.to_string(),
114            },
115        };
116
117        Ok(vec![evidence])
118    }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// Model identifier shared by every conversion test.
    fn test_model() -> ModelId {
        ModelId::new("test-model")
    }

    /// Converts a raw record with the given `k` and rate using a default adapter.
    fn convert(pass_at_k: u32, success_rate: f64) -> Result<Vec<Evidence>, SweBenchError> {
        let record = SweBenchRaw {
            pass_at_k,
            success_rate,
        };
        SweBenchAdapter::default().to_evidence(record, &test_model())
    }

    /// `new` yields the expected source ID and a SWE-bench endpoint.
    #[test]
    fn swe_bench_adapter_new() {
        let adapter = SweBenchAdapter::new().expect("should create adapter");
        let source_id = adapter.id();
        assert_eq!(source_id.as_str(), "swe-bench-verified");
        let endpoint_text = adapter.endpoint().as_str();
        assert!(endpoint_text.contains("swe-bench"));
    }

    /// `default` yields the expected source ID.
    #[test]
    fn swe_bench_adapter_default() {
        let source_id = SweBenchAdapter::default().id();
        assert_eq!(source_id.as_str(), "swe-bench-verified");
    }

    /// A custom endpoint is reported back unchanged.
    #[test]
    fn swe_bench_adapter_with_custom_endpoint() {
        let custom_url = Url::parse("http://localhost:8080/swe-bench").unwrap();
        let adapter = SweBenchAdapter::with_endpoint(custom_url.clone());
        assert_eq!(*adapter.endpoint(), custom_url);
    }

    /// Well-formed JSON deserializes into the matching field values.
    #[test]
    fn swe_bench_parse_valid_json() {
        let parsed =
            SweBenchAdapter::default().parse(r#"{"pass_at_k": 5, "success_rate": 0.91}"#);
        assert!(parsed.is_ok());
        let SweBenchRaw {
            pass_at_k,
            success_rate,
        } = parsed.unwrap();
        assert_eq!(pass_at_k, 5);
        assert_eq!(success_rate, 0.91);
    }

    /// JSON with the wrong schema is reported as a parse error.
    #[test]
    fn swe_bench_parse_invalid_json() {
        let outcome = SweBenchAdapter::default().parse(r#"{"invalid": "schema"}"#);
        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ParseError(_))) {
            panic!("expected ParseError");
        }
    }

    /// Input that is not JSON at all is rejected.
    #[test]
    fn swe_bench_parse_malformed_json() {
        let outcome = SweBenchAdapter::default().parse("not valid json");
        assert!(outcome.is_err());
    }

    /// A valid pass@5 record becomes one fully populated evidence entry.
    #[test]
    fn swe_bench_to_evidence_valid() {
        let outcome = convert(5, 0.91);
        assert!(outcome.is_ok());

        let entries = outcome.unwrap();
        assert_eq!(entries.len(), 1, "SWE-bench produces one evidence entry");
        let entry = &entries[0];

        // Metadata
        assert_eq!(entry.source.as_str(), "swe-bench-verified");
        assert_eq!(entry.measurement.as_str(), "pass@5-rate");
        assert_eq!(entry.reliability_percentile, 80);

        // Value type and content
        if let SourceValue::Fraction(frac) = &entry.value {
            assert_eq!(frac.value(), 0.91);
        } else {
            panic!("expected Fraction value");
        }
    }

    /// pass@1 data is refused with a validation error that explains why.
    #[test]
    fn swe_bench_to_evidence_rejects_pass_at_1() {
        let outcome = convert(1, 0.85);
        assert!(outcome.is_err());

        let message = match outcome {
            Err(SweBenchError::ValidationError(text)) => text,
            _ => panic!("expected ValidationError for pass@1"),
        };
        let explains_rule = ["pass@1", "k >= 5"]
            .iter()
            .any(|needle| message.contains(*needle));
        assert!(explains_rule);
    }

    /// pass@3 is still below the k >= 5 floor.
    #[test]
    fn swe_bench_to_evidence_rejects_pass_at_3() {
        let outcome = convert(3, 0.88);
        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ValidationError(_))) {
            panic!("expected ValidationError for pass@3");
        }
    }

    /// k above the floor is accepted and reflected in the measurement ID.
    #[test]
    fn swe_bench_to_evidence_accepts_pass_at_10() {
        let outcome = convert(10, 0.94);
        assert!(outcome.is_ok());

        let entries = outcome.unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].measurement.as_str(), "pass@10-rate");
    }

    /// The lower bound 0.0 is a valid rate.
    #[test]
    fn swe_bench_to_evidence_zero_rate() {
        let outcome = convert(5, 0.0);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap().len(), 1);
    }

    /// The upper bound 1.0 is a valid rate.
    #[test]
    fn swe_bench_to_evidence_maximum_rate() {
        let outcome = convert(5, 1.0);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap().len(), 1);
    }

    /// A rate above 1.0 fails validation.
    #[test]
    fn swe_bench_to_evidence_out_of_bounds_high() {
        let outcome = convert(5, 1.5);
        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ValidationError(_))) {
            panic!("expected ValidationError");
        }
    }

    /// A negative rate fails validation.
    #[test]
    fn swe_bench_to_evidence_out_of_bounds_low() {
        let outcome = convert(5, -0.1);
        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ValidationError(_))) {
            panic!("expected ValidationError");
        }
    }

    /// Provenance records the endpoint, the dataset version and the raw value.
    #[test]
    fn swe_bench_to_evidence_provenance() {
        let entries = convert(5, 0.91).unwrap();
        let provenance = &entries[0].provenance;

        assert!(provenance.source_url.as_str().contains("swe-bench"));
        assert_eq!(
            provenance.source_version.as_deref(),
            Some("swe-bench-verified-v1")
        );
        assert_eq!(provenance.raw_value, "0.91");
    }

    /// JSON text flows through `parse` and `to_evidence` end to end.
    #[test]
    fn swe_bench_round_trip() {
        let adapter = SweBenchAdapter::default();
        let model = test_model();

        // Parse, then convert with the same adapter instance.
        let record = adapter
            .parse(r#"{"pass_at_k": 5, "success_rate": 0.91}"#)
            .expect("should parse");
        let entries = adapter.to_evidence(record, &model).expect("should convert");

        assert_eq!(entries.len(), 1);
        let entry = &entries[0];
        assert_eq!(entry.source.as_str(), "swe-bench-verified");
        assert_eq!(entry.reliability_percentile, 80);

        if let SourceValue::Fraction(frac) = &entry.value {
            assert_eq!(frac.value(), 0.91);
        } else {
            panic!("expected Fraction");
        }
    }

    /// Each error variant renders with its category in the message.
    #[test]
    fn swe_bench_error_display() {
        let parse_text = SweBenchError::ParseError("test error".to_string()).to_string();
        assert!(parse_text.contains("parse error"));

        let validation_text =
            SweBenchError::ValidationError("invalid value".to_string()).to_string();
        assert!(validation_text.contains("validation error"));
    }
}
383
384        let err2 = SweBenchError::ValidationError("invalid value".to_string());
385        assert!(err2.to_string().contains("validation error"));
386    }
387}