1use crate::{ModelId, Source};
8use agi4_core::evidence::{
9 BoundedFraction, Evidence, MeasurementId, Provenance, SourceId, SourceValue,
10};
11use agi4_core::sources::autonomous_agency;
12use serde::{Deserialize, Serialize};
13use std::error::Error;
14use std::fmt;
15use url::Url;
16
17#[derive(Debug, Clone, Deserialize, Serialize)]
19#[serde(deny_unknown_fields)]
20pub struct SweBenchRaw {
21 pub pass_at_k: u32,
23 pub success_rate: f64,
25}
26
/// Errors produced by the SWE-bench adapter.
///
/// `PartialEq`/`Eq` are derived (in addition to `Debug`/`Clone`) so callers
/// and tests can compare errors directly, e.g. with `assert_eq!`. Two errors
/// are equal when they are the same variant carrying the same message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SweBenchError {
    /// Input could not be parsed. The payload is a human-readable message;
    /// in this file it is produced for JSON deserialization failures and for
    /// an invalid endpoint URL.
    ParseError(String),
    /// Input was parsed but rejected by validation. The payload is a
    /// human-readable message; in this file it is produced for an
    /// unacceptable pass@k and for a rejected success rate.
    ValidationError(String),
}
35
36impl fmt::Display for SweBenchError {
37 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38 match self {
39 Self::ParseError(msg) => write!(f, "SWE-bench parse error: {}", msg),
40 Self::ValidationError(msg) => write!(f, "SWE-bench validation error: {}", msg),
41 }
42 }
43}
44
45impl Error for SweBenchError {}
46
47pub struct SweBenchAdapter {
49 endpoint: Url,
51}
52
53impl SweBenchAdapter {
54 pub fn new() -> Result<Self, SweBenchError> {
56 let endpoint = Url::parse("https://swe-bench.github.io/api/results")
57 .map_err(|e| SweBenchError::ParseError(format!("invalid endpoint URL: {}", e)))?;
58 Ok(Self { endpoint })
59 }
60
61 pub fn with_endpoint(endpoint: Url) -> Self {
63 Self { endpoint }
64 }
65}
66
67impl Default for SweBenchAdapter {
68 fn default() -> Self {
69 Self::new().expect("default SWE-bench endpoint should be valid")
70 }
71}
72
73impl Source for SweBenchAdapter {
74 type Raw = SweBenchRaw;
75 type Error = SweBenchError;
76
77 fn id(&self) -> SourceId {
78 SourceId::new(autonomous_agency::SWE_BENCH_VERIFIED)
79 }
80
81 fn endpoint(&self) -> &Url {
82 &self.endpoint
83 }
84
85 fn parse(&self, raw: &str) -> Result<Self::Raw, Self::Error> {
86 serde_json::from_str::<SweBenchRaw>(raw)
87 .map_err(|e| SweBenchError::ParseError(format!("failed to deserialize JSON: {}", e)))
88 }
89
90 fn to_evidence(&self, raw: Self::Raw, _model: &ModelId) -> Result<Vec<Evidence>, Self::Error> {
91 if raw.pass_at_k < 5 {
93 return Err(SweBenchError::ValidationError(format!(
94 "pass@{} not acceptable; SPEC requires pass@k where k >= 5",
95 raw.pass_at_k
96 )));
97 }
98
99 let success_rate = BoundedFraction::new(raw.success_rate).map_err(|e| {
101 SweBenchError::ValidationError(format!("invalid success rate value: {}", e))
102 })?;
103
104 let evidence = Evidence {
105 source: self.id(),
106 measurement: MeasurementId::new(format!("pass@{}-rate", raw.pass_at_k)),
107 value: SourceValue::Fraction(success_rate),
108 reliability_percentile: 80, provenance: Provenance {
110 source_url: self.endpoint.clone(),
111 fetch_timestamp: chrono::Utc::now(),
112 source_version: Some("swe-bench-verified-v1".to_string()),
113 raw_value: raw.success_rate.to_string(),
114 },
115 };
116
117 Ok(vec![evidence])
118 }
119}
120
#[cfg(test)]
mod tests {
    use super::*;

    /// JSON document used by the parsing and round-trip tests.
    const VALID_JSON: &str = r#"{"pass_at_k": 5, "success_rate": 0.91}"#;

    /// Adapter on the default endpoint, shared by most tests.
    fn default_adapter() -> SweBenchAdapter {
        SweBenchAdapter::default()
    }

    /// Builds a raw result from its two fields.
    fn raw_result(pass_at_k: u32, success_rate: f64) -> SweBenchRaw {
        SweBenchRaw {
            pass_at_k,
            success_rate,
        }
    }

    /// Runs `to_evidence` on the default adapter for the given raw fields,
    /// using the fixed model id `test-model`.
    fn convert(pass_at_k: u32, success_rate: f64) -> Result<Vec<Evidence>, SweBenchError> {
        let subject = default_adapter();
        let input = raw_result(pass_at_k, success_rate);
        let model = ModelId::new("test-model");
        subject.to_evidence(input, &model)
    }

    #[test]
    fn swe_bench_adapter_new() {
        let created = SweBenchAdapter::new().expect("should create adapter");
        assert_eq!(created.id().as_str(), "swe-bench-verified");
        assert!(created.endpoint().as_str().contains("swe-bench"));
    }

    #[test]
    fn swe_bench_adapter_default() {
        let created = default_adapter();
        assert_eq!(created.id().as_str(), "swe-bench-verified");
    }

    #[test]
    fn swe_bench_adapter_with_custom_endpoint() {
        let local = Url::parse("http://localhost:8080/swe-bench").unwrap();
        let created = SweBenchAdapter::with_endpoint(local.clone());
        assert_eq!(created.endpoint(), &local);
    }

    #[test]
    fn swe_bench_parse_valid_json() {
        let outcome = default_adapter().parse(VALID_JSON);
        assert!(outcome.is_ok());
        let parsed = outcome.unwrap();
        assert_eq!(parsed.pass_at_k, 5);
        assert_eq!(parsed.success_rate, 0.91);
    }

    #[test]
    fn swe_bench_parse_invalid_json() {
        let outcome = default_adapter().parse(r#"{"invalid": "schema"}"#);
        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ParseError(_))) {
            panic!("expected ParseError");
        }
    }

    #[test]
    fn swe_bench_parse_malformed_json() {
        let outcome = default_adapter().parse("not valid json");
        assert!(outcome.is_err());
    }

    #[test]
    fn swe_bench_to_evidence_valid() {
        let outcome = convert(5, 0.91);

        assert!(outcome.is_ok());
        let entries = outcome.unwrap();
        assert_eq!(
            entries.len(),
            1,
            "SWE-bench produces one evidence entry"
        );

        let entry = &entries[0];

        assert_eq!(entry.source.as_str(), "swe-bench-verified");
        assert_eq!(entry.measurement.as_str(), "pass@5-rate");
        assert_eq!(entry.reliability_percentile, 80);

        if let SourceValue::Fraction(frac) = &entry.value {
            assert_eq!(frac.value(), 0.91);
        } else {
            panic!("expected Fraction value");
        }
    }

    #[test]
    fn swe_bench_to_evidence_rejects_pass_at_1() {
        let outcome = convert(1, 0.85);

        assert!(outcome.is_err());
        if let Err(SweBenchError::ValidationError(msg)) = outcome {
            assert!(msg.contains("pass@1") || msg.contains("k >= 5"));
        } else {
            panic!("expected ValidationError for pass@1");
        }
    }

    #[test]
    fn swe_bench_to_evidence_rejects_pass_at_3() {
        let outcome = convert(3, 0.88);

        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ValidationError(_))) {
            panic!("expected ValidationError for pass@3");
        }
    }

    #[test]
    fn swe_bench_to_evidence_accepts_pass_at_10() {
        let outcome = convert(10, 0.94);

        assert!(outcome.is_ok());
        let entries = outcome.unwrap();
        assert_eq!(entries.len(), 1);
        let entry = &entries[0];
        assert_eq!(entry.measurement.as_str(), "pass@10-rate");
    }

    #[test]
    fn swe_bench_to_evidence_zero_rate() {
        let outcome = convert(5, 0.0);

        assert!(outcome.is_ok());
        let entries = outcome.unwrap();
        assert_eq!(entries.len(), 1);
    }

    #[test]
    fn swe_bench_to_evidence_maximum_rate() {
        let outcome = convert(5, 1.0);

        assert!(outcome.is_ok());
        let entries = outcome.unwrap();
        assert_eq!(entries.len(), 1);
    }

    #[test]
    fn swe_bench_to_evidence_out_of_bounds_high() {
        let outcome = convert(5, 1.5);

        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ValidationError(_))) {
            panic!("expected ValidationError");
        }
    }

    #[test]
    fn swe_bench_to_evidence_out_of_bounds_low() {
        let outcome = convert(5, -0.1);

        assert!(outcome.is_err());
        if !matches!(outcome, Err(SweBenchError::ValidationError(_))) {
            panic!("expected ValidationError");
        }
    }

    #[test]
    fn swe_bench_to_evidence_provenance() {
        let entries = convert(5, 0.91).unwrap();
        let provenance = &entries[0].provenance;

        assert!(provenance.source_url.as_str().contains("swe-bench"));
        assert!(provenance.source_version.is_some());
        assert_eq!(
            provenance.source_version.as_ref().unwrap(),
            "swe-bench-verified-v1"
        );
        assert_eq!(provenance.raw_value, "0.91");
    }

    #[test]
    fn swe_bench_round_trip() {
        let subject = default_adapter();
        let model = ModelId::new("test-model");

        // Parse, then convert, exactly as a caller of the trait would.
        let parsed = subject.parse(VALID_JSON).expect("should parse");
        let entries = subject
            .to_evidence(parsed, &model)
            .expect("should convert");

        assert_eq!(entries.len(), 1);
        let entry = &entries[0];
        assert_eq!(entry.source.as_str(), "swe-bench-verified");
        assert_eq!(entry.reliability_percentile, 80);

        if let SourceValue::Fraction(frac) = &entry.value {
            assert_eq!(frac.value(), 0.91);
        } else {
            panic!("expected Fraction");
        }
    }

    #[test]
    fn swe_bench_error_display() {
        let parse_failure = SweBenchError::ParseError("test error".to_string());
        assert!(parse_failure.to_string().contains("parse error"));

        let validation_failure = SweBenchError::ValidationError("invalid value".to_string());
        assert!(validation_failure.to_string().contains("validation error"));
    }
}