1use crate::conjunct::ConjunctStatus;
7use crate::evidence::{Evidence, SourceValue};
8use crate::sources;
9use crate::threshold;
10use serde::{Deserialize, Serialize};
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct ConsistencyResult {
15 pub passed: bool,
16 pub failed_rules: Vec<String>,
17 pub detail: Option<String>,
18}
19
20impl ConsistencyResult {
21 pub fn pass() -> Self {
23 Self {
24 passed: true,
25 failed_rules: vec![],
26 detail: None,
27 }
28 }
29
30 pub fn fail(rules: Vec<&str>, detail: String) -> Self {
32 Self {
33 passed: false,
34 failed_rules: rules.iter().map(|s| s.to_string()).collect(),
35 detail: Some(detail),
36 }
37 }
38}
39
40fn check_no_insufficient_data_masking(
43 conjunct_statuses: &[ConjunctStatus; 4],
44) -> Result<(), String> {
45 let pass_count = conjunct_statuses
46 .iter()
47 .filter(|s| **s == ConjunctStatus::Pass)
48 .count();
49 let insufficient_count = conjunct_statuses
50 .iter()
51 .filter(|s| **s == ConjunctStatus::InsufficientData)
52 .count();
53
54 if pass_count == 3 && insufficient_count == 1 {
56 return Err(
57 "One conjunct is insufficient_data while all others pass (masking pattern)".to_string(),
58 );
59 }
60 Ok(())
61}
62
63fn get_source_threshold(source_id: &str) -> Option<Vec<(usize, f64, Option<f64>)>> {
66 match source_id {
67 sources::generality::ARC_AGI_2 => {
68 Some(vec![(0, threshold::generality::ARC_AGI_2_PASS, None)])
69 }
70 sources::generality::ARC_AGI_3 => Some(vec![
71 (
72 0,
73 threshold::generality::ARC_AGI_3_PASS,
74 Some(threshold::generality::ARC_AGI_3_FLOOR),
75 ),
76 (
77 2,
78 threshold::environmental_transfer::ARC_AGI_3_PASS,
79 Some(threshold::environmental_transfer::ARC_AGI_3_FLOOR),
80 ),
81 ]),
82 sources::generality::HLE => Some(vec![(0, threshold::generality::HLE_PASS, None)]),
83 sources::generality::GPQA_DIAMOND => {
84 Some(vec![(0, threshold::generality::GPQA_DIAMOND_PASS, None)])
85 }
86 sources::economic_substitutability::GDPVAL
87 | sources::economic_substitutability::GDPVAL_AA => Some(vec![(
88 1,
89 threshold::economic_substitutability::GDPVAL_PASS,
90 None,
91 )]),
92 sources::economic_substitutability::RLI => Some(vec![(
93 1,
94 threshold::economic_substitutability::RLI_PASS,
95 Some(threshold::economic_substitutability::RLI_FLOOR),
96 )]),
97 sources::economic_substitutability::APEX_AGENTS => Some(vec![(
98 1,
99 threshold::economic_substitutability::APEX_AGENTS_PASS,
100 None,
101 )]),
102 sources::environmental_transfer::OSWORLD => Some(vec![(
103 2,
104 threshold::environmental_transfer::OSWORLD_PASS,
105 None,
106 )]),
107 sources::environmental_transfer::NES => {
108 None
110 }
111 sources::autonomous_agency::METR_80PCT_TIME_HORIZON => Some(vec![(
112 3,
113 threshold::autonomous_agency::METR_80PCT_PASS_HOURS,
114 Some(threshold::autonomous_agency::METR_80PCT_FLOOR_HOURS),
115 )]),
116 sources::autonomous_agency::RE_BENCH => {
117 Some(vec![(3, threshold::autonomous_agency::REBENCH_PASS, None)])
118 }
119 sources::autonomous_agency::SWE_BENCH_VERIFIED => Some(vec![(
120 3,
121 threshold::autonomous_agency::SWEBENCH_VERIFIED_PASS_AT_5,
122 None,
123 )]),
124 _ => None,
125 }
126}
127
128fn check_variance_bound(
135 evidence: &[Evidence],
136 conjunct_statuses: &[ConjunctStatus; 4],
137) -> Result<(), String> {
138 let all_pass = conjunct_statuses.iter().all(|s| *s == ConjunctStatus::Pass);
139 if !all_pass {
140 return Ok(());
142 }
143
144 let mut all_margins = Vec::new();
145
146 for e in evidence {
147 if let Some(thresholds) = get_source_threshold(e.source.as_str()) {
148 for (_, pass_threshold, _) in thresholds {
149 match e.value {
150 SourceValue::Fraction(f) => {
151 let margin = f.value() / pass_threshold;
152 all_margins.push(margin);
153 }
154 SourceValue::Hours(h) => {
155 let margin = h.value() / pass_threshold;
156 all_margins.push(margin);
157 }
158 }
159 }
160 }
161 }
162
163 if all_margins.is_empty() {
164 return Ok(());
166 }
167
168 let min_margin = all_margins.iter().cloned().fold(f64::INFINITY, f64::min);
170 let max_margin = all_margins
171 .iter()
172 .cloned()
173 .fold(f64::NEG_INFINITY, f64::max);
174 if min_margin < threshold::consistency::MARGIN_VARIANCE_RATIO * max_margin {
175 return Err(format!(
176 "Variance bound violated: min_margin ({:.3}) < 0.5 * max_margin ({:.3})",
177 min_margin, max_margin
178 ));
179 }
180
181 Ok(())
182}
183
184fn check_provenance_metadata(evidence: &[Evidence]) -> Result<(), String> {
187 let mut missing_sources = Vec::new();
188
189 for e in evidence {
190 let source_id = e.source.as_str();
191 let mut issues = Vec::new();
192
193 if e.provenance.source_url.as_str().is_empty() {
195 issues.push("source_url");
196 }
197
198 if !issues.is_empty() {
206 missing_sources.push(format!("{} (missing: {})", source_id, issues.join(", ")));
207 }
208 }
209
210 if !missing_sources.is_empty() {
211 return Err(format!(
212 "Provenance metadata incomplete for: {}",
213 missing_sources.join("; ")
214 ));
215 }
216
217 Ok(())
218}
219
220pub fn consistency_check(
225 evidence: &[Evidence],
226 conjunct_statuses: &[ConjunctStatus; 4],
227) -> ConsistencyResult {
228 let mut failed_rules = Vec::new();
229
230 if check_no_insufficient_data_masking(conjunct_statuses).is_err() {
232 failed_rules.push("rule_1_insufficient_data_masking");
233 }
234
235 if check_variance_bound(evidence, conjunct_statuses).is_err() {
237 failed_rules.push("rule_2_variance_bound");
238 }
239
240 if check_provenance_metadata(evidence).is_err() {
242 failed_rules.push("rule_3_provenance_metadata");
243 }
244
245 if failed_rules.is_empty() {
246 ConsistencyResult::pass()
247 } else {
248 let detail = format!("Consistency check failed on: {}", failed_rules.join(", "));
249 ConsistencyResult::fail(failed_rules.to_vec(), detail)
250 }
251}
252
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{
        BoundedFraction, MeasurementId, NonNegativeHours, Provenance, SourceId, SourceValue,
    };
    use chrono::Utc;
    use url::Url;

    /// Source id of the METR time-horizon benchmark as used in these tests.
    const METR: &str = "metr-80pct-time-horizon";

    /// Builds an evidence item with complete provenance. `value` is wrapped
    /// as a fraction when `is_fraction` is true and as hours otherwise.
    fn make_evidence(source: &str, value: f64, is_fraction: bool) -> Evidence {
        let measured = if is_fraction {
            SourceValue::Fraction(BoundedFraction::new(value).unwrap())
        } else {
            SourceValue::Hours(NonNegativeHours::new(value).unwrap())
        };
        let provenance = Provenance {
            source_url: Url::parse("https://example.com").unwrap(),
            fetch_timestamp: Utc::now(),
            source_version: Some("1.0".to_string()),
            raw_value: value.to_string(),
        };

        Evidence {
            source: SourceId::new(source),
            measurement: MeasurementId::new("test-measurement"),
            value: measured,
            reliability_percentile: 95,
            provenance,
        }
    }

    /// Fraction-valued evidence for each `(source, value)` pair, in order.
    fn fractions(readings: &[(&str, f64)]) -> Vec<Evidence> {
        readings
            .iter()
            .map(|&(source, value)| make_evidence(source, value, true))
            .collect()
    }

    /// Fraction-valued evidence followed by one METR hours measurement.
    fn fractions_then_metr(readings: &[(&str, f64)], metr_hours: f64) -> Vec<Evidence> {
        let mut evidence = fractions(readings);
        evidence.push(make_evidence(METR, metr_hours, false));
        evidence
    }

    /// Status array in which all four conjuncts pass.
    fn all_pass() -> [ConjunctStatus; 4] {
        [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ]
    }

    /// Three passing conjuncts followed by one with insufficient data.
    fn masked() -> [ConjunctStatus; 4] {
        [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ]
    }

    /// Pass, partial, pass, insufficient data.
    fn partial_with_insufficient() -> [ConjunctStatus; 4] {
        [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ]
    }

    #[test]
    fn rule1_all_pass_with_no_insufficient_data() {
        assert!(check_no_insufficient_data_masking(&all_pass()).is_ok());
    }

    #[test]
    fn rule1_all_pass_with_insufficient_data_fails() {
        assert!(check_no_insufficient_data_masking(&masked()).is_err());
    }

    #[test]
    fn rule1_not_all_pass_with_insufficient_data_ok() {
        assert!(check_no_insufficient_data_masking(&partial_with_insufficient()).is_ok());
    }

    #[test]
    fn rule1_not_all_pass_with_fail_and_insufficient_data_ok() {
        let statuses = [
            ConjunctStatus::Fail,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_passes_when_not_all_pass() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = fractions(&[("arc-agi-2", 0.95), ("arc-agi-3", 0.60)]);
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_passes_with_reasonable_margins() {
        let evidence = fractions(&[
            ("arc-agi-2", 0.95),
            ("gdpval", 0.92),
            ("osworld", 0.93),
            ("re-bench", 0.80),
        ]);
        assert!(check_variance_bound(&evidence, &all_pass()).is_ok());
    }

    #[test]
    fn rule2_variance_bound_fails_with_imbalance_within_type() {
        let evidence = fractions(&[
            ("arc-agi-2", 0.40),
            ("gdpval", 0.85),
            ("osworld", 0.90),
            ("re-bench", 0.60),
        ]);
        assert!(check_variance_bound(&evidence, &all_pass()).is_err());
    }

    #[test]
    fn rule2_empty_evidence_passes() {
        let evidence: Vec<Evidence> = Vec::new();
        assert!(check_variance_bound(&evidence, &all_pass()).is_ok());
    }

    #[test]
    fn rule2_unknown_sources_passes() {
        let evidence = fractions(&[("unknown-source", 0.95)]);
        assert!(check_variance_bound(&evidence, &all_pass()).is_ok());
    }

    #[test]
    fn rule3_complete_provenance_passes() {
        let evidence = fractions(&[("arc-agi-2", 0.95), ("gdpval", 0.90)]);
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    #[test]
    fn rule3_empty_evidence_passes() {
        let evidence: Vec<Evidence> = Vec::new();
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    #[test]
    fn consistency_check_all_pass_all_rules() {
        let evidence = fractions(&[
            ("arc-agi-2", 0.90),
            ("gdpval", 0.88),
            ("osworld", 0.90),
            ("re-bench", 0.75),
        ]);
        let result = consistency_check(&evidence, &all_pass());
        assert!(result.passed, "Expected pass but got: {:?}", result);
        assert!(result.failed_rules.is_empty());
    }

    #[test]
    fn consistency_check_rule1_fails() {
        let evidence = fractions(&[("arc-agi-2", 0.95)]);
        let result = consistency_check(&evidence, &masked());
        assert!(!result.passed);
        assert!(
            result
                .failed_rules
                .contains(&"rule_1_insufficient_data_masking".to_string())
        );
    }

    #[test]
    fn consistency_check_rule2_passes_with_balanced_sources() {
        let evidence = fractions_then_metr(
            &[("arc-agi-2", 0.95), ("gdpval", 0.90), ("osworld", 0.88)],
            200.0,
        );
        let result = consistency_check(&evidence, &all_pass());
        assert!(
            result.passed,
            "Well-balanced sources should pass variance bound"
        );
    }

    #[test]
    fn consistency_check_rule2_fails_when_fractions_imbalanced() {
        let evidence = fractions_then_metr(
            &[("arc-agi-2", 0.40), ("gdpval", 0.851), ("osworld", 0.90)],
            200.0,
        );
        let result = consistency_check(&evidence, &all_pass());
        assert!(!result.passed);
        assert!(
            result
                .failed_rules
                .contains(&"rule_2_variance_bound".to_string())
        );
    }

    #[test]
    fn consistency_check_partial_or_fail_status_allows_insufficient_data() {
        let evidence = fractions_then_metr(
            &[("arc-agi-2", 0.95), ("gdpval", 0.90), ("osworld", 0.95)],
            500.0,
        );
        let result = consistency_check(&evidence, &partial_with_insufficient());
        assert!(result.passed);
    }

    #[test]
    fn variance_bound_fails_with_strong_outlier() {
        let evidence = fractions_then_metr(
            &[("arc-agi-2", 0.88), ("gdpval", 0.87), ("osworld", 0.86)],
            400.0,
        );
        let result = consistency_check(&evidence, &all_pass());
        assert!(
            !result.passed,
            "Evidence with strong outlier should fail variance bound per SPEC §4 rule 2"
        );
        assert!(
            result
                .failed_rules
                .contains(&"rule_2_variance_bound".to_string())
        );
    }
}