Skip to main content

agi4_core/
consistency.rs

1//! Cross-conjunct consistency check.
2//!
3//! Implements SPEC.md §4: prevents suspicious measurement patterns where
4//! one conjunct is in insufficient_data while others marginally pass.
5
6use crate::conjunct::ConjunctStatus;
7use crate::evidence::{Evidence, SourceValue};
8use crate::sources;
9use crate::threshold;
10use serde::{Deserialize, Serialize};
11
12/// Result of the consistency check.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct ConsistencyResult {
15    pub passed: bool,
16    pub failed_rules: Vec<String>,
17    pub detail: Option<String>,
18}
19
20impl ConsistencyResult {
21    /// Create a passing consistency check.
22    pub fn pass() -> Self {
23        Self {
24            passed: true,
25            failed_rules: vec![],
26            detail: None,
27        }
28    }
29
30    /// Create a failing consistency check with reason(s).
31    pub fn fail(rules: Vec<&str>, detail: String) -> Self {
32        Self {
33            passed: false,
34            failed_rules: rules.iter().map(|s| s.to_string()).collect(),
35            detail: Some(detail),
36        }
37    }
38}
39
40/// Check rule 1: no insufficient_data masking.
41/// If three conjuncts pass and one is insufficient_data, it's a masking pattern.
42fn check_no_insufficient_data_masking(
43    conjunct_statuses: &[ConjunctStatus; 4],
44) -> Result<(), String> {
45    let pass_count = conjunct_statuses
46        .iter()
47        .filter(|s| **s == ConjunctStatus::Pass)
48        .count();
49    let insufficient_count = conjunct_statuses
50        .iter()
51        .filter(|s| **s == ConjunctStatus::InsufficientData)
52        .count();
53
54    // If 3 are Pass and 1 is InsufficientData, it's a masking pattern
55    if pass_count == 3 && insufficient_count == 1 {
56        return Err(
57            "One conjunct is insufficient_data while all others pass (masking pattern)".to_string(),
58        );
59    }
60    Ok(())
61}
62
63/// Map source IDs to their associated conjuncts and thresholds.
64/// Returns (conjunct_index, pass_threshold, floor) tuples.
65fn get_source_threshold(source_id: &str) -> Option<Vec<(usize, f64, Option<f64>)>> {
66    match source_id {
67        sources::generality::ARC_AGI_2 => {
68            Some(vec![(0, threshold::generality::ARC_AGI_2_PASS, None)])
69        }
70        sources::generality::ARC_AGI_3 => Some(vec![
71            (
72                0,
73                threshold::generality::ARC_AGI_3_PASS,
74                Some(threshold::generality::ARC_AGI_3_FLOOR),
75            ),
76            (
77                2,
78                threshold::environmental_transfer::ARC_AGI_3_PASS,
79                Some(threshold::environmental_transfer::ARC_AGI_3_FLOOR),
80            ),
81        ]),
82        sources::generality::HLE => Some(vec![(0, threshold::generality::HLE_PASS, None)]),
83        sources::generality::GPQA_DIAMOND => {
84            Some(vec![(0, threshold::generality::GPQA_DIAMOND_PASS, None)])
85        }
86        sources::economic_substitutability::GDPVAL
87        | sources::economic_substitutability::GDPVAL_AA => Some(vec![(
88            1,
89            threshold::economic_substitutability::GDPVAL_PASS,
90            None,
91        )]),
92        sources::economic_substitutability::RLI => Some(vec![(
93            1,
94            threshold::economic_substitutability::RLI_PASS,
95            Some(threshold::economic_substitutability::RLI_FLOOR),
96        )]),
97        sources::economic_substitutability::APEX_AGENTS => Some(vec![(
98            1,
99            threshold::economic_substitutability::APEX_AGENTS_PASS,
100            None,
101        )]),
102        sources::environmental_transfer::OSWORLD => Some(vec![(
103            2,
104            threshold::environmental_transfer::OSWORLD_PASS,
105            None,
106        )]),
107        sources::environmental_transfer::NES => {
108            // NES thresholds TBD in v0.1.x per SPEC.md. For now, skip NES in variance calculation.
109            None
110        }
111        sources::autonomous_agency::METR_80PCT_TIME_HORIZON => Some(vec![(
112            3,
113            threshold::autonomous_agency::METR_80PCT_PASS_HOURS,
114            Some(threshold::autonomous_agency::METR_80PCT_FLOOR_HOURS),
115        )]),
116        sources::autonomous_agency::RE_BENCH => {
117            Some(vec![(3, threshold::autonomous_agency::REBENCH_PASS, None)])
118        }
119        sources::autonomous_agency::SWE_BENCH_VERIFIED => Some(vec![(
120            3,
121            threshold::autonomous_agency::SWEBENCH_VERIFIED_PASS_AT_5,
122            None,
123        )]),
124        _ => None,
125    }
126}
127
128/// Check rule 2: variance bound.
129/// When all four conjuncts pass, min_margin >= 0.5 * max_margin.
130/// SPEC.md §4 rule 2 requires comparing across all sources used (both Fraction and Hours).
131/// Hours and Fraction are on different scales, so we normalize to a unitless ratio:
132/// - Fraction: margin = value / threshold (inherently unitless)
133/// - Hours: margin = value / threshold (e.g., 168 hours / 168 threshold = 1.0)
134fn check_variance_bound(
135    evidence: &[Evidence],
136    conjunct_statuses: &[ConjunctStatus; 4],
137) -> Result<(), String> {
138    let all_pass = conjunct_statuses.iter().all(|s| *s == ConjunctStatus::Pass);
139    if !all_pass {
140        // Variance rule only applies when all conjuncts pass
141        return Ok(());
142    }
143
144    let mut all_margins = Vec::new();
145
146    for e in evidence {
147        if let Some(thresholds) = get_source_threshold(e.source.as_str()) {
148            for (_, pass_threshold, _) in thresholds {
149                match e.value {
150                    SourceValue::Fraction(f) => {
151                        let margin = f.value() / pass_threshold;
152                        all_margins.push(margin);
153                    }
154                    SourceValue::Hours(h) => {
155                        let margin = h.value() / pass_threshold;
156                        all_margins.push(margin);
157                    }
158                }
159            }
160        }
161    }
162
163    if all_margins.is_empty() {
164        // No recognized sources; variance check passes trivially
165        return Ok(());
166    }
167
168    // Check variance bound across all margins (SPEC.md §4 rule 2)
169    let min_margin = all_margins.iter().cloned().fold(f64::INFINITY, f64::min);
170    let max_margin = all_margins
171        .iter()
172        .cloned()
173        .fold(f64::NEG_INFINITY, f64::max);
174    if min_margin < threshold::consistency::MARGIN_VARIANCE_RATIO * max_margin {
175        return Err(format!(
176            "Variance bound violated: min_margin ({:.3}) < 0.5 * max_margin ({:.3})",
177            min_margin, max_margin
178        ));
179    }
180
181    Ok(())
182}
183
184/// Check rule 3: provenance metadata completeness.
185/// Every source must have URL, fetch timestamp, and source version/date.
186fn check_provenance_metadata(evidence: &[Evidence]) -> Result<(), String> {
187    let mut missing_sources = Vec::new();
188
189    for e in evidence {
190        let source_id = e.source.as_str();
191        let mut issues = Vec::new();
192
193        // Check source_url is present and valid (it's a Url type, so presence is guaranteed by type)
194        if e.provenance.source_url.as_str().is_empty() {
195            issues.push("source_url");
196        }
197
198        // Check fetch_timestamp is present (it's a DateTime, so presence is guaranteed by type)
199
200        // Check source_version or we're lenient here because it's optional in the schema
201        // but SPEC.md §4 rule 3 says "version or date stamp"
202        // The DateTime<Utc> fetch_timestamp serves as the date stamp, so version is optional
203        // but if we want to be strict, we could require it. For now, the fetch_timestamp satisfies the "date stamp" requirement.
204
205        if !issues.is_empty() {
206            missing_sources.push(format!("{} (missing: {})", source_id, issues.join(", ")));
207        }
208    }
209
210    if !missing_sources.is_empty() {
211        return Err(format!(
212            "Provenance metadata incomplete for: {}",
213            missing_sources.join("; ")
214        ));
215    }
216
217    Ok(())
218}
219
220/// Evaluate all three consistency check rules.
221///
222/// Takes the evidence array and the array of per-conjunct statuses (in order:
223/// Generality, EconomicSubstitutability, EnvironmentalTransfer, AutonomousAgency).
224pub fn consistency_check(
225    evidence: &[Evidence],
226    conjunct_statuses: &[ConjunctStatus; 4],
227) -> ConsistencyResult {
228    let mut failed_rules = Vec::new();
229
230    // Rule 1: No insufficient_data masking
231    if check_no_insufficient_data_masking(conjunct_statuses).is_err() {
232        failed_rules.push("rule_1_insufficient_data_masking");
233    }
234
235    // Rule 2: Variance bound
236    if check_variance_bound(evidence, conjunct_statuses).is_err() {
237        failed_rules.push("rule_2_variance_bound");
238    }
239
240    // Rule 3: Provenance metadata
241    if check_provenance_metadata(evidence).is_err() {
242        failed_rules.push("rule_3_provenance_metadata");
243    }
244
245    if failed_rules.is_empty() {
246        ConsistencyResult::pass()
247    } else {
248        let detail = format!("Consistency check failed on: {}", failed_rules.join(", "));
249        ConsistencyResult::fail(failed_rules.to_vec(), detail)
250    }
251}
252
// Unit tests for the three consistency rules and for `consistency_check`.
//
// The margin figures quoted in comments below are those written by the tests'
// author; they depend on the pass thresholds defined in `crate::threshold`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{
        BoundedFraction, MeasurementId, NonNegativeHours, Provenance, SourceId, SourceValue,
    };
    use chrono::Utc;
    use url::Url;

    /// Builds an `Evidence` fixture for `source` carrying `value`.
    ///
    /// `is_fraction` selects `SourceValue::Fraction` (true) or
    /// `SourceValue::Hours` (false). Provenance is always fully populated, so
    /// fixtures built here never trip rule 3. Panics if `value` is rejected by
    /// the chosen value type's constructor.
    fn make_evidence(source: &str, value: f64, is_fraction: bool) -> Evidence {
        Evidence {
            source: SourceId::new(source),
            measurement: MeasurementId::new("test-measurement"),
            value: if is_fraction {
                SourceValue::Fraction(BoundedFraction::new(value).unwrap())
            } else {
                SourceValue::Hours(NonNegativeHours::new(value).unwrap())
            },
            reliability_percentile: 95,
            provenance: Provenance {
                source_url: Url::parse("https://example.com").unwrap(),
                fetch_timestamp: Utc::now(),
                source_version: Some("1.0".to_string()),
                raw_value: format!("{}", value),
            },
        }
    }

    // Rule 1: four passes and no insufficient_data is not a masking pattern.
    #[test]
    fn rule1_all_pass_with_no_insufficient_data() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    // Rule 1: three passes plus one insufficient_data is the masking pattern.
    #[test]
    fn rule1_all_pass_with_insufficient_data_fails() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_err());
    }

    // Rule 1: only two passes (one Partial), so the pattern does not match.
    #[test]
    fn rule1_not_all_pass_with_insufficient_data_ok() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    // Rule 1: only two passes (one Fail), so the pattern does not match.
    #[test]
    fn rule1_not_all_pass_with_fail_and_insufficient_data_ok() {
        let statuses = [
            ConjunctStatus::Fail,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    // Rule 2 is skipped entirely unless all four conjuncts pass.
    #[test]
    fn rule2_variance_bound_passes_when_not_all_pass() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![
            make_evidence("arc-agi-2", 0.95, true),
            make_evidence("arc-agi-3", 0.60, true),
        ];
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_passes_with_reasonable_margins() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        // All fraction sources well above their thresholds with balanced margins
        let evidence = vec![
            make_evidence("arc-agi-2", 0.95, true), // margin: 0.95/0.85 ≈ 1.118
            make_evidence("gdpval", 0.92, true),    // margin: 0.92/0.85 ≈ 1.082
            make_evidence("osworld", 0.93, true),   // margin: 0.93/0.85 ≈ 1.094
            make_evidence("re-bench", 0.80, true),  // margin: 0.80/0.60 ≈ 1.333
        ];
        // min_margin ≈ 1.082, max_margin ≈ 1.333, min >= 0.5*max? 1.082 >= 0.667? Yes
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_fails_with_imbalance_within_type() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];

        // To fail the variance bound we need min < 0.5*max over the pooled margins.
        // (Despite the test name, the implementation uses one pool for every
        // source; this case simply happens to contain only fraction sources.)
        // Fraction sources: 0.40, 0.85, 0.90, 0.60
        // Margins: 0.40/0.85 ≈ 0.471, 0.85/0.85 = 1.0, 0.90/0.85 ≈ 1.059, 0.60/0.60 = 1.0
        // min ≈ 0.471, max ≈ 1.059
        // Need: 0.471 >= 0.5*1.059 ≈ 0.530? NO! This should fail
        let evidence = vec![
            make_evidence("arc-agi-2", 0.40, true), // margin: 0.40/0.85 ≈ 0.470
            make_evidence("gdpval", 0.85, true),    // margin: 0.85/0.85 = 1.0
            make_evidence("osworld", 0.90, true),   // margin: 0.90/0.85 ≈ 1.059
            make_evidence("re-bench", 0.60, true),  // margin: 0.60/0.60 = 1.0
        ];
        assert!(check_variance_bound(&evidence, &statuses).is_err());
    }

    // Rule 2: with no evidence there are no margins, so the check passes trivially.
    #[test]
    fn rule2_empty_evidence_passes() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![];
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_unknown_sources_passes() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![make_evidence("unknown-source", 0.95, true)];
        // Unknown sources are ignored, so margins is empty, check passes trivially
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    // Rule 3: fixtures from `make_evidence` carry a non-empty source URL.
    #[test]
    fn rule3_complete_provenance_passes() {
        let evidence = vec![
            make_evidence("arc-agi-2", 0.95, true),
            make_evidence("gdpval", 0.90, true),
        ];
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    // Rule 3: nothing to inspect means nothing is missing.
    #[test]
    fn rule3_empty_evidence_passes() {
        let evidence = vec![];
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    // End-to-end: balanced margins, all conjuncts passing, complete provenance.
    #[test]
    fn consistency_check_all_pass_all_rules() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![
            make_evidence("arc-agi-2", 0.90, true), // margin: 0.90/0.85 ≈ 1.06
            make_evidence("gdpval", 0.88, true),    // margin: 0.88/0.85 ≈ 1.04
            make_evidence("osworld", 0.90, true),   // margin: 0.90/0.85 ≈ 1.06
            make_evidence("re-bench", 0.75, true),  // margin: 0.75/0.60 = 1.25
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(result.passed, "Expected pass but got: {:?}", result);
        assert!(result.failed_rules.is_empty());
    }

    // End-to-end: the masking pattern is reported under its rule identifier.
    #[test]
    fn consistency_check_rule1_fails() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        let evidence = vec![make_evidence("arc-agi-2", 0.95, true)];
        let result = consistency_check(&evidence, &statuses);
        assert!(!result.passed);
        assert!(
            result
                .failed_rules
                .contains(&"rule_1_insufficient_data_masking".to_string())
        );
    }

    #[test]
    fn consistency_check_rule2_passes_with_balanced_sources() {
        // SPEC.md §4 rule 2: all sources across all conjuncts must have
        // min_margin >= 0.5 * max_margin (single pool, no type separation).
        // This evidence has well-balanced margins across all sources.
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![
            make_evidence("arc-agi-2", 0.95, true), // margin: 0.95/0.85 ≈ 1.118
            make_evidence("gdpval", 0.90, true),    // margin: 0.90/0.85 ≈ 1.059
            make_evidence("osworld", 0.88, true),   // margin: 0.88/0.85 ≈ 1.035
            make_evidence("metr-80pct-time-horizon", 200.0, false), // margin: 200/168 ≈ 1.190
        ];
        // All margins: [1.118, 1.059, 1.035, 1.190]
        // min=1.035, max=1.190
        // Check: 1.035 >= 0.5*1.190 = 0.595? YES, passes!
        let result = consistency_check(&evidence, &statuses);
        assert!(
            result.passed,
            "Well-balanced sources should pass variance bound"
        );
    }

    #[test]
    fn consistency_check_rule2_fails_when_fractions_imbalanced() {
        // Variance bound should fail when one fraction source sits far below
        // its threshold. All sources share a single margin pool, so the
        // comfortably passing hours source does not rescue it.
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![
            make_evidence("arc-agi-2", 0.40, true), // margin: 0.40/0.85 ≈ 0.470
            make_evidence("gdpval", 0.851, true),   // margin: 0.851/0.85 ≈ 1.001
            make_evidence("osworld", 0.90, true),   // margin: 0.90/0.85 ≈ 1.059
            make_evidence("metr-80pct-time-horizon", 200.0, false), // margin: 200/168 ≈ 1.19
        ];
        // All margins (one pool): [0.471, 1.001, 1.059, 1.190]
        // min=0.471, max=1.190
        // Need: 0.471 >= 0.5*1.190 ≈ 0.595? NO, fails!
        let result = consistency_check(&evidence, &statuses);
        assert!(!result.passed);
        assert!(
            result
                .failed_rules
                .contains(&"rule_2_variance_bound".to_string())
        );
    }

    #[test]
    fn consistency_check_partial_or_fail_status_allows_insufficient_data() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        let evidence = vec![
            make_evidence("arc-agi-2", 0.95, true),
            make_evidence("gdpval", 0.90, true),
            make_evidence("osworld", 0.95, true),
            make_evidence("metr-80pct-time-horizon", 500.0, false),
        ];
        let result = consistency_check(&evidence, &statuses);
        // Only rule 3 could fail here, and the fixtures have complete provenance.
        // Rule 1 does not trigger (only two conjuncts pass, not three);
        // rule 2 is skipped (not all conjuncts pass).
        assert!(result.passed);
    }

    #[test]
    fn variance_bound_fails_with_strong_outlier() {
        // Test: when one source (Hours) is a strong outlier vs fraction sources,
        // the single-pool variance check (SPEC.md §4 rule 2) will fail if margins
        // are too spread. This model has METR at 400h (margin ≈2.38) but fraction
        // margins only ~1.01-1.04. The ratio violates the 0.5× bound.
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = vec![
            make_evidence("arc-agi-2", 0.88, true), // margin: 0.88/0.85 ≈ 1.035
            make_evidence("gdpval", 0.87, true),    // margin: 0.87/0.85 ≈ 1.024
            make_evidence("osworld", 0.86, true),   // margin: 0.86/0.85 ≈ 1.012
            make_evidence("metr-80pct-time-horizon", 400.0, false), // margin: 400/168 ≈ 2.381
        ];
        // All margins: [1.035, 1.024, 1.012, 2.381]
        // min=1.012, max=2.381
        // Check: 1.012 >= 0.5*2.381 = 1.191? NO, fails!
        let result = consistency_check(&evidence, &statuses);
        assert!(
            !result.passed,
            "Evidence with strong outlier should fail variance bound per SPEC §4 rule 2"
        );
        assert!(
            result
                .failed_rules
                .contains(&"rule_2_variance_bound".to_string())
        );
    }
}