use crate::conjunct::ConjunctStatus;
use crate::evidence::{Evidence, SourceValue};
use crate::sources;
use crate::threshold;
use serde::{Deserialize, Serialize};
/// Outcome of running the consistency rules over a set of evidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConsistencyResult {
/// `true` when no rule was violated (as set by the `pass` constructor).
pub passed: bool,
/// Identifiers of the rules that were violated; empty when built via `pass`.
pub failed_rules: Vec<String>,
/// Human-readable summary of the failure; `None` when built via `pass`.
pub detail: Option<String>,
}
impl ConsistencyResult {
    /// Creates the successful outcome: `passed` is set, no rules are listed
    /// and there is no detail message.
    pub fn pass() -> Self {
        Self {
            passed: true,
            failed_rules: Vec::new(),
            detail: None,
        }
    }

    /// Creates a failed outcome.
    ///
    /// `rules` holds the identifiers of the violated rules (each is copied
    /// into an owned `String`), and `detail` is stored as the summary text.
    pub fn fail(rules: Vec<&str>, detail: String) -> Self {
        let failed_rules = rules.into_iter().map(String::from).collect();
        Self {
            passed: false,
            failed_rules,
            detail: Some(detail),
        }
    }
}
/// Rule 1: rejects the "masking" pattern in which exactly three conjuncts
/// pass while the remaining one is `InsufficientData`.
///
/// Returns `Ok(())` for every other combination of statuses, and an `Err`
/// carrying a description when the pattern is detected.
fn check_no_insufficient_data_masking(
    conjunct_statuses: &[ConjunctStatus; 4],
) -> Result<(), String> {
    let mut passing = 0usize;
    let mut lacking_data = 0usize;

    // Tally both categories in a single walk over the four statuses.
    for status in conjunct_statuses.iter() {
        if *status == ConjunctStatus::Pass {
            passing += 1;
        }
        if *status == ConjunctStatus::InsufficientData {
            lacking_data += 1;
        }
    }

    let is_masking = passing == 3 && lacking_data == 1;
    if is_masking {
        Err("One conjunct is insufficient_data while all others pass (masking pattern)".to_string())
    } else {
        Ok(())
    }
}
fn get_source_threshold(source_id: &str) -> Option<Vec<(usize, f64, Option<f64>)>> {
match source_id {
sources::generality::ARC_AGI_2 => {
Some(vec![(0, threshold::generality::ARC_AGI_2_PASS, None)])
}
sources::generality::ARC_AGI_3 => Some(vec![
(
0,
threshold::generality::ARC_AGI_3_PASS,
Some(threshold::generality::ARC_AGI_3_FLOOR),
),
(
2,
threshold::environmental_transfer::ARC_AGI_3_PASS,
Some(threshold::environmental_transfer::ARC_AGI_3_FLOOR),
),
]),
sources::generality::HLE => Some(vec![(0, threshold::generality::HLE_PASS, None)]),
sources::generality::GPQA_DIAMOND => {
Some(vec![(0, threshold::generality::GPQA_DIAMOND_PASS, None)])
}
sources::economic_substitutability::GDPVAL
| sources::economic_substitutability::GDPVAL_AA => Some(vec![(
1,
threshold::economic_substitutability::GDPVAL_PASS,
None,
)]),
sources::economic_substitutability::RLI => Some(vec![(
1,
threshold::economic_substitutability::RLI_PASS,
Some(threshold::economic_substitutability::RLI_FLOOR),
)]),
sources::economic_substitutability::APEX_AGENTS => Some(vec![(
1,
threshold::economic_substitutability::APEX_AGENTS_PASS,
None,
)]),
sources::environmental_transfer::OSWORLD => Some(vec![(
2,
threshold::environmental_transfer::OSWORLD_PASS,
None,
)]),
sources::environmental_transfer::NES => {
None
}
sources::autonomous_agency::METR_80PCT_TIME_HORIZON => Some(vec![(
3,
threshold::autonomous_agency::METR_80PCT_PASS_HOURS,
Some(threshold::autonomous_agency::METR_80PCT_FLOOR_HOURS),
)]),
sources::autonomous_agency::RE_BENCH => {
Some(vec![(3, threshold::autonomous_agency::REBENCH_PASS, None)])
}
sources::autonomous_agency::SWE_BENCH_VERIFIED => Some(vec![(
3,
threshold::autonomous_agency::SWEBENCH_VERIFIED_PASS_AT_5,
None,
)]),
_ => None,
}
}
/// Rule 2: when every conjunct passes, the pass margins of the evidence must
/// not be too unevenly spread.
///
/// A margin is `observed_value / pass_threshold` for each threshold entry
/// returned by `get_source_threshold`; a source with several entries
/// contributes one margin per entry, and sources without entries are ignored.
///
/// Returns `Ok(())` when
/// - at least one conjunct is not `Pass` (the rule does not apply),
/// - no margin could be computed, or
/// - `min_margin >= MARGIN_VARIANCE_RATIO * max_margin`.
///
/// Otherwise returns an `Err` describing the violated bound.
fn check_variance_bound(
    evidence: &[Evidence],
    conjunct_statuses: &[ConjunctStatus; 4],
) -> Result<(), String> {
    let all_pass = conjunct_statuses.iter().all(|s| *s == ConjunctStatus::Pass);
    if !all_pass {
        return Ok(());
    }

    let mut all_margins = Vec::new();
    for e in evidence {
        if let Some(thresholds) = get_source_threshold(e.source.as_str()) {
            // The observed number is the same for every threshold entry of
            // this source, so extract it once instead of once per entry.
            let observed = match e.value {
                SourceValue::Fraction(f) => f.value(),
                SourceValue::Hours(h) => h.value(),
            };
            all_margins.extend(
                thresholds
                    .into_iter()
                    .map(|(_, pass_threshold, _)| observed / pass_threshold),
            );
        }
    }

    if all_margins.is_empty() {
        return Ok(());
    }

    let min_margin = all_margins.iter().cloned().fold(f64::INFINITY, f64::min);
    let max_margin = all_margins
        .iter()
        .cloned()
        .fold(f64::NEG_INFINITY, f64::max);

    let ratio = threshold::consistency::MARGIN_VARIANCE_RATIO;
    if min_margin < ratio * max_margin {
        // Report the ratio that was actually applied rather than a literal,
        // so the message stays accurate if the constant is retuned.
        return Err(format!(
            "Variance bound violated: min_margin ({:.3}) < {} * max_margin ({:.3})",
            min_margin, ratio, max_margin
        ));
    }
    Ok(())
}
/// Rule 3: every piece of evidence must carry complete provenance metadata.
///
/// Returns `Ok(())` when nothing is missing, otherwise an `Err` listing each
/// offending source together with the names of its missing fields.
///
/// NOTE(review): `source_url` is the only field inspected. If it is a parsed
/// `url::Url` (as the unit tests construct it), its string form may never be
/// empty, which would make this rule unable to fail — confirm intent.
fn check_provenance_metadata(evidence: &[Evidence]) -> Result<(), String> {
    let incomplete: Vec<String> = evidence
        .iter()
        .filter_map(|item| {
            // Collect the names of all absent fields for this item.
            let mut absent: Vec<&str> = Vec::new();
            if item.provenance.source_url.as_str().is_empty() {
                absent.push("source_url");
            }

            if absent.is_empty() {
                None
            } else {
                Some(format!(
                    "{} (missing: {})",
                    item.source.as_str(),
                    absent.join(", ")
                ))
            }
        })
        .collect();

    if incomplete.is_empty() {
        Ok(())
    } else {
        Err(format!(
            "Provenance metadata incomplete for: {}",
            incomplete.join("; ")
        ))
    }
}
/// Runs all three consistency rules and aggregates their outcomes.
///
/// Every rule is always evaluated, in order (masking, variance bound,
/// provenance). The result passes only if none of them returned an error;
/// otherwise it lists the identifiers of the violated rules and a summary
/// string naming them. The per-rule error text is not included in the result.
pub fn consistency_check(
    evidence: &[Evidence],
    conjunct_statuses: &[ConjunctStatus; 4],
) -> ConsistencyResult {
    // Pair each rule identifier with the outcome of its check.
    let outcomes = [
        (
            "rule_1_insufficient_data_masking",
            check_no_insufficient_data_masking(conjunct_statuses),
        ),
        (
            "rule_2_variance_bound",
            check_variance_bound(evidence, conjunct_statuses),
        ),
        (
            "rule_3_provenance_metadata",
            check_provenance_metadata(evidence),
        ),
    ];

    let violated: Vec<&str> = outcomes
        .iter()
        .filter(|(_, outcome)| outcome.is_err())
        .map(|(rule_id, _)| *rule_id)
        .collect();

    if violated.is_empty() {
        return ConsistencyResult::pass();
    }

    let detail = format!("Consistency check failed on: {}", violated.join(", "));
    ConsistencyResult::fail(violated, detail)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{
        BoundedFraction, MeasurementId, NonNegativeHours, Provenance, SourceId, SourceValue,
    };
    use chrono::Utc;
    use url::Url;

    /// Builds an `Evidence` fixture for `source` holding `value`, stored as a
    /// fraction when `is_fraction` is true and as hours otherwise.
    fn make_evidence(source: &str, value: f64, is_fraction: bool) -> Evidence {
        let typed_value = if is_fraction {
            SourceValue::Fraction(BoundedFraction::new(value).unwrap())
        } else {
            SourceValue::Hours(NonNegativeHours::new(value).unwrap())
        };
        let provenance = Provenance {
            source_url: Url::parse("https://example.com").unwrap(),
            fetch_timestamp: Utc::now(),
            source_version: Some("1.0".to_string()),
            raw_value: value.to_string(),
        };
        Evidence {
            source: SourceId::new(source),
            measurement: MeasurementId::new("test-measurement"),
            value: typed_value,
            reliability_percentile: 95,
            provenance,
        }
    }

    /// Fraction-valued evidence fixture.
    fn fraction(source: &str, value: f64) -> Evidence {
        make_evidence(source, value, true)
    }

    /// Hours-valued evidence fixture.
    fn hours(source: &str, value: f64) -> Evidence {
        make_evidence(source, value, false)
    }

    /// Status array in which all four conjuncts pass.
    fn all_pass() -> [ConjunctStatus; 4] {
        [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ]
    }

    /// All-pass status array with the given `(slot, status)` replacements.
    fn all_pass_except(overrides: Vec<(usize, ConjunctStatus)>) -> [ConjunctStatus; 4] {
        let mut statuses = all_pass();
        for (slot, status) in overrides {
            statuses[slot] = status;
        }
        statuses
    }

    /// True when `result` lists `rule` among its failed rules.
    fn reports_failure(result: &ConsistencyResult, rule: &str) -> bool {
        result.failed_rules.iter().any(|r| r.as_str() == rule)
    }

    #[test]
    fn rule1_all_pass_with_no_insufficient_data() {
        let statuses = all_pass();
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    #[test]
    fn rule1_all_pass_with_insufficient_data_fails() {
        let statuses = all_pass_except(vec![(3, ConjunctStatus::InsufficientData)]);
        assert!(check_no_insufficient_data_masking(&statuses).is_err());
    }

    #[test]
    fn rule1_not_all_pass_with_insufficient_data_ok() {
        let statuses = all_pass_except(vec![
            (1, ConjunctStatus::Partial),
            (3, ConjunctStatus::InsufficientData),
        ]);
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    #[test]
    fn rule1_not_all_pass_with_fail_and_insufficient_data_ok() {
        let statuses = all_pass_except(vec![
            (0, ConjunctStatus::Fail),
            (3, ConjunctStatus::InsufficientData),
        ]);
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_passes_when_not_all_pass() {
        let statuses = all_pass_except(vec![(1, ConjunctStatus::Partial)]);
        let evidence = vec![fraction("arc-agi-2", 0.95), fraction("arc-agi-3", 0.60)];
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_passes_with_reasonable_margins() {
        let statuses = all_pass();
        let evidence = vec![
            fraction("arc-agi-2", 0.95),
            fraction("gdpval", 0.92),
            fraction("osworld", 0.93),
            fraction("re-bench", 0.80),
        ];
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_fails_with_imbalance_within_type() {
        let statuses = all_pass();
        let evidence = vec![
            fraction("arc-agi-2", 0.40),
            fraction("gdpval", 0.85),
            fraction("osworld", 0.90),
            fraction("re-bench", 0.60),
        ];
        assert!(check_variance_bound(&evidence, &statuses).is_err());
    }

    #[test]
    fn rule2_empty_evidence_passes() {
        let statuses = all_pass();
        let evidence: Vec<Evidence> = Vec::new();
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_unknown_sources_passes() {
        let statuses = all_pass();
        let evidence = vec![fraction("unknown-source", 0.95)];
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule3_complete_provenance_passes() {
        let evidence = vec![fraction("arc-agi-2", 0.95), fraction("gdpval", 0.90)];
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    #[test]
    fn rule3_empty_evidence_passes() {
        let evidence: Vec<Evidence> = Vec::new();
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    #[test]
    fn consistency_check_all_pass_all_rules() {
        let statuses = all_pass();
        let evidence = vec![
            fraction("arc-agi-2", 0.90),
            fraction("gdpval", 0.88),
            fraction("osworld", 0.90),
            fraction("re-bench", 0.75),
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(result.passed, "Expected pass but got: {:?}", result);
        assert!(result.failed_rules.is_empty());
    }

    #[test]
    fn consistency_check_rule1_fails() {
        let statuses = all_pass_except(vec![(3, ConjunctStatus::InsufficientData)]);
        let evidence = vec![fraction("arc-agi-2", 0.95)];
        let result = consistency_check(&evidence, &statuses);
        assert!(!result.passed);
        assert!(reports_failure(&result, "rule_1_insufficient_data_masking"));
    }

    #[test]
    fn consistency_check_rule2_passes_with_balanced_sources() {
        let statuses = all_pass();
        let evidence = vec![
            fraction("arc-agi-2", 0.95),
            fraction("gdpval", 0.90),
            fraction("osworld", 0.88),
            hours("metr-80pct-time-horizon", 200.0),
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(
            result.passed,
            "Well-balanced sources should pass variance bound"
        );
    }

    #[test]
    fn consistency_check_rule2_fails_when_fractions_imbalanced() {
        let statuses = all_pass();
        let evidence = vec![
            fraction("arc-agi-2", 0.40),
            fraction("gdpval", 0.851),
            fraction("osworld", 0.90),
            hours("metr-80pct-time-horizon", 200.0),
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(!result.passed);
        assert!(reports_failure(&result, "rule_2_variance_bound"));
    }

    #[test]
    fn consistency_check_partial_or_fail_status_allows_insufficient_data() {
        let statuses = all_pass_except(vec![
            (1, ConjunctStatus::Partial),
            (3, ConjunctStatus::InsufficientData),
        ]);
        let evidence = vec![
            fraction("arc-agi-2", 0.95),
            fraction("gdpval", 0.90),
            fraction("osworld", 0.95),
            hours("metr-80pct-time-horizon", 500.0),
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(result.passed);
    }

    #[test]
    fn variance_bound_fails_with_strong_outlier() {
        let statuses = all_pass();
        let evidence = vec![
            fraction("arc-agi-2", 0.88),
            fraction("gdpval", 0.87),
            fraction("osworld", 0.86),
            hours("metr-80pct-time-horizon", 400.0),
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(
            !result.passed,
            "Evidence with strong outlier should fail variance bound per SPEC §4 rule 2"
        );
        assert!(reports_failure(&result, "rule_2_variance_bound"));
    }
}