use crate::conjunct::ConjunctStatus;
use crate::evidence::{Evidence, SourceValue};
use crate::threshold;
use serde::{Deserialize, Serialize};
/// Outcome of running the consistency rules.
///
/// The constructors in this file produce two shapes: `pass()` yields
/// `passed == true` with no failed rules and no detail, and `fail(..)` yields
/// `passed == false` with the failed rule identifiers and a detail message.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConsistencyResult {
/// `true` when built by `pass()`, `false` when built by `fail(..)`.
pub passed: bool,
/// Identifiers of the rules that failed; empty for a passing result.
pub failed_rules: Vec<String>,
/// Failure description; `None` for a passing result.
pub detail: Option<String>,
}
impl ConsistencyResult {
    /// Builds the passing outcome: `passed` is `true`, there are no failed
    /// rules and no detail message.
    pub fn pass() -> Self {
        let failed_rules = Vec::new();
        Self {
            passed: true,
            failed_rules,
            detail: None,
        }
    }

    /// Builds a failing outcome.
    ///
    /// `rules` holds the identifiers of the failed rules; they are copied into
    /// owned strings in the same order. `detail` is stored as the failure
    /// description.
    pub fn fail(rules: Vec<&str>, detail: String) -> Self {
        let failed_rules: Vec<String> = rules.into_iter().map(str::to_owned).collect();
        Self {
            passed: false,
            failed_rules,
            detail: Some(detail),
        }
    }
}
/// Rule 1: rejects the "masking" pattern in the four conjunct statuses.
///
/// Returns `Err` with a fixed message when exactly three statuses equal
/// `ConjunctStatus::Pass` and exactly one equals
/// `ConjunctStatus::InsufficientData`; every other combination is `Ok(())`.
fn check_no_insufficient_data_masking(
    conjunct_statuses: &[ConjunctStatus; 4],
) -> Result<(), String> {
    // Tally both categories in a single pass over the array.
    let (passing, lacking_data) = conjunct_statuses.iter().fold(
        (0usize, 0usize),
        |(passing, lacking_data), status| {
            (
                passing + usize::from(*status == ConjunctStatus::Pass),
                lacking_data + usize::from(*status == ConjunctStatus::InsufficientData),
            )
        },
    );

    match (passing, lacking_data) {
        (3, 1) => Err(
            "One conjunct is insufficient_data while all others pass (masking pattern)".to_string(),
        ),
        _ => Ok(()),
    }
}
/// Looks up the threshold entries registered for a source id.
///
/// Each entry is `(index, pass_threshold, optional_floor)`. The index is tied
/// to the `threshold` sub-module the constants come from: 0 for `generality`,
/// 1 for `economic_substitutability`, 2 for `environmental_transfer`, 3 for
/// `autonomous_agency`. Presumably it is the position of the matching conjunct
/// in the four-element status array; confirm against the caller.
///
/// Returns `None` for `"nes"` and for any id not listed here.
fn get_source_threshold(source_id: &str) -> Option<Vec<(usize, f64, Option<f64>)>> {
    const GENERALITY: usize = 0;
    const ECONOMIC_SUBSTITUTABILITY: usize = 1;
    const ENVIRONMENTAL_TRANSFER: usize = 2;
    const AUTONOMOUS_AGENCY: usize = 3;

    let entries = match source_id {
        "arc-agi-2" => vec![(GENERALITY, threshold::generality::ARC_AGI_2_PASS, None)],
        // The only source that contributes to two different indices.
        "arc-agi-3" => vec![
            (
                GENERALITY,
                threshold::generality::ARC_AGI_3_PASS,
                Some(threshold::generality::ARC_AGI_3_FLOOR),
            ),
            (
                ENVIRONMENTAL_TRANSFER,
                threshold::environmental_transfer::ARC_AGI_3_PASS,
                Some(threshold::environmental_transfer::ARC_AGI_3_FLOOR),
            ),
        ],
        "hle" => vec![(GENERALITY, threshold::generality::HLE_PASS, None)],
        "gpqa-diamond" => vec![(GENERALITY, threshold::generality::GPQA_DIAMOND_PASS, None)],
        "gdpval" | "gdpval-aa" => vec![(
            ECONOMIC_SUBSTITUTABILITY,
            threshold::economic_substitutability::GDPVAL_PASS,
            None,
        )],
        "rli" => vec![(
            ECONOMIC_SUBSTITUTABILITY,
            threshold::economic_substitutability::RLI_PASS,
            Some(threshold::economic_substitutability::RLI_FLOOR),
        )],
        "apex-agents" => vec![(
            ECONOMIC_SUBSTITUTABILITY,
            threshold::economic_substitutability::APEX_AGENTS_PASS,
            None,
        )],
        "osworld" => vec![(
            ENVIRONMENTAL_TRANSFER,
            threshold::environmental_transfer::OSWORLD_PASS,
            None,
        )],
        // "nes" is listed explicitly but yields no entries, same as an unknown id.
        "nes" => return None,
        "metr-80pct-time-horizon" | "metr-time-horizon-80pct" => vec![(
            AUTONOMOUS_AGENCY,
            threshold::autonomous_agency::METR_80PCT_PASS_HOURS,
            Some(threshold::autonomous_agency::METR_80PCT_FLOOR_HOURS),
        )],
        "re-bench" => vec![(AUTONOMOUS_AGENCY, threshold::autonomous_agency::REBENCH_PASS, None)],
        "swe-bench-verified" | "swe-bench-verified-pass5" => vec![(
            AUTONOMOUS_AGENCY,
            threshold::autonomous_agency::SWEBENCH_VERIFIED_PASS_AT_5,
            None,
        )],
        _ => return None,
    };

    Some(entries)
}
/// Rule 2: bounds how unevenly the evidence clears its pass thresholds.
///
/// The rule only applies when all four conjunct statuses equal
/// `ConjunctStatus::Pass`; otherwise it returns `Ok(())` immediately.
///
/// For every evidence item whose source id has entries in
/// `get_source_threshold`, one margin `value / pass_threshold` is computed per
/// entry. The check fails when the smallest margin is below `VARIANCE_RATIO`
/// times the largest. With no margins at all the check passes.
fn check_variance_bound(
    evidence: &[Evidence],
    conjunct_statuses: &[ConjunctStatus; 4],
) -> Result<(), String> {
    const VARIANCE_RATIO: f64 = 0.5;

    if conjunct_statuses
        .iter()
        .any(|status| *status != ConjunctStatus::Pass)
    {
        return Ok(());
    }

    // One margin per (evidence item, threshold entry) pair; sources without
    // registered thresholds contribute nothing.
    let margins: Vec<f64> = evidence
        .iter()
        .flat_map(|item| {
            get_source_threshold(item.source.as_str())
                .into_iter()
                .flatten()
                .map(move |(_, pass_threshold, _)| {
                    let measured = match item.value {
                        SourceValue::Fraction(fraction) => fraction.value(),
                        SourceValue::Hours(hours) => hours.value(),
                    };
                    measured / pass_threshold
                })
        })
        .collect();

    if margins.is_empty() {
        return Ok(());
    }

    // Track both extremes in one sweep.
    let (min_margin, max_margin) = margins.iter().fold(
        (f64::INFINITY, f64::NEG_INFINITY),
        |(lowest, highest), &margin| (lowest.min(margin), highest.max(margin)),
    );

    if min_margin < VARIANCE_RATIO * max_margin {
        Err(format!(
            "Variance bound violated: min_margin ({:.3}) < 0.5 * max_margin ({:.3})",
            min_margin, max_margin
        ))
    } else {
        Ok(())
    }
}
/// Rule 3: every evidence item must carry its provenance metadata.
///
/// The only field inspected is `provenance.source_url`, which is reported as
/// missing when its string form is empty. On failure the error lists each
/// offending source id with its missing fields, entries separated by `"; "`.
///
/// NOTE(review): if `source_url` is a parsed `url::Url`, its string form is
/// presumably never empty, so this check may never fire. Confirm.
fn check_provenance_metadata(evidence: &[Evidence]) -> Result<(), String> {
    let incomplete: Vec<String> = evidence
        .iter()
        .filter_map(|item| {
            let mut absent_fields: Vec<&str> = Vec::new();
            if item.provenance.source_url.as_str().is_empty() {
                absent_fields.push("source_url");
            }

            if absent_fields.is_empty() {
                None
            } else {
                Some(format!(
                    "{} (missing: {})",
                    item.source.as_str(),
                    absent_fields.join(", ")
                ))
            }
        })
        .collect();

    if incomplete.is_empty() {
        Ok(())
    } else {
        Err(format!(
            "Provenance metadata incomplete for: {}",
            incomplete.join("; ")
        ))
    }
}
/// Runs all three consistency rules and aggregates the outcome.
///
/// Every rule is evaluated, with no short-circuiting, in this order:
/// insufficient-data masking, variance bound, provenance metadata.
///
/// Returns `ConsistencyResult::pass()` when no rule fails. Otherwise returns a
/// failing result whose `failed_rules` lists the identifiers of the failed
/// rules in evaluation order. Its `detail` starts with
/// `"Consistency check failed on: <rule ids>"`, followed by
/// `". Reasons: "` and each rule's own error message tagged with its id, so
/// the diagnostics produced by the individual rules are not lost.
pub fn consistency_check(
    evidence: &[Evidence],
    conjunct_statuses: &[ConjunctStatus; 4],
) -> ConsistencyResult {
    let outcomes = [
        (
            "rule_1_insufficient_data_masking",
            check_no_insufficient_data_masking(conjunct_statuses),
        ),
        (
            "rule_2_variance_bound",
            check_variance_bound(evidence, conjunct_statuses),
        ),
        (
            "rule_3_provenance_metadata",
            check_provenance_metadata(evidence),
        ),
    ];

    let mut failed_rules: Vec<&str> = Vec::new();
    let mut reasons: Vec<String> = Vec::new();
    for (rule, outcome) in &outcomes {
        if let Err(reason) = outcome {
            failed_rules.push(*rule);
            // Keep the rule's own explanation instead of discarding it.
            reasons.push(format!("[{}] {}", rule, reason));
        }
    }

    if failed_rules.is_empty() {
        return ConsistencyResult::pass();
    }

    let detail = format!(
        "Consistency check failed on: {}. Reasons: {}",
        failed_rules.join(", "),
        reasons.join("; ")
    );
    ConsistencyResult::fail(failed_rules, detail)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{
        BoundedFraction, MeasurementId, NonNegativeHours, Provenance, SourceId, SourceValue,
    };
    use chrono::Utc;
    use url::Url;

    /// Builds an `Evidence` fixture for `source` with a fixed measurement id,
    /// reliability percentile and provenance. `is_fraction` selects whether
    /// `value` is wrapped as a fraction or as hours.
    fn make_evidence(source: &str, value: f64, is_fraction: bool) -> Evidence {
        let wrapped = if is_fraction {
            SourceValue::Fraction(BoundedFraction::new(value).unwrap())
        } else {
            SourceValue::Hours(NonNegativeHours::new(value).unwrap())
        };
        Evidence {
            source: SourceId::new(source),
            measurement: MeasurementId::new("test-measurement"),
            value: wrapped,
            reliability_percentile: 95,
            provenance: Provenance {
                source_url: Url::parse("https://example.com").unwrap(),
                fetch_timestamp: Utc::now(),
                source_version: Some("1.0".to_string()),
                raw_value: value.to_string(),
            },
        }
    }

    /// Fraction-valued evidence fixture.
    fn fraction(source: &str, value: f64) -> Evidence {
        make_evidence(source, value, true)
    }

    /// Hours-valued evidence fixture.
    fn hours(source: &str, value: f64) -> Evidence {
        make_evidence(source, value, false)
    }

    /// Four passing conjuncts.
    fn all_pass() -> [ConjunctStatus; 4] {
        [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ]
    }

    /// Three passing conjuncts followed by one with insufficient data.
    fn masked() -> [ConjunctStatus; 4] {
        [
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ]
    }

    /// Whether `result` lists `rule` among its failed rules.
    fn reports(result: &ConsistencyResult, rule: &str) -> bool {
        result.failed_rules.iter().any(|failed| failed == rule)
    }

    // ---- rule 1: insufficient-data masking ----

    #[test]
    fn rule1_all_pass_with_no_insufficient_data() {
        assert!(check_no_insufficient_data_masking(&all_pass()).is_ok());
    }

    #[test]
    fn rule1_all_pass_with_insufficient_data_fails() {
        assert!(check_no_insufficient_data_masking(&masked()).is_err());
    }

    #[test]
    fn rule1_not_all_pass_with_insufficient_data_ok() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    #[test]
    fn rule1_not_all_pass_with_fail_and_insufficient_data_ok() {
        let statuses = [
            ConjunctStatus::Fail,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        assert!(check_no_insufficient_data_masking(&statuses).is_ok());
    }

    // ---- rule 2: variance bound ----

    #[test]
    fn rule2_variance_bound_passes_when_not_all_pass() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::Pass,
        ];
        let evidence = [fraction("arc-agi-2", 0.95), fraction("arc-agi-3", 0.60)];
        assert!(check_variance_bound(&evidence, &statuses).is_ok());
    }

    #[test]
    fn rule2_variance_bound_passes_with_reasonable_margins() {
        let evidence = [
            fraction("arc-agi-2", 0.95),
            fraction("gdpval", 0.92),
            fraction("osworld", 0.93),
            fraction("re-bench", 0.80),
        ];
        assert!(check_variance_bound(&evidence, &all_pass()).is_ok());
    }

    #[test]
    fn rule2_variance_bound_fails_with_extreme_imbalance() {
        let evidence = [
            fraction("arc-agi-2", 0.851),
            fraction("gdpval", 0.851),
            fraction("osworld", 0.851),
            hours("metr-80pct-time-horizon", 8000.0),
        ];
        assert!(check_variance_bound(&evidence, &all_pass()).is_err());
    }

    #[test]
    fn rule2_empty_evidence_passes() {
        assert!(check_variance_bound(&[], &all_pass()).is_ok());
    }

    #[test]
    fn rule2_unknown_sources_passes() {
        let evidence = [fraction("unknown-source", 0.95)];
        assert!(check_variance_bound(&evidence, &all_pass()).is_ok());
    }

    // ---- rule 3: provenance metadata ----

    #[test]
    fn rule3_complete_provenance_passes() {
        let evidence = [fraction("arc-agi-2", 0.95), fraction("gdpval", 0.90)];
        assert!(check_provenance_metadata(&evidence).is_ok());
    }

    #[test]
    fn rule3_empty_evidence_passes() {
        assert!(check_provenance_metadata(&[]).is_ok());
    }

    // ---- consistency_check: aggregation ----

    #[test]
    fn consistency_check_all_pass_all_rules() {
        let evidence = [
            fraction("arc-agi-2", 0.90),
            fraction("gdpval", 0.88),
            fraction("osworld", 0.90),
            fraction("re-bench", 0.75),
        ];
        let result = consistency_check(&evidence, &all_pass());
        assert!(result.passed, "Expected pass but got: {:?}", result);
        assert!(result.failed_rules.is_empty());
    }

    #[test]
    fn consistency_check_rule1_fails() {
        let evidence = [fraction("arc-agi-2", 0.95)];
        let result = consistency_check(&evidence, &masked());
        assert!(!result.passed);
        assert!(reports(&result, "rule_1_insufficient_data_masking"));
    }

    #[test]
    fn consistency_check_rule2_fails() {
        let evidence = [
            fraction("arc-agi-2", 0.99),
            fraction("gdpval", 0.851),
            fraction("osworld", 0.90),
            hours("metr-80pct-time-horizon", 8000.0),
        ];
        let result = consistency_check(&evidence, &all_pass());
        assert!(!result.passed);
        assert!(reports(&result, "rule_2_variance_bound"));
    }

    // NOTE(review): this uses the same fixture and assertions as
    // `consistency_check_rule2_fails`, so only one failing rule is checked.
    // Confirm whether a fixture that trips several rules was intended.
    #[test]
    fn consistency_check_multiple_rules_fail() {
        let evidence = [
            fraction("arc-agi-2", 0.99),
            fraction("gdpval", 0.851),
            fraction("osworld", 0.90),
            hours("metr-80pct-time-horizon", 8000.0),
        ];
        let result = consistency_check(&evidence, &all_pass());
        assert!(!result.passed);
        assert!(reports(&result, "rule_2_variance_bound"));
    }

    #[test]
    fn consistency_check_partial_or_fail_status_allows_insufficient_data() {
        let statuses = [
            ConjunctStatus::Pass,
            ConjunctStatus::Partial,
            ConjunctStatus::Pass,
            ConjunctStatus::InsufficientData,
        ];
        let evidence = [
            fraction("arc-agi-2", 0.95),
            fraction("gdpval", 0.90),
            fraction("osworld", 0.95),
            hours("metr-80pct-time-horizon", 500.0),
        ];
        let result = consistency_check(&evidence, &statuses);
        assert!(result.passed);
    }
}