Skip to main content

agi4_core/
evaluators.rs

1//! Per-conjunct evaluation functions.
2//!
3//! Each function takes evidence for a conjunct and returns a ConjunctStatus
4//! based on the thresholds defined in SPEC.md §3.
5
6use crate::conjunct::ConjunctStatus;
7use crate::evidence::{Evidence, SourceValue};
8use crate::sources;
9use crate::threshold;
10
11/// Evaluate the Generality conjunct.
12///
13/// Requires: at least 3 of 4 sources (ARC-AGI-2, ARC-AGI-3, HLE, GPQA-Diamond)
14/// with ARC-AGI-3 mandatory.
15///
16/// Pass: all four sources meet thresholds
17/// Partial: at least one meets, at least one doesn't
18/// Fail: no source meets OR ARC-AGI-3 < 5%
19/// InsufficientData: minimum evidence requirement unmet
20pub fn evaluate_generality(evidence: &[Evidence]) -> ConjunctStatus {
21    let mut arc_agi_2 = None;
22    let mut arc_agi_3 = None;
23    let mut hle = None;
24    let mut gpqa_diamond = None;
25
26    for e in evidence {
27        match e.source.as_str() {
28            sources::generality::ARC_AGI_2 => {
29                if let SourceValue::Fraction(f) = e.value {
30                    arc_agi_2 = Some(f);
31                }
32            }
33            sources::generality::ARC_AGI_3 => {
34                if let SourceValue::Fraction(f) = e.value {
35                    arc_agi_3 = Some(f);
36                }
37            }
38            sources::generality::HLE => {
39                if let SourceValue::Fraction(f) = e.value {
40                    hle = Some(f);
41                }
42            }
43            sources::generality::GPQA_DIAMOND => {
44                if let SourceValue::Fraction(f) = e.value {
45                    gpqa_diamond = Some(f);
46                }
47            }
48            _ => {}
49        }
50    }
51
52    // ARC-AGI-3 is mandatory
53    let arc_agi_3 = match arc_agi_3 {
54        Some(f) => f,
55        None => return ConjunctStatus::InsufficientData,
56    };
57
58    // Check ARC-AGI-3 floor
59    if arc_agi_3.value() < threshold::generality::ARC_AGI_3_FLOOR {
60        return ConjunctStatus::Fail;
61    }
62
63    // Count how many sources are available
64    let available_sources = [arc_agi_2.is_some(), hle.is_some(), gpqa_diamond.is_some()]
65        .iter()
66        .filter(|&&x| x)
67        .count()
68        + 1; // +1 for ARC-AGI-3
69
70    // Minimum evidence: at least 3 of 4 sources
71    if available_sources < 3 {
72        return ConjunctStatus::InsufficientData;
73    }
74
75    // Check thresholds
76    let arc_agi_2_pass = arc_agi_2
77        .map(|f| f.value() >= threshold::generality::ARC_AGI_2_PASS)
78        .unwrap_or(false);
79    let arc_agi_3_pass = arc_agi_3.value() >= threshold::generality::ARC_AGI_3_PASS;
80    let hle_pass = hle
81        .map(|f| f.value() >= threshold::generality::HLE_PASS)
82        .unwrap_or(false);
83    let gpqa_pass = gpqa_diamond
84        .map(|f| f.value() >= threshold::generality::GPQA_DIAMOND_PASS)
85        .unwrap_or(false);
86
87    let sources = [
88        (arc_agi_2_pass, arc_agi_2.is_some()),
89        (arc_agi_3_pass, true),
90        (hle_pass, hle.is_some()),
91        (gpqa_pass, gpqa_diamond.is_some()),
92    ];
93
94    let passing = sources
95        .iter()
96        .filter(|(pass, present)| *present && *pass)
97        .count();
98    let present = sources.iter().filter(|(_, present)| *present).count();
99
100    if passing == present && present >= 3 {
101        ConjunctStatus::Pass
102    } else if passing > 0 && passing < present {
103        ConjunctStatus::Partial
104    } else {
105        ConjunctStatus::Fail
106    }
107}
108
109/// Evaluate the Economic Substitutability conjunct.
110///
111/// Requires: both GDPval and RLI (APEX-Agents is supplementary)
112///
113/// Pass: GDPval ≥85% AND RLI ≥60%
114/// Partial: one meets, one doesn't
115/// Fail: neither meets threshold
116/// InsufficientData: missing required sources
117pub fn evaluate_economic_substitutability(evidence: &[Evidence]) -> ConjunctStatus {
118    let mut gdpval = None;
119    let mut rli = None;
120
121    for e in evidence {
122        match e.source.as_str() {
123            sources::economic_substitutability::GDPVAL
124            | sources::economic_substitutability::GDPVAL_AA => {
125                if let SourceValue::Fraction(f) = e.value {
126                    gdpval = Some(f);
127                }
128            }
129            sources::economic_substitutability::RLI => {
130                if let SourceValue::Fraction(f) = e.value {
131                    rli = Some(f);
132                }
133            }
134            sources::economic_substitutability::APEX_AGENTS => {
135                // APEX-Agents is supplementary, not used in logic yet
136            }
137            _ => {}
138        }
139    }
140
141    // Both GDPval and RLI are required
142    let gdpval = match gdpval {
143        Some(f) => f,
144        None => return ConjunctStatus::InsufficientData,
145    };
146    let rli = match rli {
147        Some(f) => f,
148        None => return ConjunctStatus::InsufficientData,
149    };
150
151    // Check for floor on RLI
152    if rli.value() < threshold::economic_substitutability::RLI_FLOOR {
153        return ConjunctStatus::Fail;
154    }
155
156    let gdpval_pass = gdpval.value() >= threshold::economic_substitutability::GDPVAL_PASS;
157    let rli_pass = rli.value() >= threshold::economic_substitutability::RLI_PASS;
158
159    if gdpval_pass && rli_pass {
160        ConjunctStatus::Pass
161    } else if gdpval_pass || rli_pass {
162        ConjunctStatus::Partial
163    } else {
164        ConjunctStatus::Fail
165    }
166}
167
168/// Evaluate the Environmental Transfer conjunct.
169///
170/// Requires: ARC-AGI-3 (mandatory) + at least one of OSWorld or NES
171///
172/// Pass: ARC-AGI-3 ≥50% AND (OSWorld ≥85% OR NES ≥threshold)
173/// Partial: ARC-AGI-3 above floor but below threshold OR ARC-AGI-3 passes but no secondary source
174/// Fail: ARC-AGI-3 < 5%
175/// InsufficientData: ARC-AGI-3 missing or no secondary source
176pub fn evaluate_environmental_transfer(evidence: &[Evidence]) -> ConjunctStatus {
177    let mut arc_agi_3 = None;
178    let mut osworld = None;
179    let mut _nes = None;
180
181    for e in evidence {
182        match e.source.as_str() {
183            sources::environmental_transfer::ARC_AGI_3 => {
184                if let SourceValue::Fraction(f) = e.value {
185                    arc_agi_3 = Some(f);
186                }
187            }
188            sources::environmental_transfer::OSWORLD => {
189                if let SourceValue::Fraction(f) = e.value {
190                    osworld = Some(f);
191                }
192            }
193            sources::environmental_transfer::NES => {
194                if let SourceValue::Fraction(f) = e.value {
195                    _nes = Some(f);
196                }
197            }
198            _ => {}
199        }
200    }
201
202    // ARC-AGI-3 is required
203    let arc_agi_3 = match arc_agi_3 {
204        Some(f) => f,
205        None => return ConjunctStatus::InsufficientData,
206    };
207
208    // Check ARC-AGI-3 floor
209    if arc_agi_3.value() < threshold::environmental_transfer::ARC_AGI_3_FLOOR {
210        return ConjunctStatus::Fail;
211    }
212
213    // In v0.1.x, NES is underspecified and cannot be used for evaluation.
214    // OSWorld is the only valid secondary source. Evidence of NES is accepted
215    // but not evaluated (per SPEC.md §2.3).
216    if osworld.is_none() {
217        return ConjunctStatus::InsufficientData;
218    }
219
220    let arc_agi_3_pass = arc_agi_3.value() >= threshold::environmental_transfer::ARC_AGI_3_PASS;
221    let osworld_pass = osworld
222        .map(|f| f.value() >= threshold::environmental_transfer::OSWORLD_PASS)
223        .unwrap_or(false);
224
225    if arc_agi_3_pass && osworld_pass {
226        ConjunctStatus::Pass
227    } else {
228        // Partial: either source passes individually, or ARC-AGI-3 clears floor.
229        // (Floor is guaranteed by line 209 check, so the condition below always evaluates to true
230        // when we reach this point, ensuring we never reach a Fail state after the floor check.)
231        ConjunctStatus::Partial
232    }
233}
234
235/// Evaluate the Autonomous Agency conjunct.
236///
237/// Requires: METR 80%-time horizon (mandatory) + at least one of RE-Bench or SWE-bench Verified
238///
239/// Pass: METR ≥168h AND (RE-Bench ≥60% OR SWE-bench ≥85%)
240/// Partial: METR ≥168h but no supporting source OR supporting source passes but METR < 168h >= 8h
241/// Fail: METR < 8h
242/// InsufficientData: METR missing or no supporting source
243pub fn evaluate_autonomous_agency(evidence: &[Evidence]) -> ConjunctStatus {
244    let mut metr = None;
245    let mut rebench = None;
246    let mut swebench = None;
247
248    for e in evidence {
249        match e.source.as_str() {
250            sources::autonomous_agency::METR_80PCT_TIME_HORIZON => {
251                if let SourceValue::Hours(h) = e.value {
252                    metr = Some(h);
253                }
254            }
255            sources::autonomous_agency::RE_BENCH => {
256                if let SourceValue::Fraction(f) = e.value {
257                    rebench = Some(f);
258                }
259            }
260            sources::autonomous_agency::SWE_BENCH_VERIFIED => {
261                if let SourceValue::Fraction(f) = e.value {
262                    swebench = Some(f);
263                }
264            }
265            _ => {}
266        }
267    }
268
269    // METR is required
270    let metr = match metr {
271        Some(h) => h,
272        None => return ConjunctStatus::InsufficientData,
273    };
274
275    // Check METR floor
276    if metr.value() < threshold::autonomous_agency::METR_80PCT_FLOOR_HOURS {
277        return ConjunctStatus::Fail;
278    }
279
280    // Need at least one supporting source
281    if rebench.is_none() && swebench.is_none() {
282        return ConjunctStatus::InsufficientData;
283    }
284
285    let metr_pass = metr.value() >= threshold::autonomous_agency::METR_80PCT_PASS_HOURS;
286    let rebench_pass = rebench
287        .map(|f| f.value() >= threshold::autonomous_agency::REBENCH_PASS)
288        .unwrap_or(false);
289    let swebench_pass = swebench
290        .map(|f| f.value() >= threshold::autonomous_agency::SWEBENCH_VERIFIED_PASS_AT_5)
291        .unwrap_or(false);
292
293    if metr_pass && (rebench_pass || swebench_pass) {
294        ConjunctStatus::Pass
295    } else if metr_pass || rebench_pass || swebench_pass {
296        ConjunctStatus::Partial
297    } else {
298        ConjunctStatus::Fail
299    }
300}
301
#[cfg(test)]
mod tests {
    use super::*;
    use crate::evidence::{BoundedFraction, MeasurementId, NonNegativeHours, Provenance, SourceId};
    use chrono::Utc;
    use url::Url;

    // Builds a `SourceValue::Fraction` from a literal; panics if
    // `BoundedFraction::new` rejects it.
    macro_rules! fraction {
        ($raw:expr) => {
            SourceValue::Fraction(BoundedFraction::new($raw).unwrap())
        };
    }

    // Builds a `SourceValue::Hours` from a literal; panics if
    // `NonNegativeHours::new` rejects it.
    macro_rules! hours {
        ($raw:expr) => {
            SourceValue::Hours(NonNegativeHours::new($raw).unwrap())
        };
    }

    /// Builds an `Evidence` record for `source`/`measurement` carrying `value`,
    /// with fixed placeholder reliability and provenance.
    fn make_evidence(source: &str, measurement: &str, value: SourceValue) -> Evidence {
        let provenance = Provenance {
            source_url: Url::parse("https://example.com").unwrap(),
            fetch_timestamp: Utc::now(),
            source_version: Some(String::from("1.0")),
            raw_value: String::from("test"),
        };

        Evidence {
            source: SourceId::new(source),
            measurement: MeasurementId::new(measurement),
            value,
            reliability_percentile: 95,
            provenance,
        }
    }

    // --- Generality -------------------------------------------------------

    #[test]
    fn generality_pass_all_sources() {
        let evidence = [
            make_evidence("arc-agi-2", "pass-rate", fraction!(0.85)),
            make_evidence("arc-agi-3", "pass-rate", fraction!(0.50)),
            make_evidence("hle", "accuracy", fraction!(0.80)),
            make_evidence("gpqa-diamond", "accuracy", fraction!(0.90)),
        ];

        assert_eq!(evaluate_generality(&evidence), ConjunctStatus::Pass);
    }

    #[test]
    fn generality_insufficient_data() {
        // Only the mandatory source is present: below the 3-of-4 minimum.
        let evidence = [make_evidence("arc-agi-3", "pass-rate", fraction!(0.50))];

        assert_eq!(
            evaluate_generality(&evidence),
            ConjunctStatus::InsufficientData
        );
    }

    #[test]
    fn generality_fail_below_floor() {
        let evidence = [
            make_evidence("arc-agi-3", "pass-rate", fraction!(0.03)),
            make_evidence("hle", "accuracy", fraction!(0.80)),
            make_evidence("gpqa-diamond", "accuracy", fraction!(0.90)),
        ];

        assert_eq!(evaluate_generality(&evidence), ConjunctStatus::Fail);
    }

    // --- Economic Substitutability ---------------------------------------

    #[test]
    fn economic_substitutability_pass() {
        let evidence = [
            make_evidence("gdpval", "win-rate", fraction!(0.85)),
            make_evidence("rli", "completion-rate", fraction!(0.60)),
        ];

        assert_eq!(
            evaluate_economic_substitutability(&evidence),
            ConjunctStatus::Pass
        );
    }

    #[test]
    fn economic_substitutability_insufficient_data() {
        // RLI is missing.
        let evidence = [make_evidence("gdpval", "win-rate", fraction!(0.85))];

        assert_eq!(
            evaluate_economic_substitutability(&evidence),
            ConjunctStatus::InsufficientData
        );
    }

    // --- Environmental Transfer ------------------------------------------

    #[test]
    fn environmental_transfer_pass() {
        let evidence = [
            make_evidence("arc-agi-3", "pass-rate", fraction!(0.50)),
            make_evidence("osworld", "completion-rate", fraction!(0.85)),
        ];

        assert_eq!(
            evaluate_environmental_transfer(&evidence),
            ConjunctStatus::Pass
        );
    }

    #[test]
    fn environmental_transfer_insufficient_without_secondary() {
        let evidence = [make_evidence(
            sources::environmental_transfer::ARC_AGI_3,
            "pass-rate",
            fraction!(0.50),
        )];

        assert_eq!(
            evaluate_environmental_transfer(&evidence),
            ConjunctStatus::InsufficientData
        );
    }

    #[test]
    fn environmental_transfer_insufficient_with_nes_only() {
        let evidence = [
            make_evidence(
                sources::environmental_transfer::ARC_AGI_3,
                "pass-rate",
                fraction!(0.50),
            ),
            make_evidence(
                sources::environmental_transfer::NES,
                "completion-rate",
                fraction!(0.90),
            ),
        ];

        assert_eq!(
            evaluate_environmental_transfer(&evidence),
            ConjunctStatus::InsufficientData,
            "NES alone cannot satisfy secondary source requirement in v0.1.x"
        );
    }

    // --- Autonomous Agency -----------------------------------------------

    #[test]
    fn autonomous_agency_pass() {
        let evidence = [
            make_evidence(
                sources::autonomous_agency::METR_80PCT_TIME_HORIZON,
                "hours",
                hours!(168.0),
            ),
            make_evidence(
                sources::autonomous_agency::RE_BENCH,
                "success-rate",
                fraction!(0.60),
            ),
        ];

        assert_eq!(evaluate_autonomous_agency(&evidence), ConjunctStatus::Pass);
    }

    #[test]
    fn autonomous_agency_insufficient_without_supporting() {
        let evidence = [make_evidence(
            sources::autonomous_agency::METR_80PCT_TIME_HORIZON,
            "hours",
            hours!(168.0),
        )];

        assert_eq!(
            evaluate_autonomous_agency(&evidence),
            ConjunctStatus::InsufficientData
        );
    }

    #[test]
    fn autonomous_agency_fail_below_floor() {
        let evidence = [
            make_evidence(
                sources::autonomous_agency::METR_80PCT_TIME_HORIZON,
                "hours",
                hours!(4.0),
            ),
            make_evidence(
                sources::autonomous_agency::RE_BENCH,
                "success-rate",
                fraction!(0.60),
            ),
        ];

        assert_eq!(evaluate_autonomous_agency(&evidence), ConjunctStatus::Fail);
    }
}