1use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20use stack_ids::{AttemptId, ClaimId, EnvelopeId, TraceCtx, TrialId};
21
22use crate::estimator::EstimatorMeta;
23
24#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
26pub struct EvidenceBundleId(pub String);
27
28impl EvidenceBundleId {
29 pub fn generate() -> Self {
31 Self(uuid::Uuid::new_v4().to_string())
32 }
33
34 pub fn new(id: impl Into<String>) -> Self {
36 Self(id.into())
37 }
38
39 pub fn as_str(&self) -> &str {
41 &self.0
42 }
43}
44
45impl std::fmt::Display for EvidenceBundleId {
46 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
47 write!(f, "{}", self.0)
48 }
49}
50
51#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
53pub struct CausalQuestion {
54 pub description: String,
56 pub unit_definition: String,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
62pub struct TreatmentSpec {
63 pub description: String,
65 pub baseline_description: String,
67 pub paired_trials: bool,
69}
70
71#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
73pub struct OutcomeSpec {
74 pub description: String,
76 pub measurement_method: String,
78 pub outcome_type: String,
80}
81
82#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
84pub struct RefutationAttempt {
85 pub method: String,
87 pub result: RefutationResult,
89 pub estimator_kind: Option<String>,
91 pub parameters: Option<serde_json::Value>,
93}
94
95#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
97#[serde(rename_all = "snake_case")]
98pub enum RefutationResult {
99 Passed {
101 estimate_change: Option<f64>,
103 },
104 Failed {
106 reason: String,
108 estimate_change: Option<f64>,
110 },
111 Inconclusive { reason: String },
113 Skipped { reason: String },
115}
116
117#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
119#[serde(rename_all = "snake_case")]
120pub enum VerificationTrialSide {
121 Baseline,
122 Patched,
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
127pub struct VerificationTrialRecord {
128 pub trial_id: TrialId,
130 pub attempt_id: AttemptId,
132 pub side: VerificationTrialSide,
134 pub completed: bool,
136 #[serde(default)]
138 pub receipt_handles: Vec<String>,
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
143pub struct ComparabilitySnapshot {
144 pub workload_id: String,
146 pub backend_family: String,
148 pub selected_checks: Vec<String>,
150 pub timeout_class: String,
152 pub config_flags: Vec<String>,
154 pub comparable: Option<bool>,
156 #[serde(default)]
158 pub violations: Vec<String>,
159}
160
161#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
163pub struct RefutationArtifactRecord {
164 pub artifact_id: String,
166 pub artifact_type: String,
168 #[serde(default, skip_serializing_if = "Option::is_none")]
170 pub trial_id: Option<TrialId>,
171 #[serde(default, skip_serializing_if = "Option::is_none")]
173 pub attempt_id: Option<AttemptId>,
174 pub result: RefutationResult,
176 #[serde(default, skip_serializing_if = "Option::is_none")]
178 pub estimate_delta: Option<f64>,
179 #[serde(default, skip_serializing_if = "Option::is_none")]
181 pub details: Option<String>,
182}
183
184#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
186#[serde(rename_all = "snake_case")]
187pub enum VerificationLifecycleState {
188 Unverified,
189 Verified,
190 Contradicted,
191 Superseded,
192}
193
194#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
196#[serde(tag = "state", rename_all = "snake_case")]
197pub enum PromotionState {
198 NotPromoted,
199 Eligible,
200 Blocked {
201 reason: String,
202 },
203 Promoted {
204 #[serde(default, skip_serializing_if = "Option::is_none")]
205 version_id: Option<String>,
206 #[serde(default, skip_serializing_if = "Option::is_none")]
207 promoted_at: Option<String>,
208 },
209}
210
211#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
213pub struct VerificationSummary {
214 pub lifecycle_state: VerificationLifecycleState,
216 pub promotion_state: PromotionState,
218 pub completed_trial_count: u32,
220 pub passed_refutation_count: u32,
222 pub failed_refutation_count: u32,
224 #[serde(default, skip_serializing_if = "Option::is_none")]
226 pub comparability_snapshot_version: Option<String>,
227 #[serde(default)]
229 pub notes: Vec<String>,
230}
231
232#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
238pub struct EvidenceBundle {
239 pub id: EvidenceBundleId,
241
242 pub question: CausalQuestion,
245 pub treatment: TreatmentSpec,
247 pub outcome: OutcomeSpec,
249 pub covariates: Vec<String>,
251 pub identification_rationale: String,
253
254 pub estimator_kind: String,
257 pub estimator_version: String,
259 #[serde(default, skip_serializing_if = "Option::is_none")]
261 pub estimator_meta: Option<EstimatorMeta>,
262 pub estimate: f64,
264 pub estimate_uncertainty: Option<f64>,
266 pub confidence: f32,
268 pub trial_count: u32,
270 pub variance_aware: bool,
272 #[serde(default)]
274 pub verification_trials: Vec<VerificationTrialRecord>,
275 #[serde(default, skip_serializing_if = "Option::is_none")]
277 pub comparability_snapshot: Option<ComparabilitySnapshot>,
278
279 pub refutations: Vec<RefutationAttempt>,
282 #[serde(default)]
284 pub refutation_artifacts: Vec<RefutationArtifactRecord>,
285 #[serde(default, skip_serializing_if = "Option::is_none")]
287 pub verification_summary: Option<VerificationSummary>,
288
289 pub raw_receipt_handle: Option<String>,
292 pub trace_ctx: Option<TraceCtx>,
294 pub attempt_id: Option<AttemptId>,
296 pub trial_id: Option<TrialId>,
298 pub replay_handle: Option<String>,
300 pub source_envelope_id: Option<EnvelopeId>,
302 pub claim_ids: Vec<ClaimId>,
304
305 pub created_at: String,
308 pub comparability_snapshot_version: Option<String>,
310 pub metadata: Option<serde_json::Value>,
312}
313
314impl EvidenceBundle {
315 pub fn new(
317 question: CausalQuestion,
318 treatment: TreatmentSpec,
319 outcome: OutcomeSpec,
320 estimator_kind: impl Into<String>,
321 estimator_version: impl Into<String>,
322 estimate: f64,
323 ) -> Self {
324 Self {
325 id: EvidenceBundleId::generate(),
326 question,
327 treatment,
328 outcome,
329 covariates: Vec::new(),
330 identification_rationale: String::new(),
331 estimator_kind: estimator_kind.into(),
332 estimator_version: estimator_version.into(),
333 estimator_meta: None,
334 estimate,
335 estimate_uncertainty: None,
336 confidence: 0.0,
337 trial_count: 0,
338 variance_aware: false,
339 verification_trials: Vec::new(),
340 comparability_snapshot: None,
341 refutations: Vec::new(),
342 refutation_artifacts: Vec::new(),
343 verification_summary: None,
344 raw_receipt_handle: None,
345 trace_ctx: None,
346 attempt_id: None,
347 trial_id: None,
348 replay_handle: None,
349 source_envelope_id: None,
350 claim_ids: Vec::new(),
351 created_at: chrono::Utc::now().to_rfc3339(),
352 comparability_snapshot_version: None,
353 metadata: None,
354 }
355 }
356
357 pub fn all_refutations_passed(&self) -> bool {
359 self.refutations.iter().all(|r| {
360 matches!(
361 r.result,
362 RefutationResult::Passed { .. } | RefutationResult::Skipped { .. }
363 )
364 })
365 }
366
367 pub fn has_failed_refutation(&self) -> bool {
369 self.refutations
370 .iter()
371 .any(|r| matches!(r.result, RefutationResult::Failed { .. }))
372 }
373}
374
#[cfg(test)]
mod tests {
    use super::*;

    // A freshly constructed bundle carries an id and the supplied estimate,
    // and every optional/list field starts empty.
    #[test]
    fn evidence_bundle_creation() {
        let bundle = EvidenceBundle::new(
            CausalQuestion {
                description: "Does patch X fix bug Y?".into(),
                unit_definition: "code patch".into(),
            },
            TreatmentSpec {
                description: "Apply patch X".into(),
                baseline_description: "Original code".into(),
                paired_trials: true,
            },
            OutcomeSpec {
                description: "Test suite passes".into(),
                measurement_method: "test runner".into(),
                outcome_type: "binary".into(),
            },
            "diff_in_diff",
            "1.0.0",
            0.85,
        );

        assert!(!bundle.id.as_str().is_empty());
        assert_eq!(bundle.estimate, 0.85);
        assert!(bundle.refutations.is_empty());
        assert!(bundle.verification_trials.is_empty());
        assert!(bundle.refutation_artifacts.is_empty());
        assert!(bundle.estimator_meta.is_none());
        assert!(bundle.verification_summary.is_none());
        // With no refutations recorded, "all passed" holds vacuously.
        assert!(bundle.all_refutations_passed());
        assert!(!bundle.has_failed_refutation());
    }

    // The refutation helpers follow the contents of `refutations`.
    #[test]
    fn refutation_tracking() {
        let mut bundle = EvidenceBundle::new(
            CausalQuestion {
                description: "test".into(),
                unit_definition: "unit".into(),
            },
            TreatmentSpec {
                description: "t".into(),
                baseline_description: "b".into(),
                paired_trials: false,
            },
            OutcomeSpec {
                description: "o".into(),
                measurement_method: "m".into(),
                outcome_type: "binary".into(),
            },
            "ols",
            "1.0.0",
            0.5,
        );

        bundle.refutations.push(RefutationAttempt {
            method: "placebo_treatment".into(),
            result: RefutationResult::Passed {
                estimate_change: Some(0.01),
            },
            estimator_kind: None,
            parameters: None,
        });
        assert!(bundle.all_refutations_passed());
        assert!(!bundle.has_failed_refutation());

        // A single failed attempt flips both helpers.
        bundle.refutations.push(RefutationAttempt {
            method: "random_cause".into(),
            result: RefutationResult::Failed {
                reason: "estimate changed significantly".into(),
                estimate_change: Some(0.45),
            },
            estimator_kind: None,
            parameters: None,
        });
        assert!(!bundle.all_refutations_passed());
        assert!(bundle.has_failed_refutation());
    }

    // A minimal bundle survives a JSON string roundtrip.
    #[test]
    fn serde_roundtrip() {
        let bundle = EvidenceBundle::new(
            CausalQuestion {
                description: "q".into(),
                unit_definition: "u".into(),
            },
            TreatmentSpec {
                description: "t".into(),
                baseline_description: "b".into(),
                paired_trials: true,
            },
            OutcomeSpec {
                description: "o".into(),
                measurement_method: "m".into(),
                outcome_type: "continuous".into(),
            },
            "iv",
            "2.0.0",
            1.23,
        );

        let json = serde_json::to_string(&bundle).unwrap();
        let back: EvidenceBundle = serde_json::from_str(&json).unwrap();
        assert_eq!(back.id.as_str(), bundle.id.as_str());
        assert_eq!(back.estimate, bundle.estimate);
        assert_eq!(back.estimator_kind, "iv");
    }

    // A fully populated bundle serializes its verification fields under the
    // expected JSON keys and reads them back intact.
    #[test]
    fn verification_fields_roundtrip_as_first_class_artifacts() {
        let mut bundle = EvidenceBundle::new(
            CausalQuestion {
                description: "Does the patch improve benchmark latency?".into(),
                unit_definition: "paired benchmark run".into(),
            },
            TreatmentSpec {
                description: "apply patch-123".into(),
                baseline_description: "baseline checkout".into(),
                paired_trials: true,
            },
            OutcomeSpec {
                description: "benchmark latency drops".into(),
                measurement_method: "criterion benchmark".into(),
                outcome_type: "continuous".into(),
            },
            "before_after",
            "3.1.4",
            0.27,
        );
        bundle.covariates = vec![
            "workload:bench-a".into(),
            "backend:cargo".into(),
            "known_threat:cache_warmup".into(),
        ];
        bundle.identification_rationale = "same workload, flags, and timeout class".into();
        bundle.estimator_meta = Some(crate::estimator::EstimatorMeta {
            kind: crate::estimator::EstimatorKind::Custom(
                "living_memory_phase5_scorevector".into(),
            ),
            version: "3.1.4".into(),
            parameters: serde_json::json!({
                "weighted_total": 0.27,
            }),
            random_seed: Some(7),
            environment: Some(crate::estimator::EnvironmentFingerprint {
                python_version: None,
                package_versions: serde_json::json!({
                    "checks": ["cargo test"],
                }),
                platform: Some("linux".into()),
                env_hash: Some("env-123".into()),
            }),
            timeout_secs: Some(60),
            failure_mode: None,
            request_schema_version: Some("living_memory.phase5.bundle.v1".into()),
            response_schema_version: Some("semantic_memory_forge.evidence_bundle.v3".into()),
        });
        bundle.estimate_uncertainty = Some(0.04);
        bundle.confidence = 0.91;
        bundle.trial_count = 2;
        bundle.variance_aware = true;
        bundle.verification_trials = vec![
            VerificationTrialRecord {
                trial_id: TrialId::new("trial-baseline-1"),
                attempt_id: AttemptId::new("attempt-family-1"),
                side: VerificationTrialSide::Baseline,
                completed: true,
                receipt_handles: vec!["receipt:baseline".into()],
            },
            VerificationTrialRecord {
                trial_id: TrialId::new("trial-patched-1"),
                attempt_id: AttemptId::new("attempt-family-1"),
                side: VerificationTrialSide::Patched,
                completed: true,
                receipt_handles: vec!["receipt:patched".into()],
            },
        ];
        bundle.comparability_snapshot = Some(ComparabilitySnapshot {
            workload_id: "bench-a".into(),
            backend_family: "cargo".into(),
            selected_checks: vec!["cargo test".into()],
            timeout_class: "short".into(),
            config_flags: vec!["--all-features".into()],
            comparable: Some(true),
            violations: vec![],
        });
        bundle.refutations = vec![RefutationAttempt {
            method: "placebo".into(),
            result: RefutationResult::Passed {
                estimate_change: Some(0.01),
            },
            estimator_kind: Some("living_memory_phase5_scorevector".into()),
            parameters: Some(serde_json::json!({
                "artifact_id": "placebo-1",
            })),
        }];
        bundle.refutation_artifacts = vec![RefutationArtifactRecord {
            artifact_id: "placebo-1".into(),
            artifact_type: "placebo".into(),
            trial_id: Some(TrialId::new("trial-baseline-1")),
            attempt_id: Some(AttemptId::new("attempt-family-1")),
            result: RefutationResult::Passed {
                estimate_change: Some(0.01),
            },
            estimate_delta: Some(0.01),
            details: Some("placebo preserved the null effect".into()),
        }];
        bundle.verification_summary = Some(VerificationSummary {
            lifecycle_state: VerificationLifecycleState::Verified,
            promotion_state: PromotionState::Eligible,
            completed_trial_count: 2,
            passed_refutation_count: 1,
            failed_refutation_count: 0,
            comparability_snapshot_version: Some("bench-a:cargo:short".into()),
            notes: vec!["paired verification clean".into()],
        });
        bundle.raw_receipt_handle = Some("receipt:store:receipts:123".into());
        bundle.trace_ctx = Some(TraceCtx::from_trace_id("trace-ver-001"));
        bundle.attempt_id = Some(AttemptId::new("attempt-family-1"));
        bundle.trial_id = Some(TrialId::new("trial-patched-1"));
        bundle.replay_handle = Some("replay://attempt-family-1".into());
        bundle.claim_ids = vec![ClaimId::new("claim-ver-001")];
        bundle.comparability_snapshot_version = Some("bench-a:cargo:short".into());

        // Check the serialized JSON layout.
        let json = serde_json::to_value(&bundle).unwrap();
        assert_eq!(
            json["question"]["unit_definition"],
            serde_json::json!("paired benchmark run")
        );
        assert_eq!(json["treatment"]["paired_trials"], serde_json::json!(true));
        assert_eq!(
            json["verification_trials"]
                .as_array()
                .expect("verification trials should serialize")
                .len(),
            2
        );
        assert_eq!(
            json["refutation_artifacts"]
                .as_array()
                .expect("refutation artifacts should serialize")
                .len(),
            1
        );
        // `PromotionState` is internally tagged under the "state" key.
        assert_eq!(
            json["verification_summary"]["promotion_state"]["state"],
            serde_json::json!("eligible")
        );
        assert_eq!(
            json["comparability_snapshot"]["workload_id"],
            serde_json::json!("bench-a")
        );

        // Check that the same JSON reads back into an equivalent bundle.
        let back: EvidenceBundle = serde_json::from_value(json).unwrap();
        assert_eq!(back.question.description, bundle.question.description);
        assert_eq!(back.treatment.description, bundle.treatment.description);
        assert_eq!(back.outcome.description, bundle.outcome.description);
        assert_eq!(back.verification_trials.len(), 2);
        assert_eq!(back.refutation_artifacts.len(), 1);
        assert_eq!(
            back.verification_summary
                .expect("verification summary should roundtrip")
                .lifecycle_state,
            VerificationLifecycleState::Verified
        );
        assert_eq!(
            back.claim_ids[0].as_str(),
            "claim-ver-001",
            "claim bundles should remain first-class serialized artifacts"
        );
    }
}