Skip to main content

semantic_memory_forge/
bundle.rs

//! Evidence bundle schema.
//!
//! Each causal/effect verification produces an `EvidenceBundle` that captures
//! the full methodological chain from question through estimation and refutation.
//!
//! ## Required Fields (per canonical spec)
//!
//! - causal question
//! - unit definition
//! - treatment specification
//! - outcome specification
//! - covariates/confounders recorded
//! - identification rationale
//! - estimator and estimate
//! - refutations attempted + results
//! - raw receipt / trace / replay handles
17
18use schemars::JsonSchema;
19use serde::{Deserialize, Serialize};
20use stack_ids::{AttemptId, ClaimId, EnvelopeId, TraceCtx, TrialId};
21
22use crate::estimator::EstimatorMeta;
23
/// Opaque identifier for an evidence bundle.
///
/// A newtype over `String`. The wrapped value is public and this type does
/// not validate or interpret it.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)]
pub struct EvidenceBundleId(pub String);
27
28impl EvidenceBundleId {
29    /// Generates a fresh opaque evidence-bundle identifier.
30    pub fn generate() -> Self {
31        Self(uuid::Uuid::new_v4().to_string())
32    }
33
34    /// Wraps an existing identifier string as an evidence-bundle identifier.
35    pub fn new(id: impl Into<String>) -> Self {
36        Self(id.into())
37    }
38
39    /// Returns the underlying identifier string.
40    pub fn as_str(&self) -> &str {
41        &self.0
42    }
43}
44
45impl std::fmt::Display for EvidenceBundleId {
46    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
47        write!(f, "{}", self.0)
48    }
49}
50
/// The causal question being investigated.
///
/// Pairs the free-text question with the unit of analysis it applies to.
/// Both fields are plain strings; this type places no constraint on their
/// content.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct CausalQuestion {
    /// Natural language description of the causal question.
    pub description: String,
    /// The unit of analysis (e.g., "code patch", "configuration change").
    pub unit_definition: String,
}
59
/// Treatment specification.
///
/// Describes the intervention under test together with the baseline it is
/// compared against.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct TreatmentSpec {
    /// What intervention is being tested.
    pub description: String,
    /// Baseline condition description.
    pub baseline_description: String,
    /// Whether paired baseline-vs-patched trials were used.
    ///
    /// This is a declared flag only; nothing in this type ties it to the
    /// trial records stored elsewhere on the bundle.
    pub paired_trials: bool,
}
70
/// Outcome specification.
///
/// Describes what is measured, how it is measured, and what kind of value
/// the measurement yields.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct OutcomeSpec {
    /// What outcome is being measured.
    pub description: String,
    /// Measurement method.
    pub measurement_method: String,
    /// Whether outcome is binary, continuous, ordinal, etc.
    ///
    /// Stored as a free-form string (e.g. "binary", "continuous"); the set
    /// of accepted values is not restricted by this type.
    pub outcome_type: String,
}
81
/// A refutation attempt and its result.
///
/// One entry in `EvidenceBundle::refutations`. The method name is free-form
/// text and the outcome is carried by [`RefutationResult`].
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct RefutationAttempt {
    /// Refutation method (e.g., "placebo_treatment", "random_cause", "subset_data").
    pub method: String,
    /// Result of the refutation attempt.
    pub result: RefutationResult,
    /// Estimator used for the refutation, or `None` when not recorded.
    pub estimator_kind: Option<String>,
    /// Parameters for the refutation as arbitrary JSON, or `None` when not
    /// recorded.
    pub parameters: Option<serde_json::Value>,
}
94
/// Result of a refutation attempt.
///
/// Uses serde's default (externally tagged) enum representation with
/// snake_case variant names: `passed`, `failed`, `inconclusive`, `skipped`.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum RefutationResult {
    /// Refutation did not invalidate the original estimate.
    Passed {
        /// How much the estimate changed under the refutation test, when
        /// known.
        estimate_change: Option<f64>,
    },
    /// Refutation invalidated the original estimate.
    Failed {
        /// Description of how/why the refutation succeeded.
        reason: String,
        /// How much the estimate changed, when known.
        estimate_change: Option<f64>,
    },
    /// Refutation could not be completed; `reason` is free-text explaining
    /// why.
    Inconclusive { reason: String },
    /// Refutation was not attempted (e.g., not applicable); `reason` is
    /// free-text explaining why.
    Skipped { reason: String },
}
116
/// Baseline-vs-patched side for a verification trial.
///
/// Serialized as the snake_case strings `baseline` and `patched`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum VerificationTrialSide {
    /// The trial belongs to the baseline side of the pair.
    Baseline,
    /// The trial belongs to the patched side of the pair.
    Patched,
}
124
/// First-class verification trial record preserved in canonical raw truth.
///
/// One entry in `EvidenceBundle::verification_trials`.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct VerificationTrialRecord {
    /// Trial identifier.
    pub trial_id: TrialId,
    /// Retry lineage / attempt family.
    pub attempt_id: AttemptId,
    /// Which side of the paired family this trial belongs to.
    pub side: VerificationTrialSide,
    /// Whether the trial completed.
    pub completed: bool,
    /// Opaque receipt handles tied to this trial.
    ///
    /// Defaults to an empty list when the key is absent from serialized
    /// input.
    #[serde(default)]
    pub receipt_handles: Vec<String>,
}
140
/// Immutable comparability snapshot for a paired experiment family.
///
/// NOTE(review): "immutable" is a convention for producers and consumers.
/// All fields are public and this type does nothing to prevent mutation.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ComparabilitySnapshot {
    /// Which workload was exercised.
    pub workload_id: String,
    /// Backend family used for execution.
    pub backend_family: String,
    /// Ordered list of checks run for the pair.
    pub selected_checks: Vec<String>,
    /// Effective timeout class.
    pub timeout_class: String,
    /// Sorted execution-affecting flags.
    ///
    /// Sorting is not enforced here; presumably the producer sorts before
    /// storing — confirm against callers.
    pub config_flags: Vec<String>,
    /// Explicit verdict when known; `None` preserves absence rather than inference.
    pub comparable: Option<bool>,
    /// Violations recorded during comparability checking.
    ///
    /// Defaults to an empty list when the key is absent from serialized
    /// input.
    #[serde(default)]
    pub violations: Vec<String>,
}
160
/// First-class refutation artifact preserved in canonical raw truth.
///
/// One entry in `EvidenceBundle::refutation_artifacts`. Every optional field
/// is left out of serialized output when `None` and reads back as `None`
/// when the key is missing.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct RefutationArtifactRecord {
    /// Artifact identifier.
    pub artifact_id: String,
    /// Artifact type / family name.
    pub artifact_type: String,
    /// Trial that emitted the artifact, when available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub trial_id: Option<TrialId>,
    /// Attempt that emitted the artifact, when available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub attempt_id: Option<AttemptId>,
    /// Artifact outcome.
    pub result: RefutationResult,
    /// Effect delta / estimate change when available.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub estimate_delta: Option<f64>,
    /// Debug/detail payload when available.
    ///
    /// Carried as a plain string; any structure inside it is not modelled
    /// by this type.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub details: Option<String>,
}
183
/// Compact lifecycle state that can be projected into memory-visible truth.
///
/// Serialized as the snake_case strings `unverified`, `verified`,
/// `contradicted` and `superseded`. The conditions under which each state
/// applies are not defined in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "snake_case")]
pub enum VerificationLifecycleState {
    /// Serialized as `unverified`.
    Unverified,
    /// Serialized as `verified`.
    Verified,
    /// Serialized as `contradicted`.
    Contradicted,
    /// Serialized as `superseded`.
    Superseded,
}
193
/// Promotion state for the claim represented by this bundle.
///
/// Internally tagged: the variant name is written in snake_case under the
/// `state` key (e.g. `{"state": "eligible"}`), with any variant fields
/// alongside it in the same object.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(tag = "state", rename_all = "snake_case")]
pub enum PromotionState {
    /// Serialized as `not_promoted`.
    NotPromoted,
    /// Serialized as `eligible`.
    Eligible,
    /// Serialized as `blocked`, together with the reason.
    Blocked {
        /// Free-text explanation of why promotion is blocked.
        reason: String,
    },
    /// Serialized as `promoted`; both fields are optional and omitted from
    /// output when `None`.
    Promoted {
        /// Identifier of the promoted version, when recorded.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        version_id: Option<String>,
        /// When the promotion happened, as a string. The format is not
        /// constrained by this type — TODO confirm the expected format.
        #[serde(default, skip_serializing_if = "Option::is_none")]
        promoted_at: Option<String>,
    },
}
210
/// Compact verification summary designed for projection/import visibility.
///
/// The counters are stored values; nothing in this file derives them from
/// the trial or refutation records on the enclosing bundle.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
pub struct VerificationSummary {
    /// Verification lifecycle.
    pub lifecycle_state: VerificationLifecycleState,
    /// Promotion state.
    pub promotion_state: PromotionState,
    /// Number of completed trials contributing to the summary.
    pub completed_trial_count: u32,
    /// Number of passed refutation artifacts.
    pub passed_refutation_count: u32,
    /// Number of failed refutation artifacts.
    pub failed_refutation_count: u32,
    /// Immutable comparability snapshot version when carried by the source.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub comparability_snapshot_version: Option<String>,
    /// Human-readable, source-authored notes.
    ///
    /// Defaults to an empty list when the key is absent from serialized
    /// input.
    #[serde(default)]
    pub notes: Vec<String>,
}
231
/// An evidence bundle capturing the full methodological chain.
///
/// This is the minimal but real evidence-bundle substrate required by the
/// canonical verification pipeline. Every field maps to a requirement in
/// the master delta spec.
///
/// Serialization notes:
/// - Fields marked `#[serde(default)]` may be absent from serialized input
///   and then take their `Default` value.
/// - `Option` fields marked `skip_serializing_if = "Option::is_none"` are
///   left out of serialized output when `None`; the other `Option` fields
///   are written out even when `None`.
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct EvidenceBundle {
    /// Unique bundle identifier.
    pub id: EvidenceBundleId,

    // ── Causal design ──────────────────────────────────────
    /// The causal question being investigated.
    pub question: CausalQuestion,
    /// Treatment specification.
    pub treatment: TreatmentSpec,
    /// Outcome specification.
    pub outcome: OutcomeSpec,
    /// Covariates and confounders recorded.
    pub covariates: Vec<String>,
    /// Identification rationale (why we believe the causal identification is valid).
    pub identification_rationale: String,

    // ── Estimation ─────────────────────────────────────────
    /// Estimator kind (e.g., "diff_in_diff", "propensity_score", "iv").
    pub estimator_kind: String,
    /// Estimator version (semantic versioning or commit hash).
    pub estimator_version: String,
    /// Structured estimator metadata for replay/audit.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub estimator_meta: Option<EstimatorMeta>,
    /// The causal estimate (effect size).
    pub estimate: f64,
    /// Standard error or uncertainty measure.
    pub estimate_uncertainty: Option<f64>,
    /// Confidence in the estimate (0.0 - 1.0).
    ///
    /// The range is documented only; this type does not enforce it.
    pub confidence: f32,
    /// Number of trials / observations.
    ///
    /// Stored independently of `verification_trials`; the two are not kept
    /// in sync by this type.
    pub trial_count: u32,
    /// Whether variance-aware repeated trials were used.
    pub variance_aware: bool,
    /// Explicit paired verification-trial family.
    #[serde(default)]
    pub verification_trials: Vec<VerificationTrialRecord>,
    /// Immutable comparability snapshot for the paired family, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub comparability_snapshot: Option<ComparabilitySnapshot>,

    // ── Refutation ─────────────────────────────────────────
    /// Refutation attempts and their results.
    pub refutations: Vec<RefutationAttempt>,
    /// First-class refutation artifacts with stable identities and lineage.
    #[serde(default)]
    pub refutation_artifacts: Vec<RefutationArtifactRecord>,
    /// Compact verification/promotion summary for projection consumers.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub verification_summary: Option<VerificationSummary>,

    // ── Provenance ─────────────────────────────────────────
    /// Raw receipt handle (opaque reference to the underlying raw data).
    pub raw_receipt_handle: Option<String>,
    /// Trace context for correlation.
    pub trace_ctx: Option<TraceCtx>,
    /// Attempt ID for retry lineage.
    pub attempt_id: Option<AttemptId>,
    /// Trial ID for the specific execution.
    pub trial_id: Option<TrialId>,
    /// Replay handle (linkage to original execution for replay).
    pub replay_handle: Option<String>,
    /// Source envelope ID if this bundle originated from an export.
    pub source_envelope_id: Option<EnvelopeId>,
    /// Claim IDs this bundle provides evidence for.
    pub claim_ids: Vec<ClaimId>,

    // ── Metadata ───────────────────────────────────────────
    /// When the bundle was created.
    ///
    /// Held as a string; `EvidenceBundle::new` fills it with the current UTC
    /// time in RFC 3339 form.
    pub created_at: String,
    /// Comparability snapshot version (immutable once set).
    ///
    /// `VerificationSummary` carries a field of the same name; nothing in
    /// this file keeps the two values in agreement.
    pub comparability_snapshot_version: Option<String>,
    /// Additional metadata as arbitrary JSON.
    pub metadata: Option<serde_json::Value>,
}
313
314impl EvidenceBundle {
315    /// Create a new evidence bundle with required fields.
316    pub fn new(
317        question: CausalQuestion,
318        treatment: TreatmentSpec,
319        outcome: OutcomeSpec,
320        estimator_kind: impl Into<String>,
321        estimator_version: impl Into<String>,
322        estimate: f64,
323    ) -> Self {
324        Self {
325            id: EvidenceBundleId::generate(),
326            question,
327            treatment,
328            outcome,
329            covariates: Vec::new(),
330            identification_rationale: String::new(),
331            estimator_kind: estimator_kind.into(),
332            estimator_version: estimator_version.into(),
333            estimator_meta: None,
334            estimate,
335            estimate_uncertainty: None,
336            confidence: 0.0,
337            trial_count: 0,
338            variance_aware: false,
339            verification_trials: Vec::new(),
340            comparability_snapshot: None,
341            refutations: Vec::new(),
342            refutation_artifacts: Vec::new(),
343            verification_summary: None,
344            raw_receipt_handle: None,
345            trace_ctx: None,
346            attempt_id: None,
347            trial_id: None,
348            replay_handle: None,
349            source_envelope_id: None,
350            claim_ids: Vec::new(),
351            created_at: chrono::Utc::now().to_rfc3339(),
352            comparability_snapshot_version: None,
353            metadata: None,
354        }
355    }
356
357    /// Whether all refutation attempts passed.
358    pub fn all_refutations_passed(&self) -> bool {
359        self.refutations.iter().all(|r| {
360            matches!(
361                r.result,
362                RefutationResult::Passed { .. } | RefutationResult::Skipped { .. }
363            )
364        })
365    }
366
367    /// Whether any refutation attempt failed.
368    pub fn has_failed_refutation(&self) -> bool {
369        self.refutations
370            .iter()
371            .any(|r| matches!(r.result, RefutationResult::Failed { .. }))
372    }
373}
374
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a [`CausalQuestion`] from string slices.
    fn question(description: &str, unit_definition: &str) -> CausalQuestion {
        CausalQuestion {
            description: description.to_owned(),
            unit_definition: unit_definition.to_owned(),
        }
    }

    /// Builds a [`TreatmentSpec`] from string slices and the pairing flag.
    fn treatment(
        description: &str,
        baseline_description: &str,
        paired_trials: bool,
    ) -> TreatmentSpec {
        TreatmentSpec {
            description: description.to_owned(),
            baseline_description: baseline_description.to_owned(),
            paired_trials,
        }
    }

    /// Builds an [`OutcomeSpec`] from string slices.
    fn outcome(description: &str, measurement_method: &str, outcome_type: &str) -> OutcomeSpec {
        OutcomeSpec {
            description: description.to_owned(),
            measurement_method: measurement_method.to_owned(),
            outcome_type: outcome_type.to_owned(),
        }
    }

    /// Builds a refutation attempt that records neither an estimator kind nor
    /// parameters.
    fn bare_attempt(method: &str, result: RefutationResult) -> RefutationAttempt {
        RefutationAttempt {
            method: method.to_owned(),
            result,
            estimator_kind: None,
            parameters: None,
        }
    }

    /// A freshly constructed bundle carries the supplied estimate and starts
    /// with empty/unset verification and refutation state.
    #[test]
    fn evidence_bundle_creation() {
        let fresh = EvidenceBundle::new(
            question("Does patch X fix bug Y?", "code patch"),
            treatment("Apply patch X", "Original code", true),
            outcome("Test suite passes", "test runner", "binary"),
            "diff_in_diff",
            "1.0.0",
            0.85,
        );

        assert!(!fresh.id.as_str().is_empty());
        assert_eq!(fresh.estimate, 0.85);
        assert!(fresh.refutations.is_empty());
        assert!(fresh.verification_trials.is_empty());
        assert!(fresh.refutation_artifacts.is_empty());
        assert!(fresh.estimator_meta.is_none());
        assert!(fresh.verification_summary.is_none());
        assert!(fresh.all_refutations_passed());
        assert!(!fresh.has_failed_refutation());
    }

    /// The refutation predicates follow the attempts as they are appended:
    /// a passing attempt keeps things clean, a failing one flips both.
    #[test]
    fn refutation_tracking() {
        let mut tracked = EvidenceBundle::new(
            question("test", "unit"),
            treatment("t", "b", false),
            outcome("o", "m", "binary"),
            "ols",
            "1.0.0",
            0.5,
        );

        let passing = bare_attempt(
            "placebo_treatment",
            RefutationResult::Passed {
                estimate_change: Some(0.01),
            },
        );
        tracked.refutations.push(passing);
        assert!(tracked.all_refutations_passed());
        assert!(!tracked.has_failed_refutation());

        let failing = bare_attempt(
            "random_cause",
            RefutationResult::Failed {
                reason: "estimate changed significantly".to_owned(),
                estimate_change: Some(0.45),
            },
        );
        tracked.refutations.push(failing);
        assert!(!tracked.all_refutations_passed());
        assert!(tracked.has_failed_refutation());
    }

    /// A minimal bundle survives a JSON string round trip.
    #[test]
    fn serde_roundtrip() {
        let original = EvidenceBundle::new(
            question("q", "u"),
            treatment("t", "b", true),
            outcome("o", "m", "continuous"),
            "iv",
            "2.0.0",
            1.23,
        );

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EvidenceBundle = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.id.as_str(), original.id.as_str());
        assert_eq!(decoded.estimate, original.estimate);
        assert_eq!(decoded.estimator_kind, "iv");
    }

    /// A fully populated bundle serializes its verification records as
    /// nested JSON and reads them back intact.
    #[test]
    fn verification_fields_roundtrip_as_first_class_artifacts() {
        use crate::estimator::{EnvironmentFingerprint, EstimatorKind, EstimatorMeta};

        const ATTEMPT: &str = "attempt-family-1";
        const BASELINE_TRIAL: &str = "trial-baseline-1";
        const PATCHED_TRIAL: &str = "trial-patched-1";
        const SNAPSHOT_VERSION: &str = "bench-a:cargo:short";

        let mut populated = EvidenceBundle::new(
            question(
                "Does the patch improve benchmark latency?",
                "paired benchmark run",
            ),
            treatment("apply patch-123", "baseline checkout", true),
            outcome(
                "benchmark latency drops",
                "criterion benchmark",
                "continuous",
            ),
            "before_after",
            "3.1.4",
            0.27,
        );

        // Causal design.
        populated.covariates = vec![
            "workload:bench-a".to_owned(),
            "backend:cargo".to_owned(),
            "known_threat:cache_warmup".to_owned(),
        ];
        populated.identification_rationale = "same workload, flags, and timeout class".to_owned();

        // Estimation.
        let environment = EnvironmentFingerprint {
            python_version: None,
            package_versions: json!({
                "checks": ["cargo test"],
            }),
            platform: Some("linux".to_owned()),
            env_hash: Some("env-123".to_owned()),
        };
        populated.estimator_meta = Some(EstimatorMeta {
            kind: EstimatorKind::Custom("living_memory_phase5_scorevector".to_owned()),
            version: "3.1.4".to_owned(),
            parameters: json!({
                "weighted_total": 0.27,
            }),
            random_seed: Some(7),
            environment: Some(environment),
            timeout_secs: Some(60),
            failure_mode: None,
            request_schema_version: Some("living_memory.phase5.bundle.v1".to_owned()),
            response_schema_version: Some("semantic_memory_forge.evidence_bundle.v3".to_owned()),
        });
        populated.estimate_uncertainty = Some(0.04);
        populated.confidence = 0.91;
        populated.trial_count = 2;
        populated.variance_aware = true;

        // Paired trials and their comparability snapshot.
        let completed_trial = |trial: &str, side: VerificationTrialSide, receipt: &str| {
            VerificationTrialRecord {
                trial_id: TrialId::new(trial),
                attempt_id: AttemptId::new(ATTEMPT),
                side,
                completed: true,
                receipt_handles: vec![receipt.to_owned()],
            }
        };
        populated.verification_trials = vec![
            completed_trial(
                BASELINE_TRIAL,
                VerificationTrialSide::Baseline,
                "receipt:baseline",
            ),
            completed_trial(
                PATCHED_TRIAL,
                VerificationTrialSide::Patched,
                "receipt:patched",
            ),
        ];
        populated.comparability_snapshot = Some(ComparabilitySnapshot {
            workload_id: "bench-a".to_owned(),
            backend_family: "cargo".to_owned(),
            selected_checks: vec!["cargo test".to_owned()],
            timeout_class: "short".to_owned(),
            config_flags: vec!["--all-features".to_owned()],
            comparable: Some(true),
            violations: Vec::new(),
        });

        // Refutation.
        populated.refutations = vec![RefutationAttempt {
            method: "placebo".to_owned(),
            result: RefutationResult::Passed {
                estimate_change: Some(0.01),
            },
            estimator_kind: Some("living_memory_phase5_scorevector".to_owned()),
            parameters: Some(json!({
                "artifact_id": "placebo-1",
            })),
        }];
        populated.refutation_artifacts = vec![RefutationArtifactRecord {
            artifact_id: "placebo-1".to_owned(),
            artifact_type: "placebo".to_owned(),
            trial_id: Some(TrialId::new(BASELINE_TRIAL)),
            attempt_id: Some(AttemptId::new(ATTEMPT)),
            result: RefutationResult::Passed {
                estimate_change: Some(0.01),
            },
            estimate_delta: Some(0.01),
            details: Some("placebo preserved the null effect".to_owned()),
        }];
        populated.verification_summary = Some(VerificationSummary {
            lifecycle_state: VerificationLifecycleState::Verified,
            promotion_state: PromotionState::Eligible,
            completed_trial_count: 2,
            passed_refutation_count: 1,
            failed_refutation_count: 0,
            comparability_snapshot_version: Some(SNAPSHOT_VERSION.to_owned()),
            notes: vec!["paired verification clean".to_owned()],
        });

        // Provenance and metadata.
        populated.raw_receipt_handle = Some("receipt:store:receipts:123".to_owned());
        populated.trace_ctx = Some(TraceCtx::from_trace_id("trace-ver-001"));
        populated.attempt_id = Some(AttemptId::new(ATTEMPT));
        populated.trial_id = Some(TrialId::new(PATCHED_TRIAL));
        populated.replay_handle = Some("replay://attempt-family-1".to_owned());
        populated.claim_ids = vec![ClaimId::new("claim-ver-001")];
        populated.comparability_snapshot_version = Some(SNAPSHOT_VERSION.to_owned());

        // Serialized shape.
        let encoded = serde_json::to_value(&populated).unwrap();
        assert_eq!(
            encoded["question"]["unit_definition"],
            json!("paired benchmark run")
        );
        assert_eq!(encoded["treatment"]["paired_trials"], json!(true));

        let trial_entries = encoded["verification_trials"]
            .as_array()
            .expect("verification trials should serialize");
        assert_eq!(trial_entries.len(), 2);

        let artifact_entries = encoded["refutation_artifacts"]
            .as_array()
            .expect("refutation artifacts should serialize");
        assert_eq!(artifact_entries.len(), 1);

        assert_eq!(
            encoded["verification_summary"]["promotion_state"]["state"],
            json!("eligible")
        );
        assert_eq!(
            encoded["comparability_snapshot"]["workload_id"],
            json!("bench-a")
        );

        // Round trip back into the typed form.
        let decoded: EvidenceBundle = serde_json::from_value(encoded).unwrap();
        assert_eq!(decoded.question.description, populated.question.description);
        assert_eq!(
            decoded.treatment.description,
            populated.treatment.description
        );
        assert_eq!(decoded.outcome.description, populated.outcome.description);
        assert_eq!(decoded.verification_trials.len(), 2);
        assert_eq!(decoded.refutation_artifacts.len(), 1);

        let summary = decoded
            .verification_summary
            .as_ref()
            .expect("verification summary should roundtrip");
        assert_eq!(
            summary.lifecycle_state,
            VerificationLifecycleState::Verified
        );
        assert_eq!(
            decoded.claim_ids[0].as_str(),
            "claim-ver-001",
            "claim bundles should remain first-class serialized artifacts"
        );
    }
}