use std::collections::{BTreeMap, BTreeSet};
use serde::{Deserialize, Serialize};
#[derive(
Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize,
)]
pub struct FactId(pub u64);
impl FactId {
#[must_use]
pub const fn is_valid(self) -> bool {
self.0 != 0
}
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct AnalysisSourceSpan {
pub file_id: u32,
pub start_byte: u32,
pub end_byte: u32,
pub start_line: u32,
pub start_column: u32,
pub end_line: u32,
pub end_column: u32,
}
impl AnalysisSourceSpan {
#[must_use]
pub const fn byte_range(file_id: u32, start_byte: u32, end_byte: u32) -> Self {
Self {
file_id,
start_byte,
end_byte,
start_line: 0,
start_column: 0,
end_line: 0,
end_column: 0,
}
}
pub fn validate(&self, context: &str) -> Result<(), AnalysisFactError> {
if self.end_byte < self.start_byte {
return Err(AnalysisFactError::InvalidSpan {
context: context.to_string(),
start_byte: self.start_byte,
end_byte: self.end_byte,
});
}
Ok(())
}
}
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
pub enum FactKind {
Node,
Edge,
Symbol,
Call,
Dataflow,
Control,
Auth,
Sanitizer,
Sink,
Source,
Type,
Lifetime,
Concurrency,
Provenance,
}
impl FactKind {
#[must_use]
pub const fn tag(self) -> u16 {
match self {
Self::Node => 1,
Self::Edge => 2,
Self::Symbol => 3,
Self::Call => 4,
Self::Dataflow => 5,
Self::Control => 6,
Self::Auth => 7,
Self::Sanitizer => 8,
Self::Sink => 9,
Self::Source => 10,
Self::Type => 11,
Self::Lifetime => 12,
Self::Concurrency => 13,
Self::Provenance => 14,
}
}
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct AnalysisFact {
pub id: FactId,
pub kind: FactKind,
pub span: AnalysisSourceSpan,
pub subject: u64,
pub object: Option<u64>,
pub payload: BTreeMap<String, String>,
pub provenance: Vec<FactId>,
pub confidence_bps: u16,
pub reason: String,
}
impl AnalysisFact {
#[must_use]
pub fn exact(id: FactId, kind: FactKind, span: AnalysisSourceSpan, subject: u64) -> Self {
Self {
id,
kind,
span,
subject,
object: None,
payload: BTreeMap::new(),
provenance: Vec::new(),
confidence_bps: 10_000,
reason: "exact-parser-fact".to_string(),
}
}
pub fn validate(&self) -> Result<(), AnalysisFactError> {
if !self.id.is_valid() {
return Err(AnalysisFactError::InvalidFactId { id: self.id });
}
self.span.validate("fact")?;
if self.confidence_bps > 10_000 {
return Err(AnalysisFactError::InvalidConfidence {
id: self.id,
confidence_bps: self.confidence_bps,
});
}
if self.confidence_bps < 10_000 && self.reason.trim().is_empty() {
return Err(AnalysisFactError::MissingInferenceReason { id: self.id });
}
if self.provenance.iter().any(|parent| *parent == self.id) {
return Err(AnalysisFactError::SelfProvenance { id: self.id });
}
for key in self.payload.keys() {
if key.trim().is_empty() {
return Err(AnalysisFactError::InvalidPayloadKey { id: self.id });
}
}
Ok(())
}
}
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct AnalysisFactTable {
pub facts: Vec<AnalysisFact>,
}
impl AnalysisFactTable {
#[must_use]
pub fn new(facts: Vec<AnalysisFact>) -> Self {
Self { facts }
}
pub fn validate(&self) -> Result<(), AnalysisFactError> {
let mut ids = BTreeSet::new();
for fact in &self.facts {
fact.validate()?;
if !ids.insert(fact.id) {
return Err(AnalysisFactError::DuplicateFactId { id: fact.id });
}
}
for fact in &self.facts {
for parent in &fact.provenance {
if !ids.contains(parent) {
return Err(AnalysisFactError::MissingProvenanceParent {
id: fact.id,
parent: *parent,
});
}
}
}
Ok(())
}
pub fn to_columnar(&self) -> Result<AnalysisFactColumns, AnalysisFactError> {
self.validate()?;
let mut facts = self.facts.iter().collect::<Vec<_>>();
facts.sort_by_key(|fact| fact.id);
let mut columns = AnalysisFactColumns::default();
for fact in facts {
columns.ids.push(fact.id.0);
columns.kinds.push(fact.kind.tag());
columns.file_ids.push(fact.span.file_id);
columns.start_bytes.push(fact.span.start_byte);
columns.end_bytes.push(fact.span.end_byte);
columns.subjects.push(fact.subject);
columns.objects.push(fact.object.unwrap_or(0));
columns.confidence_bps.push(fact.confidence_bps);
columns
.payload_digests
.push(payload_digest(&fact.payload, &fact.reason));
columns.provenance_offsets.push(columns.provenance_ids.len() as u32);
columns
.provenance_ids
.extend(fact.provenance.iter().map(|parent| parent.0));
}
columns.provenance_offsets.push(columns.provenance_ids.len() as u32);
Ok(columns)
}
#[must_use]
pub fn contains(&self, id: FactId) -> bool {
self.facts.iter().any(|fact| fact.id == id)
}
#[must_use]
pub fn get(&self, id: FactId) -> Option<&AnalysisFact> {
self.facts.iter().find(|fact| fact.id == id)
}
}
#[derive(Clone, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
pub struct AnalysisFactColumns {
pub ids: Vec<u64>,
pub kinds: Vec<u16>,
pub file_ids: Vec<u32>,
pub start_bytes: Vec<u32>,
pub end_bytes: Vec<u32>,
pub subjects: Vec<u64>,
pub objects: Vec<u64>,
pub confidence_bps: Vec<u16>,
pub payload_digests: Vec<[u8; 32]>,
pub provenance_offsets: Vec<u32>,
pub provenance_ids: Vec<u64>,
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct FindingProofStep {
pub fact_id: FactId,
pub span: AnalysisSourceSpan,
pub role: String,
}
impl FindingProofStep {
#[must_use]
pub fn new(fact_id: FactId, span: AnalysisSourceSpan, role: impl Into<String>) -> Self {
Self {
fact_id,
span,
role: role.into(),
}
}
fn validate(&self, table: &AnalysisFactTable) -> Result<(), AnalysisFactError> {
if !table.contains(self.fact_id) {
return Err(AnalysisFactError::FindingReferencesMissingFact {
finding_id: "<proof-step>".to_string(),
fact_id: self.fact_id,
});
}
if self.role.trim().is_empty() {
return Err(AnalysisFactError::InvalidProofRole {
fact_id: self.fact_id,
});
}
self.span.validate("finding proof step")
}
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct FindingProofBundle {
pub finding_id: String,
pub query_id: String,
pub backend_id: String,
pub evidence_digest: String,
pub fact_ids: Vec<FactId>,
pub proof_path: Vec<FindingProofStep>,
pub confidence_bps: u16,
pub reason: String,
}
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct SourceToSinkFindingRequest {
pub finding_id: String,
pub query_id: String,
pub backend_id: String,
pub evidence_digest: String,
pub source_fact_id: FactId,
pub sink_fact_id: FactId,
pub path_fact_ids: Vec<FactId>,
pub sanitizer_fact_ids: Vec<FactId>,
pub query_hit: u32,
pub confidence_bps: u16,
pub reason: String,
}
impl FindingProofBundle {
pub fn validate_against(&self, table: &AnalysisFactTable) -> Result<(), AnalysisFactError> {
table.validate()?;
if self.finding_id.trim().is_empty() {
return Err(AnalysisFactError::InvalidFindingIdentity {
field: "finding_id",
});
}
if self.query_id.trim().is_empty() {
return Err(AnalysisFactError::InvalidFindingIdentity { field: "query_id" });
}
if self.backend_id.trim().is_empty() {
return Err(AnalysisFactError::InvalidFindingIdentity {
field: "backend_id",
});
}
if self.evidence_digest.trim().is_empty() {
return Err(AnalysisFactError::InvalidFindingIdentity {
field: "evidence_digest",
});
}
if self.reason.trim().is_empty() {
return Err(AnalysisFactError::InvalidFindingIdentity { field: "reason" });
}
if self.confidence_bps > 10_000 {
return Err(AnalysisFactError::InvalidFindingConfidence {
finding_id: self.finding_id.clone(),
confidence_bps: self.confidence_bps,
});
}
if self.fact_ids.is_empty() {
return Err(AnalysisFactError::FindingHasNoFacts {
finding_id: self.finding_id.clone(),
});
}
if self.proof_path.is_empty() {
return Err(AnalysisFactError::FindingHasNoProofPath {
finding_id: self.finding_id.clone(),
});
}
for fact_id in &self.fact_ids {
if !table.contains(*fact_id) {
return Err(AnalysisFactError::FindingReferencesMissingFact {
finding_id: self.finding_id.clone(),
fact_id: *fact_id,
});
}
}
for step in &self.proof_path {
step.validate(table).map_err(|error| match error {
AnalysisFactError::FindingReferencesMissingFact { fact_id, .. } => {
AnalysisFactError::FindingReferencesMissingFact {
finding_id: self.finding_id.clone(),
fact_id,
}
}
other => other,
})?;
}
Ok(())
}
}
pub fn finding_from_sanitized_source_to_sink_query(
table: &AnalysisFactTable,
request: SourceToSinkFindingRequest,
) -> Result<Option<FindingProofBundle>, AnalysisFactError> {
table.validate()?;
let source = require_fact_kind(
table,
request.source_fact_id,
"source",
&[FactKind::Source],
)?;
let sink = require_fact_kind(table, request.sink_fact_id, "sink", &[FactKind::Sink])?;
for fact_id in &request.path_fact_ids {
let _ = require_fact_kind(
table,
*fact_id,
"path",
&[
FactKind::Dataflow,
FactKind::Edge,
FactKind::Call,
FactKind::Control,
],
)?;
}
for fact_id in &request.sanitizer_fact_ids {
let _ = require_fact_kind(table, *fact_id, "sanitizer", &[FactKind::Sanitizer])?;
}
if request.query_hit == 0 {
return Ok(None);
}
let mut fact_ids = Vec::new();
push_unique_fact(&mut fact_ids, source.id);
for fact_id in &request.path_fact_ids {
push_unique_fact(&mut fact_ids, *fact_id);
}
for fact_id in &request.sanitizer_fact_ids {
push_unique_fact(&mut fact_ids, *fact_id);
}
push_unique_fact(&mut fact_ids, sink.id);
let mut proof_path = Vec::new();
proof_path.push(FindingProofStep::new(source.id, source.span.clone(), "source"));
for fact_id in &request.path_fact_ids {
let fact = table
.get(*fact_id)
.expect("validated path fact id must exist");
proof_path.push(FindingProofStep::new(
fact.id,
fact.span.clone(),
"dataflow-path",
));
}
for fact_id in &request.sanitizer_fact_ids {
let fact = table
.get(*fact_id)
.expect("validated sanitizer fact id must exist");
proof_path.push(FindingProofStep::new(
fact.id,
fact.span.clone(),
"sanitizer-considered",
));
}
proof_path.push(FindingProofStep::new(sink.id, sink.span.clone(), "sink"));
let bundle = FindingProofBundle {
finding_id: request.finding_id,
query_id: request.query_id,
backend_id: request.backend_id,
evidence_digest: request.evidence_digest,
fact_ids,
proof_path,
confidence_bps: request.confidence_bps,
reason: request.reason,
};
bundle.validate_against(table)?;
Ok(Some(bundle))
}
fn require_fact_kind<'a>(
table: &'a AnalysisFactTable,
fact_id: FactId,
role: &'static str,
expected: &'static [FactKind],
) -> Result<&'a AnalysisFact, AnalysisFactError> {
let fact = table
.get(fact_id)
.ok_or_else(|| AnalysisFactError::FindingReferencesMissingFact {
finding_id: format!("<{role}>"),
fact_id,
})?;
if !expected.contains(&fact.kind) {
return Err(AnalysisFactError::UnexpectedFactKind {
id: fact_id,
role,
expected: expected
.iter()
.map(|kind| format!("{kind:?}"))
.collect::<Vec<_>>()
.join("|"),
actual: fact.kind,
});
}
Ok(fact)
}
fn push_unique_fact(facts: &mut Vec<FactId>, fact_id: FactId) {
if !facts.contains(&fact_id) {
facts.push(fact_id);
}
}
#[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)]
pub enum AnalysisFactError {
#[error("invalid fact id {id:?}. Fix: assign non-zero stable fact ids before analysis.")]
InvalidFactId {
id: FactId,
},
#[error("duplicate fact id {id:?}. Fix: deduplicate facts before GPU columnar packing.")]
DuplicateFactId {
id: FactId,
},
#[error(
"{context} span has start_byte {start_byte} after end_byte {end_byte}. Fix: normalize parser spans before analysis."
)]
InvalidSpan {
context: String,
start_byte: u32,
end_byte: u32,
},
#[error(
"fact {id:?} confidence {confidence_bps} exceeds 10000. Fix: store confidence in basis points."
)]
InvalidConfidence {
id: FactId,
confidence_bps: u16,
},
#[error("fact {id:?} is inferred but has no reason. Fix: record why the fact is trusted.")]
MissingInferenceReason {
id: FactId,
},
#[error("fact {id:?} lists itself as provenance. Fix: remove cyclic fact derivation.")]
SelfProvenance {
id: FactId,
},
#[error("fact {id:?} has a blank payload key. Fix: normalize payload keys before packing.")]
InvalidPayloadKey {
id: FactId,
},
#[error(
"fact {id:?} references missing provenance parent {parent:?}. Fix: emit parent facts before derived facts."
)]
MissingProvenanceParent {
id: FactId,
parent: FactId,
},
#[error("finding field `{field}` is blank. Fix: findings must be fact-backed and replayable.")]
InvalidFindingIdentity {
field: &'static str,
},
#[error(
"finding `{finding_id}` confidence {confidence_bps} exceeds 10000. Fix: store confidence in basis points."
)]
InvalidFindingConfidence {
finding_id: String,
confidence_bps: u16,
},
#[error("finding `{finding_id}` references no facts. Fix: do not emit LLM-only findings.")]
FindingHasNoFacts {
finding_id: String,
},
#[error("finding `{finding_id}` has no proof path. Fix: include source-to-sink/auth path steps.")]
FindingHasNoProofPath {
finding_id: String,
},
#[error(
"finding `{finding_id}` references missing fact {fact_id:?}. Fix: include all proof facts in the fact table."
)]
FindingReferencesMissingFact {
finding_id: String,
fact_id: FactId,
},
#[error("proof step for fact {fact_id:?} has a blank role. Fix: name each proof step role.")]
InvalidProofRole {
fact_id: FactId,
},
#[error(
"fact {id:?} has kind {actual:?} for role `{role}`, expected {expected}. Fix: normalize analysis facts before query proof emission."
)]
UnexpectedFactKind {
id: FactId,
role: &'static str,
expected: String,
actual: FactKind,
},
}
fn payload_digest(payload: &BTreeMap<String, String>, reason: &str) -> [u8; 32] {
let mut hasher = blake3::Hasher::new();
hash_field(&mut hasher, b"format", b"vyre-analysis-payload-v1");
for (key, value) in payload {
hash_field(&mut hasher, b"key", key.as_bytes());
hash_field(&mut hasher, b"value", value.as_bytes());
}
hash_field(&mut hasher, b"reason", reason.as_bytes());
*hasher.finalize().as_bytes()
}
fn hash_field(hasher: &mut blake3::Hasher, label: &[u8], value: &[u8]) {
hasher.update(&(label.len() as u64).to_le_bytes());
hasher.update(label);
hasher.update(&(value.len() as u64).to_le_bytes());
hasher.update(value);
}
#[cfg(test)]
mod tests {
use super::*;
fn span(offset: u32) -> AnalysisSourceSpan {
AnalysisSourceSpan::byte_range(7, offset, offset + 4)
}
fn fact(id: u64, kind: FactKind, subject: u64) -> AnalysisFact {
AnalysisFact::exact(FactId(id), kind, span(id as u32), subject)
}
fn table() -> AnalysisFactTable {
let mut source = fact(1, FactKind::Source, 10);
source.payload.insert("name".to_string(), "req.user".to_string());
let mut edge = fact(2, FactKind::Dataflow, 10);
edge.object = Some(20);
edge.provenance.push(FactId(1));
let mut sink = fact(3, FactKind::Sink, 20);
sink.payload
.insert("kind".to_string(), "sql.query".to_string());
AnalysisFactTable::new(vec![sink, edge, source])
}
#[test]
fn fact_table_to_columnar_sorts_by_fact_id_and_preserves_provenance_offsets() {
let columns = table()
.to_columnar()
.expect("Fix: canonical fact table should validate and pack");
assert_eq!(columns.ids, vec![1, 2, 3]);
assert_eq!(
columns.kinds,
vec![
FactKind::Source.tag(),
FactKind::Dataflow.tag(),
FactKind::Sink.tag()
]
);
assert_eq!(columns.file_ids, vec![7, 7, 7]);
assert_eq!(columns.subjects, vec![10, 10, 20]);
assert_eq!(columns.objects, vec![0, 20, 0]);
assert_eq!(columns.provenance_offsets, vec![0, 0, 1, 1]);
assert_eq!(columns.provenance_ids, vec![1]);
}
#[test]
fn fact_table_rejects_duplicate_ids() {
let error = AnalysisFactTable::new(vec![
fact(1, FactKind::Source, 1),
fact(1, FactKind::Sink, 2),
])
.validate()
.expect_err("Fix: duplicate fact ids must be rejected");
assert_eq!(error, AnalysisFactError::DuplicateFactId { id: FactId(1) });
}
#[test]
fn fact_table_rejects_missing_provenance_parent() {
let mut derived = fact(2, FactKind::Dataflow, 10);
derived.provenance.push(FactId(99));
let error = AnalysisFactTable::new(vec![fact(1, FactKind::Source, 10), derived])
.validate()
.expect_err("Fix: missing provenance parents must be rejected");
assert_eq!(
error,
AnalysisFactError::MissingProvenanceParent {
id: FactId(2),
parent: FactId(99),
}
);
}
#[test]
fn fact_table_rejects_inferred_fact_without_reason() {
let mut inferred = fact(4, FactKind::Auth, 40);
inferred.confidence_bps = 7500;
inferred.reason.clear();
let error = AnalysisFactTable::new(vec![inferred])
.validate()
.expect_err("Fix: inferred facts need a reason");
assert_eq!(
error,
AnalysisFactError::MissingInferenceReason { id: FactId(4) }
);
}
#[test]
fn finding_proof_bundle_validates_fact_backing_and_proof_path() {
let fact_table = table();
let bundle = FindingProofBundle {
finding_id: "finding.sql.source-to-sink.1".to_string(),
query_id: "vyre-libs::security::flows_to_with_sanitizer".to_string(),
backend_id: "cpu-ref".to_string(),
evidence_digest: "evidence:abc123".to_string(),
fact_ids: vec![FactId(1), FactId(2), FactId(3)],
proof_path: vec![
FindingProofStep::new(FactId(1), span(1), "source"),
FindingProofStep::new(FactId(2), span(2), "dataflow-edge"),
FindingProofStep::new(FactId(3), span(3), "sink"),
],
confidence_bps: 9800,
reason: "source reaches sql sink without sanitizer dominance".to_string(),
};
bundle
.validate_against(&fact_table)
.expect("Fix: fact-backed proof bundle should validate");
}
#[test]
fn finding_proof_bundle_rejects_llm_only_finding_without_facts() {
let fact_table = table();
let bundle = FindingProofBundle {
finding_id: "finding.llm-only".to_string(),
query_id: "manual".to_string(),
backend_id: "cpu-ref".to_string(),
evidence_digest: "evidence:abc123".to_string(),
fact_ids: Vec::new(),
proof_path: vec![FindingProofStep::new(FactId(1), span(1), "source")],
confidence_bps: 5000,
reason: "model guessed from code text".to_string(),
};
let error = bundle
.validate_against(&fact_table)
.expect_err("Fix: factless findings must be rejected");
assert_eq!(
error,
AnalysisFactError::FindingHasNoFacts {
finding_id: "finding.llm-only".to_string(),
}
);
}
#[test]
fn finding_proof_bundle_rejects_missing_fact_reference() {
let fact_table = table();
let bundle = FindingProofBundle {
finding_id: "finding.missing-fact".to_string(),
query_id: "vyre-libs::security::flows_to".to_string(),
backend_id: "cpu-ref".to_string(),
evidence_digest: "evidence:abc123".to_string(),
fact_ids: vec![FactId(1), FactId(42)],
proof_path: vec![FindingProofStep::new(FactId(1), span(1), "source")],
confidence_bps: 9000,
reason: "source reaches sink".to_string(),
};
let error = bundle
.validate_against(&fact_table)
.expect_err("Fix: findings must not reference absent facts");
assert_eq!(
error,
AnalysisFactError::FindingReferencesMissingFact {
finding_id: "finding.missing-fact".to_string(),
fact_id: FactId(42),
}
);
}
}