Skip to main content

plsql_ir/
fact.rs

1//! Normalized fact schema.
2//!
3//! Every analysis pass emits its results as a stream of
4//! [`Fact`] records sharing one canonical shape. A fact is:
5//!
6//! 1. A stable [`FactId`] — the SHA-256 of the canonical
7//!    serialisation of every other field, so re-emitting the
8//!    same fact under the same inputs produces the same id.
9//! 2. A [`FactKind`] discriminator naming the family it
10//!    belongs to (declaration, reference, edge, opacity, …).
11//! 3. A typed payload — the per-family struct carrying the
12//!    actual evidence.
13//! 4. A [`FactProvenance`] record naming the analysis pass that
14//!    emitted the fact (component name, version, run id).
15//!
16//! Downstream consumers (lineage, doc, SAST, bindings) walk a
17//! `FactStore` and filter by kind. This keeps the engine's
18//! internal wiring loose — passes don't need to know about each
19//! other, only that they emit compatible Facts.
20//!
21//! ## /oracle evidence
22//!
23//! * `DATABASE-REFERENCE.md` PL/SQL Language Reference — the
24//!   fact families (declarations, references, dependency
25//!   edges, dynamic-SQL evidence) trace 1:1 to the PL/SQL
26//!   declaration / reference / call grammar.
27//! * `LOW-LEVEL-CATALOGS.md` Data Dictionary View Families —
28//!   each fact family has a server-side mirror
29//!   (`ALL_OBJECTS` for declarations, `ALL_DEPENDENCIES` for
30//!   edges, `ALL_SOURCE.WRAPPED` for the wrapped-source
31//!   opacity fact, …).
32
33use serde::{Deserialize, Serialize};
34use sha2::{Digest, Sha256};
35
36use crate::DeclId;
37use crate::flow::{ConstantValue, StringShape, TaintCleanser, TaintKind, ValueSet};
38
/// Stable identity for a fact — `fact:<hex>` form.
///
/// Ids minted by [`mint_fact`] consist of the literal prefix `fact:`
/// followed by the lowercase hex SHA-256 digest of the fact's kind,
/// provenance, and payload. The inner string is public and is not
/// validated here, so values built by hand or deserialized from elsewhere
/// may not follow that shape.
#[derive(Clone, Debug, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
pub struct FactId(pub String);
42
/// What family a fact belongs to. Drives consumer dispatch
/// without having to match the payload.
///
/// Each variant pairs with the [`FactPayload`] variant of the same name
/// (see [`FactPayload::kind`]). Variants serialize in `snake_case`, e.g.
/// `DependencyEdge` becomes `"dependency_edge"`.
///
/// The serialized name is hashed into every [`FactId`], and the derived
/// `Ord` follows declaration order, so renaming or reordering variants
/// changes observable behaviour.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FactKind {
    /// A declaration, identified by a `DeclId` and a logical id.
    Declaration,
    /// A reference from a declaration to a logical id.
    Reference,
    /// A directed edge between two logical ids, labelled with an edge kind.
    DependencyEdge,
    /// Evidence recorded at a dynamic-SQL site.
    DynamicSqlEvidence,
    /// An object paired with a database link name.
    DbLinkReference,
    /// A target flagged as opaque, together with the reason.
    Opacity,
    /// A reference paired with the strategy reported for it.
    ResolutionReport,
    /// A grantee / privilege / object triple.
    Privilege,
    /// A compile-time constant value from the flow lattice.
    ConstantValue,
    /// A bounded set of possible values from the flow lattice.
    ValueSet,
    /// The string-shape abstraction of a variable/expression.
    StringShape,
    /// Live taint kinds flowing into a variable/expression.
    Taint,
    /// Sanitizers observed in a value's derivation.
    Sanitizer,
    /// An exception handler and the class of its body (QUAL001 / QUAL004).
    ExceptionHandler,
    /// A cursor `FOR` loop (PERF001 / PERF002).
    CursorForLoop,
    /// A routine with no recognized instrumentation call (STYLE001).
    MissingInstrumentation,
    /// A string literal in a credential context (SEC003).
    HardcodedCredential,
    /// A unit declared `AUTHID CURRENT_USER` (SEC004).
    InvokerRights,
    /// A function returning a REF CURSOR (SEC007).
    RefCursorReturn,
    /// A function whose body performs row-level DML (QUAL007).
    DmlInFunction,
    /// A `BULK COLLECT INTO` without `LIMIT` (QUAL003).
    UnboundedBulkCollect,
    /// A deprecated / legacy construct (QUAL005).
    DeprecatedFeature,
    /// A `DETERMINISTIC` function with a non-deterministic body (QUAL008).
    DeterministicMisuse,
    /// A row-level trigger touching its own base table (QUAL006).
    MutatingTableTrigger,
    /// A handler that logs but neither re-raises nor signals (QUAL002).
    LogWithoutReraise,
    /// DML against an object in another schema (DEP001).
    CrossSchemaWrite,
    /// A public synonym matching a sensitivity heuristic (SEC005).
    SensitivePublicSynonym,
    /// `IS NULL` on a column indexed in the same source (PERF003).
    IsNullOnIndexedColumn,
}
77
/// A fact record. Wraps a typed payload with stable id + family
/// + provenance.
///
/// Use [`mint_fact`] to build one: it derives `kind` from the payload and
/// computes `id` from the other three fields. All fields are public, so a
/// value built by hand or deserialized from external data is not checked
/// for `id` / `kind` consistency with its payload.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct Fact {
    /// Content-derived identity (`fact:<hex>`); also the dedup key used by
    /// [`FactStore::push`].
    pub id: FactId,
    /// Family tag; `mint_fact` sets it to `payload.kind()`.
    pub kind: FactKind,
    /// Which analysis pass emitted the fact.
    pub provenance: FactProvenance,
    /// The per-family evidence.
    pub payload: FactPayload,
}
87
/// Provenance — which analysis pass produced the fact and at what
/// component version, plus optional run and source attribution.
///
/// No timestamp is stored; `run_id` is the only link to a particular run.
/// The serialized provenance is hashed into the [`FactId`], so the same
/// payload emitted with different provenance yields a different id.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct FactProvenance {
    /// Name of the emitting component (the tests use `"plsql-lineage"`).
    pub component: String,
    /// Version string of the emitting component.
    pub component_version: String,
    /// Stable run id from the engine's session — empty when the
    /// fact was minted by a one-shot CLI. An empty value is omitted from
    /// the serialized form and restored as empty on deserialization.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub run_id: String,
    /// Optional logical source of the fact. For source-derived facts this is
    /// usually the unit/object logical id; for catalog-derived project facts it
    /// names the object or catalog row the fact came from.
    /// Omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_logical_id: Option<String>,
    /// Optional project-relative source file or catalog artifact that produced
    /// this fact. Consumers should prefer this over their current iteration
    /// unit when presenting fact-derived findings.
    /// Omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_file: Option<String>,
}
109
110impl FactProvenance {
111    #[must_use]
112    pub fn with_source(
113        mut self,
114        source_logical_id: impl Into<String>,
115        source_file: impl Into<String>,
116    ) -> Self {
117        self.source_logical_id = Some(source_logical_id.into());
118        self.source_file = Some(source_file.into());
119        self
120    }
121}
122
/// Discriminated payload — one variant per `FactKind`. The
/// per-family types are intentionally lightweight; consumers
/// that need richer detail re-fetch from the originating crate's
/// model (e.g. lineage's `LineageResult`).
///
/// Serialized internally tagged: the variant name appears as a
/// `snake_case` `family` field next to the variant's own fields. The
/// serialized form is hashed into the [`FactId`], so renaming or
/// reordering fields changes the ids of newly minted facts.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "family", rename_all = "snake_case")]
pub enum FactPayload {
    /// A declaration: `decl` is its `DeclId`, `logical_id` its logical
    /// name (the tests use `"hr.foo"`).
    Declaration {
        decl: DeclId,
        logical_id: String,
    },
    /// A reference from the declaration `from_decl` to the object named
    /// by `to_logical_id`.
    Reference {
        from_decl: DeclId,
        to_logical_id: String,
    },
    /// A directed edge between two logical ids. `edge_kind` is a free-form
    /// label (the tests use `"Calls"` and `"Reads"`); the full vocabulary
    /// is not defined in this file.
    DependencyEdge {
        from_logical_id: String,
        to_logical_id: String,
        edge_kind: String,
    },
    /// Evidence of dynamic SQL at `site`.
    /// NOTE(review): the format of `site` is not shown here — confirm
    /// against the emitting pass.
    DynamicSqlEvidence {
        site: String,
    },
    /// An `object` paired with the `db_link` it is associated with.
    /// NOTE(review): presumably `object@db_link` split into parts — confirm.
    DbLinkReference {
        object: String,
        db_link: String,
    },
    /// Marks `target_logical_id` as opaque, with a free-form `reason`.
    /// The module docs cite wrapped source as one such case.
    Opacity {
        target_logical_id: String,
        reason: String,
    },
    /// A `reference` paired with the `strategy` reported for it.
    /// NOTE(review): presumably the name-resolution strategy that was
    /// applied — confirm against the emitting pass.
    ResolutionReport {
        reference: String,
        strategy: String,
    },
    /// A `privilege` held by `grantee` on the object `on`.
    Privilege {
        grantee: String,
        privilege: String,
        on: String,
    },
    /// A variable/expression with a compile-time constant value in the
    /// flow lattice. `name` is normalized to the analyzer's semantic
    /// identifier for that value inside `unit_logical_id`.
    ConstantValue {
        unit_logical_id: String,
        name: String,
        value: ConstantValue,
    },
    /// A variable/expression whose possible values are bounded by
    /// the flow lattice. `ValueSet::Top` is intentionally not
    /// emitted: absence of this fact means "unbounded/unmeasured",
    /// not safety.
    ValueSet {
        unit_logical_id: String,
        name: String,
        value_set: ValueSet,
    },
    /// The string-shape abstraction for a variable/expression.
    /// SAST and lineage can consume this without re-walking the raw
    /// expression tree.
    StringShape {
        unit_logical_id: String,
        name: String,
        shape: StringShape,
    },
    /// Live taint kinds flowing into a variable/expression.
    Taint {
        unit_logical_id: String,
        name: String,
        kinds: Vec<TaintKind>,
    },
    /// Sanitizers observed in a value's derivation. This is evidence
    /// for reports; it is never proof of safety by itself.
    Sanitizer {
        unit_logical_id: String,
        name: String,
        cleansed_by: Vec<TaintCleanser>,
    },
    /// An `EXCEPTION WHEN ... THEN ...` handler. `scope` is the
    /// caught condition (`others` or a named exception); `body_class`
    /// classifies the handler body so syntactic rules can decide
    /// without re-parsing: `noop` (only `NULL;` — QUAL001 swallowed
    /// exception), `commit` / `rollback` (QUAL004 transaction control
    /// in a handler), or `other`.
    ExceptionHandler {
        unit_logical_id: String,
        scope: String,
        body_class: String,
    },
    /// A cursor `FOR` loop (`FOR <var> IN (<query>|<cursor>) LOOP …
    /// END LOOP;`). `has_body_dml` is `true` when the loop body
    /// contains a row-level `INSERT`/`UPDATE`/`DELETE`/`MERGE` —
    /// PERF001 flags any cursor-FOR-loop as a bulk-collect
    /// candidate; PERF002 flags the `has_body_dml` subset as a
    /// `FORALL` candidate. Conservative (R13): an ambiguous shape
    /// yields no fact rather than a wrong one.
    CursorForLoop {
        unit_logical_id: String,
        loop_var: String,
        has_body_dml: bool,
    },
    /// A routine body in which no recognized instrumentation call
    /// (logging / tracing / audit) was found. STYLE001 (opt-in,
    /// per house policy) decides whether that is a finding; the
    /// fact only reports the *absence*, never asserts a violation.
    MissingInstrumentation {
        unit_logical_id: String,
    },
    /// A string literal that is, by strong syntactic context, a
    /// hardcoded secret (`IDENTIFIED BY '…'`, an assignment to a
    /// password/secret/token-named target, or a `password => '…'`
    /// named argument). `marker` records the matched context so
    /// SEC003 can explain the finding. Conservative (R13): only
    /// emitted when a literal directly follows a credential marker.
    HardcodedCredential {
        unit_logical_id: String,
        marker: String,
    },
    /// The unit declares `AUTHID CURRENT_USER` (invoker's rights).
    /// Resolution of privileges is deferred to call time, which
    /// widens the trust surface — SEC004 flags it for review (it is
    /// frequently intentional, so the rule is advisory, not a hard
    /// defect).
    InvokerRights {
        unit_logical_id: String,
    },
    /// A function whose `RETURN` type is a REF CURSOR
    /// (`SYS_REFCURSOR` / `REF CURSOR`). Hands an open cursor to the
    /// caller — a resource-ownership and (when the cursor wrapped
    /// dynamic SQL) injection-amplification concern. SEC007.
    RefCursorReturn {
        unit_logical_id: String,
    },
    /// A `FUNCTION` whose body performs row-level DML
    /// (`INSERT`/`UPDATE`/`DELETE`/`MERGE`). Side-effecting
    /// functions break purity, are unsafe in SQL/parallel/replication
    /// contexts, and surprise callers. QUAL007.
    DmlInFunction {
        unit_logical_id: String,
    },
    /// A `BULK COLLECT INTO` with no `LIMIT` in the same statement —
    /// the entire result set is materialized into PGA memory
    /// unbounded. QUAL003.
    UnboundedBulkCollect {
        unit_logical_id: String,
    },
    /// A well-known deprecated / legacy construct (`feature` names
    /// the match: `dbms_job`, legacy `(+)` outer join, `… work`
    /// transaction-control keyword). QUAL005.
    DeprecatedFeature {
        unit_logical_id: String,
        feature: String,
    },
    /// A function marked `DETERMINISTIC` whose body contains a
    /// non-deterministic construct (DML, query, SYSDATE/
    /// SYSTIMESTAMP, DBMS_RANDOM, sequence `.NEXTVAL`). QUAL008.
    DeterministicMisuse {
        unit_logical_id: String,
        construct: String,
    },
    /// A row-level (`FOR EACH ROW`) trigger whose body references
    /// its own base `table` in a query/DML — ORA-04091 mutating-
    /// table hazard. QUAL006.
    MutatingTableTrigger {
        unit_logical_id: String,
        table: String,
    },
    /// An exception handler that logs (or otherwise instruments)
    /// but neither re-raises nor signals, silently continuing —
    /// the failure is recorded yet swallowed. QUAL002.
    LogWithoutReraise {
        unit_logical_id: String,
    },
    /// A DML statement whose target object is schema-qualified to a
    /// schema other than the unit's own — a cross-schema write
    /// surface. `target` is `schema.object`. DEP001.
    CrossSchemaWrite {
        unit_logical_id: String,
        target: String,
    },
    /// A `CREATE PUBLIC SYNONYM` whose synonym or target name
    /// matches a sensitivity heuristic (credential/PII/finance).
    /// Public synonyms are visible to every account, so exposing a
    /// sensitive object through one widens its reach. SEC005.
    SensitivePublicSynonym {
        unit_logical_id: String,
        synonym: String,
        target: String,
    },
    /// A `<col> IS NULL` predicate on a `column` that the *same
    /// analyzed source* declares an index on (`CREATE INDEX … (col
    /// …)`). A B-tree index does not store all-NULL keys, so the
    /// predicate cannot use that index — a silent full-scan. PERF003.
    /// (Catalog-only indexes are out of this source-level scope.)
    IsNullOnIndexedColumn {
        unit_logical_id: String,
        column: String,
    },
}
322
323impl FactPayload {
324    /// Discriminate the family without matching the full enum.
325    #[must_use]
326    pub fn kind(&self) -> FactKind {
327        match self {
328            FactPayload::Declaration { .. } => FactKind::Declaration,
329            FactPayload::Reference { .. } => FactKind::Reference,
330            FactPayload::DependencyEdge { .. } => FactKind::DependencyEdge,
331            FactPayload::DynamicSqlEvidence { .. } => FactKind::DynamicSqlEvidence,
332            FactPayload::DbLinkReference { .. } => FactKind::DbLinkReference,
333            FactPayload::Opacity { .. } => FactKind::Opacity,
334            FactPayload::ResolutionReport { .. } => FactKind::ResolutionReport,
335            FactPayload::Privilege { .. } => FactKind::Privilege,
336            FactPayload::ConstantValue { .. } => FactKind::ConstantValue,
337            FactPayload::ValueSet { .. } => FactKind::ValueSet,
338            FactPayload::StringShape { .. } => FactKind::StringShape,
339            FactPayload::Taint { .. } => FactKind::Taint,
340            FactPayload::Sanitizer { .. } => FactKind::Sanitizer,
341            FactPayload::ExceptionHandler { .. } => FactKind::ExceptionHandler,
342            FactPayload::CursorForLoop { .. } => FactKind::CursorForLoop,
343            FactPayload::MissingInstrumentation { .. } => FactKind::MissingInstrumentation,
344            FactPayload::HardcodedCredential { .. } => FactKind::HardcodedCredential,
345            FactPayload::InvokerRights { .. } => FactKind::InvokerRights,
346            FactPayload::RefCursorReturn { .. } => FactKind::RefCursorReturn,
347            FactPayload::DmlInFunction { .. } => FactKind::DmlInFunction,
348            FactPayload::UnboundedBulkCollect { .. } => FactKind::UnboundedBulkCollect,
349            FactPayload::DeprecatedFeature { .. } => FactKind::DeprecatedFeature,
350            FactPayload::DeterministicMisuse { .. } => FactKind::DeterministicMisuse,
351            FactPayload::MutatingTableTrigger { .. } => FactKind::MutatingTableTrigger,
352            FactPayload::LogWithoutReraise { .. } => FactKind::LogWithoutReraise,
353            FactPayload::CrossSchemaWrite { .. } => FactKind::CrossSchemaWrite,
354            FactPayload::SensitivePublicSynonym { .. } => FactKind::SensitivePublicSynonym,
355            FactPayload::IsNullOnIndexedColumn { .. } => FactKind::IsNullOnIndexedColumn,
356        }
357    }
358}
359
360/// Build a `Fact` with the canonical id derived from `(kind,
361/// provenance, payload)`. The id is `fact:<hex>` so it doesn't
362/// collide with the `sha256:` namespace other engine bytes use.
363#[must_use]
364pub fn mint_fact(provenance: FactProvenance, payload: FactPayload) -> Fact {
365    let kind = payload.kind();
366    let id = compute_fact_id(kind, &provenance, &payload);
367    Fact {
368        id,
369        kind,
370        provenance,
371        payload,
372    }
373}
374
375fn compute_fact_id(kind: FactKind, provenance: &FactProvenance, payload: &FactPayload) -> FactId {
376    // Canonical serialisation — JSON with sorted keys via
377    // serde_json::to_string (BTreeMap-like determinism is
378    // guaranteed by serde for tagged enums + struct-form
379    // variants; sufficient for fact dedup).
380    let kind_json = serde_json::to_string(&kind).unwrap_or_default();
381    let prov_json = serde_json::to_string(provenance).unwrap_or_default();
382    let payload_json = serde_json::to_string(payload).unwrap_or_default();
383    let mut hasher = Sha256::new();
384    hasher.update(kind_json.as_bytes());
385    hasher.update(b"|");
386    hasher.update(prov_json.as_bytes());
387    hasher.update(b"|");
388    hasher.update(payload_json.as_bytes());
389    let digest = hasher.finalize();
390    let mut hex = String::with_capacity(5 + digest.len() * 2);
391    hex.push_str("fact:");
392    for b in digest {
393        hex.push_str(&format!("{b:02x}"));
394    }
395    FactId(hex)
396}
397
/// Collector of facts — analysis passes push facts in; consumers walk
/// them out.
///
/// [`FactStore::push`] only ever appends, and skips a fact whose id is
/// already present, so facts added through it are unique by id and kept
/// in first-insertion order.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct FactStore {
    /// Stored facts in insertion order. The field is public: writing to it
    /// directly (or deserializing a store) bypasses the id-based dedup
    /// that `push` performs.
    pub facts: Vec<Fact>,
}
404
405impl FactStore {
406    pub fn push(&mut self, fact: Fact) -> FactId {
407        let id = fact.id.clone();
408        if !self.facts.iter().any(|f| f.id.cmp(&id).is_eq()) {
409            self.facts.push(fact);
410        }
411        id
412    }
413
414    /// Filter by family.
415    pub fn by_kind(&self, kind: FactKind) -> impl Iterator<Item = &Fact> {
416        self.facts.iter().filter(move |f| f.kind.cmp(&kind).is_eq())
417    }
418
419    #[must_use]
420    pub fn len(&self) -> usize {
421        self.facts.len()
422    }
423
424    #[must_use]
425    pub fn is_empty(&self) -> bool {
426        self.facts.is_empty()
427    }
428}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Provenance fixture: no run id and no source attribution.
    fn sample_provenance() -> FactProvenance {
        FactProvenance {
            component: "plsql-lineage".into(),
            component_version: "0.1.0".into(),
            run_id: String::new(),
            source_logical_id: None,
            source_file: None,
        }
    }

    /// Dependency-edge payload fixture with the given edge kind.
    fn edge_with_kind(edge_kind: &str) -> FactPayload {
        FactPayload::DependencyEdge {
            from_logical_id: "hr.foo".into(),
            to_logical_id: "hr.bar".into(),
            edge_kind: edge_kind.into(),
        }
    }

    /// Default payload fixture: a `Calls` edge from `hr.foo` to `hr.bar`.
    fn sample_edge() -> FactPayload {
        edge_with_kind("Calls")
    }

    /// Fact minted from the two default fixtures.
    fn sample_fact() -> Fact {
        mint_fact(sample_provenance(), sample_edge())
    }

    #[test]
    fn mint_fact_produces_fact_prefixed_id() {
        let fact = sample_fact();
        assert!(fact.id.0.starts_with("fact:"));
    }

    #[test]
    fn mint_fact_is_deterministic_for_same_inputs() {
        let first = sample_fact();
        let second = sample_fact();
        assert_eq!(first.id, second.id);
    }

    #[test]
    fn mint_fact_changes_id_when_payload_changes() {
        let baseline = sample_fact();
        let changed = mint_fact(sample_provenance(), edge_with_kind("Reads"));
        assert_ne!(baseline.id, changed.id);
    }

    #[test]
    fn mint_fact_changes_id_when_provenance_changes() {
        let baseline = sample_fact();
        let bumped = FactProvenance {
            component_version: "9.9.9".into(),
            ..sample_provenance()
        };
        let changed = mint_fact(bumped, sample_edge());
        assert_ne!(baseline.id, changed.id);
    }

    #[test]
    fn payload_kind_method_returns_matching_family() {
        let fact = sample_fact();
        assert_eq!(fact.kind, FactKind::DependencyEdge);
        assert_eq!(fact.payload.kind(), FactKind::DependencyEdge);
    }

    #[test]
    fn store_pushes_and_dedupes_by_id() {
        let fact = sample_fact();
        let mut store = FactStore::default();
        store.push(fact.clone());
        store.push(fact);
        assert_eq!(store.len(), 1);
    }

    #[test]
    fn store_filters_by_kind() {
        let declaration = mint_fact(
            sample_provenance(),
            FactPayload::Declaration {
                decl: DeclId::new(1),
                logical_id: "hr.foo".into(),
            },
        );
        let edge = sample_fact();

        let mut store = FactStore::default();
        store.push(declaration);
        store.push(edge);

        assert_eq!(store.by_kind(FactKind::Declaration).count(), 1);
        assert_eq!(store.by_kind(FactKind::DependencyEdge).count(), 1);
        assert_eq!(store.by_kind(FactKind::Privilege).count(), 0);
    }

    #[test]
    fn fact_serialises_with_family_tag() {
        let json = serde_json::to_string(&sample_fact()).unwrap();
        assert!(json.contains("\"kind\":\"dependency_edge\""));
        assert!(json.contains("\"family\":\"dependency_edge\""));
        assert!(json.contains("fact:"));
    }

    #[test]
    fn fact_round_trips_through_serde() {
        let original = sample_fact();
        let json = serde_json::to_string(&original).unwrap();
        let decoded: Fact = serde_json::from_str(&json).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn run_id_omitted_when_empty() {
        let json = serde_json::to_string(&sample_fact()).unwrap();
        assert!(!json.contains("\"run_id\""));
    }

    #[test]
    fn source_attribution_omitted_when_absent_and_round_trips_when_present() {
        // Absent: neither key is written.
        let bare_json = serde_json::to_string(&sample_fact()).unwrap();
        assert!(!bare_json.contains("source_logical_id"));
        assert!(!bare_json.contains("source_file"));

        // Present: both keys are written and survive a round trip.
        let attributed = mint_fact(
            sample_provenance().with_source("hr.pkg", "src/hr/pkg.pks"),
            sample_edge(),
        );
        let json = serde_json::to_string(&attributed).unwrap();
        assert!(json.contains("\"source_logical_id\":\"hr.pkg\""));
        assert!(json.contains("\"source_file\":\"src/hr/pkg.pks\""));

        let decoded: Fact = serde_json::from_str(&json).unwrap();
        assert_eq!(
            decoded.provenance.source_logical_id.as_deref(),
            Some("hr.pkg")
        );
        assert_eq!(
            decoded.provenance.source_file.as_deref(),
            Some("src/hr/pkg.pks")
        );
    }

    #[test]
    fn exception_handler_fact_kind_and_serde() {
        let handler = mint_fact(
            sample_provenance(),
            FactPayload::ExceptionHandler {
                unit_logical_id: "hr.pay_pkg.run".into(),
                scope: "others".into(),
                body_class: "noop".into(),
            },
        );
        assert_eq!(handler.kind, FactKind::ExceptionHandler);
        assert_eq!(handler.payload.kind(), FactKind::ExceptionHandler);

        let json = serde_json::to_string(&handler).unwrap();
        assert!(json.contains("\"kind\":\"exception_handler\""));
        assert!(json.contains("\"family\":\"exception_handler\""));

        let decoded: Fact = serde_json::from_str(&json).unwrap();
        assert_eq!(decoded, handler);

        let mut store = FactStore::default();
        store.push(handler);
        assert_eq!(store.by_kind(FactKind::ExceptionHandler).count(), 1);
        assert_eq!(store.by_kind(FactKind::Privilege).count(), 0);
    }
}