Skip to main content

uni_locy/
types.rs

1use std::collections::HashMap;
2
3use uni_cypher::ast::{Clause, Expr, Pattern, Query};
4use uni_cypher::locy_ast::{
5    AbduceQuery, AlongBinding, BestByClause, DeriveCommand, ExplainRule, FoldBinding, GoalQuery,
6    RuleCondition, RuleOutput,
7};
8
/// A fully validated and stratified Locy program, ready for the orchestrator.
#[derive(Debug, Clone)]
pub struct CompiledProgram {
    /// Rule groups making up the program. Each [`Stratum`] carries its own
    /// `id` and a `depends_on` list.
    // NOTE(review): presumably stored in evaluation order — confirm against the stratifier.
    pub strata: Vec<Stratum>,
    /// Compiled rules, looked up by a `String` key.
    // NOTE(review): the key is presumably `CompiledRule::name` — confirm.
    pub rule_catalog: HashMap<String, CompiledRule>,
    /// Compiled neural-predicate declarations from `CREATE MODEL`
    /// statements (Phase B preview). Empty unless
    /// `LocyConfig::neural_predicates_preview` is set.
    pub model_catalog: HashMap<String, CompiledModel>,
    /// Non-fatal compiler diagnostics attached to this program.
    pub warnings: Vec<CompilerWarning>,
    /// Compiled non-rule statements belonging to this program.
    pub commands: Vec<CompiledCommand>,
}
21
/// A compiled `CREATE MODEL` declaration (Phase B preview).
///
/// Lowered from `uni_cypher::locy_ast::ModelDefinition`. The feature
/// expressions are kept as Cypher AST; the runtime evaluates them per
/// row in a future slice (`LocyModelInvoke`).
#[derive(Debug, Clone)]
pub struct CompiledModel {
    /// Name of the model, as in `CREATE MODEL <name>`.
    pub name: String,
    /// The model's `INPUT` bindings.
    // NOTE(review): presumably kept in declaration order — confirm.
    pub inputs: Vec<CompiledInputBinding>,
    /// Feature expressions, kept as unevaluated Cypher AST.
    pub features: Vec<uni_cypher::ast::Expr>,
    /// Phase D D3: optional path-context feature `FEATURES (subject, col) FROM rule_name`.
    pub path_context: Option<uni_cypher::locy_ast::PathContextFeature>,
    /// Declared type of the model's output.
    pub output_type: uni_cypher::locy_ast::OutputType,
    /// Declared name of the model's output.
    pub output_name: String,
    /// Xervo alias the model is bound to.
    // NOTE(review): presumably the first argument of `USING xervo('classify/X', ...)` — confirm.
    pub xervo_alias: String,
    /// Phase D D2 follow-up: optional embedder alias surfaced via
    /// `USING xervo('classify/X', embedder='alias')`. When `None`,
    /// `semantic_match` query-text embedding falls back to `"default"`.
    pub embedder_alias: Option<String>,
    /// Declared calibration method, if any.
    // NOTE(review): how an explicit `CALIBRATION none` is represented
    // (`None` vs. a `CalibrationMethod` variant) is not visible here — confirm.
    pub calibration: Option<uni_cypher::locy_ast::CalibrationMethod>,
    /// Optional version string attached to the declaration.
    pub version: Option<String>,
    /// Annotations attached to the declaration.
    // NOTE(review): presumably where `@independent` is recorded — confirm.
    pub annotations: uni_cypher::locy_ast::ModelAnnotations,
}
45
/// One input binding of a [`CompiledModel`] (an element of its `inputs` list).
#[derive(Debug, Clone)]
pub struct CompiledInputBinding {
    /// Name of the bound variable.
    pub variable: String,
    /// Optional label attached to the binding.
    // NOTE(review): presumably a node-label constraint on `variable` — confirm.
    pub label: Option<String>,
}
51
/// A compiled command (non-rule statement) ready for execution.
#[derive(Debug, Clone)]
pub enum CompiledCommand {
    /// Goal query, carried as its `locy_ast` node.
    GoalQuery(GoalQuery),
    /// `ASSUME` block; see [`CompiledAssume`].
    Assume(CompiledAssume),
    /// Abduce query, carried as its `locy_ast` node.
    Abduce(AbduceQuery),
    /// Explain-rule statement, carried as its `locy_ast` node.
    ExplainRule(ExplainRule),
    /// Derive command, carried as its `locy_ast` node.
    DeriveCommand(DeriveCommand),
    /// A Cypher query, carried as its `uni_cypher::ast::Query` node.
    Cypher(Query),
    /// Phase C C2: `CALIBRATE` statement — collects `(pred, label)`
    /// pairs from the MATCH pattern, invokes the registered classifier,
    /// fits the chosen calibrator on a holdout split.
    Calibrate(CompiledCalibrate),
    /// Phase C C3: `VALIDATE` statement — joins a rule's PROB output
    /// against ground truth and reports requested metrics.
    Validate(CompiledValidate),
}
69
/// Compiled `CALIBRATE` command — Phase C C2.
#[derive(Debug, Clone)]
pub struct CompiledCalibrate {
    /// Name of the model being calibrated.
    pub model_name: String,
    /// Pattern of the statement, kept as Cypher AST.
    pub pattern: uni_cypher::ast::Pattern,
    /// Optional filter expression, kept as Cypher AST.
    pub where_expr: Option<uni_cypher::ast::Expr>,
    /// Target expression, kept as Cypher AST.
    // NOTE(review): presumably yields the `label` half of the `(pred, label)` pairs — confirm.
    pub target_expr: uni_cypher::ast::Expr,
    /// Calibration method to fit.
    pub method: uni_cypher::locy_ast::CalibrationMethod,
    /// Resolved holdout fraction (default 0.2 when the source omitted it).
    pub holdout: f64,
}
81
/// Compiled `VALIDATE` command — Phase C C3.
#[derive(Debug, Clone)]
pub struct CompiledValidate {
    /// Name of the rule whose output is validated.
    pub rule_name: String,
    /// Pattern of the statement, kept as Cypher AST.
    pub pattern: uni_cypher::ast::Pattern,
    /// Optional filter expression, kept as Cypher AST.
    pub where_expr: Option<uni_cypher::ast::Expr>,
    /// Target expression, kept as Cypher AST.
    // NOTE(review): presumably yields the ground-truth value compared against the PROB column — confirm.
    pub target_expr: uni_cypher::ast::Expr,
    /// Metrics requested by the statement.
    pub metrics: Vec<uni_cypher::locy_ast::ValidationMetric>,
    /// Name of the rule's PROB column (resolved from `rule_catalog`).
    /// Used by the runtime to find the prediction value in derived facts.
    pub prob_column: String,
}
94
/// A compiled ASSUME block with mutations and body program.
#[derive(Debug, Clone)]
pub struct CompiledAssume {
    /// Mutation clauses of the block, kept as Cypher AST.
    pub mutations: Vec<Clause>,
    /// The block's body, compiled as its own [`CompiledProgram`].
    pub body_program: CompiledProgram,
    /// Compiled commands of the block's body.
    // NOTE(review): `body_program` also has a `commands` field; how the two
    // lists relate is not visible in this file — confirm against the compiler.
    pub body_commands: Vec<CompiledCommand>,
}
102
/// A group of rules that must be evaluated together (one SCC).
#[derive(Debug, Clone)]
pub struct Stratum {
    /// Identifier of this stratum.
    pub id: usize,
    /// Rules belonging to this stratum.
    pub rules: Vec<CompiledRule>,
    /// Whether the stratum is recursive.
    pub is_recursive: bool,
    /// Strata this one depends on.
    // NOTE(review): presumably holds `Stratum::id` values of prerequisite strata — confirm.
    pub depends_on: Vec<usize>,
}
111
/// A named rule with all its clauses merged and validated.
#[derive(Debug, Clone)]
pub struct CompiledRule {
    /// Name of the rule.
    pub name: String,
    /// The rule's clauses, one per `CREATE RULE ... AS ...` definition.
    pub clauses: Vec<CompiledClause>,
    /// Columns of the rule's YIELD schema.
    pub yield_schema: Vec<YieldColumn>,
    /// Optional rule-level priority.
    // NOTE(review): each `CompiledClause` also carries a `priority`; how the
    // two interact is not visible in this file — confirm.
    pub priority: Option<i64>,
}
120
/// A single clause (one CREATE RULE ... AS ... definition).
#[derive(Debug, Clone)]
pub struct CompiledClause {
    /// Pattern of the clause body, kept as Cypher AST.
    pub match_pattern: Pattern,
    /// Conditions of the clause body.
    pub where_conditions: Vec<RuleCondition>,
    /// ALONG bindings of the clause.
    pub along: Vec<AlongBinding>,
    /// FOLD bindings of the clause.
    pub fold: Vec<FoldBinding>,
    /// Post-FOLD filter conditions (HAVING semantics).
    pub having: Vec<Expr>,
    /// Optional best-by clause.
    pub best_by: Option<BestByClause>,
    /// Output of the clause.
    pub output: RuleOutput,
    /// Optional clause-level priority.
    pub priority: Option<i64>,
    /// Phase B Slice 3: neural-model invocations extracted from this
    /// clause's YIELD items (or, in future slices, other body sites).
    /// Each invocation produces a synthetic output column whose values
    /// are filled at runtime by the registered [`crate::NeuralClassifier`];
    /// the original `model(args)` expression in YIELD has been rewritten
    /// to a [`uni_cypher::ast::Expr::Variable`] reference to that column.
    pub model_invocations: Vec<ModelInvocation>,
    /// Column names that the compiler appended to this clause's YIELD
    /// as hidden materialization items (e.g. `"s.tier"` for
    /// `scorer(s.tier)`). They flow through projection/fixpoint to feed
    /// `apply_model_invocations`, then get stripped from the final
    /// `LocyResult` rows by the runtime.
    pub hidden_yield_cols: Vec<String>,
}
147
/// A single neural-model invocation site extracted from a clause body.
///
/// At runtime, after the clause body produces a batch of rows, each
/// invocation evaluates its `feature_exprs` per row, packs them into
/// [`crate::ClassifyInput`]s, calls the classifier in one batched
/// `classify` call, then appends the result vector as a new column
/// `output_column` to the batch.
#[derive(Debug, Clone)]
pub struct ModelInvocation {
    /// Name of the model from `CREATE MODEL <name>`.
    pub model_name: String,
    /// Synthetic column name where the per-row probabilities are
    /// written. Generated as `__model_<name>_<idx>` where `idx` is a
    /// dedup index for repeated invocations of the same model.
    pub output_column: String,
    /// Argument expressions from the invocation — one per declared
    /// `INPUT` binding. Evaluated in clause-body scope to produce the
    /// per-row feature value passed under the binding's `variable` name.
    // NOTE(review): the docs on `original_feature_exprs` describe these as
    // *rewritten* expressions carrying synthetic-column references, which does
    // not obviously match "argument expressions evaluated in clause-body
    // scope" — confirm which description is current.
    pub feature_exprs: Vec<Expr>,
    /// Names of the model's `INPUT` bindings in declaration order, used
    /// as feature keys when building [`crate::ClassifyInput`].
    pub feature_names: Vec<String>,
    /// Property-access expressions referenced by `feature_exprs`,
    /// recorded as `(variable, property)` pairs (e.g. for
    /// `scorer(s.tier)` → `[("s", "tier")]`). The compiler appends a
    /// matching hidden YIELD item for each so the planner's standard
    /// property-materialization pipeline produces a column named
    /// `"<variable>.<property>"` in the body batch; runtime then
    /// reads from that column.
    pub feature_property_refs: Vec<(String, String)>,
    /// Phase C B1-B3: when the invocation appears in a YIELD item
    /// (e.g. `scorer(s) AS risk`), this carries the user-visible
    /// alias (`risk`) — distinct from the synthetic
    /// `output_column` (`__model_scorer_0`). Allows EXPLAIN to look
    /// up the model output by the column name that survives
    /// `LocyProject`'s projection. `None` when the invocation lives
    /// only inside an ALONG / FOLD expression and never surfaces
    /// as a user-visible YIELD column.
    pub yield_alias: Option<String>,
    /// Phase C B1-B3 follow-up: the user-authored feature
    /// expressions BEFORE the `InvocationLifter` rewrote them to
    /// `Variable("__model_<n>_<idx>")` references. Preserved so
    /// EXPLAIN can reconstruct `ClassifyInput` per fact at lookup
    /// time (the rewritten `feature_exprs` carry synthetic-column
    /// references that can't be evaluated against a post-projection
    /// fact_row). Same length and ordering as `feature_exprs` and
    /// `feature_names`.
    // NOTE(review): `<n>` here presumably denotes the same model-name slot as
    // `<name>` in the `output_column` naming scheme — confirm.
    pub original_feature_exprs: Vec<Expr>,
    /// Phase D D3: snapshot of the model's `path_context` declaration
    /// (if any) carried onto the invocation so the runtime can pull
    /// the named column from the source rule's derived facts at
    /// classify time without re-consulting the model catalog.
    pub path_context: Option<uni_cypher::locy_ast::PathContextFeature>,
    /// Phase D D2 follow-up: optional embedder alias from the model's
    /// `USING xervo('classify/X', embedder='alias')` clause. When
    /// `None`, the runtime falls back to alias `"default"` for
    /// `semantic_match` query-text embedding.
    pub embedder_alias: Option<String>,
}
207
/// A column in a rule's YIELD schema.
///
/// Implements `Eq` and `Hash` in addition to `PartialEq`, so columns can be
/// deduplicated in a `HashSet` or used as map keys. Equality covers all three
/// fields: name and both flags.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct YieldColumn {
    /// Name of the column.
    pub name: String,
    /// Whether the column is a KEY column.
    pub is_key: bool,
    /// Whether the column is the PROB column.
    pub is_prob: bool,
}
215
/// A non-fatal compiler diagnostic.
#[derive(Debug, Clone)]
pub struct CompilerWarning {
    /// Warning classification.
    pub code: WarningCode,
    /// Human-readable text of the diagnostic.
    pub message: String,
    /// Name of the rule associated with the diagnostic.
    // NOTE(review): some `WarningCode`s describe `CREATE MODEL` / `VALIDATE`
    // issues rather than rules; what is stored here in those cases is not
    // visible in this file — confirm.
    pub rule_name: String,
}
223
/// Classification of compile-time warnings carried by `CompilerWarning::code`.
///
/// Every variant is fieldless, so the enum implements `Eq` and `Hash` in
/// addition to `PartialEq`; codes can be collected into a `HashSet` or used
/// as map keys (e.g. to count or suppress diagnostics per code).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WarningCode {
    /// Non-negativity warning for MSUM.
    // NOTE(review): the triggering condition is not documented in this file;
    // the name suggests an MSUM aggregate whose input may be negative — confirm.
    MsumNonNegativity,
    /// A probability-domain violation was detected.
    // NOTE(review): the triggering condition is not documented in this file;
    // presumably a PROB value that may fall outside `[0, 1]` — confirm.
    ProbabilityDomainViolation,
    /// Phase B F1: a clause has a recursive IS-ref and a FOLD aggregate
    /// but no ALONG clause. Almost always a semantic mistake — FOLD groups
    /// by KEY columns, not by path. (Stress Corpus B3.)
    FoldInRecursivePath,
    /// Phase C C4: `VALIDATE METRICS ece` was requested; the equal-width
    /// binning ECE is biased in the small-sample regime (Kumar et al.
    /// NeurIPS 2019). Use `DEBIASED_ECE` instead for an unbiased
    /// estimator. The bare ECE value is still reported.
    EceBinningBias,
    /// Phase B G1-lite: a `CREATE MODEL` declares no CALIBRATION (or
    /// `CALIBRATION None`) AND the `xervo_alias` heuristically looks like
    /// an LLM provider (`generate/...`, `chat/...`, `llm/...`). Raw LLM
    /// logprobs are not calibrated probabilities (rollout D-10). Treat
    /// as a documentation hint until Xervo exposes `calibration_source`.
    UncalibratedLLMLogprobs,
    /// Phase C C4: a rule body invokes a `CREATE MODEL` whose output
    /// is PROB AND which declares no CALIBRATION (or `CALIBRATION None`).
    /// The fitted probability flows into the probabilistic stack
    /// (MNOR / MPROD / complement) — without calibration, the
    /// downstream aggregates compound the miscalibration. Run a
    /// `CALIBRATE` statement to fit a transform, or explicitly mark
    /// the choice with `CALIBRATION none` to acknowledge the risk
    /// (the warning still fires for the explicit-`none` case to keep
    /// the acknowledgement visible — same pattern as Phase A's
    /// `FuzzyNotProbabilistic`, rollout D-9).
    UncalibratedNeuralPredicate,
    /// Phase C F2a: two or more neural-model invocations in the
    /// same rule share an INPUT VARIABLE argument
    /// (e.g. `model_a(s)` and `model_b(s)`). Under
    /// independence-by-default composing the probabilities via
    /// MNOR/MPROD is likely wrong since both share the random
    /// variable `s`. Suppressed when ALL invocations involved
    /// carry the `@independent` annotation on their `CREATE MODEL`
    /// declaration. Rollout D-8.
    SharedNeuralInputArgument,
    /// Phase C F2b: two or more neural-model invocations in the
    /// same rule share an equivalent FEATURE VALUE expression
    /// (e.g. `model_a(s.tier)` and `model_b(s.tier)`). Different
    /// from F2a — even when binding variables differ, the feature
    /// input is structurally identical so the same correlation
    /// concern applies. Suppression by `@independent` annotation.
    SharedNeuralFeatureValue,
    /// Phase D F3 case 3: a rule body has both a positive IS-ref
    /// and an IS NOT (complement) to *different* rules on the
    /// *same* subject variable. When the positive and negated
    /// rules share base facts, the independence assumption that
    /// underlies the probabilistic complement / aggregation is
    /// violated. This is a structural over-detection (the MVP
    /// fires whenever the pattern matches, even if no actual base
    /// overlap exists at runtime); a future refinement will gate
    /// on runtime support-set intersection.
    PositiveComplementCorrelation,
    /// Phase D F3 case 2: a rule body has two or more positive
    /// IS-refs to *different* PROB-bearing rules on the *same*
    /// subject variable. The implicit `p AND q` conjunction
    /// assumes independence between `p` and `q`, which is wrong
    /// when the two rules share base facts. Structural
    /// over-detection (the MVP fires whenever the pattern
    /// matches, even if no actual support overlap exists at
    /// runtime); a future refinement will gate on runtime
    /// support-set intersection.
    CrossPredicateCorrelation,
    /// Phase D F3 case 4 (F2c): two or more neural-model
    /// invocations in the same rule receive retrieval-backed
    /// features (`similar_to(prop, _)` / `semantic_match(prop,
    /// _)`) over the *same* node property. The two models
    /// condition on the same retrieval evidence, so the implicit
    /// independence assumption that underlies composition via
    /// MNOR/MPROD/etc. is suspect. Suppressed when all involved
    /// models carry `@independent`. Structural over-detection;
    /// a future refinement could gate on cosine similarity of
    /// the pre-embedded query vectors (queries are constants per
    /// `apply_model_invocations` call).
    SharedRetrievalContext,
}
303
/// Probability semiring used to evaluate MNOR/MPROD aggregates, PROB
/// complement, and cross-predicate combination.
///
/// `AddMultProb` is the Phase 1/2 default (noisy-OR and product under the
/// independence assumption). `MaxMinProb` is the Viterbi/fuzzy semiring
/// and triggers a non-suppressible `RuntimeWarningCode::FuzzyNotProbabilistic`
/// whenever it evaluates a PROB-bearing rule (rollout decision D-9).
/// `BddExact` is whole-group weighted model counting (Phase 7) and is
/// dispatched outside the row-at-a-time `LocySemiring` trait.
///
/// The type is `Copy`, `Eq` and `Hash`, so a kind can be passed by value and
/// used as a set member or map key. Two `TopKProofs` values are equal only
/// when their `k` matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum SemiringKind {
    /// Noisy-OR / product semiring; the value returned by `Default`.
    #[default]
    AddMultProb,
    /// Viterbi / fuzzy semiring.
    MaxMinProb,
    /// Whole-group weighted model counting.
    BddExact,
    /// Phase C C0: top-K proof tracking with per-row dependency DNFs
    /// (impl plan §1.6, decision D-7). Each row carries up to `k`
    /// proofs whose `base_rvs` flag shared dependencies; the per-tag
    /// probability is computed via inclusion-exclusion over the DNF.
    ///
    /// Stage 1 (this slice): library-layer math complete; runtime
    /// `SemiringDispatch` falls back to `AddMultProb` row math with a
    /// loud tracing warn. Stage 2 wires tag flow through
    /// `MonotonicAggState` and `FoldExec`.
    TopKProofs {
        /// Maximum number of proofs carried per row.
        k: u32,
    },
}
333
/// Classification of runtime warnings emitted during evaluation.
///
/// Every variant is fieldless, so the enum implements `Eq` and `Hash` in
/// addition to `PartialEq`; codes can be collected into a `HashSet` or used
/// as map keys (e.g. to deduplicate warnings per code).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum RuntimeWarningCode {
    /// Two or more proof paths aggregated by MNOR/MPROD share an
    /// intermediate fact, violating the independence assumption.
    SharedProbabilisticDependency,
    /// A shared-proof group exceeded `max_bdd_variables`, so the BDD
    /// computation fell back to the independence-mode result.
    BddLimitExceeded,
    /// Base facts are shared across different KEY groups within the same
    /// rule. The BDD corrects per-group probabilities but cannot account
    /// for cross-group correlations.
    CrossGroupCorrelationNotExact,
    /// The `MaxMinProb` (fuzzy / Viterbi) semiring evaluated a PROB-bearing
    /// rule. Per rollout decision D-9 this warning is **unsuppressible**:
    /// fuzzy truth values are not probabilities, and silent conflation is
    /// the dominant pitfall in neuro-symbolic systems (LTN, NTP).
    FuzzyNotProbabilistic,
    /// Phase C C0: a `TopKProofs::plus` operation discarded a proof
    /// whose `base_rvs` overlapped a retained proof — top-K is too
    /// small for the program's dependency structure (impl plan §3.0,
    /// rollout doc §6). Increase `k` or accept the
    /// approximation. Emitted from library code; Stage 2 wires it
    /// into the runtime tag flow.
    TopKPruningCrossedDependency,
}
360
/// A non-fatal runtime diagnostic collected during evaluation.
#[derive(Debug, Clone)]
pub struct RuntimeWarning {
    /// Warning classification.
    pub code: RuntimeWarningCode,
    /// Human-readable explanation.
    pub message: String,
    /// Rule that triggered the warning, when applicable.
    // NOTE(review): the field is a plain `String`, so "not applicable" cannot
    // be expressed as `None`; what is stored in that case (presumably an empty
    // string) is not visible in this file — confirm against the emitters.
    pub rule_name: String,
    /// BDD variable count for the affected group (BddLimitExceeded only).
    pub variable_count: Option<usize>,
    /// Human-readable KEY group description (BddLimitExceeded only).
    pub key_group: Option<String>,
}