uni_locy/types.rs
1use std::collections::HashMap;
2
3use uni_cypher::ast::{Clause, Expr, Pattern, Query};
4use uni_cypher::locy_ast::{
5 AbduceQuery, AlongBinding, BestByClause, DeriveCommand, ExplainRule, FoldBinding, GoalQuery,
6 RuleCondition, RuleOutput,
7};
8
9/// A fully validated and stratified Locy program, ready for the orchestrator.
10#[derive(Debug, Clone)]
11pub struct CompiledProgram {
12 pub strata: Vec<Stratum>,
13 pub rule_catalog: HashMap<String, CompiledRule>,
14 /// Compiled neural-predicate declarations from `CREATE MODEL`
15 /// statements (Phase B preview). Empty unless
16 /// `LocyConfig::neural_predicates_preview` is set.
17 pub model_catalog: HashMap<String, CompiledModel>,
18 pub warnings: Vec<CompilerWarning>,
19 pub commands: Vec<CompiledCommand>,
20}
21
22/// A compiled `CREATE MODEL` declaration (Phase B preview).
23///
24/// Lowered from `uni_cypher::locy_ast::ModelDefinition`. The feature
25/// expressions are kept as Cypher AST; the runtime evaluates them per
26/// row in a future slice (`LocyModelInvoke`).
27#[derive(Debug, Clone)]
28pub struct CompiledModel {
29 pub name: String,
30 pub inputs: Vec<CompiledInputBinding>,
31 pub features: Vec<uni_cypher::ast::Expr>,
32 /// Phase D D3: optional path-context feature `FEATURES (subject, col) FROM rule_name`.
33 pub path_context: Option<uni_cypher::locy_ast::PathContextFeature>,
34 pub output_type: uni_cypher::locy_ast::OutputType,
35 pub output_name: String,
36 pub xervo_alias: String,
37 /// Phase D D2 follow-up: optional embedder alias surfaced via
38 /// `USING xervo('classify/X', embedder='alias')`. When `None`,
39 /// `semantic_match` query-text embedding falls back to `"default"`.
40 pub embedder_alias: Option<String>,
41 pub calibration: Option<uni_cypher::locy_ast::CalibrationMethod>,
42 pub version: Option<String>,
43 pub annotations: uni_cypher::locy_ast::ModelAnnotations,
44}
45
/// One input binding of a compiled model: a variable name plus an optional
/// label.
#[derive(Clone, Debug)]
pub struct CompiledInputBinding {
    /// Name of the bound variable.
    pub variable: String,
    /// Label attached to the binding, when one was given.
    pub label: Option<String>,
}
51
52/// A compiled command (non-rule statement) ready for execution.
53#[derive(Debug, Clone)]
54pub enum CompiledCommand {
55 GoalQuery(GoalQuery),
56 Assume(CompiledAssume),
57 Abduce(AbduceQuery),
58 ExplainRule(ExplainRule),
59 DeriveCommand(DeriveCommand),
60 Cypher(Query),
61 /// Phase C C2: `CALIBRATE` statement — collects `(pred, label)`
62 /// pairs from the MATCH pattern, invokes the registered classifier,
63 /// fits the chosen calibrator on a holdout split.
64 Calibrate(CompiledCalibrate),
65 /// Phase C C3: `VALIDATE` statement — joins a rule's PROB output
66 /// against ground truth and reports requested metrics.
67 Validate(CompiledValidate),
68}
69
70/// Compiled `CALIBRATE` command — Phase C C2.
71#[derive(Debug, Clone)]
72pub struct CompiledCalibrate {
73 pub model_name: String,
74 pub pattern: uni_cypher::ast::Pattern,
75 pub where_expr: Option<uni_cypher::ast::Expr>,
76 pub target_expr: uni_cypher::ast::Expr,
77 pub method: uni_cypher::locy_ast::CalibrationMethod,
78 /// Resolved holdout fraction (default 0.2 when the source omitted it).
79 pub holdout: f64,
80}
81
82/// Compiled `VALIDATE` command — Phase C C3.
83#[derive(Debug, Clone)]
84pub struct CompiledValidate {
85 pub rule_name: String,
86 pub pattern: uni_cypher::ast::Pattern,
87 pub where_expr: Option<uni_cypher::ast::Expr>,
88 pub target_expr: uni_cypher::ast::Expr,
89 pub metrics: Vec<uni_cypher::locy_ast::ValidationMetric>,
90 /// Name of the rule's PROB column (resolved from `rule_catalog`).
91 /// Used by the runtime to find the prediction value in derived facts.
92 pub prob_column: String,
93}
94
95/// A compiled ASSUME block with mutations and body program.
96#[derive(Debug, Clone)]
97pub struct CompiledAssume {
98 pub mutations: Vec<Clause>,
99 pub body_program: CompiledProgram,
100 pub body_commands: Vec<CompiledCommand>,
101}
102
103/// A group of rules that must be evaluated together (one SCC).
104#[derive(Debug, Clone)]
105pub struct Stratum {
106 pub id: usize,
107 pub rules: Vec<CompiledRule>,
108 pub is_recursive: bool,
109 pub depends_on: Vec<usize>,
110}
111
112/// A named rule with all its clauses merged and validated.
113#[derive(Debug, Clone)]
114pub struct CompiledRule {
115 pub name: String,
116 pub clauses: Vec<CompiledClause>,
117 pub yield_schema: Vec<YieldColumn>,
118 pub priority: Option<i64>,
119}
120
121/// A single clause (one CREATE RULE ... AS ... definition).
122#[derive(Debug, Clone)]
123pub struct CompiledClause {
124 pub match_pattern: Pattern,
125 pub where_conditions: Vec<RuleCondition>,
126 pub along: Vec<AlongBinding>,
127 pub fold: Vec<FoldBinding>,
128 /// Post-FOLD filter conditions (HAVING semantics).
129 pub having: Vec<Expr>,
130 pub best_by: Option<BestByClause>,
131 pub output: RuleOutput,
132 pub priority: Option<i64>,
133 /// Phase B Slice 3: neural-model invocations extracted from this
134 /// clause's YIELD items (or, in future slices, other body sites).
135 /// Each invocation produces a synthetic output column whose values
136 /// are filled at runtime by the registered [`crate::NeuralClassifier`];
137 /// the original `model(args)` expression in YIELD has been rewritten
138 /// to a [`uni_cypher::ast::Expr::Variable`] reference to that column.
139 pub model_invocations: Vec<ModelInvocation>,
140 /// Column names that the compiler appended to this clause's YIELD
141 /// as hidden materialization items (e.g. `"s.tier"` for
142 /// `scorer(s.tier)`). They flow through projection/fixpoint to feed
143 /// `apply_model_invocations`, then get stripped from the final
144 /// `LocyResult` rows by the runtime.
145 pub hidden_yield_cols: Vec<String>,
146}
147
148/// A single neural-model invocation site extracted from a clause body.
149///
150/// At runtime, after the clause body produces a batch of rows, each
151/// invocation evaluates its `feature_exprs` per row, packs them into
152/// [`crate::ClassifyInput`]s, calls the classifier in one batched
153/// `classify` call, then appends the result vector as a new column
154/// `output_column` to the batch.
155#[derive(Debug, Clone)]
156pub struct ModelInvocation {
157 /// Name of the model from `CREATE MODEL <name>`.
158 pub model_name: String,
159 /// Synthetic column name where the per-row probabilities are
160 /// written. Generated as `__model_<name>_<idx>` where `idx` is a
161 /// dedup index for repeated invocations of the same model.
162 pub output_column: String,
163 /// Argument expressions from the invocation — one per declared
164 /// `INPUT` binding. Evaluated in clause-body scope to produce the
165 /// per-row feature value passed under the binding's `variable` name.
166 pub feature_exprs: Vec<Expr>,
167 /// Names of the model's `INPUT` bindings in declaration order, used
168 /// as feature keys when building [`crate::ClassifyInput`].
169 pub feature_names: Vec<String>,
170 /// Property-access expressions referenced by `feature_exprs`,
171 /// recorded as `(variable, property)` pairs (e.g. for
172 /// `scorer(s.tier)` → `[("s", "tier")]`). The compiler appends a
173 /// matching hidden YIELD item for each so the planner's standard
174 /// property-materialization pipeline produces a column named
175 /// `"<variable>.<property>"` in the body batch; runtime then
176 /// reads from that column.
177 pub feature_property_refs: Vec<(String, String)>,
178 /// Phase C B1–B3: when the invocation appears in a YIELD item
179 /// (e.g. `scorer(s) AS risk`), this carries the user-visible
180 /// alias (`risk`) — distinct from the synthetic
181 /// `output_column` (`__model_scorer_0`). Allows EXPLAIN to look
182 /// up the model output by the column name that survives
183 /// `LocyProject`'s projection. `None` when the invocation lives
184 /// only inside an ALONG / FOLD expression and never surfaces
185 /// as a user-visible YIELD column.
186 pub yield_alias: Option<String>,
187 /// Phase C B1-B3 follow-up: the user-authored feature
188 /// expressions BEFORE the `InvocationLifter` rewrote them to
189 /// `Variable("__model_<n>_<idx>")` references. Preserved so
190 /// EXPLAIN can reconstruct `ClassifyInput` per fact at lookup
191 /// time (the rewritten `feature_exprs` carry synthetic-column
192 /// references that can't be evaluated against a post-projection
193 /// fact_row). Same length and ordering as `feature_exprs` and
194 /// `feature_names`.
195 pub original_feature_exprs: Vec<Expr>,
196 /// Phase D D3: snapshot of the model's `path_context` declaration
197 /// (if any) carried onto the invocation so the runtime can pull
198 /// the named column from the source rule's derived facts at
199 /// classify time without re-consulting the model catalog.
200 pub path_context: Option<uni_cypher::locy_ast::PathContextFeature>,
201 /// Phase D D2 follow-up: optional embedder alias from the model's
202 /// `USING xervo('classify/X', embedder='alias')` clause. When
203 /// `None`, the runtime falls back to alias `"default"` for
204 /// `semantic_match` query-text embedding.
205 pub embedder_alias: Option<String>,
206}
207
/// One column of a rule's YIELD schema.
///
/// Every field is `Eq + Hash`, so the type derives both in addition to
/// `PartialEq`; columns can therefore be deduplicated in a `HashSet` or used
/// as `HashMap` keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct YieldColumn {
    /// Column name.
    pub name: String,
    /// Whether this is a KEY column.
    pub is_key: bool,
    /// Whether this is a PROB column.
    pub is_prob: bool,
}
215
216/// A non-fatal compiler diagnostic.
217#[derive(Debug, Clone)]
218pub struct CompilerWarning {
219 pub code: WarningCode,
220 pub message: String,
221 pub rule_name: String,
222}
223
/// Classification of compile-time warnings.
///
/// The enum is fieldless, so it derives `Eq` and `Hash` alongside
/// `PartialEq`; codes can be collected in a `HashSet` or counted in a
/// `HashMap`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WarningCode {
    /// NOTE(review): undocumented in this file; by its name it concerns
    /// non-negativity for MSUM — confirm against the emitting check.
    MsumNonNegativity,
    /// NOTE(review): undocumented in this file; by its name it flags a value
    /// outside the probability domain — confirm against the emitting check.
    ProbabilityDomainViolation,
    /// Phase B F1: a clause has a recursive IS-ref and a FOLD aggregate
    /// but no ALONG clause. Almost always a semantic mistake — FOLD groups
    /// by KEY columns, not by path. (Stress Corpus B3.)
    FoldInRecursivePath,
    /// Phase C C4: `VALIDATE METRICS ece` was requested; the equal-width
    /// binning ECE is biased in the small-sample regime (Kumar et al.
    /// NeurIPS 2019). Use `DEBIASED_ECE` instead for an unbiased
    /// estimator. The bare ECE value is still reported.
    EceBinningBias,
    /// Phase B G1-lite: a `CREATE MODEL` declares no CALIBRATION (or
    /// `CALIBRATION None`) AND the `xervo_alias` heuristically looks like
    /// an LLM provider (`generate/...`, `chat/...`, `llm/...`). Raw LLM
    /// logprobs are not calibrated probabilities (rollout D-10). Treat
    /// as a documentation hint until Xervo exposes `calibration_source`.
    UncalibratedLLMLogprobs,
    /// Phase C C4: a rule body invokes a `CREATE MODEL` whose output
    /// is PROB AND which declares no CALIBRATION (or `CALIBRATION None`).
    /// The fitted probability flows into the probabilistic stack
    /// (MNOR / MPROD / complement) — without calibration, the
    /// downstream aggregates compound the miscalibration. Run a
    /// `CALIBRATE` statement to fit a transform, or explicitly mark
    /// the choice with `CALIBRATION none` to acknowledge the risk
    /// (the warning still fires for the explicit-`none` case to keep
    /// the acknowledgement visible — same pattern as Phase A's
    /// `FuzzyNotProbabilistic`, rollout D-9).
    UncalibratedNeuralPredicate,
    /// Phase C F2a: two or more neural-model invocations in the
    /// same rule share an INPUT VARIABLE argument
    /// (e.g. `model_a(s)` and `model_b(s)`). Under
    /// independence-by-default, composing the probabilities via
    /// MNOR/MPROD is likely wrong since both share the random
    /// variable `s`. Suppressed when ALL invocations involved
    /// carry the `@independent` annotation on their `CREATE MODEL`
    /// declaration. Rollout D-8.
    SharedNeuralInputArgument,
    /// Phase C F2b: two or more neural-model invocations in the
    /// same rule share an equivalent FEATURE VALUE expression
    /// (e.g. `model_a(s.tier)` and `model_b(s.tier)`). Different
    /// from F2a — even when binding variables differ, the feature
    /// input is structurally identical so the same correlation
    /// concern applies. Can be suppressed with the `@independent`
    /// annotation.
    SharedNeuralFeatureValue,
    /// Phase D F3 case 3: a rule body has both a positive IS-ref
    /// and an IS NOT (complement) to *different* rules on the
    /// *same* subject variable. When the positive and negated
    /// rules share base facts, the independence assumption that
    /// underlies the probabilistic complement / aggregation is
    /// violated. This is a structural over-detection (the MVP
    /// fires whenever the pattern matches, even if no actual base
    /// overlap exists at runtime); a future refinement will gate
    /// on runtime support-set intersection.
    PositiveComplementCorrelation,
    /// Phase D F3 case 2: a rule body has two or more positive
    /// IS-refs to *different* PROB-bearing rules on the *same*
    /// subject variable. The implicit `p AND q` conjunction
    /// assumes independence between `p` and `q`, which is wrong
    /// when the two rules share base facts. Structural
    /// over-detection (the MVP fires whenever the pattern
    /// matches, even if no actual support overlap exists at
    /// runtime); a future refinement will gate on runtime
    /// support-set intersection.
    CrossPredicateCorrelation,
    /// Phase D F3 case 4 (F2c): two or more neural-model
    /// invocations in the same rule receive retrieval-backed
    /// features (`similar_to(prop, _)` / `semantic_match(prop,
    /// _)`) over the *same* node property. The two models
    /// condition on the same retrieval evidence, so the implicit
    /// independence assumption that underlies composition via
    /// MNOR/MPROD/etc. is suspect. Suppressed when all involved
    /// models carry `@independent`. Structural over-detection;
    /// a future refinement could gate on cosine similarity of
    /// the pre-embedded query vectors (queries are constants per
    /// `apply_model_invocations` call).
    SharedRetrievalContext,
}
303
/// The probability semiring under which MNOR/MPROD aggregates, PROB
/// complement, and cross-predicate combination are evaluated.
///
/// * `AddMultProb` — the Phase 1/2 default: noisy-OR and product under the
///   independence assumption.
/// * `MaxMinProb` — the Viterbi/fuzzy semiring. Evaluating a PROB-bearing
///   rule with it always raises the non-suppressible
///   `RuntimeWarningCode::FuzzyNotProbabilistic` (rollout decision D-9).
/// * `BddExact` — whole-group weighted model counting (Phase 7), dispatched
///   outside the row-at-a-time `LocySemiring` trait.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[non_exhaustive]
pub enum SemiringKind {
    /// Default semiring: noisy-OR and product under independence.
    #[default]
    AddMultProb,
    /// Viterbi/fuzzy semiring.
    MaxMinProb,
    /// Whole-group weighted model counting.
    BddExact,
    /// Phase C C0: top-K proof tracking with per-row dependency DNFs
    /// (impl plan §1.6, decision D-7). A row carries at most `k` proofs,
    /// whose `base_rvs` flag shared dependencies; the per-tag probability
    /// comes from inclusion-exclusion over the DNF.
    ///
    /// Stage 1 (this slice): the library-layer math is complete, while the
    /// runtime `SemiringDispatch` falls back to `AddMultProb` row math and
    /// emits a loud tracing warn. Stage 2 wires tag flow through
    /// `MonotonicAggState` and `FoldExec`.
    TopKProofs {
        /// Upper bound on the number of proofs carried per row.
        k: u32,
    },
}
333
/// Classification of runtime warnings emitted during evaluation.
///
/// The enum is fieldless, so it derives `Eq` and `Hash` alongside
/// `PartialEq`; codes can be collected in a `HashSet` or counted in a
/// `HashMap`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum RuntimeWarningCode {
    /// Two or more proof paths aggregated by MNOR/MPROD share an
    /// intermediate fact, violating the independence assumption.
    SharedProbabilisticDependency,
    /// A shared-proof group exceeded `max_bdd_variables`, so the BDD
    /// computation fell back to the independence-mode result.
    BddLimitExceeded,
    /// Base facts are shared across different KEY groups within the same
    /// rule. The BDD corrects per-group probabilities but cannot account
    /// for cross-group correlations.
    CrossGroupCorrelationNotExact,
    /// The `MaxMinProb` (fuzzy / Viterbi) semiring evaluated a PROB-bearing
    /// rule. Per rollout decision D-9 this warning is **unsuppressible**:
    /// fuzzy truth values are not probabilities, and silent conflation is
    /// the dominant pitfall in neuro-symbolic systems (LTN, NTP).
    FuzzyNotProbabilistic,
    /// Phase C C0: a `TopKProofs::plus` operation discarded a proof
    /// whose `base_rvs` overlapped a retained proof — top-K is too
    /// small for the program's dependency structure (impl plan §3.0,
    /// rollout doc §6). Increase `k` or accept the
    /// approximation. Emitted from library code; Stage 2 wires it
    /// into the runtime tag flow.
    TopKPruningCrossedDependency,
}
360
361/// A non-fatal runtime diagnostic collected during evaluation.
362#[derive(Debug, Clone)]
363pub struct RuntimeWarning {
364 /// Warning classification.
365 pub code: RuntimeWarningCode,
366 /// Human-readable explanation.
367 pub message: String,
368 /// Rule that triggered the warning, when applicable.
369 pub rule_name: String,
370 /// BDD variable count for the affected group (BddLimitExceeded only).
371 pub variable_count: Option<usize>,
372 /// Human-readable KEY group description (BddLimitExceeded only).
373 pub key_group: Option<String>,
374}