Skip to main content

panproto_schema/
schema.rs

1//! Core schema data structures.
2//!
3//! A [`Schema`] is a model of a protocol's schema theory GAT. It stores
4//! vertices, binary edges, hyper-edges, constraints, required-edge
5//! declarations, and NSID mappings. Precomputed adjacency indices
6//! (`outgoing`, `incoming`, `between`) enable fast traversal.
7
8use std::collections::HashMap;
9
10use panproto_gat::Name;
11use serde::{Deserialize, Serialize};
12use smallvec::SmallVec;
13
14/// A schema vertex.
15///
16/// Each vertex has a unique `id`, a `kind` drawn from the protocol's
17/// recognized vertex kinds, and an optional NSID (namespace identifier).
18#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
19pub struct Vertex {
20    /// Unique vertex identifier within the schema.
21    pub id: Name,
22    /// The vertex kind (e.g., `"record"`, `"object"`, `"string"`).
23    pub kind: Name,
24    /// Optional namespace identifier (e.g., `"app.bsky.feed.post"`).
25    pub nsid: Option<Name>,
26}
27
28/// A binary edge between two vertices.
29///
30/// Edges are directed: they go from `src` to `tgt`. The `kind` determines
31/// the structural role (e.g., `"prop"`, `"record-schema"`), and `name`
32/// provides an optional label (e.g., the property name).
33#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
34pub struct Edge {
35    /// Source vertex ID.
36    pub src: Name,
37    /// Target vertex ID.
38    pub tgt: Name,
39    /// Edge kind (e.g., `"prop"`, `"record-schema"`).
40    pub kind: Name,
41    /// Optional edge label (e.g., a property name like `"text"`).
42    pub name: Option<Name>,
43}
44
45/// A hyper-edge (present only when the schema theory includes `ThHypergraph`).
46///
47/// Hyper-edges connect multiple vertices via a labeled signature.
48#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
49pub struct HyperEdge {
50    /// Unique hyper-edge identifier.
51    pub id: Name,
52    /// Hyper-edge kind.
53    pub kind: Name,
54    /// Maps label names to vertex IDs.
55    pub signature: HashMap<Name, Name>,
56    /// The label that identifies the parent vertex.
57    pub parent_label: Name,
58}
59
60/// A constraint on a vertex.
61///
62/// Constraints restrict the values a vertex can hold (e.g., maximum
63/// string length, format pattern).
64#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
65pub struct Constraint {
66    /// The constraint sort (e.g., `"maxLength"`, `"format"`).
67    pub sort: Name,
68    /// The constraint value (e.g., `"3000"`, `"at-uri"`).
69    pub value: String,
70}
71
72/// A variant in a coproduct (sum type / union).
73///
74/// Each variant is injected into a parent vertex (the union/coproduct)
75/// with an optional discriminant tag.
76#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
77pub struct Variant {
78    /// Unique variant identifier.
79    pub id: Name,
80    /// The parent coproduct vertex this variant belongs to.
81    pub parent_vertex: Name,
82    /// Optional discriminant tag.
83    pub tag: Option<Name>,
84}
85
86/// An ordering annotation on an edge.
87///
88/// Records that the children reached via this edge are ordered,
89/// with a specific position index.
90#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
91pub struct Ordering {
92    /// The edge being ordered.
93    pub edge: Edge,
94    /// Position in the ordered collection.
95    pub position: u32,
96}
97
98/// A recursion point (fixpoint marker) in the schema.
99///
100/// Marks a vertex as a recursive reference to another vertex,
101/// satisfying the fold-unfold law: `unfold(fold(v)) = v`.
102#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
103pub struct RecursionPoint {
104    /// The fixpoint marker vertex ID.
105    pub mu_id: Name,
106    /// The target vertex this unfolds to.
107    pub target_vertex: Name,
108}
109
110/// A span connecting two vertices through a common source.
111///
112/// Spans model correspondences, diffs, and migrations:
113/// `left ← span → right`.
114#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
115pub struct Span {
116    /// Unique span identifier.
117    pub id: Name,
118    /// Left vertex of the span.
119    pub left: Name,
120    /// Right vertex of the span.
121    pub right: Name,
122}
123
124/// Use-counting mode for an edge.
125///
126/// Captures the substructural distinction between edges that can
127/// be used freely (structural), exactly once (linear), or at most
128/// once (affine).
129#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
130pub enum UsageMode {
131    /// Can be used any number of times (default).
132    #[default]
133    Structural,
134    /// Must be used exactly once (e.g., protobuf `oneof`).
135    Linear,
136    /// Can be used at most once.
137    Affine,
138}
139
140/// Specification of a coercion between two value kinds.
141///
142/// Contains the forward coercion expression, an optional inverse for
143/// round-tripping, and the coercion class classifying the round-trip behavior.
144#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
145pub struct CoercionSpec {
146    /// Forward coercion expression (source to target).
147    pub forward: panproto_expr::Expr,
148    /// Inverse coercion expression (target to source) for the `put` direction.
149    pub inverse: Option<panproto_expr::Expr>,
150    /// Round-trip classification.
151    pub class: panproto_gat::CoercionClass,
152}
153
154/// A schema: a model of the protocol's schema theory.
155///
156/// Contains both the raw data (vertices, edges, constraints, etc.) and
157/// precomputed adjacency indices for efficient graph traversal.
158#[derive(Clone, Debug, Serialize, Deserialize)]
159pub struct Schema {
160    /// The protocol this schema belongs to.
161    pub protocol: String,
162    /// Vertices keyed by their ID.
163    pub vertices: HashMap<Name, Vertex>,
164    /// Edges keyed by the edge itself, value is the edge kind.
165    #[serde(with = "crate::serde_helpers::map_as_vec")]
166    pub edges: HashMap<Edge, Name>,
167    /// Hyper-edges keyed by their ID.
168    pub hyper_edges: HashMap<Name, HyperEdge>,
169    /// Constraints per vertex ID.
170    pub constraints: HashMap<Name, Vec<Constraint>>,
171    /// Required edges per vertex ID.
172    pub required: HashMap<Name, Vec<Edge>>,
173    /// NSID mapping: vertex ID to NSID string.
174    pub nsids: HashMap<Name, Name>,
175    /// Declared entry vertices.
176    ///
177    /// Semantically, this is the finite family of basepoints that makes
178    /// the schema a *pointed* schema: `E → Ob(C_S)` selecting the sorts
179    /// at which the W-algebra of instances may be rooted. Parsers set
180    /// this explicitly per their protocol's notion of a top-level
181    /// definition (a record, a top-level type, a path root, etc.);
182    /// consumers that need to choose an instance root consult it via
183    /// [`primary_entry`].
184    ///
185    /// Empty means the parser declined to supply a pointing; consumers
186    /// should then either fall back to a deterministic (but non-
187    /// canonical) selection or report an error. Order is preserved for
188    /// reproducibility; the set carries no duplicates.
189    #[serde(default)]
190    pub entries: Vec<Name>,
191
192    /// Coproduct variants per union vertex ID.
193    #[serde(default)]
194    pub variants: HashMap<Name, Vec<Variant>>,
195    /// Edge ordering positions (edge → position index).
196    #[serde(default, with = "crate::serde_helpers::map_as_vec_default")]
197    pub orderings: HashMap<Edge, u32>,
198    /// Recursion points (fixpoint markers).
199    #[serde(default)]
200    pub recursion_points: HashMap<Name, RecursionPoint>,
201    /// Spans connecting pairs of vertices.
202    #[serde(default)]
203    pub spans: HashMap<Name, Span>,
204    /// Edge usage modes (default: `Structural` for all).
205    #[serde(default, with = "crate::serde_helpers::map_as_vec_default")]
206    pub usage_modes: HashMap<Edge, UsageMode>,
207    /// Whether each vertex uses nominal identity (`true`) or
208    /// structural identity (`false`). Absent = structural.
209    #[serde(default)]
210    pub nominal: HashMap<Name, bool>,
211
212    // -- enrichment fields --
213    /// Coercion specifications: `(source_kind, target_kind)` to coercion spec.
214    #[serde(default, with = "crate::serde_helpers::map_as_vec_default")]
215    pub coercions: HashMap<(Name, Name), CoercionSpec>,
216    /// Merger expressions: `vertex_id` to merger expression.
217    #[serde(default)]
218    pub mergers: HashMap<Name, panproto_expr::Expr>,
219    /// Default value expressions: `vertex_id` to default expression.
220    #[serde(default)]
221    pub defaults: HashMap<Name, panproto_expr::Expr>,
222    /// Conflict resolution policy expressions: `sort_name` to policy expression.
223    #[serde(default)]
224    pub policies: HashMap<Name, panproto_expr::Expr>,
225
226    // -- precomputed indices --
227    /// Outgoing edges per vertex ID.
228    pub outgoing: HashMap<Name, SmallVec<Edge, 4>>,
229    /// Incoming edges per vertex ID.
230    pub incoming: HashMap<Name, SmallVec<Edge, 4>>,
231    /// Edges between a specific `(src, tgt)` pair.
232    #[serde(with = "crate::serde_helpers::map_as_vec")]
233    pub between: HashMap<(Name, Name), SmallVec<Edge, 2>>,
234}
235
236impl Schema {
237    /// Strip every constraint whose sort belongs to the layout
238    /// enrichment fibre (per [`panproto_gat::is_layout_sort`]).
239    ///
240    /// This is the schema-level forgetful U sending a decorated schema
241    /// to its abstract base in the parse/decorate/emit lens. Idempotent.
242    #[must_use]
243    pub fn forget_layout(&self) -> Self {
244        let mut clone = self.clone();
245        clone.forget_layout_in_place();
246        clone
247    }
248
249    /// In-place variant of [`Self::forget_layout`].
250    pub fn forget_layout_in_place(&mut self) {
251        for constraints in self.constraints.values_mut() {
252            constraints.retain(|c| !panproto_gat::is_layout_sort(c.sort.as_ref()));
253        }
254        // Drop now-empty vertex entries so equality is structural.
255        self.constraints.retain(|_, cs| !cs.is_empty());
256    }
257
258    /// Returns `true` when no constraint sort belongs to the layout
259    /// enrichment fibre. This is the well-formedness predicate for an
260    /// [`AbstractSchema`](crate::AbstractSchema).
261    #[must_use]
262    pub fn is_layout_free(&self) -> bool {
263        self.constraints.values().all(|cs| {
264            cs.iter()
265                .all(|c| !panproto_gat::is_layout_sort(c.sort.as_ref()))
266        })
267    }
268
269    /// Look up a vertex by ID.
270    #[must_use]
271    pub fn vertex(&self, id: &str) -> Option<&Vertex> {
272        self.vertices.get(id)
273    }
274
275    /// Return all outgoing edges from the given vertex.
276    #[must_use]
277    pub fn outgoing_edges(&self, vertex_id: &str) -> &[Edge] {
278        self.outgoing.get(vertex_id).map_or(&[], SmallVec::as_slice)
279    }
280
281    /// Return all incoming edges to the given vertex.
282    #[must_use]
283    pub fn incoming_edges(&self, vertex_id: &str) -> &[Edge] {
284        self.incoming.get(vertex_id).map_or(&[], SmallVec::as_slice)
285    }
286
287    /// Return edges between a specific `(src, tgt)` pair.
288    #[must_use]
289    #[inline]
290    pub fn edges_between(&self, src: &str, tgt: &str) -> &[Edge] {
291        self.between
292            .get(&(Name::from(src), Name::from(tgt)))
293            .map_or(&[], SmallVec::as_slice)
294    }
295
296    /// Returns `true` if the given vertex ID exists in this schema.
297    #[must_use]
298    #[inline]
299    pub fn has_vertex(&self, id: &str) -> bool {
300        self.vertices.contains_key(id)
301    }
302
303    /// Returns the number of vertices in the schema.
304    #[must_use]
305    pub fn vertex_count(&self) -> usize {
306        self.vertices.len()
307    }
308
309    /// Returns the number of edges in the schema.
310    #[must_use]
311    pub fn edge_count(&self) -> usize {
312        self.edges.len()
313    }
314
315    /// Return the declared entry vertices.
316    ///
317    /// See [`Schema::entries`] for semantics. Use [`primary_entry`] for
318    /// callers that need a single root and want a deterministic
319    /// fallback when no entries are declared.
320    #[must_use]
321    pub fn entry_vertices(&self) -> &[Name] {
322        &self.entries
323    }
324
325    /// Return every constraint attached to the given vertex.
326    ///
327    /// Tree-sitter-derived schemas attach byte ranges, interstitials,
328    /// formatting, and `field:<name>` entries here.
329    #[must_use]
330    pub fn constraints_for(&self, vertex_id: &str) -> &[Constraint] {
331        self.constraints
332            .get(&Name::from(vertex_id))
333            .map_or(&[], Vec::as_slice)
334    }
335
336    /// Return the text value of a tree-sitter `field('<name>', ...)`
337    /// anonymous-token child on the given vertex, if any.
338    ///
339    /// Tree-sitter rules of the form
340    /// `field('op', choice('+', '-', '*', '/'))` attach a field name to
341    /// an unnamed token alternative. The walker emits the token's text
342    /// as a `field:<name>` constraint on the parent vertex; this is the
343    /// supported accessor for that text.
344    ///
345    /// Returns `None` if no `field:<name>` constraint exists on
346    /// `vertex_id`. Named-node field children continue to surface as
347    /// edges (use [`outgoing_edges`](Self::outgoing_edges) and filter
348    /// by [`Edge::kind`](crate::Edge::kind) for those).
349    #[must_use]
350    pub fn field_text(&self, vertex_id: &str, field_name: &str) -> Option<&str> {
351        let sort = format!("field:{field_name}");
352        self.constraints
353            .get(&Name::from(vertex_id))?
354            .iter()
355            .find(|c| c.sort.as_ref() == sort.as_str())
356            .map(|c| c.value.as_str())
357    }
358}
359
360/// Choose a single entry vertex for a schema.
361///
362/// Returns the first declared entry if any. Otherwise falls back to a
363/// deterministic, protocol-agnostic choice: among vertex ids sorted
364/// lexicographically, the first vertex that is a source of at least
365/// one edge and a target of none (an edgeless "root" in the signature
366/// graph); failing that, the first vertex that has outgoing edges at
367/// all; failing that, the lexicographically first vertex; `None` only
368/// for an empty schema.
369///
370/// The fallback is explicitly *non-canonical*: it exists so that legacy
371/// schemas without declared entries remain usable, but new parsers
372/// should always supply at least one entry via
373/// [`SchemaBuilder::entry`](crate::SchemaBuilder::entry) so that the
374/// pointing is part of the schema's semantics rather than recovered by
375/// heuristic.
376#[must_use]
377pub fn primary_entry(schema: &Schema) -> Option<&Name> {
378    if let Some(first) = schema.entries.first() {
379        return Some(first);
380    }
381
382    let mut ids: Vec<&Name> = schema.vertices.keys().collect();
383    ids.sort();
384
385    let has_outgoing = |id: &Name| -> bool {
386        schema
387            .outgoing
388            .get(id)
389            .is_some_and(|edges| !edges.is_empty())
390    };
391    let has_incoming = |id: &Name| -> bool {
392        schema
393            .incoming
394            .get(id)
395            .is_some_and(|edges| !edges.is_empty())
396    };
397
398    if let Some(id) = ids
399        .iter()
400        .copied()
401        .find(|id| has_outgoing(id) && !has_incoming(id))
402    {
403        return Some(id);
404    }
405    if let Some(id) = ids.iter().copied().find(|id| has_outgoing(id)) {
406        return Some(id);
407    }
408    ids.into_iter().next()
409}