Skip to main content

panproto_parse/
parse_emit_lens.rs

1//! The parse / emit pair as a verified asymmetric lens.
2//!
3//! `parse` and `emit_pretty` together form an asymmetric lens between a
4//! source-byte object `B` and a schema-object `S(p)` in the image of
5//! `parse_p`:
6//!
7//! ```text
8//!     parse_p :  B  ─▶  (S(p), C)         get-direction (with complement)
9//!     emit_p  :  S(p) ─▶ B                put-direction (drops complement)
10//! ```
11//!
12//! The complement `C` carries everything the schema doesn't fix:
13//! byte positions, interstitial whitespace, comments. `emit_pretty`
14//! reconstitutes a *canonical* element of `parse_p^{-1}(s)` using a
15//! whitespace policy in place of the discarded complement.
16//!
17//! In categorical terms this is a *retraction in the image of the
18//! parser*: for every `s ∈ image(parse_p)`, the round-trip
19//! `parse_p ∘ emit_p` returns a schema with the same vertex/edge kind
20//! multiset as `s`. We don't get pointwise equality because byte
21//! positions and interstitial constraints are not part of the
22//! emit_pretty image; that's the price of a canonical section.
23//!
24//! The law-checkers in this module verify the retraction explicitly so
25//! that any future change to `emit_pretty` is provably faithful.
26//!
27//! # Laws
28//!
29//! ## `EmitParse` (the retraction law)
30//!
31//! For all `s` in the image of `parse_p`,
32//!
33//! ```text
34//!     kind_multiset(parse_p(emit_p(s))) = kind_multiset(s')
35//! ```
36//!
37//! where `s'` is `s` with the byte-position constraints stripped (the
38//! portion that emit_pretty cannot preserve by construction).
39//!
40//! ## `ParseEmit` (stability under round-trip)
41//!
42//! For all `b` such that `parse_p(b)` succeeds,
43//!
44//! ```text
45//!     parse_p(emit_p(strip(parse_p(b)))) ≅_kinds strip(parse_p(b))
46//! ```
47//!
48//! That is, the emit_pretty image is a fixed point of the round-trip
49//! up to kind-multiset equivalence.
50
51use std::collections::BTreeMap;
52
53use panproto_schema::Schema;
54
55use crate::error::ParseError;
56use crate::registry::ParserRegistry;
57
58/// A protolens for the source-bytes ↔ schema relation at one protocol.
59///
60/// `ParseEmitLens` is parameterised by a protocol name; calls into the
61/// underlying `ParserRegistry` thread the parser and grammar through.
62pub struct ParseEmitLens<'r> {
63    registry: &'r ParserRegistry,
64    protocol: String,
65}
66
67impl<'r> ParseEmitLens<'r> {
68    /// Build a parse/emit lens for `protocol` against `registry`.
69    #[must_use]
70    pub fn new(registry: &'r ParserRegistry, protocol: impl Into<String>) -> Self {
71        Self {
72            registry,
73            protocol: protocol.into(),
74        }
75    }
76
77    /// Forward direction: source bytes → schema. The complement (byte
78    /// positions, interstitials) lives on the schema as constraints.
79    ///
80    /// # Errors
81    ///
82    /// Returns the parser's error if `protocol` is unknown or `source`
83    /// is unparseable for the protocol.
84    pub fn parse(&self, source: &[u8]) -> Result<Schema, ParseError> {
85        self.registry
86            .parse_with_protocol(&self.protocol, source, "parse_emit_lens")
87    }
88
89    /// Backward direction: schema → canonical source bytes. Drops
90    /// complement; output is one canonical representative of the
91    /// parse-preimage of `schema`.
92    ///
93    /// # Errors
94    ///
95    /// Returns the emitter's error if `protocol` is unknown or the
96    /// schema cannot be rendered (e.g. grammar.json was not vendored).
97    pub fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
98        self.registry
99            .emit_pretty_with_protocol(&self.protocol, schema)
100    }
101}
102
103/// Possible failures when verifying the parse/emit lens laws.
104#[derive(Debug, thiserror::Error)]
105#[non_exhaustive]
106pub enum LawViolation {
107    /// The retraction law `parse(emit(s)) ≅_kinds strip(s)` failed.
108    #[error("EmitParse law violated for protocol {protocol}: {detail}")]
109    EmitParse {
110        /// Protocol whose lens failed the law.
111        protocol: String,
112        /// Human-readable description of the divergence.
113        detail: String,
114    },
115    /// The stability law `parse(emit(strip(parse(b)))) ≅ strip(parse(b))` failed.
116    #[error("ParseEmit law violated for protocol {protocol}: {detail}")]
117    ParseEmit {
118        /// Protocol whose lens failed the law.
119        protocol: String,
120        /// Human-readable description of the divergence.
121        detail: String,
122    },
123    /// An underlying `parse` or `emit` step returned an error.
124    #[error("underlying parse/emit error: {0}")]
125    Underlying(#[from] ParseError),
126}
127
128/// Strip byte-position constraints from a schema.
129///
130/// Removes `start-byte`, `end-byte`, and `interstitial-*` constraints:
131/// these are the "complement" portion that `emit_pretty` does not
132/// reconstruct, so they are projected out before comparing.
133pub fn strip_complement(schema: &mut Schema) {
134    for constraints in schema.constraints.values_mut() {
135        constraints.retain(|c| {
136            let s = c.sort.as_ref();
137            !(s == "start-byte" || s == "end-byte" || s.starts_with("interstitial-"))
138        });
139    }
140}
141
142/// Vertex-kind multiset, half of the equivalence class used for law checks.
143///
144/// See also [`edge_multiset`]; both are required for a faithful retraction
145/// witness because two schemas can share a vertex-kind multiset while
146/// differing in edge structure (e.g. a tree and its mirror).
147#[must_use]
148pub fn kind_multiset(schema: &Schema) -> BTreeMap<String, usize> {
149    let mut map = BTreeMap::new();
150    for v in schema.vertices.values() {
151        *map.entry(v.kind.to_string()).or_insert(0) += 1;
152    }
153    map
154}
155
156/// Edge-shape multiset: counts of `(src_kind, edge_kind, tgt_kind)` triples.
157///
158/// Combined with [`kind_multiset`] this is the full witness the retraction
159/// law requires — vertex kinds alone leave edge structure unconstrained.
160///
161/// Note: `Schema::edges` is a `HashMap<Edge, Name>` so distinct edges
162/// (different src/tgt vertex IDs) are kept apart, but two edges that
163/// happen to share `(src_id, tgt_id, kind, name)` collapse to one
164/// entry. The shape multiset projects these structurally-distinct
165/// edges onto a kind-level signature, which is the appropriate
166/// granularity for the retraction law: `emit_pretty` cannot be expected
167/// to preserve vertex IDs (the parser invents fresh ones) but it must
168/// preserve the kind-shape of every edge.
169#[must_use]
170pub fn edge_multiset(schema: &Schema) -> BTreeMap<(String, String, String), usize> {
171    let mut map: BTreeMap<(String, String, String), usize> = BTreeMap::new();
172    for edge in schema.edges.keys() {
173        let src_kind = schema
174            .vertices
175            .get(&edge.src)
176            .map(|v| v.kind.to_string())
177            .unwrap_or_default();
178        let tgt_kind = schema
179            .vertices
180            .get(&edge.tgt)
181            .map(|v| v.kind.to_string())
182            .unwrap_or_default();
183        *map.entry((src_kind, edge.kind.to_string(), tgt_kind))
184            .or_insert(0) += 1;
185    }
186    map
187}
188
189/// Verify the `EmitParse` law on a given schema:
190///
191/// ```text
192///     kind_multiset(parse(emit(strip(s)))) == kind_multiset(strip(s))
193/// ```
194///
195/// Returns `Ok(())` iff the round-trip preserves the kind multiset.
196///
197/// # Errors
198///
199/// Returns [`LawViolation::EmitParse`] when the round-trip changes
200/// the kind multiset, or [`LawViolation::Underlying`] if `parse` or
201/// `emit` itself fails.
202pub fn check_emit_parse(lens: &ParseEmitLens<'_>, schema: &Schema) -> Result<(), LawViolation> {
203    let mut stripped = schema.clone();
204    strip_complement(&mut stripped);
205    let expected_kinds = kind_multiset(&stripped);
206    let expected_edges = edge_multiset(&stripped);
207
208    let bytes = lens.emit(&stripped)?;
209    let mut round = lens.parse(&bytes)?;
210    strip_complement(&mut round);
211    let actual_kinds = kind_multiset(&round);
212    let actual_edges = edge_multiset(&round);
213
214    if expected_kinds != actual_kinds {
215        return Err(LawViolation::EmitParse {
216            protocol: lens.protocol.clone(),
217            detail: format!(
218                "vertex-kind multiset mismatch: expected {} distinct kinds, got {}; \
219                 first divergence: {:?}",
220                expected_kinds.len(),
221                actual_kinds.len(),
222                first_divergence(&expected_kinds, &actual_kinds),
223            ),
224        });
225    }
226    if expected_edges != actual_edges {
227        return Err(LawViolation::EmitParse {
228            protocol: lens.protocol.clone(),
229            detail: format!(
230                "edge-shape multiset mismatch: expected {} distinct edge shapes, got {}",
231                expected_edges.len(),
232                actual_edges.len(),
233            ),
234        });
235    }
236    Ok(())
237}
238
239/// Verify the `ParseEmit` stability law on a source byte string:
240///
241/// ```text
242///     kind_multiset(parse(emit(strip(parse(b)))))
243///         == kind_multiset(strip(parse(b)))
244/// ```
245///
246/// Returns `Ok(())` iff `b` round-trips through the lens up to kind
247/// multiset.
248///
249/// # Errors
250///
251/// Returns [`LawViolation::EmitParse`] when the round-trip changes
252/// the kind multiset, or [`LawViolation::Underlying`] if `parse` or
253/// `emit` itself fails.
254pub fn check_parse_emit(lens: &ParseEmitLens<'_>, bytes: &[u8]) -> Result<(), LawViolation> {
255    let parsed = lens.parse(bytes)?;
256    check_emit_parse(lens, &parsed)
257}
258
259fn first_divergence(
260    expected: &BTreeMap<String, usize>,
261    actual: &BTreeMap<String, usize>,
262) -> Option<(String, Option<usize>, Option<usize>)> {
263    for (k, &v) in expected {
264        if actual.get(k) != Some(&v) {
265            return Some((k.clone(), Some(v), actual.get(k).copied()));
266        }
267    }
268    for (k, &v) in actual {
269        if !expected.contains_key(k) {
270            return Some((k.clone(), None, Some(v)));
271        }
272    }
273    None
274}
275
276#[cfg(test)]
277#[cfg(feature = "grammars")]
278#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic, dead_code)]
279mod tests {
280    use super::*;
281
282    fn run_check(protocol: &str, source: &[u8]) {
283        let registry = ParserRegistry::new();
284        let lens = ParseEmitLens::new(&registry, protocol);
285        check_parse_emit(&lens, source)
286            .unwrap_or_else(|e| panic!("law check failed for {protocol}: {e}"));
287    }
288
289    #[cfg(feature = "lang-json")]
290    #[test]
291    fn json_lens_satisfies_laws() {
292        std::thread::Builder::new()
293            .stack_size(32 * 1024 * 1024)
294            .spawn(|| run_check("json", br#"{"a": 1, "b": [2, 3]}"#))
295            .expect("spawn")
296            .join()
297            .expect("worker panicked");
298    }
299
300    #[cfg(feature = "lang-toml")]
301    #[test]
302    fn toml_lens_satisfies_laws() {
303        std::thread::Builder::new()
304            .stack_size(32 * 1024 * 1024)
305            .spawn(|| run_check("toml", b"name = \"foo\"\nversion = \"1.0\"\n"))
306            .expect("spawn")
307            .join()
308            .expect("worker panicked");
309    }
310
311    #[cfg(feature = "lang-json")]
312    #[test]
313    fn json_check_emit_parse_directly() {
314        std::thread::Builder::new()
315            .stack_size(32 * 1024 * 1024)
316            .spawn(|| {
317                let registry = ParserRegistry::new();
318                let lens = ParseEmitLens::new(&registry, "json");
319                let parsed = lens.parse(br#"[1, 2, 3]"#).expect("parse");
320                check_emit_parse(&lens, &parsed).expect("retraction holds for parsed schema");
321            })
322            .expect("spawn")
323            .join()
324            .expect("worker panicked");
325    }
326
327    #[cfg(feature = "lang-json")]
328    #[test]
329    fn strip_complement_removes_byte_constraints_only() {
330        std::thread::Builder::new()
331            .stack_size(32 * 1024 * 1024)
332            .spawn(|| {
333                let registry = ParserRegistry::new();
334                let lens = ParseEmitLens::new(&registry, "json");
335                let mut parsed = lens.parse(br#"{"a": 1}"#).expect("parse");
336
337                let total_constraint_count: usize = parsed.constraints.values().map(Vec::len).sum();
338                strip_complement(&mut parsed);
339                let stripped_total: usize = parsed.constraints.values().map(Vec::len).sum();
340
341                assert!(
342                    stripped_total < total_constraint_count,
343                    "strip_complement must remove byte-position constraints"
344                );
345                // Walker emits chose-alt-fingerprint and similar
346                // constraints that strip_complement preserves.
347                let preserved = parsed.constraints.values().any(|cs| {
348                    cs.iter()
349                        .any(|c| c.sort.as_ref() == "chose-alt-fingerprint")
350                });
351                assert!(
352                    preserved,
353                    "strip_complement must preserve chose-alt-fingerprint witnesses"
354                );
355            })
356            .expect("spawn")
357            .join()
358            .expect("worker panicked");
359    }
360
361    #[cfg(feature = "lang-json")]
362    #[test]
363    fn edge_multiset_distinguishes_structurally_different_schemas() {
364        std::thread::Builder::new()
365            .stack_size(32 * 1024 * 1024)
366            .spawn(|| {
367                let registry = ParserRegistry::new();
368                let lens = ParseEmitLens::new(&registry, "json");
369                let s1 = lens.parse(br#"{"a": 1}"#).expect("parse");
370                let s2 = lens.parse(br#"[1]"#).expect("parse");
371                let m1 = edge_multiset(&s1);
372                let m2 = edge_multiset(&s2);
373                assert_ne!(
374                    m1, m2,
375                    "object and array schemas have distinct edge-shape multisets"
376                );
377            })
378            .expect("spawn")
379            .join()
380            .expect("worker panicked");
381    }
382
383    #[test]
384    fn first_divergence_finds_count_mismatch() {
385        let mut a = BTreeMap::new();
386        a.insert("x".to_owned(), 1);
387        let mut b = BTreeMap::new();
388        b.insert("x".to_owned(), 2);
389        assert_eq!(
390            first_divergence(&a, &b),
391            Some(("x".to_owned(), Some(1), Some(2)))
392        );
393    }
394
395    #[test]
396    fn first_divergence_finds_extra_key_in_actual() {
397        let a = BTreeMap::new();
398        let mut b = BTreeMap::new();
399        b.insert("y".to_owned(), 3);
400        assert_eq!(
401            first_divergence(&a, &b),
402            Some(("y".to_owned(), None, Some(3)))
403        );
404    }
405
406    #[test]
407    fn first_divergence_returns_none_on_match() {
408        let mut a = BTreeMap::new();
409        a.insert("x".to_owned(), 1);
410        let b = a.clone();
411        assert_eq!(first_divergence(&a, &b), None);
412    }
413}