Skip to main content

panproto_parse/
parse_emit_lens.rs

1//! The parse / emit pair as a verified asymmetric lens.
2//!
3//! `parse` and `emit_pretty` together form an asymmetric lens between a
4//! source-byte object `B` and a schema-object `S(p)` in the image of
5//! `parse_p`:
6//!
7//! ```text
8//!     parse_p :  B  ─▶  (S(p), C)         get-direction (with complement)
9//!     emit_p  :  S(p) ─▶ B                put-direction (drops complement)
10//! ```
11//!
12//! The complement `C` carries everything the schema doesn't fix:
13//! byte positions, interstitial whitespace, comments. `emit_pretty`
14//! reconstitutes a *canonical* element of `parse_p^{-1}(s)` using a
15//! whitespace policy in place of the discarded complement.
16//!
17//! In categorical terms this is a *retraction in the image of the
18//! parser*: for every `s ∈ image(parse_p)`, the round-trip
19//! `parse_p ∘ emit_p` returns a schema with the same vertex/edge kind
20//! multiset as `s`. We don't get pointwise equality because byte
21//! positions and interstitial constraints are not part of the
22//! emit_pretty image; that's the price of a canonical section.
23//!
24//! The law-checkers in this module verify the retraction explicitly so
25//! that any future change to `emit_pretty` is provably faithful.
26//!
27//! # Laws
28//!
29//! ## `EmitParse` (the retraction law)
30//!
31//! For all `s` in the image of `parse_p`,
32//!
33//! ```text
34//!     kind_multiset(parse_p(emit_p(s))) = kind_multiset(s')
35//! ```
36//!
37//! where `s'` is `s` with the byte-position constraints stripped (the
38//! portion that emit_pretty cannot preserve by construction).
39//!
40//! ## `ParseEmit` (stability under round-trip)
41//!
42//! For all `b` such that `parse_p(b)` succeeds,
43//!
44//! ```text
45//!     parse_p(emit_p(strip(parse_p(b)))) ≅_kinds strip(parse_p(b))
46//! ```
47//!
48//! That is, the emit_pretty image is a fixed point of the round-trip
49//! up to kind-multiset equivalence.
50
51use std::collections::BTreeMap;
52
53use panproto_schema::Schema;
54
55use crate::error::ParseError;
56use crate::registry::ParserRegistry;
57
58/// A protolens for the source-bytes ↔ schema relation at one protocol.
59///
60/// `ParseEmitLens` is parameterised by a protocol name; calls into the
61/// underlying `ParserRegistry` thread the parser and grammar through.
62pub struct ParseEmitLens<'r> {
63    registry: &'r ParserRegistry,
64    protocol: String,
65}
66
67impl<'r> ParseEmitLens<'r> {
68    /// Build a parse/emit lens for `protocol` against `registry`.
69    #[must_use]
70    pub fn new(registry: &'r ParserRegistry, protocol: impl Into<String>) -> Self {
71        Self {
72            registry,
73            protocol: protocol.into(),
74        }
75    }
76
77    /// Forward direction: source bytes → schema. The complement (byte
78    /// positions, interstitials) lives on the schema as constraints.
79    ///
80    /// # Errors
81    ///
82    /// Returns the parser's error if `protocol` is unknown or `source`
83    /// is unparseable for the protocol.
84    pub fn parse(&self, source: &[u8]) -> Result<Schema, ParseError> {
85        self.registry
86            .parse_with_protocol(&self.protocol, source, "parse_emit_lens")
87    }
88
89    /// Backward direction: schema → canonical source bytes. Drops
90    /// complement; output is one canonical representative of the
91    /// parse-preimage of `schema`.
92    ///
93    /// # Errors
94    ///
95    /// Returns the emitter's error if `protocol` is unknown or the
96    /// schema cannot be rendered (e.g. grammar.json was not vendored).
97    pub fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
98        self.registry
99            .emit_pretty_with_protocol(&self.protocol, schema)
100    }
101}
102
103/// Possible failures when verifying the parse/emit lens laws.
104#[derive(Debug, thiserror::Error)]
105#[non_exhaustive]
106pub enum LawViolation {
107    /// The retraction law `parse(emit(s)) ≅_kinds strip(s)` failed.
108    #[error("EmitParse law violated for protocol {protocol}: {detail}")]
109    EmitParse {
110        /// Protocol whose lens failed the law.
111        protocol: String,
112        /// Human-readable description of the divergence.
113        detail: String,
114    },
115    /// The stability law `parse(emit(strip(parse(b)))) ≅ strip(parse(b))` failed.
116    #[error("ParseEmit law violated for protocol {protocol}: {detail}")]
117    ParseEmit {
118        /// Protocol whose lens failed the law.
119        protocol: String,
120        /// Human-readable description of the divergence.
121        detail: String,
122    },
123    /// An underlying `parse` or `emit` step returned an error.
124    #[error("underlying parse/emit error: {0}")]
125    Underlying(#[from] ParseError),
126}
127
128/// Strip byte-position constraints from a schema.
129///
130/// Removes `start-byte`, `end-byte`, and `interstitial-*` constraints
131/// — the byte-positional portion of the layout fibre that
132/// `emit_pretty` cannot reconstruct (the parser invents fresh
133/// positions). `chose-alt-*` discriminators are **preserved** because
134/// they're the categorical witness of which CHOICE alternative the
135/// parser took and `emit_pretty` consumes them directly to dispatch
136/// without re-deriving the choice. This is the `EmitParse` law's
137/// "complement" projection — distinct from
138/// [`Schema::forget_layout`], which strips every layout-fibre sort
139/// including `chose-alt-*` to produce a truly abstract schema for
140/// the parse/decorate/emit lens.
141pub fn strip_complement(schema: &mut Schema) {
142    for constraints in schema.constraints.values_mut() {
143        constraints.retain(|c| {
144            let s = c.sort.as_ref();
145            !(s == "start-byte" || s == "end-byte" || s.starts_with("interstitial-"))
146        });
147    }
148}
149
150/// Vertex-kind multiset re-exported from [`panproto_schema::kind_multiset`].
151///
152/// Hosted in `panproto-schema` so the equivalence witness can be used
153/// by both the lens framework's law harness and the parse module's
154/// law checkers without duplication.
155#[must_use]
156pub fn kind_multiset(schema: &Schema) -> BTreeMap<String, usize> {
157    panproto_schema::kind_multiset(schema)
158}
159
160/// Edge-shape multiset re-exported from
161/// [`panproto_schema::edge_multiset`]. See that module for the
162/// rationale on multiset granularity.
163#[must_use]
164pub fn edge_multiset(schema: &Schema) -> BTreeMap<(String, String, String), usize> {
165    panproto_schema::edge_multiset(schema)
166}
167
168/// Verify the `EmitParse` law on a given schema:
169///
170/// ```text
171///     kind_multiset(parse(emit(strip(s)))) == kind_multiset(strip(s))
172/// ```
173///
174/// Returns `Ok(())` iff the round-trip preserves the kind multiset.
175///
176/// # Errors
177///
178/// Returns [`LawViolation::EmitParse`] when the round-trip changes
179/// the kind multiset, or [`LawViolation::Underlying`] if `parse` or
180/// `emit` itself fails.
181pub fn check_emit_parse(lens: &ParseEmitLens<'_>, schema: &Schema) -> Result<(), LawViolation> {
182    let mut stripped = schema.clone();
183    strip_complement(&mut stripped);
184    let expected_kinds = kind_multiset(&stripped);
185    let expected_edges = edge_multiset(&stripped);
186
187    let bytes = lens.emit(&stripped)?;
188    let mut round = lens.parse(&bytes)?;
189    strip_complement(&mut round);
190    let actual_kinds = kind_multiset(&round);
191    let actual_edges = edge_multiset(&round);
192
193    if expected_kinds != actual_kinds {
194        return Err(LawViolation::EmitParse {
195            protocol: lens.protocol.clone(),
196            detail: format!(
197                "vertex-kind multiset mismatch: expected {} distinct kinds, got {}; \
198                 first divergence: {:?}",
199                expected_kinds.len(),
200                actual_kinds.len(),
201                first_divergence(&expected_kinds, &actual_kinds),
202            ),
203        });
204    }
205    if expected_edges != actual_edges {
206        return Err(LawViolation::EmitParse {
207            protocol: lens.protocol.clone(),
208            detail: format!(
209                "edge-shape multiset mismatch: expected {} distinct edge shapes, got {}",
210                expected_edges.len(),
211                actual_edges.len(),
212            ),
213        });
214    }
215    Ok(())
216}
217
218/// Verify the `ParseEmit` stability law on a source byte string:
219///
220/// ```text
221///     kind_multiset(parse(emit(strip(parse(b)))))
222///         == kind_multiset(strip(parse(b)))
223/// ```
224///
225/// Returns `Ok(())` iff `b` round-trips through the lens up to kind
226/// multiset.
227///
228/// # Errors
229///
230/// Returns [`LawViolation::EmitParse`] when the round-trip changes
231/// the kind multiset, or [`LawViolation::Underlying`] if `parse` or
232/// `emit` itself fails.
233pub fn check_parse_emit(lens: &ParseEmitLens<'_>, bytes: &[u8]) -> Result<(), LawViolation> {
234    let parsed = lens.parse(bytes)?;
235    check_emit_parse(lens, &parsed)
236}
237
/// Find the first key on which two count maps disagree.
///
/// Keys of `expected` are examined first, in sorted order: a key whose
/// count differs in `actual`, or which is absent from `actual`, is
/// reported as `(key, Some(expected_count), actual_count)`. Only if all
/// of those agree are keys that exist solely in `actual` considered,
/// reported as `(key, None, Some(actual_count))`. Returns `None` when
/// the maps are equal.
fn first_divergence(
    expected: &BTreeMap<String, usize>,
    actual: &BTreeMap<String, usize>,
) -> Option<(String, Option<usize>, Option<usize>)> {
    let changed_or_missing = expected.iter().find_map(|(key, &want)| {
        let got = actual.get(key).copied();
        (got != Some(want)).then(|| (key.clone(), Some(want), got))
    });

    changed_or_missing.or_else(|| {
        actual
            .iter()
            .find(|(key, _)| !expected.contains_key(*key))
            .map(|(key, &got)| (key.clone(), None, Some(got)))
    })
}
254
// Lint relaxations for test code: `expect`/`unwrap`/`panic!` are how the
// tests report failure, and `dead_code` covers helpers that end up unused
// when the `lang-*` features gating their callers are disabled.
#[cfg(all(test, feature = "grammars"))]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic, dead_code)]
mod tests {
    use super::*;

    /// Stack size given to every worker thread that drives the parser.
    // NOTE(review): presumably chosen because parsing/emitting recurses
    // deeper than the default test-thread stack allows — confirm.
    const WORKER_STACK_BYTES: usize = 32 * 1024 * 1024;

    /// Run `body` on a dedicated thread with a `WORKER_STACK_BYTES` stack
    /// and propagate any panic from it to the calling test.
    fn on_large_stack<F>(body: F)
    where
        F: FnOnce() + Send + 'static,
    {
        std::thread::Builder::new()
            .stack_size(WORKER_STACK_BYTES)
            .spawn(body)
            .expect("spawn")
            .join()
            .expect("worker panicked");
    }

    /// Build a fresh registry and lens for `protocol` and assert that
    /// `source` satisfies the `ParseEmit` law.
    fn run_check(protocol: &str, source: &[u8]) {
        let registry = ParserRegistry::new();
        let lens = ParseEmitLens::new(&registry, protocol);
        if let Err(e) = check_parse_emit(&lens, source) {
            panic!("law check failed for {protocol}: {e}");
        }
    }

    #[cfg(feature = "lang-json")]
    #[test]
    fn json_lens_satisfies_laws() {
        on_large_stack(|| run_check("json", br#"{"a": 1, "b": [2, 3]}"#));
    }

    #[cfg(feature = "lang-toml")]
    #[test]
    fn toml_lens_satisfies_laws() {
        on_large_stack(|| run_check("toml", b"name = \"foo\"\nversion = \"1.0\"\n"));
    }

    #[cfg(feature = "lang-json")]
    #[test]
    fn json_check_emit_parse_directly() {
        on_large_stack(|| {
            let registry = ParserRegistry::new();
            let lens = ParseEmitLens::new(&registry, "json");
            let parsed = lens.parse(b"[1, 2, 3]").expect("parse");
            check_emit_parse(&lens, &parsed).expect("retraction holds for parsed schema");
        });
    }

    #[cfg(feature = "lang-json")]
    #[test]
    fn strip_complement_removes_byte_constraints_only() {
        on_large_stack(|| {
            let registry = ParserRegistry::new();
            let lens = ParseEmitLens::new(&registry, "json");
            let mut parsed = lens.parse(br#"{"a": 1}"#).expect("parse");

            let total_constraint_count: usize = parsed.constraints.values().map(Vec::len).sum();
            strip_complement(&mut parsed);
            let stripped_total: usize = parsed.constraints.values().map(Vec::len).sum();

            assert!(
                stripped_total < total_constraint_count,
                "strip_complement must remove byte-position constraints"
            );

            // The walker emits chose-alt-fingerprint (and similar)
            // constraints; strip_complement must leave those in place.
            let preserved = parsed
                .constraints
                .values()
                .flat_map(|cs| cs.iter())
                .any(|c| c.sort.as_ref() == "chose-alt-fingerprint");
            assert!(
                preserved,
                "strip_complement must preserve chose-alt-fingerprint witnesses"
            );
        });
    }

    #[cfg(feature = "lang-json")]
    #[test]
    fn edge_multiset_distinguishes_structurally_different_schemas() {
        on_large_stack(|| {
            let registry = ParserRegistry::new();
            let lens = ParseEmitLens::new(&registry, "json");
            let object_schema = lens.parse(br#"{"a": 1}"#).expect("parse");
            let array_schema = lens.parse(b"[1]").expect("parse");
            assert_ne!(
                edge_multiset(&object_schema),
                edge_multiset(&array_schema),
                "object and array schemas have distinct edge-shape multisets"
            );
        });
    }

    #[test]
    fn first_divergence_finds_count_mismatch() {
        let expected = BTreeMap::from([("x".to_owned(), 1)]);
        let actual = BTreeMap::from([("x".to_owned(), 2)]);
        assert_eq!(
            first_divergence(&expected, &actual),
            Some(("x".to_owned(), Some(1), Some(2)))
        );
    }

    #[test]
    fn first_divergence_finds_extra_key_in_actual() {
        let expected = BTreeMap::new();
        let actual = BTreeMap::from([("y".to_owned(), 3)]);
        assert_eq!(
            first_divergence(&expected, &actual),
            Some(("y".to_owned(), None, Some(3)))
        );
    }

    #[test]
    fn first_divergence_returns_none_on_match() {
        let expected = BTreeMap::from([("x".to_owned(), 1)]);
        let actual = expected.clone();
        assert_eq!(first_divergence(&expected, &actual), None);
    }
}