// facet_format/parser.rs

extern crate alloc;

use crate::ParseError;
use alloc::collections::VecDeque;
use facet_reflect::Span;

/// Opaque token returned by [`FormatParser::save`].
///
/// This token can be passed to [`FormatParser::restore`] to replay
/// all events that were consumed since the save point.
///
/// The wrapped `u64` is the save point's ID. How parsers allocate IDs is
/// not defined here; treat the value as opaque.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SavePoint(pub u64);

impl SavePoint {
    /// Create a new save point with the given ID.
    ///
    /// This is a `const fn`, so a token can also be built in constant contexts.
    pub const fn new(id: u64) -> Self {
        Self(id)
    }
}

21/// Streaming parser for a specific wire format.
22pub trait FormatParser<'de> {
23    /// Read the next parse event, or `None` if the input is exhausted.
24    ///
25    /// Returns `Ok(None)` at end-of-input (EOF). For formats like TOML where
26    /// structs can be "reopened" (fields added after the struct was previously
27    /// exited), callers should continue processing until EOF rather than
28    /// stopping at `StructEnd`.
29    ///
30    /// If [`restore`](Self::restore) was called, events are first replayed
31    /// from the internal buffer before reading new events from the input.
32    fn next_event(&mut self) -> Result<Option<crate::ParseEvent<'de>>, ParseError>;
33
34    /// Read multiple parse events into a deque, returning the number of events read.
35    ///
36    /// This is an optimization for parsers that can produce multiple events efficiently
37    /// in a single call, amortizing function call overhead and improving cache locality.
38    ///
39    /// Returns `Ok(0)` at end-of-input (EOF).
40    ///
41    /// The default implementation calls `next_event` repeatedly up to `limit` times.
42    /// Parsers can override this for better performance.
43    fn next_events(
44        &mut self,
45        buf: &mut VecDeque<crate::ParseEvent<'de>>,
46        limit: usize,
47    ) -> Result<usize, ParseError> {
48        let mut count = 0;
49        while count < limit {
50            match self.next_event()? {
51                Some(event) => {
52                    buf.push_back(event);
53                    count += 1;
54                }
55                None => break,
56            }
57        }
58        Ok(count)
59    }
60
61    /// Peek at the next event without consuming it, or `None` if at EOF.
62    fn peek_event(&mut self) -> Result<Option<crate::ParseEvent<'de>>, ParseError>;
63
64    /// Skip the current value (for unknown fields, etc.).
65    fn skip_value(&mut self) -> Result<(), ParseError>;
66
67    /// Save the current parser position and start recording events.
68    ///
69    /// Returns a [`SavePoint`] token. All events returned by [`next_event`](Self::next_event)
70    /// after this call are recorded internally. Call [`restore`](Self::restore) with this
71    /// token to replay all recorded events.
72    ///
73    /// This is used for untagged enum resolution: save, read ahead to determine
74    /// the variant, then restore and parse with the correct type.
75    fn save(&mut self) -> SavePoint;
76
77    /// Restore to a previous save point, replaying recorded events.
78    ///
79    /// After calling this, subsequent calls to [`next_event`](Self::next_event) will
80    /// first return all events that were recorded since the save point, then
81    /// continue reading from the input.
82    ///
83    /// The save point is consumed - to save again, call [`save`](Self::save).
84    fn restore(&mut self, save_point: SavePoint);
85
86    /// Capture the raw representation of the current value without parsing it.
87    ///
88    /// This is used for types like `RawJson` that want to defer parsing.
89    /// The parser should skip the value and return the raw bytes/string
90    /// from the input.
91    ///
92    /// Returns `Ok(None)` if raw capture is not supported (e.g., streaming mode
93    /// or formats where raw capture doesn't make sense).
94    fn capture_raw(&mut self) -> Result<Option<&'de str>, ParseError> {
95        // Default: not supported
96        self.skip_value()?;
97        Ok(None)
98    }
99
100    /// Returns the raw input bytes, if available.
101    ///
102    /// This is used by the deserializer to implement raw capture when buffering
103    /// events. The deserializer tracks value boundaries using event spans and
104    /// slices the input directly.
105    ///
106    /// Returns `None` for streaming parsers that don't have the full input.
107    fn input(&self) -> Option<&'de [u8]> {
108        None
109    }
110
111    /// Returns the shape of the format's raw capture type (e.g., `RawJson::SHAPE`).
112    ///
113    /// When the deserializer encounters a shape that matches this, it will use
114    /// `capture_raw` to capture the raw representation and store it in a
115    /// `Cow<str>` (the raw type must be a newtype over `Cow<str>`).
116    ///
117    /// Returns `None` if this format doesn't support raw capture types.
118    fn raw_capture_shape(&self) -> Option<&'static facet_core::Shape> {
119        None
120    }
121
122    /// Returns true if this format is self-describing.
123    ///
124    /// Self-describing formats (like JSON, YAML) include type information in the wire format
125    /// and emit `FieldKey` events for struct fields.
126    ///
127    /// Non-self-describing formats (like postcard, bincode) don't include type markers
128    /// and use `OrderedField` events, relying on the driver to provide schema information
129    /// via `hint_struct_fields`.
130    fn is_self_describing(&self) -> bool {
131        true // Default: most formats are self-describing
132    }
133
134    /// Hint to the parser that a struct with the given number of fields is expected.
135    ///
136    /// For non-self-describing formats, this allows the parser to emit the correct
137    /// number of `OrderedField` events followed by `StructEnd`.
138    ///
139    /// Self-describing formats can ignore this hint.
140    fn hint_struct_fields(&mut self, _num_fields: usize) {
141        // Default: ignore (self-describing formats don't need this)
142    }
143
144    /// Hint to the parser what scalar type is expected next.
145    ///
146    /// For non-self-describing formats, this allows the parser to correctly
147    /// decode the next value and emit an appropriate `Scalar` event.
148    ///
149    /// Self-describing formats can ignore this hint (they determine the type
150    /// from the wire format).
151    fn hint_scalar_type(&mut self, _hint: ScalarTypeHint) {
152        // Default: ignore (self-describing formats don't need this)
153    }
154
155    /// Hint to the parser that a sequence (array/Vec) is expected.
156    ///
157    /// For non-self-describing formats, this triggers reading the length prefix
158    /// and setting up sequence state.
159    ///
160    /// Self-describing formats can ignore this hint.
161    fn hint_sequence(&mut self) {
162        // Default: ignore (self-describing formats don't need this)
163    }
164
165    /// Hint to the parser that a byte sequence (`Vec<u8>`, `&[u8]`, etc.) is expected.
166    ///
167    /// For binary formats like postcard that store `Vec<u8>` as raw bytes (varint length
168    /// followed by raw data), this allows bulk reading instead of element-by-element
169    /// deserialization.
170    ///
171    /// If the parser handles this hint, it should emit `Scalar(Bytes(...))` directly.
172    /// If it doesn't support this optimization, it should return `false` and the
173    /// deserializer will fall back to element-by-element deserialization via `hint_sequence`.
174    ///
175    /// Returns `true` if the hint is handled (parser will emit `Scalar(Bytes(...))`),
176    /// `false` otherwise.
177    fn hint_byte_sequence(&mut self) -> bool {
178        // Default: not supported, fall back to element-by-element
179        false
180    }
181
182    /// Hint to the parser that all remaining input bytes should be consumed as a byte slice.
183    ///
184    /// This is used by formats like postcard for trailing opaque payloads where the
185    /// field boundary is "until end of input" rather than a length prefix.
186    ///
187    /// If handled, the parser should emit `Scalar(Bytes(...))` and advance to EOF.
188    /// Returns `true` if handled, `false` to use normal deserialization behavior.
189    fn hint_remaining_byte_sequence(&mut self) -> bool {
190        false
191    }
192
193    /// Hint to the parser that a fixed-size array is expected.
194    ///
195    /// For non-self-describing formats, this tells the parser the array length
196    /// is known at compile time (from the type), so no length prefix is read.
197    /// This differs from `hint_sequence` which reads a length prefix for Vec/slices.
198    ///
199    /// Self-describing formats can ignore this hint.
200    fn hint_array(&mut self, _len: usize) {
201        // Default: ignore (self-describing formats don't need this)
202    }
203
204    /// Hint to the parser that an `Option<T>` is expected.
205    ///
206    /// For non-self-describing formats (like postcard), this allows the parser
207    /// to read the discriminant byte and emit either:
208    /// - `Scalar(Null)` for None (discriminant 0x00)
209    /// - Set up state to parse the inner value for Some (discriminant 0x01)
210    ///
211    /// Self-describing formats can ignore this hint (they determine `Option`
212    /// presence from the wire format, e.g., null vs value in JSON).
213    fn hint_option(&mut self) {
214        // Default: ignore (self-describing formats don't need this)
215    }
216
217    /// Hint to the parser that a map is expected.
218    ///
219    /// For non-self-describing formats (like postcard), this allows the parser
220    /// to read the length prefix and set up map state. The parser should then
221    /// emit `SequenceStart` (representing the map entries) followed by pairs of
222    /// key and value events, and finally `SequenceEnd`.
223    ///
224    /// Self-describing formats can ignore this hint (they determine map structure
225    /// from the wire format, e.g., `{...}` in JSON).
226    fn hint_map(&mut self) {
227        // Default: ignore (self-describing formats don't need this)
228    }
229
230    /// Hint to the parser that a dynamic value is expected.
231    ///
232    /// Non-self-describing formats can use this to switch to a self-describing
233    /// encoding for dynamic values (e.g., tagged scalar/array/object).
234    /// Self-describing formats can ignore this hint.
235    fn hint_dynamic_value(&mut self) {
236        // Default: ignore (self-describing formats don't need this)
237    }
238
239    /// Hint to the parser that an enum is expected, providing variant information.
240    ///
241    /// For non-self-describing formats (like postcard), this allows the parser
242    /// to read the variant discriminant (varint) and map it to the variant name,
243    /// and to emit appropriate wrapper events for multi-field variants.
244    ///
245    /// The `variants` slice contains metadata for each variant in declaration order,
246    /// matching the indices used in the wire format.
247    ///
248    /// Self-describing formats can ignore this hint (they include variant names
249    /// in the wire format).
250    fn hint_enum(&mut self, _variants: &[EnumVariantHint]) {
251        // Default: ignore (self-describing formats don't need this)
252    }
253
254    /// Hint to the parser that an opaque scalar type is expected.
255    ///
256    /// For non-self-describing binary formats (like postcard), this allows the parser
257    /// to use format-specific encoding for types like UUID (16 raw bytes), ULID,
258    /// OrderedFloat, etc. that have a more efficient binary representation than
259    /// their string form.
260    ///
261    /// The `type_identifier` is the type's identifier string (e.g., "Uuid", "Ulid",
262    /// "OrderedFloat", `DateTime<Utc>`). The `shape` provides access to inner type
263    /// information (e.g., whether OrderedFloat wraps f32 or f64).
264    ///
265    /// Returns `true` if the parser will handle this type specially (caller should
266    /// expect format-specific `ScalarValue`), or `false` to fall back to standard
267    /// handling (e.g., `hint_scalar_type(String)` for `FromStr` types).
268    ///
269    /// Self-describing formats can ignore this and return `false`.
270    fn hint_opaque_scalar(
271        &mut self,
272        _type_identifier: &'static str,
273        _shape: &'static facet_core::Shape,
274    ) -> bool {
275        // Default: not handled, fall back to standard behavior
276        false
277    }
278
279    /// Returns the source span of the most recently consumed event.
280    ///
281    /// This is used for error reporting - when a deserialization error occurs,
282    /// the span of the last consumed event helps locate the problem in the input.
283    ///
284    /// Parsers that track source positions should override this to return
285    /// meaningful span information. The default implementation returns `None`.
286    fn current_span(&self) -> Option<Span> {
287        None
288    }
289
290    /// Returns the format namespace for format-specific proxy resolution.
291    ///
292    /// When a field or container has format-specific proxies (e.g., `#[facet(xml::proxy = XmlProxy)]`),
293    /// this namespace is used to look up the appropriate proxy. If no namespace is returned,
294    /// only the format-agnostic proxy (`#[facet(proxy = ...)]`) is considered.
295    ///
296    /// Examples:
297    /// - XML parser should return `Some("xml")`
298    /// - JSON parser should return `Some("json")`
299    ///
300    /// Default: returns `None` (only format-agnostic proxies are used).
301    fn format_namespace(&self) -> Option<&'static str> {
302        None
303    }
304}
305
306/// Metadata about an enum variant for use with `hint_enum`.
307///
308/// Provides the information needed by non-self-describing formats to correctly
309/// parse enum variants, including the variant's structure kind and field count.
310#[derive(Debug, Clone, Copy, PartialEq, Eq)]
311pub struct EnumVariantHint {
312    /// Name of the variant (e.g., "Some", "Pair", "Named")
313    pub name: &'static str,
314    /// The kind of struct this variant represents (Unit, Tuple, TupleStruct, or Struct)
315    pub kind: facet_core::StructKind,
316    /// Number of fields in this variant
317    pub field_count: usize,
318}
319
/// Hint for what scalar type is expected next.
///
/// Used by non-self-describing formats to know how to decode the next value.
/// The per-variant notes describe the postcard encoding as an example.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ScalarTypeHint {
    /// Boolean (postcard: 0 or 1 byte)
    Bool,
    /// Unsigned 8-bit integer (postcard: raw byte)
    U8,
    /// Unsigned 16-bit integer (postcard: varint)
    U16,
    /// Unsigned 32-bit integer (postcard: varint)
    U32,
    /// Unsigned 64-bit integer (postcard: varint)
    U64,
    /// Unsigned 128-bit integer (postcard: varint)
    U128,
    /// Platform-sized unsigned integer (postcard: varint)
    Usize,
    /// Signed 8-bit integer (postcard: zigzag varint)
    I8,
    /// Signed 16-bit integer (postcard: zigzag varint)
    I16,
    /// Signed 32-bit integer (postcard: zigzag varint)
    I32,
    /// Signed 64-bit integer (postcard: zigzag varint)
    I64,
    /// Signed 128-bit integer (postcard: zigzag varint)
    I128,
    /// Platform-sized signed integer (postcard: zigzag varint)
    Isize,
    /// 32-bit float (postcard: 4 bytes little-endian)
    F32,
    /// 64-bit float (postcard: 8 bytes little-endian)
    F64,
    /// UTF-8 string (postcard: varint length + bytes)
    String,
    /// Raw bytes (postcard: varint length + bytes)
    Bytes,
    /// Character (postcard: UTF-8 encoded)
    Char,
}

/// Extension trait for parsers that support format-specific JIT (Tier 2).
///
/// Parsers implement this trait to enable the Tier 2 fast path, which
/// generates Cranelift IR that parses bytes directly instead of going
/// through the event abstraction.
///
/// # Requirements
///
/// Tier 2 requires:
/// - The full input slice must be available upfront
/// - The parser must be able to report and update its cursor position
/// - The parser must reset internal state when `jit_set_pos` is called
#[cfg(feature = "jit")]
pub trait FormatJitParser<'de>: FormatParser<'de> {
    /// The format-specific JIT emitter type.
    type FormatJit: crate::jit::JitFormat;

    /// Return the full input slice.
    fn jit_input(&self) -> &'de [u8];

    /// Return the current byte offset (cursor position).
    ///
    /// Returns `None` if there is buffered state (e.g., a peeked event)
    /// that makes the position ambiguous.
    fn jit_pos(&self) -> Option<usize>;

    /// Commit a new cursor position after Tier 2 execution succeeds.
    ///
    /// Must also invalidate/reset any internal scanning/tokenizer state
    /// so that subsequent parsing continues from `pos` consistently.
    fn jit_set_pos(&mut self, pos: usize);

    /// Return a format JIT emitter instance (usually a ZST).
    fn jit_format(&self) -> Self::FormatJit;

    /// Optional runtime maximum collection length for Tier-2 format JIT.
    ///
    /// If provided, format emitters can enforce container-length limits using
    /// parser-specific runtime configuration (e.g., per-deserializer settings).
    ///
    /// Default is `None` (no runtime limit passed to Tier-2).
    fn jit_max_collection_elements(&self) -> Option<u64> {
        None
    }

    /// Convert a Tier 2 error (code + position) into `ParseError`.
    fn jit_error(&self, input: &'de [u8], error_pos: usize, error_code: i32) -> ParseError;
}
410}