facet_format/parser.rs
1extern crate alloc;
2
3use crate::ParseError;
4use alloc::collections::VecDeque;
5use facet_reflect::Span;
6
/// Opaque token returned by [`FormatParser::save`].
///
/// Hand this token to [`FormatParser::restore`] to replay every event that
/// was consumed since the save point was taken.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SavePoint(pub u64);

impl SavePoint {
    /// Create a new save point with the given ID.
    ///
    /// This is a `const fn`, so save points can also be built in `const` and
    /// `static` initialisers.
    pub const fn new(id: u64) -> Self {
        Self(id)
    }
}
19}
20
21/// Streaming parser for a specific wire format.
22pub trait FormatParser<'de> {
23 /// Read the next parse event, or `None` if the input is exhausted.
24 ///
25 /// Returns `Ok(None)` at end-of-input (EOF). For formats like TOML where
26 /// structs can be "reopened" (fields added after the struct was previously
27 /// exited), callers should continue processing until EOF rather than
28 /// stopping at `StructEnd`.
29 ///
30 /// If [`restore`](Self::restore) was called, events are first replayed
31 /// from the internal buffer before reading new events from the input.
32 fn next_event(&mut self) -> Result<Option<crate::ParseEvent<'de>>, ParseError>;
33
34 /// Read multiple parse events into a deque, returning the number of events read.
35 ///
36 /// This is an optimization for parsers that can produce multiple events efficiently
37 /// in a single call, amortizing function call overhead and improving cache locality.
38 ///
39 /// Returns `Ok(0)` at end-of-input (EOF).
40 ///
41 /// The default implementation calls `next_event` repeatedly up to `limit` times.
42 /// Parsers can override this for better performance.
43 fn next_events(
44 &mut self,
45 buf: &mut VecDeque<crate::ParseEvent<'de>>,
46 limit: usize,
47 ) -> Result<usize, ParseError> {
48 let mut count = 0;
49 while count < limit {
50 match self.next_event()? {
51 Some(event) => {
52 buf.push_back(event);
53 count += 1;
54 }
55 None => break,
56 }
57 }
58 Ok(count)
59 }
60
61 /// Peek at the next event without consuming it, or `None` if at EOF.
62 fn peek_event(&mut self) -> Result<Option<crate::ParseEvent<'de>>, ParseError>;
63
64 /// Skip the current value (for unknown fields, etc.).
65 fn skip_value(&mut self) -> Result<(), ParseError>;
66
67 /// Save the current parser position and start recording events.
68 ///
69 /// Returns a [`SavePoint`] token. All events returned by [`next_event`](Self::next_event)
70 /// after this call are recorded internally. Call [`restore`](Self::restore) with this
71 /// token to replay all recorded events.
72 ///
73 /// This is used for untagged enum resolution: save, read ahead to determine
74 /// the variant, then restore and parse with the correct type.
75 fn save(&mut self) -> SavePoint;
76
77 /// Restore to a previous save point, replaying recorded events.
78 ///
79 /// After calling this, subsequent calls to [`next_event`](Self::next_event) will
80 /// first return all events that were recorded since the save point, then
81 /// continue reading from the input.
82 ///
83 /// The save point is consumed - to save again, call [`save`](Self::save).
84 fn restore(&mut self, save_point: SavePoint);
85
86 /// Capture the raw representation of the current value without parsing it.
87 ///
88 /// This is used for types like `RawJson` that want to defer parsing.
89 /// The parser should skip the value and return the raw bytes/string
90 /// from the input.
91 ///
92 /// Returns `Ok(None)` if raw capture is not supported (e.g., streaming mode
93 /// or formats where raw capture doesn't make sense).
94 fn capture_raw(&mut self) -> Result<Option<&'de str>, ParseError> {
95 // Default: not supported
96 self.skip_value()?;
97 Ok(None)
98 }
99
100 /// Returns the raw input bytes, if available.
101 ///
102 /// This is used by the deserializer to implement raw capture when buffering
103 /// events. The deserializer tracks value boundaries using event spans and
104 /// slices the input directly.
105 ///
106 /// Returns `None` for streaming parsers that don't have the full input.
107 fn input(&self) -> Option<&'de [u8]> {
108 None
109 }
110
111 /// Returns the shape of the format's raw capture type (e.g., `RawJson::SHAPE`).
112 ///
113 /// When the deserializer encounters a shape that matches this, it will use
114 /// `capture_raw` to capture the raw representation and store it in a
115 /// `Cow<str>` (the raw type must be a newtype over `Cow<str>`).
116 ///
117 /// Returns `None` if this format doesn't support raw capture types.
118 fn raw_capture_shape(&self) -> Option<&'static facet_core::Shape> {
119 None
120 }
121
122 /// Returns true if this format is self-describing.
123 ///
124 /// Self-describing formats (like JSON, YAML) include type information in the wire format
125 /// and emit `FieldKey` events for struct fields.
126 ///
127 /// Non-self-describing formats (like postcard, bincode) don't include type markers
128 /// and use `OrderedField` events, relying on the driver to provide schema information
129 /// via `hint_struct_fields`.
130 fn is_self_describing(&self) -> bool {
131 true // Default: most formats are self-describing
132 }
133
134 /// Hint to the parser that a struct with the given number of fields is expected.
135 ///
136 /// For non-self-describing formats, this allows the parser to emit the correct
137 /// number of `OrderedField` events followed by `StructEnd`.
138 ///
139 /// Self-describing formats can ignore this hint.
140 fn hint_struct_fields(&mut self, _num_fields: usize) {
141 // Default: ignore (self-describing formats don't need this)
142 }
143
144 /// Hint to the parser what scalar type is expected next.
145 ///
146 /// For non-self-describing formats, this allows the parser to correctly
147 /// decode the next value and emit an appropriate `Scalar` event.
148 ///
149 /// Self-describing formats can ignore this hint (they determine the type
150 /// from the wire format).
151 fn hint_scalar_type(&mut self, _hint: ScalarTypeHint) {
152 // Default: ignore (self-describing formats don't need this)
153 }
154
155 /// Hint to the parser that a sequence (array/Vec) is expected.
156 ///
157 /// For non-self-describing formats, this triggers reading the length prefix
158 /// and setting up sequence state.
159 ///
160 /// Self-describing formats can ignore this hint.
161 fn hint_sequence(&mut self) {
162 // Default: ignore (self-describing formats don't need this)
163 }
164
165 /// Hint to the parser that a byte sequence (`Vec<u8>`, `&[u8]`, etc.) is expected.
166 ///
167 /// For binary formats like postcard that store `Vec<u8>` as raw bytes (varint length
168 /// followed by raw data), this allows bulk reading instead of element-by-element
169 /// deserialization.
170 ///
171 /// If the parser handles this hint, it should emit `Scalar(Bytes(...))` directly.
172 /// If it doesn't support this optimization, it should return `false` and the
173 /// deserializer will fall back to element-by-element deserialization via `hint_sequence`.
174 ///
175 /// Returns `true` if the hint is handled (parser will emit `Scalar(Bytes(...))`),
176 /// `false` otherwise.
177 fn hint_byte_sequence(&mut self) -> bool {
178 // Default: not supported, fall back to element-by-element
179 false
180 }
181
182 /// Hint to the parser that all remaining input bytes should be consumed as a byte slice.
183 ///
184 /// This is used by formats like postcard for trailing opaque payloads where the
185 /// field boundary is "until end of input" rather than a length prefix.
186 ///
187 /// If handled, the parser should emit `Scalar(Bytes(...))` and advance to EOF.
188 /// Returns `true` if handled, `false` to use normal deserialization behavior.
189 fn hint_remaining_byte_sequence(&mut self) -> bool {
190 false
191 }
192
193 /// Hint to the parser that a fixed-size array is expected.
194 ///
195 /// For non-self-describing formats, this tells the parser the array length
196 /// is known at compile time (from the type), so no length prefix is read.
197 /// This differs from `hint_sequence` which reads a length prefix for Vec/slices.
198 ///
199 /// Self-describing formats can ignore this hint.
200 fn hint_array(&mut self, _len: usize) {
201 // Default: ignore (self-describing formats don't need this)
202 }
203
204 /// Hint to the parser that an `Option<T>` is expected.
205 ///
206 /// For non-self-describing formats (like postcard), this allows the parser
207 /// to read the discriminant byte and emit either:
208 /// - `Scalar(Null)` for None (discriminant 0x00)
209 /// - Set up state to parse the inner value for Some (discriminant 0x01)
210 ///
211 /// Self-describing formats can ignore this hint (they determine `Option`
212 /// presence from the wire format, e.g., null vs value in JSON).
213 fn hint_option(&mut self) {
214 // Default: ignore (self-describing formats don't need this)
215 }
216
217 /// Hint to the parser that a map is expected.
218 ///
219 /// For non-self-describing formats (like postcard), this allows the parser
220 /// to read the length prefix and set up map state. The parser should then
221 /// emit `SequenceStart` (representing the map entries) followed by pairs of
222 /// key and value events, and finally `SequenceEnd`.
223 ///
224 /// Self-describing formats can ignore this hint (they determine map structure
225 /// from the wire format, e.g., `{...}` in JSON).
226 fn hint_map(&mut self) {
227 // Default: ignore (self-describing formats don't need this)
228 }
229
230 /// Hint to the parser that a dynamic value is expected.
231 ///
232 /// Non-self-describing formats can use this to switch to a self-describing
233 /// encoding for dynamic values (e.g., tagged scalar/array/object).
234 /// Self-describing formats can ignore this hint.
235 fn hint_dynamic_value(&mut self) {
236 // Default: ignore (self-describing formats don't need this)
237 }
238
239 /// Hint to the parser that an enum is expected, providing variant information.
240 ///
241 /// For non-self-describing formats (like postcard), this allows the parser
242 /// to read the variant discriminant (varint) and map it to the variant name,
243 /// and to emit appropriate wrapper events for multi-field variants.
244 ///
245 /// The `variants` slice contains metadata for each variant in declaration order,
246 /// matching the indices used in the wire format.
247 ///
248 /// Self-describing formats can ignore this hint (they include variant names
249 /// in the wire format).
250 fn hint_enum(&mut self, _variants: &[EnumVariantHint]) {
251 // Default: ignore (self-describing formats don't need this)
252 }
253
254 /// Hint to the parser that an opaque scalar type is expected.
255 ///
256 /// For non-self-describing binary formats (like postcard), this allows the parser
257 /// to use format-specific encoding for types like UUID (16 raw bytes), ULID,
258 /// OrderedFloat, etc. that have a more efficient binary representation than
259 /// their string form.
260 ///
261 /// The `type_identifier` is the type's identifier string (e.g., "Uuid", "Ulid",
262 /// "OrderedFloat", `DateTime<Utc>`). The `shape` provides access to inner type
263 /// information (e.g., whether OrderedFloat wraps f32 or f64).
264 ///
265 /// Returns `true` if the parser will handle this type specially (caller should
266 /// expect format-specific `ScalarValue`), or `false` to fall back to standard
267 /// handling (e.g., `hint_scalar_type(String)` for `FromStr` types).
268 ///
269 /// Self-describing formats can ignore this and return `false`.
270 fn hint_opaque_scalar(
271 &mut self,
272 _type_identifier: &'static str,
273 _shape: &'static facet_core::Shape,
274 ) -> bool {
275 // Default: not handled, fall back to standard behavior
276 false
277 }
278
279 /// Returns the source span of the most recently consumed event.
280 ///
281 /// This is used for error reporting - when a deserialization error occurs,
282 /// the span of the last consumed event helps locate the problem in the input.
283 ///
284 /// Parsers that track source positions should override this to return
285 /// meaningful span information. The default implementation returns `None`.
286 fn current_span(&self) -> Option<Span> {
287 None
288 }
289
290 /// Returns the format namespace for format-specific proxy resolution.
291 ///
292 /// When a field or container has format-specific proxies (e.g., `#[facet(xml::proxy = XmlProxy)]`),
293 /// this namespace is used to look up the appropriate proxy. If no namespace is returned,
294 /// only the format-agnostic proxy (`#[facet(proxy = ...)]`) is considered.
295 ///
296 /// Examples:
297 /// - XML parser should return `Some("xml")`
298 /// - JSON parser should return `Some("json")`
299 ///
300 /// Default: returns `None` (only format-agnostic proxies are used).
301 fn format_namespace(&self) -> Option<&'static str> {
302 None
303 }
304}
305
306/// Metadata about an enum variant for use with `hint_enum`.
307///
308/// Provides the information needed by non-self-describing formats to correctly
309/// parse enum variants, including the variant's structure kind and field count.
310#[derive(Debug, Clone, Copy, PartialEq, Eq)]
311pub struct EnumVariantHint {
312 /// Name of the variant (e.g., "Some", "Pair", "Named")
313 pub name: &'static str,
314 /// The kind of struct this variant represents (Unit, Tuple, TupleStruct, or Struct)
315 pub kind: facet_core::StructKind,
316 /// Number of fields in this variant
317 pub field_count: usize,
318}
319
/// Which scalar type the driver expects next.
///
/// Non-self-describing formats use this to know how to decode the next value.
/// The parenthesised notes describe the postcard encoding of each type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScalarTypeHint {
    /// Boolean (postcard: one byte, 0 or 1)
    Bool,
    /// Unsigned 8-bit integer (postcard: raw byte)
    U8,
    /// Unsigned 16-bit integer (postcard: varint)
    U16,
    /// Unsigned 32-bit integer (postcard: varint)
    U32,
    /// Unsigned 64-bit integer (postcard: varint)
    U64,
    /// Unsigned 128-bit integer (postcard: varint)
    U128,
    /// Platform-sized unsigned integer (postcard: varint)
    Usize,
    /// Signed 8-bit integer (postcard: zigzag varint)
    I8,
    /// Signed 16-bit integer (postcard: zigzag varint)
    I16,
    /// Signed 32-bit integer (postcard: zigzag varint)
    I32,
    /// Signed 64-bit integer (postcard: zigzag varint)
    I64,
    /// Signed 128-bit integer (postcard: zigzag varint)
    I128,
    /// Platform-sized signed integer (postcard: zigzag varint)
    Isize,
    /// 32-bit float (postcard: 4 bytes little-endian)
    F32,
    /// 64-bit float (postcard: 8 bytes little-endian)
    F64,
    /// UTF-8 string (postcard: varint length + bytes)
    String,
    /// Raw bytes (postcard: varint length + bytes)
    Bytes,
    /// Character (postcard: UTF-8 encoded)
    Char,
}
362
/// Extension trait for parsers that support format-specific JIT (Tier 2).
///
/// Implementing this trait enables the Tier 2 fast path, which generates
/// Cranelift IR that parses bytes directly rather than going through the
/// event abstraction.
///
/// # Requirements
///
/// Tier 2 requires:
/// - The full input slice must be available upfront
/// - The parser must be able to report and update its cursor position
/// - The parser must reset internal state when `jit_set_pos` is called
#[cfg(feature = "jit")]
pub trait FormatJitParser<'de>: FormatParser<'de> {
    /// The format-specific JIT emitter type.
    type FormatJit: crate::jit::JitFormat;

    /// Return the full input slice.
    fn jit_input(&self) -> &'de [u8];

    /// Return the current byte offset (cursor position).
    ///
    /// `None` is returned when buffered state (e.g., a peeked event) makes
    /// the position ambiguous.
    fn jit_pos(&self) -> Option<usize>;

    /// Commit a new cursor position after Tier 2 execution succeeds.
    ///
    /// Implementations must also invalidate/reset any internal
    /// scanning/tokenizer state so that subsequent parsing continues from
    /// `pos` consistently.
    fn jit_set_pos(&mut self, pos: usize);

    /// Return a format JIT emitter instance (usually a ZST).
    fn jit_format(&self) -> Self::FormatJit;

    /// Optional runtime maximum collection length for Tier-2 format JIT.
    ///
    /// When a value is provided, format emitters can enforce container-length
    /// limits using parser-specific runtime configuration (e.g.,
    /// per-deserializer settings).
    ///
    /// Default is `None` (no runtime limit passed to Tier-2).
    fn jit_max_collection_elements(&self) -> Option<u64> {
        None
    }

    /// Convert a Tier 2 error (code + position) into `ParseError`.
    fn jit_error(&self, input: &'de [u8], error_pos: usize, error_code: i32) -> ParseError;
}