cbor_core/decode_options.rs
1use std::{borrow::Cow, collections::BTreeMap};
2
3use crate::{
4 DataType, Error, Float, Format, IoResult, Result, SequenceDecoder, SequenceReader, SimpleValue, Value,
5 codec::{Argument, Head, Major},
6 io::{HexReader, HexSliceReader, MyReader, SliceReader},
7 limits,
8 parse::Parser,
9};
10
11/// Configuration for CBOR decoding.
12///
13/// `DecodeOptions` controls the input format ([`Binary`](Format::Binary),
14/// [`Hex`](Format::Hex), or [`Diagnostic`](Format::Diagnostic)) and the
15/// limits the decoder enforces against hostile or malformed input.
16/// Construct it with [`DecodeOptions::new`] (or `Default`), adjust
17/// settings with the builder methods, and call [`decode`](Self::decode)
18/// or [`read_from`](Self::read_from) for a single item, or
19/// [`sequence_decoder`](Self::sequence_decoder) / [`sequence_reader`](Self::sequence_reader)
20/// for a CBOR sequence.
21///
22/// The convenience methods on [`Value`] ([`decode`](Value::decode),
23/// [`decode_hex`](Value::decode_hex), [`read_from`](Value::read_from),
24/// [`read_hex_from`](Value::read_hex_from)) all forward to a default
25/// `DecodeOptions`. Use this type directly when you need to decode
26/// diagnostic notation, iterate a sequence, relax a limit for a known
27/// input, or tighten one for untrusted input.
28///
29/// # Options
30///
31/// | Option | Default | Purpose |
32/// |---|---|---|
33/// | [`format`](Self::format) | [`Binary`](Format::Binary) | Input syntax: binary, hex text, or diagnostic notation. |
34/// | [`recursion_limit`](Self::recursion_limit) | 200 | Maximum nesting depth of arrays, maps, and tags. |
35/// | [`length_limit`](Self::length_limit) | 1,000,000,000 | Maximum declared element count of a single array, map, byte string, or text string. |
36/// | [`oom_mitigation`](Self::oom_mitigation) | 100,000,000 | Byte budget for speculative pre-allocation. |
37///
38/// ## `recursion_limit`
39///
40/// Each array, map, or tag consumes one unit of recursion budget for
41/// its contents. Exceeding the limit returns [`Error::NestingTooDeep`].
42/// The limit protects against stack overflow on adversarial input and
43/// should be well below the stack a thread has available.
44///
45/// ## `length_limit`
46///
47/// Applies to the length field in the CBOR head of arrays, maps, byte
48/// strings, and text strings. It caps the declared size before any
49/// bytes are read, so a malicious header claiming a petabyte-long
50/// string is rejected immediately with [`Error::LengthTooLarge`]. The
51/// limit does not restrict total input size; a valid document may
52/// contain many items each up to the limit.
53///
54/// ## `oom_mitigation`
55///
56/// CBOR encodes lengths in the head, so a decoder is tempted to
57/// pre-allocate a `Vec` of the declared capacity. On hostile input
58/// that is a trivial amplification attack: a few bytes on the wire
59/// reserve gigabytes of memory. `oom_mitigation` is a byte budget,
60/// shared across the current decode, that caps the total amount of
61/// speculative capacity the decoder may reserve for array backing
62/// storage. Once the budget is exhausted, further arrays start empty
63/// and grow on demand. Decoding still succeeds if the input is
64/// well-formed; only the up-front reservation is bounded.
65///
66/// The budget is consumed, not refilled: a deeply nested structure
67/// with many small arrays can drain it early and decode the tail with
68/// zero pre-allocation. That is the intended behavior.
69///
70/// # Examples
71///
72/// Decode binary CBOR with default limits:
73///
74/// ```
75/// use cbor_core::DecodeOptions;
76///
77/// let v = DecodeOptions::new().decode(&[0x18, 42]).unwrap();
78/// assert_eq!(v.to_u32().unwrap(), 42);
79/// ```
80///
81/// Switch the input format to hex text or diagnostic notation:
82///
83/// ```
84/// use cbor_core::{DecodeOptions, Format};
85///
86/// let v = DecodeOptions::new().format(Format::Hex).decode("182a").unwrap();
87/// assert_eq!(v.to_u32().unwrap(), 42);
88///
89/// let v = DecodeOptions::new().format(Format::Diagnostic).decode("42").unwrap();
90/// assert_eq!(v.to_u32().unwrap(), 42);
91/// ```
92///
93/// Tighten limits for input from an untrusted source:
94///
95/// ```
96/// use cbor_core::DecodeOptions;
97///
98/// let strict = DecodeOptions::new()
99/// .recursion_limit(16)
100/// .length_limit(4096)
101/// .oom_mitigation(64 * 1024);
102///
103/// assert!(strict.decode(&[0x18, 42]).is_ok());
104/// ```
#[derive(Debug, Clone)]
pub struct DecodeOptions {
    /// Input syntax the decoder expects (binary, hex text, or diagnostic
    /// notation); set with [`format`](Self::format).
    format: Format,
    /// Nesting budget handed to the decoder at the start of every item; each
    /// array, map, or tag spends one level. Set with
    /// [`recursion_limit`](Self::recursion_limit).
    recursion_limit: u16,
    /// Largest length an array, map, byte string, or text string head may
    /// declare before decoding fails. Set with
    /// [`length_limit`](Self::length_limit).
    length_limit: u64,
    /// Byte budget for speculative pre-allocation handed to the decoder at
    /// the start of every item. Set with
    /// [`oom_mitigation`](Self::oom_mitigation).
    oom_mitigation: usize,
}
112
113impl Default for DecodeOptions {
114 fn default() -> Self {
115 Self::new()
116 }
117}
118
119impl DecodeOptions {
120 /// Create a new set of options with the crate defaults.
121 ///
122 /// ```
123 /// use cbor_core::DecodeOptions;
124 ///
125 /// let opts = DecodeOptions::new();
126 /// let v = opts.decode(&[0x18, 42]).unwrap();
127 /// assert_eq!(v.to_u32().unwrap(), 42);
128 /// ```
129 #[must_use]
130 pub const fn new() -> Self {
131 Self {
132 format: Format::Binary,
133 recursion_limit: limits::RECURSION_LIMIT,
134 length_limit: limits::LENGTH_LIMIT,
135 oom_mitigation: limits::OOM_MITIGATION,
136 }
137 }
138
139 /// Select the input format: [`Binary`](Format::Binary),
140 /// [`Hex`](Format::Hex), or [`Diagnostic`](Format::Diagnostic).
141 ///
142 /// Default: [`Format::Binary`].
143 ///
144 /// ```
145 /// use cbor_core::{DecodeOptions, Format};
146 ///
147 /// let hex = DecodeOptions::new().format(Format::Hex).decode("182a").unwrap();
148 /// let bin = DecodeOptions::new().decode(&[0x18, 0x2a]).unwrap();
149 /// assert_eq!(hex, bin);
150 ///
151 /// let v = DecodeOptions::new().format(Format::Diagnostic).decode("42").unwrap();
152 /// assert_eq!(v.to_u32().unwrap(), 42);
153 /// ```
154 pub const fn format(mut self, format: Format) -> Self {
155 self.format = format;
156 self
157 }
158
159 /// Set the maximum nesting depth of arrays, maps, and tags.
160 ///
161 /// Default: 200. Input that exceeds the limit returns
162 /// [`Error::NestingTooDeep`].
163 ///
164 /// ```
165 /// use cbor_core::{DecodeOptions, Error};
166 ///
167 /// // Two nested one-element arrays: 0x81 0x81 0x00
168 /// let err = DecodeOptions::new()
169 /// .recursion_limit(1)
170 /// .decode(&[0x81, 0x81, 0x00])
171 /// .unwrap_err();
172 /// assert_eq!(err, Error::NestingTooDeep);
173 /// ```
174 pub const fn recursion_limit(mut self, limit: u16) -> Self {
175 self.recursion_limit = limit;
176 self
177 }
178
179 /// Set the maximum declared length for byte strings, text strings,
180 /// arrays, and maps.
181 ///
182 /// Default: 1,000,000,000. Checked against the length field in the
183 /// CBOR head before any bytes are consumed; an oversized declaration
184 /// returns [`Error::LengthTooLarge`].
185 ///
186 /// ```
187 /// use cbor_core::{DecodeOptions, Error};
188 ///
189 /// // A five-byte text string: 0x65 'h' 'e' 'l' 'l' 'o'
190 /// let err = DecodeOptions::new()
191 /// .length_limit(4)
192 /// .decode(b"\x65hello")
193 /// .unwrap_err();
194 /// assert_eq!(err, Error::LengthTooLarge);
195 /// ```
196 pub const fn length_limit(mut self, limit: u64) -> Self {
197 self.length_limit = limit;
198 self
199 }
200
201 /// Set the byte budget for speculative pre-allocation of array
202 /// backing storage.
203 ///
204 /// Default: 100,000,000. Lower values trade a small amount of
205 /// decoding throughput for stronger resistance to memory-amplification
206 /// attacks. Valid input decodes regardless; only the up-front
207 /// reservation is bounded.
208 ///
209 /// ```
210 /// use cbor_core::DecodeOptions;
211 ///
212 /// // A two-element array: 0x82 0x01 0x02
213 /// let v = DecodeOptions::new()
214 /// .oom_mitigation(0)
215 /// .decode(&[0x82, 0x01, 0x02])
216 /// .unwrap();
217 /// assert_eq!(v.len(), Some(2));
218 /// ```
219 pub const fn oom_mitigation(mut self, bytes: usize) -> Self {
220 self.oom_mitigation = bytes;
221 self
222 }
223
224 /// Decode exactly one CBOR data item from an in-memory buffer.
225 ///
226 /// Takes the input by reference: `&[u8]`, `&[u8; N]`, `&Vec<u8>`,
227 /// `&str`, `&String`, etc. all work via `T: AsRef<[u8]> + ?Sized`.
228 /// In [`Format::Binary`], decoded text and byte strings borrow
229 /// directly from the input slice and the returned [`Value`]
230 /// inherits that lifetime; in [`Format::Hex`] and
231 /// [`Format::Diagnostic`] the result is owned.
232 ///
233 /// The input must contain **exactly one** value: any bytes
234 /// remaining after a successful decode cause
235 /// [`Error::InvalidFormat`]. In [`Format::Diagnostic`] mode
236 /// trailing whitespace and comments are accepted, but nothing
237 /// else. Use [`sequence_decoder`](Self::sequence_decoder) when the input is a CBOR
238 /// sequence.
239 ///
240 /// An empty buffer (and, for diagnostic notation, one containing
241 /// only whitespace and comments) returns [`Error::UnexpectedEof`].
242 /// A partial value returns [`Error::UnexpectedEof`] too.
243 ///
244 /// ```
245 /// use cbor_core::{DecodeOptions, Format};
246 ///
247 /// let v = DecodeOptions::new().decode(&[0x18, 42]).unwrap();
248 /// assert_eq!(v.to_u32().unwrap(), 42);
249 ///
250 /// let v = DecodeOptions::new().format(Format::Hex).decode("182a").unwrap();
251 /// assert_eq!(v.to_u32().unwrap(), 42);
252 ///
253 /// let v = DecodeOptions::new()
254 /// .format(Format::Diagnostic)
255 /// .decode("42 / trailing comment is fine /")
256 /// .unwrap();
257 /// assert_eq!(v.to_u32().unwrap(), 42);
258 /// ```
259 pub fn decode<'a, T>(&self, bytes: &'a T) -> Result<Value<'a>>
260 where
261 T: AsRef<[u8]> + ?Sized,
262 {
263 let bytes = bytes.as_ref();
264 match self.format {
265 Format::Binary => {
266 let mut reader = SliceReader(bytes);
267 let value = self.do_read(&mut reader, self.recursion_limit, self.oom_mitigation)?;
268 if !reader.0.is_empty() {
269 return Err(Error::InvalidFormat);
270 }
271 Ok(value)
272 }
273 Format::Hex => {
274 let mut reader = HexSliceReader(bytes);
275 let value = self.do_read(&mut reader, self.recursion_limit, self.oom_mitigation)?;
276 if !reader.0.is_empty() {
277 return Err(Error::InvalidFormat);
278 }
279 Ok(value)
280 }
281 Format::Diagnostic => {
282 let mut parser = Parser::new(SliceReader(bytes), self.recursion_limit);
283 parser.parse_complete()
284 }
285 }
286 }
287
288 /// Decode exactly one CBOR data item into an owned [`Value`].
289 ///
290 /// Takes the input by value: `Vec<u8>`, `&[u8]`, `&str`, and
291 /// anything else that implements `AsRef<[u8]>` all work. Unlike
292 /// [`decode`](Self::decode), the result never borrows from the
293 /// input regardless of format: text and byte strings are always
294 /// copied into owned allocations. The returned value can be held
295 /// as `Value<'static>` and stored or sent across threads without
296 /// any lifetime constraint.
297 ///
298 /// Use this when the input is short-lived (a temporary buffer, a
299 /// `Vec` returned from a function, etc.) and the decoded value
300 /// needs to outlive it. When the input already lives long enough,
301 /// [`decode`](Self::decode) avoids the copies.
302 ///
303 /// The input must contain **exactly one** value: any bytes
304 /// remaining after a successful decode cause
305 /// [`Error::InvalidFormat`]. In [`Format::Diagnostic`] mode
306 /// trailing whitespace and comments are accepted, but nothing
307 /// else. Use [`sequence_decoder`](Self::sequence_decoder) when
308 /// the input is a CBOR sequence.
309 ///
310 /// An empty buffer (and, for diagnostic notation, one containing
311 /// only whitespace and comments) returns [`Error::UnexpectedEof`].
312 /// A partial value returns [`Error::UnexpectedEof`] too.
313 ///
314 /// ```
315 /// use cbor_core::{DecodeOptions, Format, Value};
316 ///
317 /// // Decode from a short-lived Vec without worrying about lifetimes.
318 /// let bytes: Vec<u8> = vec![0x18, 42];
319 /// let v: Value<'static> = DecodeOptions::new().decode_owned(bytes).unwrap();
320 /// assert_eq!(v.to_u32().unwrap(), 42);
321 ///
322 /// // Hex and diagnostic formats work the same way.
323 /// let v: Value<'static> = DecodeOptions::new()
324 /// .format(Format::Hex)
325 /// .decode_owned("182a")
326 /// .unwrap();
327 /// assert_eq!(v.to_u32().unwrap(), 42);
328 /// ```
329 pub fn decode_owned<'a>(&self, bytes: impl AsRef<[u8]>) -> Result<Value<'a>> {
330 let mut bytes = bytes.as_ref();
331
332 match self.format {
333 Format::Binary | Format::Hex => {
334 let value = self.read_from(&mut bytes).map_err(|err| match err {
335 crate::IoError::Io(_io_error) => unreachable!(),
336 crate::IoError::Data(error) => error,
337 })?;
338
339 if bytes.is_empty() {
340 Ok(value)
341 } else {
342 Err(Error::InvalidFormat)
343 }
344 }
345
346 Format::Diagnostic => {
347 let mut parser = Parser::new(SliceReader(bytes), self.recursion_limit);
348 parser.parse_complete()
349 }
350 }
351 }
352
353 /// Read a single CBOR data item from a stream.
354 ///
355 /// Designed to be called repeatedly to pull successive elements of
356 /// a CBOR sequence:
357 ///
358 /// * In [`Format::Binary`] and [`Format::Hex`] the reader is
359 /// consumed only up to the end of the item; any bytes after
360 /// remain in the stream.
361 /// * In [`Format::Diagnostic`] trailing whitespace and comments
362 /// are consumed up to either end of stream or a top-level
363 /// separator comma (the comma is also consumed). Anything else
364 /// after the value fails with [`Error::InvalidFormat`].
365 ///
366 /// Bytes are read into an internal buffer, so the result is
367 /// always owned and can be held as `Value<'static>`. For
368 /// zero-copy decoding from a byte slice, use
369 /// [`decode`](Self::decode) instead.
370 ///
371 /// I/O failures are returned as [`IoError::Io`](crate::IoError::Io);
372 /// malformed or oversized input as [`IoError::Data`](crate::IoError::Data).
373 ///
374 /// ```
375 /// use cbor_core::{DecodeOptions, Format};
376 ///
377 /// let mut bytes: &[u8] = &[0x18, 42];
378 /// let v = DecodeOptions::new().read_from(&mut bytes).unwrap();
379 /// assert_eq!(v.to_u32().unwrap(), 42);
380 ///
381 /// let mut hex: &[u8] = b"182a";
382 /// let v = DecodeOptions::new().format(Format::Hex).read_from(&mut hex).unwrap();
383 /// assert_eq!(v.to_u32().unwrap(), 42);
384 ///
385 /// // Diagnostic: repeated read_from pulls successive sequence items.
386 /// let mut diag: &[u8] = b"1, 2, 3";
387 /// let opts = DecodeOptions::new().format(Format::Diagnostic);
388 /// let a = opts.read_from(&mut diag).unwrap();
389 /// let b = opts.read_from(&mut diag).unwrap();
390 /// let c = opts.read_from(&mut diag).unwrap();
391 /// assert_eq!(a.to_u32().unwrap(), 1);
392 /// assert_eq!(b.to_u32().unwrap(), 2);
393 /// assert_eq!(c.to_u32().unwrap(), 3);
394 /// ```
395 pub fn read_from<'a>(&self, reader: impl std::io::Read) -> IoResult<Value<'a>> {
396 match self.format {
397 Format::Binary => {
398 let mut reader = reader;
399 self.do_read(&mut reader, self.recursion_limit, self.oom_mitigation)
400 }
401 Format::Hex => {
402 let mut reader = HexReader(reader);
403 self.do_read(&mut reader, self.recursion_limit, self.oom_mitigation)
404 }
405 Format::Diagnostic => {
406 let mut parser = Parser::new(reader, self.recursion_limit);
407 parser.parse_stream_item()
408 }
409 }
410 }
411
412 /// Create an iterator over a CBOR sequence stored in memory.
413 ///
414 /// The returned [`SequenceDecoder`] yields each successive item of the
415 /// sequence as `Result<Value<'a>>`, where `'a` is the lifetime of
416 /// the input slice. In binary format, items borrow text and byte
417 /// strings from the input; in hex and diagnostic format the items
418 /// are owned. The iterator captures a snapshot of these options;
419 /// subsequent changes to `self` do not affect it.
420 ///
421 /// ```
422 /// use cbor_core::{DecodeOptions, Format};
423 ///
424 /// let opts = DecodeOptions::new().format(Format::Diagnostic);
425 ///
426 /// let items: Vec<_> = opts
427 /// .sequence_decoder(b"1, 2, 3,")
428 /// .collect::<Result<_, _>>()
429 /// .unwrap();
430 /// assert_eq!(items.len(), 3);
431 /// ```
432 pub fn sequence_decoder<'a, T>(&self, input: &'a T) -> SequenceDecoder<'a>
433 where
434 T: AsRef<[u8]> + ?Sized,
435 {
436 SequenceDecoder::with_options(self.clone(), input.as_ref())
437 }
438
439 /// Create an iterator over a CBOR sequence read from a stream.
440 ///
441 /// The returned [`SequenceReader`] yields each successive item as
442 /// `IoResult<Value<'static>>`. `None` indicates a clean end
443 /// between items; a truncated item produces `Some(Err(_))`. Items
444 /// are always owned (the bytes are read into an internal
445 /// buffer); for zero-copy iteration use
446 /// [`sequence_decoder`](Self::sequence_decoder) on a byte slice
447 /// instead.
448 ///
449 /// ```
450 /// use cbor_core::DecodeOptions;
451 ///
452 /// // Binary CBOR sequence: three one-byte items 0x01 0x02 0x03.
453 /// let bytes: &[u8] = &[0x01, 0x02, 0x03];
454 /// let items: Vec<_> = DecodeOptions::new()
455 /// .sequence_reader(bytes)
456 /// .collect::<Result<_, _>>()
457 /// .unwrap();
458 /// assert_eq!(items.len(), 3);
459 /// ```
460 pub fn sequence_reader<R: std::io::Read>(&self, reader: R) -> SequenceReader<R> {
461 SequenceReader::with_options(self.clone(), reader)
462 }
463
464 /// Decode exactly one CBOR data item from an arbitrary reader.
465 /// Used by the sequence iterators to share the core decoding logic.
466 pub(crate) fn decode_one<'a, R>(&self, reader: &mut R) -> std::result::Result<Value<'a>, R::Error>
467 where
468 R: MyReader<'a>,
469 R::Error: From<Error>,
470 {
471 self.do_read(reader, self.recursion_limit, self.oom_mitigation)
472 }
473
474 /// Expose the parser's recursion limit for sequence iterators.
475 pub(crate) fn recursion_limit_value(&self) -> u16 {
476 self.recursion_limit
477 }
478
479 /// Expose the selected format for sequence iterators.
480 pub(crate) fn format_value(&self) -> Format {
481 self.format
482 }
483
484 fn do_read<'a, R>(
485 &self,
486 reader: &mut R,
487 recursion_limit: u16,
488 oom_mitigation: usize,
489 ) -> std::result::Result<Value<'a>, R::Error>
490 where
491 R: MyReader<'a>,
492 R::Error: From<Error>,
493 {
494 let head = Head::read_from(reader)?;
495
496 let is_float = head.initial_byte.major() == Major::SimpleOrFloat
497 && matches!(head.argument, Argument::U16(_) | Argument::U32(_) | Argument::U64(_));
498
499 if !is_float && !head.argument.is_deterministic() {
500 return Err(Error::NonDeterministic.into());
501 }
502
503 let this = match head.initial_byte.major() {
504 Major::Unsigned => Value::Unsigned(head.value()),
505 Major::Negative => Value::Negative(head.value()),
506
507 Major::ByteString => {
508 let len = head.value();
509 if len > self.length_limit {
510 return Err(Error::LengthTooLarge.into());
511 }
512 Value::ByteString(reader.read_cow(len, oom_mitigation)?)
513 }
514
515 Major::TextString => {
516 let len = head.value();
517 if len > self.length_limit {
518 return Err(Error::LengthTooLarge.into());
519 }
520 let text = match reader.read_cow(len, oom_mitigation)? {
521 Cow::Borrowed(bytes) => Cow::Borrowed(std::str::from_utf8(bytes).map_err(Error::from)?),
522 Cow::Owned(bytes) => Cow::Owned(String::from_utf8(bytes).map_err(Error::from)?),
523 };
524 Value::TextString(text)
525 }
526
527 Major::Array => {
528 let value = head.value();
529
530 if value > self.length_limit {
531 return Err(Error::LengthTooLarge.into());
532 }
533
534 let Some(recursion_limit) = recursion_limit.checked_sub(1) else {
535 return Err(Error::NestingTooDeep.into());
536 };
537
538 let request: usize = value.try_into().or(Err(Error::LengthTooLarge))?;
539 let granted = request.min(oom_mitigation / size_of::<Value>());
540 let oom_mitigation = oom_mitigation - granted * size_of::<Value>();
541
542 let mut vec = Vec::with_capacity(granted);
543
544 for _ in 0..value {
545 vec.push(self.do_read(reader, recursion_limit, oom_mitigation)?);
546 }
547
548 Value::Array(vec)
549 }
550
551 Major::Map => {
552 let value = head.value();
553
554 if value > self.length_limit {
555 return Err(Error::LengthTooLarge.into());
556 }
557
558 let Some(recursion_limit) = recursion_limit.checked_sub(1) else {
559 return Err(Error::NestingTooDeep.into());
560 };
561
562 let mut map = BTreeMap::new();
563 let mut prev = None;
564
565 for _ in 0..value {
566 let key = self.do_read(reader, recursion_limit, oom_mitigation)?;
567 let value = self.do_read(reader, recursion_limit, oom_mitigation)?;
568
569 if let Some((prev_key, prev_value)) = prev.take() {
570 if prev_key >= key {
571 return Err(Error::NonDeterministic.into());
572 }
573 map.insert(prev_key, prev_value);
574 }
575
576 prev = Some((key, value));
577 }
578
579 if let Some((key, value)) = prev.take() {
580 map.insert(key, value);
581 }
582
583 Value::Map(map)
584 }
585
586 Major::Tag => {
587 let Some(recursion_limit) = recursion_limit.checked_sub(1) else {
588 return Err(Error::NestingTooDeep.into());
589 };
590
591 let tag_number = head.value();
592 let tag_content = Box::new(self.do_read(reader, recursion_limit, oom_mitigation)?);
593
594 let this = Value::Tag(tag_number, tag_content);
595
596 if this.data_type() == DataType::BigInt {
597 let bytes = this.as_bytes().unwrap();
598 let valid = bytes.len() >= 8 && bytes[0] != 0;
599 if !valid {
600 return Err(Error::NonDeterministic.into());
601 }
602 }
603
604 this
605 }
606
607 Major::SimpleOrFloat => match head.argument {
608 Argument::None => Value::SimpleValue(SimpleValue(head.initial_byte.info())),
609 Argument::U8(n) if n >= 32 => Value::SimpleValue(SimpleValue(n)),
610
611 Argument::U16(bits) => Value::Float(Float::from_bits_u16(bits)),
612 Argument::U32(bits) => Value::Float(Float::from_bits_u32(bits)?),
613 Argument::U64(bits) => Value::Float(Float::from_bits_u64(bits)?),
614
615 _ => return Err(Error::Malformed.into()),
616 },
617 };
618
619 Ok(this)
620 }
621}