core_json/
number.rs

1use core::{str::FromStr, fmt::Write};
2
3use crate::{Read, PeekableRead, Stack, JsonError};
4
/// A `core::fmt::Write` adapter which appends into a fixed-size byte slice.
///
/// Field `0` is the destination buffer and field `1` is the amount of bytes written so far.
struct SliceWrite<'a>(&'a mut [u8], usize);
impl<'a> Write for SliceWrite<'a> {
  /// Append `s` to the buffer.
  ///
  /// If `s` doesn't fit within the unused tail of the buffer, nothing is written and an error is
  /// returned.
  #[inline(always)]
  fn write_str(&mut self, s: &str) -> core::fmt::Result {
    let bytes = s.as_bytes();
    // The portion of the buffer which has yet to be written to
    let unused = &mut self.0[self.1 ..];
    match unused.get_mut(.. bytes.len()) {
      Some(destination) => {
        destination.copy_from_slice(bytes);
        self.1 += bytes.len();
        Ok(())
      }
      // The string is longer than the space remaining
      None => Err(core::fmt::Error),
    }
  }
}
19
/// The amount of decimal digits reserved for the magnitude of an `i64`.
///
/// `i64::MAX.ilog10()` is the logarithm rounded down (`18`), to which `2` is added. This
/// evaluates to `20`, one more than the `19` decimal digits `i64::MAX` itself is written with.
const I64_SIGNIFICANT_DIGITS: usize = 2 + (i64::MAX.ilog10() as usize);
/// The amount of decimal digits an `f64` is declared to preserve (`f64::DIGITS`).
const F64_SIGNIFICANT_DIGITS: usize = f64::DIGITS as usize;
/// The larger of the two bounds above, so a single buffer may serve either representation.
const SIGNIFICANT_DIGITS: usize = if F64_SIGNIFICANT_DIGITS >= I64_SIGNIFICANT_DIGITS {
  F64_SIGNIFICANT_DIGITS
} else {
  I64_SIGNIFICANT_DIGITS
};
28
/// A sink for a number string.
///
/// This sink does two things:
///   1) Accumulates into an integer
///   2) Accumulates a fixed-length string representing a float
///
/// For the latter task, the string representing the float is approximate to the string sunk
/// (despite the bounded length) due to preserving the significant digits.
///
/// Writing into the sink is infallible. Recovering the result is possible for any float which
/// follows RFC 8259's syntax and whose exponent fits within an `i16`, or for the Rust strings for
/// any finite `f64`. The result for strings which don't follow either of these syntaxes is
/// undefined. As Rust does not strictly define the format it outputs strings with, we refer to the
/// format it'll accept strings with (which is specified), assuming the strings output are a subset
/// of the ones allowed as input.
#[doc(hidden)]
pub struct NumberSink {
  /// If a sign character is currently allowed within the sink.
  sign_character_allowed: bool,
  /// If we've read any digits for the current part.
  digits_in_current_part: bool,

  /// The sign of the number.
  negative: bool,
  /// The significant digits within the number.
  ///
  /// These will be ASCII characters in the range '0' ..= '9', and '0' if not explicitly set.
  ///
  /// An additional `1` is included for a single leading zero.
  digits: [u8; 1 + SIGNIFICANT_DIGITS],
  /// The amount of digits accumulated.
  i: usize,

  /// If we're before the decimal point.
  before_decimal: bool,
  /// If we're before the exponent marker.
  before_exponent: bool,

  /// If the exponent is negative.
  negative_exponent: bool,
  /// The absolute value of the exponent embedded within the string.
  ///
  /// This will always contain a positive value and is solely represented with an `i16` to bound
  /// its maximum value so it may be infallibly converted to its negative value.
  ///
  /// If this is `None`, it means the exponent overflowed the capacity.
  absolute_exponent: Option<i16>,
  /// The correction required for the exponent.
  ///
  /// This is required due to us shifting the string when we accumulate it in a bounded fashion. We
  /// represent it as an `i64`, allowing us to accumulate strings of length `2**63 - 2**15` without
  /// a failure occurring. As sequentially iterating to 2**63 would take a century, requiring a
  /// 8,192 PB string, we may consider this infallible.
  exponent_correction: i64,

  /// If we truncated a non-zero digit.
  ///
  /// Truncated zero digits will be reflected in the correction to the exponent, making them
  /// losslessly dropped.
  imprecise: bool,

  /// If this value was invalid per RFC-8259 syntax.
  ///
  /// This will only potentially be true if strict validation is used.
  invalid: bool,
}
95
96impl NumberSink {
97  /// Create a new number sink.
98  #[doc(hidden)]
99  #[inline(always)]
100  pub fn new() -> Self {
101    Self {
102      // A negative sign character is allowed at the very start of the number.
103      sign_character_allowed: true,
104      digits_in_current_part: false,
105      negative: false,
106      digits: [b'0'; _],
107      i: 0,
108      before_decimal: true,
109      before_exponent: true,
110      negative_exponent: false,
111      absolute_exponent: Some(0),
112      exponent_correction: 0,
113      imprecise: false,
114      invalid: false,
115    }
116  }
117
118  /// Push a byte, intended to be an ASCII character, into the sink.
119  ///
120  /// This will apply the validation rules from RFC-8259.
121  /*
122    The syntax we apply is (expanded)
123    `[ minus ] [ 0 / [ 1-9 *DIGIT ] ] [ decimal-point 1*DIGIT ] [ e [ minus / plus ] 1*DIGIT ]`.
124
125    https://datatracker.ietf.org/doc/html/rfc8259#section-6 lets us specify the range, precision
126    of numbers.
127  */
128  #[inline(always)]
129  fn push_byte(&mut self, c: u8) -> bool {
130    if self.sign_character_allowed {
131      // sign characters are only allowed in the initial positions
132      self.sign_character_allowed = false;
133
134      if c == b'-' {
135        self.negative |= self.before_exponent;
136        self.negative_exponent |= !self.before_exponent;
137        return true;
138      }
139      if c == b'+' {
140        // `+` is only allowed with the exponent, not for the integer
141        self.invalid |= self.before_exponent;
142        return true;
143      }
144    }
145
146    if self.before_decimal {
147      match c {
148        b'0' ..= b'9' => {
149          // We do not allow leading zeroes for the integer part, unless it's solely zero
150          self.invalid |= self.digits_in_current_part & (self.digits[0] == b'0');
151          self.digits_in_current_part = true;
152
153          let within_precision = self.i != self.digits.len();
154          if within_precision {
155            // Write the digit
156            self.digits[self.i] = c;
157            // This may write, for a valid number, a single leading zero. This is fine as `digits`
158            // is sized with this in mind
159            self.i += 1;
160          } else {
161            // If this is outside our precision, we need to shift up by 1 as this is before the
162            // decimal yet we will drop this digit
163            self.exponent_correction += 1;
164            // If we're truncating '0', this is still precise due to correctly tweaking the
165            // exponent
166            self.imprecise |= c != b'0';
167          }
168        }
169
170        // separator, array closure, object closure, whitespace
171        // https://datatracker.ietf.org/doc/html/rfc8259#section-2
172        b',' | b']' | b'}' | b'\x20' | b'\x09' | b'\x0A' | b'\x0D' => return false,
173
174        b'.' => {
175          self.invalid |= !self.digits_in_current_part;
176          self.digits_in_current_part = false;
177          self.before_decimal = false;
178        }
179        b'e' | b'E' => {
180          self.invalid |= !self.digits_in_current_part;
181          // Allow the sign character immediately following the exponent
182          self.sign_character_allowed = true;
183          self.digits_in_current_part = false;
184          self.before_decimal = false;
185          self.before_exponent = false;
186        }
187
188        _ => self.invalid = true,
189      }
190      return true;
191    }
192
193    if self.before_exponent {
194      match c {
195        b'0' ..= b'9' => {
196          self.digits_in_current_part = true;
197
198          let within_precision = self.i != self.digits.len();
199          if within_precision {
200            // Write the digit
201            self.digits[self.i] = c;
202            // Only preserve it if it isn't a leading zero
203            let leading_zero =
204              (c == b'0') & ((self.i == 0) | ((self.i == 1) & (self.digits[0] == b'0')));
205            self.i += usize::from(!leading_zero);
206
207            // If this is after the decimal, but within precision, we need to shift down by 1
208            self.exponent_correction -= 1;
209          } else {
210            self.imprecise = true;
211          }
212        }
213
214        b',' | b']' | b'}' | b'\x20' | b'\x09' | b'\x0A' | b'\x0D' => return false,
215
216        // This block is duplicated with `before_decimal`
217        b'e' | b'E' => {
218          self.invalid |= !self.digits_in_current_part;
219          // Allow the sign character immediately following the exponent
220          self.sign_character_allowed = true;
221          self.digits_in_current_part = false;
222          self.before_decimal = false;
223          self.before_exponent = false;
224        }
225
226        _ => self.invalid = true,
227      }
228      return true;
229    }
230
231    match c {
232      b'0' ..= b'9' => {
233        self.digits_in_current_part = true;
234        // Accumulate into our exponent
235        self.absolute_exponent = self.absolute_exponent.and_then(|absolute_exponent| {
236          let absolute_exponent = absolute_exponent.checked_mul(10)?;
237          absolute_exponent.checked_add(i16::from(c - b'0'))
238        });
239      }
240
241      b',' | b']' | b'}' | b'\x20' | b'\x09' | b'\x0A' | b'\x0D' => return false,
242
243      _ => self.invalid = true,
244    }
245
246    true
247  }
248
249  /// Get the significant digits, exponent for the number.
250  ///
251  /// If this has an unnecessarily large negative exponent, it will reduce it as possible. This
252  /// allows "100e-1" to still be detected as not having a fractional part.
253  #[inline(always)]
254  fn significant_digits_and_exponent(&self) -> Option<(usize, i64)> {
255    let absolute_exponent = self.absolute_exponent?;
256    // This negation is infallible as `i16::MIN.abs() > i16::MAX` and it's currently positive
257    let embedded_exponent =
258      if self.negative_exponent { -absolute_exponent } else { absolute_exponent };
259    let mut exponent = i64::from(embedded_exponent).checked_add(self.exponent_correction)?;
260
261    let mut significant_digits = self.i;
262    // Normalize this number's negative exponent, as possible
263    while (exponent < 0) &&
264      (significant_digits > 0) &&
265      (self.digits[significant_digits - 1] == b'0')
266    {
267      significant_digits -= 1;
268      exponent += 1;
269    }
270    Some((significant_digits, exponent))
271  }
272
273  #[inline(always)]
274  fn strictly_valid(&self) -> bool {
275    // It has to not have been marked invalid and the last part must not have been empty
276    (!self.invalid) & self.digits_in_current_part
277  }
278
279  /// Extract the exact number as an integer, if possible.
280  #[inline(always)]
281  pub(crate) fn i64(&self) -> Option<i64> {
282    let (significant_digits, exponent) = self.significant_digits_and_exponent()?;
283
284    // If this number had a loss of precision, we should not return it here
285    // If this number has a negative exponent, it has a fractional part
286    if self.imprecise || (exponent < 0) {
287      None?;
288    }
289
290    /*
291      We do this manually, instead of using `i64::from_str`, to avoid the overhead of
292      `str::from_utf8`/usage of `unsafe`. We also do the first loop, with wrapping arithmetic, when
293      we know the value won't overflow, only doing the final steps with checked arithmetic, when
294      the value might overflow.
295    */
296    let mut accum = 0i64;
297    if self.negative {
298      for digit in self.digits.iter().take(significant_digits.min(I64_SIGNIFICANT_DIGITS - 1)) {
299        accum = accum.wrapping_mul(10);
300        let digit = i64::from(digit - b'0');
301        accum = accum.wrapping_sub(digit);
302      }
303      for digit in &self.digits
304        [(I64_SIGNIFICANT_DIGITS - 1) .. significant_digits.max(I64_SIGNIFICANT_DIGITS - 1)]
305      {
306        accum = accum.checked_mul(10)?;
307        let digit = i64::from(digit - b'0');
308        accum = accum.checked_sub(digit)?;
309      }
310    } else {
311      for digit in self.digits.iter().take(significant_digits.min(I64_SIGNIFICANT_DIGITS - 1)) {
312        accum = accum.wrapping_mul(10);
313        let digit = i64::from(digit - b'0');
314        accum = accum.wrapping_add(digit);
315      }
316      for digit in &self.digits
317        [(I64_SIGNIFICANT_DIGITS - 1) .. significant_digits.max(I64_SIGNIFICANT_DIGITS - 1)]
318      {
319        accum = accum.checked_mul(10)?;
320        let digit = i64::from(digit - b'0');
321        accum = accum.checked_add(digit)?;
322      }
323    }
324
325    // Shift corresponding to the exponent
326    for _ in 0 .. exponent {
327      accum = accum.checked_mul(10)?;
328    }
329
330    Some(accum)
331  }
332
333  /// The imprecise string representing this number.
334  ///
335  /// This returns an owned `u8` array and the length of the string (in bytes) written within it.
336  /// All of the bytes not declared to be written to are left in an undefined state. The string
337  /// written will be RFC-8259-compliant.
338  /*
339    The length is determined due to
340    `sign, significant digits, exponent marker, exponent sign, exponent`.
341
342    We could achieve a tighter bound on the exponent, as we use `i64` for the exponent internally,
343    but any exponent exceeding four decimal digits to encode its absolute value won't work with
344    `f64` regardless.
345
346    TODO: Introduce a heuristic for where we should insert a decimal, instead of always using an
347    exponent to position the fractional part.
348  */
349  #[doc(hidden)]
350  #[inline(always)]
351  pub fn imprecise_str(
352    &self,
353  ) -> Option<([u8; 1 + SIGNIFICANT_DIGITS + 1 + 1 + I64_SIGNIFICANT_DIGITS], usize)> {
354    let (original_significant_digits, mut exponent) = self.significant_digits_and_exponent()?;
355
356    // If there are no digits within this number, return `0` immediately
357    if original_significant_digits == 0 {
358      return Some(([b'0'; _], 1));
359    }
360
361    let mut str = [0; _];
362    let mut len = 0;
363    if self.negative {
364      str[len] = b'-';
365      len += 1;
366    }
367
368    // Copy the significant digits
369    /*
370      While we support `SIGNIFICANT_DIGITS` as necessary for exact conversions to integers, for
371      floats (as assumed by this function), we only use the amount of significant digits Rust can
372      accurately round-trip: `f64::DIGITS`.
373
374      We do add an additional significant digit if we have a leading zero present.
375    */
376    let significant_digits =
377      original_significant_digits.min(usize::from(self.digits[0] == b'0') + (f64::DIGITS as usize));
378    {
379      // If we're truncating digits from the tail, shift the number back up accordingly
380      // This is a safe cast so long as `|SIGNIFICANT_DIGITS - f64::DIGITS| < i64::MAX`.
381      #[allow(clippy::cast_possible_wrap)]
382      let further_exponent_correction = (original_significant_digits - significant_digits) as i64;
383      exponent = exponent.checked_add(further_exponent_correction)?;
384    }
385    // If we have multiple significant digits, handle the leading zero (if present)
386    if (significant_digits > 1) && (self.digits[0] == b'0') {
387      str[len .. (len + significant_digits - 1)]
388        .copy_from_slice(&self.digits[1 .. significant_digits]);
389      len += significant_digits - 1;
390    } else {
391      str[len .. (len + significant_digits)].copy_from_slice(&self.digits[.. significant_digits]);
392      len += significant_digits;
393    }
394
395    if exponent != 0 {
396      // Set the exponent marker
397      str[len] = b'e';
398      len += 1;
399
400      // Set the exponent itself
401      let mut writer = SliceWrite(&mut str[len ..], 0);
402      // This should be unreachable if `I64_SIGNIFICANT_DIGITS` is properly defined and Rust is
403      // sane
404      write!(&mut writer, "{}", exponent).ok()?;
405      len += writer.1;
406    }
407
408    Some((str, len))
409  }
410
411  /// Extract the number as a float.
412  ///
413  /// This will only return the number if it's finite, as RFC-8259 JSON is not able to represent
414  /// infinite values, so deserializing into an infinite value demonstrates we weren't able to
415  /// capture the range of this value.
416  #[inline(always)]
417  pub(crate) fn f64(&self) -> Option<f64> {
418    let (str, len) = self.imprecise_str()?;
419
420    /*
421      These should be unreachable as if we yielded a string, it should be RFC-8259-compliant and
422      Rust should be able to handle RFC-8259-compliant strings (due to its accepted grammar being a
423      superset of RFC-8259 by happenstance/reasonability).
424    */
425    let str = core::str::from_utf8(&str[.. len]).ok()?;
426    let candidate = f64::from_str(str).ok()?;
427
428    candidate.is_finite().then_some(candidate)
429  }
430}
431
432impl Write for NumberSink {
433  #[inline(always)]
434  fn write_str(&mut self, s: &str) -> core::fmt::Result {
435    for s in s.as_bytes() {
436      self.push_byte(*s);
437    }
438    Ok(())
439  }
440}
441
442/// Handle the immediate value within the reader as a number.
443#[inline(always)]
444pub(crate) fn to_number_str<'read, R: Read<'read>, S: Stack>(
445  reader: &mut PeekableRead<'read, R>,
446) -> Result<Number, JsonError<'read, R, S>> {
447  let mut result = NumberSink::new();
448
449  // Read until a byte which isn't part of the number, sinking along the way
450  while result.push_byte(reader.peek()) {
451    reader.read_byte().map_err(JsonError::ReadError)?;
452  }
453
454  if !result.strictly_valid() {
455    Err(JsonError::InvalidValue)?;
456  }
457
458  Ok(Number(result))
459}
460
/// A number deserialized from JSON.
///
/// This wraps the `NumberSink` the number was accumulated into.
pub struct Number(NumberSink);
impl Number {
  /// Get the current number as an `i64`.
  ///
  /// This uses the definition of a number defined in RFC-8259, then constrains it to having no
  /// fractional part once normalized. It's yielded if it's representable within an `i64`. Note
  /// normalization will truncate "10.0", so this is lossy as to whether the original encoding had
  /// a fractional part.
  ///
  /// This is _exact_. It does not go through `f64` and does not experience its approximations.
  #[inline(always)]
  pub fn i64(&self) -> Option<i64> {
    self.0.i64()
  }

  /// Get the current number as an `f64`.
  ///
  /// This may be lossy due to:
  /// - The inherent nature of floats
  /// - Rust's bounds on precision
  /// - This library's precision bounds, truncating additional detail
  ///
  /// This returns `None` if the value's range exceeds `f64`'s.
  #[inline(always)]
  pub fn f64(&self) -> Option<f64> {
    self.0.f64()
  }
}
490
#[test]
fn number_sink() {
  // Floats: sink Rust's formatting of each value and compare against the float parsed from the
  // string we expect the sink to have approximated it as
  {
    #[allow(clippy::float_cmp)]
    let check = |value: f64, expected: &str| {
      let mut sink = NumberSink::new();
      write!(&mut sink, "{}", value).unwrap();
      assert_eq!(sink.f64().unwrap(), f64::from_str(expected).unwrap());
    };

    let cases: &[(f64, &str)] = &[
      (0.0, "0"),
      (0.1, "0.1"),
      (0.01, "0.01"),
      (0.001, "0.001"),
      (0.0012, "0.0012"),
      (0.12345678910111213, "0.123456789101112"),
      (0.012345678910111213, "0.0123456789101112"),
      (12345678910111213.0, "123456789101112e2"),
      (12345678910111213.123, "123456789101112e2"),
      (123456789.101112, "123456789.101112"),
      (123456789.10111213, "123456789.101112"),
      (-1.0, "-1"),
      (f64::MIN, "-179769313486231e294"),
      (f64::MAX, "179769313486231e294"),
      (f64::EPSILON, "222044604925031e-30"),
    ];
    for &(value, expected) in cases {
      check(value, expected);
    }
  }

  // Integers: sink each string and expect the exact integer
  {
    let check = |value: &str, expected: i64| {
      let mut sink = NumberSink::new();
      write!(&mut sink, "{}", value).unwrap();
      assert_eq!(sink.i64().unwrap(), expected);
    };

    let max = format!("{}", i64::MAX);
    let min = format!("{}", i64::MIN);
    let cases: &[(&str, i64)] = &[
      ("0", 0),
      ("10e1", 100),
      ("10.0e1", 100),
      ("10.0", 10),
      ("10e-1", 1),
      (max.as_str(), i64::MAX),
      (min.as_str(), i64::MIN),
    ];
    for &(value, expected) in cases {
      check(value, expected);
    }
  }
}