core_json/number.rs
1use core::{str::FromStr, fmt::Write};
2
3use crate::{Read, PeekableRead, Stack, JsonError};
4
/// A `core::fmt::Write` adapter which writes into a fixed-size byte slice.
///
/// Field `0` is the destination buffer and field `1` is the amount of bytes written so far.
struct SliceWrite<'a>(&'a mut [u8], usize);
impl Write for SliceWrite<'_> {
  /// Append `s` after the bytes already written.
  ///
  /// If `s` does not fit within the remaining space, nothing is written and an error is returned.
  #[inline(always)]
  fn write_str(&mut self, s: &str) -> core::fmt::Result {
    let Self(buffer, written) = self;
    let bytes = s.as_bytes();
    // Select exactly the region this write would occupy, failing if it runs past the end
    let destination = buffer[*written ..].get_mut(.. bytes.len()).ok_or(core::fmt::Error)?;
    destination.copy_from_slice(bytes);
    *written += bytes.len();
    Ok(())
  }
}
19
// The amount of decimal digits reserved for an `i64`.
//
// `ilog10` rounds down, so `ilog10 + 1` is the amount of digits in `i64::MAX` (19). A further
// `+ 1` is applied on top of that.
// NOTE(review): this evaluates to 20, one more than the 19 decimal digits of `i64::MAX`. Confirm
// any code assuming "one digit fewer than this cannot overflow" accounts for that.
const I64_SIGNIFICANT_DIGITS: usize = (i64::MAX.ilog10() as usize) + 2;
// The amount of decimal digits an `f64` is declared to preserve (`f64::DIGITS`).
const F64_SIGNIFICANT_DIGITS: usize = f64::DIGITS as usize;
// The larger of the two bounds above, so a single buffer may serve both representations.
const SIGNIFICANT_DIGITS: usize = {
  let (integer, float) = (I64_SIGNIFICANT_DIGITS, F64_SIGNIFICANT_DIGITS);
  if float < integer {
    integer
  } else {
    float
  }
};
28
/// A sink for a number string.
///
/// This sink does two things:
/// 1) Accumulates into an integer
/// 2) Accumulates a fixed-length string representing a float
///
/// For the latter task, the string representing the float approximates the string sunk
/// (despite the bounded length) by preserving the significant digits.
///
/// Writing into the sink is infallible. Recovering the result is possible for any float which
/// follows RFC 8259's syntax and whose exponent fits within an `i16`, or for the Rust strings for
/// any finite `f64`. The result for strings which don't follow either of these syntaxes is
/// undefined. As Rust does not strictly define the format it outputs strings with, we refer to the
/// format it'll accept strings with (which is specified), assuming the strings output are a subset
/// of the ones allowed as input.
#[doc(hidden)]
pub struct NumberSink {
  /// If a sign character is currently allowed within the sink.
  sign_character_allowed: bool,
  /// If we've read any digits for the current part.
  digits_in_current_part: bool,

  /// The sign of the number.
  negative: bool,
  /// The significant digits within the number.
  ///
  /// These will be ASCII characters in the range '0' ..= '9', and '0' if not explicitly set.
  ///
  /// An additional `1` is included for a single leading zero.
  digits: [u8; 1 + SIGNIFICANT_DIGITS],
  /// The amount of digits accumulated.
  i: usize,

  /// If we're before the decimal point.
  before_decimal: bool,
  /// If we're before the exponent marker.
  before_exponent: bool,

  /// If the exponent is negative.
  negative_exponent: bool,
  /// The absolute value of the exponent embedded within the string.
  ///
  /// This will always contain a positive value and is solely represented with an `i16` to bound
  /// its maximum value so it may be infallibly converted to its negative value.
  ///
  /// If this is `None`, it means the exponent overflowed the capacity.
  absolute_exponent: Option<i16>,
  /// The correction required for the exponent.
  ///
  /// This is required due to us shifting the string when we accumulate it in a bounded fashion. We
  /// represent it as an `i64`, allowing us to accumulate strings of length `2**63 - 2**15` without
  /// a failure occurring. As sequentially iterating to 2**63 would take a century, requiring an
  /// 8,192 PB string, we may consider this infallible.
  exponent_correction: i64,

  /// If a digit beyond our precision was dropped in a way which may have lost information.
  ///
  /// Zero digits dropped before the decimal point are instead reflected in the correction to the
  /// exponent, making them losslessly dropped, and do not set this flag.
  imprecise: bool,

  /// If this value was invalid per RFC-8259 syntax.
  ///
  /// This will only potentially be true if strict validation is used.
  invalid: bool,
}
95
96impl NumberSink {
97 /// Create a new number sink.
98 #[doc(hidden)]
99 #[inline(always)]
100 pub fn new() -> Self {
101 Self {
102 // A negative sign character is allowed at the very start of the number.
103 sign_character_allowed: true,
104 digits_in_current_part: false,
105 negative: false,
106 digits: [b'0'; _],
107 i: 0,
108 before_decimal: true,
109 before_exponent: true,
110 negative_exponent: false,
111 absolute_exponent: Some(0),
112 exponent_correction: 0,
113 imprecise: false,
114 invalid: false,
115 }
116 }
117
118 /// Push a byte, intended to be an ASCII character, into the sink.
119 ///
120 /// This will apply the validation rules from RFC-8259.
121 /*
122 The syntax we apply is (expanded)
123 `[ minus ] [ 0 / [ 1-9 *DIGIT ] ] [ decimal-point 1*DIGIT ] [ e [ minus / plus ] 1*DIGIT ]`.
124
125 https://datatracker.ietf.org/doc/html/rfc8259#section-6 lets us specify the range, precision
126 of numbers.
127 */
128 #[inline(always)]
129 fn push_byte(&mut self, c: u8) -> bool {
130 if self.sign_character_allowed {
131 // sign characters are only allowed in the initial positions
132 self.sign_character_allowed = false;
133
134 if c == b'-' {
135 self.negative |= self.before_exponent;
136 self.negative_exponent |= !self.before_exponent;
137 return true;
138 }
139 if c == b'+' {
140 // `+` is only allowed with the exponent, not for the integer
141 self.invalid |= self.before_exponent;
142 return true;
143 }
144 }
145
146 if self.before_decimal {
147 match c {
148 b'0' ..= b'9' => {
149 // We do not allow leading zeroes for the integer part, unless it's solely zero
150 self.invalid |= self.digits_in_current_part & (self.digits[0] == b'0');
151 self.digits_in_current_part = true;
152
153 let within_precision = self.i != self.digits.len();
154 if within_precision {
155 // Write the digit
156 self.digits[self.i] = c;
157 // This may write, for a valid number, a single leading zero. This is fine as `digits`
158 // is sized with this in mind
159 self.i += 1;
160 } else {
161 // If this is outside our precision, we need to shift up by 1 as this is before the
162 // decimal yet we will drop this digit
163 self.exponent_correction += 1;
164 // If we're truncating '0', this is still precise due to correctly tweaking the
165 // exponent
166 self.imprecise |= c != b'0';
167 }
168 }
169
170 // separator, array closure, object closure, whitespace
171 // https://datatracker.ietf.org/doc/html/rfc8259#section-2
172 b',' | b']' | b'}' | b'\x20' | b'\x09' | b'\x0A' | b'\x0D' => return false,
173
174 b'.' => {
175 self.invalid |= !self.digits_in_current_part;
176 self.digits_in_current_part = false;
177 self.before_decimal = false;
178 }
179 b'e' | b'E' => {
180 self.invalid |= !self.digits_in_current_part;
181 // Allow the sign character immediately following the exponent
182 self.sign_character_allowed = true;
183 self.digits_in_current_part = false;
184 self.before_decimal = false;
185 self.before_exponent = false;
186 }
187
188 _ => self.invalid = true,
189 }
190 return true;
191 }
192
193 if self.before_exponent {
194 match c {
195 b'0' ..= b'9' => {
196 self.digits_in_current_part = true;
197
198 let within_precision = self.i != self.digits.len();
199 if within_precision {
200 // Write the digit
201 self.digits[self.i] = c;
202 // Only preserve it if it isn't a leading zero
203 let leading_zero =
204 (c == b'0') & ((self.i == 0) | ((self.i == 1) & (self.digits[0] == b'0')));
205 self.i += usize::from(!leading_zero);
206
207 // If this is after the decimal, but within precision, we need to shift down by 1
208 self.exponent_correction -= 1;
209 } else {
210 self.imprecise = true;
211 }
212 }
213
214 b',' | b']' | b'}' | b'\x20' | b'\x09' | b'\x0A' | b'\x0D' => return false,
215
216 // This block is duplicated with `before_decimal`
217 b'e' | b'E' => {
218 self.invalid |= !self.digits_in_current_part;
219 // Allow the sign character immediately following the exponent
220 self.sign_character_allowed = true;
221 self.digits_in_current_part = false;
222 self.before_decimal = false;
223 self.before_exponent = false;
224 }
225
226 _ => self.invalid = true,
227 }
228 return true;
229 }
230
231 match c {
232 b'0' ..= b'9' => {
233 self.digits_in_current_part = true;
234 // Accumulate into our exponent
235 self.absolute_exponent = self.absolute_exponent.and_then(|absolute_exponent| {
236 let absolute_exponent = absolute_exponent.checked_mul(10)?;
237 absolute_exponent.checked_add(i16::from(c - b'0'))
238 });
239 }
240
241 b',' | b']' | b'}' | b'\x20' | b'\x09' | b'\x0A' | b'\x0D' => return false,
242
243 _ => self.invalid = true,
244 }
245
246 true
247 }
248
249 /// Get the significant digits, exponent for the number.
250 ///
251 /// If this has an unnecessarily large negative exponent, it will reduce it as possible. This
252 /// allows "100e-1" to still be detected as not having a fractional part.
253 #[inline(always)]
254 fn significant_digits_and_exponent(&self) -> Option<(usize, i64)> {
255 let absolute_exponent = self.absolute_exponent?;
256 // This negation is infallible as `i16::MIN.abs() > i16::MAX` and it's currently positive
257 let embedded_exponent =
258 if self.negative_exponent { -absolute_exponent } else { absolute_exponent };
259 let mut exponent = i64::from(embedded_exponent).checked_add(self.exponent_correction)?;
260
261 let mut significant_digits = self.i;
262 // Normalize this number's negative exponent, as possible
263 while (exponent < 0) &&
264 (significant_digits > 0) &&
265 (self.digits[significant_digits - 1] == b'0')
266 {
267 significant_digits -= 1;
268 exponent += 1;
269 }
270 Some((significant_digits, exponent))
271 }
272
273 #[inline(always)]
274 fn strictly_valid(&self) -> bool {
275 // It has to not have been marked invalid and the last part must not have been empty
276 (!self.invalid) & self.digits_in_current_part
277 }
278
279 /// Extract the exact number as an integer, if possible.
280 #[inline(always)]
281 pub(crate) fn i64(&self) -> Option<i64> {
282 let (significant_digits, exponent) = self.significant_digits_and_exponent()?;
283
284 // If this number had a loss of precision, we should not return it here
285 // If this number has a negative exponent, it has a fractional part
286 if self.imprecise || (exponent < 0) {
287 None?;
288 }
289
290 /*
291 We do this manually, instead of using `i64::from_str`, to avoid the overhead of
292 `str::from_utf8`/usage of `unsafe`. We also do the first loop, with wrapping arithmetic, when
293 we know the value won't overflow, only doing the final steps with checked arithmetic, when
294 the value might overflow.
295 */
296 let mut accum = 0i64;
297 if self.negative {
298 for digit in self.digits.iter().take(significant_digits.min(I64_SIGNIFICANT_DIGITS - 1)) {
299 accum = accum.wrapping_mul(10);
300 let digit = i64::from(digit - b'0');
301 accum = accum.wrapping_sub(digit);
302 }
303 for digit in &self.digits
304 [(I64_SIGNIFICANT_DIGITS - 1) .. significant_digits.max(I64_SIGNIFICANT_DIGITS - 1)]
305 {
306 accum = accum.checked_mul(10)?;
307 let digit = i64::from(digit - b'0');
308 accum = accum.checked_sub(digit)?;
309 }
310 } else {
311 for digit in self.digits.iter().take(significant_digits.min(I64_SIGNIFICANT_DIGITS - 1)) {
312 accum = accum.wrapping_mul(10);
313 let digit = i64::from(digit - b'0');
314 accum = accum.wrapping_add(digit);
315 }
316 for digit in &self.digits
317 [(I64_SIGNIFICANT_DIGITS - 1) .. significant_digits.max(I64_SIGNIFICANT_DIGITS - 1)]
318 {
319 accum = accum.checked_mul(10)?;
320 let digit = i64::from(digit - b'0');
321 accum = accum.checked_add(digit)?;
322 }
323 }
324
325 // Shift corresponding to the exponent
326 for _ in 0 .. exponent {
327 accum = accum.checked_mul(10)?;
328 }
329
330 Some(accum)
331 }
332
333 /// The imprecise string representing this number.
334 ///
335 /// This returns an owned `u8` array and the length of the string (in bytes) written within it.
336 /// All of the bytes not declared to be written to are left in an undefined state. The string
337 /// written will be RFC-8259-compliant.
338 /*
339 The length is determined due to
340 `sign, significant digits, exponent marker, exponent sign, exponent`.
341
342 We could achieve a tighter bound on the exponent, as we use `i64` for the exponent internally,
343 but any exponent exceeding four decimal digits to encode its absolute value won't work with
344 `f64` regardless.
345
346 TODO: Introduce a heuristic for where we should insert a decimal, instead of always using an
347 exponent to position the fractional part.
348 */
349 #[doc(hidden)]
350 #[inline(always)]
351 pub fn imprecise_str(
352 &self,
353 ) -> Option<([u8; 1 + SIGNIFICANT_DIGITS + 1 + 1 + I64_SIGNIFICANT_DIGITS], usize)> {
354 let (original_significant_digits, mut exponent) = self.significant_digits_and_exponent()?;
355
356 // If there are no digits within this number, return `0` immediately
357 if original_significant_digits == 0 {
358 return Some(([b'0'; _], 1));
359 }
360
361 let mut str = [0; _];
362 let mut len = 0;
363 if self.negative {
364 str[len] = b'-';
365 len += 1;
366 }
367
368 // Copy the significant digits
369 /*
370 While we support `SIGNIFICANT_DIGITS` as necessary for exact conversions to integers, for
371 floats (as assumed by this function), we only use the amount of significant digits Rust can
372 accurately round-trip: `f64::DIGITS`.
373
374 We do add an additional significant digit if we have a leading zero present.
375 */
376 let significant_digits =
377 original_significant_digits.min(usize::from(self.digits[0] == b'0') + (f64::DIGITS as usize));
378 {
379 // If we're truncating digits from the tail, shift the number back up accordingly
380 // This is a safe cast so long as `|SIGNIFICANT_DIGITS - f64::DIGITS| < i64::MAX`.
381 #[allow(clippy::cast_possible_wrap)]
382 let further_exponent_correction = (original_significant_digits - significant_digits) as i64;
383 exponent = exponent.checked_add(further_exponent_correction)?;
384 }
385 // If we have multiple significant digits, handle the leading zero (if present)
386 if (significant_digits > 1) && (self.digits[0] == b'0') {
387 str[len .. (len + significant_digits - 1)]
388 .copy_from_slice(&self.digits[1 .. significant_digits]);
389 len += significant_digits - 1;
390 } else {
391 str[len .. (len + significant_digits)].copy_from_slice(&self.digits[.. significant_digits]);
392 len += significant_digits;
393 }
394
395 if exponent != 0 {
396 // Set the exponent marker
397 str[len] = b'e';
398 len += 1;
399
400 // Set the exponent itself
401 let mut writer = SliceWrite(&mut str[len ..], 0);
402 // This should be unreachable if `I64_SIGNIFICANT_DIGITS` is properly defined and Rust is
403 // sane
404 write!(&mut writer, "{}", exponent).ok()?;
405 len += writer.1;
406 }
407
408 Some((str, len))
409 }
410
411 /// Extract the number as a float.
412 ///
413 /// This will only return the number if it's finite, as RFC-8259 JSON is not able to represent
414 /// infinite values, so deserializing into an infinite value demonstrates we weren't able to
415 /// capture the range of this value.
416 #[inline(always)]
417 pub(crate) fn f64(&self) -> Option<f64> {
418 let (str, len) = self.imprecise_str()?;
419
420 /*
421 These should be unreachable as if we yielded a string, it should be RFC-8259-compliant and
422 Rust should be able to handle RFC-8259-compliant strings (due to its accepted grammar being a
423 superset of RFC-8259 by happenstance/reasonability).
424 */
425 let str = core::str::from_utf8(&str[.. len]).ok()?;
426 let candidate = f64::from_str(str).ok()?;
427
428 candidate.is_finite().then_some(candidate)
429 }
430}
431
432impl Write for NumberSink {
433 #[inline(always)]
434 fn write_str(&mut self, s: &str) -> core::fmt::Result {
435 for s in s.as_bytes() {
436 self.push_byte(*s);
437 }
438 Ok(())
439 }
440}
441
442/// Handle the immediate value within the reader as a number.
443#[inline(always)]
444pub(crate) fn to_number_str<'read, R: Read<'read>, S: Stack>(
445 reader: &mut PeekableRead<'read, R>,
446) -> Result<Number, JsonError<'read, R, S>> {
447 let mut result = NumberSink::new();
448
449 // Read until a byte which isn't part of the number, sinking along the way
450 while result.push_byte(reader.peek()) {
451 reader.read_byte().map_err(JsonError::ReadError)?;
452 }
453
454 if !result.strictly_valid() {
455 Err(JsonError::InvalidValue)?;
456 }
457
458 Ok(Number(result))
459}
460
461/// A number deserialized from JSON.
462pub struct Number(NumberSink);
463impl Number {
464 /// Get the current number as an `i64`.
465 ///
466 /// This uses the definition of a number defined in RFC-8259, then constrains it to having no
467 /// fractional part once normalized. It's yielded if it's representable within an `i64`. Note
468 /// normalization will truncate "10.0", so this is lossy to if the original encoding had a
469 /// fractional part.
470 ///
471 /// This is _exact_. It does not go through `f64` and does not experience its approximations.
472 #[inline(always)]
473 pub fn i64(&self) -> Option<i64> {
474 self.0.i64()
475 }
476
477 /// Get the current item as an `f64`.
478 ///
479 /// This may be lossy due to:
480 /// - The inherent nature of floats
481 /// - Rust's bounds on precision
482 /// - This library's precision bounds, truncating additional detail
483 ///
484 /// This returns `None` if the value's range exceed `f64`'s.
485 #[inline(always)]
486 pub fn f64(&self) -> Option<f64> {
487 self.0.f64()
488 }
489}
490
#[test]
fn number_sink() {
  // Floats: sink Rust's own formatting of the value and compare against the expected string, as
  // parsed by Rust
  #[allow(clippy::float_cmp)]
  let check_float = |value: f64, expected: &str| {
    let mut sink = NumberSink::new();
    write!(&mut sink, "{}", value).unwrap();
    assert_eq!(sink.f64().unwrap(), f64::from_str(expected).unwrap());
  };
  let floats: [(f64, &str); 15] = [
    (0.0, "0"),
    (0.1, "0.1"),
    (0.01, "0.01"),
    (0.001, "0.001"),
    (0.0012, "0.0012"),
    (0.12345678910111213, "0.123456789101112"),
    (0.012345678910111213, "0.0123456789101112"),
    (12345678910111213.0, "123456789101112e2"),
    (12345678910111213.123, "123456789101112e2"),
    (123456789.101112, "123456789.101112"),
    (123456789.10111213, "123456789.101112"),
    (-1.0, "-1"),
    (f64::MIN, "-179769313486231e294"),
    (f64::MAX, "179769313486231e294"),
    (f64::EPSILON, "222044604925031e-30"),
  ];
  for (value, expected) in floats {
    check_float(value, expected);
  }

  // Integers: sink the string as-is and compare against the exact expected value
  let check_integer = |value: &str, expected: i64| {
    let mut sink = NumberSink::new();
    write!(&mut sink, "{}", value).unwrap();
    assert_eq!(sink.i64().unwrap(), expected);
  };
  let integers: [(&str, i64); 5] =
    [("0", 0), ("10e1", 100), ("10.0e1", 100), ("10.0", 10), ("10e-1", 1)];
  for (value, expected) in integers {
    check_integer(value, expected);
  }
  // The extremes of the range must round-trip exactly
  for extreme in [i64::MAX, i64::MIN] {
    check_integer(&extreme.to_string(), extreme);
  }
}
540}