Skip to main content

sonic_number/
lib.rs

1#![no_std]
2
3#[cfg(test)]
4extern crate std;
5
6mod arch;
7mod common;
8mod decimal;
9mod float;
10mod lemire;
11mod slow;
12mod table;
13
14use self::{common::BiasedFp, float::RawFloat, table::POWER_OF_FIVE_128};
15pub use crate::arch::simd_str2int;
16
17const FLOATING_LONGEST_DIGITS: usize = 17;
18const F64_BITS: u32 = 64;
19const F64_SIG_BITS: u32 = 52;
20const F64_SIG_FULL_BITS: u32 = 53;
21const F64_EXP_BIAS: i32 = 1023;
22const F64_SIG_MASK: u64 = 0x000F_FFFF_FFFF_FFFF;
23
24#[derive(Debug)]
25pub enum ParserNumber {
26    Unsigned(u64),
27    /// Always less than zero.
28    Signed(i64),
29    /// Always finite.
30    Float(f64),
31}
32
33#[derive(Debug)]
34pub enum Error {
35    InvalidNumber,
36    FloatMustBeFinite,
37}
38
39macro_rules! match_digit {
40    ($data:expr, $i:expr, $pattern:pat) => {
41        $i < $data.len() && matches!($data[$i], $pattern)
42    };
43}
44
45macro_rules! is_digit {
46    ($data:expr, $i:expr) => {
47        $i < $data.len() && $data[$i].is_ascii_digit()
48    };
49}
50
51macro_rules! digit {
52    ($data:expr, $i:expr) => {
53        ($data[$i] - b'0') as u64
54    };
55}
56
57macro_rules! check_digit {
58    ($data:expr, $i:expr) => {
59        if !($i < $data.len() && $data[$i].is_ascii_digit()) {
60            return Err(Error::InvalidNumber);
61        }
62    };
63}
64
65#[inline(always)]
66fn parse_exponent(data: &[u8], index: &mut usize) -> Result<i32, Error> {
67    let mut exponent: i32 = 0;
68    let mut negative = false;
69
70    if *index >= data.len() {
71        return Err(Error::InvalidNumber);
72    }
73
74    match data[*index] {
75        b'+' => *index += 1,
76        b'-' => {
77            negative = true;
78            *index += 1;
79        }
80        _ => {}
81    }
82
83    check_digit!(data, *index);
84    while exponent < 1000 && is_digit!(data, *index) {
85        exponent = digit!(data, *index) as i32 + exponent * 10;
86        *index += 1;
87    }
88    while is_digit!(data, *index) {
89        *index += 1;
90    }
91    if negative {
92        exponent = -exponent;
93    }
94    Ok(exponent)
95}
96
97const POW10_UINT: [u64; 18] = [
98    1,
99    10,
100    100,
101    1000,
102    10000,
103    100000,
104    1000000,
105    10000000,
106    100000000,
107    1000000000,
108    10000000000,
109    100000000000,
110    1000000000000,
111    10000000000000,
112    100000000000000,
113    1000000000000000,
114    10000000000000000,
115    100000000000000000,
116];
117
118// parse at most 16 digits for fraction, record the exponent.
119// because we calcaute at least the first significant digit when both normal or subnormal float
120// points
121#[inline(always)]
122fn parse_number_fraction(
123    data: &[u8],
124    index: &mut usize,
125    significant: &mut u64,
126    exponent: &mut i32,
127    mut need: isize,
128    dot_pos: usize,
129) -> Result<bool, Error> {
130    debug_assert!(need < FLOATING_LONGEST_DIGITS as isize);
131
132    // native implement:
133    // while need > 0 && is_digit!(data, *index) {
134    //     *significant = *significant * 10 + digit!(data, *index);
135    //     *index += 1;
136    //     need -= 1;
137    // }
138    if need > 0 {
139        if data.len() - *index >= 16 {
140            let (frac, ndigits) = unsafe { simd_str2int(&data[*index..], need as usize) };
141            *significant = *significant * POW10_UINT[ndigits] + frac;
142            *index += ndigits;
143        } else {
144            while need > 0 && is_digit!(data, *index) {
145                *significant = *significant * 10 + digit!(data, *index);
146                *index += 1;
147                need -= 1;
148            }
149        }
150    }
151
152    *exponent -= *index as i32 - dot_pos as i32;
153    let mut trunc = false;
154    while is_digit!(data, *index) {
155        trunc = true;
156        *index += 1;
157    }
158
159    if match_digit!(data, *index, b'e' | b'E') {
160        *index += 1;
161        *exponent += parse_exponent(data, &mut *index)?;
162    }
163    Ok(trunc)
164}
165
166#[inline(always)]
167pub fn parse_number(data: &[u8], index: &mut usize, negative: bool) -> Result<ParserNumber, Error> {
168    let mut significant: u64 = 0;
169    let mut exponent: i32 = 0;
170    let mut trunc = false;
171    let raw_num = &data[*index..];
172
173    if match_digit!(data, *index, b'0') {
174        *index += 1;
175
176        if *index >= data.len() || !matches!(data[*index], b'.' | b'e' | b'E') {
177            // view -0 as float number
178            if negative {
179                return Ok(ParserNumber::Float(0.0));
180            }
181            return Ok(ParserNumber::Unsigned(0));
182        }
183
184        // deal with 0e123 or 0.000e123
185        match data[*index] {
186            b'.' => {
187                *index += 1;
188                let dot_pos = *index;
189                check_digit!(data, *index);
190                while match_digit!(data, *index, b'0') {
191                    *index += 1;
192                }
193                // special case: 0.000e123
194                if match_digit!(data, *index, b'e' | b'E') {
195                    *index += 1;
196                    if match_digit!(data, *index, b'-' | b'+') {
197                        *index += 1;
198                    }
199                    check_digit!(data, *index);
200                    while is_digit!(data, *index) {
201                        *index += 1;
202                    }
203                    return Ok(ParserNumber::Float(0.0));
204                }
205
206                // we calculate the first digit here for two reasons:
207                // 1. fastpath for small float number
208                // 2. we only need parse at most 16 digits in parse_number_fraction
209                // and it is friendly for simd
210                if !is_digit!(data, *index) {
211                    return Ok(ParserNumber::Float(0.0));
212                }
213
214                significant = digit!(data, *index);
215                *index += 1;
216
217                if is_digit!(data, *index) {
218                    let need = FLOATING_LONGEST_DIGITS as isize - 1;
219                    trunc = parse_number_fraction(
220                        data,
221                        index,
222                        &mut significant,
223                        &mut exponent,
224                        need,
225                        dot_pos,
226                    )?;
227                } else {
228                    exponent -= *index as i32 - dot_pos as i32;
229                    if match_digit!(data, *index, b'e' | b'E') {
230                        *index += 1;
231                        exponent += parse_exponent(data, &mut *index)?;
232                    }
233                }
234            }
235            b'e' | b'E' => {
236                *index += 1;
237                if match_digit!(data, *index, b'-' | b'+') {
238                    *index += 1;
239                }
240                check_digit!(data, *index);
241                while is_digit!(data, *index) {
242                    *index += 1;
243                }
244                return Ok(ParserNumber::Float(0.0));
245            }
246            _ => unreachable!("unreachable branch in parse_number_unchecked"),
247        }
248    } else {
249        // parse significant digits
250        let digit_start = *index;
251        while is_digit!(data, *index) {
252            // assume most number is not overflow here. When it overflow, we will check digits count
253            // and fallback into the slow path.
254            significant = significant
255                .wrapping_mul(10)
256                .wrapping_add(digit!(data, *index));
257            *index += 1;
258        }
259        let mut digits_cnt = *index - digit_start;
260        if digits_cnt == 0 {
261            return Err(Error::InvalidNumber);
262        }
263
264        // slow path for too long integer
265        if digits_cnt > 19 {
266            *index = digit_start;
267            significant = 0;
268            digits_cnt = 0;
269            while is_digit!(data, *index) && digits_cnt < 19 {
270                significant = significant * 10 + digit!(data, *index);
271                digits_cnt += 1;
272                *index += 1;
273            }
274
275            // overflow for u64 sig, mark as truncated
276            while is_digit!(data, *index) {
277                exponent += 1;
278                *index += 1;
279                trunc = true;
280            }
281        }
282
283        // TODO: fix special case like `43332000001000000003888e-4`.
284        // it should parse as `4.3332000001000003e18`.
285        if match_digit!(data, *index, b'e' | b'E') {
286            // parse exponent
287            *index += 1;
288            exponent += parse_exponent(data, index)?;
289        } else if match_digit!(data, *index, b'.') {
290            *index += 1;
291            check_digit!(data, *index);
292            let dot_pos = *index;
293
294            // parse fraction
295            let need = FLOATING_LONGEST_DIGITS as isize - digits_cnt as isize;
296            trunc =
297                parse_number_fraction(data, index, &mut significant, &mut exponent, need, dot_pos)?;
298        } else {
299            // parse integer, all parse has finished.
300            if exponent == 0 {
301                if negative {
302                    if significant > (1u64 << 63) {
303                        return Ok(ParserNumber::Float(-(significant as f64)));
304                    } else {
305                        // if significant is 0x8000_0000_0000_0000, it will overflow here.
306                        // so, we must use wrapping_sub here.
307                        return Ok(ParserNumber::Signed(0_i64.wrapping_sub(significant as i64)));
308                    }
309                } else {
310                    return Ok(ParserNumber::Unsigned(significant));
311                }
312            } else if exponent == 1 {
313                // now we get 20 digits, it maybe overflow for uint64
314                let last = digit!(data, *index - 1);
315                let (out, ov0) = significant.overflowing_mul(10);
316                let (out, ov1) = out.overflowing_add(last);
317                if !ov0 && !ov1 {
318                    // negative must be overflow here.
319                    significant = out;
320                    if negative {
321                        return Ok(ParserNumber::Float(-(significant as f64)));
322                    } else {
323                        return Ok(ParserNumber::Unsigned(significant));
324                    }
325                }
326            }
327            trunc = true;
328        }
329    }
330
331    // raw_num is pass-through for fallback parsing logic
332    parse_float(significant, exponent, negative, trunc, raw_num)
333}
334
335#[inline(always)]
336fn parse_float(
337    significant: u64,
338    exponent: i32,
339    negative: bool,
340    trunc: bool,
341    raw_num: &[u8],
342) -> Result<ParserNumber, Error> {
343    // parse double fast
344    if significant >> 52 == 0 && (-22..=(22 + 15)).contains(&exponent) {
345        if let Some(mut float) = parse_float_fast(exponent, significant) {
346            if negative {
347                float = -float;
348            }
349            return Ok(ParserNumber::Float(float));
350        }
351    }
352
353    if !trunc && exponent > (-308 + 1) && exponent < (308 - 20) {
354        if let Some(raw) = parse_floating_normal_fast(exponent, significant) {
355            let mut float = f64::from_u64_bits(raw);
356            if negative {
357                float = -float;
358            }
359            return Ok(ParserNumber::Float(float));
360        }
361    }
362
363    // If significant digits were truncated, then we can have rounding error
364    // only if `mantissa + 1` produces a different result. We also avoid
365    // redundantly using the Eisel-Lemire algorithm if it was unable to
366    // correctly round on the first pass.
367    let exponent = exponent as i64;
368    let mut fp = lemire::compute_float::<f64>(exponent, significant);
369    if trunc && fp.e >= 0 && fp != lemire::compute_float::<f64>(exponent, significant + 1) {
370        fp.e = -1;
371    }
372
373    // Unable to correctly round the float using the Eisel-Lemire algorithm.
374    // Fallback to a slower, but always correct algorithm.
375    if fp.e < 0 {
376        fp = slow::parse_long_mantissa::<f64>(raw_num);
377    }
378
379    let mut float = biased_fp_to_float::<f64>(fp);
380    if negative {
381        float = -float;
382    }
383
384    // check inf for float
385    if float.is_infinite() {
386        return Err(Error::FloatMustBeFinite);
387    }
388    Ok(ParserNumber::Float(float))
389}
390
391// This function is modified from yyjson
392#[inline(always)]
393fn parse_floating_normal_fast(exp10: i32, man: u64) -> Option<u64> {
394    let (mut hi, lo, hi2, add, bits);
395    let mut exp2: i32;
396    let mut exact = false;
397    let idx = exp10 + 342;
398    let sig2_ext = POWER_OF_FIVE_128[idx as usize].1;
399    let sig2 = POWER_OF_FIVE_128[idx as usize].0;
400
401    let mut lz = man.leading_zeros();
402    let sig1 = man << lz;
403    exp2 = ((217706 * exp10 - 4128768) >> 16) - lz as i32;
404
405    (lo, hi) = lemire::full_multiplication(sig1, sig2);
406
407    bits = hi & ((1u64 << (64 - 54 - 1)) - 1);
408    if bits.wrapping_sub(1) < ((1u64 << (64 - 54 - 1)) - 2) {
409        exact = true;
410    } else {
411        (_, hi2) = lemire::full_multiplication(sig1, sig2_ext);
412        // not need warring overflow here
413        add = lo.wrapping_add(hi2);
414        if add + 1 > 1u64 {
415            let carry = add < lo || add < hi2;
416            hi += carry as u64;
417            exact = true;
418        }
419    }
420
421    if exact {
422        lz = if hi < (1u64 << 63) { 1 } else { 0 };
423        hi <<= lz;
424        exp2 -= lz as i32;
425        exp2 += 64;
426
427        let round_up = (hi & (1u64 << (64 - 54))) > 0;
428        hi = hi.wrapping_add(if round_up { 1u64 << (64 - 54) } else { 0 });
429
430        if hi < (1u64 << (64 - 54)) {
431            hi = 1u64 << 63;
432            exp2 += 1;
433        }
434
435        hi >>= F64_BITS - F64_SIG_FULL_BITS;
436        exp2 += F64_BITS as i32 - F64_SIG_FULL_BITS as i32 + F64_SIG_BITS as i32;
437        exp2 += F64_EXP_BIAS;
438        let raw = ((exp2 as u64) << F64_SIG_BITS) | (hi & F64_SIG_MASK);
439        return Some(raw);
440    }
441    None
442}
443
444#[inline(always)]
445/// Converts a `BiasedFp` to the closest machine float type.
446fn biased_fp_to_float<T: RawFloat>(x: BiasedFp) -> T {
447    let mut word = x.f;
448    word |= (x.e as u64) << T::MANTISSA_EXPLICIT_BITS;
449    T::from_u64_bits(word)
450}
451
452#[inline(always)]
453fn parse_float_fast(exp10: i32, significant: u64) -> Option<f64> {
454    let mut d = significant as f64;
455    if exp10 > 0 {
456        if exp10 > 22 {
457            d *= POW10_FLOAT[exp10 as usize - 22];
458            if (-1e15..=1e15).contains(&d) {
459                Some(d * POW10_FLOAT[22])
460            } else {
461                None
462            }
463        } else {
464            Some(d * POW10_FLOAT[exp10 as usize])
465        }
466    } else {
467        Some(d / POW10_FLOAT[(-exp10) as usize])
468    }
469}
470
471const POW10_FLOAT: [f64; 23] = [
472    /* <= the connvertion to double is not exact when less than 1 => */ 1e-000, 1e+001,
473    1e+002, 1e+003, 1e+004, 1e+005, 1e+006, 1e+007, 1e+008, 1e+009, 1e+010, 1e+011, 1e+012, 1e+013,
474    1e+014, 1e+015, 1e+016, 1e+017, 1e+018, 1e+019, 1e+020, 1e+021,
475    1e+022, /* <= the connvertion to double is not exact when larger,  => */
476];
477
478#[cfg(test)]
479mod test {
480    use crate::{parse_number, ParserNumber};
481
482    fn test_parse_ok(input: &str, expect: f64) {
483        assert_eq!(input.parse::<f64>().unwrap(), expect);
484
485        let mut data = input.as_bytes().to_vec();
486        data.push(b' ');
487        let mut index = 0;
488        let num = parse_number(&data, &mut index, false).unwrap();
489        assert!(
490            matches!(num, ParserNumber::Float(f) if f == expect),
491            "parsed is {:?} failed num is {}",
492            num,
493            input
494        );
495        assert_eq!(data[index], b' ', "failed num is {}", input);
496    }
497
498    #[test]
499    fn test_parse_float() {
500        test_parse_ok("0.0", 0.0);
501        test_parse_ok("0.01", 0.01);
502        test_parse_ok("0.1", 0.1);
503        test_parse_ok("0.12", 0.12);
504        test_parse_ok("0.123", 0.123);
505        test_parse_ok("0.1234", 0.1234);
506        test_parse_ok("0.12345", 0.12345);
507        test_parse_ok("0.123456", 0.123456);
508        test_parse_ok("0.1234567", 0.1234567);
509        test_parse_ok("0.12345678", 0.12345678);
510        test_parse_ok("0.123456789", 0.123456789);
511        test_parse_ok("0.1234567890", 0.1234567890);
512        test_parse_ok("0.10000000149011612", 0.10000000149011612);
513        test_parse_ok("0.06411743306171047", 0.06411743306171047);
514
515        test_parse_ok("0e-1", 0e-1);
516        test_parse_ok("0e+1000000", 0e+1000000);
517        test_parse_ok("0.001e-1", 0.001e-1);
518        test_parse_ok("0.001e+123", 0.001e+123);
519        test_parse_ok(
520            "0.000000000000000000000000001e+123",
521            0.000000000000000000000000001e+123,
522        );
523
524        test_parse_ok("1.0", 1.0);
525        test_parse_ok("1350.0", 1350.0);
526        test_parse_ok("1.10000000149011612", 1.1000000014901161);
527
528        test_parse_ok("1e0", 1e0);
529        test_parse_ok("1.0e0", 1.0e0);
530        test_parse_ok("1.0e+0", 1.0e+0);
531        test_parse_ok("1.001e-123", 1.001e-123);
532        test_parse_ok("10000000149011610000.0e-123", 1.000_000_014_901_161e-104);
533        test_parse_ok(
534            "10000000149011612123.001e-123",
535            1.000_000_014_901_161_2e-104,
536        );
537        test_parse_ok("33333333333333333333", 3.333333333333333e19);
538        test_parse_ok("135e-12", 135e-12);
539
540        // test truncated float number without dot
541        test_parse_ok("12448139190673828122020e-47", 1.244813919067383e-25);
542        test_parse_ok(
543            "3469446951536141862700000000000000000e-62",
544            3.469446951536142e-26,
545        );
546    }
547}