iso8601_timestamp/
parse.rs

1use time::{Date, Duration, Month, PrimitiveDateTime, Time};
2
/// Trait implemented locally for very fast parsing of small unsigned integers
/// from a slice of ASCII decimal digit bytes.
trait FastParse: Sized {
    /// Interprets `s` as a run of decimal digits and returns the resulting
    /// value, or `None` when the implementation rejects the input.
    fn parse(s: &[u8]) -> Option<Self>;
}
7
8#[cfg(any(test, not(feature = "verify")))]
9#[inline(always)]
10fn parse_2(s: &[u8]) -> u8 {
11    // SAFETY: This function is only called with slices of length 2
12    unsafe { assume!(s.len() == 2) };
13
14    // NOTE: Despite doing the same as the loop below, this is a hair faster
15    // (like a single clock cycle) due to instruction-level parallelism
16    (s[0] & 0x0f) * 10 + (s[1] & 0x0f)
17}
18
19#[cfg(any(test, not(feature = "verify")))]
20#[inline(always)]
21fn parse_4(s: &[u8]) -> u16 {
22    // SAFETY: This function is only called with slices of length 4
23    unsafe { assume!(s.len() == 4) };
24
25    let mut digits = u32::from_le_bytes({
26        let mut buf = [0; 4];
27        buf.copy_from_slice(s);
28        buf
29    });
30
31    digits = ((digits & 0x0f00_0f00) >> 8) + ((digits & 0x000f_000f) * 10);
32    digits = ((digits & 0x00ff_00ff) >> 16) + ((digits & 0x0000_00ff) * 100);
33
34    digits as u16
35}
36
37macro_rules! impl_fp {
38    ($($t:ty),*) => {$(
39        impl FastParse for $t {
40            #[inline(always)]
41            fn parse(s: &[u8]) -> Option<Self> {
42                #[allow(unused_mut)]
43                let mut overflow = false;
44                let mut num: $t = 0;
45
46                #[cfg(not(feature = "verify"))]
47                match s.len() {
48                    0 => return None,
49                    2 => return Some(parse_2(s) as $t),
50                    4 => return Some(parse_4(s) as $t),
51                    _ => {
52                        for byte in s {
53                            num = num.wrapping_mul(10) + (byte & 0x0f) as $t;
54                        }
55                    }
56                }
57
58                #[cfg(feature = "verify")]
59                for byte in s {
60                    let digit = byte.wrapping_sub(b'0');
61                    overflow |= digit > 9;
62                    num = num.wrapping_mul(10) + digit as $t;
63                }
64
65                match overflow {
66                    false => Some(num),
67                    true => None,
68                }
69            }
70        }
71    )*};
72}
73
74impl_fp!(u8, u16, u32);
75
/// Parses an ISO 8601-style timestamp from raw bytes.
///
/// Layout accepted by the code below, in order:
///
/// * optional year sign: `+`, `-`, or U+2212 MINUS SIGN (bytes `E2 88 92`)
/// * `YYYY` and `MM`, each optionally followed by `-`, then `DD`
/// * end of input (date-only, time is midnight), or one separator byte
///   (`T`, `t`, space or `_`) followed by `HH` and `mm`, each optionally
///   followed by `:`
/// * optional `SS`, optionally followed by `.` or `,` and fractional digits;
///   digits beyond the ninth are consumed but ignored
/// * optional zone: `Z`/`z`, `UTC` in any letter case, or a signed `HH[:]MM`
///   offset; whichever is present must end the input
///
/// A non-zero zone offset is folded into the result (subtracted for `+`,
/// added for `-`/U+2212), so the returned value is at offset zero.
///
/// Seconds above 59 are replaced by `59.999999999`; with the `verify` feature
/// enabled, seconds above 60 are rejected instead.
///
/// Returns `None` if the input does not fit this layout, or if `time` rejects
/// the resulting date, time, or offset arithmetic.
#[inline]
pub fn parse_iso8601(b: &[u8]) -> Option<PrimitiveDateTime> {
    // `offset` is the read cursor into `b`; `negate` is 1 when the year
    // carries a minus sign and 0 otherwise.
    let (mut offset, negate) = match b.first().copied() {
        Some(c @ (b'+' | b'-' | 0xe2)) => {
            let mut offset = 1;

            if unlikely!(c == 0xe2) {
                // check for UTF8 Unicode MINUS SIGN
                if unlikely!(b.get(offset..(offset + 2)) != Some(&[0x88u8, 0x92u8] as &[u8])) {
                    return None;
                }

                offset += 2;
            }

            (offset, (c != b'+') as i32)
        }
        Some(_) => (0, 0),
        // empty input
        None => return None,
    };

    // Reads `$len` bytes at `offset` as a `$ty` through `FastParse`, advances
    // `offset` past them and, when `$eat_byte` is given, also skips one
    // following byte if it equals `$eat_byte`. Makes the enclosing function
    // return `None` if the bytes are missing or `FastParse` rejects them.
    // The `loop` never iterates twice; it only exists so `break res` can
    // yield the value.
    macro_rules! parse {
        ($len:expr, $ty:ty $(, $eat_byte:expr)?) => {loop {
            if let Some(chunk) = b.get(offset..(offset + $len)) {
                if let Some(res) = <$ty as FastParse>::parse(chunk) {
                    offset += $len;

                    $(
                        // conditional increment is slightly faster than branchless
                        if let Some($eat_byte) = b.get(offset) {
                            offset += 1;
                        }
                    )?

                    break res;
                }
            }

            return None;
        }};
    }

    // NOTE: widening the parsed u16 to i32 is lossless
    let mut year = parse!(4, u16, b'-') as i32; // YYYY-?

    // branchless conditional negation: with negate == 1 this evaluates to
    // (year ^ -1) + 1 == -year, with negate == 0 it leaves year unchanged
    // done immediately after parsing to avoid keeping the negate register
    year = (year ^ -negate) + negate;

    let month = parse!(2, u8, b'-'); // MM-?
    let day = parse!(2, u8); // DD

    // NOTE: Inlining this is cheaper than `Month::try_from(month).ok()?`
    let month = match month {
        1 => Month::January,
        2 => Month::February,
        3 => Month::March,
        4 => Month::April,
        5 => Month::May,
        6 => Month::June,
        7 => Month::July,
        8 => Month::August,
        9 => Month::September,
        10 => Month::October,
        11 => Month::November,
        12 => Month::December,
        _ => return None,
    };

    // SAFETY: with `verify`, `FastParse` only accepts ASCII digits, so the
    // four-digit year cannot exceed 9999 in magnitude
    #[cfg(feature = "verify")]
    unsafe {
        assume!(-9999 <= year && year <= 9999);
    }

    // `time` validates the year range and the day against the month
    let Ok(date) = Date::from_calendar_date(year, month, day) else {
        return None;
    };

    let mut date_time = PrimitiveDateTime::new(date, Time::MIDNIGHT);

    match b.get(offset) {
        Some(b'T' | b't' | b' ' | b'_') => {
            offset += 1; // T
        }
        // date-only, None means it's at the end of the string
        None => return Some(date_time),
        _ => return None,
    }

    let hour = parse!(2, u8, b':'); // HH:?
    let minute = parse!(2, u8, b':'); // mm:?

    let mut second = 0;
    let mut nanosecond = 0;

    // seconds are optional; they are only parsed when a digit follows
    if let Some(b'0'..=b'9') = b.get(offset) {
        second = parse!(2, u8);

        if let Some(b'.' | b',') = b.get(offset) {
            offset += 1;

            // weight of the next fractional digit, in nanoseconds
            let mut factor: u32 = 100_000_000; // up to 9 decimal places

            // NOTE: After 9 decimal places, this does nothing other than consume digits,
            // as factor will be zero, so nanosecond will not change
            while let Some(&c) = b.get(offset) {
                let d = c.wrapping_sub(b'0');

                if d > 9 {
                    break; // break on non-numeric input
                }

                nanosecond += d as u32 * factor;
                factor /= 10;
                offset += 1;
            }
        }

        // if leap seconds, ignore the parsed value and set it to just before 60
        // doing it this way avoids duplicate code to consume the extra characters
        // NOTE: This will also "fix" malformed seconds input
        if unlikely!(second > 59) {
            // but don't neglect invalid input if necessary
            #[cfg(feature = "verify")]
            if unlikely!(second > 60) {
                return None;
            }

            second = 59;
            nanosecond = 999_999_999;
        }
    }

    // SAFETY: These values are verified to be within bounds:
    // `second` is either 0 or was clamped to at most 59 above, and
    // `nanosecond` is a sum of at most nine digits weighted 10^8 down to 10^0
    unsafe {
        assume!(nanosecond <= 999_999_999);
        assume!(second <= 59);

        // if input is verified, it's impossible for these values to go over 2 digits
        #[cfg(feature = "verify")]
        {
            assume!(hour <= 99);
            assume!(minute <= 99);
        }
    }

    // `time` rejects out-of-range hour/minute values here
    date_time = match Time::from_hms_nano(hour, minute, second, nanosecond) {
        Ok(time) => date_time.replace_time(time),
        _ => return None,
    };

    let tz = b.get(offset).copied();

    // step past the zone designator byte (harmless when there is none)
    offset += 1;

    match tz {
        // Z, which must be the final byte
        Some(b'Z' | b'z') if likely!(offset == b.len()) => Some(date_time),

        // timezone, like +00:00
        Some(c @ (b'+' | b'-' | 0xe2)) => {
            if unlikely!(c == 0xe2) {
                // check for UTF8 Unicode MINUS SIGN
                if unlikely!(b.get(offset..(offset + 2)) != Some(&[0x88u8, 0x92u8] as &[u8])) {
                    return None;
                }
                offset += 2;
            }

            let tz_offset_hour = parse!(2, u8, b':') as i64;
            let tz_offset_minute = parse!(2, u8) as i64;

            // nothing may follow the offset
            if unlikely!(offset != b.len()) {
                return None;
            }

            // zero offset: skip the date arithmetic entirely
            if tz_offset_hour == 0 && tz_offset_minute == 0 {
                return Some(date_time);
            }

            let tz_offset = Duration::seconds(60 * 60 * tz_offset_hour + tz_offset_minute * 60);

            // these generate function calls regardless, so avoid
            // negating the offset and just chose which call to make
            // (a `+` offset is subtracted, any minus sign is added)
            let checked_op: fn(PrimitiveDateTime, Duration) -> Option<PrimitiveDateTime> = match c != b'+' {
                true => PrimitiveDateTime::checked_add as _,
                false => PrimitiveDateTime::checked_sub as _,
            };

            checked_op(date_time, tz_offset)
        }

        // Parse trailing "UTC", but it does nothing, same as Z
        Some(b'U' | b'u') => match b.get(offset..(offset + 2)) {
            None => None,
            Some(tc) => {
                // avoid multiple branches when this loop is unrolled
                let mut invalid = false;
                for (c, r) in tc.iter().zip(b"tc") {
                    // OR-ing 0x20 lowercases ASCII letters before comparing
                    invalid |= (*c | 0x20) != *r;
                }

                if unlikely!(invalid || (offset + 2) != b.len()) {
                    return None;
                }

                Some(date_time)
            }
        },
        // no zone designator: the input ended right after the time
        None => Some(date_time),

        _ => None,
    }
}
290
#[cfg(test)]
mod tests {
    use super::*;

    /// Ten digits is neither the 2-byte nor the 4-byte special case, so this
    /// exercises the per-byte digit loop of `FastParse`.
    #[test]
    fn test_parse_int() {
        assert_eq!(u32::parse(b"1234567890"), Some(1234567890));
    }

    /// `parse_2` must reproduce every zero-padded two-digit value.
    #[cfg(feature = "std")]
    #[test]
    fn test_parse_int2() {
        for expected in 0..=99 {
            let text = format!("{expected:02}");
            assert_eq!(parse_2(text.as_bytes()), expected);
        }
    }

    /// `parse_4` must reproduce every zero-padded four-digit value.
    #[cfg(feature = "std")]
    #[test]
    fn test_parse_int4() {
        for expected in 0..=9999 {
            let text = format!("{expected:04}");
            assert_eq!(parse_4(text.as_bytes()), expected);
        }
    }
}