lexical_util/
iterator.rs

1//! Specialized iterator traits.
2//!
3//! The traits are for iterables containing bytes, and provide optimizations
4//! which then can be used for contiguous or non-contiguous iterables,
5//! including containers or iterators of any kind.
6
7#![cfg(any(feature = "parse-floats", feature = "parse-integers"))]
8
9use core::mem;
10
11// Re-export our digit iterators.
12#[cfg(not(feature = "format"))]
13pub use crate::noskip::{AsBytes, Bytes};
14#[cfg(feature = "format")]
15pub use crate::skip::{AsBytes, Bytes};
16
17/// A trait for working with iterables of bytes.
18///
19/// These iterators can either be contiguous or not contiguous and provide
20/// methods for reading data and accessing underlying data. The readers
21/// can either be contiguous or non-contiguous, although performance and
22/// some API methods may not be available for both.
23///
24/// # Safety
25///
26/// Safe if [`set_cursor`] is set to an index <= [`buffer_length`], so no
27/// out-of-bounds reads can occur. Also, [`get_buffer`] must return a slice of
28/// initialized bytes. The caller must also ensure that any calls that increment
29/// the cursor, such as [`step_by_unchecked`], [`step_unchecked`], and
30/// [`peek_many_unchecked`] never exceed [`buffer_length`] as well.
31///
32/// [`set_cursor`]: `Iter::set_cursor`
33/// [`buffer_length`]: `Iter::buffer_length`
34/// [`get_buffer`]: `Iter::get_buffer`
35/// [`step_by_unchecked`]: `Iter::step_by_unchecked`
36/// [`step_unchecked`]: `Iter::step_unchecked`
37/// [`peek_many_unchecked`]: `Iter::peek_many_unchecked`
38pub unsafe trait Iter<'a> {
39    /// Determine if the buffer is contiguous in memory.
40    const IS_CONTIGUOUS: bool;
41
42    // CURSORS
43    // -------
44
45    /// Get a ptr to the current start of the buffer.
46    #[inline(always)]
47    fn as_ptr(&self) -> *const u8 {
48        self.as_slice().as_ptr()
49    }
50
51    /// Get a slice to the current start of the buffer.
52    #[inline(always)]
53    fn as_slice(&self) -> &'a [u8] {
54        debug_assert!(self.cursor() <= self.buffer_length());
55        // SAFETY: safe since index must be in range.
56        unsafe { self.get_buffer().get_unchecked(self.cursor()..) }
57    }
58
59    /// Get a slice to the full underlying contiguous buffer,
60    fn get_buffer(&self) -> &'a [u8];
61
62    /// Get the total number of elements in the underlying buffer.
63    #[inline(always)]
64    fn buffer_length(&self) -> usize {
65        self.get_buffer().len()
66    }
67
68    /// Get if no bytes are available in the buffer.
69    ///
70    /// This operators on the underlying buffer: that is,
71    /// it returns if [`as_slice`] would return an empty slice.
72    ///
73    /// [`as_slice`]: Iter::as_slice
74    #[inline(always)]
75    fn is_buffer_empty(&self) -> bool {
76        self.cursor() >= self.get_buffer().len()
77    }
78
79    /// Get the current index of the iterator in the slice.
80    fn cursor(&self) -> usize;
81
82    /// Set the current index of the iterator in the slice.
83    ///
84    /// This is **NOT** the current position of the iterator,
85    /// since iterators may skip digits: this is the cursor
86    /// in the underlying buffer. For example, if `slc[2]` is
87    /// skipped, `set_cursor(3)` would be the 3rd element in
88    /// the iterator, not the 4th.
89    ///
90    /// # Safety
91    ///
92    /// Safe if `index <= self.buffer_length()`. Although this
93    /// won't affect safety, the caller also should be careful it
94    /// does not set the cursor within skipped characters
95    /// since this could affect correctness: an iterator that
96    /// only accepts non-consecutive digit separators would
97    /// pass if the cursor was set between the two.
98    unsafe fn set_cursor(&mut self, index: usize);
99
100    /// Get the current number of digits returned by the iterator.
101    ///
102    /// For contiguous iterators, this can include the sign character, decimal
103    /// point, and the exponent sign (that is, it is always the cursor). For
104    /// non-contiguous iterators, this must always be the only the number of
105    /// digits returned.
106    ///
107    /// This is never used for indexing but will be used for API detection.
108    fn current_count(&self) -> usize;
109
110    // PROPERTIES
111
112    /// Determine if the buffer is contiguous.
113    #[inline(always)]
114    fn is_contiguous(&self) -> bool {
115        Self::IS_CONTIGUOUS
116    }
117
118    /// Get the next value available without consuming it.
119    ///
120    /// This does **NOT** skip digits, and directly fetches the item
121    /// from the underlying buffer.
122    #[inline(always)]
123    fn first(&self) -> Option<&'a u8> {
124        self.get_buffer().get(self.cursor())
125    }
126
127    /// Check if the next element is a given value.
128    #[inline(always)]
129    fn first_is_cased(&self, value: u8) -> bool {
130        Some(&value) == self.first()
131    }
132
133    /// Check if the next element is a given value without case sensitivity.
134    #[inline(always)]
135    fn first_is_uncased(&self, value: u8) -> bool {
136        if let Some(&c) = self.first() {
137            c.eq_ignore_ascii_case(&value)
138        } else {
139            false
140        }
141    }
142
143    /// Check if the next item in buffer is a given value with optional case
144    /// sensitivity.
145    #[inline(always)]
146    fn first_is(&self, value: u8, is_cased: bool) -> bool {
147        if is_cased {
148            self.first_is_cased(value)
149        } else {
150            self.first_is_uncased(value)
151        }
152    }
153
154    // STEP BY
155    // -------
156
157    /// Advance the internal slice by `N` elements.
158    ///
159    /// This does not advance the iterator by `N` elements for
160    /// non-contiguous iterators: this just advances the internal,
161    /// underlying buffer. This is useful for multi-digit optimizations
162    /// for contiguous iterators.
163    ///
164    /// This does not increment the count of items: returns: this only
165    /// increments the index, not the total digits returned. You must use
166    /// this carefully: if stepping over a digit, you must then call
167    /// [`increment_count`] afterwards or else the internal count will
168    /// be incorrect.
169    ///
170    /// [`increment_count`]: DigitsIter::increment_count
171    ///
172    /// # Panics
173    ///
174    /// This will panic if the buffer advances for non-contiguous
175    /// iterators if the current byte is a digit separator, or if the
176    /// count is more than 1.
177    ///
178    /// # Safety
179    ///
180    /// As long as the iterator is at least `N` elements, this
181    /// is safe.
182    unsafe fn step_by_unchecked(&mut self, count: usize);
183
184    /// Advance the internal slice by 1 element.
185    ///
186    ///
187    /// This does not increment the count of items: returns: this only
188    /// increments the index, not the total digits returned. You must
189    /// use this carefully: if stepping over a digit, you must then call
190    /// [`increment_count`] afterwards or else the internal count will
191    /// be incorrect.
192    ///
193    /// [`increment_count`]: DigitsIter::increment_count
194    ///
195    /// # Panics
196    ///
197    /// This will panic if the buffer advances for non-contiguous
198    /// iterators if the current byte is a digit separator.
199    ///
200    /// # Safety
201    ///
202    /// Safe as long as the iterator is not empty.
203    #[inline(always)]
204    unsafe fn step_unchecked(&mut self) {
205        debug_assert!(!self.as_slice().is_empty());
206        // SAFETY: safe if `self.index < self.buffer_length()`.
207        unsafe { self.step_by_unchecked(1) };
208    }
209
210    // READ
211    // ----
212
213    /// Read a value of a difference type from the iterator.
214    ///
215    /// This does **not** advance the internal state of the iterator.
216    /// This can only be implemented for contiguous iterators: non-
217    /// contiguous iterators **MUST** panic.
218    ///
219    /// # Panics
220    ///
221    /// If the iterator is a non-contiguous iterator.
222    ///
223    /// # Safety
224    ///
225    /// Safe as long as the number of the buffer is contains as least as
226    /// many bytes as the size of V. This must be unimplemented for
227    /// non-contiguous iterators.
228    #[inline(always)]
229    unsafe fn peek_many_unchecked<V>(&self) -> V {
230        unimplemented!();
231    }
232
233    /// Try to read a the next four bytes as a u32.
234    ///
235    /// This does not advance the internal state of the iterator.
236    #[inline(always)]
237    fn peek_u32(&self) -> Option<u32> {
238        if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u32>() {
239            // SAFETY: safe since we've guaranteed the buffer is greater than
240            // the number of elements read. u32 is valid for all bit patterns
241            unsafe { Some(self.peek_many_unchecked()) }
242        } else {
243            None
244        }
245    }
246
247    /// Try to read the next eight bytes as a u64.
248    ///
249    /// This does not advance the internal state of the iterator.
250    #[inline(always)]
251    fn peek_u64(&self) -> Option<u64> {
252        if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u64>() {
253            // SAFETY: safe since we've guaranteed the buffer is greater than
254            // the number of elements read. u64 is valid for all bit patterns
255            unsafe { Some(self.peek_many_unchecked()) }
256        } else {
257            None
258        }
259    }
260}
261
262/// Iterator over a contiguous block of bytes.
263///
264/// This allows us to convert to-and-from-slices, raw pointers, and
265/// peek/query the data from either end cheaply.
266///
267/// A default implementation is provided for slice iterators.
268/// This trait **should never** return `null` from `as_ptr`, or be
269/// implemented for non-contiguous data.
270pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
271    /// Get if the iterator cannot return any more elements.
272    ///
273    /// This may advance the internal iterator state, but not
274    /// modify the next returned value.
275    ///
276    /// If this is an iterator, this is based on the number of items
277    /// left to be returned. We do not necessarly know the length of
278    /// the buffer. If this is a non-contiguous iterator, this **MUST**
279    /// advance the state until it knows a value can be returned.
280    ///
281    /// Any incorrect implementations of this affect all safety invariants
282    /// for the rest of the trait. For contiguous iterators, this can be
283    /// as simple as checking if `self.cursor >= self.slc.len()`, but for
284    /// non-contiguous iterators you **MUST** advance to the next element
285    /// to be returned, then check to see if a value exists. The safest
286    /// implementation is always to check if `self.peek().is_none()` and
287    /// ensure [`peek`] is always safe.
288    ///
289    /// If you would like to see if the cursor is at the end of the buffer,
290    /// see [`is_buffer_empty`] instead.
291    ///
292    /// [`is_buffer_empty`]: Iter::is_buffer_empty
293    /// [`peek`]: DigitsIter::peek
294    #[inline(always)]
295    #[allow(clippy::wrong_self_convention)] // reason="required for peeking next item"
296    fn is_consumed(&mut self) -> bool {
297        self.peek().is_none()
298    }
299
300    /// Increment the number of digits that have been returned by the iterator.
301    ///
302    /// For contiguous iterators, this is a no-op. For non-contiguous iterators,
303    /// this increments the count by 1.
304    fn increment_count(&mut self);
305
306    /// Peek the next value of the iterator, without consuming it.
307    ///
308    /// Note that this can modify the internal state, by skipping digits
309    /// for iterators that find the first non-zero value, etc. We optimize
310    /// this for the case where we have contiguous iterators, since
311    /// non-contiguous iterators already have a major performance penalty.
312    fn peek(&mut self) -> Option<Self::Item>;
313
314    /// Peek the next value of the iterator, and step only if it exists.
315    #[inline(always)]
316    fn try_read(&mut self) -> Option<Self::Item> {
317        if let Some(value) = self.peek() {
318            // SAFETY: the slice cannot be empty because we peeked a value.
319            unsafe { self.step_unchecked() };
320            Some(value)
321        } else {
322            None
323        }
324    }
325
326    /// Check if the next element is a given value.
327    #[inline(always)]
328    fn peek_is_cased(&mut self, value: u8) -> bool {
329        Some(&value) == self.peek()
330    }
331
332    /// Check if the next element is a given value without case sensitivity.
333    #[inline(always)]
334    fn peek_is_uncased(&mut self, value: u8) -> bool {
335        if let Some(&c) = self.peek() {
336            c.eq_ignore_ascii_case(&value)
337        } else {
338            false
339        }
340    }
341
342    /// Check if the next element is a given value with optional case
343    /// sensitivity.
344    #[inline(always)]
345    fn peek_is(&mut self, value: u8, is_cased: bool) -> bool {
346        if is_cased {
347            self.peek_is_cased(value)
348        } else {
349            self.peek_is_uncased(value)
350        }
351    }
352
353    /// Peek the next value and consume it if the read value matches the
354    /// expected one.
355    #[inline(always)]
356    fn read_if<Pred: FnOnce(u8) -> bool>(&mut self, pred: Pred) -> Option<u8> {
357        // NOTE: This was implemented to remove usage of unsafe throughout to code
358        // base, however, performance was really not up to scratch. I'm not sure
359        // the cause of this.
360        if let Some(&peeked) = self.peek() {
361            if pred(peeked) {
362                // SAFETY: the slice cannot be empty because we peeked a value.
363                unsafe { self.step_unchecked() };
364                Some(peeked)
365            } else {
366                None
367            }
368        } else {
369            None
370        }
371    }
372
373    /// Read a value if the value matches the provided one.
374    #[inline(always)]
375    fn read_if_value_cased(&mut self, value: u8) -> Option<u8> {
376        if self.peek() == Some(&value) {
377            // SAFETY: the slice cannot be empty because we peeked a value.
378            unsafe { self.step_unchecked() };
379            Some(value)
380        } else {
381            None
382        }
383    }
384
385    /// Read a value if the value matches the provided one without case
386    /// sensitivity.
387    #[inline(always)]
388    fn read_if_value_uncased(&mut self, value: u8) -> Option<u8> {
389        self.read_if(|x| x.eq_ignore_ascii_case(&value))
390    }
391
392    /// Read a value if the value matches the provided one.
393    #[inline(always)]
394    fn read_if_value(&mut self, value: u8, is_cased: bool) -> Option<u8> {
395        if is_cased {
396            self.read_if_value_cased(value)
397        } else {
398            self.read_if_value_uncased(value)
399        }
400    }
401
402    /// Skip zeros from the start of the iterator
403    #[inline(always)]
404    fn skip_zeros(&mut self) -> usize {
405        let start = self.current_count();
406        while self.read_if_value_cased(b'0').is_some() {
407            self.increment_count();
408        }
409        self.current_count() - start
410    }
411
412    /// Determine if the character is a digit.
413    fn is_digit(&self, value: u8) -> bool;
414}
lexical_util/iterator.rs

lexical_util/
iterator.rs