char_iter/
lib.rs

1//! This crates provides a performant iterator over a linear range of
2//! characters.
3//!
4//! The iterator is inclusive of its endpoint, and correctly handles
5//! the surrogate range (`0xD800`-`0xDFFF`). This induces only one
6//! extra branch (or conditional-move) compared to a direct `x..y`
7//! integer iterator that doesn't handle the surrogate range.
8//!
9//! [Source](https://github.com/huonw/char-iter)
10//!
11//! # Installation
12//!
13//! Add this to your Cargo.toml:
14//!
15//! ```toml
16//! [dependencies]
17//! char-iter = "0.1"
18//! ```
19//!
20//! # Examples
21//!
22//! ```rust
23//! let v: Vec<char> = char_iter::new('a', 'f').collect();
24//! assert_eq!(v, &['a', 'b', 'c', 'd', 'e', 'f']);
25//! ```
26//!
27//! Reverse iteration is supported:
28//!
29//! ```rust
30//! // (codepoints 224 to 230)
31//! let v: Vec<char> = char_iter::new('à', 'æ').rev().collect();
32//! assert_eq!(v, &['æ', 'å', 'ä', 'ã', 'â', 'á', 'à']);
33//! ```
34//!
35//! The surrogate range is skipped:
36//!
37//! ```rust
38//! let v: Vec<char> = char_iter::new('\u{D7FF}', '\u{E000}').collect();
39//! // 0xD800, ... 0xDFFF are missing
40//! assert_eq!(v, &['\u{D7FF}', '\u{E000}']);
41//! ```
42
43#![cfg_attr(all(test, feature = "bench"), feature(test))]
44
45/// An iterator over a linear range of characters.
46///
47/// This is constructed by the `new` function at the top level.
48pub struct Iter {
49    start: char,
50    end: char,
51    finished: bool,
52}
53
54/// Create a new iterator over the characters (specifically Unicode
55/// Scalar Values) from `start` to `end`, inclusive.
56///
57/// # Panics
58///
59/// This panics if `start > end`.
60pub fn new(start: char, end: char) -> Iter {
61    assert!(start <= end);
62    Iter {
63        start: start,
64        end: end,
65        finished: false
66    }
67}
68
69const SUR_START: u32 = 0xD800;
70const SUR_END: u32 = 0xDFFF;
71const BEFORE_SUR: u32 = SUR_START - 1;
72const AFTER_SUR: u32 = SUR_END + 1;
73
74enum Dir { Forward, Backward }
75
76#[inline(always)]
77fn step(c: char, d: Dir) -> char {
78    let val = c as u32;
79    let new_val = match d {
80        Dir::Forward => if val == BEFORE_SUR {AFTER_SUR} else {val + 1},
81        Dir::Backward => if val == AFTER_SUR {BEFORE_SUR} else {val - 1},
82    };
83    debug_assert!(std::char::from_u32(new_val).is_some());
84    unsafe {std::mem::transmute(new_val)}
85}
86
87impl Iterator for Iter {
88    type Item = char;
89
90    fn next(&mut self) -> Option<char> {
91        if self.finished {
92            return None
93        }
94        let ret = Some(self.start);
95        if self.start == self.end {
96            self.finished = true;
97        } else {
98            self.start = step(self.start, Dir::Forward)
99        }
100        ret
101    }
102
103    fn size_hint(&self) -> (usize, Option<usize>) {
104        let len = if self.finished {
105            0
106        } else {
107            let start = self.start as u32;
108            let end = self.end as u32;
109            let naive_count = (end - start + 1) as usize;
110            if start <= BEFORE_SUR && end >= AFTER_SUR {
111                naive_count - (SUR_END - SUR_START + 1) as usize
112            } else {
113                naive_count
114            }
115        };
116        (len, Some(len))
117    }
118}
119impl DoubleEndedIterator for Iter {
120    fn next_back(&mut self) -> Option<char> {
121        if self.finished {
122            return None
123        }
124        let ret = Some(self.end);
125        if self.start == self.end {
126            self.finished = true;
127        } else {
128            self.end = step(self.end, Dir::Backward)
129        }
130        ret
131    }
132}
133
134impl ExactSizeIterator for Iter {}
135
136#[cfg(test)]
137mod tests {
138    use super::*;
139
140    #[test]
141    fn smoke() {
142        let v: Vec<char> = new('a', 'f').collect();
143        assert_eq!(v, &['a', 'b', 'c', 'd', 'e', 'f']);
144    }
145    #[test]
146    fn smoke_rev() {
147        let v: Vec<char> = new('a', 'f').rev().collect();
148        assert_eq!(v, &['f', 'e', 'd', 'c', 'b', 'a']);
149    }
150    #[test]
151    fn smoke_size_hint() {
152        let mut iter = new('a', 'f');
153        assert_eq!(iter.size_hint(), (6, Some(6)));
154        for i in (0..6).rev() {
155            iter.next();
156            assert_eq!(iter.size_hint(), (i, Some(i)));
157        }
158        iter.next();
159        assert_eq!(iter.size_hint(), (0, Some(0)));
160    }
161    #[test]
162    fn smoke_rev_size_hint() {
163        let mut iter = new('a', 'f').rev();
164        assert_eq!(iter.size_hint(), (6, Some(6)));
165        for i in (0..6).rev() {
166            iter.next();
167            assert_eq!(iter.size_hint(), (i, Some(i)));
168        }
169        iter.next();
170        assert_eq!(iter.size_hint(), (0, Some(0)));
171    }
172    #[test]
173    fn equal() {
174        let v: Vec<char> = new('a', 'a').collect();
175        assert_eq!(v, &['a']);
176    }
177    #[test]
178    fn equal_rev() {
179        let v: Vec<char> = new('a', 'a').rev().collect();
180        assert_eq!(v, &['a']);
181    }
182    #[test]
183    fn equal_size_hint() {
184        let mut iter = new('a', 'a');
185        assert_eq!(iter.size_hint(), (1, Some(1)));
186        for i in (0..1).rev() {
187            iter.next();
188            assert_eq!(iter.size_hint(), (i, Some(i)));
189        }
190        iter.next();
191        assert_eq!(iter.size_hint(), (0, Some(0)));
192    }
193    #[test]
194    fn equal_rev_size_hint() {
195        let mut iter = new('a', 'a').rev();
196        assert_eq!(iter.size_hint(), (1, Some(1)));
197        for i in (0..1).rev() {
198            iter.next();
199            assert_eq!(iter.size_hint(), (i, Some(i)));
200        }
201        iter.next();
202        assert_eq!(iter.size_hint(), (0, Some(0)));
203    }
204
205    const S: char = '\u{D7FF}';
206    const E: char = '\u{E000}';
207    #[test]
208    fn surrogate() {
209        let v: Vec<char> = new(S, E).collect();
210        assert_eq!(v, &[S, E]);
211    }
212    #[test]
213    fn surrogate_rev() {
214        let v: Vec<char> = new(S, E).rev().collect();
215        assert_eq!(v, &[E, S]);
216    }
217    #[test]
218    fn surrogate_size_hint() {
219        let mut iter = new(S, E);
220        assert_eq!(iter.size_hint(), (2, Some(2)));
221        for i in (0..2).rev() {
222            iter.next();
223            assert_eq!(iter.size_hint(), (i, Some(i)));
224        }
225        iter.next();
226        assert_eq!(iter.size_hint(), (0, Some(0)));
227    }
228    #[test]
229    fn surrogate_rev_size_hint() {
230        let mut iter = new(S, E).rev();
231        assert_eq!(iter.size_hint(), (2, Some(2)));
232        for i in (0..2).rev() {
233            iter.next();
234            assert_eq!(iter.size_hint(), (i, Some(i)));
235        }
236        iter.next();
237        assert_eq!(iter.size_hint(), (0, Some(0)));
238    }
239
240    #[test]
241    fn full_range() {
242        let iter = new('\u{0}', '\u{10FFFF}');
243        let mut count = 1_114_112 - 2048;
244        assert_eq!(iter.size_hint(), (count, Some(count)));
245
246        for (i, c) in (0..0xD800).chain(0xE000..0x10FFFF + 1).zip(iter) {
247            assert_eq!(::std::char::from_u32(i).unwrap(), c);
248            count -= 1;
249        }
250        assert_eq!(count, 0);
251    }
252
253    #[should_panic]
254    #[test]
255    fn invalid() {
256        new('b','a');
257    }
258}
259#[cfg(all(test, feature = "bench"))]
260mod benches {
261    use super::*;
262    extern crate test;
263
264    #[bench]
265    fn count(b: &mut test::Bencher) {
266        b.iter(|| new('\u{0}', '\u{10FFFF}').count())
267    }
268    #[bench]
269    fn count_baseline(b: &mut test::Bencher) {
270        // this isn't the same range or the same length, but it's
271        // close enough.
272        b.iter(|| (0..0x10FFFF + 1).count())
273    }
274}