unicode_intervals/
intervalset.rs

1use crate::Interval;
2
3/// A collection of non-overlapping Unicode codepoint intervals that enables interval-based
4/// operations, such as iteration over all Unicode codepoints or finding the codepoint at a
5/// specific position within the intervals.
6#[derive(Debug, Clone)]
7pub struct IntervalSet {
8    intervals: Vec<Interval>,
9    offsets: Vec<u32>,
10    size: u32,
11}
12
13impl IntervalSet {
14    #[must_use]
15    pub(crate) fn new(intervals: Vec<Interval>) -> IntervalSet {
16        let mut offsets = vec![0];
17        offsets.reserve_exact(intervals.len());
18        let mut size = 0;
19        // INVARIANT: `right` is always `>= left`, hence no overflow
20        #[allow(clippy::integer_arithmetic)]
21        for (left, right) in &intervals {
22            size += *right - *left + 1;
23            offsets.push(size);
24        }
25        IntervalSet {
26            intervals,
27            offsets,
28            size,
29        }
30    }
31
32    /// Returns the number of Unicode codepoints in the interval set.
33    ///
34    /// # Examples
35    ///
36    /// ```rust
37    /// # use unicode_intervals::UnicodeCategory;
38    /// let interval_set = unicode_intervals::query()
39    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
40    ///     .interval_set()
41    ///     .expect("Invalid query input");
42    /// assert_eq!(interval_set.len(), 1831);
43    /// ```
44    #[inline]
45    #[must_use]
46    pub fn len(&self) -> usize {
47        self.size as usize
48    }
49
50    /// Returns `true` if the interval set contains no elements.
51    ///
52    /// # Examples
53    ///
54    /// ```rust
55    /// # use unicode_intervals::UnicodeCategory;
56    /// let interval_set = unicode_intervals::query()
57    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
58    ///     // The first upper case letter has 65
59    ///     .max_codepoint(50)
60    ///     .interval_set()
61    ///     .expect("Invalid query input");
62    /// assert!(interval_set.is_empty());
63    /// ```
64    #[inline]
65    #[must_use]
66    pub const fn is_empty(&self) -> bool {
67        self.size == 0
68    }
69
70    /// Returns `true` if the interval set contains a codepoint with the given value.
71    ///
72    /// # Examples
73    ///
74    /// ```rust
75    /// # use unicode_intervals::UnicodeCategory;
76    /// let interval_set = unicode_intervals::query()
77    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
78    ///     .interval_set()
79    ///     .expect("Invalid query input");
80    /// assert!(interval_set.contains('C'));
81    /// assert!(!interval_set.contains('a'));
82    /// ```
83    #[inline]
84    #[must_use]
85    pub fn contains(&self, codepoint: impl Into<u32>) -> bool {
86        self.index_of(codepoint.into()).is_some()
87    }
88
89    /// Returns the codepoint at `index` in the `IntervalSet`.
90    ///
91    /// # Examples
92    ///
93    /// ```rust
94    /// # use unicode_intervals::UnicodeCategory;
95    /// let interval_set = unicode_intervals::query()
96    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
97    ///     .interval_set()
98    ///     .expect("Invalid query input");
99    /// // Get 10th codepoint in this interval set
100    ///assert_eq!(interval_set.codepoint_at(10), Some('K' as u32));
101    /// ```
102    #[inline]
103    #[must_use]
104    pub fn codepoint_at(&self, index: u32) -> Option<u32> {
105        if index >= self.size {
106            return None;
107        }
108        // INVARIANT: There is a positive number of intervals at this point per the check above
109        #[allow(clippy::integer_arithmetic)]
110        let mut current = self.intervals.len() - 1;
111        if self.offsets[current] > index {
112            let (mut high, mut low) = (current, 0_usize);
113            // INVARIANTS:
114            //   - `low + 1` never overflows as all possible values are far below `u32::MAX`
115            //   - `low + high` never overflows because two maximum values for these variables
116            //     are far below `u32::MAX`
117            #[allow(clippy::integer_arithmetic)]
118            while low + 1 < high {
119                let mid = (low + high) / 2;
120                if self.offsets[mid] <= index {
121                    low = mid;
122                } else {
123                    high = mid;
124                }
125            }
126            current = low;
127        }
128        // INVARIANT: `index` & offsets are small enough and won't cause overflow
129        #[allow(clippy::integer_arithmetic)]
130        Some(self.intervals[current].0 + index - self.offsets[current])
131    }
132
133    /// Returns the index of a specific codepoint in the `IntervalSet`.
134    ///
135    /// # Examples
136    ///
137    /// ```rust
138    /// # use unicode_intervals::UnicodeCategory;
139    /// let interval_set = unicode_intervals::query()
140    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
141    ///     .interval_set()
142    ///     .expect("Invalid query input");
143    /// assert_eq!(interval_set.index_of('A'), Some(0));
144    /// assert_eq!(interval_set.index_of('c'), None);
145    /// ```
146    #[inline]
147    #[must_use]
148    pub fn index_of(&self, codepoint: impl Into<u32>) -> Option<u32> {
149        let codepoint = codepoint.into();
150        for (offset, (left, right)) in self.offsets.iter().zip(self.intervals.iter()) {
151            if *left == codepoint {
152                return Some(*offset);
153            } else if *left > codepoint {
154                return None;
155            } else if codepoint <= *right {
156                // INVARIANT: `left` is smaller than `codepoint` and `offset` is small enough,
157                // so there is no overflow
158                #[allow(clippy::integer_arithmetic)]
159                return Some(*offset + (codepoint - left));
160            }
161        }
162        None
163    }
164
165    /// Returns the index of a specific codepoint in the `IntervalSet` if it is present in the set,
166    /// or the index of the closest codepoint that is greater than the given one.
167    ///
168    /// If the given codepoint is greater than the largest codepoint in the set, then the set's
169    /// size is returned.
170    ///
171    /// # Examples
172    ///
173    /// ```rust
174    /// # use unicode_intervals::UnicodeCategory;
175    /// let interval_set = unicode_intervals::query()
176    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
177    ///     .interval_set()
178    ///     .expect("Invalid query input");
179    /// assert_eq!(interval_set.index_above('Z'), 25);
180    /// ```
181    #[inline]
182    #[must_use]
183    pub fn index_above(&self, codepoint: impl Into<u32>) -> u32 {
184        let codepoint = codepoint.into();
185        for (offset, (left, right)) in self.offsets.iter().zip(self.intervals.iter()) {
186            if *left >= codepoint {
187                return *offset;
188            } else if codepoint <= *right {
189                // INVARIANT: `left` is smaller than `codepoint` and `offset` is small enough,
190                // so there is no overflow
191                #[allow(clippy::integer_arithmetic)]
192                return *offset + (codepoint - left);
193            }
194        }
195        self.size
196    }
197
198    /// Returns an iterator over all codepoints in all contained intervals.
199    ///
200    /// # Examples
201    ///
202    /// ```rust
203    /// # use unicode_intervals::UnicodeCategory;
204    /// let interval_set = unicode_intervals::query()
205    ///     .include_categories(UnicodeCategory::UPPERCASE_LETTER)
206    ///     .max_codepoint(67)
207    ///     .interval_set()
208    ///     .expect("Invalid query input");
209    /// let mut iterator = interval_set.iter();
210    /// assert_eq!(iterator.next(), Some('A' as u32));
211    /// assert_eq!(iterator.next(), Some('B' as u32));
212    /// assert_eq!(iterator.next(), Some('C' as u32));
213    /// assert_eq!(iterator.next(), None);
214    /// ```
215    pub fn iter(&self) -> impl Iterator<Item = u32> + DoubleEndedIterator + '_ {
216        self.intervals
217            .iter()
218            .flat_map(|(left, right)| *left..=*right)
219    }
220}
221
222#[cfg(test)]
223mod tests {
224    use super::*;
225    use crate::{UnicodeCategory, UnicodeVersion};
226    use test_case::test_case;
227
228    fn uppercase_letters() -> IntervalSet {
229        crate::query()
230            .include_categories(UnicodeCategory::UPPERCASE_LETTER)
231            .interval_set()
232            .expect("Invalid query input")
233    }
234
235    #[test_case(vec![(1, 1)])]
236    #[test_case(vec![])]
237    fn test_index_not_present(intervals: Vec<Interval>) {
238        assert!(IntervalSet::new(intervals).index_of(0_u32).is_none());
239    }
240
241    #[test_case(vec![], 1, None)]
242    #[test_case(vec![(1, 10)], 11, None)]
243    fn test_get(intervals: Vec<Interval>, index: u32, expected: Option<u32>) {
244        assert_eq!(IntervalSet::new(intervals).codepoint_at(index), expected);
245    }
246
247    #[test_case(vec![(1, 10)], 1, 0)]
248    #[test_case(vec![(1, 10)], 2, 1)]
249    #[test_case(vec![(1, 10)], 100, 10)]
250    fn test_index_above(intervals: Vec<Interval>, index: u32, expected: u32) {
251        assert_eq!(IntervalSet::new(intervals).index_above(index), expected);
252    }
253
254    #[test_case('Z' as u32, 25; "In the set")]
255    #[test_case('b' as u32, 26; "Not in the set")]
256    #[test_case(125218, 1831; "Greater than all")]
257    fn test_index_above_with_uppercase_letters(codepoint: u32, expected: u32) {
258        let interval_set = uppercase_letters();
259        assert_eq!(interval_set.index_above(codepoint), expected);
260    }
261
262    #[test_case('C', true)]
263    #[test_case('a', false)]
264    fn test_contains(codepoint: char, expected: bool) {
265        let interval_set = uppercase_letters();
266        assert_eq!(interval_set.contains(codepoint), expected);
267    }
268
269    #[test_case(10, Some('K' as u32); "Look from left")]
270    #[test_case(27, Some('Á' as u32); "Look from right")]
271    #[test_case(1830, Some(125217); "Max codepoint in the set")]
272    #[test_case(10000, None)]
273    #[test_case(u32::MAX, None)]
274    fn test_codepoint_at(index: u32, expected: Option<u32>) {
275        let interval_set = uppercase_letters();
276        assert_eq!(interval_set.codepoint_at(index), expected);
277    }
278
279    #[test]
280    fn test_codepoint_at_empty_set() {
281        let interval_set = IntervalSet::new(vec![]);
282        assert!(interval_set.codepoint_at(0).is_none());
283    }
284
285    #[test_case('K' as u32, Some(10); "Look from left")]
286    #[test_case('Á' as u32, Some(27); "Look from right")]
287    #[test_case(125184, Some(1797))]
288    #[test_case(5, None)]
289    fn test_index_of(codepoint: u32, expected: Option<u32>) {
290        let interval_set = uppercase_letters();
291        assert_eq!(interval_set.index_of(codepoint), expected);
292    }
293
294    #[test]
295    fn test_iter() {
296        let intervals = crate::query()
297            .include_categories(UnicodeCategory::LOWERCASE_LETTER)
298            .intervals()
299            .expect("Invalid query input");
300        let interval_set = IntervalSet::new(intervals);
301        let codepoints: Vec<_> = interval_set.iter().collect();
302        let mut expected = Vec::with_capacity(interval_set.len());
303        for (left, right) in
304            UnicodeVersion::latest().intervals_for(UnicodeCategory::LOWERCASE_LETTER)
305        {
306            for codepoint in *left..=*right {
307                expected.push(codepoint);
308            }
309        }
310        assert_eq!(codepoints, expected);
311        assert_eq!(interval_set.len(), codepoints.len());
312        assert!(!interval_set.is_empty());
313    }
314
315    #[test]
316    fn test_iter_rev() {
317        let interval_set = uppercase_letters();
318        let mut iter = interval_set.iter().rev();
319        assert_eq!(iter.next(), Some(125217));
320    }
321
322    #[test]
323    #[allow(clippy::redundant_clone)]
324    fn test_interval_set_traits() {
325        let interval_set = IntervalSet::new(vec![(0, 1)]);
326        let _ = interval_set.clone();
327        assert_eq!(
328            format!("{interval_set:?}"),
329            "IntervalSet { intervals: [(0, 1)], offsets: [0, 2], size: 2 }"
330        );
331    }
332}