1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
use crate::util::SliceHelp;
use std::cmp::Ordering;
use std::iter::once;
pub type CodePoint = u32;
/// The maximum (inclusive) code point.
pub const CODE_POINT_MAX: CodePoint = 0x10FFFF;
/// An list of sorted, inclusive, non-empty ranges of code points.
/// This is more efficient than InclusiveRange because it does not need to carry
/// around the Option<bool>.
#[derive(Debug, Copy, Clone)]
pub struct Interval {
pub first: CodePoint,
pub last: CodePoint,
}
impl Interval {
/// Return whether self is before rhs.
fn is_before(self, other: Interval) -> bool {
self.last < other.first
}
/// Return whether self is strictly before rhs.
/// "Strictly" here means there is at least one value after the end of self,
/// and before the start of rhs. Overlapping *or abutting* intervals are
/// not considered strictly before.
fn is_strictly_before(self, rhs: Interval) -> bool {
self.last + 1 < rhs.first
}
/// Compare two intervals.
/// Overlapping *or abutting* intervals are considered equal.
fn mergecmp(self, rhs: Interval) -> Ordering {
if self.is_strictly_before(rhs) {
Ordering::Less
} else if rhs.is_strictly_before(self) {
Ordering::Greater
} else {
Ordering::Equal
}
}
/// Return whether self is mergeable with rhs.
fn mergeable(self, rhs: Interval) -> bool {
self.mergecmp(rhs) == Ordering::Equal
}
/// Return whether self contains a code point \p cp.
pub fn contains(self, cp: CodePoint) -> bool {
self.first <= cp && cp <= self.last
}
/// Return whether self overlaps 'other'.
/// Overlaps means that we share at least one code point with 'other'.
pub fn overlaps(self, other: Interval) -> bool {
!self.is_before(other) && !other.is_before(self)
}
/// Return the interval of codepoints.
pub fn codepoints(self) -> std::ops::Range<u32> {
debug_assert!(self.last + 1 > self.last, "Overflow");
self.first..(self.last + 1)
}
/// Return the number of contained code points.
pub fn count_codepoints(self) -> usize {
(self.last - self.first + 1) as usize
}
}
/// Merge two intervals, which must be overlapping or abutting.
fn merge_intervals(x: Interval, y: &Interval) -> Interval {
debug_assert!(x.mergeable(*y), "Ranges not mergeable");
Interval {
first: std::cmp::min(x.first, y.first),
last: std::cmp::max(x.last, y.last),
}
}
#[derive(Clone, Debug, Default)]
pub struct CodePointSet {
ivs: Vec<Interval>,
}
/// A set of code points stored via as disjoint, non-abutting, sorted intervals.
impl CodePointSet {
pub fn new() -> CodePointSet {
CodePointSet { ivs: Vec::new() }
}
fn assert_is_well_formed(&self) {
if cfg!(debug_assertions) {
for iv in &self.ivs {
debug_assert!(iv.last <= CODE_POINT_MAX);
debug_assert!(iv.first <= iv.last);
}
for w in self.ivs.windows(2) {
debug_assert!(w[0].is_strictly_before(w[1]));
}
}
}
/// Construct from sorted, disjoint intervals. Note these are not allowed to
/// even abut.
pub fn from_sorted_disjoint_intervals(ivs: Vec<Interval>) -> CodePointSet {
let res = CodePointSet { ivs };
res.assert_is_well_formed();
res
}
/// Add an interval of code points to the set.
pub fn add(&mut self, new_iv: Interval) {
// Find the mergeable subarray, that is, the range of intervals that intersect
// or abut new_iv.
let mergeable = self.ivs.equal_range_by(|iv| iv.mergecmp(new_iv));
// Check our work.
if cfg!(debug_assertions) {
for (idx, iv) in self.ivs.iter().enumerate() {
if idx < mergeable.start {
debug_assert!(iv.is_strictly_before(new_iv));
} else if idx >= mergeable.end {
debug_assert!(new_iv.is_strictly_before(*iv));
} else {
debug_assert!(iv.mergeable(new_iv) && new_iv.mergeable(*iv));
}
}
}
// Merge all the overlapping intervals (possibly none), and then replace the
// range.
let merged_iv = self.ivs[mergeable.clone()]
.iter()
.fold(new_iv, merge_intervals);
self.ivs.splice(mergeable, once(merged_iv));
self.assert_is_well_formed();
}
/// Add a single code point to the set.
pub fn add_one(&mut self, cp: CodePoint) {
self.add(Interval {
first: cp,
last: cp,
})
}
/// Add another code point set.
pub fn add_set(&mut self, mut rhs: CodePointSet) {
// Prefer to add to the set with more intervals.
if self.ivs.len() < rhs.ivs.len() {
std::mem::swap(self, &mut rhs);
}
for iv in rhs.intervals() {
self.add(*iv)
}
}
/// \return the intervals
pub fn intervals(&self) -> &[Interval] {
self.ivs.as_slice()
}
/// \return the number of intervals that would be produced by inverting.
pub fn inverted_interval_count(&self) -> usize {
let mut result = 0;
let mut start: CodePoint = 0;
for iv in &self.ivs {
if start < iv.first {
result += 1;
}
start = iv.last + 1;
}
if start <= CODE_POINT_MAX {
result += 1;
}
result
}
/// \return an inverted set: a set containing every code point NOT in the
/// receiver.
pub fn inverted(&self) -> CodePointSet {
// The intervals we collect.
let mut inverted_ivs = Vec::new();
// The first code point *not* in the previous interval.
let mut start: CodePoint = 0;
for iv in &self.ivs {
if start < iv.first {
inverted_ivs.push(Interval {
first: start,
last: iv.first - 1,
})
}
start = iv.last + 1;
}
if start <= CODE_POINT_MAX {
inverted_ivs.push(Interval {
first: start,
last: CODE_POINT_MAX,
})
}
CodePointSet::from_sorted_disjoint_intervals(inverted_ivs)
}
}