Skip to main content

lexigram_lib/
segments.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3use std::collections::BTreeSet;
4use std::fmt::{Debug, Display, Formatter, LowerHex, UpperHex};
5use std::ops::{Add, Deref, DerefMut, RangeInclusive};
6use std::iter::Sum;
7use crate::CollectJoin;
8use crate::btreeset;
9use crate::char_reader::{escape_char, UTF8_GAP_MAX, UTF8_GAP_MIN, UTF8_MAX};
10use crate::segmap::Seg;
11// ---------------------------------------------------------------------------------------------
12// Segments
13
14#[derive(Clone, PartialEq, Default, PartialOrd, Eq, Ord)]
15pub struct Segments(BTreeSet<Seg>);
16
17impl Segments {
18
19    #[inline]
20    pub fn empty() -> Self {    // TODO: rename to `new()`
21        Segments(btreeset![])
22    }
23
24    pub fn new(Seg(a, b): Seg) -> Self {
25        if a <= b {
26            Segments(btreeset![Seg(a, b)])
27        } else {
28            Self::empty()
29        }
30    }
31
32    #[inline]
33    pub fn is_dot(&self) -> bool {
34        self.len() == 2 && self.first().unwrap() == &Seg::DOT_LOW && self.last().unwrap() == &Seg::DOT_HIGH
35    }
36
37    #[inline]
38    pub fn dot() -> Segments {
39        Segments(BTreeSet::from([Seg::DOT_LOW, Seg::DOT_HIGH]))
40    }
41
42    pub fn insert(&mut self, seg: Seg) {
43        if seg.0 <= seg.1 {
44            self.0.insert(seg);
45        }
46    }
47
48    pub fn to_char(&self) -> Option<char> {
49        if self.len() == 1 {
50            let first = self.first().unwrap();
51            if first.0 == first.1 {
52                return char::from_u32(first.0)
53            }
54        }
55        None
56    }
57
58    pub fn intersect_char(&self, char: char) -> SegmentsCmp {
59        let c = char as u32;
60        let ci = Segments(btreeset![Seg(c, c)]);
61        for &Seg(a, b) in &self.0 {
62            if a <= c && c <= b {
63                let mut inside = self.clone();
64                inside.replace(Seg(a, b)).expect("cannot extract original data");
65                if a < c {
66                    inside.insert(Seg(a, c - 1));
67                }
68                if c < b {
69                    inside.insert(Seg(c + 1, b));
70                }
71                return SegmentsCmp { common: ci, internal: inside, external: Self::empty() };
72            }
73        }
74        SegmentsCmp {
75            common: Self::empty(),
76            internal: self.clone(),
77            external: Self::empty(),
78        }
79    }
80
81    // (a, b) inter (c, d) => (common, internal a-b, external a-b)
82    // only processes a <= c || (a == c && b <= d)
83    pub fn segment_intersect(Seg(a, b): Seg, Seg(c, d): Seg) -> SegmentsCmp {
84        if a < c || (a == c && b <= d) {
85            if a < c {
86                if b < c {
87                    SegmentsCmp { common: Segments::empty(), internal: Segments::new(Seg(a, b)), external: Segments::new(Seg(c, d)) }
88                } else if b <= d {
89                    SegmentsCmp { common: Segments::new(Seg(c, b)), internal: Segments::new(Seg(a, c - 1)), external: Segments::new(Seg(b + 1, d)) }
90                } else {
91                    SegmentsCmp { common: Segments::new(Seg(c, d)), internal: Segments(btreeset![Seg(a, c - 1), Seg(d + 1, b)]), external: Segments::empty() }
92                }
93            } else {
94                SegmentsCmp { common: Segments::new(Seg(a, b)), internal: Segments::empty(), external: Segments::new(Seg(b + 1, d)) }
95            }
96        } else {
97            Self::segment_intersect(Seg(c, d), Seg(a, b)).inverse()
98        }
99    }
100
101    pub fn intersect(&self, other: &Self) -> SegmentsCmp {
102        let mut ab_iter = self.iter();
103        let mut cd_iter = other.iter();
104        let mut ab = ab_iter.next().cloned();
105        let mut cd = cd_iter.next().cloned();
106        let mut result = SegmentsCmp::empty();
107        while let (Some(new_ab), Some(new_cd)) = (ab, cd) {
108            let mut cmp = Self::segment_intersect(new_ab, new_cd);
109            if cmp.common.is_empty() {
110                if new_ab.1 < new_cd.0 {
111                    result.internal.insert(new_ab);
112                    ab = ab_iter.next().cloned();
113                } else {
114                    result.external.insert(new_cd);
115                    cd = cd_iter.next().cloned();
116                }
117            } else {
118                if new_ab.1 > new_cd.1 { // processes the trailing segment
119                    ab = cmp.internal.pop_last();
120                } else {
121                    ab = ab_iter.next().cloned();
122                }
123                if new_cd.1 > new_ab.1 {
124                    cd = cmp.external.pop_last();
125                } else {
126                    cd = cd_iter.next().cloned();
127                }
128                result.extend(&cmp);
129            }
130        }
131        if let Some(ab) = ab {
132            result.internal.insert(ab);
133            result.internal.extend(ab_iter);
134        } else if let Some(cd) = cd {
135            result.external.insert(cd);
136            result.external.extend(cd_iter);
137        }
138        result
139    }
140
141    /// Partitions the segments in fonction of `other`'s segments, splitting the current segments
142    /// according to `other` and adding segments from `other`. Can be used iteratively on a collection
143    /// of Segments to obtain a partition of their segments.
144    ///
145    /// Returns `true` if the segments were modified.
146    ///
147    /// Example:
148    /// ```
149    /// use lexigram_lib::segments::Segments;
150    /// use lexigram_lib::segmap::Seg;
151    ///
152    /// let mut a = Segments::from([Seg(0, 10), Seg(20, 30)]);
153    /// let b = Segments::from([Seg(5, 6), Seg(15, 25)]);
154    /// assert!(a.add_partition(&b));
155    /// assert_eq!(a.into_iter().collect::<Vec<_>>(), vec![Seg(0, 4), Seg(5, 6), Seg(7, 10), Seg(15, 19), Seg(20, 25), Seg(26, 30)]);
156    /// ```
157    pub fn add_partition(&mut self, other: &Self) -> bool {
158        let cmp = self.intersect(other);
159        if !(cmp.common.is_empty() && cmp.external.is_empty()) {
160            self.clear();
161            self.extend(cmp.internal.0);
162            self.extend(cmp.common.0);
163            self.extend(cmp.external.0);
164            true
165        } else {
166            false
167        }
168    }
169
170    /// Slices the segments to match other's partition, but without merging self's initial partition.
171    /// ```
172    /// # use lexigram_lib::segments::Segments;
173    /// # use lexigram_lib::segmap::Seg;
174    /// let mut ab = Segments::from([Seg(1 as u32, 50 as u32)]);
175    /// let cd = Segments::from([Seg(10 as u32, 20 as u32), Seg(30 as u32, 40 as u32)]);
176    /// ab.slice_partitions(&cd);
177    /// assert_eq!(ab, Segments::from([
178    ///     Seg(1 as u32, 9 as u32), Seg(10 as u32, 20 as u32), Seg(21 as u32, 29 as u32),
179    ///     Seg(30 as u32, 40 as u32), Seg(41 as u32, 50 as u32)]));
180    /// ```
181    pub fn slice_partitions(&mut self, other: &Self) {
182        let cmp = self.intersect(other);
183        self.clear();
184        self.extend(cmp.internal.0);
185        self.extend(cmp.common.0);
186    }
187
188    pub fn normalize(&mut self) {
189        if !self.is_empty() {
190            let mut new = BTreeSet::<Seg>::new();
191            let mut segments = std::mem::take(&mut self.0).into_iter();
192            let mut last = segments.next().unwrap();
193            for Seg(a, b) in segments {
194                if a > last.1 + 1 {
195                    new.insert(last);
196                    last = Seg(a, b);
197                } else {
198                    last.1 = b;
199                }
200            }
201            new.insert(last);
202            self.0 = new;
203        }
204    }
205
206    pub fn normalized(&self) -> Self {
207        let mut n = self.clone();
208        n.normalize();
209        n
210    }
211
212    pub fn chars(&self) -> ReTypeCharIter {
213        ReTypeCharIter { segments: Some(self.0.clone()), range: None }
214    }
215
216    /// Inserts Seg(start, stop) in the current segment, except the UTF-8 gap between
217    /// UTF8_GAP_MIN (0xd800) and UTF8_GAP_MAX (0xdfff). If a part or the entirety of
218    /// that gap is within [start-stop], then it's extruded first.
219    pub fn insert_utf8(&mut self, start: u32, stop: u32) {
220        if start <= stop {
221            if stop < UTF8_GAP_MIN || start > UTF8_GAP_MAX {
222                self.0.insert(Seg(start, stop));
223            } else {
224                if start < UTF8_GAP_MIN {
225                    self.0.insert(Seg(start, UTF8_GAP_MIN - 1));
226                }
227                if stop > UTF8_GAP_MAX {
228                    self.0.insert(Seg(UTF8_GAP_MAX + 1, stop));
229                }
230            }
231        }
232    }
233
234    /// Negates the selection, except the UTF-8 gap between UTF8_GAP_MIN (0xd800) and
235    /// UTF8_GAP_MAX (0xdfff), which is always excluded.
236    pub fn not(&self) -> Self {
237        let mut inv = Segments::empty();
238        let mut start = 0;
239        for seg in &self.0 {
240            if seg.0 > start {
241                inv.insert_utf8(start, seg.0 - 1);
242            }
243            start = seg.1 + 1;
244        }
245        if start < UTF8_MAX {
246            inv.insert_utf8(start, UTF8_MAX);
247        }
248        inv
249    }
250}
251
252impl FromIterator<Seg> for Segments {
253    fn from_iter<T: IntoIterator<Item = Seg>>(segs: T) -> Self {
254        Segments(BTreeSet::from_iter(segs))
255    }
256}
257
258impl IntoIterator for Segments {
259    type Item = Seg;
260    type IntoIter = <BTreeSet<Seg> as IntoIterator>::IntoIter;
261
262    fn into_iter(self) -> Self::IntoIter {
263        self.0.into_iter()
264    }
265}
266
267impl<'a> IntoIterator for &'a Segments {
268    type Item = &'a Seg;
269    type IntoIter = <&'a BTreeSet<Seg> as IntoIterator>::IntoIter;
270
271    fn into_iter(self) -> Self::IntoIter {
272        self.0.iter()
273    }
274}
275
276impl<const N: usize> From<[Seg; N]> for Segments {
277    /// Converts a `[Seg; N]` into a `Segments`.
278    ///
279    /// ```
280    /// # use lexigram_lib::segments::Segments;
281    /// # use lexigram_lib::segmap::Seg;
282    /// let set1 = Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)]);
283    /// ```
284    fn from(arr: [Seg; N]) -> Self {
285        Segments(BTreeSet::from(arr))
286    }
287}
288
289impl From<char> for Segments {
290    fn from(c: char) -> Self {
291        Segments(btreeset![Seg(c as u32, c as u32)])
292    }
293}
294
295impl From<(char, char)> for Segments {
296    fn from((first, last): (char, char)) -> Self {
297        Segments(btreeset![Seg(first as u32, last as u32)])
298    }
299}
300
301impl Deref for Segments {
302    type Target = BTreeSet<Seg>;
303
304    fn deref(&self) -> &Self::Target {
305        &self.0
306    }
307}
308
309impl DerefMut for Segments {
310    fn deref_mut(&mut self) -> &mut Self::Target {
311        &mut self.0
312    }
313}
314
315impl Debug for Segments {
316    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
317        write!(f, "Segments({})", self.0.iter().map(|seg| format!("Seg(0x{:x}, 0x{:x})", seg.0, seg.1)).join(", "))
318    }
319}
320
321impl Display for Segments { // TODO: create wrapper to set the desired style (no bracket / bracket, no neg / neg, normalized / not)
322    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
323        if let Some(c) = self.to_char() {
324            write!(f, "'{}'", escape_char(c))
325        } else {
326            let normalized = self.normalized();
327            if normalized.is_dot() {
328                write!(f, "DOT")
329            } else {
330                if normalized.len() > 1 {
331                    let alt = normalized.not();
332                    if alt.len() < normalized.len() {
333                        return write!(f, "~[{}]", alt.0.iter()
334                            .map(|seg| seg.to_string())
335                            .join(", ")
336                        );
337                    }
338                }
339                write!(f, "{}", normalized.0.iter()
340                    .map(|seg| seg.to_string())
341                    .join(", ")
342                )
343            }
344        }
345    }
346}
347
348/// "{:x}" is used to show the raw segments with codes
349impl LowerHex for Segments {
350    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
351        write!(f, "{}", self.iter()
352            .map(|Seg(a, b)| if a == b { format!("{a}") } else { format!("{a}-{b}") })
353            .join(", ")
354        )
355    }
356}
357
358/// "{:X}" is used to show the raw segments with characters
359impl UpperHex for Segments {
360    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
361        write!(f, "{}", self.iter()
362            .map(|Seg(a, b)| if a == b {
363                format!("'{}'", escape_char(char::from_u32(*a).unwrap()))
364            } else {
365                format!("'{}'-'{}'", escape_char(char::from_u32(*a).unwrap()), escape_char(char::from_u32(*b).unwrap()))
366            })
367            .join(", ")
368        )
369    }
370}
371
372#[derive(Debug, Clone, PartialEq)]
373pub struct SegmentsCmp {
374    pub common: Segments,      // common to self and other
375    pub internal: Segments,    // only in self, external to other
376    pub external: Segments     // external to self, only in other
377}
378
379impl SegmentsCmp {
380    pub fn empty() -> Self {
381        SegmentsCmp { common: Segments::empty(), internal: Segments::empty(), external: Segments::empty() }
382    }
383
384    pub fn inverse(self) -> Self {
385        SegmentsCmp { common: self.common, internal: self.external, external: self.internal }
386    }
387
388    pub fn extend(&mut self, other: &Self) {
389        self.common.extend(other.common.iter());
390        self.internal.extend(other.internal.iter());
391        self.external.extend(other.external.iter());
392    }
393
394    pub fn normalize(&mut self) {
395        self.common.normalize();
396        self.internal.normalize();
397        self.external.normalize();
398    }
399}
400
401impl Display for SegmentsCmp {
402    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
403        write!(f, "<common: {}, internal: {}, external: {}>", self.common, self.internal, self.external)
404    }
405}
406
407#[cfg(test)]
408/// "{:x}" is used to show the raw segments with codes
409impl LowerHex for SegmentsCmp {
410    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
411        write!(f, "<common: {:x}, internal: {:x}, external: {:x}>", self.common, self.internal, self.external)
412    }
413}
414
415pub struct ReTypeCharIter {
416    segments: Option<BTreeSet<Seg>>,
417    range: Option<RangeInclusive<u32>>
418}
419
420impl Iterator for ReTypeCharIter {
421    type Item = char;
422
423    fn next(&mut self) -> Option<Self::Item> {
424        let mut next = self.range.as_mut().and_then(|r| r.next());
425        if next.is_none() {
426            if let Some(segments) = &mut self.segments {
427                if let Some(Seg(a, b)) = segments.pop_first() {
428                    self.range = Some(a..=b);
429                    next = self.range.as_mut().and_then(|r| r.next());
430                }
431            }
432        }
433        next.map(|code| char::from_u32(code).unwrap())
434    }
435}
436
437impl Add for Segments {
438    type Output = Self;
439
440    fn add(mut self, rhs: Self) -> Self::Output {
441        self.add_partition(&rhs);
442        self
443    }
444}
445
446impl Sum for Segments {
447    fn sum<I: Iterator<Item=Self>>(mut iter: I) -> Self {
448        let mut acc = iter.next().unwrap_or(Segments::empty());
449        for next in iter {
450            acc.add_partition(&next);
451        }
452        acc
453    }
454}
455
456// ---------------------------------------------------------------------------------------------
457// Macros
458
459pub mod macros {
460    /// Generates a Segments initialization from Seg values. The macro only accepts literals, either characters or integers,
461    /// along with a few identifiers:
462    ///
463    /// - `DOT` matches all UTF-8 characters
464    /// - `MIN`      = 0
465    /// - `LOW_MAX`  = 0xd7ff
466    /// - `GAP_MIN`  = 0xd800 (GAP_MIN - GAP_MAX are forbidden UTF-8 codepoint values)
467    /// - `GAP_MAX`  = 0xdfff
468    /// - `HIGH_MIN` = 0xe000
469    /// - `MAX`      = 0x10ffff
470    ///
471    /// Integer values are UTF-8 codepoint values, not the 1-4 byte representation.
472    ///
473    /// # Example
474    /// ```
475    /// # use lexigram_lib::{segments, segments::Segments, segmap::Seg};
476    /// assert_eq!(segments!('a', '0'-'9'), Segments::from([Seg('a' as u32, 'a' as u32), Seg('0' as u32, '9' as u32)]));
477    /// assert_eq!(segments!(DOT), Segments::dot());
478    /// assert_eq!(segments!(~ '1'-'8'), segments![MIN-'0', '9'-LOW_MAX, HIGH_MIN-MAX]);
479    /// ```
480    #[macro_export]
481    macro_rules! segments {
482        () => { $crate::segments::Segments::empty() };
483        (DOT) => { $crate::segments::Segments::dot() };
484        ($($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+) => { $crate::segments::Segments::from([$($crate::seg!($($a1)?$($a2)? $(- $($b1)?$($b2)?)?)),+]) };
485        (~ $($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+) => { $crate::segments![$($($a1)?$($a2)? $(- $($b1)?$($b2)?)?),+].not() };
486        //
487        ($($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+) => { $crate::segments![$($crate::seg!($($a1)?$($a2)? $(- $($b1)?$($b2)?)?)),+] };
488    }
489
490    /// Generates the key-value pairs corresponding to the `Segments => int` arguments, which can be
491    /// used to add values to `BTreeMap<Segments, StateId>` state transitions.
492    ///
493    /// All segments must be with square brackets or without them, but it's not allowed to mix both
494    /// formats in the same macro. Negation (`~`) can only be used with square brackets, and is placed
495    /// in front of the opening bracket.
496    ///
497    /// Segments are made up of any number of single character or codepoint literals, or inclusive
498    /// ranges of character / codepoint literals.
499    ///
500    /// A few identifiers can also be used:
501    /// - `DOT` matches all UTF-8 characters
502    /// - `MIN`      = 0
503    /// - `LOW_MAX`  = 0xd7ff
504    /// - `GAP_MIN`  = 0xd800 (GAP_MIN - GAP_MAX are forbidden UTF-8 codepoint values)
505    /// - `GAP_MAX`  = 0xdfff
506    /// - `HIGH_MIN` = 0xe000
507    /// - `MAX`      = 0x10ffff
508    ///
509    /// Integer values are UTF-8 codepoint values, not the 1-4 byte representation.
510    ///
511    /// # Example
512    /// ```
513    /// # use std::collections::{BTreeMap, BTreeSet};
514    /// # use lexigram_lib::{btreemap, segments, branch, segments::Segments};
515    /// # use lexigram_lib::segmap::Seg;
516    /// let transitions = btreemap![
517    ///     0 => branch!['a'-'c' => 0],
518    ///     1 => branch!['a'-'c', '0'-'2' => 0],
519    ///     2 => branch!['a'-'c', '.' => 0],
520    ///     3 => branch!['a'-'c', '.' => 0, 'd'-'f' => 1],
521    ///     4 => branch![['a'-'c', '.'] => 0, ['d'-'f'] => 1],
522    ///     5 => branch![['a'-'c', '.'] => 0, ~['a'-'c', '.'] => 1],
523    ///     6 => branch![0 - LOW_MAX, HIGH_MIN - MAX => 0],
524    ///     7 => branch!['a' => 0, DOT => 1],
525    /// ];
526    /// assert_eq!(transitions,
527    ///     btreemap![
528    ///         0 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32)]) => 0],
529    ///         1 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('0' as u32, '2' as u32)]) => 0],
530    ///         2 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0],
531    ///         3 => btreemap![
532    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
533    ///             Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
534    ///         4 => btreemap![
535    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
536    ///             Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
537    ///         5 => btreemap![
538    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
539    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]).not() => 1],
540    ///         6 => btreemap![Segments::from([Seg(0_u32, 0xd7ff_u32), Seg(0xe000_u32, 0x10ffff_u32)]) => 0],
541    ///         7 => btreemap![Segments::from([Seg('a' as u32, 'a' as u32)]) => 0, Segments::dot() => 1]
542    ///     ]);
543    /// ```
544    #[macro_export]
545    macro_rules! branch {
546        // doesn't work, so we can't mix [] and non-[] segments:
547        // ($( $($($($a1:literal)?$($a2:ident)? $(-$($b1:literal)?$($b2:ident)?)?),+)? $(~[$($($c1:literal)?$($c2:ident)? $(-$($d1:literal)?$($d2:ident)?)?),+])? => $value:expr ),*)
548        // => { btreemap![$($(segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+])? $(segments![~ $($($c1)?$($c2)?$(- $($d1)?$($d2)?)?),+])? => $value),*] };
549
550        ($( $($($a1:literal)?$($a2:ident)? $(-$($b1:literal)?$($b2:ident)?)?),+ => $value:expr ),*)
551        => { btreemap![$($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+] => $value),*] };
552        ($( $([$($($a1:literal)?$($a2:ident)? $(-$($b1:literal)?$($b2:ident)?)?),+])? $(~[$($($c1:literal)?$($c2:ident)? $(-$($d1:literal)?$($d2:ident)?)?),+])? => $value:expr ),*)
553        => { btreemap![$($($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+])? $($crate::segments![~ $($($c1)?$($c2)?$(- $($d1)?$($d2)?)?),+])? => $value),*] };
554    }
555}
556
557// ---------------------------------------------------------------------------------------------
558// Tests
559// ---------------------------------------------------------------------------------------------
560
561#[cfg(test)]
562mod tests {
563    use iter_index::IndexerIterator;
564    use crate::{seg, branch, btreemap, segments};
565    use super::*;
566
567    fn new_cmp(c: Seg, i: Seg, e: Seg) -> SegmentsCmp {
568        SegmentsCmp { common: Segments::new(c), internal: Segments::new(i), external: Segments::new(e) }
569    }
570
571    fn build_segments() -> Vec<(Seg, Seg, SegmentsCmp)> {
572        vec![
573            (Seg(1, 2), Seg(3, 4), new_cmp(Seg(9, 0), Seg(1, 2), Seg(3, 4))),
574            (Seg(1, 2), Seg(2, 3), new_cmp(Seg(2, 2), Seg(1, 1), Seg(3, 3))),
575            (Seg(1, 3), Seg(2, 4), new_cmp(Seg(2, 3), Seg(1, 1), Seg(4, 4))),
576            (Seg(1, 3), Seg(2, 3), new_cmp(Seg(2, 3), Seg(1, 1), Seg(9, 0))),
577            (Seg(1, 4), Seg(2, 3), SegmentsCmp { common: Segments::new(Seg(2, 3)), internal: Segments(btreeset![Seg(1, 1), Seg(4, 4)]), external: Segments::empty() }),
578            (Seg(1, 2), Seg(1, 3), new_cmp(Seg(1, 2), Seg(9, 0), Seg(3, 3))),
579            (Seg(1, 2), Seg(1, 2), new_cmp(Seg(1, 2), Seg(9, 0), Seg(9, 0))),
580            (Seg(1, 3), Seg(1, 2), new_cmp(Seg(1, 2), Seg(3, 3), Seg(9, 0))),
581            (Seg(2, 3), Seg(1, 4), SegmentsCmp { common: Segments::new(Seg(2, 3)), internal: Segments::empty(), external: Segments(btreeset![Seg(1, 1), Seg(4, 4)]) }),
582            (Seg(2, 3), Seg(1, 3), new_cmp(Seg(2, 3), Seg(9, 0), Seg(1, 1))),
583            (Seg(2, 4), Seg(1, 3), new_cmp(Seg(2, 3), Seg(4, 4), Seg(1, 1))),
584            (Seg(2, 3), Seg(1, 2), new_cmp(Seg(2, 2), Seg(3, 3), Seg(1, 1))),
585            (Seg(3, 4), Seg(1, 2), new_cmp(Seg(9, 0), Seg(3, 4), Seg(1, 2))),
586        ]
587    }
588
589    #[test]
590    fn segs_segment_intersect() {
591        let tests = build_segments();
592        for (idx, (ab, cd, expected_cmp)) in tests.into_iter().enumerate() {
593            let cmp = Segments::segment_intersect(ab, cd);
594            assert_eq!(cmp, expected_cmp, "test {idx} failed");
595        }
596    }
597
598    #[test]
599    fn segs_intersect() {
600        for scale in [10, 4] {
601            let iv = build_segments();
602            let mut ab = Segments::empty();
603            let mut cd = Segments::empty();
604            let mut expected_cmp = SegmentsCmp::empty();
605            for (idx, (Seg(a, b), Seg(c, d), cmp)) in iv.into_iter().index::<u32>() {
606                let offset = scale * idx;
607                ab.insert(Seg(a + offset, b + offset));
608                cd.insert(Seg(c + offset, d + offset));
609                expected_cmp.common.extend(cmp.common.iter().map(|Seg(a, b)| Seg(*a + offset, *b + offset)));
610                expected_cmp.internal.extend(cmp.internal.iter().map(|Seg(a, b)| Seg(*a + offset, *b + offset)));
611                expected_cmp.external.extend(cmp.external.iter().map(|Seg(a, b)| Seg(*a + offset, *b + offset)));
612            }
613            let msg = format!("test failed for scale {scale}");
614            let cmp = ab.intersect(&cd);
615            assert_eq!(cmp, expected_cmp, "{}", msg);
616            let cmp = cd.intersect(&ab);
617            assert_eq!(cmp, expected_cmp.clone().inverse(), "{}", msg);
618            let cmp = ab.intersect(&Segments::empty());
619            assert_eq!(cmp, SegmentsCmp { common: Segments::empty(), internal: ab.clone(), external: Segments::empty() }, "{}", msg);
620            let cmp = Segments::empty().intersect(&ab);
621            assert_eq!(cmp, SegmentsCmp { common: Segments::empty(), internal: Segments::empty(), external: ab.clone() }, "{}", msg);
622            ab.normalize();
623            cd.normalize();
624            expected_cmp.normalize();
625            let cmp = ab.intersect(&cd);
626            assert_eq!(cmp, expected_cmp);
627            let cmp = cd.intersect(&ab);
628            assert_eq!(cmp, expected_cmp.inverse());
629        }
630    }
631
632    #[test]
633    fn segs_intersect_corner() {
634        let tests: Vec<(Segments, Segments, (Segments, Segments, Segments))> = vec![
635            (segments![1 - 50], segments![10 - 20, 30 - 40],
636             (segments![10-20, 30-40], segments![1-9, 21-29, 41-50], segments![])),
637            (segments![1-10, 11-15, 16-20, 21-35, 36-37, 38-50], segments![10-20, 30-40],
638             (segments![10-20, 30-40], segments![1-9, 21-29, 41-50], segments![])),
639            (segments![0-9], segments![0-0, 1-9],
640             (segments![0-9], segments![], segments![])),
641            (segments![], segments![],
642             (segments![], segments![], segments![])),
643        ];
644        const VERBOSE: bool = false;
645        for (idx, (ab, cd, expected_cmp)) in tests.into_iter().enumerate() {
646            let expected_cmp = SegmentsCmp { common: expected_cmp.0, internal: expected_cmp.1, external: expected_cmp.2 };
647            let mut cmp = ab.intersect(&cd);
648            if VERBOSE { println!("{ab:x} # {cd:x} = com: {:x}, int: {:x}, ext: {:x}", cmp.common, cmp.internal, cmp.external); }
649            cmp.normalize();
650            if VERBOSE { println!("  normalized: com: {:x}, int: {:x}, ext: {:x}", cmp.common, cmp.internal, cmp.external); }
651            assert_eq!(cmp, expected_cmp, "test {idx} failed");
652            let mut cmp = cd.intersect(&ab);
653            cmp.normalize();
654            assert_eq!(cmp, expected_cmp.inverse(), "test {idx} failed");
655        }
656    }
657
658    #[test]
659    fn segs_partition() {
660        let tests: Vec<(Segments, Segments, Segments)> = vec![
661            (segments![1-4], segments![3-6], segments![1-2, 3-4, 5-6]),
662            (segments![1-4], segments![5-6], segments![1-4, 5-6]),
663            (segments![1-6], segments![3-4], segments![1-2, 3-4, 5-6]),
664            (segments![1-4, 5-10], segments![], segments![1-4, 5-10]),
665            (segments![], segments![1-4, 5-10], segments![1-4, 5-10]),
666            (segments![1-4, 5-10], segments![3-5], segments![1-2, 3-4, 5-5, 6-10]),
667            (segments![10-15, 20-25], segments![1-100], segments![1-9, 10-15, 16-19, 20-25, 26-100]),
668        ];
669        for (idx, (mut ab, cd, exp)) in tests.into_iter().enumerate() {
670            ab.add_partition(&cd);
671            let expected = exp;
672            assert_eq!(ab, expected, "test {idx} failed: {ab:x} instead of {expected:x}");
673        }
674    }
675
676    #[test]
677    fn segs_slice_partition() {
678        let tests: Vec<(Segments, Segments, Segments)> = vec![
679            (segments![1 - 50], segments![10 - 20, 30 - 40],
680             segments![1-9, 10-20, 21-29, 30-40, 41-50]),
681            (segments![10 - 20, 30 - 40], segments![1 - 50],
682             segments![10-20, 30-40]),
683            (segments![1-10, 11-15, 16-20, 21-35, 36-37, 38-50], segments![10-20, 30-40],
684             segments![1-9, 10, 11-15, 16-20, 21-29, 30-35, 36-37, 38-40, 41-50]),
685            (segments![0-9], segments![0-0, 1-9],
686             segments![0, 1-9]),
687            (segments![1-10, 30-40], segments![11-20, 25-29, 41-100],
688             segments![1-10, 30-40]),
689            (segments![], segments![],
690             segments![]),
691        ];
692        const VERBOSE: bool = false;
693        for (idx, (mut ab, cd, expected_part)) in tests.into_iter().enumerate() {
694            if VERBOSE { print!("{ab:x} # {cd:x} => "); }
695            ab.slice_partitions(&cd);
696            if VERBOSE { println!("{ab:x}"); }
697            assert_eq!(ab, expected_part, "test {idx} failed");
698        }
699    }
700
701    #[test]
702    fn segs_chars() {
703        let tests = vec![
704            (segments!['a'-'a'], "a"),
705            (segments!['a'-'d'], "abcd"),
706            (segments!['a'-'c', 'x'-'z'], "abcxyz"),
707            (segments!['a'-'b', 'd'-'d', 'f'-'f', 'x'-'z'], "abdfxyz"),
708        ];
709        for (idx, (segments, expected)) in tests.into_iter().enumerate() {
710            let result = segments.chars().collect::<String>();
711            assert_eq!(result, expected, "test {idx} failed");
712        }
713    }
714
715    #[test]
716    fn segs_insert_utf8() {
717        let tests = vec![
718            (0, UTF8_MAX,                    segments![DOT]),
719            (32, UTF8_GAP_MIN + 2,           segments![32-LOW_MAX]),
720            (64, UTF8_GAP_MAX,               segments![64-LOW_MAX]),
721            (96, UTF8_GAP_MAX + 1,           segments![96-LOW_MAX, HIGH_MIN]),
722            (UTF8_GAP_MIN, UTF8_GAP_MAX,     segments![]),
723            (UTF8_GAP_MIN, UTF8_GAP_MAX + 1, segments![HIGH_MIN]),
724        ];
725        for (test_id, (a, b, expected)) in tests.into_iter().enumerate() {
726            let mut result = Segments::empty();
727            result.insert_utf8(a, b);
728            assert_eq!(result, expected, "test {test_id} failed");
729        }
730    }
731
732    #[test]
733    fn segs_not() {
734        let tests = vec![
735            (segments![DOT],                        segments![]),
736            (segments![],                           segments![DOT]),
737            (segments![0],                          segments![1-LOW_MAX, HIGH_MIN-MAX]),
738            (segments![0-MAX],                      segments![]),
739            (segments![1-0xd700],                   segments![0-0, 0xd701-LOW_MAX, HIGH_MIN-MAX]),
740            (segments![2-0xd7fe],                   segments![0-1, 0xd7ff, HIGH_MIN-MAX]),
741            (segments![3-LOW_MAX],                  segments![0-2, HIGH_MIN-MAX]),
742            (segments![4-0xdfff],                   segments![0-3, HIGH_MIN-MAX]),
743            (segments![5-HIGH_MIN],                 segments![0-4, 0xe001-MAX]),
744            (segments![0-6, LOW_MAX-HIGH_MIN, MAX], segments![7-0xd7fe, 0xe001-0x10fffe]),
745            (segments![0-7, GAP_MIN-GAP_MAX],       segments![8-LOW_MAX, HIGH_MIN-MAX]),
746            (segments![0-8, GAP_MAX-0xe001],        segments![9-LOW_MAX, 0xe002-MAX]),
747            (segments![0-9, HIGH_MIN-MAX],          segments![10-LOW_MAX]),
748        ];
749        for (test_id, (segments, expected)) in tests.into_iter().enumerate() {
750            let result = segments.not();
751            assert_eq!(result.normalized(), expected.normalized(), "test {test_id} failed");
752        }
753    }
754
755    #[test]
756    fn macro_segments() {
757        assert_eq!(seg!('a'-'z'), Seg('a' as u32, 'z' as u32));
758        assert_eq!(seg!('a'), Seg('a' as u32, 'a' as u32));
759        assert_eq!(segments!('a'-'z'), Segments::from(('a', 'z')));
760        assert_eq!(segments!('a'), Segments::from('a'));
761        assert_eq!(segments!('a'-'z', '0'-'9'), Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)]));
762        assert_eq!(segments!('a'-'z', '0'-'9', '-'), Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32), Seg('-' as u32, '-' as u32)]));
763        assert_eq!(segments!(~ '0'-'9', '.'), Segments::from([Seg('0' as u32, '9' as u32), Seg('.' as u32, '.' as u32)]).not());
764        assert_eq!(segments!(0 - LOW_MAX, HIGH_MIN - MAX), Segments::dot());
765        assert_eq!(segments!(~ 0 - LOW_MAX, HIGH_MIN - MAX), Segments::empty());
766        assert_eq!(segments!(DOT), Segments::dot());
767        assert_eq!(segments!(~ DOT), Segments::empty());
768    }
769
770    #[test]
771    fn macro_branch() {
772        let transitions = btreemap![
773            0 => branch!['a'-'c' => 0],
774            1 => branch!['a'-'c', '0'-'2' => 0],
775            2 => branch!['a'-'c', '.' => 0],
776            3 => branch!['a'-'c', '.' => 0, 'd'-'f' => 1],
777            4 => branch![['a'-'c', '.'] => 0, ['d'-'f'] => 1],
778            5 => branch![['a'-'c', '.'] => 0, ~['a'-'c', '.'] => 1],
779            6 => branch![0 - LOW_MAX, HIGH_MIN - MAX => 0],
780            7 => branch!['a' => 0, DOT => 1],
781        ];
782        assert_eq!(transitions,
783            btreemap![
784                0 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32)]) => 0],
785                1 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('0' as u32, '2' as u32)]) => 0],
786                2 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0],
787                3 => btreemap![
788                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
789                    Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
790                4 => btreemap![
791                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
792                    Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
793                5 => btreemap![
794                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
795                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]).not() => 1],
796                6 => btreemap![Segments::from([Seg(0_u32, 0xd7ff_u32), Seg(0xe000_u32, 0x10ffff_u32)]) => 0],
797                7 => btreemap![Segments::from([Seg('a' as u32, 'a' as u32)]) => 0, Segments::dot() => 1]
798            ]);
799    }
800}