lexigram_lib/
segments.rs

1// Copyright (c) 2025 Redglyph (@gmail.com). All Rights Reserved.
2
3use std::collections::BTreeSet;
4use std::fmt::{Debug, Display, Formatter, LowerHex, UpperHex};
5use std::ops::{Add, Deref, DerefMut, RangeInclusive};
6use std::iter::Sum;
7use crate::CollectJoin;
8use crate::btreeset;
9use crate::char_reader::{escape_char, UTF8_GAP_MAX, UTF8_GAP_MIN, UTF8_MAX};
10use crate::segmap::Seg;
11// ---------------------------------------------------------------------------------------------
12// Segments
13
14#[derive(Clone, PartialEq, Default, PartialOrd, Eq, Ord)]
15pub struct Segments(BTreeSet<Seg>);
16
17impl Segments {
18
19    #[inline]
20    pub fn empty() -> Self {    // TODO: rename to `new()`
21        Segments(btreeset![])
22    }
23
24    pub fn new(Seg(a, b): Seg) -> Self {
25        if a <= b {
26            Segments(btreeset![Seg(a, b)])
27        } else {
28            Self::empty()
29        }
30    }
31
32    #[inline]
33    pub fn is_dot(&self) -> bool {
34        self.len() == 2 && self.first().unwrap() == &Seg::DOT_LOW && self.last().unwrap() == &Seg::DOT_HIGH
35    }
36
37    #[inline]
38    pub fn dot() -> Segments {
39        Segments(BTreeSet::from([Seg::DOT_LOW, Seg::DOT_HIGH]))
40    }
41
42    pub fn insert(&mut self, seg: Seg) {
43        if seg.0 <= seg.1 {
44            self.0.insert(seg);
45        }
46    }
47
48    pub fn to_char(&self) -> Option<char> {
49        if self.len() == 1 {
50            let first = self.first().unwrap();
51            if first.0 == first.1 {
52                return char::from_u32(first.0)
53            }
54        }
55        None
56    }
57
58    pub fn intersect_char(&self, char: char) -> SegmentsCmp {
59        let c = char as u32;
60        let ci = Segments(btreeset![Seg(c, c)]);
61        for &Seg(a, b) in &self.0 {
62            if a <= c && c <= b {
63                let mut inside = self.clone();
64                inside.replace(Seg(a, b)).expect("cannot extract original data");
65                if a < c {
66                    inside.insert(Seg(a, c - 1));
67                }
68                if c < b {
69                    inside.insert(Seg(c + 1, b));
70                }
71                return SegmentsCmp { common: ci, internal: inside, external: Self::empty() };
72            }
73        }
74        SegmentsCmp {
75            common: Self::empty(),
76            internal: self.clone(),
77            external: Self::empty(),
78        }
79    }
80
81    // (a, b) inter (c, d) => (common, internal a-b, external a-b)
82    // only processes a <= c || (a == c && b <= d)
83    pub fn segment_intersect(Seg(a, b): Seg, Seg(c, d): Seg) -> SegmentsCmp {
84        if a < c || (a == c && b <= d) {
85            if a < c {
86                if b < c {
87                    SegmentsCmp { common: Segments::empty(), internal: Segments::new(Seg(a, b)), external: Segments::new(Seg(c, d)) }
88                } else if b <= d {
89                    SegmentsCmp { common: Segments::new(Seg(c, b)), internal: Segments::new(Seg(a, c - 1)), external: Segments::new(Seg(b + 1, d)) }
90                } else {
91                    SegmentsCmp { common: Segments::new(Seg(c, d)), internal: Segments(btreeset![Seg(a, c - 1), Seg(d + 1, b)]), external: Segments::empty() }
92                }
93            } else {
94                SegmentsCmp { common: Segments::new(Seg(a, b)), internal: Segments::empty(), external: Segments::new(Seg(b + 1, d)) }
95            }
96        } else {
97            Self::segment_intersect(Seg(c, d), Seg(a, b)).inverse()
98        }
99    }
100
101    pub fn intersect(&self, other: &Self) -> SegmentsCmp {
102        let mut ab_iter = self.iter();
103        let mut cd_iter = other.iter();
104        let mut ab = ab_iter.next().cloned();
105        let mut cd = cd_iter.next().cloned();
106        let mut result = SegmentsCmp::empty();
107        while let (Some(new_ab), Some(new_cd)) = (ab, cd) {
108            let mut cmp = Self::segment_intersect(new_ab, new_cd);
109            if cmp.common.is_empty() {
110                if new_ab.1 < new_cd.0 {
111                    result.internal.insert(new_ab);
112                    ab = ab_iter.next().cloned();
113                } else {
114                    result.external.insert(new_cd);
115                    cd = cd_iter.next().cloned();
116                }
117            } else {
118                if new_ab.1 > new_cd.1 { // processes the trailing segment
119                    ab = cmp.internal.pop_last();
120                } else {
121                    ab = ab_iter.next().cloned();
122                }
123                if new_cd.1 > new_ab.1 {
124                    cd = cmp.external.pop_last();
125                } else {
126                    cd = cd_iter.next().cloned();
127                }
128                result.extend(&cmp);
129            }
130        }
131        if let Some(ab) = ab {
132            result.internal.insert(ab);
133            result.internal.extend(ab_iter);
134        } else if let Some(cd) = cd {
135            result.external.insert(cd);
136            result.external.extend(cd_iter);
137        }
138        result
139    }
140
141    /// Partitions the segments in fonction of `other`'s segments, splitting the current segments
142    /// according to `other` and adding segments from `other`. Can be used iteratively on a collection
143    /// of Segments to obtain a partition of their segments.
144    ///
145    /// Returns `true` if the segments were modified.
146    ///
147    /// Example:
148    /// ```
149    /// use lexigram_lib::segments::Segments;
150    /// use lexigram_lib::segmap::Seg;
151    ///
152    /// let mut a = Segments::from([Seg(0, 10), Seg(20, 30)]);
153    /// let b = Segments::from([Seg(5, 6), Seg(15, 25)]);
154    /// assert!(a.add_partition(&b));
155    /// assert_eq!(a.into_iter().collect::<Vec<_>>(), vec![Seg(0, 4), Seg(5, 6), Seg(7, 10), Seg(15, 19), Seg(20, 25), Seg(26, 30)]);
156    /// ```
157    pub fn add_partition(&mut self, other: &Self) -> bool {
158        let cmp = self.intersect(other);
159        if !(cmp.common.is_empty() && cmp.external.is_empty()) {
160            self.clear();
161            self.extend(cmp.internal.0);
162            self.extend(cmp.common.0);
163            self.extend(cmp.external.0);
164            true
165        } else {
166            false
167        }
168    }
169
170    /// Slices the segments to match other's partition, but without merging self's initial partition.
171    /// ```
172    /// # use lexigram_lib::segments::Segments;
173    /// # use lexigram_lib::segmap::Seg;
174    /// let mut ab = Segments::from([Seg(1 as u32, 50 as u32)]);
175    /// let cd = Segments::from([Seg(10 as u32, 20 as u32), Seg(30 as u32, 40 as u32)]);
176    /// ab.slice_partitions(&cd);
177    /// assert_eq!(ab, Segments::from([
178    ///     Seg(1 as u32, 9 as u32), Seg(10 as u32, 20 as u32), Seg(21 as u32, 29 as u32),
179    ///     Seg(30 as u32, 40 as u32), Seg(41 as u32, 50 as u32)]));
180    /// ```
181    pub fn slice_partitions(&mut self, other: &Self) {
182        let cmp = self.intersect(other);
183        self.clear();
184        self.extend(cmp.internal.0);
185        self.extend(cmp.common.0);
186    }
187
188    pub fn normalize(&mut self) {
189        if !self.is_empty() {
190            let mut new = BTreeSet::<Seg>::new();
191            let mut segments = std::mem::take(&mut self.0).into_iter();
192            let mut last = segments.next().unwrap();
193            while let Some(Seg(a, b)) = segments.next() {
194                if a > last.1 + 1 {
195                    new.insert(last);
196                    last = Seg(a, b);
197                } else {
198                    last.1 = b;
199                }
200            }
201            new.insert(last);
202            self.0 = new;
203        }
204    }
205
206    pub fn normalized(&self) -> Self {
207        let mut n = self.clone();
208        n.normalize();
209        n
210    }
211
212    pub fn chars(&self) -> ReTypeCharIter {
213        ReTypeCharIter { segments: Some(self.0.clone()), range: None }
214    }
215
216    /// Inserts Seg(start, stop) in the current segment, except the UTF-8 gap between
217    /// UTF8_GAP_MIN (0xd800) and UTF8_GAP_MAX (0xdfff). If a part or the entirety of
218    /// that gap is within [start-stop], then it's extruded first.
219    pub fn insert_utf8(&mut self, start: u32, stop: u32) {
220        if start <= stop {
221            if stop < UTF8_GAP_MIN || start > UTF8_GAP_MAX {
222                self.0.insert(Seg(start, stop));
223            } else {
224                if start < UTF8_GAP_MIN {
225                    self.0.insert(Seg(start, UTF8_GAP_MIN - 1));
226                }
227                if stop > UTF8_GAP_MAX {
228                    self.0.insert(Seg(UTF8_GAP_MAX + 1, stop));
229                }
230            }
231        }
232    }
233
234    /// Negates the selection, except the UTF-8 gap between UTF8_GAP_MIN (0xd800) and
235    /// UTF8_GAP_MAX (0xdfff), which is always excluded.
236    pub fn not(&self) -> Self {
237        let mut inv = Segments::empty();
238        let mut start = 0;
239        for seg in &self.0 {
240            if seg.0 > start {
241                inv.insert_utf8(start, seg.0 - 1);
242            }
243            start = seg.1 + 1;
244        }
245        if start < UTF8_MAX {
246            inv.insert_utf8(start, UTF8_MAX);
247        }
248        inv
249    }
250
251    pub fn from_iter<T: IntoIterator<Item = Seg>>(segs: T) -> Self {
252        Segments(BTreeSet::from_iter(segs))
253    }
254
255}
256
257impl IntoIterator for Segments {
258    type Item = Seg;
259    type IntoIter = <BTreeSet<Seg> as IntoIterator>::IntoIter;
260
261    fn into_iter(self) -> Self::IntoIter {
262        self.0.into_iter()
263    }
264}
265
266impl<'a> IntoIterator for &'a Segments {
267    type Item = &'a Seg;
268    type IntoIter = <&'a BTreeSet<Seg> as IntoIterator>::IntoIter;
269
270    fn into_iter(self) -> Self::IntoIter {
271        self.0.iter()
272    }
273}
274
275impl<const N: usize> From<[Seg; N]> for Segments {
276    /// Converts a `[Seg; N]` into a `Segments`.
277    ///
278    /// ```
279    /// # use lexigram_lib::segments::Segments;
280    /// # use lexigram_lib::segmap::Seg;
281    /// let set1 = Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)]);
282    /// ```
283    fn from(arr: [Seg; N]) -> Self {
284        Segments(BTreeSet::from(arr))
285    }
286}
287
288impl From<char> for Segments {
289    fn from(c: char) -> Self {
290        Segments(btreeset![Seg(c as u32, c as u32)])
291    }
292}
293
294impl From<(char, char)> for Segments {
295    fn from((first, last): (char, char)) -> Self {
296        Segments(btreeset![Seg(first as u32, last as u32)])
297    }
298}
299
300impl Deref for Segments {
301    type Target = BTreeSet<Seg>;
302
303    fn deref(&self) -> &Self::Target {
304        &self.0
305    }
306}
307
308impl DerefMut for Segments {
309    fn deref_mut(&mut self) -> &mut Self::Target {
310        &mut self.0
311    }
312}
313
314impl Debug for Segments {
315    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
316        write!(f, "Segments({})", self.0.iter().map(|seg| format!("Seg(0x{:x}, 0x{:x})", seg.0, seg.1)).join(", "))
317    }
318}
319
320impl Display for Segments { // TODO: create wrapper to set the desired style (no bracket / bracket, no neg / neg, normalized / not)
321    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
322        if let Some(c) = self.to_char() {
323            write!(f, "'{}'", escape_char(c))
324        } else {
325            let normalized = self.normalized();
326            if normalized.is_dot() {
327                write!(f, "DOT")
328            } else {
329                if normalized.len() > 1 {
330                    let alt = normalized.not();
331                    if alt.len() < normalized.len() {
332                        return write!(f, "~[{}]", alt.0.iter()
333                            .map(|seg| seg.to_string())
334                            .join(", ")
335                        );
336                    }
337                }
338                write!(f, "{}", normalized.0.iter()
339                    .map(|seg| seg.to_string())
340                    .join(", ")
341                )
342            }
343        }
344    }
345}
346
347/// "{:x}" is used to show the raw segments with codes
348impl LowerHex for Segments {
349    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
350        write!(f, "{}", self.iter()
351            .map(|Seg(a, b)| if a == b { format!("{a}") } else { format!("{a}-{b}") })
352            .join(", ")
353        )
354    }
355}
356
357/// "{:X}" is used to show the raw segments with characters
358impl UpperHex for Segments {
359    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
360        write!(f, "{}", self.iter()
361            .map(|Seg(a, b)| if a == b {
362                format!("'{}'", escape_char(char::from_u32(*a).unwrap()))
363            } else {
364                format!("'{}'-'{}'", escape_char(char::from_u32(*a).unwrap()), escape_char(char::from_u32(*b).unwrap()))
365            })
366            .join(", ")
367        )
368    }
369}
370
371#[derive(Debug, Clone, PartialEq)]
372pub struct SegmentsCmp {
373    pub common: Segments,      // common to self and other
374    pub internal: Segments,    // only in self, external to other
375    pub external: Segments     // external to self, only in other
376}
377
378impl SegmentsCmp {
379    pub fn empty() -> Self {
380        SegmentsCmp { common: Segments::empty(), internal: Segments::empty(), external: Segments::empty() }
381    }
382
383    pub fn inverse(self) -> Self {
384        SegmentsCmp { common: self.common, internal: self.external, external: self.internal }
385    }
386
387    pub fn extend(&mut self, other: &Self) {
388        self.common.extend(other.common.iter());
389        self.internal.extend(other.internal.iter());
390        self.external.extend(other.external.iter());
391    }
392
393    pub fn normalize(&mut self) {
394        self.common.normalize();
395        self.internal.normalize();
396        self.external.normalize();
397    }
398}
399
400impl Display for SegmentsCmp {
401    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
402        write!(f, "<common: {}, internal: {}, external: {}>", self.common, self.internal, self.external)
403    }
404}
405
406#[cfg(test)]
407/// "{:x}" is used to show the raw segments with codes
408impl LowerHex for SegmentsCmp {
409    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
410        write!(f, "<common: {:x}, internal: {:x}, external: {:x}>", self.common, self.internal, self.external)
411    }
412}
413
414pub struct ReTypeCharIter {
415    segments: Option<BTreeSet<Seg>>,
416    range: Option<RangeInclusive<u32>>
417}
418
419impl Iterator for ReTypeCharIter {
420    type Item = char;
421
422    fn next(&mut self) -> Option<Self::Item> {
423        let mut next = self.range.as_mut().and_then(|r| r.next());
424        if next.is_none() {
425            if let Some(segments) = &mut self.segments {
426                if let Some(Seg(a, b)) = segments.pop_first() {
427                    self.range = Some(a..=b);
428                    next = self.range.as_mut().and_then(|r| r.next());
429                }
430            }
431        }
432        next.map(|code| char::from_u32(code).unwrap())
433    }
434}
435
436impl Add for Segments {
437    type Output = Self;
438
439    fn add(mut self, rhs: Self) -> Self::Output {
440        self.add_partition(&rhs);
441        self
442    }
443}
444
445impl Sum for Segments {
446    fn sum<I: Iterator<Item=Self>>(mut iter: I) -> Self {
447        let mut acc = iter.next().unwrap_or(Segments::empty());
448        while let Some(next) = iter.next() {
449            acc.add_partition(&next);
450        }
451        acc
452    }
453}
454
455// ---------------------------------------------------------------------------------------------
456// Macros
457
458pub mod macros {
459    /// Generates a Segments initialization from Seg values. The macro only accepts literals, either characters or integers,
460    /// along with a few identifiers:
461    ///
462    /// - `DOT` matches all UTF-8 characters
463    /// - `MIN`      = 0
464    /// - `LOW_MAX`  = 0xd7ff
465    /// - `GAP_MIN`  = 0xd800 (GAP_MIN - GAP_MAX are forbidden UTF-8 codepoint values)
466    /// - `GAP_MAX`  = 0xdfff
467    /// - `HIGH_MIN` = 0xe000
468    /// - `MAX`      = 0x10ffff
469    ///
470    /// Integer values are UTF-8 codepoint values, not the 1-4 byte representation.
471    ///
472    /// # Example
473    /// ```
474    /// # use lexigram_lib::{segments, segments::Segments, segmap::Seg};
475    /// assert_eq!(segments!('a', '0'-'9'), Segments::from([Seg('a' as u32, 'a' as u32), Seg('0' as u32, '9' as u32)]));
476    /// assert_eq!(segments!(DOT), Segments::dot());
477    /// assert_eq!(segments!(~ '1'-'8'), segments![MIN-'0', '9'-LOW_MAX, HIGH_MIN-MAX]);
478    /// ```
479    #[macro_export]
480    macro_rules! segments {
481        () => { $crate::segments::Segments::empty() };
482        (DOT) => { $crate::segments::Segments::dot() };
483        ($($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+) => { $crate::segments::Segments::from([$($crate::seg!($($a1)?$($a2)? $(- $($b1)?$($b2)?)?)),+]) };
484        (~ $($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?),+) => { $crate::segments![$($($a1)?$($a2)? $(- $($b1)?$($b2)?)?),+].not() };
485        //
486        ($($($a1:literal)?$($a2:ident)? $(- $($b1:literal)?$($b2:ident)?)?,)+) => { $crate::segments![$($crate::seg!($($a1)?$($a2)? $(- $($b1)?$($b2)?)?)),+] };
487    }
488
489    /// Generates the key-value pairs corresponding to the `Segments => int` arguments, which can be
490    /// used to add values to `BTreeMap<Segments, StateId>` state transitions.
491    ///
492    /// All segments must be with square brackets or without them, but it's not allowed to mix both
493    /// formats in the same macro. Negation (`~`) can only be used with square brackets, and is placed
494    /// in front of the opening bracket.
495    ///
496    /// Segments are made up of any number of single character or codepoint literals, or inclusive
497    /// ranges of character / codepoint literals.
498    ///
499    /// A few identifiers can also be used:
500    /// - `DOT` matches all UTF-8 characters
501    /// - `MIN`      = 0
502    /// - `LOW_MAX`  = 0xd7ff
503    /// - `GAP_MIN`  = 0xd800 (GAP_MIN - GAP_MAX are forbidden UTF-8 codepoint values)
504    /// - `GAP_MAX`  = 0xdfff
505    /// - `HIGH_MIN` = 0xe000
506    /// - `MAX`      = 0x10ffff
507    ///
508    /// Integer values are UTF-8 codepoint values, not the 1-4 byte representation.
509    ///
510    /// # Example
511    /// ```
512    /// # use std::collections::{BTreeMap, BTreeSet};
513    /// # use lexigram_lib::{btreemap, segments, branch, segments::Segments};
514    /// # use lexigram_lib::segmap::Seg;
515    /// let transitions = btreemap![
516    ///     0 => branch!['a'-'c' => 0],
517    ///     1 => branch!['a'-'c', '0'-'2' => 0],
518    ///     2 => branch!['a'-'c', '.' => 0],
519    ///     3 => branch!['a'-'c', '.' => 0, 'd'-'f' => 1],
520    ///     4 => branch![['a'-'c', '.'] => 0, ['d'-'f'] => 1],
521    ///     5 => branch![['a'-'c', '.'] => 0, ~['a'-'c', '.'] => 1],
522    ///     6 => branch![0 - LOW_MAX, HIGH_MIN - MAX => 0],
523    ///     7 => branch!['a' => 0, DOT => 1],
524    /// ];
525    /// assert_eq!(transitions,
526    ///     btreemap![
527    ///         0 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32)]) => 0],
528    ///         1 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('0' as u32, '2' as u32)]) => 0],
529    ///         2 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0],
530    ///         3 => btreemap![
531    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
532    ///             Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
533    ///         4 => btreemap![
534    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
535    ///             Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
536    ///         5 => btreemap![
537    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
538    ///             Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]).not() => 1],
539    ///         6 => btreemap![Segments::from([Seg(0_u32, 0xd7ff_u32), Seg(0xe000_u32, 0x10ffff_u32)]) => 0],
540    ///         7 => btreemap![Segments::from([Seg('a' as u32, 'a' as u32)]) => 0, Segments::dot() => 1]
541    ///     ]);
542    /// ```
543    #[macro_export]
544    macro_rules! branch {
545        // doesn't work, so we can't mix [] and non-[] segments:
546        // ($( $($($($a1:literal)?$($a2:ident)? $(-$($b1:literal)?$($b2:ident)?)?),+)? $(~[$($($c1:literal)?$($c2:ident)? $(-$($d1:literal)?$($d2:ident)?)?),+])? => $value:expr ),*)
547        // => { btreemap![$($(segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+])? $(segments![~ $($($c1)?$($c2)?$(- $($d1)?$($d2)?)?),+])? => $value),*] };
548
549        ($( $($($a1:literal)?$($a2:ident)? $(-$($b1:literal)?$($b2:ident)?)?),+ => $value:expr ),*)
550        => { btreemap![$($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+] => $value),*] };
551        ($( $([$($($a1:literal)?$($a2:ident)? $(-$($b1:literal)?$($b2:ident)?)?),+])? $(~[$($($c1:literal)?$($c2:ident)? $(-$($d1:literal)?$($d2:ident)?)?),+])? => $value:expr ),*)
552        => { btreemap![$($($crate::segments![$($($a1)?$($a2)?$(- $($b1)?$($b2)?)?),+])? $($crate::segments![~ $($($c1)?$($c2)?$(- $($d1)?$($d2)?)?),+])? => $value),*] };
553    }
554}
555
556// ---------------------------------------------------------------------------------------------
557// Tests
558// ---------------------------------------------------------------------------------------------
559
560#[cfg(test)]
561mod tests {
562    use iter_index::IndexerIterator;
563    use crate::{seg, branch, btreemap, segments};
564    use super::*;
565
566    fn new_cmp(c: Seg, i: Seg, e: Seg) -> SegmentsCmp {
567        SegmentsCmp { common: Segments::new(c), internal: Segments::new(i), external: Segments::new(e) }
568    }
569
570    fn build_segments() -> Vec<(Seg, Seg, SegmentsCmp)> {
571        vec![
572            (Seg(1, 2), Seg(3, 4), new_cmp(Seg(9, 0), Seg(1, 2), Seg(3, 4))),
573            (Seg(1, 2), Seg(2, 3), new_cmp(Seg(2, 2), Seg(1, 1), Seg(3, 3))),
574            (Seg(1, 3), Seg(2, 4), new_cmp(Seg(2, 3), Seg(1, 1), Seg(4, 4))),
575            (Seg(1, 3), Seg(2, 3), new_cmp(Seg(2, 3), Seg(1, 1), Seg(9, 0))),
576            (Seg(1, 4), Seg(2, 3), SegmentsCmp { common: Segments::new(Seg(2, 3)), internal: Segments(btreeset![Seg(1, 1), Seg(4, 4)]), external: Segments::empty() }),
577            (Seg(1, 2), Seg(1, 3), new_cmp(Seg(1, 2), Seg(9, 0), Seg(3, 3))),
578            (Seg(1, 2), Seg(1, 2), new_cmp(Seg(1, 2), Seg(9, 0), Seg(9, 0))),
579            (Seg(1, 3), Seg(1, 2), new_cmp(Seg(1, 2), Seg(3, 3), Seg(9, 0))),
580            (Seg(2, 3), Seg(1, 4), SegmentsCmp { common: Segments::new(Seg(2, 3)), internal: Segments::empty(), external: Segments(btreeset![Seg(1, 1), Seg(4, 4)]) }),
581            (Seg(2, 3), Seg(1, 3), new_cmp(Seg(2, 3), Seg(9, 0), Seg(1, 1))),
582            (Seg(2, 4), Seg(1, 3), new_cmp(Seg(2, 3), Seg(4, 4), Seg(1, 1))),
583            (Seg(2, 3), Seg(1, 2), new_cmp(Seg(2, 2), Seg(3, 3), Seg(1, 1))),
584            (Seg(3, 4), Seg(1, 2), new_cmp(Seg(9, 0), Seg(3, 4), Seg(1, 2))),
585        ]
586    }
587
588    #[test]
589    fn segs_segment_intersect() {
590        let tests = build_segments();
591        for (idx, (ab, cd, expected_cmp)) in tests.into_iter().enumerate() {
592            let cmp = Segments::segment_intersect(ab, cd);
593            assert_eq!(cmp, expected_cmp, "test {idx} failed");
594        }
595    }
596
597    #[test]
598    fn segs_intersect() {
599        for scale in [10, 4] {
600            let iv = build_segments();
601            let mut ab = Segments::empty();
602            let mut cd = Segments::empty();
603            let mut expected_cmp = SegmentsCmp::empty();
604            for (idx, (Seg(a, b), Seg(c, d), cmp)) in iv.into_iter().index::<u32>() {
605                let offset = scale * idx;
606                ab.insert(Seg(a + offset, b + offset));
607                cd.insert(Seg(c + offset, d + offset));
608                expected_cmp.common.extend(cmp.common.iter().map(|Seg(a, b)| Seg(*a + offset, *b + offset)));
609                expected_cmp.internal.extend(cmp.internal.iter().map(|Seg(a, b)| Seg(*a + offset, *b + offset)));
610                expected_cmp.external.extend(cmp.external.iter().map(|Seg(a, b)| Seg(*a + offset, *b + offset)));
611            }
612            let msg = format!("test failed for scale {scale}");
613            let cmp = ab.intersect(&cd);
614            assert_eq!(cmp, expected_cmp, "{}", msg);
615            let cmp = cd.intersect(&ab);
616            assert_eq!(cmp, expected_cmp.clone().inverse(), "{}", msg);
617            let cmp = ab.intersect(&Segments::empty());
618            assert_eq!(cmp, SegmentsCmp { common: Segments::empty(), internal: ab.clone(), external: Segments::empty() }, "{}", msg);
619            let cmp = Segments::empty().intersect(&ab);
620            assert_eq!(cmp, SegmentsCmp { common: Segments::empty(), internal: Segments::empty(), external: ab.clone() }, "{}", msg);
621            ab.normalize();
622            cd.normalize();
623            expected_cmp.normalize();
624            let cmp = ab.intersect(&cd);
625            assert_eq!(cmp, expected_cmp);
626            let cmp = cd.intersect(&ab);
627            assert_eq!(cmp, expected_cmp.inverse());
628        }
629    }
630
631    #[test]
632    fn segs_intersect_corner() {
633        let tests: Vec<(Segments, Segments, (Segments, Segments, Segments))> = vec![
634            (segments![1 - 50], segments![10 - 20, 30 - 40],
635             (segments![10-20, 30-40], segments![1-9, 21-29, 41-50], segments![])),
636            (segments![1-10, 11-15, 16-20, 21-35, 36-37, 38-50], segments![10-20, 30-40],
637             (segments![10-20, 30-40], segments![1-9, 21-29, 41-50], segments![])),
638            (segments![0-9], segments![0-0, 1-9],
639             (segments![0-9], segments![], segments![])),
640            (segments![], segments![],
641             (segments![], segments![], segments![])),
642        ];
643        const VERBOSE: bool = false;
644        for (idx, (ab, cd, expected_cmp)) in tests.into_iter().enumerate() {
645            let expected_cmp = SegmentsCmp { common: expected_cmp.0, internal: expected_cmp.1, external: expected_cmp.2 };
646            let mut cmp = ab.intersect(&cd);
647            if VERBOSE { println!("{ab:x} # {cd:x} = com: {:x}, int: {:x}, ext: {:x}", cmp.common, cmp.internal, cmp.external); }
648            cmp.normalize();
649            if VERBOSE { println!("  normalized: com: {:x}, int: {:x}, ext: {:x}", cmp.common, cmp.internal, cmp.external); }
650            assert_eq!(cmp, expected_cmp, "test {idx} failed");
651            let mut cmp = cd.intersect(&ab);
652            cmp.normalize();
653            assert_eq!(cmp, expected_cmp.inverse(), "test {idx} failed");
654        }
655    }
656
657    #[test]
658    fn segs_partition() {
659        let tests: Vec<(Segments, Segments, Segments)> = vec![
660            (segments![1-4], segments![3-6], segments![1-2, 3-4, 5-6]),
661            (segments![1-4], segments![5-6], segments![1-4, 5-6]),
662            (segments![1-6], segments![3-4], segments![1-2, 3-4, 5-6]),
663            (segments![1-4, 5-10], segments![], segments![1-4, 5-10]),
664            (segments![], segments![1-4, 5-10], segments![1-4, 5-10]),
665            (segments![1-4, 5-10], segments![3-5], segments![1-2, 3-4, 5-5, 6-10]),
666            (segments![10-15, 20-25], segments![1-100], segments![1-9, 10-15, 16-19, 20-25, 26-100]),
667        ];
668        for (idx, (mut ab, cd, exp)) in tests.into_iter().enumerate() {
669            ab.add_partition(&cd);
670            let expected = exp;
671            assert_eq!(ab, expected, "test {idx} failed: {ab:x} instead of {expected:x}");
672        }
673    }
674
675    #[test]
676    fn segs_slice_partition() {
677        let tests: Vec<(Segments, Segments, Segments)> = vec![
678            (segments![1 - 50], segments![10 - 20, 30 - 40],
679             segments![1-9, 10-20, 21-29, 30-40, 41-50]),
680            (segments![10 - 20, 30 - 40], segments![1 - 50],
681             segments![10-20, 30-40]),
682            (segments![1-10, 11-15, 16-20, 21-35, 36-37, 38-50], segments![10-20, 30-40],
683             segments![1-9, 10, 11-15, 16-20, 21-29, 30-35, 36-37, 38-40, 41-50]),
684            (segments![0-9], segments![0-0, 1-9],
685             segments![0, 1-9]),
686            (segments![1-10, 30-40], segments![11-20, 25-29, 41-100],
687             segments![1-10, 30-40]),
688            (segments![], segments![],
689             segments![]),
690        ];
691        const VERBOSE: bool = false;
692        for (idx, (mut ab, cd, expected_part)) in tests.into_iter().enumerate() {
693            if VERBOSE { print!("{ab:x} # {cd:x} => "); }
694            ab.slice_partitions(&cd);
695            if VERBOSE { println!("{ab:x}"); }
696            assert_eq!(ab, expected_part, "test {idx} failed");
697        }
698    }
699
700    #[test]
701    fn segs_chars() {
702        let tests = vec![
703            (segments!['a'-'a'], "a"),
704            (segments!['a'-'d'], "abcd"),
705            (segments!['a'-'c', 'x'-'z'], "abcxyz"),
706            (segments!['a'-'b', 'd'-'d', 'f'-'f', 'x'-'z'], "abdfxyz"),
707        ];
708        for (idx, (segments, expected)) in tests.into_iter().enumerate() {
709            let result = segments.chars().collect::<String>();
710            assert_eq!(result, expected, "test {idx} failed");
711        }
712    }
713
714    #[test]
715    fn segs_insert_utf8() {
716        let tests = vec![
717            (0, UTF8_MAX,                    segments![DOT]),
718            (32, UTF8_GAP_MIN + 2,           segments![32-LOW_MAX]),
719            (64, UTF8_GAP_MAX,               segments![64-LOW_MAX]),
720            (96, UTF8_GAP_MAX + 1,           segments![96-LOW_MAX, HIGH_MIN]),
721            (UTF8_GAP_MIN, UTF8_GAP_MAX,     segments![]),
722            (UTF8_GAP_MIN, UTF8_GAP_MAX + 1, segments![HIGH_MIN]),
723        ];
724        for (test_id, (a, b, expected)) in tests.into_iter().enumerate() {
725            let mut result = Segments::empty();
726            result.insert_utf8(a, b);
727            assert_eq!(result, expected, "test {test_id} failed");
728        }
729    }
730
731    #[test]
732    fn segs_not() {
733        let tests = vec![
734            (segments![DOT],                        segments![]),
735            (segments![],                           segments![DOT]),
736            (segments![0],                          segments![1-LOW_MAX, HIGH_MIN-MAX]),
737            (segments![0-MAX],                      segments![]),
738            (segments![1-0xd700],                   segments![0-0, 0xd701-LOW_MAX, HIGH_MIN-MAX]),
739            (segments![2-0xd7fe],                   segments![0-1, 0xd7ff, HIGH_MIN-MAX]),
740            (segments![3-LOW_MAX],                  segments![0-2, HIGH_MIN-MAX]),
741            (segments![4-0xdfff],                   segments![0-3, HIGH_MIN-MAX]),
742            (segments![5-HIGH_MIN],                 segments![0-4, 0xe001-MAX]),
743            (segments![0-6, LOW_MAX-HIGH_MIN, MAX], segments![7-0xd7fe, 0xe001-0x10fffe]),
744            (segments![0-7, GAP_MIN-GAP_MAX],       segments![8-LOW_MAX, HIGH_MIN-MAX]),
745            (segments![0-8, GAP_MAX-0xe001],        segments![9-LOW_MAX, 0xe002-MAX]),
746            (segments![0-9, HIGH_MIN-MAX],          segments![10-LOW_MAX]),
747        ];
748        for (test_id, (segments, expected)) in tests.into_iter().enumerate() {
749            let result = segments.not();
750            assert_eq!(result.normalized(), expected.normalized(), "test {test_id} failed");
751        }
752    }
753
754    #[test]
755    fn macro_segments() {
756        assert_eq!(seg!('a'-'z'), Seg('a' as u32, 'z' as u32));
757        assert_eq!(seg!('a'), Seg('a' as u32, 'a' as u32));
758        assert_eq!(segments!('a'-'z'), Segments::from(('a', 'z')));
759        assert_eq!(segments!('a'), Segments::from('a'));
760        assert_eq!(segments!('a'-'z', '0'-'9'), Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32)]));
761        assert_eq!(segments!('a'-'z', '0'-'9', '-'), Segments::from([Seg('a' as u32, 'z' as u32), Seg('0' as u32, '9' as u32), Seg('-' as u32, '-' as u32)]));
762        assert_eq!(segments!(~ '0'-'9', '.'), Segments::from([Seg('0' as u32, '9' as u32), Seg('.' as u32, '.' as u32)]).not());
763        assert_eq!(segments!(0 - LOW_MAX, HIGH_MIN - MAX), Segments::dot());
764        assert_eq!(segments!(~ 0 - LOW_MAX, HIGH_MIN - MAX), Segments::empty());
765        assert_eq!(segments!(DOT), Segments::dot());
766        assert_eq!(segments!(~ DOT), Segments::empty());
767    }
768
769    #[test]
770    fn macro_branch() {
771        let transitions = btreemap![
772            0 => branch!['a'-'c' => 0],
773            1 => branch!['a'-'c', '0'-'2' => 0],
774            2 => branch!['a'-'c', '.' => 0],
775            3 => branch!['a'-'c', '.' => 0, 'd'-'f' => 1],
776            4 => branch![['a'-'c', '.'] => 0, ['d'-'f'] => 1],
777            5 => branch![['a'-'c', '.'] => 0, ~['a'-'c', '.'] => 1],
778            6 => branch![0 - LOW_MAX, HIGH_MIN - MAX => 0],
779            7 => branch!['a' => 0, DOT => 1],
780        ];
781        assert_eq!(transitions,
782            btreemap![
783                0 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32)]) => 0],
784                1 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('0' as u32, '2' as u32)]) => 0],
785                2 => btreemap![Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0],
786                3 => btreemap![
787                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
788                    Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
789                4 => btreemap![
790                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
791                    Segments::from([Seg('d' as u32, 'f' as u32)]) => 1],
792                5 => btreemap![
793                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]) => 0,
794                    Segments::from([Seg('a' as u32, 'c' as u32), Seg('.' as u32, '.' as u32)]).not() => 1],
795                6 => btreemap![Segments::from([Seg(0_u32, 0xd7ff_u32), Seg(0xe000_u32, 0x10ffff_u32)]) => 0],
796                7 => btreemap![Segments::from([Seg('a' as u32, 'a' as u32)]) => 0, Segments::dot() => 1]
797            ]);
798    }
799}
lexigram_lib/segments.rs

lexigram_lib/
segments.rs