lapce_xi_rope/
multiset.rs

1// Copyright 2017 The xi-editor Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! A data structure for representing multi-subsets of sequences (typically strings).
16
17use std::cmp;
18
19// These two imports are for the `apply` method only.
20use crate::interval::Interval;
21use crate::tree::{Node, NodeInfo, TreeBuilder};
22use std::fmt;
23use std::slice;
24
25#[derive(Clone, PartialEq, Eq, Debug)]
26#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
27struct Segment {
28    len: usize,
29    count: usize,
30}
31
32/// Represents a multi-subset of a string, that is a subset where elements can
33/// be included multiple times. This is represented as each element of the
34/// string having a "count" which is the number of times that element is
35/// included in the set.
36///
37/// Internally, this is stored as a list of "segments" with a length and a count.
38#[derive(Clone, PartialEq, Eq)]
39#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
40pub struct Subset {
41    /// Invariant, maintained by `SubsetBuilder`: all `Segment`s have non-zero
42    /// length, and no `Segment` has the same count as the one before it.
43    segments: Vec<Segment>,
44}
45
46#[derive(Default)]
47pub struct SubsetBuilder {
48    segments: Vec<Segment>,
49    total_len: usize,
50}
51
52impl SubsetBuilder {
53    pub fn new() -> SubsetBuilder {
54        SubsetBuilder::default()
55    }
56
57    /// Intended for use with `add_range` to ensure the total length of the
58    /// `Subset` corresponds to the document length.
59    pub fn pad_to_len(&mut self, total_len: usize) {
60        if total_len > self.total_len {
61            let cur_len = self.total_len;
62            self.push_segment(total_len - cur_len, 0);
63        }
64    }
65
66    /// Sets the count for a given range. This method must be called with a
67    /// non-empty range with `begin` not before the largest range or segment added
68    /// so far. Gaps will be filled with a 0-count segment.
69    pub fn add_range(&mut self, begin: usize, end: usize, count: usize) {
70        assert!(begin >= self.total_len, "ranges must be added in non-decreasing order");
71        // assert!(begin < end, "ranges added must be non-empty: [{},{})", begin, end);
72        if begin >= end {
73            return;
74        }
75        let len = end - begin;
76        let cur_total_len = self.total_len;
77
78        // add 0-count segment to fill any gap
79        if begin > self.total_len {
80            self.push_segment(begin - cur_total_len, 0);
81        }
82
83        self.push_segment(len, count);
84    }
85
86    /// Assign `count` to the next `len` elements in the string.
87    /// Will panic if called with `len==0`.
88    pub fn push_segment(&mut self, len: usize, count: usize) {
89        assert!(len > 0, "can't push empty segment");
90        self.total_len += len;
91
92        // merge into previous segment if possible
93        if let Some(last) = self.segments.last_mut() {
94            if last.count == count {
95                last.len += len;
96                return;
97            }
98        }
99
100        self.segments.push(Segment { len, count });
101    }
102
103    pub fn build(self) -> Subset {
104        Subset { segments: self.segments }
105    }
106}
107
108/// Determines which elements of a `Subset` a method applies to
109/// based on the count of the element.
110#[derive(Clone, Copy, Debug)]
111pub enum CountMatcher {
112    Zero,
113    NonZero,
114    All,
115}
116
117impl CountMatcher {
118    fn matches(self, seg: &Segment) -> bool {
119        match self {
120            CountMatcher::Zero => (seg.count == 0),
121            CountMatcher::NonZero => (seg.count != 0),
122            CountMatcher::All => true,
123        }
124    }
125}
126
127impl Subset {
128    /// Creates an empty `Subset` of a string of length `len`
129    pub fn new(len: usize) -> Subset {
130        let mut sb = SubsetBuilder::new();
131        sb.pad_to_len(len);
132        sb.build()
133    }
134
135    /// Mostly for testing.
136    pub fn delete_from_string(&self, s: &str) -> String {
137        let mut result = String::new();
138        for (b, e) in self.range_iter(CountMatcher::Zero) {
139            result.push_str(&s[b..e]);
140        }
141        result
142    }
143
144    // Maybe Subset should be a pure data structure and this method should
145    // be a method of Node.
146    /// Builds a version of `s` with all the elements in this `Subset` deleted from it.
147    pub fn delete_from<N: NodeInfo>(&self, s: &Node<N>) -> Node<N> {
148        let mut b = TreeBuilder::new();
149        for (beg, end) in self.range_iter(CountMatcher::Zero) {
150            s.push_subseq(&mut b, Interval::new(beg, end));
151        }
152        b.build()
153    }
154
155    /// The length of the resulting sequence after deleting this subset. A
156    /// convenience alias for `self.count(CountMatcher::Zero)` to reduce
157    /// thinking about what that means in the cases where the length after
158    /// delete is what you want to know.
159    ///
160    /// `self.delete_from_string(s).len() = self.len(s.len())`
161    pub fn len_after_delete(&self) -> usize {
162        self.count(CountMatcher::Zero)
163    }
164
165    /// Count the total length of all the segments matching `matcher`.
166    pub fn count(&self, matcher: CountMatcher) -> usize {
167        self.segments.iter().filter(|seg| matcher.matches(seg)).map(|seg| seg.len).sum()
168    }
169
170    /// Convenience alias for `self.count(CountMatcher::All)`
171    pub fn len(&self) -> usize {
172        self.count(CountMatcher::All)
173    }
174
175    /// Determine whether the subset is empty.
176    /// In this case deleting it would do nothing.
177    pub fn is_empty(&self) -> bool {
178        (self.segments.is_empty()) || ((self.segments.len() == 1) && (self.segments[0].count == 0))
179    }
180
181    /// Compute the union of two subsets. The count of an element in the
182    /// result is the sum of the counts in the inputs.
183    pub fn union(&self, other: &Subset) -> Subset {
184        let mut sb = SubsetBuilder::new();
185        for zseg in self.zip(other) {
186            sb.push_segment(zseg.len, zseg.a_count + zseg.b_count);
187        }
188        sb.build()
189    }
190
191    /// Compute the difference of two subsets. The count of an element in the
192    /// result is the subtraction of the counts of other from self.
193    pub fn subtract(&self, other: &Subset) -> Subset {
194        let mut sb = SubsetBuilder::new();
195        for zseg in self.zip(other) {
196            assert!(
197                zseg.a_count >= zseg.b_count,
198                "can't subtract {} from {}",
199                zseg.a_count,
200                zseg.b_count
201            );
202            sb.push_segment(zseg.len, zseg.a_count - zseg.b_count);
203        }
204        sb.build()
205    }
206
207    /// Compute the bitwise xor of two subsets, useful as a reversible
208    /// difference. The count of an element in the result is the bitwise xor
209    /// of the counts of the inputs. Unchanged segments will be 0.
210    ///
211    /// This works like set symmetric difference when all counts are 0 or 1
212    /// but it extends nicely to the case of larger counts.
213    pub fn bitxor(&self, other: &Subset) -> Subset {
214        let mut sb = SubsetBuilder::new();
215        for zseg in self.zip(other) {
216            sb.push_segment(zseg.len, zseg.a_count ^ zseg.b_count);
217        }
218        sb.build()
219    }
220
221    /// Map the contents of `self` into the 0-regions of `other`.
222    /// Precondition: `self.count(CountMatcher::All) == other.count(CountMatcher::Zero)`
223    fn transform(&self, other: &Subset, union: bool) -> Subset {
224        let mut sb = SubsetBuilder::new();
225        let mut seg_iter = self.segments.iter();
226        let mut cur_seg = Segment { len: 0, count: 0 };
227        for oseg in &other.segments {
228            if oseg.count > 0 {
229                sb.push_segment(oseg.len, if union { oseg.count } else { 0 });
230            } else {
231                // fill 0-region with segments from self.
232                let mut to_be_consumed = oseg.len;
233                while to_be_consumed > 0 {
234                    if cur_seg.len == 0 {
235                        cur_seg = seg_iter
236                            .next()
237                            .expect("self must cover all 0-regions of other")
238                            .clone();
239                    }
240                    // consume as much of the segment as possible and necessary
241                    let to_consume = cmp::min(cur_seg.len, to_be_consumed);
242                    sb.push_segment(to_consume, cur_seg.count);
243                    to_be_consumed -= to_consume;
244                    cur_seg.len -= to_consume;
245                }
246            }
247        }
248        assert_eq!(cur_seg.len, 0, "the 0-regions of other must be the size of self");
249        assert_eq!(seg_iter.next(), None, "the 0-regions of other must be the size of self");
250        sb.build()
251    }
252
253    /// Transform through coordinate transform represented by other.
254    /// The equation satisfied is as follows:
255    ///
256    /// s1 = other.delete_from_string(s0)
257    ///
258    /// s2 = self.delete_from_string(s1)
259    ///
260    /// element in self.transform_expand(other).delete_from_string(s0) if (not in s1) or in s2
261    pub fn transform_expand(&self, other: &Subset) -> Subset {
262        self.transform(other, false)
263    }
264
265    /// The same as taking transform_expand and then unioning with `other`.
266    pub fn transform_union(&self, other: &Subset) -> Subset {
267        self.transform(other, true)
268    }
269
270    /// Transform subset through other coordinate transform, shrinking.
271    /// The following equation is satisfied:
272    ///
273    /// C = A.transform_expand(B)
274    ///
275    /// B.transform_shrink(C).delete_from_string(C.delete_from_string(s)) =
276    ///   A.delete_from_string(B.delete_from_string(s))
277    pub fn transform_shrink(&self, other: &Subset) -> Subset {
278        let mut sb = SubsetBuilder::new();
279        // discard ZipSegments where the shrinking set has positive count
280        for zseg in self.zip(other) {
281            // TODO: should this actually do something like subtract counts?
282            if zseg.b_count == 0 {
283                sb.push_segment(zseg.len, zseg.a_count);
284            }
285        }
286        sb.build()
287    }
288
289    /// Return an iterator over the ranges with a count matching the `matcher`.
290    /// These will often be easier to work with than raw segments.
291    pub fn range_iter(&self, matcher: CountMatcher) -> RangeIter {
292        RangeIter { seg_iter: self.segments.iter(), consumed: 0, matcher }
293    }
294
295    /// Convenience alias for `self.range_iter(CountMatcher::Zero)`.
296    /// Semantically iterates the ranges of the complement of this `Subset`.
297    pub fn complement_iter(&self) -> RangeIter {
298        self.range_iter(CountMatcher::Zero)
299    }
300
301    /// Return an iterator over `ZipSegment`s where each `ZipSegment` contains
302    /// the count for both self and other in that range. The two `Subset`s
303    /// must have the same total length.
304    ///
305    /// Each returned `ZipSegment` will differ in at least one count.
306    pub fn zip<'a>(&'a self, other: &'a Subset) -> ZipIter<'a> {
307        ZipIter {
308            a_segs: self.segments.as_slice(),
309            b_segs: other.segments.as_slice(),
310            a_i: 0,
311            b_i: 0,
312            a_consumed: 0,
313            b_consumed: 0,
314            consumed: 0,
315        }
316    }
317
318    /// Find the complement of this Subset. Every 0-count element will have a
319    /// count of 1 and every non-zero element will have a count of 0.
320    pub fn complement(&self) -> Subset {
321        let mut sb = SubsetBuilder::new();
322        for seg in &self.segments {
323            if seg.count == 0 {
324                sb.push_segment(seg.len, 1);
325            } else {
326                sb.push_segment(seg.len, 0);
327            }
328        }
329        sb.build()
330    }
331
332    /// Return a `Mapper` that can be use to map coordinates in the document to coordinates
333    /// in this `Subset`, but only in non-decreasing order for performance reasons.
334    pub fn mapper(&self, matcher: CountMatcher) -> Mapper {
335        Mapper {
336            range_iter: self.range_iter(matcher),
337            last_i: 0, // indices only need to be in non-decreasing order, not increasing
338            cur_range: (0, 0), // will immediately try to consume next range
339            subset_amount_consumed: 0,
340        }
341    }
342}
343
344impl fmt::Debug for Subset {
345    /// Use the alternate flag (`#`) to print a more compact representation
346    /// where each character represents the count of one element:
347    /// '-' is 0, '#' is 1, 2-9 are digits, `+` is >9
348    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
349        if f.alternate() {
350            for s in &self.segments {
351                let chr = if s.count == 0 {
352                    '-'
353                } else if s.count == 1 {
354                    '#'
355                } else if s.count <= 9 {
356                    ((s.count as u8) + b'0') as char
357                } else {
358                    '+'
359                };
360                for _ in 0..s.len {
361                    write!(f, "{}", chr)?;
362                }
363            }
364            Ok(())
365        } else {
366            f.debug_tuple("Subset").field(&self.segments).finish()
367        }
368    }
369}
370
371pub struct RangeIter<'a> {
372    seg_iter: slice::Iter<'a, Segment>,
373    pub consumed: usize,
374    matcher: CountMatcher,
375}
376
377impl<'a> Iterator for RangeIter<'a> {
378    type Item = (usize, usize);
379
380    fn next(&mut self) -> Option<(usize, usize)> {
381        while let Some(seg) = self.seg_iter.next() {
382            self.consumed += seg.len;
383            if self.matcher.matches(seg) {
384                return Some((self.consumed - seg.len, self.consumed));
385            }
386        }
387        None
388    }
389}
390
391/// See `Subset::zip`
392pub struct ZipIter<'a> {
393    a_segs: &'a [Segment],
394    b_segs: &'a [Segment],
395    a_i: usize,
396    b_i: usize,
397    a_consumed: usize,
398    b_consumed: usize,
399    pub consumed: usize,
400}
401
402/// See `Subset::zip`
403#[derive(Clone, Debug)]
404pub struct ZipSegment {
405    len: usize,
406    a_count: usize,
407    b_count: usize,
408}
409
410impl<'a> Iterator for ZipIter<'a> {
411    type Item = ZipSegment;
412
413    /// Consume as far as possible from `self.consumed` until reaching a
414    /// segment boundary in either `Subset`, and return the resulting
415    /// `ZipSegment`. Will panic if it reaches the end of one `Subset` before
416    /// the other, that is when they have different total length.
417    fn next(&mut self) -> Option<ZipSegment> {
418        match (self.a_segs.get(self.a_i), self.b_segs.get(self.b_i)) {
419            (None, None) => None,
420            (None, Some(_)) | (Some(_), None) => {
421                panic!("can't zip Subsets of different base lengths.")
422            }
423            (
424                Some(&Segment { len: a_len, count: a_count }),
425                Some(&Segment { len: b_len, count: b_count }),
426            ) => {
427                let len = match (a_len + self.a_consumed).cmp(&(b_len + self.b_consumed)) {
428                    cmp::Ordering::Equal => {
429                        self.a_consumed += a_len;
430                        self.a_i += 1;
431                        self.b_consumed += b_len;
432                        self.b_i += 1;
433                        self.a_consumed - self.consumed
434                    }
435                    cmp::Ordering::Less => {
436                        self.a_consumed += a_len;
437                        self.a_i += 1;
438                        self.a_consumed - self.consumed
439                    }
440                    cmp::Ordering::Greater => {
441                        self.b_consumed += b_len;
442                        self.b_i += 1;
443                        self.b_consumed - self.consumed
444                    }
445                };
446                self.consumed += len;
447                Some(ZipSegment { len, a_count, b_count })
448            }
449        }
450    }
451}
452
453pub struct Mapper<'a> {
454    range_iter: RangeIter<'a>,
455    // Not actually necessary for computation, just for dynamic checking of invariant
456    last_i: usize,
457    cur_range: (usize, usize),
458    pub subset_amount_consumed: usize,
459}
460
461impl<'a> Mapper<'a> {
462    /// Map a coordinate in the document this subset corresponds to, to a
463    /// coordinate in the subset matched by the `CountMatcher`. For example,
464    /// if the Subset is a set of deletions and the matcher is
465    /// `CountMatcher::NonZero`, this would map indices in the union string to
466    /// indices in the tombstones string.
467    ///
468    /// Will return the closest coordinate in the subset if the index is not
469    /// in the subset. If the coordinate is past the end of the subset it will
470    /// return one more than the largest index in the subset (i.e the length).
471    /// This behaviour is suitable for mapping closed-open intervals in a
472    /// string to intervals in a subset of the string.
473    ///
474    /// In order to guarantee good performance, this method must be called
475    /// with `i` values in non-decreasing order or it will panic. This allows
476    /// the total cost to be O(n) where `n = max(calls,ranges)` over all times
477    /// called on a single `Mapper`.
478    pub fn doc_index_to_subset(&mut self, i: usize) -> usize {
479        assert!(
480            i >= self.last_i,
481            "method must be called with i in non-decreasing order. i={}<{}=last_i",
482            i,
483            self.last_i
484        );
485        self.last_i = i;
486
487        while i >= self.cur_range.1 {
488            self.subset_amount_consumed += self.cur_range.1 - self.cur_range.0;
489            self.cur_range = match self.range_iter.next() {
490                Some(range) => range,
491                // past the end of the subset
492                None => {
493                    // ensure we don't try to consume any more
494                    self.cur_range = (usize::max_value(), usize::max_value());
495                    return self.subset_amount_consumed;
496                }
497            }
498        }
499
500        if i >= self.cur_range.0 {
501            let dist_in_range = i - self.cur_range.0;
502            dist_in_range + self.subset_amount_consumed
503        } else {
504            // not in the subset
505            self.subset_amount_consumed
506        }
507    }
508}
509
510#[cfg(test)]
511mod tests {
512    use crate::multiset::*;
513    use crate::test_helpers::find_deletions;
514
515    const TEST_STR: &'static str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
516
517    #[test]
518    fn test_apply() {
519        let mut sb = SubsetBuilder::new();
520        for &(b, e) in &[
521            (0, 1),
522            (2, 4),
523            (6, 11),
524            (13, 14),
525            (15, 18),
526            (19, 23),
527            (24, 26),
528            (31, 32),
529            (33, 35),
530            (36, 37),
531            (40, 44),
532            (45, 48),
533            (49, 51),
534            (52, 57),
535            (58, 59),
536        ] {
537            sb.add_range(b, e, 1);
538        }
539        sb.pad_to_len(TEST_STR.len());
540        let s = sb.build();
541        println!("{:?}", s);
542        assert_eq!("145BCEINQRSTUWZbcdimpvxyz", s.delete_from_string(TEST_STR));
543    }
544
545    #[test]
546    fn trivial() {
547        let s = SubsetBuilder::new().build();
548        assert!(s.is_empty());
549    }
550
551    #[test]
552    fn test_find_deletions() {
553        let substr = "015ABDFHJOPQVYdfgloprsuvz";
554        let s = find_deletions(substr, TEST_STR);
555        assert_eq!(substr, s.delete_from_string(TEST_STR));
556        assert!(!s.is_empty())
557    }
558
559    #[test]
560    fn test_complement() {
561        let substr = "0456789DEFGHIJKLMNOPQRSTUVWXYZdefghijklmnopqrstuvw";
562        let s = find_deletions(substr, TEST_STR);
563        let c = s.complement();
564        // deleting the complement of the deletions we found should yield the deletions
565        assert_eq!("123ABCabcxyz", c.delete_from_string(TEST_STR));
566    }
567
568    #[test]
569    fn test_mapper() {
570        let substr = "469ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwz";
571        let s = find_deletions(substr, TEST_STR);
572        let mut m = s.mapper(CountMatcher::NonZero);
573        // subset is {0123 5 78 xy}
574        assert_eq!(0, m.doc_index_to_subset(0));
575        assert_eq!(2, m.doc_index_to_subset(2));
576        assert_eq!(2, m.doc_index_to_subset(2));
577        assert_eq!(3, m.doc_index_to_subset(3));
578        assert_eq!(4, m.doc_index_to_subset(4)); // not in subset
579        assert_eq!(4, m.doc_index_to_subset(5));
580        assert_eq!(5, m.doc_index_to_subset(7));
581        assert_eq!(6, m.doc_index_to_subset(8));
582        assert_eq!(6, m.doc_index_to_subset(8));
583        assert_eq!(8, m.doc_index_to_subset(60));
584        assert_eq!(9, m.doc_index_to_subset(61)); // not in subset
585        assert_eq!(9, m.doc_index_to_subset(62)); // not in subset
586    }
587
588    #[test]
589    #[should_panic(expected = "non-decreasing")]
590    fn test_mapper_requires_non_decreasing() {
591        let substr = "469ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvw";
592        let s = find_deletions(substr, TEST_STR);
593        let mut m = s.mapper(CountMatcher::NonZero);
594        m.doc_index_to_subset(0);
595        m.doc_index_to_subset(2);
596        m.doc_index_to_subset(1);
597    }
598
599    #[test]
600    fn union() {
601        let s1 = find_deletions("024AEGHJKNQTUWXYZabcfgikqrvy", TEST_STR);
602        let s2 = find_deletions("14589DEFGIKMOPQRUXZabcdefglnpsuxyz", TEST_STR);
603        assert_eq!("4EGKQUXZabcfgy", s1.union(&s2).delete_from_string(TEST_STR));
604    }
605
606    fn transform_case(str1: &str, str2: &str, result: &str) {
607        let s1 = find_deletions(str1, TEST_STR);
608        let s2 = find_deletions(str2, str1);
609        let s3 = s2.transform_expand(&s1);
610        let str3 = s3.delete_from_string(TEST_STR);
611        assert_eq!(result, str3);
612        assert_eq!(str2, s1.transform_shrink(&s3).delete_from_string(&str3));
613        assert_eq!(str2, s2.transform_union(&s1).delete_from_string(TEST_STR));
614    }
615
616    #[test]
617    fn transform() {
618        transform_case(
619            "02345678BCDFGHKLNOPQRTUVXZbcefghjlmnopqrstwx",
620            "027CDGKLOTUbcegopqrw",
621            "01279ACDEGIJKLMOSTUWYabcdegikopqruvwyz",
622        );
623        transform_case(
624            "01234678DHIKLMNOPQRUWZbcdhjostvy",
625            "136KLPQZvy",
626            "13569ABCEFGJKLPQSTVXYZaefgiklmnpqruvwxyz",
627        );
628        transform_case(
629            "0125789BDEFIJKLMNPVXabdjmrstuwy",
630            "12BIJVXjmrstu",
631            "12346ABCGHIJOQRSTUVWXYZcefghijklmnopqrstuvxz",
632        );
633        transform_case(
634            "12456789ABCEFGJKLMNPQRSTUVXYadefghkrtwxz",
635            "15ACEFGKLPRUVYdhrtx",
636            "0135ACDEFGHIKLOPRUVWYZbcdhijlmnopqrstuvxy",
637        );
638        transform_case(
639            "0128ABCDEFGIJMNOPQXYZabcfgijkloqruvy",
640            "2CEFGMZabijloruvy",
641            "2345679CEFGHKLMRSTUVWZabdehijlmnoprstuvwxyz",
642        );
643        transform_case(
644            "01245689ABCDGJKLMPQSTWXYbcdfgjlmnosvy",
645            "01245ABCDJLQSWXYgsv",
646            "0123457ABCDEFHIJLNOQRSUVWXYZaeghikpqrstuvwxz",
647        );
648    }
649}