1use std::cmp;
18
19use crate::interval::Interval;
21use crate::tree::{Node, NodeInfo, TreeBuilder};
22use std::fmt;
23use std::slice;
24
25#[derive(Clone, PartialEq, Eq, Debug)]
26#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
27struct Segment {
28 len: usize,
29 count: usize,
30}
31
32#[derive(Clone, PartialEq, Eq)]
39#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
40pub struct Subset {
41 segments: Vec<Segment>,
44}
45
46#[derive(Default)]
47pub struct SubsetBuilder {
48 segments: Vec<Segment>,
49 total_len: usize,
50}
51
52impl SubsetBuilder {
53 pub fn new() -> SubsetBuilder {
54 SubsetBuilder::default()
55 }
56
57 pub fn pad_to_len(&mut self, total_len: usize) {
60 if total_len > self.total_len {
61 let cur_len = self.total_len;
62 self.push_segment(total_len - cur_len, 0);
63 }
64 }
65
66 pub fn add_range(&mut self, begin: usize, end: usize, count: usize) {
70 assert!(begin >= self.total_len, "ranges must be added in non-decreasing order");
71 if begin >= end {
73 return;
74 }
75 let len = end - begin;
76 let cur_total_len = self.total_len;
77
78 if begin > self.total_len {
80 self.push_segment(begin - cur_total_len, 0);
81 }
82
83 self.push_segment(len, count);
84 }
85
86 pub fn push_segment(&mut self, len: usize, count: usize) {
89 assert!(len > 0, "can't push empty segment");
90 self.total_len += len;
91
92 if let Some(last) = self.segments.last_mut() {
94 if last.count == count {
95 last.len += len;
96 return;
97 }
98 }
99
100 self.segments.push(Segment { len, count });
101 }
102
103 pub fn build(self) -> Subset {
104 Subset { segments: self.segments }
105 }
106}
107
108#[derive(Clone, Copy, Debug)]
111pub enum CountMatcher {
112 Zero,
113 NonZero,
114 All,
115}
116
117impl CountMatcher {
118 fn matches(self, seg: &Segment) -> bool {
119 match self {
120 CountMatcher::Zero => (seg.count == 0),
121 CountMatcher::NonZero => (seg.count != 0),
122 CountMatcher::All => true,
123 }
124 }
125}
126
127impl Subset {
128 pub fn new(len: usize) -> Subset {
130 let mut sb = SubsetBuilder::new();
131 sb.pad_to_len(len);
132 sb.build()
133 }
134
135 pub fn delete_from_string(&self, s: &str) -> String {
137 let mut result = String::new();
138 for (b, e) in self.range_iter(CountMatcher::Zero) {
139 result.push_str(&s[b..e]);
140 }
141 result
142 }
143
144 pub fn delete_from<N: NodeInfo>(&self, s: &Node<N>) -> Node<N> {
148 let mut b = TreeBuilder::new();
149 for (beg, end) in self.range_iter(CountMatcher::Zero) {
150 s.push_subseq(&mut b, Interval::new(beg, end));
151 }
152 b.build()
153 }
154
155 pub fn len_after_delete(&self) -> usize {
162 self.count(CountMatcher::Zero)
163 }
164
165 pub fn count(&self, matcher: CountMatcher) -> usize {
167 self.segments.iter().filter(|seg| matcher.matches(seg)).map(|seg| seg.len).sum()
168 }
169
170 pub fn len(&self) -> usize {
172 self.count(CountMatcher::All)
173 }
174
175 pub fn is_empty(&self) -> bool {
178 (self.segments.is_empty()) || ((self.segments.len() == 1) && (self.segments[0].count == 0))
179 }
180
181 pub fn union(&self, other: &Subset) -> Subset {
184 let mut sb = SubsetBuilder::new();
185 for zseg in self.zip(other) {
186 sb.push_segment(zseg.len, zseg.a_count + zseg.b_count);
187 }
188 sb.build()
189 }
190
191 pub fn subtract(&self, other: &Subset) -> Subset {
194 let mut sb = SubsetBuilder::new();
195 for zseg in self.zip(other) {
196 assert!(
197 zseg.a_count >= zseg.b_count,
198 "can't subtract {} from {}",
199 zseg.a_count,
200 zseg.b_count
201 );
202 sb.push_segment(zseg.len, zseg.a_count - zseg.b_count);
203 }
204 sb.build()
205 }
206
207 pub fn bitxor(&self, other: &Subset) -> Subset {
214 let mut sb = SubsetBuilder::new();
215 for zseg in self.zip(other) {
216 sb.push_segment(zseg.len, zseg.a_count ^ zseg.b_count);
217 }
218 sb.build()
219 }
220
221 fn transform(&self, other: &Subset, union: bool) -> Subset {
224 let mut sb = SubsetBuilder::new();
225 let mut seg_iter = self.segments.iter();
226 let mut cur_seg = Segment { len: 0, count: 0 };
227 for oseg in &other.segments {
228 if oseg.count > 0 {
229 sb.push_segment(oseg.len, if union { oseg.count } else { 0 });
230 } else {
231 let mut to_be_consumed = oseg.len;
233 while to_be_consumed > 0 {
234 if cur_seg.len == 0 {
235 cur_seg = seg_iter
236 .next()
237 .expect("self must cover all 0-regions of other")
238 .clone();
239 }
240 let to_consume = cmp::min(cur_seg.len, to_be_consumed);
242 sb.push_segment(to_consume, cur_seg.count);
243 to_be_consumed -= to_consume;
244 cur_seg.len -= to_consume;
245 }
246 }
247 }
248 assert_eq!(cur_seg.len, 0, "the 0-regions of other must be the size of self");
249 assert_eq!(seg_iter.next(), None, "the 0-regions of other must be the size of self");
250 sb.build()
251 }
252
253 pub fn transform_expand(&self, other: &Subset) -> Subset {
262 self.transform(other, false)
263 }
264
265 pub fn transform_union(&self, other: &Subset) -> Subset {
267 self.transform(other, true)
268 }
269
270 pub fn transform_shrink(&self, other: &Subset) -> Subset {
278 let mut sb = SubsetBuilder::new();
279 for zseg in self.zip(other) {
281 if zseg.b_count == 0 {
283 sb.push_segment(zseg.len, zseg.a_count);
284 }
285 }
286 sb.build()
287 }
288
289 pub fn range_iter(&self, matcher: CountMatcher) -> RangeIter {
292 RangeIter { seg_iter: self.segments.iter(), consumed: 0, matcher }
293 }
294
295 pub fn complement_iter(&self) -> RangeIter {
298 self.range_iter(CountMatcher::Zero)
299 }
300
301 pub fn zip<'a>(&'a self, other: &'a Subset) -> ZipIter<'a> {
307 ZipIter {
308 a_segs: self.segments.as_slice(),
309 b_segs: other.segments.as_slice(),
310 a_i: 0,
311 b_i: 0,
312 a_consumed: 0,
313 b_consumed: 0,
314 consumed: 0,
315 }
316 }
317
318 pub fn complement(&self) -> Subset {
321 let mut sb = SubsetBuilder::new();
322 for seg in &self.segments {
323 if seg.count == 0 {
324 sb.push_segment(seg.len, 1);
325 } else {
326 sb.push_segment(seg.len, 0);
327 }
328 }
329 sb.build()
330 }
331
332 pub fn mapper(&self, matcher: CountMatcher) -> Mapper {
335 Mapper {
336 range_iter: self.range_iter(matcher),
337 last_i: 0, cur_range: (0, 0), subset_amount_consumed: 0,
340 }
341 }
342}
343
344impl fmt::Debug for Subset {
345 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
349 if f.alternate() {
350 for s in &self.segments {
351 let chr = if s.count == 0 {
352 '-'
353 } else if s.count == 1 {
354 '#'
355 } else if s.count <= 9 {
356 ((s.count as u8) + b'0') as char
357 } else {
358 '+'
359 };
360 for _ in 0..s.len {
361 write!(f, "{}", chr)?;
362 }
363 }
364 Ok(())
365 } else {
366 f.debug_tuple("Subset").field(&self.segments).finish()
367 }
368 }
369}
370
371pub struct RangeIter<'a> {
372 seg_iter: slice::Iter<'a, Segment>,
373 pub consumed: usize,
374 matcher: CountMatcher,
375}
376
377impl<'a> Iterator for RangeIter<'a> {
378 type Item = (usize, usize);
379
380 fn next(&mut self) -> Option<(usize, usize)> {
381 while let Some(seg) = self.seg_iter.next() {
382 self.consumed += seg.len;
383 if self.matcher.matches(seg) {
384 return Some((self.consumed - seg.len, self.consumed));
385 }
386 }
387 None
388 }
389}
390
391pub struct ZipIter<'a> {
393 a_segs: &'a [Segment],
394 b_segs: &'a [Segment],
395 a_i: usize,
396 b_i: usize,
397 a_consumed: usize,
398 b_consumed: usize,
399 pub consumed: usize,
400}
401
402#[derive(Clone, Debug)]
404pub struct ZipSegment {
405 len: usize,
406 a_count: usize,
407 b_count: usize,
408}
409
410impl<'a> Iterator for ZipIter<'a> {
411 type Item = ZipSegment;
412
413 fn next(&mut self) -> Option<ZipSegment> {
418 match (self.a_segs.get(self.a_i), self.b_segs.get(self.b_i)) {
419 (None, None) => None,
420 (None, Some(_)) | (Some(_), None) => {
421 panic!("can't zip Subsets of different base lengths.")
422 }
423 (
424 Some(&Segment { len: a_len, count: a_count }),
425 Some(&Segment { len: b_len, count: b_count }),
426 ) => {
427 let len = match (a_len + self.a_consumed).cmp(&(b_len + self.b_consumed)) {
428 cmp::Ordering::Equal => {
429 self.a_consumed += a_len;
430 self.a_i += 1;
431 self.b_consumed += b_len;
432 self.b_i += 1;
433 self.a_consumed - self.consumed
434 }
435 cmp::Ordering::Less => {
436 self.a_consumed += a_len;
437 self.a_i += 1;
438 self.a_consumed - self.consumed
439 }
440 cmp::Ordering::Greater => {
441 self.b_consumed += b_len;
442 self.b_i += 1;
443 self.b_consumed - self.consumed
444 }
445 };
446 self.consumed += len;
447 Some(ZipSegment { len, a_count, b_count })
448 }
449 }
450 }
451}
452
453pub struct Mapper<'a> {
454 range_iter: RangeIter<'a>,
455 last_i: usize,
457 cur_range: (usize, usize),
458 pub subset_amount_consumed: usize,
459}
460
461impl<'a> Mapper<'a> {
462 pub fn doc_index_to_subset(&mut self, i: usize) -> usize {
479 assert!(
480 i >= self.last_i,
481 "method must be called with i in non-decreasing order. i={}<{}=last_i",
482 i,
483 self.last_i
484 );
485 self.last_i = i;
486
487 while i >= self.cur_range.1 {
488 self.subset_amount_consumed += self.cur_range.1 - self.cur_range.0;
489 self.cur_range = match self.range_iter.next() {
490 Some(range) => range,
491 None => {
493 self.cur_range = (usize::max_value(), usize::max_value());
495 return self.subset_amount_consumed;
496 }
497 }
498 }
499
500 if i >= self.cur_range.0 {
501 let dist_in_range = i - self.cur_range.0;
502 dist_in_range + self.subset_amount_consumed
503 } else {
504 self.subset_amount_consumed
506 }
507 }
508}
509
510#[cfg(test)]
511mod tests {
512 use crate::multiset::*;
513 use crate::test_helpers::find_deletions;
514
515 const TEST_STR: &'static str = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
516
517 #[test]
518 fn test_apply() {
519 let mut sb = SubsetBuilder::new();
520 for &(b, e) in &[
521 (0, 1),
522 (2, 4),
523 (6, 11),
524 (13, 14),
525 (15, 18),
526 (19, 23),
527 (24, 26),
528 (31, 32),
529 (33, 35),
530 (36, 37),
531 (40, 44),
532 (45, 48),
533 (49, 51),
534 (52, 57),
535 (58, 59),
536 ] {
537 sb.add_range(b, e, 1);
538 }
539 sb.pad_to_len(TEST_STR.len());
540 let s = sb.build();
541 println!("{:?}", s);
542 assert_eq!("145BCEINQRSTUWZbcdimpvxyz", s.delete_from_string(TEST_STR));
543 }
544
545 #[test]
546 fn trivial() {
547 let s = SubsetBuilder::new().build();
548 assert!(s.is_empty());
549 }
550
551 #[test]
552 fn test_find_deletions() {
553 let substr = "015ABDFHJOPQVYdfgloprsuvz";
554 let s = find_deletions(substr, TEST_STR);
555 assert_eq!(substr, s.delete_from_string(TEST_STR));
556 assert!(!s.is_empty())
557 }
558
559 #[test]
560 fn test_complement() {
561 let substr = "0456789DEFGHIJKLMNOPQRSTUVWXYZdefghijklmnopqrstuvw";
562 let s = find_deletions(substr, TEST_STR);
563 let c = s.complement();
564 assert_eq!("123ABCabcxyz", c.delete_from_string(TEST_STR));
566 }
567
568 #[test]
569 fn test_mapper() {
570 let substr = "469ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwz";
571 let s = find_deletions(substr, TEST_STR);
572 let mut m = s.mapper(CountMatcher::NonZero);
573 assert_eq!(0, m.doc_index_to_subset(0));
575 assert_eq!(2, m.doc_index_to_subset(2));
576 assert_eq!(2, m.doc_index_to_subset(2));
577 assert_eq!(3, m.doc_index_to_subset(3));
578 assert_eq!(4, m.doc_index_to_subset(4)); assert_eq!(4, m.doc_index_to_subset(5));
580 assert_eq!(5, m.doc_index_to_subset(7));
581 assert_eq!(6, m.doc_index_to_subset(8));
582 assert_eq!(6, m.doc_index_to_subset(8));
583 assert_eq!(8, m.doc_index_to_subset(60));
584 assert_eq!(9, m.doc_index_to_subset(61)); assert_eq!(9, m.doc_index_to_subset(62)); }
587
588 #[test]
589 #[should_panic(expected = "non-decreasing")]
590 fn test_mapper_requires_non_decreasing() {
591 let substr = "469ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvw";
592 let s = find_deletions(substr, TEST_STR);
593 let mut m = s.mapper(CountMatcher::NonZero);
594 m.doc_index_to_subset(0);
595 m.doc_index_to_subset(2);
596 m.doc_index_to_subset(1);
597 }
598
599 #[test]
600 fn union() {
601 let s1 = find_deletions("024AEGHJKNQTUWXYZabcfgikqrvy", TEST_STR);
602 let s2 = find_deletions("14589DEFGIKMOPQRUXZabcdefglnpsuxyz", TEST_STR);
603 assert_eq!("4EGKQUXZabcfgy", s1.union(&s2).delete_from_string(TEST_STR));
604 }
605
606 fn transform_case(str1: &str, str2: &str, result: &str) {
607 let s1 = find_deletions(str1, TEST_STR);
608 let s2 = find_deletions(str2, str1);
609 let s3 = s2.transform_expand(&s1);
610 let str3 = s3.delete_from_string(TEST_STR);
611 assert_eq!(result, str3);
612 assert_eq!(str2, s1.transform_shrink(&s3).delete_from_string(&str3));
613 assert_eq!(str2, s2.transform_union(&s1).delete_from_string(TEST_STR));
614 }
615
616 #[test]
617 fn transform() {
618 transform_case(
619 "02345678BCDFGHKLNOPQRTUVXZbcefghjlmnopqrstwx",
620 "027CDGKLOTUbcegopqrw",
621 "01279ACDEGIJKLMOSTUWYabcdegikopqruvwyz",
622 );
623 transform_case(
624 "01234678DHIKLMNOPQRUWZbcdhjostvy",
625 "136KLPQZvy",
626 "13569ABCEFGJKLPQSTVXYZaefgiklmnpqruvwxyz",
627 );
628 transform_case(
629 "0125789BDEFIJKLMNPVXabdjmrstuwy",
630 "12BIJVXjmrstu",
631 "12346ABCGHIJOQRSTUVWXYZcefghijklmnopqrstuvxz",
632 );
633 transform_case(
634 "12456789ABCEFGJKLMNPQRSTUVXYadefghkrtwxz",
635 "15ACEFGKLPRUVYdhrtx",
636 "0135ACDEFGHIKLOPRUVWYZbcdhijlmnopqrstuvxy",
637 );
638 transform_case(
639 "0128ABCDEFGIJMNOPQXYZabcfgijkloqruvy",
640 "2CEFGMZabijloruvy",
641 "2345679CEFGHKLMRSTUVWZabdehijlmnoprstuvwxyz",
642 );
643 transform_case(
644 "01245689ABCDGJKLMPQSTWXYbcdfgjlmnosvy",
645 "01245ABCDJLQSWXYgsv",
646 "0123457ABCDEFHIJLNOQRSUVWXYZaeghikpqrstuvwxz",
647 );
648 }
649}