1use std::fs::File;
2use std::io::BufRead;
3use std::path::{Path, PathBuf};
4
5use md5::{Digest, Md5};
6
7use flate2::write::GzEncoder;
8use flate2::Compression;
9use std::collections::{HashMap, HashSet};
10use std::fmt::Debug;
11use std::fmt::{self, Display};
12use std::io::{BufWriter, Write};
13#[cfg(feature = "bigbed")]
14use tokio::runtime;
15
16#[cfg(feature = "bigbed")]
17use bigtools::beddata::BedParserStreamingIterator;
18#[cfg(feature = "bigbed")]
19use bigtools::{BedEntry, BigBedWrite};
20
21use crate::errors::RegionSetError;
22use crate::models::Region;
23#[cfg(feature = "bigbed")]
24use crate::utils::get_chrom_sizes;
25use crate::utils::get_dynamic_reader;
26#[cfg(feature = "http")]
27use crate::utils::get_dynamic_reader_from_url;
28
29#[cfg(feature = "dataframe")]
30use polars::prelude::*;
31#[cfg(feature = "dataframe")]
32use std::io::Cursor;
33
34#[derive(Clone, Debug)]
39#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
40pub struct RegionSet {
41 pub regions: Vec<Region>,
42 pub header: Option<String>,
43 #[cfg_attr(feature = "serde", serde(skip))]
44 pub path: Option<PathBuf>,
45}
46
47pub struct RegionSetIterator<'a> {
48 region_set: &'a RegionSet,
49 index: usize,
50}
51
52impl TryFrom<&Path> for RegionSet {
53 type Error = RegionSetError;
54
55 fn try_from(value: &Path) -> Result<Self, RegionSetError> {
61 let path = value;
62
63 let mut new_regions: Vec<Region> = Vec::new();
64
65 let reader = match path.is_file() {
66 true => get_dynamic_reader(path)
67 .map_err(|e| RegionSetError::FileReadError(e.to_string()))?,
68 #[cfg(feature = "http")]
69 false => {
70 match get_dynamic_reader_from_url(path) {
71 Ok(reader) => reader,
72 Err(_) => {
73 return Err(RegionSetError::InvalidPathOrUrl(format!("{:?}", path)));
74
75 }
93 }
94 }
95 #[cfg(not(feature = "http"))]
96 false => {
97 return Err(RegionSetError::HttpFeatureDisabled(
98 path.display().to_string(),
99 ));
100 }
101 };
102
103 let mut header: String = String::new();
104
105 let mut first_line: bool = true;
106
107 for line in reader.lines() {
108 let string_line = line?;
109
110 let parts: Vec<String> = string_line.split('\t').map(|s| s.to_string()).collect();
111
112 if string_line.starts_with("browser")
113 | string_line.starts_with("track")
114 | string_line.starts_with("#")
115 {
116 header.push_str(&string_line);
117 first_line = false;
118 continue;
119 }
120
121 if first_line {
123 if parts.len() >= 3 {
124 let is_header: bool = match parts[1].parse::<u32>() {
125 Ok(_num) => false,
126 Err(_) => true,
127 };
128 if is_header {
129 header.push_str(&string_line);
130 first_line = false;
131 continue;
132 }
133 }
134 first_line = false;
135 }
136
137 if parts.len() < 3 {
138 return Err(RegionSetError::RegionParseError(format!(
139 "Error in parsing start position: {:?}",
140 parts
141 )));
142 }
143
144 new_regions.push(Region {
145 chr: parts[0].to_owned(),
146
147 start: match parts[1].parse() {
149 Ok(start) => start,
150 Err(_err) => {
151 return Err(RegionSetError::RegionParseError(format!(
152 "Error in parsing start position: {:?}",
153 parts
154 )));
155 }
156 },
157 end: match parts[2].parse() {
158 Ok(end) => end,
159 Err(_err) => {
160 return Err(RegionSetError::RegionParseError(format!(
161 "Error in parsing end position: {:?}",
162 parts
163 )));
164 }
165 },
166 rest: Some(parts[3..].join("\t")).filter(|s| !s.is_empty()),
167 });
168 }
169 if new_regions.is_empty() {
170 return Err(RegionSetError::EmptyRegionSet(path.display().to_string()));
171 }
172
173 let mut rs = RegionSet {
174 regions: new_regions,
175 header: match header.is_empty() {
176 true => None,
177 false => Some(header),
178 },
179 path: Some(value.to_owned()),
180 };
181 rs.sort();
183
184 Ok(rs)
185 }
186}
187
188impl TryFrom<&str> for RegionSet {
189 type Error = RegionSetError;
190
191 fn try_from(value: &str) -> Result<Self, RegionSetError> {
192 RegionSet::try_from(Path::new(value))
193 }
194}
195
196impl TryFrom<String> for RegionSet {
197 type Error = RegionSetError;
198
199 fn try_from(value: String) -> Result<Self, RegionSetError> {
200 RegionSet::try_from(Path::new(&value))
201 }
202}
203
204impl TryFrom<PathBuf> for RegionSet {
205 type Error = RegionSetError;
206
207 fn try_from(value: PathBuf) -> Result<Self, RegionSetError> {
208 RegionSet::try_from(value.as_path())
209 }
210}
211
212impl From<Vec<Region>> for RegionSet {
213 fn from(regions: Vec<Region>) -> Self {
214 RegionSet {
215 regions,
216 header: None,
217 path: None,
218 }
219 }
220}
221
222impl From<&[u8]> for RegionSet {
223 fn from(value: &[u8]) -> Self {
224 let region_str = String::from_utf8_lossy(value);
225 let regions: Vec<Region> = region_str
226 .split('\n')
227 .map(|line| {
228 let parts = line.split('\t').collect::<Vec<&str>>();
229
230 let chr = parts[0].to_string();
231 let start = parts[1].parse::<u32>().unwrap();
232 let end = parts[2].parse::<u32>().unwrap();
233 let rest = Some(parts[3..].join("\t")).filter(|s| !s.is_empty());
234
235 Region {
236 chr,
237 start,
238 end,
239 rest,
240 }
241 })
242 .collect();
243
244 RegionSet {
245 regions,
246 header: None,
247 path: None,
248 }
249 }
250}
251
252impl<'a> Iterator for RegionSetIterator<'a> {
253 type Item = &'a Region;
254
255 fn next(&mut self) -> Option<Self::Item> {
256 if self.index < self.region_set.regions.len() {
257 let region = &self.region_set.regions[self.index];
258 self.index += 1;
259 Some(region)
260 } else {
261 None
262 }
263 }
264}
265
266impl<'a> IntoIterator for &'a RegionSet {
267 type Item = &'a Region;
268 type IntoIter = RegionSetIterator<'a>;
269
270 fn into_iter(self) -> Self::IntoIter {
271 RegionSetIterator {
272 region_set: self,
273 index: 0,
274 }
275 }
276}
277
278impl RegionSet {
279 pub fn to_bed<T: AsRef<Path>>(&self, path: T) -> std::io::Result<()> {
285 let path = path.as_ref();
286 if path.exists() {
287 println!("Bed file already exists. Overwriting existing file")
288 }
289
290 if let Some(parent) = path.parent() {
291 std::fs::create_dir_all(parent)?;
292 }
293
294 let mut file = File::create(path).unwrap();
295
296 for region in &self.regions {
297 writeln!(file, "{}", region.as_string())?;
298 }
299 Ok(())
300 }
301
302 pub fn to_bed_gz<T: AsRef<Path>>(&self, path: T) -> std::io::Result<()> {
308 let path = path.as_ref();
309 if path.exists() {
310 println!("Bed file already exists. Overwriting existing file")
311 }
312
313 if let Some(parent) = path.parent() {
314 std::fs::create_dir_all(parent)?;
315 }
316
317 let file = File::create(path)?;
318 let mut buffer: String = String::new();
319
320 for region in &self.regions {
321 buffer.push_str(&format!("{}\n", region.as_string(),));
322 }
323
324 let mut encoder = GzEncoder::new(BufWriter::new(file), Compression::best());
325 encoder.write_all(buffer.as_bytes())?;
326
327 Ok(())
328 }
329
330 pub fn identifier(&self) -> String {
339 let mut chrs = String::new();
340 let mut starts = String::new();
341 let mut ends = String::new();
342
343 let mut first = true;
344 for region in &self.regions {
345 if !first {
346 chrs.push(',');
347 starts.push(',');
348 ends.push(',');
349 }
350 chrs.push_str(®ion.chr);
351 starts.push_str(®ion.start.to_string());
352 ends.push_str(®ion.end.to_string());
353
354 first = false;
355 }
356
357 let mut hasher = Md5::new();
360 hasher.update(chrs);
361 let chrom_hash = hasher.finalize();
362
363 let mut hasher = Md5::new();
364 hasher.update(starts);
365 let start_hash = hasher.finalize();
366
367 let mut hasher = Md5::new();
368 hasher.update(ends);
369 let end_hash = hasher.finalize();
370
371 let combined = format!("{:x},{:x},{:x}", chrom_hash, start_hash, end_hash);
372
373 let mut hasher = Md5::new();
374 hasher.update(combined);
375 let hash = hasher.finalize();
376 let bed_digest: String = format!("{:x}", hash);
377
378 bed_digest
379 }
380
381 pub fn file_digest(&self) -> String {
382 let mut buffer: String = String::new();
383 for region in &self.regions {
384 buffer.push_str(&format!("{}\n", region.as_string(),));
385 }
386
387 let mut hasher = Md5::new();
388
389 hasher.update(buffer);
390 let hash = hasher.finalize();
391 let file_digest: String = format!("{:x}", hash);
392
393 file_digest
394 }
395
396 pub fn iter_chroms(&self) -> impl Iterator<Item = &String> {
400 let mut seen = HashSet::new();
401 let mut unique_chroms = Vec::new();
402 for r in &self.regions {
403 if seen.insert(&r.chr) {
404 unique_chroms.push(&r.chr);
405 }
406 }
407 unique_chroms.into_iter()
408 }
409
410 pub fn iter_chr_regions<'a>(&'a self, chr: &'a str) -> impl Iterator<Item = &'a Region> {
417 self.regions.iter().filter(move |r| r.chr == chr)
418 }
419
420 #[cfg(feature = "bigbed")]
428 pub fn to_bigbed<T: AsRef<Path>>(
429 &self,
430 out_path: T,
431 chrom_size: T,
432 ) -> Result<(), RegionSetError> {
433 let out_path = out_path.as_ref();
434
435 if out_path.exists() {
436 println!("Bed file already exists. Overwriting existing file")
437 }
438
439 if let Some(parent) = out_path.parent() {
440 std::fs::create_dir_all(parent)?;
441 }
442 let chrom_sizes: HashMap<String, u32> = get_chrom_sizes(chrom_size);
443
444 let mut warnings_count: i32 = 0;
445 let region_vector = self.regions.iter().map(|i| {
446 if !chrom_sizes.contains_key(&i.chr) {
448 eprintln!(
449 "Warning:: Chromosome is not found in Chrom sizes file. Chr: '{}'",
450 i.chr
451 );
452 if warnings_count > 40 {
453 panic!("Incorrect chrom sizes provided. Unable to create bigBed file!");
454 }
455
456 warnings_count += 1;
457 return None;
458 }
459 Some(Ok::<_, std::io::Error>((
460 i.chr.clone(),
461 BedEntry {
462 start: i.start,
463 end: i.end,
464 rest: i
465 .rest
466 .as_deref()
467 .map_or(String::new(), |s| format!("\t{}", s)),
468 },
469 )))
470 });
471
472 #[allow(clippy::option_filter_map)]
473 let region_vector = region_vector.filter(|e| e.is_some()).map(|e| e.unwrap());
475
476 let runtime = runtime::Builder::new_multi_thread()
477 .worker_threads(
478 std::thread::available_parallelism()
479 .map(|c| c.into())
480 .unwrap_or(1),
481 )
482 .build()
483 .expect("Unable to create thread pool.");
484
485 let mut bb_out = BigBedWrite::create_file(out_path, chrom_sizes.clone())
486 .expect("Failed to create bigBed file.");
487
488 bb_out.options.max_zooms = 8;
489
490 let data = BedParserStreamingIterator::wrap_iter(region_vector.into_iter(), true);
491 bb_out
492 .write(data, runtime)
493 .map_err(|e| RegionSetError::BigBedError(e.to_string()))?;
494 Ok(())
495 }
496
497 pub fn sort(&mut self) {
503 self.regions
504 .sort_by(|a, b| a.chr.cmp(&b.chr).then_with(|| a.start.cmp(&b.start)));
505 }
506
507 pub fn is_empty(&self) -> bool {
511 if self.regions.len() == 0 {
512 return true;
513 }
514 false
515 }
516
517 pub fn region_widths(&self) -> Vec<u32> {
521 self.regions.iter().map(|region| region.width()).collect()
522 }
523
524 pub fn mean_region_width(&self) -> f64 {
528 let sum: u32 = self
529 .regions
530 .iter()
531 .map(|region| region.end - region.start)
532 .sum();
533 let count: u32 = self.regions.len() as u32;
534
535 ((sum as f64 / count as f64) * 100.0).round() / 100.0
537 }
538
539 pub fn calc_mid_points(&self) -> HashMap<String, Vec<u32>> {
543 let mut mid_points: HashMap<String, Vec<u32>> = HashMap::new();
544 for chromosome in self.iter_chroms() {
545 let mut chr_mid_points: Vec<u32> = Vec::new();
546 for region in self.iter_chr_regions(chromosome) {
547 chr_mid_points.push(region.mid_point());
548 }
549 mid_points.insert(chromosome.clone(), chr_mid_points);
550 }
551 mid_points
552 }
553
554 pub fn calc_mid_points_with_mode(
558 &self,
559 mode: super::coords::CoordinateMode,
560 ) -> HashMap<String, Vec<u32>> {
561 let mut mid_points: HashMap<String, Vec<u32>> = HashMap::new();
562 for chromosome in self.iter_chroms() {
563 let mut chr_mid_points: Vec<u32> = Vec::new();
564 for region in self.iter_chr_regions(chromosome) {
565 chr_mid_points.push(region.mid_point_with_mode(mode));
566 }
567 mid_points.insert(chromosome.clone(), chr_mid_points);
568 }
569 mid_points
570 }
571
572 pub fn len(&self) -> usize {
578 self.regions.len()
579 }
580
581 pub fn get_max_end_per_chr(&self) -> HashMap<String, u32> {
585 let mut result: HashMap<String, u32> = HashMap::new();
586
587 let mut current_chr: &String = &self.regions[0].chr;
588 let mut max_end: u32 = self.regions[0].end;
589
590 for r in &self.regions[1..] {
591 if &r.chr == current_chr {
592 max_end = max_end.max(r.end);
594 } else {
595 result.insert(current_chr.clone(), max_end);
597 current_chr = &r.chr;
598 max_end = r.end;
599 }
600 }
601
602 result.insert(current_chr.clone(), max_end);
604
605 result
606 }
607
608 pub fn nucleotides_length(&self) -> u32 {
612 let mut total_count: u32 = 0;
613 for r in &self.regions {
614 total_count += r.width();
615 }
616 total_count
617 }
618
619 #[cfg(feature = "dataframe")]
623 pub fn to_polars(&self) -> PolarsResult<DataFrame> {
624 let data: String = self
626 .regions
627 .iter()
628 .map(|region| {
629 if let Some(rest) = region.rest.as_deref() {
630 format!("{}\t{}\t{}\t{}", region.chr, region.start, region.end, rest,)
631 } else {
632 format!("{}\t{}\t{}", region.chr, region.start, region.end,)
633 }
634 })
635 .collect::<Vec<_>>()
636 .join("\n");
637
638 let cursor = Cursor::new(data);
639
640 let df = CsvReadOptions::default()
641 .with_has_header(false)
642 .map_parse_options(|parse_options| parse_options.with_separator(b'\t'))
643 .with_infer_schema_length(Some(10000))
644 .into_reader_with_file_handle(cursor)
645 .finish()?;
646
647 Ok(df)
648 }
649}
650
651pub struct SortedRegionSet(pub RegionSet);
659
660impl SortedRegionSet {
661 pub fn new(mut rs: RegionSet) -> Self {
663 rs.sort();
664 SortedRegionSet(rs)
665 }
666}
667
668impl RegionSet {
671 pub fn reduce(&self) -> RegionSet {
676 if self.regions.is_empty() {
677 return RegionSet::from(Vec::<Region>::new());
678 }
679
680 let sorted = SortedRegionSet::new(RegionSet::from(self.regions.clone()));
681 let regions = &sorted.0.regions;
682
683 let mut merged: Vec<Region> = Vec::new();
684 let mut current = regions[0].clone();
685
686 for r in ®ions[1..] {
687 if r.chr == current.chr && r.start <= current.end {
688 current.end = current.end.max(r.end);
689 } else {
690 merged.push(Region {
691 chr: current.chr.clone(),
692 start: current.start,
693 end: current.end,
694 rest: None,
695 });
696 current = r.clone();
697 }
698 }
699 merged.push(Region {
700 chr: current.chr,
701 start: current.start,
702 end: current.end,
703 rest: None,
704 });
705
706 RegionSet::from(merged)
707 }
708
709 pub fn concat(&self, other: &RegionSet) -> RegionSet {
711 let mut regions = self.regions.clone();
712 regions.extend(other.regions.iter().cloned());
713 RegionSet::from(regions)
714 }
715
716 pub fn concat_into(mut self, other: RegionSet) -> RegionSet {
724 self.regions.extend(other.regions);
725 RegionSet::from(self.regions)
726 }
727
728 pub fn union(&self, other: &RegionSet) -> RegionSet {
732 self.concat(other).reduce()
733 }
734
735 pub fn union_into(self, other: RegionSet) -> RegionSet {
740 self.concat_into(other).reduce()
741 }
742
743 pub fn trim(&self, chrom_sizes: &HashMap<String, u32>) -> RegionSet {
745 let regions: Vec<Region> = self
746 .regions
747 .iter()
748 .filter_map(|r| {
749 let chrom_size = chrom_sizes.get(&r.chr)?;
750 let start = r.start.min(*chrom_size);
751 let end = r.end.min(*chrom_size);
752 if start > end {
753 None
754 } else {
755 Some(Region {
756 chr: r.chr.clone(),
757 start,
758 end,
759 rest: None,
760 })
761 }
762 })
763 .collect();
764 RegionSet::from(regions)
765 }
766
767 pub fn gaps(&self, chrom_sizes: &HashMap<String, u32>) -> RegionSet {
787 let reduced = self.reduce();
788
789 let mut by_chr: HashMap<&str, Vec<&Region>> = HashMap::new();
792 for r in &reduced.regions {
793 if chrom_sizes.contains_key(&r.chr) {
796 by_chr.entry(r.chr.as_str()).or_default().push(r);
797 }
798 }
799
800 let mut result: Vec<Region> = Vec::new();
801
802 for (chr_name, &chrom_size) in chrom_sizes.iter() {
806 if chrom_size == 0 {
807 continue;
808 }
809
810 match by_chr.get(chr_name.as_str()) {
811 None => {
812 result.push(Region {
814 chr: chr_name.clone(),
815 start: 0,
816 end: chrom_size,
817 rest: None,
818 });
819 }
820 Some(regions) => {
821 if regions[0].start > 0 {
823 let lead_end = regions[0].start.min(chrom_size);
827 result.push(Region {
828 chr: chr_name.clone(),
829 start: 0,
830 end: lead_end,
831 rest: None,
832 });
833 }
834
835 for pair in regions.windows(2) {
837 let gap_start = pair[0].end;
838 let gap_end = pair[1].start;
839 if gap_start < gap_end {
840 let cs = chrom_size;
843 let clipped_start = gap_start.min(cs);
844 let clipped_end = gap_end.min(cs);
845 if clipped_start < clipped_end {
846 result.push(Region {
847 chr: chr_name.clone(),
848 start: clipped_start,
849 end: clipped_end,
850 rest: None,
851 });
852 }
853 }
854 }
855
856 let last_end = regions[regions.len() - 1].end;
858 if last_end < chrom_size {
859 result.push(Region {
860 chr: chr_name.clone(),
861 start: last_end,
862 end: chrom_size,
863 rest: None,
864 });
865 }
866 }
867 }
868 }
869
870 result.sort_by(|a, b| {
872 crate::utils::chrom_karyotype_key(&a.chr)
873 .cmp(&crate::utils::chrom_karyotype_key(&b.chr))
874 .then(a.start.cmp(&b.start))
875 });
876
877 RegionSet::from(result)
878 }
879
880 pub fn shift(&self, offset: i64) -> RegionSet {
882 let regions: Vec<Region> = self
883 .regions
884 .iter()
885 .map(|r| {
886 let start = (r.start as i64 + offset).max(0) as u32;
887 let end = (r.end as i64 + offset).max(start as i64) as u32;
888 Region {
889 chr: r.chr.clone(),
890 start,
891 end,
892 rest: None,
893 }
894 })
895 .collect();
896 RegionSet::from(regions)
897 }
898
899 pub fn flank(&self, width: u32, use_start: bool, both: bool) -> RegionSet {
901 let regions: Vec<Region> = self
902 .regions
903 .iter()
904 .map(|r| {
905 if both {
906 let anchor = if use_start { r.start } else { r.end };
907 Region {
908 chr: r.chr.clone(),
909 start: anchor.saturating_sub(width),
910 end: anchor.saturating_add(width),
911 rest: None,
912 }
913 } else if use_start {
914 Region {
915 chr: r.chr.clone(),
916 start: r.start.saturating_sub(width),
917 end: r.start,
918 rest: None,
919 }
920 } else {
921 Region {
922 chr: r.chr.clone(),
923 start: r.end,
924 end: r.end.saturating_add(width),
925 rest: None,
926 }
927 }
928 })
929 .collect();
930 RegionSet::from(regions)
931 }
932
933 pub fn resize(&self, width: u32, fix: &str) -> RegionSet {
935 let regions: Vec<Region> = self
936 .regions
937 .iter()
938 .map(|r| match fix {
939 "end" => Region {
940 chr: r.chr.clone(),
941 start: r.end.saturating_sub(width),
942 end: r.end,
943 rest: None,
944 },
945 "center" => {
946 let mid = r.start + (r.end - r.start) / 2;
947 let half = width / 2;
948 Region {
949 chr: r.chr.clone(),
950 start: mid.saturating_sub(half),
951 end: mid.saturating_sub(half).saturating_add(width),
952 rest: None,
953 }
954 }
955 _ => Region {
956 chr: r.chr.clone(),
957 start: r.start,
958 end: r.start.saturating_add(width),
959 rest: None,
960 },
961 })
962 .collect();
963 RegionSet::from(regions)
964 }
965
966 pub fn narrow(&self, start: Option<u32>, end: Option<u32>, width: Option<u32>) -> RegionSet {
968 let regions: Vec<Region> = self
969 .regions
970 .iter()
971 .map(|r| {
972 let region_width = r.end - r.start;
973 let (rel_start, rel_end) = match (start, end, width) {
974 (Some(s), Some(e), None) => (s.saturating_sub(1), e),
975 (Some(s), None, Some(w)) => (s.saturating_sub(1), s.saturating_sub(1) + w),
976 (None, Some(e), Some(w)) => (e.saturating_sub(w), e),
977 _ => (0, region_width),
978 };
979 let new_start = r.start + rel_start.min(region_width);
980 let new_end = r.start + rel_end.min(region_width);
981 Region {
982 chr: r.chr.clone(),
983 start: new_start.min(new_end),
984 end: new_end.max(new_start),
985 rest: None,
986 }
987 })
988 .collect();
989 RegionSet::from(regions)
990 }
991
992 pub fn promoters(&self, upstream: u32, downstream: u32) -> RegionSet {
994 let regions: Vec<Region> = self
995 .regions
996 .iter()
997 .map(|r| Region {
998 chr: r.chr.clone(),
999 start: r.start.saturating_sub(upstream),
1000 end: r.start.saturating_add(downstream),
1001 rest: None,
1002 })
1003 .collect();
1004 RegionSet::from(regions)
1005 }
1006
1007 pub fn pintersect(&self, other: &RegionSet) -> RegionSet {
1009 let regions: Vec<Region> = self
1010 .regions
1011 .iter()
1012 .zip(other.regions.iter())
1013 .map(|(a, b)| {
1014 if a.chr != b.chr {
1015 return Region {
1016 chr: a.chr.clone(),
1017 start: a.start,
1018 end: a.start,
1019 rest: None,
1020 };
1021 }
1022 let start = a.start.max(b.start);
1023 let end = a.end.min(b.end);
1024 if start >= end {
1025 Region {
1026 chr: a.chr.clone(),
1027 start,
1028 end: start,
1029 rest: None,
1030 }
1031 } else {
1032 Region {
1033 chr: a.chr.clone(),
1034 start,
1035 end,
1036 rest: None,
1037 }
1038 }
1039 })
1040 .collect();
1041 RegionSet::from(regions)
1042 }
1043
1044 pub fn disjoin(&self) -> RegionSet {
1052 let mut by_chr: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
1053 for r in &self.regions {
1054 by_chr.entry(r.chr.clone()).or_default().push((r.start, r.end));
1055 }
1056
1057 let mut result: Vec<Region> = Vec::new();
1058 for (chr, mut intervals) in by_chr {
1059 let mut boundaries: Vec<u32> = Vec::with_capacity(intervals.len() * 2);
1061 for &(start, end) in &intervals {
1062 boundaries.push(start);
1063 boundaries.push(end);
1064 }
1065 boundaries.sort();
1066 boundaries.dedup();
1067
1068 intervals.sort_by_key(|&(start, _)| start);
1070
1071 for window in boundaries.windows(2) {
1072 let (seg_start, seg_end) = (window[0], window[1]);
1073 let covered = intervals
1076 .iter()
1077 .any(|&(start, end)| start <= seg_start && seg_end <= end);
1078 if covered {
1079 result.push(Region {
1080 chr: chr.clone(),
1081 start: seg_start,
1082 end: seg_end,
1083 rest: None,
1084 });
1085 }
1086 }
1087 }
1088 result.sort_by(|a, b| (&a.chr, a.start).cmp(&(&b.chr, b.start)));
1089 RegionSet::from(result)
1090 }
1091
1092 pub fn cluster(&self, max_gap: u32) -> Vec<u32> {
1094 if self.regions.is_empty() {
1095 return vec![];
1096 }
1097
1098 let n = self.regions.len();
1099 let mut result = vec![0u32; n];
1100
1101 let mut sorted_indices: Vec<usize> = (0..n).collect();
1103 sorted_indices.sort_by(|&i, &j| {
1104 self.regions[i]
1105 .chr
1106 .cmp(&self.regions[j].chr)
1107 .then(self.regions[i].start.cmp(&self.regions[j].start))
1108 .then(self.regions[i].end.cmp(&self.regions[j].end))
1109 });
1110
1111 let mut cluster_id: u32 = 0;
1112 let mut cluster_end = self.regions[sorted_indices[0]].end;
1113 let mut current_chr = &self.regions[sorted_indices[0]].chr;
1114 result[sorted_indices[0]] = cluster_id;
1115
1116 for &idx in &sorted_indices[1..] {
1117 let r = &self.regions[idx];
1118 if r.chr != *current_chr || r.start > cluster_end.saturating_add(max_gap) {
1119 cluster_id += 1;
1120 cluster_end = r.end;
1121 current_chr = &r.chr;
1122 } else {
1123 cluster_end = cluster_end.max(r.end);
1124 }
1125 result[idx] = cluster_id;
1126 }
1127
1128 result
1129 }
1130
1131 pub fn closest(&self, other: &RegionSet) -> Vec<(usize, usize, i64)> {
1133 if other.regions.is_empty() {
1134 return Vec::new();
1135 }
1136
1137 let mut other_by_chr: HashMap<String, Vec<(usize, &Region)>> = HashMap::new();
1139 for (idx, r) in other.regions.iter().enumerate() {
1140 other_by_chr
1141 .entry(r.chr.clone())
1142 .or_default()
1143 .push((idx, r));
1144 }
1145 for v in other_by_chr.values_mut() {
1146 v.sort_by_key(|(_, r)| r.start);
1147 }
1148
1149 let mut max_width_by_chr: HashMap<String, u32> = HashMap::new();
1151 for (chr, candidates) in &other_by_chr {
1152 let max_w = candidates.iter().map(|(_, r)| r.end - r.start).max().unwrap_or(0);
1153 max_width_by_chr.insert(chr.clone(), max_w);
1154 }
1155
1156 let mut result: Vec<(usize, usize, i64)> = Vec::new();
1157
1158 for (self_idx, a) in self.regions.iter().enumerate() {
1159 let Some(candidates) = other_by_chr.get(&a.chr) else {
1160 continue; };
1162
1163 let ins = candidates
1165 .binary_search_by_key(&a.start, |(_, r)| r.start)
1166 .unwrap_or_else(|x| x);
1167
1168 let gap_dist = |a_reg: &Region, b_reg: &Region| -> i64 {
1169 if a_reg.start < b_reg.end && b_reg.start < a_reg.end {
1170 0i64
1171 } else if b_reg.end <= a_reg.start {
1172 (a_reg.start as i64) - (b_reg.end as i64)
1173 } else {
1174 (b_reg.start as i64) - (a_reg.end as i64)
1175 }
1176 };
1177
1178 let max_width = *max_width_by_chr.get(&a.chr).unwrap_or(&0) as i64;
1179
1180 let mut best_other_idx = 0usize;
1181 let mut best_dist = i64::MAX;
1182
1183 let mut left_done = ins == 0;
1184 let mut right_done = ins >= candidates.len();
1185 let mut li = if ins > 0 { ins - 1 } else { 0 };
1186 let mut ri = ins;
1187
1188 while !left_done || !right_done {
1189 if !right_done {
1190 let (other_idx, b) = candidates[ri];
1191 let dist = gap_dist(a, b);
1192 if dist.abs() < best_dist.abs() {
1193 best_dist = dist;
1194 best_other_idx = other_idx;
1195 }
1196 if best_dist == 0 { break; }
1197 ri += 1;
1198 if ri >= candidates.len() || (b.start as i64 - a.end as i64 > best_dist.abs()) {
1199 right_done = true;
1200 }
1201 }
1202
1203 if !left_done {
1204 let (other_idx, b) = candidates[li];
1205 let dist = gap_dist(a, b);
1206 if dist.abs() < best_dist.abs() {
1207 best_dist = dist;
1208 best_other_idx = other_idx;
1209 }
1210 if best_dist == 0 { break; }
1211 if li == 0 || (a.start as i64 - b.start as i64 > best_dist.abs() + max_width) {
1212 left_done = true;
1213 } else {
1214 li -= 1;
1215 }
1216 }
1217 }
1218
1219 result.push((self_idx, best_other_idx, best_dist));
1220 }
1221
1222 result
1223 }
1224}
1225
1226pub fn sweep_setdiff_chr(chr: &str, a: &[Region], b: &[Region]) -> Vec<Region> {
1230 let mut result = Vec::new();
1231 let mut b_idx = 0;
1232
1233 for a_region in a {
1234 while b_idx < b.len() && b[b_idx].end <= a_region.start {
1235 b_idx += 1;
1236 }
1237
1238 let mut pos = a_region.start;
1239 let mut j = b_idx;
1240
1241 while j < b.len() && b[j].start < a_region.end && pos < a_region.end {
1242 if b[j].start > pos {
1243 result.push(Region {
1244 chr: chr.to_string(),
1245 start: pos,
1246 end: b[j].start,
1247 rest: None,
1248 });
1249 }
1250 pos = pos.max(b[j].end);
1251 j += 1;
1252 }
1253
1254 if pos < a_region.end {
1255 result.push(Region {
1256 chr: chr.to_string(),
1257 start: pos,
1258 end: a_region.end,
1259 rest: None,
1260 });
1261 }
1262 }
1263
1264 result
1265}
1266
1267pub fn sweep_intersect_chr(chr: &str, a: &[Region], b: &[Region]) -> Vec<Region> {
1269 let mut result = Vec::new();
1270 let mut b_idx = 0;
1271
1272 for a_region in a {
1273 while b_idx < b.len() && b[b_idx].end <= a_region.start {
1274 b_idx += 1;
1275 }
1276 let mut j = b_idx;
1277 while j < b.len() && b[j].start < a_region.end {
1278 let start = a_region.start.max(b[j].start);
1279 let end = a_region.end.min(b[j].end);
1280 if start < end {
1281 result.push(Region {
1282 chr: chr.to_string(),
1283 start,
1284 end,
1285 rest: None,
1286 });
1287 }
1288 j += 1;
1289 }
1290 }
1291
1292 result
1293}
1294
1295pub trait IntervalSetOps {
1303 fn setdiff(&self, other: &RegionSet) -> RegionSet;
1305
1306 fn intersect(&self, other: &RegionSet) -> RegionSet;
1308
1309 fn jaccard(&self, other: &RegionSet) -> f64;
1311
1312 fn coverage(&self, other: &RegionSet) -> f64;
1314
1315 fn overlap_coefficient(&self, other: &RegionSet) -> f64;
1317
1318 fn closest(&self, other: &RegionSet) -> Vec<(usize, usize, i64)>;
1322}
1323
1324impl IntervalSetOps for RegionSet {
1325 fn setdiff(&self, other: &RegionSet) -> RegionSet {
1326 let a = self.reduce();
1327 let b = other.reduce();
1328
1329 let mut b_by_chr: HashMap<String, Vec<Region>> = HashMap::new();
1330 for r in &b.regions {
1331 b_by_chr.entry(r.chr.clone()).or_default().push(r.clone());
1332 }
1333
1334 let mut result: Vec<Region> = Vec::new();
1335
1336 let mut a_chr_start = 0;
1337 while a_chr_start < a.regions.len() {
1338 let chr = &a.regions[a_chr_start].chr;
1339 let mut a_chr_end = a_chr_start;
1340 while a_chr_end < a.regions.len() && a.regions[a_chr_end].chr == *chr {
1341 a_chr_end += 1;
1342 }
1343
1344 let empty_vec = vec![];
1345 let b_chr = b_by_chr.get(chr.as_str()).unwrap_or(&empty_vec);
1346 result.extend(sweep_setdiff_chr(chr, &a.regions[a_chr_start..a_chr_end], b_chr));
1347
1348 a_chr_start = a_chr_end;
1349 }
1350
1351 RegionSet::from(result)
1352 }
1353
1354 fn intersect(&self, other: &RegionSet) -> RegionSet {
1355 let a = self.reduce();
1356 let b = other.reduce();
1357
1358 let mut b_by_chr: HashMap<String, Vec<Region>> = HashMap::new();
1359 for r in &b.regions {
1360 b_by_chr.entry(r.chr.clone()).or_default().push(r.clone());
1361 }
1362
1363 let mut result: Vec<Region> = Vec::new();
1364
1365 let mut a_i = 0;
1366 while a_i < a.regions.len() {
1367 let chr = &a.regions[a_i].chr;
1368 let mut a_end = a_i;
1369 while a_end < a.regions.len() && a.regions[a_end].chr == *chr {
1370 a_end += 1;
1371 }
1372
1373 if let Some(b_chr) = b_by_chr.get(chr.as_str()) {
1374 result.extend(sweep_intersect_chr(chr, &a.regions[a_i..a_end], b_chr));
1375 }
1376
1377 a_i = a_end;
1378 }
1379
1380 RegionSet::from(result)
1381 }
1382
1383 fn jaccard(&self, other: &RegionSet) -> f64 {
1384 let a_bp = self.reduce().nucleotides_length();
1385 let b_bp = other.reduce().nucleotides_length();
1386 let union_bp = self.union(other).nucleotides_length();
1387 if union_bp == 0 {
1388 return 0.0;
1389 }
1390 let intersection_bp = a_bp + b_bp - union_bp;
1391 intersection_bp as f64 / union_bp as f64
1392 }
1393
1394 fn coverage(&self, other: &RegionSet) -> f64 {
1395 let self_reduced = self.reduce();
1396 let self_bp = self_reduced.nucleotides_length();
1397 if self_bp == 0 {
1398 return 0.0;
1399 }
1400 let diff = self_reduced.setdiff(other);
1401 let diff_bp = diff.nucleotides_length();
1402 1.0 - (diff_bp as f64 / self_bp as f64)
1403 }
1404
1405 fn overlap_coefficient(&self, other: &RegionSet) -> f64 {
1406 let a_bp = self.reduce().nucleotides_length();
1407 let b_bp = other.reduce().nucleotides_length();
1408 let min_bp = a_bp.min(b_bp);
1409 if min_bp == 0 {
1410 return 0.0;
1411 }
1412 let union_bp = self.union(other).nucleotides_length();
1413 let intersection_bp = a_bp + b_bp - union_bp;
1414 intersection_bp as f64 / min_bp as f64
1415 }
1416
1417 fn closest(&self, other: &RegionSet) -> Vec<(usize, usize, i64)> {
1418 RegionSet::closest(self, other)
1419 }
1420}
1421
1422impl Display for RegionSet {
1423 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1424 write!(f, "RegionSet with {} regions.", self.len())
1425 }
1426}
1427
1428#[cfg(test)]
1429mod tests {
1430 use super::*;
1431
1432 use pretty_assertions::assert_eq;
1433 use rstest::*;
1434
1435 fn get_test_path(file_name: &str) -> PathBuf {
1436 let file_path: PathBuf = std::env::current_dir()
1437 .unwrap()
1438 .join("../tests/data/regionset")
1439 .join(file_name);
1440 file_path
1441 }
1442
1443 #[rstest]
1444 fn test_open_from_path() {
1445 let file_path = get_test_path("dummy.narrowPeak");
1446 assert!(RegionSet::try_from(file_path.as_path()).is_ok());
1447 }
1448
1449 #[rstest]
1450 fn test_open_from_string() {
1451 let file_path = get_test_path("dummy.narrowPeak");
1452 assert!(RegionSet::try_from(file_path.to_str().unwrap()).is_ok());
1453 }
1454
1455 #[rstest]
1456 #[ignore = "Hits encodeproject.org; the CDN blocks cloud/CI IPs. See test_open_from_url_matches_local for a self-contained version."]
1457 fn test_open_from_url() {
1458 let file_path = String::from(
1459 "https://www.encodeproject.org/files/ENCFF321QPN/@@download/ENCFF321QPN.bed.gz",
1460 );
1461 assert!(RegionSet::try_from(file_path).is_ok());
1462 }
1463
1464 #[cfg(feature = "http")]
1469 fn make_multimember_bed_gz(num_regions: usize) -> Vec<u8> {
1470 use flate2::write::GzEncoder;
1471 use flate2::Compression;
1472 use std::io::Write;
1473
1474 let lines: Vec<String> = (0..num_regions)
1475 .map(|i| format!("chr1\t{}\t{}\n", i * 100, i * 100 + 50))
1476 .collect();
1477
1478 let mid = lines.len() / 2;
1479 let mut out = Vec::new();
1480 for chunk in [&lines[..mid], &lines[mid..]] {
1481 let mut enc = GzEncoder::new(Vec::new(), Compression::default());
1482 for line in chunk {
1483 enc.write_all(line.as_bytes()).unwrap();
1484 }
1485 out.extend(enc.finish().unwrap());
1486 }
1487 out
1488 }
1489
1490 #[cfg(feature = "http")]
1495 #[rstest]
1496 fn test_open_from_url_matches_local() {
1497 use std::io::{Read, Write};
1498 use std::net::TcpListener;
1499
1500 const NUM_REGIONS: usize = 5000;
1501 let gz_bytes = make_multimember_bed_gz(NUM_REGIONS);
1502
1503 let tmp = tempfile::Builder::new().suffix(".bed.gz").tempfile().unwrap();
1505 tmp.as_file().write_all(&gz_bytes).unwrap();
1506 let from_file = RegionSet::try_from(tmp.path()).unwrap();
1507 assert_eq!(from_file.len(), NUM_REGIONS);
1508
1509 let listener = TcpListener::bind("127.0.0.1:0").unwrap();
1511 let port = listener.local_addr().unwrap().port();
1512 let body = gz_bytes.clone();
1513 let server = std::thread::spawn(move || {
1514 let (mut stream, _) = listener.accept().unwrap();
1515 let mut buf = [0u8; 1024];
1516 let _ = stream.read(&mut buf); let header = format!(
1518 "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nConnection: close\r\n\r\n",
1519 body.len()
1520 );
1521 stream.write_all(header.as_bytes()).unwrap();
1522 stream.write_all(&body).unwrap();
1523 });
1524
1525 let url = format!("http://127.0.0.1:{}/data.bed.gz", port);
1526 let from_url = RegionSet::try_from(url).unwrap();
1527 server.join().unwrap();
1528
1529 assert_eq!(from_url.len(), NUM_REGIONS);
1530 assert_eq!(from_url.len(), from_file.len());
1531 }
1532
1533 #[rstest]
1534 #[ignore = "Avoid BEDbase dependency in CI"]
1535 fn test_open_from_bedbase() {
1536 let bbid = String::from("6b2e163a1d4319d99bd465c6c78a9741");
1537 let region_set = RegionSet::try_from(bbid);
1538 assert_eq!(region_set.is_ok(), true);
1539 assert_eq!(
1540 region_set.unwrap().identifier(),
1541 "6b2e163a1d4319d99bd465c6c78a9741"
1542 );
1543 }
1544
1545 #[rstest]
1546 fn test_open_bed_gz() {
1547 let file_path = get_test_path("dummy.narrowPeak.bed.gz");
1548 assert!(RegionSet::try_from(file_path.to_str().unwrap()).is_ok());
1549 }
1550
1551 #[rstest]
1552 fn test_calculate_identifier() {
1553 let file_path = get_test_path("dummy.narrowPeak.bed.gz");
1554 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1555
1556 assert_eq!("f0b2cf73383b53bd97ff525a0380f200", region_set.identifier());
1557 }
1558
1559 #[rstest]
1560 fn test_closest_index_on_unsorted_other() {
1561 let mk = |chr: &str, start: u32, end: u32| Region {
1564 chr: chr.to_owned(),
1565 start,
1566 end,
1567 rest: None,
1568 };
1569
1570 let query = RegionSet::from(vec![mk("chr1", 100, 110)]);
1571 let other = RegionSet::from(vec![
1572 mk("chr1", 500, 510),
1573 mk("chr1", 120, 130),
1574 mk("chr1", 900, 910),
1575 ]);
1576
1577 let result = query.closest(&other);
1578
1579 assert_eq!(result, vec![(0, 1, 10)]);
1580 let (_, other_idx, dist) = result[0];
1581 assert_eq!(other_idx, 1);
1582 assert_eq!(other.regions[other_idx], mk("chr1", 120, 130));
1583 assert_eq!(dist, 10);
1584 }
1585
1586 #[rstest]
1587 fn test_save_bed_gz() {
1588 let file_path = get_test_path("dummy.narrowPeak.bed.gz");
1589 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1590
1591 let tempdir = tempfile::tempdir().unwrap();
1592
1593 let mut new_file_path = tempdir.keep();
1594 new_file_path.push("new_file.bed.gz");
1595
1596 assert!(region_set.to_bed_gz(new_file_path.as_path()).is_ok());
1597
1598 let new_region = RegionSet::try_from(new_file_path.as_path());
1599 assert!(new_region.is_ok());
1600 assert_eq!(new_region.unwrap().identifier(), region_set.identifier())
1601 }
1602
1603 #[rstest]
1604 fn test_save_bed() {
1605 let file_path = get_test_path("dummy.narrowPeak");
1606 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1607
1608 let tempdir = tempfile::tempdir().unwrap();
1609
1610 let mut new_file_path = tempdir.keep();
1611 new_file_path.push("new_bedfile.bed");
1612
1613 assert!(region_set.to_bed(new_file_path.as_path()).is_ok());
1614
1615 let new_region = RegionSet::try_from(new_file_path.as_path());
1616 assert!(new_region.is_ok());
1617 assert_eq!(new_region.unwrap().identifier(), region_set.identifier())
1618 }
1619
1620 #[cfg(feature = "bigbed")]
1621 #[rstest]
1622 fn test_save_bigbed() {
1623 let file_path = get_test_path("dummy.narrowPeak");
1624 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1625
1626 let chrom_sizes_path: PathBuf = std::env::current_dir()
1627 .unwrap()
1628 .join("../tests/data/regionset/dummy_chrom_sizes");
1629
1630 let tempdir = tempfile::tempdir().unwrap();
1631 let mut new_file_path = tempdir.keep();
1632 new_file_path.push("new.bigbed");
1633
1634 assert!(region_set
1635 .to_bigbed(new_file_path.as_path(), chrom_sizes_path.as_path())
1636 .is_ok());
1637 }
1638
1639 #[rstest]
1640 fn test_read_headers() {
1641 let file_path = get_test_path("dummy_headers.bed");
1642 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1643
1644 assert!(region_set.header.is_some());
1645 assert_eq!(region_set.path.unwrap(), file_path);
1646 }
1647
1648 #[rstest]
1649 fn test_is_empty() {
1650 let file_path = get_test_path("dummy_headers.bed");
1651 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1652
1653 assert!(!region_set.is_empty());
1654 }
1655
1656 #[rstest]
1657 fn test_file_digest() {
1658 let file_path = get_test_path("dummy.narrowPeak");
1659 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1660
1661 assert_eq!(region_set.file_digest(), "6224c4d40832b3e0889250f061e01120");
1662 assert_eq!(region_set.identifier(), "f0b2cf73383b53bd97ff525a0380f200")
1663 }
1664
1665 #[rstest]
1666 fn test_mean_region_width() {
1667 let file_path = get_test_path("dummy.narrowPeak");
1668 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1669
1670 assert_eq!(region_set.mean_region_width(), 4.22)
1671 }
1672 #[rstest]
1673 fn test_open_file_with_incorrect_headers() {
1674 let file_path = get_test_path("dummy_incorrect_headers.bed");
1675 let _region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1676 }
1677
1678 #[rstest]
1679 fn test_chr_length() {
1680 let file_path = get_test_path("dummy.narrowPeak");
1681 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1682 assert_eq!(*region_set.get_max_end_per_chr().get("chr1").unwrap(), 36);
1683 assert_eq!(region_set.get_max_end_per_chr().len(), 1)
1684 }
1685
1686 #[rstest]
1687 fn test_total_nucleotides_function() {
1688 let file_path = get_test_path("dummy.narrowPeak");
1689 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1690
1691 assert_eq!(region_set.nucleotides_length(), 38)
1692 }
1693
1694 #[rstest]
1695 fn test_iter_chroms() {
1696 let file_path = get_test_path("dummy.narrowPeak");
1697 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1698
1699 assert_eq!(region_set.iter_chroms().collect::<Vec<_>>().len(), 1)
1700 }
1701
1702 #[cfg(feature = "dataframe")]
1703 #[rstest]
1704 fn test_polars() {
1705 let file_path = get_test_path("dummy.narrowPeak");
1706 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1707 let rs_polars = region_set.to_polars().unwrap();
1708 println!("Number of columns: {:?}", rs_polars.get_columns().len());
1709 assert_eq!(rs_polars.get_columns().len(), 10);
1710 }
1711
1712 #[rstest]
1713 fn test_calc_mid_points() {
1714 let file_path = get_test_path("dummy.narrowPeak");
1715 let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1716
1717 let mid_points = region_set.calc_mid_points();
1718 assert_eq!(mid_points.get("chr1").unwrap().len(), 9);
1719 assert_eq!(mid_points.len(), 1);
1720 assert_eq!(
1721 mid_points
1722 .get("chr1")
1723 .unwrap()
1724 .iter()
1725 .min()
1726 .copied()
1727 .unwrap(),
1728 6u32
1729 );
1730 }
1731
1732 fn gaps_make_regionset(regions: Vec<(&str, u32, u32)>) -> RegionSet {
1733 RegionSet::from(
1734 regions
1735 .into_iter()
1736 .map(|(chr, start, end)| Region {
1737 chr: chr.to_string(),
1738 start,
1739 end,
1740 rest: None,
1741 })
1742 .collect::<Vec<Region>>(),
1743 )
1744 }
1745
1746 fn gaps_chrom_sizes(sizes: &[(&str, u32)]) -> HashMap<String, u32> {
1747 sizes
1748 .iter()
1749 .map(|(chr, size)| (chr.to_string(), *size))
1750 .collect()
1751 }
1752
1753 #[rstest]
1754 fn test_gaps_basic() {
1755 let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr1", 30, 40), ("chr1", 50, 60)]);
1758 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1759 let result = rs.gaps(&cs);
1760 let gaps: Vec<(&str, u32, u32)> = result
1761 .regions
1762 .iter()
1763 .map(|r| (r.chr.as_str(), r.start, r.end))
1764 .collect();
1765 assert_eq!(
1766 gaps,
1767 vec![
1768 ("chr1", 0, 10), ("chr1", 20, 30), ("chr1", 40, 50), ("chr1", 60, 100), ]
1773 );
1774 }
1775
1776 #[rstest]
1777 fn test_gaps_peak_at_origin_no_leading() {
1778 let rs = gaps_make_regionset(vec![("chr1", 0, 10), ("chr1", 20, 30)]);
1780 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1781 let result = rs.gaps(&cs);
1782 let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1783 assert_eq!(gaps, vec![(10, 20), (30, 100)]);
1784 }
1785
1786 #[rstest]
1787 fn test_gaps_peak_at_chrom_end_no_trailing() {
1788 let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 100)]);
1790 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1791 let result = rs.gaps(&cs);
1792 let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1793 assert_eq!(gaps, vec![(0, 10), (20, 80)]);
1794 }
1795
1796 #[rstest]
1797 fn test_gaps_peak_past_chrom_end_clipped() {
1798 let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 150)]);
1800 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1801 let result = rs.gaps(&cs);
1802 let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1803 assert_eq!(gaps, vec![(0, 10), (20, 80)]);
1804 }
1805
1806 #[rstest]
1807 fn test_gaps_empty_regionset_populated_chrom_sizes() {
1808 let rs = RegionSet::from(Vec::<Region>::new());
1810 let cs = gaps_chrom_sizes(&[("chr1", 100), ("chr2", 50)]);
1811 let result = rs.gaps(&cs);
1812 let mut gaps: Vec<(String, u32, u32)> = result
1813 .regions
1814 .iter()
1815 .map(|r| (r.chr.clone(), r.start, r.end))
1816 .collect();
1817 gaps.sort();
1818 assert_eq!(
1819 gaps,
1820 vec![
1821 ("chr1".to_string(), 0, 100),
1822 ("chr2".to_string(), 0, 50),
1823 ]
1824 );
1825 }
1826
1827 #[rstest]
1828 fn test_gaps_chromosome_not_in_chrom_sizes_skipped() {
1829 let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr2", 5, 15)]);
1831 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1832 let result = rs.gaps(&cs);
1833 for r in &result.regions {
1835 assert_eq!(r.chr, "chr1");
1836 }
1837 }
1838
1839 #[rstest]
1840 fn test_gaps_full_chrom_gap_for_unrepresented_chrom() {
1841 let rs = gaps_make_regionset(vec![("chr1", 10, 20)]);
1843 let cs = gaps_chrom_sizes(&[("chr1", 100), ("chr2", 200)]);
1844 let result = rs.gaps(&cs);
1845 let chr2_gaps: Vec<(u32, u32)> = result
1846 .regions
1847 .iter()
1848 .filter(|r| r.chr == "chr2")
1849 .map(|r| (r.start, r.end))
1850 .collect();
1851 assert_eq!(chr2_gaps, vec![(0, 200)]);
1852 }
1853
1854 #[rstest]
1855 fn test_gaps_overlapping_peaks_reduced() {
1856 let rs = gaps_make_regionset(vec![
1858 ("chr1", 10, 30),
1859 ("chr1", 25, 40), ("chr1", 50, 60),
1861 ]);
1862 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1863 let result = rs.gaps(&cs);
1864 let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1865 assert_eq!(gaps, vec![(0, 10), (40, 50), (60, 100)]);
1867 }
1868
1869 #[rstest]
1870 fn test_gaps_karyotypic_ordering() {
1871 let rs = gaps_make_regionset(vec![
1873 ("chr2", 10, 20),
1874 ("chr1", 10, 20),
1875 ("chr10", 10, 20),
1876 ]);
1877 let cs = gaps_chrom_sizes(&[("chr10", 100), ("chr1", 100), ("chr2", 100)]);
1878 let result = rs.gaps(&cs);
1879 let order: Vec<&str> = result
1881 .regions
1882 .iter()
1883 .map(|r| r.chr.as_str())
1884 .scan("", |prev, chr| {
1885 if chr != *prev {
1886 *prev = chr;
1887 Some(chr)
1888 } else {
1889 Some("") }
1891 })
1892 .filter(|s| !s.is_empty())
1893 .collect();
1894 assert_eq!(order, vec!["chr1", "chr2", "chr10"]);
1895 }
1896
1897 #[rstest]
1898 fn test_gaps_fully_covered_chrom_no_gaps() {
1899 let rs = gaps_make_regionset(vec![("chr1", 0, 100)]);
1901 let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1902 let result = rs.gaps(&cs);
1903 assert!(result.regions.is_empty());
1904 }
1905
1906 fn concat_make_region(chr: &str, start: u32, end: u32) -> Region {
1907 Region {
1908 chr: chr.to_owned(),
1909 start,
1910 end,
1911 rest: None,
1912 }
1913 }
1914
1915 #[rstest]
1916 fn test_concat_into_matches_concat() {
1917 let a_regions = vec![
1920 concat_make_region("chr1", 100, 200),
1921 concat_make_region("chr2", 50, 60),
1922 ];
1923 let b_regions = vec![
1924 concat_make_region("chr1", 150, 250),
1925 concat_make_region("chr3", 10, 20),
1926 ];
1927
1928 let a = RegionSet::from(a_regions.clone());
1929 let b = RegionSet::from(b_regions.clone());
1930 let borrowed = a.concat(&b);
1931
1932 let a2 = RegionSet::from(a_regions);
1933 let b2 = RegionSet::from(b_regions);
1934 let consumed = a2.concat_into(b2);
1935
1936 assert_eq!(borrowed.regions, consumed.regions);
1937 }
1938
1939 #[rstest]
1940 fn test_concat_into_consumes_inputs() {
1941 let rs1 = RegionSet::from(vec![
1944 concat_make_region("chr1", 100, 200),
1945 concat_make_region("chr1", 300, 400),
1946 ]);
1947 let rs2 = RegionSet::from(vec![concat_make_region("chr2", 0, 10)]);
1948
1949 let combined = rs1.concat_into(rs2);
1950 assert_eq!(combined.regions.len(), 3);
1951 }
1952
1953 #[rstest]
1954 fn test_concat_into_empty() {
1955 let non_empty = vec![
1956 concat_make_region("chr1", 100, 200),
1957 concat_make_region("chr2", 5, 15),
1958 ];
1959
1960 let empty = RegionSet::from(Vec::<Region>::new());
1962 let ne = RegionSet::from(non_empty.clone());
1963 let result = empty.concat_into(ne);
1964 assert_eq!(result.regions, non_empty);
1965
1966 let ne2 = RegionSet::from(non_empty.clone());
1968 let empty2 = RegionSet::from(Vec::<Region>::new());
1969 let result2 = ne2.concat_into(empty2);
1970 assert_eq!(result2.regions, non_empty);
1971 }
1972
1973 #[rstest]
1974 fn test_union_into_matches_union() {
1975 let a_regions = vec![
1978 concat_make_region("chr1", 100, 200),
1979 concat_make_region("chr2", 0, 50),
1980 ];
1981 let b_regions = vec![
1982 concat_make_region("chr1", 150, 250),
1983 concat_make_region("chr3", 10, 20),
1984 ];
1985
1986 let a = RegionSet::from(a_regions.clone());
1987 let b = RegionSet::from(b_regions.clone());
1988 let borrowed = a.union(&b);
1989
1990 let a2 = RegionSet::from(a_regions);
1991 let b2 = RegionSet::from(b_regions);
1992 let consumed = a2.union_into(b2);
1993
1994 assert_eq!(borrowed.regions, consumed.regions);
1995 }
1996}