Skip to main content

gtars_core/models/
region_set.rs

1use std::fs::File;
2use std::io::BufRead;
3use std::path::{Path, PathBuf};
4
5use md5::{Digest, Md5};
6
7use flate2::write::GzEncoder;
8use flate2::Compression;
9use std::collections::{HashMap, HashSet};
10use std::fmt::Debug;
11use std::fmt::{self, Display};
12use std::io::{BufWriter, Write};
13#[cfg(feature = "bigbed")]
14use tokio::runtime;
15
16#[cfg(feature = "bigbed")]
17use bigtools::beddata::BedParserStreamingIterator;
18#[cfg(feature = "bigbed")]
19use bigtools::{BedEntry, BigBedWrite};
20
21use crate::errors::RegionSetError;
22use crate::models::Region;
23#[cfg(feature = "bigbed")]
24use crate::utils::get_chrom_sizes;
25use crate::utils::get_dynamic_reader;
26#[cfg(feature = "http")]
27use crate::utils::get_dynamic_reader_from_url;
28
29#[cfg(feature = "dataframe")]
30use polars::prelude::*;
31#[cfg(feature = "dataframe")]
32use std::io::Cursor;
33
34///
35/// RegionSet struct, the representation of the interval region set file,
36/// such as bed file.
37///
38#[derive(Clone, Debug)]
39#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
40pub struct RegionSet {
41    pub regions: Vec<Region>,
42    pub header: Option<String>,
43    #[cfg_attr(feature = "serde", serde(skip))]
44    pub path: Option<PathBuf>,
45}
46
47pub struct RegionSetIterator<'a> {
48    region_set: &'a RegionSet,
49    index: usize,
50}
51
52impl TryFrom<&Path> for RegionSet {
53    type Error = RegionSetError;
54
55    ///
56    /// Create a new [RegionSet] from a bed file.
57    ///
58    /// # Arguments:
59    /// - value: path to bed file on disk.
60    fn try_from(value: &Path) -> Result<Self, RegionSetError> {
61        let path = value;
62
63        let mut new_regions: Vec<Region> = Vec::new();
64
65        let reader = match path.is_file() {
66            true => get_dynamic_reader(path)
67                .map_err(|e| RegionSetError::FileReadError(e.to_string()))?,
68            #[cfg(feature = "http")]
69            false => {
70                match get_dynamic_reader_from_url(path) {
71                    Ok(reader) => reader,
72                    Err(_) => {
73                        return Err(RegionSetError::InvalidPathOrUrl(format!("{:?}", path)));
74
75                        // // This code should be disabled, because it potentially breaks bedboss pipeline
76                        // // BEDbase identifiers are 32-character MD5 hashes
77                        // if bbid.len() != 32 {
78                        //     return Err(RegionSetError::InvalidPathOrUrl(format!("{:?}", path)));
79                        // }
80                        //
81                        // let fallback_url = format!(
82                        //     "https://api.bedbase.org/v1/files/files/{}/{}/{}.bed.gz",
83                        //     &bbid[0..1],
84                        //     &bbid[1..2],
85                        //     bbid
86                        // );
87                        //
88                        // let fallback_path = PathBuf::from(fallback_url);
89                        //
90                        // get_dynamic_reader_from_url(&fallback_path)
91                        //     .map_err(|e| RegionSetError::BedbaseFetchError(e.to_string()))?
92                    }
93                }
94            }
95            #[cfg(not(feature = "http"))]
96            false => {
97                return Err(RegionSetError::HttpFeatureDisabled(
98                    path.display().to_string(),
99                ));
100            }
101        };
102
103        let mut header: String = String::new();
104
105        let mut first_line: bool = true;
106
107        for line in reader.lines() {
108            let string_line = line?;
109
110            let parts: Vec<String> = string_line.split('\t').map(|s| s.to_string()).collect();
111
112            if string_line.starts_with("browser")
113                | string_line.starts_with("track")
114                | string_line.starts_with("#")
115            {
116                header.push_str(&string_line);
117                first_line = false;
118                continue;
119            }
120
121            // Handling column headers like `chr start end etc` without #
122            if first_line {
123                if parts.len() >= 3 {
124                    let is_header: bool = match parts[1].parse::<u32>() {
125                        Ok(_num) => false,
126                        Err(_) => true,
127                    };
128                    if is_header {
129                        header.push_str(&string_line);
130                        first_line = false;
131                        continue;
132                    }
133                }
134                first_line = false;
135            }
136
137            if parts.len() < 3 {
138                return Err(RegionSetError::RegionParseError(format!(
139                    "Error in parsing start position: {:?}",
140                    parts
141                )));
142            }
143
144            new_regions.push(Region {
145                chr: parts[0].to_owned(),
146
147                // To ensure that lines are regions, and we can parse it, we are using Result matching
148                start: match parts[1].parse() {
149                    Ok(start) => start,
150                    Err(_err) => {
151                        return Err(RegionSetError::RegionParseError(format!(
152                            "Error in parsing start position: {:?}",
153                            parts
154                        )));
155                    }
156                },
157                end: match parts[2].parse() {
158                    Ok(end) => end,
159                    Err(_err) => {
160                        return Err(RegionSetError::RegionParseError(format!(
161                            "Error in parsing end position: {:?}",
162                            parts
163                        )));
164                    }
165                },
166                rest: Some(parts[3..].join("\t")).filter(|s| !s.is_empty()),
167            });
168        }
169        if new_regions.is_empty() {
170            return Err(RegionSetError::EmptyRegionSet(path.display().to_string()));
171        }
172
173        let mut rs = RegionSet {
174            regions: new_regions,
175            header: match header.is_empty() {
176                true => None,
177                false => Some(header),
178            },
179            path: Some(value.to_owned()),
180        };
181        // This line needed for correct calculate identifier and to bigbed function
182        rs.sort();
183
184        Ok(rs)
185    }
186}
187
188impl TryFrom<&str> for RegionSet {
189    type Error = RegionSetError;
190
191    fn try_from(value: &str) -> Result<Self, RegionSetError> {
192        RegionSet::try_from(Path::new(value))
193    }
194}
195
196impl TryFrom<String> for RegionSet {
197    type Error = RegionSetError;
198
199    fn try_from(value: String) -> Result<Self, RegionSetError> {
200        RegionSet::try_from(Path::new(&value))
201    }
202}
203
204impl TryFrom<PathBuf> for RegionSet {
205    type Error = RegionSetError;
206
207    fn try_from(value: PathBuf) -> Result<Self, RegionSetError> {
208        RegionSet::try_from(value.as_path())
209    }
210}
211
212impl From<Vec<Region>> for RegionSet {
213    fn from(regions: Vec<Region>) -> Self {
214        RegionSet {
215            regions,
216            header: None,
217            path: None,
218        }
219    }
220}
221
222impl From<&[u8]> for RegionSet {
223    fn from(value: &[u8]) -> Self {
224        let region_str = String::from_utf8_lossy(value);
225        let regions: Vec<Region> = region_str
226            .split('\n')
227            .map(|line| {
228                let parts = line.split('\t').collect::<Vec<&str>>();
229
230                let chr = parts[0].to_string();
231                let start = parts[1].parse::<u32>().unwrap();
232                let end = parts[2].parse::<u32>().unwrap();
233                let rest = Some(parts[3..].join("\t")).filter(|s| !s.is_empty());
234
235                Region {
236                    chr,
237                    start,
238                    end,
239                    rest,
240                }
241            })
242            .collect();
243
244        RegionSet {
245            regions,
246            header: None,
247            path: None,
248        }
249    }
250}
251
252impl<'a> Iterator for RegionSetIterator<'a> {
253    type Item = &'a Region;
254
255    fn next(&mut self) -> Option<Self::Item> {
256        if self.index < self.region_set.regions.len() {
257            let region = &self.region_set.regions[self.index];
258            self.index += 1;
259            Some(region)
260        } else {
261            None
262        }
263    }
264}
265
266impl<'a> IntoIterator for &'a RegionSet {
267    type Item = &'a Region;
268    type IntoIter = RegionSetIterator<'a>;
269
270    fn into_iter(self) -> Self::IntoIter {
271        RegionSetIterator {
272            region_set: self,
273            index: 0,
274        }
275    }
276}
277
278impl RegionSet {
279    ///
280    /// Save a regionset to disk as bed file
281    ///
282    /// # Arguments
283    /// - path: the path to the file to dump to
284    pub fn to_bed<T: AsRef<Path>>(&self, path: T) -> std::io::Result<()> {
285        let path = path.as_ref();
286        if path.exists() {
287            println!("Bed file already exists. Overwriting existing file")
288        }
289
290        if let Some(parent) = path.parent() {
291            std::fs::create_dir_all(parent)?;
292        }
293
294        let mut file = File::create(path).unwrap();
295
296        for region in &self.regions {
297            writeln!(file, "{}", region.as_string())?;
298        }
299        Ok(())
300    }
301
302    ///
303    /// Save a regionset to disk as bed.gz file
304    ///
305    /// # Arguments
306    /// - path: the path to the file to dump to
307    pub fn to_bed_gz<T: AsRef<Path>>(&self, path: T) -> std::io::Result<()> {
308        let path = path.as_ref();
309        if path.exists() {
310            println!("Bed file already exists. Overwriting existing file")
311        }
312
313        if let Some(parent) = path.parent() {
314            std::fs::create_dir_all(parent)?;
315        }
316
317        let file = File::create(path)?;
318        let mut buffer: String = String::new();
319
320        for region in &self.regions {
321            buffer.push_str(&format!("{}\n", region.as_string(),));
322        }
323
324        let mut encoder = GzEncoder::new(BufWriter::new(file), Compression::best());
325        encoder.write_all(buffer.as_bytes())?;
326
327        Ok(())
328    }
329
330    ///
331    /// Calculate identifier for RegionSet
332    ///
333    /// This function doesn't sort file, and identifer is based on
334    /// unsorted first 3 columns.
335    ///
336    /// # Returns
337    /// String containing RegionSet identifier
338    pub fn identifier(&self) -> String {
339        let mut chrs = String::new();
340        let mut starts = String::new();
341        let mut ends = String::new();
342
343        let mut first = true;
344        for region in &self.regions {
345            if !first {
346                chrs.push(',');
347                starts.push(',');
348                ends.push(',');
349            }
350            chrs.push_str(&region.chr);
351            starts.push_str(&region.start.to_string());
352            ends.push_str(&region.end.to_string());
353
354            first = false;
355        }
356
357        // Update hasher with input data
358
359        let mut hasher = Md5::new();
360        hasher.update(chrs);
361        let chrom_hash = hasher.finalize();
362
363        let mut hasher = Md5::new();
364        hasher.update(starts);
365        let start_hash = hasher.finalize();
366
367        let mut hasher = Md5::new();
368        hasher.update(ends);
369        let end_hash = hasher.finalize();
370
371        let combined = format!("{:x},{:x},{:x}", chrom_hash, start_hash, end_hash);
372
373        let mut hasher = Md5::new();
374        hasher.update(combined);
375        let hash = hasher.finalize();
376        let bed_digest: String = format!("{:x}", hash);
377
378        bed_digest
379    }
380
381    pub fn file_digest(&self) -> String {
382        let mut buffer: String = String::new();
383        for region in &self.regions {
384            buffer.push_str(&format!("{}\n", region.as_string(),));
385        }
386
387        let mut hasher = Md5::new();
388
389        hasher.update(buffer);
390        let hash = hasher.finalize();
391        let file_digest: String = format!("{:x}", hash);
392
393        file_digest
394    }
395
396    ///
397    /// Iterate unique chromosomes located in RegionSet
398    ///
399    pub fn iter_chroms(&self) -> impl Iterator<Item = &String> {
400        let mut seen = HashSet::new();
401        let mut unique_chroms = Vec::new();
402        for r in &self.regions {
403            if seen.insert(&r.chr) {
404                unique_chroms.push(&r.chr);
405            }
406        }
407        unique_chroms.into_iter()
408    }
409
410    ///
411    /// Iterate through regions located on specific Chromosome in RegionSet
412    ///
413    /// # Arguments
414    /// - chr: chromosome name
415    ///
416    pub fn iter_chr_regions<'a>(&'a self, chr: &'a str) -> impl Iterator<Item = &'a Region> {
417        self.regions.iter().filter(move |r| r.chr == chr)
418    }
419
420    ///
421    /// Save RegionSet as bigBed (binary version of bed file)
422    ///
423    /// # Arguments
424    /// - out_path: the path to the bigbed file which should be created
425    /// - chrom_size: the path to chrom sizes file
426    ///
427    #[cfg(feature = "bigbed")]
428    pub fn to_bigbed<T: AsRef<Path>>(
429        &self,
430        out_path: T,
431        chrom_size: T,
432    ) -> Result<(), RegionSetError> {
433        let out_path = out_path.as_ref();
434
435        if out_path.exists() {
436            println!("Bed file already exists. Overwriting existing file")
437        }
438
439        if let Some(parent) = out_path.parent() {
440            std::fs::create_dir_all(parent)?;
441        }
442        let chrom_sizes: HashMap<String, u32> = get_chrom_sizes(chrom_size);
443
444        let mut warnings_count: i32 = 0;
445        let region_vector = self.regions.iter().map(|i| {
446            // This if it is removing regions that are not in chrom sizes file.
447            if !chrom_sizes.contains_key(&i.chr) {
448                eprintln!(
449                    "Warning:: Chromosome is not found in Chrom sizes file. Chr: '{}'",
450                    i.chr
451                );
452                if warnings_count > 40 {
453                    panic!("Incorrect chrom sizes provided. Unable to create bigBed file!");
454                }
455
456                warnings_count += 1;
457                return None;
458            }
459            Some(Ok::<_, std::io::Error>((
460                i.chr.clone(),
461                BedEntry {
462                    start: i.start,
463                    end: i.end,
464                    rest: i
465                        .rest
466                        .as_deref()
467                        .map_or(String::new(), |s| format!("\t{}", s)),
468                },
469            )))
470        });
471
472        #[allow(clippy::option_filter_map)]
473        // I like this because its more readable and clear whats going on
474        let region_vector = region_vector.filter(|e| e.is_some()).map(|e| e.unwrap());
475
476        let runtime = runtime::Builder::new_multi_thread()
477            .worker_threads(
478                std::thread::available_parallelism()
479                    .map(|c| c.into())
480                    .unwrap_or(1),
481            )
482            .build()
483            .expect("Unable to create thread pool.");
484
485        let mut bb_out = BigBedWrite::create_file(out_path, chrom_sizes.clone())
486            .expect("Failed to create bigBed file.");
487
488        bb_out.options.max_zooms = 8;
489
490        let data = BedParserStreamingIterator::wrap_iter(region_vector.into_iter(), true);
491        bb_out
492            .write(data, runtime)
493            .map_err(|e| RegionSetError::BigBedError(e.to_string()))?;
494        Ok(())
495    }
496
497    ///
498    /// Sort bed file based on first 3 columns.
499    /// Sorting is happening inside the object,
500    /// where original order will be overwritten
501    ///
502    pub fn sort(&mut self) {
503        self.regions
504            .sort_by(|a, b| a.chr.cmp(&b.chr).then_with(|| a.start.cmp(&b.start)));
505    }
506
507    ///
508    /// Is regionSet empty?
509    ///
510    pub fn is_empty(&self) -> bool {
511        if self.regions.len() == 0 {
512            return true;
513        }
514        false
515    }
516
517    ///
518    /// Calculate all regions width
519    ///
520    pub fn region_widths(&self) -> Vec<u32> {
521        self.regions.iter().map(|region| region.width()).collect()
522    }
523
524    ///
525    /// Calculate mean region width for whole RegionSet
526    ///
527    pub fn mean_region_width(&self) -> f64 {
528        let sum: u32 = self
529            .regions
530            .iter()
531            .map(|region| region.end - region.start)
532            .sum();
533        let count: u32 = self.regions.len() as u32;
534
535        // must be f64 because python doesn't understand f32
536        ((sum as f64 / count as f64) * 100.0).round() / 100.0
537    }
538
539    ///
540    /// Calculate middle point for each region, and return hashmap with midpoints for each chromosome
541    ///
542    pub fn calc_mid_points(&self) -> HashMap<String, Vec<u32>> {
543        let mut mid_points: HashMap<String, Vec<u32>> = HashMap::new();
544        for chromosome in self.iter_chroms() {
545            let mut chr_mid_points: Vec<u32> = Vec::new();
546            for region in self.iter_chr_regions(chromosome) {
547                chr_mid_points.push(region.mid_point());
548            }
549            mid_points.insert(chromosome.clone(), chr_mid_points);
550        }
551        mid_points
552    }
553
554    /// Calculate midpoints using the specified coordinate convention.
555    ///
556    /// See [`Region::mid_point_with_mode`] for details on how each mode computes the midpoint.
557    pub fn calc_mid_points_with_mode(
558        &self,
559        mode: super::coords::CoordinateMode,
560    ) -> HashMap<String, Vec<u32>> {
561        let mut mid_points: HashMap<String, Vec<u32>> = HashMap::new();
562        for chromosome in self.iter_chroms() {
563            let mut chr_mid_points: Vec<u32> = Vec::new();
564            for region in self.iter_chr_regions(chromosome) {
565                chr_mid_points.push(region.mid_point_with_mode(mode));
566            }
567            mid_points.insert(chromosome.clone(), chr_mid_points);
568        }
569        mid_points
570    }
571
572    ///
573    /// Get number of regions in RegionSet
574    ///
575    /// Returns:
576    /// number of regions
577    pub fn len(&self) -> usize {
578        self.regions.len()
579    }
580
581    ///
582    /// Get the furthest region location for each region
583    ///
584    pub fn get_max_end_per_chr(&self) -> HashMap<String, u32> {
585        let mut result: HashMap<String, u32> = HashMap::new();
586
587        let mut current_chr: &String = &self.regions[0].chr;
588        let mut max_end: u32 = self.regions[0].end;
589
590        for r in &self.regions[1..] {
591            if &r.chr == current_chr {
592                // Same chromosome → update max end
593                max_end = max_end.max(r.end);
594            } else {
595                // Chromosome changed → store previous one
596                result.insert(current_chr.clone(), max_end);
597                current_chr = &r.chr;
598                max_end = r.end;
599            }
600        }
601
602        // Store the last chromosome
603        result.insert(current_chr.clone(), max_end);
604
605        result
606    }
607
608    ///
609    /// Get total nucleotide count
610    ///
611    pub fn nucleotides_length(&self) -> u32 {
612        let mut total_count: u32 = 0;
613        for r in &self.regions {
614            total_count += r.width();
615        }
616        total_count
617    }
618
619    ///
620    /// Create Polars DataFrame
621    ///
622    #[cfg(feature = "dataframe")]
623    pub fn to_polars(&self) -> PolarsResult<DataFrame> {
624        // Convert regions to tab-separated string format
625        let data: String = self
626            .regions
627            .iter()
628            .map(|region| {
629                if let Some(rest) = region.rest.as_deref() {
630                    format!("{}\t{}\t{}\t{}", region.chr, region.start, region.end, rest,)
631                } else {
632                    format!("{}\t{}\t{}", region.chr, region.start, region.end,)
633                }
634            })
635            .collect::<Vec<_>>()
636            .join("\n");
637
638        let cursor = Cursor::new(data);
639
640        let df = CsvReadOptions::default()
641            .with_has_header(false)
642            .map_parse_options(|parse_options| parse_options.with_separator(b'\t'))
643            .with_infer_schema_length(Some(10000))
644            .into_reader_with_file_handle(cursor)
645            .finish()?;
646
647        Ok(df)
648    }
649}
650
651// ── SortedRegionSet ─────────────────────────────────────────────────────
652//
653// A newtype wrapper that guarantees the inner RegionSet is sorted by (chr, start).
654
655/// A RegionSet whose regions are sorted by (chr, start).
656///
657/// Created via `SortedRegionSet::new(rs)`, which sorts in place.
658pub struct SortedRegionSet(pub RegionSet);
659
660impl SortedRegionSet {
661    /// Consume a RegionSet and sort it in place.
662    pub fn new(mut rs: RegionSet) -> Self {
663        rs.sort();
664        SortedRegionSet(rs)
665    }
666}
667
668// ── Structural interval operations (inherent methods) ───────────────────
669
670impl RegionSet {
671    /// Merge overlapping and adjacent intervals per chromosome.
672    ///
673    /// Sorts by (chr, start), then sweeps to merge intervals where
674    /// `next.start <= current.end`. Returns a minimal set of non-overlapping regions.
675    pub fn reduce(&self) -> RegionSet {
676        if self.regions.is_empty() {
677            return RegionSet::from(Vec::<Region>::new());
678        }
679
680        let sorted = SortedRegionSet::new(RegionSet::from(self.regions.clone()));
681        let regions = &sorted.0.regions;
682
683        let mut merged: Vec<Region> = Vec::new();
684        let mut current = regions[0].clone();
685
686        for r in &regions[1..] {
687            if r.chr == current.chr && r.start <= current.end {
688                current.end = current.end.max(r.end);
689            } else {
690                merged.push(Region {
691                    chr: current.chr.clone(),
692                    start: current.start,
693                    end: current.end,
694                    rest: None,
695                });
696                current = r.clone();
697            }
698        }
699        merged.push(Region {
700            chr: current.chr,
701            start: current.start,
702            end: current.end,
703            rest: None,
704        });
705
706        RegionSet::from(merged)
707    }
708
709    /// Combine two region sets without merging overlapping intervals.
710    pub fn concat(&self, other: &RegionSet) -> RegionSet {
711        let mut regions = self.regions.clone();
712        regions.extend(other.regions.iter().cloned());
713        RegionSet::from(regions)
714    }
715
716    /// Combine two region sets without merging overlapping intervals,
717    /// consuming both sets.
718    ///
719    /// Like [`RegionSet::concat`], but takes ownership of `self` and `other`
720    /// so the backing `Vec<Region>`s are moved instead of cloned. Prefer this
721    /// when neither input is needed afterward. As with `concat`, the resulting
722    /// set has no `header` or `path` (it is a pure-regions set).
723    pub fn concat_into(mut self, other: RegionSet) -> RegionSet {
724        self.regions.extend(other.regions);
725        RegionSet::from(self.regions)
726    }
727
728    /// Merge two region sets into a minimal non-overlapping set.
729    ///
730    /// Equivalent to `self.concat(other).reduce()`.
731    pub fn union(&self, other: &RegionSet) -> RegionSet {
732        self.concat(other).reduce()
733    }
734
735    /// Merge two region sets into a minimal non-overlapping set, consuming both.
736    ///
737    /// Equivalent to `self.concat_into(other).reduce()`. Saves the concat-stage
738    /// clones that [`RegionSet::union`] incurs; `reduce` still allocates internally.
739    pub fn union_into(self, other: RegionSet) -> RegionSet {
740        self.concat_into(other).reduce()
741    }
742
743    /// Clamp regions to chromosome boundaries.
744    pub fn trim(&self, chrom_sizes: &HashMap<String, u32>) -> RegionSet {
745        let regions: Vec<Region> = self
746            .regions
747            .iter()
748            .filter_map(|r| {
749                let chrom_size = chrom_sizes.get(&r.chr)?;
750                let start = r.start.min(*chrom_size);
751                let end = r.end.min(*chrom_size);
752                if start > end {
753                    None
754                } else {
755                    Some(Region {
756                        chr: r.chr.clone(),
757                        start,
758                        end,
759                        rest: None,
760                    })
761                }
762            })
763            .collect();
764        RegionSet::from(regions)
765    }
766
767    /// Return the gaps between regions per chromosome, bounded by chromosome sizes.
768    ///
769    /// Reduces the input first, then emits intervals that tile the peak-free
770    /// regions of each chromosome listed in `chrom_sizes`:
771    ///
772    /// - a **leading gap** from position 0 to the first region's start
773    ///   (omitted if the first region starts at 0),
774    /// - an **inter-region gap** between each consecutive pair of reduced
775    ///   regions,
776    /// - a **trailing gap** from the last region's end to the chromosome
777    ///   size (omitted if the last region already reaches the chromosome
778    ///   end, or extends past it due to assembly mismatch),
779    /// - a **full-chromosome gap** `0..chrom_size` for any chromosome in
780    ///   `chrom_sizes` that has no regions at all.
781    ///
782    /// Regions on chromosomes not present in `chrom_sizes` are skipped.
783    /// Regions that extend past the stated chromosome size are clipped to
784    /// `chrom_size` when computing the trailing gap, matching the
785    /// clipping behavior of `trim()`.
786    pub fn gaps(&self, chrom_sizes: &HashMap<String, u32>) -> RegionSet {
787        let reduced = self.reduce();
788
789        // Group reduced regions by chromosome so we can emit per-chrom gaps
790        // and also detect chromosomes with zero regions (full-chrom gaps).
791        let mut by_chr: HashMap<&str, Vec<&Region>> = HashMap::new();
792        for r in &reduced.regions {
793            // Skip chromosomes we don't have a size for — can't bound trailing gaps,
794            // and including leading gaps for unknown-size chromosomes is misleading.
795            if chrom_sizes.contains_key(&r.chr) {
796                by_chr.entry(r.chr.as_str()).or_default().push(r);
797            }
798        }
799
800        let mut result: Vec<Region> = Vec::new();
801
802        // Emit gaps for every chromosome named in chrom_sizes, not just those
803        // present in the input — this way a chromosome with zero regions
804        // contributes a full-chromosome gap, matching bedtools complement.
805        for (chr_name, &chrom_size) in chrom_sizes.iter() {
806            if chrom_size == 0 {
807                continue;
808            }
809
810            match by_chr.get(chr_name.as_str()) {
811                None => {
812                    // No regions on this chromosome — whole chromosome is a gap.
813                    result.push(Region {
814                        chr: chr_name.clone(),
815                        start: 0,
816                        end: chrom_size,
817                        rest: None,
818                    });
819                }
820                Some(regions) => {
821                    // Leading gap from 0 to the first region's start.
822                    if regions[0].start > 0 {
823                        // Leading gap is clipped to chrom_size as a safety net:
824                        // if the first region starts past chrom_size (assembly
825                        // mismatch) we still produce a valid [0, chrom_size) gap.
826                        let lead_end = regions[0].start.min(chrom_size);
827                        result.push(Region {
828                            chr: chr_name.clone(),
829                            start: 0,
830                            end: lead_end,
831                            rest: None,
832                        });
833                    }
834
835                    // Inter-region gaps.
836                    for pair in regions.windows(2) {
837                        let gap_start = pair[0].end;
838                        let gap_end = pair[1].start;
839                        if gap_start < gap_end {
840                            // Clip both bounds to chrom_size so the whole emitted
841                            // gap lies within the chromosome.
842                            let cs = chrom_size;
843                            let clipped_start = gap_start.min(cs);
844                            let clipped_end = gap_end.min(cs);
845                            if clipped_start < clipped_end {
846                                result.push(Region {
847                                    chr: chr_name.clone(),
848                                    start: clipped_start,
849                                    end: clipped_end,
850                                    rest: None,
851                                });
852                            }
853                        }
854                    }
855
856                    // Trailing gap from last region's end to chrom_size.
857                    let last_end = regions[regions.len() - 1].end;
858                    if last_end < chrom_size {
859                        result.push(Region {
860                            chr: chr_name.clone(),
861                            start: last_end,
862                            end: chrom_size,
863                            rest: None,
864                        });
865                    }
866                }
867            }
868        }
869
870        // Karyotypic chromosome ordering so output is stable across runs.
871        result.sort_by(|a, b| {
872            crate::utils::chrom_karyotype_key(&a.chr)
873                .cmp(&crate::utils::chrom_karyotype_key(&b.chr))
874                .then(a.start.cmp(&b.start))
875        });
876
877        RegionSet::from(result)
878    }
879
880    /// Shift all regions by a fixed offset.
881    pub fn shift(&self, offset: i64) -> RegionSet {
882        let regions: Vec<Region> = self
883            .regions
884            .iter()
885            .map(|r| {
886                let start = (r.start as i64 + offset).max(0) as u32;
887                let end = (r.end as i64 + offset).max(start as i64) as u32;
888                Region {
889                    chr: r.chr.clone(),
890                    start,
891                    end,
892                    rest: None,
893                }
894            })
895            .collect();
896        RegionSet::from(regions)
897    }
898
899    /// Generate flanking regions.
900    pub fn flank(&self, width: u32, use_start: bool, both: bool) -> RegionSet {
901        let regions: Vec<Region> = self
902            .regions
903            .iter()
904            .map(|r| {
905                if both {
906                    let anchor = if use_start { r.start } else { r.end };
907                    Region {
908                        chr: r.chr.clone(),
909                        start: anchor.saturating_sub(width),
910                        end: anchor.saturating_add(width),
911                        rest: None,
912                    }
913                } else if use_start {
914                    Region {
915                        chr: r.chr.clone(),
916                        start: r.start.saturating_sub(width),
917                        end: r.start,
918                        rest: None,
919                    }
920                } else {
921                    Region {
922                        chr: r.chr.clone(),
923                        start: r.end,
924                        end: r.end.saturating_add(width),
925                        rest: None,
926                    }
927                }
928            })
929            .collect();
930        RegionSet::from(regions)
931    }
932
933    /// Resize regions to a fixed width, anchored at start, end, or center.
934    pub fn resize(&self, width: u32, fix: &str) -> RegionSet {
935        let regions: Vec<Region> = self
936            .regions
937            .iter()
938            .map(|r| match fix {
939                "end" => Region {
940                    chr: r.chr.clone(),
941                    start: r.end.saturating_sub(width),
942                    end: r.end,
943                    rest: None,
944                },
945                "center" => {
946                    let mid = r.start + (r.end - r.start) / 2;
947                    let half = width / 2;
948                    Region {
949                        chr: r.chr.clone(),
950                        start: mid.saturating_sub(half),
951                        end: mid.saturating_sub(half).saturating_add(width),
952                        rest: None,
953                    }
954                }
955                _ => Region {
956                    chr: r.chr.clone(),
957                    start: r.start,
958                    end: r.start.saturating_add(width),
959                    rest: None,
960                },
961            })
962            .collect();
963        RegionSet::from(regions)
964    }
965
966    /// Narrow each region by specifying a relative sub-range within it.
967    pub fn narrow(&self, start: Option<u32>, end: Option<u32>, width: Option<u32>) -> RegionSet {
968        let regions: Vec<Region> = self
969            .regions
970            .iter()
971            .map(|r| {
972                let region_width = r.end - r.start;
973                let (rel_start, rel_end) = match (start, end, width) {
974                    (Some(s), Some(e), None) => (s.saturating_sub(1), e),
975                    (Some(s), None, Some(w)) => (s.saturating_sub(1), s.saturating_sub(1) + w),
976                    (None, Some(e), Some(w)) => (e.saturating_sub(w), e),
977                    _ => (0, region_width),
978                };
979                let new_start = r.start + rel_start.min(region_width);
980                let new_end = r.start + rel_end.min(region_width);
981                Region {
982                    chr: r.chr.clone(),
983                    start: new_start.min(new_end),
984                    end: new_end.max(new_start),
985                    rest: None,
986                }
987            })
988            .collect();
989        RegionSet::from(regions)
990    }
991
992    /// Generate promoter regions relative to each region's start position.
993    pub fn promoters(&self, upstream: u32, downstream: u32) -> RegionSet {
994        let regions: Vec<Region> = self
995            .regions
996            .iter()
997            .map(|r| Region {
998                chr: r.chr.clone(),
999                start: r.start.saturating_sub(upstream),
1000                end: r.start.saturating_add(downstream),
1001                rest: None,
1002            })
1003            .collect();
1004        RegionSet::from(regions)
1005    }
1006
1007    /// Pairwise intersection of two region sets by index position.
1008    pub fn pintersect(&self, other: &RegionSet) -> RegionSet {
1009        let regions: Vec<Region> = self
1010            .regions
1011            .iter()
1012            .zip(other.regions.iter())
1013            .map(|(a, b)| {
1014                if a.chr != b.chr {
1015                    return Region {
1016                        chr: a.chr.clone(),
1017                        start: a.start,
1018                        end: a.start,
1019                        rest: None,
1020                    };
1021                }
1022                let start = a.start.max(b.start);
1023                let end = a.end.min(b.end);
1024                if start >= end {
1025                    Region {
1026                        chr: a.chr.clone(),
1027                        start,
1028                        end: start,
1029                        rest: None,
1030                    }
1031                } else {
1032                    Region {
1033                        chr: a.chr.clone(),
1034                        start,
1035                        end,
1036                        rest: None,
1037                    }
1038                }
1039            })
1040            .collect();
1041        RegionSet::from(regions)
1042    }
1043
1044    /// Break all regions into non-overlapping disjoint pieces.
1045    ///
1046    /// Internal boundaries (starts and ends of overlapping input regions) split
1047    /// the covered intervals into non-overlapping pieces. Only pieces that are
1048    /// covered by at least one input region are emitted; gaps between disjoint
1049    /// regions are never filled. This matches the semantics of R's
1050    /// GenomicRanges `disjoin`.
1051    pub fn disjoin(&self) -> RegionSet {
1052        let mut by_chr: HashMap<String, Vec<(u32, u32)>> = HashMap::new();
1053        for r in &self.regions {
1054            by_chr.entry(r.chr.clone()).or_default().push((r.start, r.end));
1055        }
1056
1057        let mut result: Vec<Region> = Vec::new();
1058        for (chr, mut intervals) in by_chr {
1059            // Collect and sort/dedup all interval boundaries on this chromosome.
1060            let mut boundaries: Vec<u32> = Vec::with_capacity(intervals.len() * 2);
1061            for &(start, end) in &intervals {
1062                boundaries.push(start);
1063                boundaries.push(end);
1064            }
1065            boundaries.sort();
1066            boundaries.dedup();
1067
1068            // Sort intervals by start so we can scan for coverage efficiently.
1069            intervals.sort_by_key(|&(start, _)| start);
1070
1071            for window in boundaries.windows(2) {
1072                let (seg_start, seg_end) = (window[0], window[1]);
1073                // Keep this candidate piece only if it is covered by at least
1074                // one input interval (i.e. some interval fully contains it).
1075                let covered = intervals
1076                    .iter()
1077                    .any(|&(start, end)| start <= seg_start && seg_end <= end);
1078                if covered {
1079                    result.push(Region {
1080                        chr: chr.clone(),
1081                        start: seg_start,
1082                        end: seg_end,
1083                        rest: None,
1084                    });
1085                }
1086            }
1087        }
1088        result.sort_by(|a, b| (&a.chr, a.start).cmp(&(&b.chr, b.start)));
1089        RegionSet::from(result)
1090    }
1091
1092    /// Cluster nearby regions.
1093    pub fn cluster(&self, max_gap: u32) -> Vec<u32> {
1094        if self.regions.is_empty() {
1095            return vec![];
1096        }
1097
1098        let n = self.regions.len();
1099        let mut result = vec![0u32; n];
1100
1101        // Create sorted indices to preserve original order mapping
1102        let mut sorted_indices: Vec<usize> = (0..n).collect();
1103        sorted_indices.sort_by(|&i, &j| {
1104            self.regions[i]
1105                .chr
1106                .cmp(&self.regions[j].chr)
1107                .then(self.regions[i].start.cmp(&self.regions[j].start))
1108                .then(self.regions[i].end.cmp(&self.regions[j].end))
1109        });
1110
1111        let mut cluster_id: u32 = 0;
1112        let mut cluster_end = self.regions[sorted_indices[0]].end;
1113        let mut current_chr = &self.regions[sorted_indices[0]].chr;
1114        result[sorted_indices[0]] = cluster_id;
1115
1116        for &idx in &sorted_indices[1..] {
1117            let r = &self.regions[idx];
1118            if r.chr != *current_chr || r.start > cluster_end.saturating_add(max_gap) {
1119                cluster_id += 1;
1120                cluster_end = r.end;
1121                current_chr = &r.chr;
1122            } else {
1123                cluster_end = cluster_end.max(r.end);
1124            }
1125            result[idx] = cluster_id;
1126        }
1127
1128        result
1129    }
1130
1131    /// Find the nearest region in `other` for each region in `self`.
1132    pub fn closest(&self, other: &RegionSet) -> Vec<(usize, usize, i64)> {
1133        if other.regions.is_empty() {
1134            return Vec::new();
1135        }
1136
1137        // Group other by chromosome, keeping original indices, sorted by start
1138        let mut other_by_chr: HashMap<String, Vec<(usize, &Region)>> = HashMap::new();
1139        for (idx, r) in other.regions.iter().enumerate() {
1140            other_by_chr
1141                .entry(r.chr.clone())
1142                .or_default()
1143                .push((idx, r));
1144        }
1145        for v in other_by_chr.values_mut() {
1146            v.sort_by_key(|(_, r)| r.start);
1147        }
1148
1149        // Precompute max region width per chromosome for left-side early termination
1150        let mut max_width_by_chr: HashMap<String, u32> = HashMap::new();
1151        for (chr, candidates) in &other_by_chr {
1152            let max_w = candidates.iter().map(|(_, r)| r.end - r.start).max().unwrap_or(0);
1153            max_width_by_chr.insert(chr.clone(), max_w);
1154        }
1155
1156        let mut result: Vec<(usize, usize, i64)> = Vec::new();
1157
1158        for (self_idx, a) in self.regions.iter().enumerate() {
1159            let Some(candidates) = other_by_chr.get(&a.chr) else {
1160                continue; // skip regions on chromosomes absent in other
1161            };
1162
1163            // Binary search for insertion point based on a.start
1164            let ins = candidates
1165                .binary_search_by_key(&a.start, |(_, r)| r.start)
1166                .unwrap_or_else(|x| x);
1167
1168            let gap_dist = |a_reg: &Region, b_reg: &Region| -> i64 {
1169                if a_reg.start < b_reg.end && b_reg.start < a_reg.end {
1170                    0i64
1171                } else if b_reg.end <= a_reg.start {
1172                    (a_reg.start as i64) - (b_reg.end as i64)
1173                } else {
1174                    (b_reg.start as i64) - (a_reg.end as i64)
1175                }
1176            };
1177
1178            let max_width = *max_width_by_chr.get(&a.chr).unwrap_or(&0) as i64;
1179
1180            let mut best_other_idx = 0usize;
1181            let mut best_dist = i64::MAX;
1182
1183            let mut left_done = ins == 0;
1184            let mut right_done = ins >= candidates.len();
1185            let mut li = if ins > 0 { ins - 1 } else { 0 };
1186            let mut ri = ins;
1187
1188            while !left_done || !right_done {
1189                if !right_done {
1190                    let (other_idx, b) = candidates[ri];
1191                    let dist = gap_dist(a, b);
1192                    if dist.abs() < best_dist.abs() {
1193                        best_dist = dist;
1194                        best_other_idx = other_idx;
1195                    }
1196                    if best_dist == 0 { break; }
1197                    ri += 1;
1198                    if ri >= candidates.len() || (b.start as i64 - a.end as i64 > best_dist.abs()) {
1199                        right_done = true;
1200                    }
1201                }
1202
1203                if !left_done {
1204                    let (other_idx, b) = candidates[li];
1205                    let dist = gap_dist(a, b);
1206                    if dist.abs() < best_dist.abs() {
1207                        best_dist = dist;
1208                        best_other_idx = other_idx;
1209                    }
1210                    if best_dist == 0 { break; }
1211                    if li == 0 || (a.start as i64 - b.start as i64 > best_dist.abs() + max_width) {
1212                        left_done = true;
1213                    } else {
1214                        li -= 1;
1215                    }
1216                }
1217            }
1218
1219            result.push((self_idx, best_other_idx, best_dist));
1220        }
1221
1222        result
1223    }
1224}
1225
1226// ── Sweep-line helpers ──────────────────────────────────────────────────
1227
1228/// Per-chromosome set difference using a sweep-line algorithm.
1229pub fn sweep_setdiff_chr(chr: &str, a: &[Region], b: &[Region]) -> Vec<Region> {
1230    let mut result = Vec::new();
1231    let mut b_idx = 0;
1232
1233    for a_region in a {
1234        while b_idx < b.len() && b[b_idx].end <= a_region.start {
1235            b_idx += 1;
1236        }
1237
1238        let mut pos = a_region.start;
1239        let mut j = b_idx;
1240
1241        while j < b.len() && b[j].start < a_region.end && pos < a_region.end {
1242            if b[j].start > pos {
1243                result.push(Region {
1244                    chr: chr.to_string(),
1245                    start: pos,
1246                    end: b[j].start,
1247                    rest: None,
1248                });
1249            }
1250            pos = pos.max(b[j].end);
1251            j += 1;
1252        }
1253
1254        if pos < a_region.end {
1255            result.push(Region {
1256                chr: chr.to_string(),
1257                start: pos,
1258                end: a_region.end,
1259                rest: None,
1260            });
1261        }
1262    }
1263
1264    result
1265}
1266
1267/// Per-chromosome intersection using a sweep-line algorithm.
1268pub fn sweep_intersect_chr(chr: &str, a: &[Region], b: &[Region]) -> Vec<Region> {
1269    let mut result = Vec::new();
1270    let mut b_idx = 0;
1271
1272    for a_region in a {
1273        while b_idx < b.len() && b[b_idx].end <= a_region.start {
1274            b_idx += 1;
1275        }
1276        let mut j = b_idx;
1277        while j < b.len() && b[j].start < a_region.end {
1278            let start = a_region.start.max(b[j].start);
1279            let end = a_region.end.min(b[j].end);
1280            if start < end {
1281                result.push(Region {
1282                    chr: chr.to_string(),
1283                    start,
1284                    end,
1285                    rest: None,
1286                });
1287            }
1288            j += 1;
1289        }
1290    }
1291
1292    result
1293}
1294
1295// ── IntervalSetOps trait ────────────────────────────────────────────────
1296
1297/// Two-set interval operations on genomic region sets.
1298///
1299/// Provides set algebra (setdiff, intersect) and similarity metrics
1300/// (jaccard, coverage, overlap_coefficient). Implementations may use
1301/// sweep-line or index-based algorithms.
1302pub trait IntervalSetOps {
1303    /// Set difference: remove portions of `self` that overlap with `other`.
1304    fn setdiff(&self, other: &RegionSet) -> RegionSet;
1305
1306    /// Range-level intersection: positions covered by *both* sets.
1307    fn intersect(&self, other: &RegionSet) -> RegionSet;
1308
1309    /// Nucleotide-level Jaccard similarity: `|intersection| / |union|`.
1310    fn jaccard(&self, other: &RegionSet) -> f64;
1311
1312    /// Fraction of self's base pairs covered by other.
1313    fn coverage(&self, other: &RegionSet) -> f64;
1314
1315    /// Overlap coefficient: `|intersection| / min(|self|, |other|)`.
1316    fn overlap_coefficient(&self, other: &RegionSet) -> f64;
1317
1318    /// Find the nearest region in `other` for each region in `self`.
1319    ///
1320    /// Returns `(self_index, other_index, distance)` tuples.
1321    fn closest(&self, other: &RegionSet) -> Vec<(usize, usize, i64)>;
1322}
1323
1324impl IntervalSetOps for RegionSet {
1325    fn setdiff(&self, other: &RegionSet) -> RegionSet {
1326        let a = self.reduce();
1327        let b = other.reduce();
1328
1329        let mut b_by_chr: HashMap<String, Vec<Region>> = HashMap::new();
1330        for r in &b.regions {
1331            b_by_chr.entry(r.chr.clone()).or_default().push(r.clone());
1332        }
1333
1334        let mut result: Vec<Region> = Vec::new();
1335
1336        let mut a_chr_start = 0;
1337        while a_chr_start < a.regions.len() {
1338            let chr = &a.regions[a_chr_start].chr;
1339            let mut a_chr_end = a_chr_start;
1340            while a_chr_end < a.regions.len() && a.regions[a_chr_end].chr == *chr {
1341                a_chr_end += 1;
1342            }
1343
1344            let empty_vec = vec![];
1345            let b_chr = b_by_chr.get(chr.as_str()).unwrap_or(&empty_vec);
1346            result.extend(sweep_setdiff_chr(chr, &a.regions[a_chr_start..a_chr_end], b_chr));
1347
1348            a_chr_start = a_chr_end;
1349        }
1350
1351        RegionSet::from(result)
1352    }
1353
1354    fn intersect(&self, other: &RegionSet) -> RegionSet {
1355        let a = self.reduce();
1356        let b = other.reduce();
1357
1358        let mut b_by_chr: HashMap<String, Vec<Region>> = HashMap::new();
1359        for r in &b.regions {
1360            b_by_chr.entry(r.chr.clone()).or_default().push(r.clone());
1361        }
1362
1363        let mut result: Vec<Region> = Vec::new();
1364
1365        let mut a_i = 0;
1366        while a_i < a.regions.len() {
1367            let chr = &a.regions[a_i].chr;
1368            let mut a_end = a_i;
1369            while a_end < a.regions.len() && a.regions[a_end].chr == *chr {
1370                a_end += 1;
1371            }
1372
1373            if let Some(b_chr) = b_by_chr.get(chr.as_str()) {
1374                result.extend(sweep_intersect_chr(chr, &a.regions[a_i..a_end], b_chr));
1375            }
1376
1377            a_i = a_end;
1378        }
1379
1380        RegionSet::from(result)
1381    }
1382
1383    fn jaccard(&self, other: &RegionSet) -> f64 {
1384        let a_bp = self.reduce().nucleotides_length();
1385        let b_bp = other.reduce().nucleotides_length();
1386        let union_bp = self.union(other).nucleotides_length();
1387        if union_bp == 0 {
1388            return 0.0;
1389        }
1390        let intersection_bp = a_bp + b_bp - union_bp;
1391        intersection_bp as f64 / union_bp as f64
1392    }
1393
1394    fn coverage(&self, other: &RegionSet) -> f64 {
1395        let self_reduced = self.reduce();
1396        let self_bp = self_reduced.nucleotides_length();
1397        if self_bp == 0 {
1398            return 0.0;
1399        }
1400        let diff = self_reduced.setdiff(other);
1401        let diff_bp = diff.nucleotides_length();
1402        1.0 - (diff_bp as f64 / self_bp as f64)
1403    }
1404
1405    fn overlap_coefficient(&self, other: &RegionSet) -> f64 {
1406        let a_bp = self.reduce().nucleotides_length();
1407        let b_bp = other.reduce().nucleotides_length();
1408        let min_bp = a_bp.min(b_bp);
1409        if min_bp == 0 {
1410            return 0.0;
1411        }
1412        let union_bp = self.union(other).nucleotides_length();
1413        let intersection_bp = a_bp + b_bp - union_bp;
1414        intersection_bp as f64 / min_bp as f64
1415    }
1416
1417    fn closest(&self, other: &RegionSet) -> Vec<(usize, usize, i64)> {
1418        RegionSet::closest(self, other)
1419    }
1420}
1421
1422impl Display for RegionSet {
1423    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1424        write!(f, "RegionSet with {} regions.", self.len())
1425    }
1426}
1427
1428#[cfg(test)]
1429mod tests {
1430    use super::*;
1431
1432    use pretty_assertions::assert_eq;
1433    use rstest::*;
1434
1435    fn get_test_path(file_name: &str) -> PathBuf {
1436        let file_path: PathBuf = std::env::current_dir()
1437            .unwrap()
1438            .join("../tests/data/regionset")
1439            .join(file_name);
1440        file_path
1441    }
1442
1443    #[rstest]
1444    fn test_open_from_path() {
1445        let file_path = get_test_path("dummy.narrowPeak");
1446        assert!(RegionSet::try_from(file_path.as_path()).is_ok());
1447    }
1448
1449    #[rstest]
1450    fn test_open_from_string() {
1451        let file_path = get_test_path("dummy.narrowPeak");
1452        assert!(RegionSet::try_from(file_path.to_str().unwrap()).is_ok());
1453    }
1454
1455    #[rstest]
1456    #[ignore = "Hits encodeproject.org; the CDN blocks cloud/CI IPs. See test_open_from_url_matches_local for a self-contained version."]
1457    fn test_open_from_url() {
1458        let file_path = String::from(
1459            "https://www.encodeproject.org/files/ENCFF321QPN/@@download/ENCFF321QPN.bed.gz",
1460        );
1461        assert!(RegionSet::try_from(file_path).is_ok());
1462    }
1463
1464    /// Build a multi-member (concatenated) gzip stream from BED lines, splitting them
1465    /// across two gzip members — exactly the bgzip-style layout used by ENCODE files.
1466    /// A single-member `GzDecoder` would stop after the first member; `MultiGzDecoder`
1467    /// reads them all.
1468    #[cfg(feature = "http")]
1469    fn make_multimember_bed_gz(num_regions: usize) -> Vec<u8> {
1470        use flate2::write::GzEncoder;
1471        use flate2::Compression;
1472        use std::io::Write;
1473
1474        let lines: Vec<String> = (0..num_regions)
1475            .map(|i| format!("chr1\t{}\t{}\n", i * 100, i * 100 + 50))
1476            .collect();
1477
1478        let mid = lines.len() / 2;
1479        let mut out = Vec::new();
1480        for chunk in [&lines[..mid], &lines[mid..]] {
1481            let mut enc = GzEncoder::new(Vec::new(), Compression::default());
1482            for line in chunk {
1483                enc.write_all(line.as_bytes()).unwrap();
1484            }
1485            out.extend(enc.finish().unwrap());
1486        }
1487        out
1488    }
1489
1490    /// `.bed.gz` files (e.g. from ENCODE) are multi-member gzip streams. This verifies
1491    /// that the URL reader and the local-file reader both decode every member and
1492    /// produce the same, full region count — a regression test for truncation at the
1493    /// first gzip member. Served over localhost so it has no external dependency.
1494    #[cfg(feature = "http")]
1495    #[rstest]
1496    fn test_open_from_url_matches_local() {
1497        use std::io::{Read, Write};
1498        use std::net::TcpListener;
1499
1500        const NUM_REGIONS: usize = 5000;
1501        let gz_bytes = make_multimember_bed_gz(NUM_REGIONS);
1502
1503        // Read from a local file.
1504        let tmp = tempfile::Builder::new().suffix(".bed.gz").tempfile().unwrap();
1505        tmp.as_file().write_all(&gz_bytes).unwrap();
1506        let from_file = RegionSet::try_from(tmp.path()).unwrap();
1507        assert_eq!(from_file.len(), NUM_REGIONS);
1508
1509        // Serve the same bytes over localhost and read from the URL.
1510        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
1511        let port = listener.local_addr().unwrap().port();
1512        let body = gz_bytes.clone();
1513        let server = std::thread::spawn(move || {
1514            let (mut stream, _) = listener.accept().unwrap();
1515            let mut buf = [0u8; 1024];
1516            let _ = stream.read(&mut buf); // consume the request line/headers
1517            let header = format!(
1518                "HTTP/1.1 200 OK\r\nContent-Length: {}\r\nConnection: close\r\n\r\n",
1519                body.len()
1520            );
1521            stream.write_all(header.as_bytes()).unwrap();
1522            stream.write_all(&body).unwrap();
1523        });
1524
1525        let url = format!("http://127.0.0.1:{}/data.bed.gz", port);
1526        let from_url = RegionSet::try_from(url).unwrap();
1527        server.join().unwrap();
1528
1529        assert_eq!(from_url.len(), NUM_REGIONS);
1530        assert_eq!(from_url.len(), from_file.len());
1531    }
1532
1533    #[rstest]
1534    #[ignore = "Avoid BEDbase dependency in CI"]
1535    fn test_open_from_bedbase() {
1536        let bbid = String::from("6b2e163a1d4319d99bd465c6c78a9741");
1537        let region_set = RegionSet::try_from(bbid);
1538        assert_eq!(region_set.is_ok(), true);
1539        assert_eq!(
1540            region_set.unwrap().identifier(),
1541            "6b2e163a1d4319d99bd465c6c78a9741"
1542        );
1543    }
1544
1545    #[rstest]
1546    fn test_open_bed_gz() {
1547        let file_path = get_test_path("dummy.narrowPeak.bed.gz");
1548        assert!(RegionSet::try_from(file_path.to_str().unwrap()).is_ok());
1549    }
1550
1551    #[rstest]
1552    fn test_calculate_identifier() {
1553        let file_path = get_test_path("dummy.narrowPeak.bed.gz");
1554        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1555
1556        assert_eq!("f0b2cf73383b53bd97ff525a0380f200", region_set.identifier());
1557    }
1558
1559    #[rstest]
1560    fn test_closest_index_on_unsorted_other() {
1561        // Regression: `other` is NOT start-sorted. The returned other_idx must
1562        // index into the caller's `other`, not an internal sorted clone.
1563        let mk = |chr: &str, start: u32, end: u32| Region {
1564            chr: chr.to_owned(),
1565            start,
1566            end,
1567            rest: None,
1568        };
1569
1570        let query = RegionSet::from(vec![mk("chr1", 100, 110)]);
1571        let other = RegionSet::from(vec![
1572            mk("chr1", 500, 510),
1573            mk("chr1", 120, 130),
1574            mk("chr1", 900, 910),
1575        ]);
1576
1577        let result = query.closest(&other);
1578
1579        assert_eq!(result, vec![(0, 1, 10)]);
1580        let (_, other_idx, dist) = result[0];
1581        assert_eq!(other_idx, 1);
1582        assert_eq!(other.regions[other_idx], mk("chr1", 120, 130));
1583        assert_eq!(dist, 10);
1584    }
1585
1586    #[rstest]
1587    fn test_save_bed_gz() {
1588        let file_path = get_test_path("dummy.narrowPeak.bed.gz");
1589        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1590
1591        let tempdir = tempfile::tempdir().unwrap();
1592
1593        let mut new_file_path = tempdir.keep();
1594        new_file_path.push("new_file.bed.gz");
1595
1596        assert!(region_set.to_bed_gz(new_file_path.as_path()).is_ok());
1597
1598        let new_region = RegionSet::try_from(new_file_path.as_path());
1599        assert!(new_region.is_ok());
1600        assert_eq!(new_region.unwrap().identifier(), region_set.identifier())
1601    }
1602
1603    #[rstest]
1604    fn test_save_bed() {
1605        let file_path = get_test_path("dummy.narrowPeak");
1606        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1607
1608        let tempdir = tempfile::tempdir().unwrap();
1609
1610        let mut new_file_path = tempdir.keep();
1611        new_file_path.push("new_bedfile.bed");
1612
1613        assert!(region_set.to_bed(new_file_path.as_path()).is_ok());
1614
1615        let new_region = RegionSet::try_from(new_file_path.as_path());
1616        assert!(new_region.is_ok());
1617        assert_eq!(new_region.unwrap().identifier(), region_set.identifier())
1618    }
1619
1620    #[cfg(feature = "bigbed")]
1621    #[rstest]
1622    fn test_save_bigbed() {
1623        let file_path = get_test_path("dummy.narrowPeak");
1624        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1625
1626        let chrom_sizes_path: PathBuf = std::env::current_dir()
1627            .unwrap()
1628            .join("../tests/data/regionset/dummy_chrom_sizes");
1629
1630        let tempdir = tempfile::tempdir().unwrap();
1631        let mut new_file_path = tempdir.keep();
1632        new_file_path.push("new.bigbed");
1633
1634        assert!(region_set
1635            .to_bigbed(new_file_path.as_path(), chrom_sizes_path.as_path())
1636            .is_ok());
1637    }
1638
1639    #[rstest]
1640    fn test_read_headers() {
1641        let file_path = get_test_path("dummy_headers.bed");
1642        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1643
1644        assert!(region_set.header.is_some());
1645        assert_eq!(region_set.path.unwrap(), file_path);
1646    }
1647
1648    #[rstest]
1649    fn test_is_empty() {
1650        let file_path = get_test_path("dummy_headers.bed");
1651        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1652
1653        assert!(!region_set.is_empty());
1654    }
1655
1656    #[rstest]
1657    fn test_file_digest() {
1658        let file_path = get_test_path("dummy.narrowPeak");
1659        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1660
1661        assert_eq!(region_set.file_digest(), "6224c4d40832b3e0889250f061e01120");
1662        assert_eq!(region_set.identifier(), "f0b2cf73383b53bd97ff525a0380f200")
1663    }
1664
1665    #[rstest]
1666    fn test_mean_region_width() {
1667        let file_path = get_test_path("dummy.narrowPeak");
1668        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1669
1670        assert_eq!(region_set.mean_region_width(), 4.22)
1671    }
1672    #[rstest]
1673    fn test_open_file_with_incorrect_headers() {
1674        let file_path = get_test_path("dummy_incorrect_headers.bed");
1675        let _region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1676    }
1677
1678    #[rstest]
1679    fn test_chr_length() {
1680        let file_path = get_test_path("dummy.narrowPeak");
1681        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1682        assert_eq!(*region_set.get_max_end_per_chr().get("chr1").unwrap(), 36);
1683        assert_eq!(region_set.get_max_end_per_chr().len(), 1)
1684    }
1685
1686    #[rstest]
1687    fn test_total_nucleotides_function() {
1688        let file_path = get_test_path("dummy.narrowPeak");
1689        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1690
1691        assert_eq!(region_set.nucleotides_length(), 38)
1692    }
1693
1694    #[rstest]
1695    fn test_iter_chroms() {
1696        let file_path = get_test_path("dummy.narrowPeak");
1697        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1698
1699        assert_eq!(region_set.iter_chroms().collect::<Vec<_>>().len(), 1)
1700    }
1701
1702    #[cfg(feature = "dataframe")]
1703    #[rstest]
1704    fn test_polars() {
1705        let file_path = get_test_path("dummy.narrowPeak");
1706        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1707        let rs_polars = region_set.to_polars().unwrap();
1708        println!("Number of columns: {:?}", rs_polars.get_columns().len());
1709        assert_eq!(rs_polars.get_columns().len(), 10);
1710    }
1711
1712    #[rstest]
1713    fn test_calc_mid_points() {
1714        let file_path = get_test_path("dummy.narrowPeak");
1715        let region_set = RegionSet::try_from(file_path.to_str().unwrap()).unwrap();
1716
1717        let mid_points = region_set.calc_mid_points();
1718        assert_eq!(mid_points.get("chr1").unwrap().len(), 9);
1719        assert_eq!(mid_points.len(), 1);
1720        assert_eq!(
1721            mid_points
1722                .get("chr1")
1723                .unwrap()
1724                .iter()
1725                .min()
1726                .copied()
1727                .unwrap(),
1728            6u32
1729        );
1730    }
1731
1732    fn gaps_make_regionset(regions: Vec<(&str, u32, u32)>) -> RegionSet {
1733        RegionSet::from(
1734            regions
1735                .into_iter()
1736                .map(|(chr, start, end)| Region {
1737                    chr: chr.to_string(),
1738                    start,
1739                    end,
1740                    rest: None,
1741                })
1742                .collect::<Vec<Region>>(),
1743        )
1744    }
1745
1746    fn gaps_chrom_sizes(sizes: &[(&str, u32)]) -> HashMap<String, u32> {
1747        sizes
1748            .iter()
1749            .map(|(chr, size)| (chr.to_string(), *size))
1750            .collect()
1751    }
1752
1753    #[rstest]
1754    fn test_gaps_basic() {
1755        // Three peaks on chr1 with gaps between them; leading + trailing
1756        // gaps also present.
1757        let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr1", 30, 40), ("chr1", 50, 60)]);
1758        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1759        let result = rs.gaps(&cs);
1760        let gaps: Vec<(&str, u32, u32)> = result
1761            .regions
1762            .iter()
1763            .map(|r| (r.chr.as_str(), r.start, r.end))
1764            .collect();
1765        assert_eq!(
1766            gaps,
1767            vec![
1768                ("chr1", 0, 10),   // leading
1769                ("chr1", 20, 30),  // between peak 1 and 2
1770                ("chr1", 40, 50),  // between peak 2 and 3
1771                ("chr1", 60, 100), // trailing
1772            ]
1773        );
1774    }
1775
1776    #[rstest]
1777    fn test_gaps_peak_at_origin_no_leading() {
1778        // First peak starts at 0 — no leading gap.
1779        let rs = gaps_make_regionset(vec![("chr1", 0, 10), ("chr1", 20, 30)]);
1780        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1781        let result = rs.gaps(&cs);
1782        let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1783        assert_eq!(gaps, vec![(10, 20), (30, 100)]);
1784    }
1785
1786    #[rstest]
1787    fn test_gaps_peak_at_chrom_end_no_trailing() {
1788        // Last peak ends at chrom_size — no trailing gap.
1789        let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 100)]);
1790        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1791        let result = rs.gaps(&cs);
1792        let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1793        assert_eq!(gaps, vec![(0, 10), (20, 80)]);
1794    }
1795
1796    #[rstest]
1797    fn test_gaps_peak_past_chrom_end_clipped() {
1798        // Last peak extends past chrom_size — should be clipped, no trailing.
1799        let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr1", 80, 150)]);
1800        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1801        let result = rs.gaps(&cs);
1802        let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1803        assert_eq!(gaps, vec![(0, 10), (20, 80)]);
1804    }
1805
1806    #[rstest]
1807    fn test_gaps_empty_regionset_populated_chrom_sizes() {
1808        // No regions, but chrom_sizes has entries — emit whole-chrom gaps.
1809        let rs = RegionSet::from(Vec::<Region>::new());
1810        let cs = gaps_chrom_sizes(&[("chr1", 100), ("chr2", 50)]);
1811        let result = rs.gaps(&cs);
1812        let mut gaps: Vec<(String, u32, u32)> = result
1813            .regions
1814            .iter()
1815            .map(|r| (r.chr.clone(), r.start, r.end))
1816            .collect();
1817        gaps.sort();
1818        assert_eq!(
1819            gaps,
1820            vec![
1821                ("chr1".to_string(), 0, 100),
1822                ("chr2".to_string(), 0, 50),
1823            ]
1824        );
1825    }
1826
1827    #[rstest]
1828    fn test_gaps_chromosome_not_in_chrom_sizes_skipped() {
1829        // Peak on chr2 with no chr2 entry in chrom_sizes — should be ignored.
1830        let rs = gaps_make_regionset(vec![("chr1", 10, 20), ("chr2", 5, 15)]);
1831        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1832        let result = rs.gaps(&cs);
1833        // Only chr1 gaps emitted.
1834        for r in &result.regions {
1835            assert_eq!(r.chr, "chr1");
1836        }
1837    }
1838
1839    #[rstest]
1840    fn test_gaps_full_chrom_gap_for_unrepresented_chrom() {
1841        // chrom_sizes has chr2 but input has no chr2 peaks — emit whole chr2.
1842        let rs = gaps_make_regionset(vec![("chr1", 10, 20)]);
1843        let cs = gaps_chrom_sizes(&[("chr1", 100), ("chr2", 200)]);
1844        let result = rs.gaps(&cs);
1845        let chr2_gaps: Vec<(u32, u32)> = result
1846            .regions
1847            .iter()
1848            .filter(|r| r.chr == "chr2")
1849            .map(|r| (r.start, r.end))
1850            .collect();
1851        assert_eq!(chr2_gaps, vec![(0, 200)]);
1852    }
1853
1854    #[rstest]
1855    fn test_gaps_overlapping_peaks_reduced() {
1856        // Overlapping peaks get merged by reduce() before gap computation.
1857        let rs = gaps_make_regionset(vec![
1858            ("chr1", 10, 30),
1859            ("chr1", 25, 40), // overlaps with previous
1860            ("chr1", 50, 60),
1861        ]);
1862        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1863        let result = rs.gaps(&cs);
1864        let gaps: Vec<(u32, u32)> = result.regions.iter().map(|r| (r.start, r.end)).collect();
1865        // After reduce: [10,40], [50,60] → gaps: [0,10], [40,50], [60,100]
1866        assert_eq!(gaps, vec![(0, 10), (40, 50), (60, 100)]);
1867    }
1868
1869    #[rstest]
1870    fn test_gaps_karyotypic_ordering() {
1871        // Output should be karyotypically ordered regardless of chrom_sizes insertion order.
1872        let rs = gaps_make_regionset(vec![
1873            ("chr2", 10, 20),
1874            ("chr1", 10, 20),
1875            ("chr10", 10, 20),
1876        ]);
1877        let cs = gaps_chrom_sizes(&[("chr10", 100), ("chr1", 100), ("chr2", 100)]);
1878        let result = rs.gaps(&cs);
1879        // Collect chr names in order of first appearance.
1880        let order: Vec<&str> = result
1881            .regions
1882            .iter()
1883            .map(|r| r.chr.as_str())
1884            .scan("", |prev, chr| {
1885                if chr != *prev {
1886                    *prev = chr;
1887                    Some(chr)
1888                } else {
1889                    Some("") // repeat, skip
1890                }
1891            })
1892            .filter(|s| !s.is_empty())
1893            .collect();
1894        assert_eq!(order, vec!["chr1", "chr2", "chr10"]);
1895    }
1896
1897    #[rstest]
1898    fn test_gaps_fully_covered_chrom_no_gaps() {
1899        // A single region spanning the whole chromosome yields zero gaps.
1900        let rs = gaps_make_regionset(vec![("chr1", 0, 100)]);
1901        let cs = gaps_chrom_sizes(&[("chr1", 100)]);
1902        let result = rs.gaps(&cs);
1903        assert!(result.regions.is_empty());
1904    }
1905
1906    fn concat_make_region(chr: &str, start: u32, end: u32) -> Region {
1907        Region {
1908            chr: chr.to_owned(),
1909            start,
1910            end,
1911            rest: None,
1912        }
1913    }
1914
1915    #[rstest]
1916    fn test_concat_into_matches_concat() {
1917        // Borrowing `concat` and consuming `concat_into` must produce identical
1918        // results, including order: self's regions first, then other's.
1919        let a_regions = vec![
1920            concat_make_region("chr1", 100, 200),
1921            concat_make_region("chr2", 50, 60),
1922        ];
1923        let b_regions = vec![
1924            concat_make_region("chr1", 150, 250),
1925            concat_make_region("chr3", 10, 20),
1926        ];
1927
1928        let a = RegionSet::from(a_regions.clone());
1929        let b = RegionSet::from(b_regions.clone());
1930        let borrowed = a.concat(&b);
1931
1932        let a2 = RegionSet::from(a_regions);
1933        let b2 = RegionSet::from(b_regions);
1934        let consumed = a2.concat_into(b2);
1935
1936        assert_eq!(borrowed.regions, consumed.regions);
1937    }
1938
1939    #[rstest]
1940    fn test_concat_into_consumes_inputs() {
1941        // The inputs are moved; referencing them after the call would not
1942        // compile. We only assert the combined length here.
1943        let rs1 = RegionSet::from(vec![
1944            concat_make_region("chr1", 100, 200),
1945            concat_make_region("chr1", 300, 400),
1946        ]);
1947        let rs2 = RegionSet::from(vec![concat_make_region("chr2", 0, 10)]);
1948
1949        let combined = rs1.concat_into(rs2);
1950        assert_eq!(combined.regions.len(), 3);
1951    }
1952
1953    #[rstest]
1954    fn test_concat_into_empty() {
1955        let non_empty = vec![
1956            concat_make_region("chr1", 100, 200),
1957            concat_make_region("chr2", 5, 15),
1958        ];
1959
1960        // empty + non-empty
1961        let empty = RegionSet::from(Vec::<Region>::new());
1962        let ne = RegionSet::from(non_empty.clone());
1963        let result = empty.concat_into(ne);
1964        assert_eq!(result.regions, non_empty);
1965
1966        // non-empty + empty
1967        let ne2 = RegionSet::from(non_empty.clone());
1968        let empty2 = RegionSet::from(Vec::<Region>::new());
1969        let result2 = ne2.concat_into(empty2);
1970        assert_eq!(result2.regions, non_empty);
1971    }
1972
1973    #[rstest]
1974    fn test_union_into_matches_union() {
1975        // Use overlapping regions on the same chr to exercise the merge in
1976        // `reduce`, plus a separate chr to exercise ordering.
1977        let a_regions = vec![
1978            concat_make_region("chr1", 100, 200),
1979            concat_make_region("chr2", 0, 50),
1980        ];
1981        let b_regions = vec![
1982            concat_make_region("chr1", 150, 250),
1983            concat_make_region("chr3", 10, 20),
1984        ];
1985
1986        let a = RegionSet::from(a_regions.clone());
1987        let b = RegionSet::from(b_regions.clone());
1988        let borrowed = a.union(&b);
1989
1990        let a2 = RegionSet::from(a_regions);
1991        let b2 = RegionSet::from(b_regions);
1992        let consumed = a2.union_into(b2);
1993
1994        assert_eq!(borrowed.regions, consumed.regions);
1995    }
1996}